iomap: support write completions from interrupt context
author    Christoph Hellwig <hch@lst.de>
          Thu, 13 Nov 2025 17:06:29 +0000 (18:06 +0100)
committer Christian Brauner <brauner@kernel.org>
          Tue, 25 Nov 2025 09:22:19 +0000 (10:22 +0100)
Completions for pure overwrites don't need to be deferred to a workqueue
as there is no work to be done, or at least no work that needs a user
context.  Set the IOMAP_DIO_INLINE_COMP by default for writes like we
already do for reads, and then clear it for all the cases that actually
do need a user context for completions to update the inode size or
record updates to the logical to physical mapping.

I've audited all users of the ->end_io callback, and they only require
user context for I/O that involves unwritten extents, COW, size
extensions, or error handling, and all of those are still run from workqueue
context.

This restores the behavior of the old pre-iomap direct I/O code.
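
In short, the policy is: start every direct I/O with inline completion
enabled and opt out as soon as any completion-time work needs a sleepable
process context.  The stand-alone C sketch below models that decision
order; the flag values, struct layout, and helper name are invented for
illustration and are not the kernel's internal API.

/*
 * Illustrative sketch only -- not kernel code.  Flags, the struct, and
 * dio_pick_completion_context() are made up; only the decision order
 * mirrors the commit message above.
 */
#include <stdbool.h>
#include <stdio.h>

#define DIO_WRITE		(1u << 0)
#define DIO_INLINE_COMP		(1u << 1)	/* complete straight from ->bi_end_io */
#define DIO_NEED_SYNC		(1u << 2)	/* O_SYNC/O_DSYNC flush still pending */

struct dio_state {
	unsigned int flags;
	bool unwritten_or_cow;	/* extent conversion needed at completion */
	bool extends_i_size;	/* write grows the file */
	bool has_cached_pages;	/* mapping->nrpages != 0 */
};

/* Start optimistic, then clear inline completion for every case that
 * needs a user context, as the commit describes. */
static void dio_pick_completion_context(struct dio_state *dio)
{
	dio->flags |= DIO_INLINE_COMP;

	if (!(dio->flags & DIO_WRITE))
		return;				/* reads always complete inline */

	if (dio->unwritten_or_cow ||		/* metadata updates */
	    dio->extends_i_size ||		/* i_size update */
	    (dio->flags & DIO_NEED_SYNC) ||	/* cache flush */
	    dio->has_cached_pages)		/* page cache invalidation */
		dio->flags &= ~DIO_INLINE_COMP;
}

int main(void)
{
	struct dio_state overwrite = { .flags = DIO_WRITE };
	struct dio_state append    = { .flags = DIO_WRITE, .extends_i_size = true };

	dio_pick_completion_context(&overwrite);
	dio_pick_completion_context(&append);

	printf("pure overwrite inline: %s\n",
	       (overwrite.flags & DIO_INLINE_COMP) ? "yes" : "no");
	printf("size-extending write inline: %s\n",
	       (append.flags & DIO_INLINE_COMP) ? "yes" : "no");
	return 0;
}

In this model a pure overwrite keeps DIO_INLINE_COMP set, while the
size-extending write drops it and would be bounced to the workqueue.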

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251113170633.1453259-5-hch@lst.de
Signed-off-by: Christian Brauner <brauner@kernel.org>
fs/iomap/direct-io.c

index fb2d83f640efe0aa70e7464eb5a05918dfd360f0..60884c8cf8b7bcf0d111a8549cbc6d8754ab3788 100644 (file)
@@ -184,6 +184,21 @@ static void iomap_dio_done(struct iomap_dio *dio)
        if (dio->error)
                dio->flags &= ~IOMAP_DIO_INLINE_COMP;
 
+       /*
+        * Never invalidate pages from this context to avoid deadlocks with
+        * buffered I/O completions when called from the ioend workqueue,
+        * or to avoid sleeping when called directly from ->bi_end_io.
+        * Tough luck if you hit the tiny race with someone dirtying the range
+        * right between this check and the actual completion.
+        */
+       if ((dio->flags & IOMAP_DIO_WRITE) &&
+           (dio->flags & IOMAP_DIO_INLINE_COMP)) {
+               if (dio->iocb->ki_filp->f_mapping->nrpages)
+                       dio->flags &= ~IOMAP_DIO_INLINE_COMP;
+               else
+                       dio->flags |= IOMAP_DIO_NO_INVALIDATE;
+       }
+
        if (dio->flags & IOMAP_DIO_INLINE_COMP) {
                WRITE_ONCE(iocb->private, NULL);
                iomap_dio_complete_work(&dio->aio.work);
@@ -234,15 +249,9 @@ u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
                /*
                 * Try to avoid another context switch for the completion given
                 * that we are already called from the ioend completion
-                * workqueue, but never invalidate pages from this thread to
-                * avoid deadlocks with buffered I/O completions.  Tough luck if
-                * you hit the tiny race with someone dirtying the range now
-                * between this check and the actual completion.
+                * workqueue.
                 */
-               if (!dio->iocb->ki_filp->f_mapping->nrpages) {
-                       dio->flags |= IOMAP_DIO_INLINE_COMP;
-                       dio->flags |= IOMAP_DIO_NO_INVALIDATE;
-               }
+               dio->flags |= IOMAP_DIO_INLINE_COMP;
                iomap_dio_done(dio);
        }
 
@@ -378,6 +387,20 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
                        else
                                dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
                }
+
+               /*
+                * We can only do inline completion for pure overwrites that
+                * don't require additional I/O at completion time.
+                *
+                * This rules out writes that need zeroing or metadata updates to
+                * convert unwritten or shared extents.
+                *
+                * Writes that extend i_size are also not supported, but this is
+                * handled in __iomap_dio_rw().
+                */
+               if (need_completion_work)
+                       dio->flags &= ~IOMAP_DIO_INLINE_COMP;
+
                bio_opf |= REQ_OP_WRITE;
        } else {
                bio_opf |= REQ_OP_READ;
@@ -638,10 +661,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
        if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED)
                dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
 
-       if (iov_iter_rw(iter) == READ) {
-               /* reads can always complete inline */
-               dio->flags |= IOMAP_DIO_INLINE_COMP;
+       /*
+        * Try to complete inline if we can.  For reads this is always possible,
+        * but for writes we'll end up clearing this more often than not.
+        */
+       dio->flags |= IOMAP_DIO_INLINE_COMP;
 
+       if (iov_iter_rw(iter) == READ) {
                if (iomi.pos >= dio->i_size)
                        goto out_free_dio;
 
@@ -683,6 +709,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                                dio->flags |= IOMAP_DIO_WRITE_THROUGH;
                }
 
+               /*
+                * i_size updates must happen from process context.
+                */
+               if (iomi.pos + iomi.len > dio->i_size)
+                       dio->flags &= ~IOMAP_DIO_INLINE_COMP;
+
                /*
                 * Try to invalidate cache pages for the range we are writing.
                 * If this invalidation fails, let the caller fall back to
@@ -755,9 +787,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
         * If all the writes we issued were already written through to the
         * media, we don't need to flush the cache on IO completion. Clear the
         * sync flag for this case.
+        *
+        * Otherwise clear the inline completion flag if any sync work is
+        * needed, as that needs to be performed from process context.
         */
        if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
                dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+       else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+               dio->flags &= ~IOMAP_DIO_INLINE_COMP;
 
        /*
         * We are about to drop our additional submission reference, which