From: Christoph Hellwig
Date: Thu, 13 Nov 2025 17:06:29 +0000 (+0100)
Subject: iomap: support write completions from interrupt context
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=eca9dc20891de4ce6b7f012ac157ca6f8fa12ce4;p=thirdparty%2Flinux.git

iomap: support write completions from interrupt context

Completions for pure overwrites don't need to be deferred to a workqueue
as there is no work to be done, or at least no work that needs a user
context.

Set the IOMAP_DIO_INLINE_COMP flag by default for writes, like we already
do for reads, and then clear it for all the cases that actually do need a
user context for completions to update the inode size or record updates
to the logical to physical mapping.

I've audited all users of the ->end_io callback, and they only require
user context for I/O that involves unwritten extents, COW, size
extensions, or error handling, and all of those are still run from
workqueue context.

This restores the behavior of the old pre-iomap direct I/O code.

Signed-off-by: Christoph Hellwig
Link: https://patch.msgid.link/20251113170633.1453259-5-hch@lst.de
Signed-off-by: Christian Brauner
---

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index fb2d83f640ef..60884c8cf8b7 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -184,6 +184,21 @@ static void iomap_dio_done(struct iomap_dio *dio)
 	if (dio->error)
 		dio->flags &= ~IOMAP_DIO_INLINE_COMP;
 
+	/*
+	 * Never invalidate pages from this context to avoid deadlocks with
+	 * buffered I/O completions when called from the ioend workqueue,
+	 * or to avoid sleeping when called directly from ->bi_end_io.
+	 * Tough luck if you hit the tiny race with someone dirtying the range
+	 * right between this check and the actual completion.
+	 */
+	if ((dio->flags & IOMAP_DIO_WRITE) &&
+	    (dio->flags & IOMAP_DIO_INLINE_COMP)) {
+		if (dio->iocb->ki_filp->f_mapping->nrpages)
+			dio->flags &= ~IOMAP_DIO_INLINE_COMP;
+		else
+			dio->flags |= IOMAP_DIO_NO_INVALIDATE;
+	}
+
 	if (dio->flags & IOMAP_DIO_INLINE_COMP) {
 		WRITE_ONCE(iocb->private, NULL);
 		iomap_dio_complete_work(&dio->aio.work);
@@ -234,15 +249,9 @@ u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
 		/*
 		 * Try to avoid another context switch for the completion given
 		 * that we are already called from the ioend completion
-		 * workqueue, but never invalidate pages from this thread to
-		 * avoid deadlocks with buffered I/O completions. Tough luck if
-		 * you hit the tiny race with someone dirtying the range now
-		 * between this check and the actual completion.
+		 * workqueue.
 		 */
-		if (!dio->iocb->ki_filp->f_mapping->nrpages) {
-			dio->flags |= IOMAP_DIO_INLINE_COMP;
-			dio->flags |= IOMAP_DIO_NO_INVALIDATE;
-		}
+		dio->flags |= IOMAP_DIO_INLINE_COMP;
 
 		iomap_dio_done(dio);
 	}
@@ -378,6 +387,20 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 			else
 				dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
 		}
+
+		/*
+		 * We can only do inline completion for pure overwrites that
+		 * don't require additional I/O at completion time.
+		 *
+		 * This rules out writes that need zeroing or metadata updates to
+		 * convert unwritten or shared extents.
+		 *
+		 * Writes that extend i_size are also not supported, but this is
+		 * handled in __iomap_dio_rw().
+		 */
+		if (need_completion_work)
+			dio->flags &= ~IOMAP_DIO_INLINE_COMP;
+
 		bio_opf |= REQ_OP_WRITE;
 	} else {
 		bio_opf |= REQ_OP_READ;
@@ -638,10 +661,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED)
 		dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
 
-	if (iov_iter_rw(iter) == READ) {
-		/* reads can always complete inline */
-		dio->flags |= IOMAP_DIO_INLINE_COMP;
+	/*
+	 * Try to complete inline if we can. For reads this is always possible,
+	 * but for writes we'll end up clearing this more often than not.
+	 */
+	dio->flags |= IOMAP_DIO_INLINE_COMP;
 
+	if (iov_iter_rw(iter) == READ) {
 		if (iomi.pos >= dio->i_size)
 			goto out_free_dio;
 
@@ -683,6 +709,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 			dio->flags |= IOMAP_DIO_WRITE_THROUGH;
 		}
 
+		/*
+		 * i_size updates must happen from process context.
+		 */
+		if (iomi.pos + iomi.len > dio->i_size)
+			dio->flags &= ~IOMAP_DIO_INLINE_COMP;
+
 		/*
 		 * Try to invalidate cache pages for the range we are writing.
 		 * If this invalidation fails, let the caller fall back to
@@ -755,9 +787,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	 * If all the writes we issued were already written through to the
 	 * media, we don't need to flush the cache on IO completion. Clear the
 	 * sync flag for this case.
+	 *
+	 * Otherwise clear the inline completion flag if any sync work is
+	 * needed, as that needs to be performed from process context.
 	 */
 	if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
 		dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+	else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+		dio->flags &= ~IOMAP_DIO_INLINE_COMP;
 
 	/*
 	 * We are about to drop our additional submission reference, which
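
For readers who want the completion-path decision in one place, below is a
minimal, standalone sketch of the logic the commit message describes. It is
illustrative only: struct dio_model, needs_user_context(), and
can_complete_inline() are made-up names for this example and are not part of
the kernel's iomap API.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model of a direct I/O write completion, not kernel code. */
struct dio_model {
	bool is_write;
	bool unwritten_or_shared;	/* extent conversion or COW needed */
	bool extends_i_size;		/* inode size update needed */
	bool needs_sync;		/* cache flush still required */
	bool mapping_has_pages;		/* page cache invalidation may sleep */
};

/* Completion work that must run from process (workqueue) context. */
static bool needs_user_context(const struct dio_model *dio)
{
	return dio->unwritten_or_shared || dio->extends_i_size ||
	       dio->needs_sync;
}

/*
 * true:  complete directly from the bio completion path (interrupt context)
 * false: defer the completion to a workqueue
 */
static bool can_complete_inline(const struct dio_model *dio)
{
	if (!dio->is_write)
		return true;		/* reads always complete inline */
	if (needs_user_context(dio))
		return false;		/* i_size, mapping, or sync updates */
	if (dio->mapping_has_pages)
		return false;		/* invalidation cannot run here */
	return true;			/* pure overwrite: nothing left to do */
}

int main(void)
{
	struct dio_model overwrite = { .is_write = true };
	struct dio_model append = { .is_write = true, .extends_i_size = true };

	printf("pure overwrite inline: %d\n", can_complete_inline(&overwrite));
	printf("appending write inline: %d\n", can_complete_inline(&append));
	return 0;
}

The sketch mirrors the structure of the patch: start from "inline completion
allowed" and withdraw that permission whenever size updates, extent
conversion, sync work, or page cache invalidation would need to sleep or
otherwise require user context.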