From: Mike Snitzer Date: Fri, 19 Sep 2025 14:36:29 +0000 (-0400) Subject: nfs/localio: add proper O_DIRECT support for READ and WRITE X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c817248fc831f5494d076421672b70a6ec1a92dc;p=thirdparty%2Fkernel%2Fstable.git nfs/localio: add proper O_DIRECT support for READ and WRITE Because the NFS client will already happily handle misaligned O_DIRECT IO (by sending it out to NFSD via RPC) this commit's new capabilities are for the benefit of LOCALIO. LOCALIO will make best effort to transform misaligned IO to DIO-aligned extents when possible. LOCALIO's READ and WRITE DIO that is misaligned will be split into as many as 3 component IOs (@start, @middle and @end) as needed -- IFF the @middle extent is verified to be DIO-aligned, and then the @start and/or @end are misaligned (due to each being a partial page). Otherwise if the @middle isn't DIO-aligned the code will fallback to issuing only a single contiguous buffered IO. The @middle is only DIO-aligned if both the memory and on-disk offsets for the IO are aligned relative to the underlying local filesystem's block device limits (@dma_alignment and @logical_block_size respectively). The misaligned @start and/or @end extents are issued using buffered IO and the DIO-aligned @middle is issued using O_DIRECT. The @start and @end IOs are issued first using buffered IO with IOCB_SYNC and then the @middle is issued last using direct IO with async completion (AIO). This out of order IO completion means that LOCALIO's IO completion code (nfs_local_read_done and nfs_local_write_done) is only called for the IO's last associated iov_iter completion. And in the case of DIO-aligned @middle it completes last using AIO. nfs_local_pgio_done() is updated to handle piece-wise partial completion of each iov_iter. This implementation for LOCALIO's misaligned DIO handling uses 3 iov_iter that share the same backing pages in their bio_vecs (so unfortunately 'struct nfs_local_kiocb' has 3 instead of only 1). [Reducing LOCALIO's per-IO (struct nfs_local_kiocb) memory use can be explored in the future. One logical progression to improve this code, and eliminate explicit loops over up to 3 iov_iter, is by extending 'struct iov_iter' to support iov_iter_clone() and iov_iter_chain() interfaces that are comparable to what 'struct bio' is able to support in the block layer. But even that wouldn't avoid the need to allocate/use up to 3 iov_iter] Signed-off-by: Mike Snitzer Signed-off-by: Anna Schumaker --- diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 150719d8ed8b0..8978e1ad4bc94 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -30,14 +30,23 @@ #define NFSDBG_FACILITY NFSDBG_VFS +#define NFSLOCAL_MAX_IOS 3 + struct nfs_local_kiocb { struct kiocb kiocb; struct bio_vec *bvec; struct nfs_pgio_header *hdr; struct work_struct work; void (*aio_complete_work)(struct work_struct *); - struct iov_iter iter ____cacheline_aligned; struct nfsd_file *localio; + /* Begin mostly DIO-specific members */ + size_t end_len; + short int end_iter_index; + short int n_iters; + bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS]; + loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned; + struct iov_iter iters[NFSLOCAL_MAX_IOS]; + /* End mostly DIO-specific members */ }; struct nfs_local_fsync_ctx { @@ -291,7 +300,7 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, { struct nfs_local_kiocb *iocb; - iocb = kmalloc(sizeof(*iocb), flags); + iocb = kzalloc(sizeof(*iocb), flags); if (iocb == NULL) return NULL; @@ -303,25 +312,72 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, } init_sync_kiocb(&iocb->kiocb, file); - if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) - iocb->kiocb.ki_flags = IOCB_DIRECT; - iocb->kiocb.ki_pos = hdr->args.offset; iocb->hdr = hdr; iocb->kiocb.ki_flags &= ~IOCB_APPEND; iocb->aio_complete_work = NULL; + iocb->end_iter_index = -1; + return iocb; } +struct nfs_local_dio { + u32 mem_align; + u32 offset_align; + loff_t middle_offset; + loff_t end_offset; + ssize_t start_len; /* Length for misaligned first extent */ + ssize_t middle_len; /* Length for DIO-aligned middle extent */ + ssize_t end_len; /* Length for misaligned last extent */ +}; + +static bool +nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw, + size_t len, struct nfs_local_dio *local_dio) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + loff_t offset = hdr->args.offset; + u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align; + loff_t start_end, orig_end, middle_end; + + nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align, + &nf_dio_offset_align, &nf_dio_read_offset_align); + if (rw == ITER_DEST) + nf_dio_offset_align = nf_dio_read_offset_align; + + if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align)) + return false; + if (unlikely(nf_dio_offset_align > PAGE_SIZE)) + return false; + if (unlikely(len < nf_dio_offset_align)) + return false; + + local_dio->mem_align = nf_dio_mem_align; + local_dio->offset_align = nf_dio_offset_align; + + start_end = round_up(offset, nf_dio_offset_align); + orig_end = offset + len; + middle_end = round_down(orig_end, nf_dio_offset_align); + + local_dio->middle_offset = start_end; + local_dio->end_offset = middle_end; + + local_dio->start_len = start_end - offset; + local_dio->middle_len = middle_end - start_end; + local_dio->end_len = orig_end - middle_end; + + return true; +} + static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i, - loff_t offset, unsigned int addr_mask, unsigned int len_mask) + unsigned int addr_mask, unsigned int len_mask) { const struct bio_vec *bvec = i->bvec; size_t skip = i->iov_offset; size_t size = i->count; - if ((offset | size) & len_mask) + if (size & len_mask) return false; do { size_t len = bvec->bv_len; @@ -338,8 +394,68 @@ static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i, return true; } -static void -nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int rw) +/* + * Setup as many as 3 iov_iter based on extents described by @local_dio. + * Returns the number of iov_iter that were setup. + */ +static int +nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, + unsigned int nvecs, size_t len, + struct nfs_local_dio *local_dio) +{ + int n_iters = 0; + struct iov_iter *iters = iocb->iters; + + /* Setup misaligned start? */ + if (local_dio->start_len) { + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); + iters[n_iters].count = local_dio->start_len; + iocb->offset[n_iters] = iocb->hdr->args.offset; + iocb->iter_is_dio_aligned[n_iters] = false; + ++n_iters; + } + + /* Setup misaligned end? + * If so, the end is purposely setup to be issued using buffered IO + * before the middle (which will use DIO, if DIO-aligned, with AIO). + * This creates problems if/when the end results in a partial write. + * So must save index and length of end to handle this corner case. + */ + if (local_dio->end_len) { + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); + iocb->offset[n_iters] = local_dio->end_offset; + iov_iter_advance(&iters[n_iters], + local_dio->start_len + local_dio->middle_len); + iocb->iter_is_dio_aligned[n_iters] = false; + /* Save index and length of end */ + iocb->end_iter_index = n_iters; + iocb->end_len = local_dio->end_len; + ++n_iters; + } + + /* Setup DIO-aligned middle to be issued last, to allow for + * DIO with AIO completion (see nfs_local_call_{read,write}). + */ + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); + if (local_dio->start_len) + iov_iter_advance(&iters[n_iters], local_dio->start_len); + iters[n_iters].count -= local_dio->end_len; + iocb->offset[n_iters] = local_dio->middle_offset; + + iocb->iter_is_dio_aligned[n_iters] = + nfs_iov_iter_aligned_bvec(&iters[n_iters], + local_dio->mem_align-1, local_dio->offset_align-1); + + if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) + return 0; /* no DIO-aligned IO possible */ + ++n_iters; + + iocb->n_iters = n_iters; + return n_iters; +} + +static noinline_for_stack void +nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw) { struct nfs_pgio_header *hdr = iocb->hdr; struct page **pagevec = hdr->page_array.pagevec; @@ -360,26 +476,18 @@ nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int rw) } len = hdr->args.count - total; - iov_iter_bvec(i, rw, iocb->bvec, v, len); - - if (iocb->kiocb.ki_flags & IOCB_DIRECT) { - u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align; - /* Verify the IO is DIO-aligned as required */ - nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align, - &nf_dio_offset_align, - &nf_dio_read_offset_align); - if (rw == ITER_DEST) - nf_dio_offset_align = nf_dio_read_offset_align; - - if (nf_dio_mem_align && nf_dio_offset_align && - nfs_iov_iter_aligned_bvec(i, hdr->args.offset, - nf_dio_mem_align - 1, - nf_dio_offset_align - 1)) - return; /* is DIO-aligned */ + if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { + struct nfs_local_dio local_dio; - /* Fallback to using buffered for this misaligned IO */ - iocb->kiocb.ki_flags &= ~IOCB_DIRECT; + if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) && + nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) + return; /* is DIO-aligned */ } + + /* Use buffered IO */ + iocb->offset[0] = hdr->args.offset; + iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len); + iocb->n_iters = 1; } static void @@ -402,10 +510,12 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr, static void nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status) { + /* Must handle partial completions */ if (status >= 0) { - hdr->res.count = status; - hdr->res.op_status = NFS4_OK; - hdr->task.tk_status = 0; + hdr->res.count += status; + /* @hdr was initialized to 0 (zeroed during allocation) */ + if (hdr->task.tk_status == 0) + hdr->res.op_status = NFS4_OK; } else { hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status); hdr->task.tk_status = status; @@ -451,8 +561,6 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n"); } - nfs_local_pgio_done(hdr, status); - /* * Must clear replen otherwise NFSv3 data corruption will occur * if/when switching from LOCALIO back to using normal RPC. @@ -480,6 +588,7 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) struct nfs_local_kiocb *iocb = container_of(kiocb, struct nfs_local_kiocb, kiocb); + nfs_local_pgio_done(iocb->hdr, ret); nfs_local_read_done(iocb, ret); nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */ } @@ -494,12 +603,21 @@ static void nfs_local_call_read(struct work_struct *work) save_cred = override_creds(filp->f_cred); - if (iocb->kiocb.ki_flags & IOCB_DIRECT) { - iocb->kiocb.ki_complete = nfs_local_read_aio_complete; - iocb->aio_complete_work = nfs_local_read_aio_complete_work; - } + for (int i = 0; i < iocb->n_iters ; i++) { + if (iocb->iter_is_dio_aligned[i]) { + iocb->kiocb.ki_flags |= IOCB_DIRECT; + iocb->kiocb.ki_complete = nfs_local_read_aio_complete; + iocb->aio_complete_work = nfs_local_read_aio_complete_work; + } - status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iter); + iocb->kiocb.ki_pos = iocb->offset[i]; + status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]); + if (status != -EIOCBQUEUED) { + nfs_local_pgio_done(iocb->hdr, status); + if (iocb->hdr->task.tk_status) + break; + } + } revert_creds(save_cred); @@ -635,6 +753,7 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) } /* Handle short writes as if they are ENOSPC */ + status = hdr->res.count; if (status > 0 && status < hdr->args.count) { hdr->mds_offset += status; hdr->args.offset += status; @@ -642,11 +761,11 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) hdr->args.count -= status; nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset); status = -ENOSPC; + /* record -ENOSPC in terms of nfs_local_pgio_done */ + nfs_local_pgio_done(hdr, status); } - if (status < 0) + if (hdr->task.tk_status < 0) nfs_reset_boot_verifier(inode); - - nfs_local_pgio_done(hdr, status); } static void nfs_local_write_aio_complete_work(struct work_struct *work) @@ -663,6 +782,7 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) struct nfs_local_kiocb *iocb = container_of(kiocb, struct nfs_local_kiocb, kiocb); + nfs_local_pgio_done(iocb->hdr, ret); nfs_local_write_done(iocb, ret); nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */ } @@ -679,13 +799,48 @@ static void nfs_local_call_write(struct work_struct *work) current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; save_cred = override_creds(filp->f_cred); - if (iocb->kiocb.ki_flags & IOCB_DIRECT) { - iocb->kiocb.ki_complete = nfs_local_write_aio_complete; - iocb->aio_complete_work = nfs_local_write_aio_complete_work; - } - file_start_write(filp); - status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iter); + for (int i = 0; i < iocb->n_iters ; i++) { + if (iocb->iter_is_dio_aligned[i]) { + iocb->kiocb.ki_flags |= IOCB_DIRECT; + iocb->kiocb.ki_complete = nfs_local_write_aio_complete; + iocb->aio_complete_work = nfs_local_write_aio_complete_work; + } +retry: + iocb->kiocb.ki_pos = iocb->offset[i]; + status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]); + if (status != -EIOCBQUEUED) { + if (unlikely(status >= 0 && status < iocb->iters[i].count)) { + /* partial write */ + if (i == iocb->end_iter_index) { + /* Must not account partial end, otherwise, due + * to end being issued before middle: the partial + * write accounting in nfs_local_write_done() + * would incorrectly advance hdr->args.offset + */ + status = 0; + } else { + /* Partial write at start or buffered middle, + * exit early. + */ + nfs_local_pgio_done(iocb->hdr, status); + break; + } + } else if (unlikely(status == -ENOTBLK && + (iocb->kiocb.ki_flags & IOCB_DIRECT))) { + /* VFS will return -ENOTBLK if DIO WRITE fails to + * invalidate the page cache. Retry using buffered IO. + */ + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; + iocb->kiocb.ki_complete = NULL; + iocb->aio_complete_work = NULL; + goto retry; + } + nfs_local_pgio_done(iocb->hdr, status); + if (iocb->hdr->task.tk_status) + break; + } + } file_end_write(filp); revert_creds(save_cred); @@ -754,7 +909,7 @@ nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio) iocb->hdr = hdr; iocb->localio = localio; - nfs_local_iter_init(&iocb->iter, iocb, rw); + nfs_local_iters_init(iocb, rw); return iocb; }