git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
nfs/localio: add proper O_DIRECT support for READ and WRITE
author: Mike Snitzer <snitzer@kernel.org>
Fri, 19 Sep 2025 14:36:29 +0000 (10:36 -0400)
committer: Anna Schumaker <anna.schumaker@oracle.com>
Tue, 30 Sep 2025 20:10:30 +0000 (16:10 -0400)
Because the NFS client will already happily handle misaligned O_DIRECT
IO (by sending it out to NFSD via RPC), this commit's new capabilities
are for the benefit of LOCALIO.

LOCALIO will make best effort to transform misaligned IO to
DIO-aligned extents when possible.

LOCALIO's misaligned READ and WRITE DIO will be split into as many as
3 component IOs (@start, @middle and @end) as needed -- if and only if
the @middle extent is verified to be DIO-aligned, in which case the
@start and/or @end extents are the misaligned remainders (each being a
partial page). Otherwise, if the @middle isn't DIO-aligned, the code
will fall back to issuing only a single contiguous buffered IO.

The @middle is only DIO-aligned if both the memory and on-disk offsets
for the IO are aligned relative to the underlying local filesystem's
block device limits (@dma_alignment and @logical_block_size
respectively).

The misaligned @start and/or @end extents are issued using buffered IO
and the DIO-aligned @middle is issued using O_DIRECT. The @start and
@end IOs are issued first using buffered IO with IOCB_SYNC and then
the @middle is issued last using direct IO with async completion (AIO).
This out-of-order IO completion means that LOCALIO's IO completion
code (nfs_local_read_done and nfs_local_write_done) is only called for
the IO's last associated iov_iter completion. In the case of a
DIO-aligned @middle, that last completion arrives via AIO.
nfs_local_pgio_done() is updated to handle piece-wise partial
completion of each iov_iter.

This implementation for LOCALIO's misaligned DIO handling uses 3
iov_iter that share the same backing pages in their bio_vecs (so
unfortunately 'struct nfs_local_kiocb' has 3 instead of only 1).

[Reducing LOCALIO's per-IO (struct nfs_local_kiocb) memory use can be
explored in the future. One logical progression to improve this code,
and eliminate explicit loops over up to 3 iov_iter, is to extend
'struct iov_iter' to support iov_iter_clone() and iov_iter_chain()
interfaces that are comparable to what 'struct bio' is able to support
in the block layer. But even that wouldn't avoid the need to
allocate/use up to 3 iov_iter.]

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
fs/nfs/localio.c

index 150719d8ed8b045536f23fb68c458f9f38209881..8978e1ad4bc94c5de96ef334450908db7a1db6c5 100644 (file)
 
 #define NFSDBG_FACILITY                NFSDBG_VFS
 
+#define NFSLOCAL_MAX_IOS       3
+
 struct nfs_local_kiocb {
        struct kiocb            kiocb;
        struct bio_vec          *bvec;
        struct nfs_pgio_header  *hdr;
        struct work_struct      work;
        void (*aio_complete_work)(struct work_struct *);
-       struct iov_iter         iter ____cacheline_aligned;
        struct nfsd_file        *localio;
+       /* Begin mostly DIO-specific members */
+       size_t                  end_len;
+       short int               end_iter_index;
+       short int               n_iters;
+       bool                    iter_is_dio_aligned[NFSLOCAL_MAX_IOS];
+       loff_t                  offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned;
+       struct iov_iter         iters[NFSLOCAL_MAX_IOS];
+       /* End mostly DIO-specific members */
 };
 
 struct nfs_local_fsync_ctx {
@@ -291,7 +300,7 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
 {
        struct nfs_local_kiocb *iocb;
 
-       iocb = kmalloc(sizeof(*iocb), flags);
+       iocb = kzalloc(sizeof(*iocb), flags);
        if (iocb == NULL)
                return NULL;
 
@@ -303,25 +312,72 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
        }
 
        init_sync_kiocb(&iocb->kiocb, file);
-       if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags))
-               iocb->kiocb.ki_flags = IOCB_DIRECT;
 
-       iocb->kiocb.ki_pos = hdr->args.offset;
        iocb->hdr = hdr;
        iocb->kiocb.ki_flags &= ~IOCB_APPEND;
        iocb->aio_complete_work = NULL;
 
+       iocb->end_iter_index = -1;
+
        return iocb;
 }
 
+struct nfs_local_dio {
+       u32 mem_align;
+       u32 offset_align;
+       loff_t middle_offset;
+       loff_t end_offset;
+       ssize_t start_len;      /* Length for misaligned first extent */
+       ssize_t middle_len;     /* Length for DIO-aligned middle extent */
+       ssize_t end_len;        /* Length for misaligned last extent */
+};
+
+static bool
+nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw,
+                         size_t len, struct nfs_local_dio *local_dio)
+{
+       struct nfs_pgio_header *hdr = iocb->hdr;
+       loff_t offset = hdr->args.offset;
+       u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
+       loff_t start_end, orig_end, middle_end;
+
+       nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
+                       &nf_dio_offset_align, &nf_dio_read_offset_align);
+       if (rw == ITER_DEST)
+               nf_dio_offset_align = nf_dio_read_offset_align;
+
+       if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align))
+               return false;
+       if (unlikely(nf_dio_offset_align > PAGE_SIZE))
+               return false;
+       if (unlikely(len < nf_dio_offset_align))
+               return false;
+
+       local_dio->mem_align = nf_dio_mem_align;
+       local_dio->offset_align = nf_dio_offset_align;
+
+       start_end = round_up(offset, nf_dio_offset_align);
+       orig_end = offset + len;
+       middle_end = round_down(orig_end, nf_dio_offset_align);
+
+       local_dio->middle_offset = start_end;
+       local_dio->end_offset = middle_end;
+
+       local_dio->start_len = start_end - offset;
+       local_dio->middle_len = middle_end - start_end;
+       local_dio->end_len = orig_end - middle_end;
+
+       return true;
+}
+
 static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
-               loff_t offset, unsigned int addr_mask, unsigned int len_mask)
+               unsigned int addr_mask, unsigned int len_mask)
 {
        const struct bio_vec *bvec = i->bvec;
        size_t skip = i->iov_offset;
        size_t size = i->count;
 
-       if ((offset | size) & len_mask)
+       if (size & len_mask)
                return false;
        do {
                size_t len = bvec->bv_len;
@@ -338,8 +394,68 @@ static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
        return true;
 }
 
-static void
-nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int rw)
+/*
+ * Setup as many as 3 iov_iter based on extents described by @local_dio.
+ * Returns the number of iov_iter that were setup.
+ */
+static int
+nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
+                         unsigned int nvecs, size_t len,
+                         struct nfs_local_dio *local_dio)
+{
+       int n_iters = 0;
+       struct iov_iter *iters = iocb->iters;
+
+       /* Setup misaligned start? */
+       if (local_dio->start_len) {
+               iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+               iters[n_iters].count = local_dio->start_len;
+               iocb->offset[n_iters] = iocb->hdr->args.offset;
+               iocb->iter_is_dio_aligned[n_iters] = false;
+               ++n_iters;
+       }
+
+       /* Setup misaligned end?
+        * If so, the end is purposely setup to be issued using buffered IO
+        * before the middle (which will use DIO, if DIO-aligned, with AIO).
+        * This creates problems if/when the end results in a partial write.
+        * So must save index and length of end to handle this corner case.
+        */
+       if (local_dio->end_len) {
+               iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+               iocb->offset[n_iters] = local_dio->end_offset;
+               iov_iter_advance(&iters[n_iters],
+                       local_dio->start_len + local_dio->middle_len);
+               iocb->iter_is_dio_aligned[n_iters] = false;
+               /* Save index and length of end */
+               iocb->end_iter_index = n_iters;
+               iocb->end_len = local_dio->end_len;
+               ++n_iters;
+       }
+
+       /* Setup DIO-aligned middle to be issued last, to allow for
+        * DIO with AIO completion (see nfs_local_call_{read,write}).
+        */
+       iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+       if (local_dio->start_len)
+               iov_iter_advance(&iters[n_iters], local_dio->start_len);
+       iters[n_iters].count -= local_dio->end_len;
+       iocb->offset[n_iters] = local_dio->middle_offset;
+
+       iocb->iter_is_dio_aligned[n_iters] =
+               nfs_iov_iter_aligned_bvec(&iters[n_iters],
+                       local_dio->mem_align-1, local_dio->offset_align-1);
+
+       if (unlikely(!iocb->iter_is_dio_aligned[n_iters]))
+               return 0; /* no DIO-aligned IO possible */
+       ++n_iters;
+
+       iocb->n_iters = n_iters;
+       return n_iters;
+}
+
+static noinline_for_stack void
+nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw)
 {
        struct nfs_pgio_header *hdr = iocb->hdr;
        struct page **pagevec = hdr->page_array.pagevec;
@@ -360,26 +476,18 @@ nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int rw)
        }
        len = hdr->args.count - total;
 
-       iov_iter_bvec(i, rw, iocb->bvec, v, len);
-
-       if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
-               u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
-               /* Verify the IO is DIO-aligned as required */
-               nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
-                                               &nf_dio_offset_align,
-                                               &nf_dio_read_offset_align);
-               if (rw == ITER_DEST)
-                       nf_dio_offset_align = nf_dio_read_offset_align;
-
-               if (nf_dio_mem_align && nf_dio_offset_align &&
-                   nfs_iov_iter_aligned_bvec(i, hdr->args.offset,
-                                             nf_dio_mem_align - 1,
-                                             nf_dio_offset_align - 1))
-                       return; /* is DIO-aligned */
+       if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) {
+               struct nfs_local_dio local_dio;
 
-               /* Fallback to using buffered for this misaligned IO */
-               iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
+               if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) &&
+                   nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0)
+                       return; /* is DIO-aligned */
        }
+
+       /* Use buffered IO */
+       iocb->offset[0] = hdr->args.offset;
+       iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len);
+       iocb->n_iters = 1;
 }
 
 static void
@@ -402,10 +510,12 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr,
 static void
 nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
 {
+       /* Must handle partial completions */
        if (status >= 0) {
-               hdr->res.count = status;
-               hdr->res.op_status = NFS4_OK;
-               hdr->task.tk_status = 0;
+               hdr->res.count += status;
+               /* @hdr was initialized to 0 (zeroed during allocation) */
+               if (hdr->task.tk_status == 0)
+                       hdr->res.op_status = NFS4_OK;
        } else {
                hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status);
                hdr->task.tk_status = status;
@@ -451,8 +561,6 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
                pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n");
        }
 
-       nfs_local_pgio_done(hdr, status);
-
        /*
         * Must clear replen otherwise NFSv3 data corruption will occur
         * if/when switching from LOCALIO back to using normal RPC.
@@ -480,6 +588,7 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret)
        struct nfs_local_kiocb *iocb =
                container_of(kiocb, struct nfs_local_kiocb, kiocb);
 
+       nfs_local_pgio_done(iocb->hdr, ret);
        nfs_local_read_done(iocb, ret);
        nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */
 }
@@ -494,12 +603,21 @@ static void nfs_local_call_read(struct work_struct *work)
 
        save_cred = override_creds(filp->f_cred);
 
-       if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
-               iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
-               iocb->aio_complete_work = nfs_local_read_aio_complete_work;
-       }
+       for (int i = 0; i < iocb->n_iters ; i++) {
+               if (iocb->iter_is_dio_aligned[i]) {
+                       iocb->kiocb.ki_flags |= IOCB_DIRECT;
+                       iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
+                       iocb->aio_complete_work = nfs_local_read_aio_complete_work;
+               }
 
-       status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iter);
+               iocb->kiocb.ki_pos = iocb->offset[i];
+               status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]);
+               if (status != -EIOCBQUEUED) {
+                       nfs_local_pgio_done(iocb->hdr, status);
+                       if (iocb->hdr->task.tk_status)
+                               break;
+               }
+       }
 
        revert_creds(save_cred);
 
@@ -635,6 +753,7 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
        }
 
        /* Handle short writes as if they are ENOSPC */
+       status = hdr->res.count;
        if (status > 0 && status < hdr->args.count) {
                hdr->mds_offset += status;
                hdr->args.offset += status;
@@ -642,11 +761,11 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
                hdr->args.count -= status;
                nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset);
                status = -ENOSPC;
+               /* record -ENOSPC in terms of nfs_local_pgio_done */
+               nfs_local_pgio_done(hdr, status);
        }
-       if (status < 0)
+       if (hdr->task.tk_status < 0)
                nfs_reset_boot_verifier(inode);
-
-       nfs_local_pgio_done(hdr, status);
 }
 
 static void nfs_local_write_aio_complete_work(struct work_struct *work)
@@ -663,6 +782,7 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret)
        struct nfs_local_kiocb *iocb =
                container_of(kiocb, struct nfs_local_kiocb, kiocb);
 
+       nfs_local_pgio_done(iocb->hdr, ret);
        nfs_local_write_done(iocb, ret);
        nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */
 }
@@ -679,13 +799,48 @@ static void nfs_local_call_write(struct work_struct *work)
        current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
        save_cred = override_creds(filp->f_cred);
 
-       if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
-               iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
-               iocb->aio_complete_work = nfs_local_write_aio_complete_work;
-       }
-
        file_start_write(filp);
-       status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iter);
+       for (int i = 0; i < iocb->n_iters ; i++) {
+               if (iocb->iter_is_dio_aligned[i]) {
+                       iocb->kiocb.ki_flags |= IOCB_DIRECT;
+                       iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
+                       iocb->aio_complete_work = nfs_local_write_aio_complete_work;
+               }
+retry:
+               iocb->kiocb.ki_pos = iocb->offset[i];
+               status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]);
+               if (status != -EIOCBQUEUED) {
+                       if (unlikely(status >= 0 && status < iocb->iters[i].count)) {
+                               /* partial write */
+                               if (i == iocb->end_iter_index) {
+                                       /* Must not account partial end, otherwise, due
+                                        * to end being issued before middle: the partial
+                                        * write accounting in nfs_local_write_done()
+                                        * would incorrectly advance hdr->args.offset
+                                        */
+                                       status = 0;
+                               } else {
+                                       /* Partial write at start or buffered middle,
+                                        * exit early.
+                                        */
+                                       nfs_local_pgio_done(iocb->hdr, status);
+                                       break;
+                               }
+                       } else if (unlikely(status == -ENOTBLK &&
+                                           (iocb->kiocb.ki_flags & IOCB_DIRECT))) {
+                               /* VFS will return -ENOTBLK if DIO WRITE fails to
+                                * invalidate the page cache. Retry using buffered IO.
+                                */
+                               iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
+                               iocb->kiocb.ki_complete = NULL;
+                               iocb->aio_complete_work = NULL;
+                               goto retry;
+                       }
+                       nfs_local_pgio_done(iocb->hdr, status);
+                       if (iocb->hdr->task.tk_status)
+                               break;
+               }
+       }
        file_end_write(filp);
 
        revert_creds(save_cred);
@@ -754,7 +909,7 @@ nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio)
        iocb->hdr = hdr;
        iocb->localio = localio;
 
-       nfs_local_iter_init(&iocb->iter, iocb, rw);
+       nfs_local_iters_init(iocb, rw);
 
        return iocb;
 }