]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
NFSD: Implement NFSD_IO_DIRECT for NFS WRITE
authorMike Snitzer <snitzer@kernel.org>
Tue, 11 Nov 2025 14:59:31 +0000 (09:59 -0500)
committerChuck Lever <chuck.lever@oracle.com>
Mon, 1 Dec 2025 14:57:10 +0000 (09:57 -0500)
When NFSD_IO_DIRECT is selected via the
/sys/kernel/debug/nfsd/io_cache_write experimental tunable, split
incoming unaligned NFS WRITE requests into a prefix, middle and
suffix segment, as needed. The middle segment is now DIO-aligned and
the prefix and/or suffix are unaligned. Synchronous buffered IO is
used for the unaligned segments, and IOCB_DIRECT is used for the
middle DIO-aligned extent.

Although IOCB_DIRECT avoids the use of the page cache, by itself it
doesn't guarantee data durability. For UNSTABLE WRITE requests,
durability is obtained by a subsequent NFS COMMIT request.

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Co-developed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
fs/nfsd/debugfs.c
fs/nfsd/trace.h
fs/nfsd/vfs.c

index 00eb1ecef6ac1802000d5195a0f3f65bafe61e02..7f44689e0a53edbfd6ade9dda6af4052976a65d3 100644 (file)
@@ -108,6 +108,7 @@ static int nfsd_io_cache_write_set(void *data, u64 val)
        switch (val) {
        case NFSD_IO_BUFFERED:
        case NFSD_IO_DONTCACHE:
+       case NFSD_IO_DIRECT:
                nfsd_io_cache_write = val;
                break;
        default:
index 85a1521ad7574d4bc5eed567ad9955d0b245847b..5ae2a611e57f4b4e51a4d9eb6e0fccb66ad8d288 100644 (file)
@@ -469,6 +469,8 @@ DEFINE_NFSD_IO_EVENT(read_io_done);
 DEFINE_NFSD_IO_EVENT(read_done);
 DEFINE_NFSD_IO_EVENT(write_start);
 DEFINE_NFSD_IO_EVENT(write_opened);
+DEFINE_NFSD_IO_EVENT(write_direct);
+DEFINE_NFSD_IO_EVENT(write_vector);
 DEFINE_NFSD_IO_EVENT(write_io_done);
 DEFINE_NFSD_IO_EVENT(write_done);
 DEFINE_NFSD_IO_EVENT(commit_start);
index 5333d49910d9aedcc80591168f68fd66875c7a80..ab46301da4ae0b68b7e14cbf1eb570175a5006ce 100644 (file)
@@ -1254,6 +1254,136 @@ static int wait_for_concurrent_writes(struct file *file)
        return err;
 }
 
+struct nfsd_write_dio_seg {
+       struct iov_iter                 iter;
+       int                             flags;
+};
+
+static unsigned long
+iov_iter_bvec_offset(const struct iov_iter *iter)
+{
+       return (unsigned long)(iter->bvec->bv_offset + iter->iov_offset);
+}
+
+static void
+nfsd_write_dio_seg_init(struct nfsd_write_dio_seg *segment,
+                       struct bio_vec *bvec, unsigned int nvecs,
+                       unsigned long total, size_t start, size_t len,
+                       struct kiocb *iocb)
+{
+       iov_iter_bvec(&segment->iter, ITER_SOURCE, bvec, nvecs, total);
+       if (start)
+               iov_iter_advance(&segment->iter, start);
+       iov_iter_truncate(&segment->iter, len);
+       segment->flags = iocb->ki_flags;
+}
+
+static unsigned int
+nfsd_write_dio_iters_init(struct nfsd_file *nf, struct bio_vec *bvec,
+                         unsigned int nvecs, struct kiocb *iocb,
+                         unsigned long total,
+                         struct nfsd_write_dio_seg segments[3])
+{
+       u32 offset_align = nf->nf_dio_offset_align;
+       loff_t prefix_end, orig_end, middle_end;
+       u32 mem_align = nf->nf_dio_mem_align;
+       size_t prefix, middle, suffix;
+       loff_t offset = iocb->ki_pos;
+       unsigned int nsegs = 0;
+
+       /*
+        * Check if direct I/O is feasible for this write request.
+        * If alignments are not available, the write is too small,
+        * or no alignment can be found, fall back to buffered I/O.
+        */
+       if (unlikely(!mem_align || !offset_align) ||
+           unlikely(total < max(offset_align, mem_align)))
+               goto no_dio;
+
+       prefix_end = round_up(offset, offset_align);
+       orig_end = offset + total;
+       middle_end = round_down(orig_end, offset_align);
+
+       prefix = prefix_end - offset;
+       middle = middle_end - prefix_end;
+       suffix = orig_end - middle_end;
+
+       if (!middle)
+               goto no_dio;
+
+       if (prefix)
+               nfsd_write_dio_seg_init(&segments[nsegs++], bvec,
+                                       nvecs, total, 0, prefix, iocb);
+
+       nfsd_write_dio_seg_init(&segments[nsegs], bvec, nvecs,
+                               total, prefix, middle, iocb);
+
+       /*
+        * Check if the bvec iterator is aligned for direct I/O.
+        *
+        * bvecs generated from RPC receive buffers are contiguous: After
+        * the first bvec, all subsequent bvecs start at bv_offset zero
+        * (page-aligned). Therefore, only the first bvec is checked.
+        */
+       if (iov_iter_bvec_offset(&segments[nsegs].iter) & (mem_align - 1))
+               goto no_dio;
+       segments[nsegs].flags |= IOCB_DIRECT;
+       nsegs++;
+
+       if (suffix)
+               nfsd_write_dio_seg_init(&segments[nsegs++], bvec, nvecs, total,
+                                       prefix + middle, suffix, iocb);
+
+       return nsegs;
+
+no_dio:
+       /* No DIO alignment possible - pack into single non-DIO segment. */
+       nfsd_write_dio_seg_init(&segments[0], bvec, nvecs, total, 0,
+                               total, iocb);
+       return 1;
+}
+
+static noinline_for_stack int
+nfsd_direct_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
+                 struct nfsd_file *nf, unsigned int nvecs,
+                 unsigned long *cnt, struct kiocb *kiocb)
+{
+       struct nfsd_write_dio_seg segments[3];
+       struct file *file = nf->nf_file;
+       unsigned int nsegs, i;
+       ssize_t host_err;
+
+       nsegs = nfsd_write_dio_iters_init(nf, rqstp->rq_bvec, nvecs,
+                                         kiocb, *cnt, segments);
+
+       *cnt = 0;
+       for (i = 0; i < nsegs; i++) {
+               kiocb->ki_flags = segments[i].flags;
+               if (kiocb->ki_flags & IOCB_DIRECT)
+                       trace_nfsd_write_direct(rqstp, fhp, kiocb->ki_pos,
+                                               segments[i].iter.count);
+               else {
+                       trace_nfsd_write_vector(rqstp, fhp, kiocb->ki_pos,
+                                               segments[i].iter.count);
+                       /*
+                        * Mark the I/O buffer as evict-able to reduce
+                        * memory contention.
+                        */
+                       if (nf->nf_file->f_op->fop_flags & FOP_DONTCACHE)
+                               kiocb->ki_flags |= IOCB_DONTCACHE;
+               }
+
+               host_err = vfs_iocb_iter_write(file, kiocb, &segments[i].iter);
+               if (host_err < 0)
+                       return host_err;
+               *cnt += host_err;
+               if (host_err < segments[i].iter.count)
+                       break;  /* partial write */
+       }
+
+       return 0;
+}
+
 /**
  * nfsd_vfs_write - write data to an already-open file
  * @rqstp: RPC execution context
@@ -1328,25 +1458,32 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
        }
 
        nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
-       iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
+
        since = READ_ONCE(file->f_wb_err);
        if (verf)
                nfsd_copy_write_verifier(verf, nn);
 
        switch (nfsd_io_cache_write) {
-       case NFSD_IO_BUFFERED:
+       case NFSD_IO_DIRECT:
+               host_err = nfsd_direct_write(rqstp, fhp, nf, nvecs,
+                                            cnt, &kiocb);
                break;
        case NFSD_IO_DONTCACHE:
                if (file->f_op->fop_flags & FOP_DONTCACHE)
                        kiocb.ki_flags |= IOCB_DONTCACHE;
+               fallthrough;
+       case NFSD_IO_BUFFERED:
+               iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
+               host_err = vfs_iocb_iter_write(file, &kiocb, &iter);
+               if (host_err < 0)
+                       break;
+               *cnt = host_err;
                break;
        }
-       host_err = vfs_iocb_iter_write(file, &kiocb, &iter);
        if (host_err < 0) {
                commit_reset_write_verifier(nn, rqstp, host_err);
                goto out_nfserr;
        }
-       *cnt = host_err;
        nfsd_stats_io_write_add(nn, exp, *cnt);
        fsnotify_modify(file);
        host_err = filemap_check_wb_err(file->f_mapping, since);