git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
NFSv4/flexfiles: Read path updates for striped layouts
author Jonathan Curley <jcurley@purestorage.com>
Wed, 24 Sep 2025 16:20:46 +0000 (16:20 +0000)
committer Anna Schumaker <anna.schumaker@oracle.com>
Fri, 26 Sep 2025 19:40:22 +0000 (15:40 -0400)
Update the read path to calculate the dss_id (the index of the stripe
within a mirror) from the I/O offset, and use it to direct I/O to the
appropriate stripe DS.

Signed-off-by: Jonathan Curley <jcurley@purestorage.com>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
fs/nfs/flexfilelayout/flexfilelayout.c

index ca6dc625b71cdd3985ac9958af143eab1fd62434..60daa65cd8654d804ae79ec56645b28cb7e4b420 100644 (file)
@@ -770,6 +770,7 @@ ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx)
 static struct nfs4_pnfs_ds *
 ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
                             u32 start_idx, u32 *best_idx,
+                            u32 offset, u32 *dss_id,
                             bool check_device)
 {
        struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
@@ -780,12 +781,16 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
        /* mirrors are initially sorted by efficiency */
        for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
                mirror = FF_LAYOUT_COMP(lseg, idx);
-               ds = nfs4_ff_layout_prepare_ds(lseg, mirror, 0, false);
+               *dss_id = nfs4_ff_layout_calc_dss_id(
+                       fls->stripe_unit,
+                       fls->mirror_array[idx]->dss_count,
+                       offset);
+               ds = nfs4_ff_layout_prepare_ds(lseg, mirror, *dss_id, false);
                if (IS_ERR(ds))
                        continue;
 
                if (check_device &&
-                   nfs4_test_deviceid_unavailable(&mirror->dss[0].mirror_ds->id_node)) {
+                   nfs4_test_deviceid_unavailable(&mirror->dss[*dss_id].mirror_ds->id_node)) {
                        // reinitialize the error state in case if this is the last iteration
                        ds = ERR_PTR(-EINVAL);
                        continue;
@@ -800,42 +805,52 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
 
 static struct nfs4_pnfs_ds *
 ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
-                                u32 start_idx, u32 *best_idx)
+                                u32 start_idx, u32 *best_idx,
+                                u32 offset, u32 *dss_id)
 {
-       return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
+       return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx,
+                                           offset, dss_id, false);
 }
 
 static struct nfs4_pnfs_ds *
 ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
-                                  u32 start_idx, u32 *best_idx)
+                                  u32 start_idx, u32 *best_idx,
+                                  u32 offset, u32 *dss_id)
 {
-       return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
+       return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx,
+                                           offset, dss_id, true);
 }
 
 static struct nfs4_pnfs_ds *
 ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
-                                 u32 start_idx, u32 *best_idx)
+                                 u32 start_idx, u32 *best_idx,
+                                 u32 offset, u32 *dss_id)
 {
        struct nfs4_pnfs_ds *ds;
 
-       ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
+       ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx,
+                                               offset, dss_id);
        if (!IS_ERR(ds))
                return ds;
-       return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
+       return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx,
+                                               offset, dss_id);
 }
 
 static struct nfs4_pnfs_ds *
 ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
-                         u32 *best_idx)
+                         u32 *best_idx,
+                         u32 offset,
+                         u32 *dss_id)
 {
        struct pnfs_layout_segment *lseg = pgio->pg_lseg;
        struct nfs4_pnfs_ds *ds;
 
        ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
-                                              best_idx);
+                                              best_idx, offset, dss_id);
        if (!IS_ERR(ds) || !pgio->pg_mirror_idx)
                return ds;
-       return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
+       return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx,
+                                                offset, dss_id);
 }
 
 static void
@@ -854,6 +869,56 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
        }
 }
 
+static bool
+ff_layout_lseg_is_striped(const struct nfs4_ff_layout_segment *fls)
+{
+       return fls->mirror_array[0]->dss_count > 1;
+}
+
+/*
+ * ff_layout_pg_test(). Called by nfs_can_coalesce_requests()
+ *
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
+ff_layout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+                 struct nfs_page *req)
+{
+       unsigned int size;
+       u64 p_stripe, r_stripe;
+       u32 stripe_offset;
+       u64 segment_offset = pgio->pg_lseg->pls_range.offset;
+       u32 stripe_unit = FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
+
+       /* calls nfs_generic_pg_test */
+       size = pnfs_generic_pg_test(pgio, prev, req);
+       if (!size)
+               return 0;
+       else if (!ff_layout_lseg_is_striped(FF_LAYOUT_LSEG(pgio->pg_lseg)))
+               return size;
+
+       /* see if req and prev are in the same stripe */
+       if (prev) {
+               p_stripe = (u64)req_offset(prev) - segment_offset;
+               r_stripe = (u64)req_offset(req) - segment_offset;
+               do_div(p_stripe, stripe_unit);
+               do_div(r_stripe, stripe_unit);
+
+               if (p_stripe != r_stripe)
+                       return 0;
+       }
+
+       /* calculate remaining bytes in the current stripe */
+       div_u64_rem((u64)req_offset(req) - segment_offset,
+                       stripe_unit,
+                       &stripe_offset);
+       WARN_ON_ONCE(stripe_offset > stripe_unit);
+       if (stripe_offset >= stripe_unit)
+               return 0;
+       return min(stripe_unit - (unsigned int)stripe_offset, size);
+}
+
 static void
 ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
                        struct nfs_page *req)
@@ -861,7 +926,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
        struct nfs_pgio_mirror *pgm;
        struct nfs4_ff_layout_mirror *mirror;
        struct nfs4_pnfs_ds *ds;
-       u32 ds_idx;
+       u32 ds_idx, dss_id;
 
        if (NFS_SERVER(pgio->pg_inode)->flags &
                        (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
@@ -882,7 +947,8 @@ retry:
        /* Reset wb_nio, since getting layout segment was successful */
        req->wb_nio = 0;
 
-       ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
+       ds = ff_layout_get_ds_for_read(pgio, &ds_idx,
+                                      req_offset(req), &dss_id);
        if (IS_ERR(ds)) {
                if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
                        goto out_mds;
@@ -894,7 +960,7 @@ retry:
 
        mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
        pgm = &pgio->pg_mirrors[0];
-       pgm->pg_bsize = mirror->dss[0].mirror_ds->ds_versions[0].rsize;
+       pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize;
 
        pgio->pg_mirror_idx = ds_idx;
        return;
@@ -1032,7 +1098,7 @@ ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx)
 
 static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
        .pg_init = ff_layout_pg_init_read,
-       .pg_test = pnfs_generic_pg_test,
+       .pg_test = ff_layout_pg_test,
        .pg_doio = pnfs_generic_pg_readpages,
        .pg_cleanup = pnfs_generic_pg_cleanup,
 };
@@ -1087,9 +1153,11 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
 {
        u32 idx = hdr->pgio_mirror_idx + 1;
        u32 new_idx = 0;
+       u32 dss_id = 0;
        struct nfs4_pnfs_ds *ds;
 
-       ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx);
+       ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx,
+                                             hdr->args.offset, &dss_id);
        if (IS_ERR(ds))
                pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
        else
@@ -1884,6 +1952,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
        u32 idx = hdr->pgio_mirror_idx;
        int vers;
        struct nfs_fh *fh;
+       u32 dss_id;
        bool ds_fatal_error = false;
 
        dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
@@ -1891,22 +1960,26 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
                hdr->args.pgbase, (size_t)hdr->args.count, offset);
 
        mirror = FF_LAYOUT_COMP(lseg, idx);
-       ds = nfs4_ff_layout_prepare_ds(lseg, mirror, 0, false);
+       dss_id = nfs4_ff_layout_calc_dss_id(
+               FF_LAYOUT_LSEG(lseg)->stripe_unit,
+               mirror->dss_count,
+               offset);
+       ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, false);
        if (IS_ERR(ds)) {
                ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds));
                goto out_failed;
        }
 
        ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
-                                                  hdr->inode, 0);
+                                                  hdr->inode, dss_id);
        if (IS_ERR(ds_clnt))
                goto out_failed;
 
-       ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, 0);
+       ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id);
        if (!ds_cred)
                goto out_failed;
 
-       vers = nfs4_ff_layout_ds_version(mirror, 0);
+       vers = nfs4_ff_layout_ds_version(mirror, dss_id);
 
        dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
                ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers);
@@ -1914,11 +1987,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
        hdr->pgio_done_cb = ff_layout_read_done_cb;
        refcount_inc(&ds->ds_clp->cl_count);
        hdr->ds_clp = ds->ds_clp;
-       fh = nfs4_ff_layout_select_ds_fh(mirror, 0);
+       fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id);
        if (fh)
                hdr->args.fh = fh;
 
-       nfs4_ff_layout_select_ds_stateid(mirror, 0, &hdr->args.stateid);
+       nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid);
 
        /*
         * Note that if we ever decide to split across DSes,
@@ -1928,7 +2001,8 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
        hdr->mds_offset = offset;
 
        /* Start IO accounting for local read */
-       localio = ff_local_open_fh(lseg, idx, 0, ds->ds_clp, ds_cred, fh, FMODE_READ);
+       localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh,
+                               FMODE_READ);
        if (localio) {
                hdr->task.tk_start = ktime_get();
                ff_layout_read_record_layoutstats_start(&hdr->task, hdr);