git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
NFSv4/flexfiles: Add support for striped layouts
author: Jonathan Curley <jcurley@purestorage.com>
Wed, 24 Sep 2025 16:20:50 +0000 (16:20 +0000)
committer: Anna Schumaker <anna.schumaker@oracle.com>
Fri, 26 Sep 2025 19:43:40 +0000 (15:43 -0400)
Updates lseg creation path to parse and add striped layouts. Enable
support for striped layouts.

Limitations:

1. All mirrors must have the same number of stripes.

Signed-off-by: Jonathan Curley <jcurley@purestorage.com>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/flexfilelayout/flexfilelayout.h

index 45c4efd3a7acf2f05e187a6180fd6dbe86cfcea7..df01d2876b68b7ade60d34d9f6bf0921f3a459d9 100644 (file)
@@ -177,18 +177,19 @@ ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, u32 dss_id,
 #endif
 }
 
-static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
-               const struct nfs4_ff_layout_mirror *m2)
+static bool ff_dss_match_fh(const struct nfs4_ff_layout_ds_stripe *dss1,
+               const struct nfs4_ff_layout_ds_stripe *dss2)
 {
        int i, j;
 
-       if (m1->dss[0].fh_versions_cnt != m2->dss[0].fh_versions_cnt)
+       if (dss1->fh_versions_cnt != dss2->fh_versions_cnt)
                return false;
-       for (i = 0; i < m1->dss[0].fh_versions_cnt; i++) {
+
+       for (i = 0; i < dss1->fh_versions_cnt; i++) {
                bool found_fh = false;
-               for (j = 0; j < m2->dss[0].fh_versions_cnt; j++) {
-                       if (nfs_compare_fh(&m1->dss[0].fh_versions[i],
-                                       &m2->dss[0].fh_versions[j]) == 0) {
+               for (j = 0; j < dss2->fh_versions_cnt; j++) {
+                       if (nfs_compare_fh(&dss1->fh_versions[i],
+                                       &dss2->fh_versions[j]) == 0) {
                                found_fh = true;
                                break;
                        }
@@ -199,6 +200,38 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
        return true;
 }
 
+static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
+               const struct nfs4_ff_layout_mirror *m2)
+{
+       u32 dss_id;
+
+       if (m1->dss_count != m2->dss_count)
+               return false;
+
+       for (dss_id = 0; dss_id < m1->dss_count; dss_id++)
+               if (!ff_dss_match_fh(&m1->dss[dss_id], &m2->dss[dss_id]))
+                       return false;
+
+       return true;
+}
+
+static bool ff_mirror_match_devid(const struct nfs4_ff_layout_mirror *m1,
+               const struct nfs4_ff_layout_mirror *m2)
+{
+       u32 dss_id;
+
+       if (m1->dss_count != m2->dss_count)
+               return false;
+
+       for (dss_id = 0; dss_id < m1->dss_count; dss_id++)
+               if (memcmp(&m1->dss[dss_id].devid,
+                          &m2->dss[dss_id].devid,
+                          sizeof(m1->dss[dss_id].devid)) != 0)
+                       return false;
+
+       return true;
+}
+
 static struct nfs4_ff_layout_mirror *
 ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
                struct nfs4_ff_layout_mirror *mirror)
@@ -209,8 +242,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
 
        spin_lock(&inode->i_lock);
        list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
-               if (memcmp(&mirror->dss[0].devid, &pos->dss[0].devid,
-                          sizeof(pos->dss[0].devid)) != 0)
+               if (!ff_mirror_match_devid(mirror, pos))
                        continue;
                if (!ff_mirror_match_fh(mirror, pos))
                        continue;
@@ -241,13 +273,15 @@ ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
 static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
 {
        struct nfs4_ff_layout_mirror *mirror;
+       u32 dss_id;
 
        mirror = kzalloc(sizeof(*mirror), gfp_flags);
        if (mirror != NULL) {
                spin_lock_init(&mirror->lock);
                refcount_set(&mirror->ref, 1);
                INIT_LIST_HEAD(&mirror->mirrors);
-               nfs_localio_file_init(&mirror->dss[0].nfl);
+               for (dss_id = 0; dss_id < mirror->dss_count; dss_id++)
+                       nfs_localio_file_init(&mirror->dss[dss_id].nfl);
        }
        return mirror;
 }
@@ -255,17 +289,19 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
 static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
 {
        const struct cred       *cred;
-       int dss_id = 0;
+       u32 dss_id;
 
        ff_layout_remove_mirror(mirror);
 
-       kfree(mirror->dss[dss_id].fh_versions);
-       nfs_close_local_fh(&mirror->dss[dss_id].nfl);
-       cred = rcu_access_pointer(mirror->dss[dss_id].ro_cred);
-       put_cred(cred);
-       cred = rcu_access_pointer(mirror->dss[dss_id].rw_cred);
-       put_cred(cred);
-       nfs4_ff_layout_put_deviceid(mirror->dss[dss_id].mirror_ds);
+       for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) {
+               kfree(mirror->dss[dss_id].fh_versions);
+               cred = rcu_access_pointer(mirror->dss[dss_id].ro_cred);
+               put_cred(cred);
+               cred = rcu_access_pointer(mirror->dss[dss_id].rw_cred);
+               put_cred(cred);
+               nfs_close_local_fh(&mirror->dss[dss_id].nfl);
+               nfs4_ff_layout_put_deviceid(mirror->dss[dss_id].mirror_ds);
+       }
 
        kfree(mirror->dss);
        kfree(mirror);
@@ -371,14 +407,24 @@ ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
                        free_me);
 }
 
+static u32 ff_mirror_efficiency_sum(const struct nfs4_ff_layout_mirror *mirror)
+{
+       u32 dss_id, sum = 0;
+
+       for (dss_id = 0; dss_id < mirror->dss_count; dss_id++)
+               sum += mirror->dss[dss_id].efficiency;
+
+       return sum;
+}
+
 static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
 {
        int i, j;
 
        for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
                for (j = i + 1; j < fls->mirror_array_cnt; j++)
-                       if (fls->mirror_array[i]->dss[0].efficiency <
-                           fls->mirror_array[j]->dss[0].efficiency)
+                       if (ff_mirror_efficiency_sum(fls->mirror_array[i]) <
+                           ff_mirror_efficiency_sum(fls->mirror_array[j]))
                                swap(fls->mirror_array[i],
                                     fls->mirror_array[j]);
        }
@@ -398,6 +444,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
        u32 mirror_array_cnt;
        __be32 *p;
        int i, rc;
+       struct nfs4_ff_layout_ds_stripe *dss_info;
 
        dprintk("--> %s\n", __func__);
        scratch = folio_alloc(gfp_flags, 0);
@@ -440,17 +487,24 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                kuid_t uid;
                kgid_t gid;
                u32 fh_count, id;
-               int j, dss_id = 0;
+               int j, dss_id;
 
                rc = -EIO;
                p = xdr_inline_decode(&stream, 4);
                if (!p)
                        goto out_err_free;
 
-               dss_count = be32_to_cpup(p);
+               // Ensure all mirrors have same stripe count.
+               if (dss_count == 0)
+                       dss_count = be32_to_cpup(p);
+               else if (dss_count != be32_to_cpup(p))
+                       goto out_err_free;
+
+               if (dss_count > NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT ||
+                   dss_count == 0)
+                       goto out_err_free;
 
-               /* FIXME: allow for striping? */
-               if (dss_count != 1)
+               if (dss_count > 1 && stripe_unit == 0)
                        goto out_err_free;
 
                fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
@@ -464,91 +518,100 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                    kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe),
                            gfp_flags);
 
-               /* deviceid */
-               rc = decode_deviceid(&stream, &fls->mirror_array[i]->dss[dss_id].devid);
-               if (rc)
-                       goto out_err_free;
+               for (dss_id = 0; dss_id < dss_count; dss_id++) {
+                       dss_info = &fls->mirror_array[i]->dss[dss_id];
+                       dss_info->mirror = fls->mirror_array[i];
 
-               /* efficiency */
-               rc = -EIO;
-               p = xdr_inline_decode(&stream, 4);
-               if (!p)
-                       goto out_err_free;
-               fls->mirror_array[i]->dss[dss_id].efficiency = be32_to_cpup(p);
+                       /* deviceid */
+                       rc = decode_deviceid(&stream, &dss_info->devid);
+                       if (rc)
+                               goto out_err_free;
 
-               /* stateid */
-               rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->dss[dss_id].stateid);
-               if (rc)
-                       goto out_err_free;
+                       /* efficiency */
+                       rc = -EIO;
+                       p = xdr_inline_decode(&stream, 4);
+                       if (!p)
+                               goto out_err_free;
+                       dss_info->efficiency = be32_to_cpup(p);
 
-               /* fh */
-               rc = -EIO;
-               p = xdr_inline_decode(&stream, 4);
-               if (!p)
-                       goto out_err_free;
-               fh_count = be32_to_cpup(p);
+                       /* stateid */
+                       rc = decode_pnfs_stateid(&stream, &dss_info->stateid);
+                       if (rc)
+                               goto out_err_free;
 
-               fls->mirror_array[i]->dss[dss_id].fh_versions =
-                   kcalloc(fh_count, sizeof(struct nfs_fh),
-                           gfp_flags);
-               if (fls->mirror_array[i]->dss[dss_id].fh_versions == NULL) {
-                       rc = -ENOMEM;
-                       goto out_err_free;
-               }
+                       /* fh */
+                       rc = -EIO;
+                       p = xdr_inline_decode(&stream, 4);
+                       if (!p)
+                               goto out_err_free;
+                       fh_count = be32_to_cpup(p);
 
-               for (j = 0; j < fh_count; j++) {
-                       rc = decode_nfs_fh(&stream,
-                                          &fls->mirror_array[i]->dss[dss_id].fh_versions[j]);
+                       dss_info->fh_versions =
+                           kcalloc(fh_count, sizeof(struct nfs_fh),
+                                   gfp_flags);
+                       if (dss_info->fh_versions == NULL) {
+                               rc = -ENOMEM;
+                               goto out_err_free;
+                       }
+
+                       for (j = 0; j < fh_count; j++) {
+                               rc = decode_nfs_fh(&stream,
+                                                  &dss_info->fh_versions[j]);
+                               if (rc)
+                                       goto out_err_free;
+                       }
+
+                       dss_info->fh_versions_cnt = fh_count;
+
+                       /* user */
+                       rc = decode_name(&stream, &id);
                        if (rc)
                                goto out_err_free;
-               }
 
-               fls->mirror_array[i]->dss[dss_id].fh_versions_cnt = fh_count;
+                       uid = make_kuid(&init_user_ns, id);
 
-               /* user */
-               rc = decode_name(&stream, &id);
-               if (rc)
-                       goto out_err_free;
+                       /* group */
+                       rc = decode_name(&stream, &id);
+                       if (rc)
+                               goto out_err_free;
 
-               uid = make_kuid(&init_user_ns, id);
+                       gid = make_kgid(&init_user_ns, id);
 
-               /* group */
-               rc = decode_name(&stream, &id);
-               if (rc)
-                       goto out_err_free;
+                       if (gfp_flags & __GFP_FS)
+                               kcred = prepare_kernel_cred(&init_task);
+                       else {
+                               unsigned int nofs_flags = memalloc_nofs_save();
 
-               gid = make_kgid(&init_user_ns, id);
+                               kcred = prepare_kernel_cred(&init_task);
+                               memalloc_nofs_restore(nofs_flags);
+                       }
+                       rc = -ENOMEM;
+                       if (!kcred)
+                               goto out_err_free;
+                       kcred->fsuid = uid;
+                       kcred->fsgid = gid;
+                       cred = RCU_INITIALIZER(kcred);
 
-               if (gfp_flags & __GFP_FS)
-                       kcred = prepare_kernel_cred(&init_task);
-               else {
-                       unsigned int nofs_flags = memalloc_nofs_save();
-                       kcred = prepare_kernel_cred(&init_task);
-                       memalloc_nofs_restore(nofs_flags);
+                       if (lgr->range.iomode == IOMODE_READ)
+                               rcu_assign_pointer(dss_info->ro_cred, cred);
+                       else
+                               rcu_assign_pointer(dss_info->rw_cred, cred);
                }
-               rc = -ENOMEM;
-               if (!kcred)
-                       goto out_err_free;
-               kcred->fsuid = uid;
-               kcred->fsgid = gid;
-               cred = RCU_INITIALIZER(kcred);
-
-               if (lgr->range.iomode == IOMODE_READ)
-                       rcu_assign_pointer(fls->mirror_array[i]->dss[dss_id].ro_cred, cred);
-               else
-                       rcu_assign_pointer(fls->mirror_array[i]->dss[dss_id].rw_cred, cred);
 
                mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
                if (mirror != fls->mirror_array[i]) {
-                       /* swap cred ptrs so free_mirror will clean up old */
-                       if (lgr->range.iomode == IOMODE_READ) {
-                               cred = xchg(&mirror->dss[dss_id].ro_cred,
-                                           fls->mirror_array[i]->dss[dss_id].ro_cred);
-                               rcu_assign_pointer(fls->mirror_array[i]->dss[dss_id].ro_cred, cred);
-                       } else {
-                               cred = xchg(&mirror->dss[dss_id].rw_cred,
-                                           fls->mirror_array[i]->dss[dss_id].rw_cred);
-                               rcu_assign_pointer(fls->mirror_array[i]->dss[dss_id].rw_cred, cred);
+                       for (dss_id = 0; dss_id < dss_count; dss_id++) {
+                               dss_info = &fls->mirror_array[i]->dss[dss_id];
+                               /* swap cred ptrs so free_mirror will clean up old */
+                               if (lgr->range.iomode == IOMODE_READ) {
+                                       cred = xchg(&mirror->dss[dss_id].ro_cred,
+                                                   dss_info->ro_cred);
+                                       rcu_assign_pointer(dss_info->ro_cred, cred);
+                               } else {
+                                       cred = xchg(&mirror->dss[dss_id].rw_cred,
+                                                   dss_info->rw_cred);
+                                       rcu_assign_pointer(dss_info->rw_cred, cred);
+                               }
                        }
                        ff_layout_free_mirror(fls->mirror_array[i]);
                        fls->mirror_array[i] = mirror;
index 142324d6d5c591c1354751608716919577fdbe3c..17a008c8e97ce97df3e9c06d6562efc3d18691db 100644 (file)
@@ -21,6 +21,8 @@
  * due to network error etc. */
 #define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
 
+#define NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT 4096
+
 /* LAYOUTSTATS report interval in ms */
 #define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L)
 #define FF_LAYOUTSTATS_MAXDEV 4