xfs_scrub: balance inode chunk scan across CPUs
author     Darrick J. Wong <djwong@kernel.org>
           Wed, 18 May 2022 02:48:13 +0000 (22:48 -0400)
committer  Eric Sandeen <sandeen@sandeen.net>
           Wed, 18 May 2022 02:48:13 +0000 (22:48 -0400)
Use the bounded workqueue functionality to spread the inode chunk scan
load across the CPUs more evenly.  First, we create per-AG workers to
walk each AG's inode btree to create inode batch work items for each
inobt record.  These items are added to a second, bounded workqueue
that calls BULKSTAT and invokes the caller's function on each
bulkstat record.

By splitting the work into batches of 64 inodes instead of one thread
per AG, we keep parallelism reasonably high almost all the way to the
end of the inode scan, even when inodes are not evenly divided across
AGs or a few files have far more extent records than average.
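
In miniature, the scheme is a producer/consumer fan-out: per-AG
INUMBERS workers feed a bounded BULKSTAT workqueue, and the queue
bound is what throttles the producers.  Below is a minimal sketch of
that pattern using the libfrog workqueue calls that appear in the
diff; it is not part of the patch, and the demo_* names, the 64-byte
payload, and the 256-items-per-thread bound are illustrative
assumptions.

/* Sketch: two-stage fan-out with a bounded consumer workqueue. */
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include "libfrog/workqueue.h"

struct xfs_mount;

struct demo {
	struct workqueue	consumer;	/* bounded, stage 2 */
	bool			aborted;
};

/* Stage 2 worker: consume (and free) one queued batch item. */
static void
demo_chunk(struct workqueue *wq, uint32_t agno, void *arg)
{
	free(arg);
}

/* Stage 1 worker, one per AG: produce batch items for stage 2. */
static void
demo_ag(struct workqueue *wq, uint32_t agno, void *arg)
{
	struct demo	*d = arg;
	void		*item = calloc(1, 64);

	/* workqueue_add blocks while the bounded queue is full. */
	if (!item || workqueue_add(&d->consumer, demo_chunk, agno, item))
		d->aborted = true;
}

static int
demo_scan(struct xfs_mount *mp, uint32_t agcount, unsigned int nr_threads)
{
	struct workqueue	producer;
	struct demo		d = { };
	uint32_t		agno;

	if (workqueue_create_bound(&d.consumer, mp, nr_threads,
				nr_threads * 256))
		return -1;
	if (workqueue_create(&producer, mp, nr_threads)) {
		d.aborted = true;
		goto kill_consumer;
	}
	for (agno = 0; agno < agcount; agno++)
		if (workqueue_add(&producer, demo_ag, agno, &d)) {
			d.aborted = true;
			break;
		}
	workqueue_terminate(&producer);		/* drain producers first... */
	workqueue_destroy(&producer);
kill_consumer:
	workqueue_terminate(&d.consumer);	/* ...then the consumers */
	workqueue_destroy(&d.consumer);
	return d.aborted ? -1 : 0;
}

Terminating the producer queue before the consumer queue guarantees
that no new chunk items can arrive while the bulkstat queue drains,
mirroring the teardown order in scrub_scan_all_inodes() below.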

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
scrub/inodes.c

index 80af8a747cdd5d2caf8e396795e883c4664d54a1..41e5fdc75f79a8fab952927d185f2ed215fd8bed 100644
@@ -16,6 +16,7 @@
 #include "xfs_scrub.h"
 #include "common.h"
 #include "inodes.h"
+#include "descr.h"
 #include "libfrog/fsgeom.h"
 #include "libfrog/bulkstat.h"
 
@@ -49,7 +50,7 @@
 static void
 bulkstat_for_inumbers(
        struct scrub_ctx        *ctx,
-       const char              *descr,
+       struct descr            *dsc,
        const struct xfs_inumbers *inumbers,
        struct xfs_bulkstat_req *breq)
 {
@@ -65,7 +66,7 @@ bulkstat_for_inumbers(
        if (error) {
                char    errbuf[DESCR_BUFSZ];
 
-               str_info(ctx, descr, "%s",
+               str_info(ctx, descr_render(dsc), "%s",
                         strerror_r(error, errbuf, DESCR_BUFSZ));
        }
 
@@ -95,61 +96,206 @@ bulkstat_for_inumbers(
 
 /* BULKSTAT wrapper routines. */
 struct scan_inodes {
+       struct workqueue        wq_bulkstat;
        scrub_inode_iter_fn     fn;
        void                    *arg;
+       unsigned int            nr_threads;
        bool                    aborted;
 };
 
 /*
- * Call into the filesystem for inode/bulkstat information and call our
- * iterator function.  We'll try to fill the bulkstat information in batches,
- * but we also can detect iget failures.
+ * A single unit of inode scan work.  This contains a pointer to the parent
+ * information, followed by an INUMBERS request structure, followed by a
+ * BULKSTAT request structure.  The last two are VLAs, so we can't represent
+ * them here.
+ */
+struct scan_ichunk {
+       struct scan_inodes      *si;
+};
+
+static inline struct xfs_inumbers_req *
+ichunk_to_inumbers(
+       struct scan_ichunk      *ichunk)
+{
+       char                    *p = (char *)ichunk;
+
+       return (struct xfs_inumbers_req *)(p + sizeof(struct scan_ichunk));
+}
+
+static inline struct xfs_bulkstat_req *
+ichunk_to_bulkstat(
+       struct scan_ichunk      *ichunk)
+{
+       char                    *p = (char *)ichunk_to_inumbers(ichunk);
+
+       return (struct xfs_bulkstat_req *)(p + XFS_INUMBERS_REQ_SIZE(1));
+}
+
+static inline int
+alloc_ichunk(
+       struct scan_inodes      *si,
+       uint32_t                agno,
+       uint64_t                startino,
+       struct scan_ichunk      **ichunkp)
+{
+       struct scan_ichunk      *ichunk;
+       struct xfs_inumbers_req *ireq;
+       struct xfs_bulkstat_req *breq;
+
+       ichunk = calloc(1, sizeof(struct scan_ichunk) +
+                          XFS_INUMBERS_REQ_SIZE(1) +
+                          XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE));
+       if (!ichunk)
+               return -errno;
+
+       ichunk->si = si;
+
+       ireq = ichunk_to_inumbers(ichunk);
+       ireq->hdr.icount = 1;
+       ireq->hdr.ino = startino;
+       ireq->hdr.agno = agno;
+       ireq->hdr.flags |= XFS_BULK_IREQ_AGNO;
+
+       breq = ichunk_to_bulkstat(ichunk);
+       breq->hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE;
+
+       *ichunkp = ichunk;
+       return 0;
+}
+
+int
+render_ino_from_bulkstat(
+       struct scrub_ctx        *ctx,
+       char                    *buf,
+       size_t                  buflen,
+       void                    *data)
+{
+       struct xfs_bulkstat     *bstat = data;
+
+       return scrub_render_ino_descr(ctx, buf, buflen, bstat->bs_ino,
+                       bstat->bs_gen, NULL);
+}
+
+static int
+render_inumbers_from_agno(
+       struct scrub_ctx        *ctx,
+       char                    *buf,
+       size_t                  buflen,
+       void                    *data)
+{
+       xfs_agnumber_t          *agno = data;
+
+       return snprintf(buf, buflen, _("dev %d:%d AG %u inodes"),
+                               major(ctx->fsinfo.fs_datadev),
+                               minor(ctx->fsinfo.fs_datadev),
+                               *agno);
+}
+
+/*
+ * Call BULKSTAT for information on a single chunk's worth of inodes and call
+ * our iterator function.  We'll try to fill the bulkstat information in
+ * batches, but we also can detect iget failures.
  */
 static void
-scan_ag_inodes(
+scan_ag_bulkstat(
        struct workqueue        *wq,
        xfs_agnumber_t          agno,
        void                    *arg)
 {
        struct xfs_handle       handle = { };
-       char                    descr[DESCR_BUFSZ];
-       struct xfs_inumbers_req *ireq;
-       struct xfs_bulkstat_req *breq;
-       struct scan_inodes      *si = arg;
        struct scrub_ctx        *ctx = (struct scrub_ctx *)wq->wq_ctx;
+       struct scan_ichunk      *ichunk = arg;
+       struct xfs_inumbers_req *ireq = ichunk_to_inumbers(ichunk);
+       struct xfs_bulkstat_req *breq = ichunk_to_bulkstat(ichunk);
+       struct scan_inodes      *si = ichunk->si;
        struct xfs_bulkstat     *bs;
-       struct xfs_inumbers     *inumbers;
-       uint64_t                nextino = cvt_agino_to_ino(&ctx->mnt, agno, 0);
+       struct xfs_inumbers     *inumbers = &ireq->inumbers[0];
        int                     i;
        int                     error;
        int                     stale_count = 0;
+       DEFINE_DESCR(dsc_bulkstat, ctx, render_ino_from_bulkstat);
+       DEFINE_DESCR(dsc_inumbers, ctx, render_inumbers_from_agno);
 
-       snprintf(descr, DESCR_BUFSZ, _("dev %d:%d AG %u inodes"),
-                               major(ctx->fsinfo.fs_datadev),
-                               minor(ctx->fsinfo.fs_datadev),
-                               agno);
+       descr_set(&dsc_inumbers, &agno);
 
        memcpy(&handle.ha_fsid, ctx->fshandle, sizeof(handle.ha_fsid));
        handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
                        sizeof(handle.ha_fid.fid_len);
        handle.ha_fid.fid_pad = 0;
 
-       error = -xfrog_bulkstat_alloc_req(LIBFROG_BULKSTAT_CHUNKSIZE, 0, &breq);
-       if (error) {
-               str_liberror(ctx, error, descr);
-               si->aborted = true;
-               return;
+retry:
+       bulkstat_for_inumbers(ctx, &dsc_inumbers, inumbers, breq);
+
+       /* Iterate all the inodes. */
+       bs = &breq->bulkstat[0];
+       for (i = 0; !si->aborted && i < inumbers->xi_alloccount; i++, bs++) {
+               descr_set(&dsc_bulkstat, bs);
+               handle.ha_fid.fid_ino = bs->bs_ino;
+               handle.ha_fid.fid_gen = bs->bs_gen;
+               error = si->fn(ctx, &handle, bs, si->arg);
+               switch (error) {
+               case 0:
+                       break;
+               case ESTALE: {
+                       stale_count++;
+                       if (stale_count < 30) {
+                               ireq->hdr.ino = inumbers->xi_startino;
+                               error = -xfrog_inumbers(&ctx->mnt, ireq);
+                               if (error)
+                                       goto err;
+                               goto retry;
+                       }
+                       str_info(ctx, descr_render(&dsc_bulkstat),
+_("Changed too many times during scan; giving up."));
+                       si->aborted = true;
+                       goto out;
+               }
+               case ECANCELED:
+                       error = 0;
+                       fallthrough;
+               default:
+                       goto err;
+               }
+               if (scrub_excessive_errors(ctx)) {
+                       si->aborted = true;
+                       goto out;
+               }
        }
 
-       error = -xfrog_inumbers_alloc_req(1, 0, &ireq);
+err:
        if (error) {
-               str_liberror(ctx, error, descr);
-               free(breq);
+               str_liberror(ctx, error, descr_render(&dsc_bulkstat));
                si->aborted = true;
-               return;
        }
-       inumbers = &ireq->inumbers[0];
-       xfrog_inumbers_set_ag(ireq, agno);
+out:
+       free(ichunk);
+}
+
+/*
+ * Call INUMBERS for information about inode chunks, then queue the inumbers
+ * responses in the bulkstat workqueue.  This helps us maximize CPU parallelism
+ * if the filesystem AGs are not evenly loaded.
+ */
+static void
+scan_ag_inumbers(
+       struct workqueue        *wq,
+       xfs_agnumber_t          agno,
+       void                    *arg)
+{
+       struct scan_ichunk      *ichunk = NULL;
+       struct scan_inodes      *si = arg;
+       struct scrub_ctx        *ctx = (struct scrub_ctx *)wq->wq_ctx;
+       struct xfs_inumbers_req *ireq;
+       uint64_t                nextino = cvt_agino_to_ino(&ctx->mnt, agno, 0);
+       int                     error;
+       DEFINE_DESCR(dsc, ctx, render_inumbers_from_agno);
+
+       descr_set(&dsc, &agno);
+
+       error = alloc_ichunk(si, agno, 0, &ichunk);
+       if (error)
+               goto err;
+       ireq = ichunk_to_inumbers(ichunk);
 
        /* Find the inode chunk & alloc mask */
        error = -xfrog_inumbers(&ctx->mnt, ireq);
@@ -158,8 +304,8 @@ scan_ag_inodes(
                 * Make sure that we always make forward progress while we
                 * scan the inode btree.
                 */
-               if (nextino > inumbers->xi_startino) {
-                       str_corrupt(ctx, descr,
+               if (nextino > ireq->inumbers[0].xi_startino) {
+                       str_corrupt(ctx, descr_render(&dsc),
        _("AG %u inode btree is corrupt near agino %lu, got %lu"), agno,
                                cvt_ino_to_agino(&ctx->mnt, nextino),
                                cvt_ino_to_agino(&ctx->mnt,
@@ -169,64 +315,53 @@ scan_ag_inodes(
                }
                nextino = ireq->hdr.ino;
 
-               /*
-                * We can have totally empty inode chunks on filesystems where
-                * there are more than 64 inodes per block.  Skip these.
-                */
-               if (inumbers->xi_alloccount == 0)
-                       goto igrp_retry;
-
-               bulkstat_for_inumbers(ctx, descr, inumbers, breq);
-
-               /* Iterate all the inodes. */
-               for (i = 0, bs = breq->bulkstat;
-                    !si->aborted && i < inumbers->xi_alloccount;
-                    i++, bs++) {
-                       handle.ha_fid.fid_ino = bs->bs_ino;
-                       handle.ha_fid.fid_gen = bs->bs_gen;
-                       error = si->fn(ctx, &handle, bs, si->arg);
-                       switch (error) {
-                       case 0:
-                               break;
-                       case ESTALE: {
-                               char    idescr[DESCR_BUFSZ];
-
-                               stale_count++;
-                               if (stale_count < 30) {
-                                       ireq->hdr.ino = inumbers->xi_startino;
-                                       goto igrp_retry;
-                               }
-                               scrub_render_ino_descr(ctx, idescr, DESCR_BUFSZ,
-                                               bs->bs_ino, bs->bs_gen, NULL);
-                               str_info(ctx, idescr,
-_("Changed too many times during scan; giving up."));
-                               break;
-                       }
-                       case ECANCELED:
-                               error = 0;
-                               fallthrough;
-                       default:
-                               goto err;
-                       }
-                       if (scrub_excessive_errors(ctx)) {
+               if (ireq->inumbers[0].xi_alloccount == 0) {
+                       /*
+                        * We can have totally empty inode chunks on
+                        * filesystems where there are more than 64 inodes per
+                        * block.  Skip these.
+                        */
+                       ;
+               } else if (si->nr_threads > 0) {
+                       /* Queue this inode chunk on the bulkstat workqueue. */
+                       error = -workqueue_add(&si->wq_bulkstat,
+                                       scan_ag_bulkstat, agno, ichunk);
+                       if (error) {
                                si->aborted = true;
+                               str_liberror(ctx, error,
+                                               _("queueing bulkstat work"));
                                goto out;
                        }
+                       ichunk = NULL;
+               } else {
+                       /*
+                        * Only one thread, call bulkstat directly.  Remember,
+                        * ichunk is freed by the worker before returning.
+                        */
+                       scan_ag_bulkstat(wq, agno, ichunk);
+                       ichunk = NULL;
+                       if (si->aborted)
+                               break;
+               }
+
+               if (!ichunk) {
+                       error = alloc_ichunk(si, agno, nextino, &ichunk);
+                       if (error)
+                               goto err;
                }
+               ireq = ichunk_to_inumbers(ichunk);
 
-               stale_count = 0;
-igrp_retry:
                error = -xfrog_inumbers(&ctx->mnt, ireq);
        }
 
 err:
        if (error) {
-               str_liberror(ctx, error, descr);
+               str_liberror(ctx, error, descr_render(&dsc));
                si->aborted = true;
        }
 out:
-       free(ireq);
-       free(breq);
+       if (ichunk)
+               free(ichunk);
 }
 
 /*
@@ -242,33 +377,58 @@ scrub_scan_all_inodes(
        struct scan_inodes      si = {
                .fn             = fn,
                .arg            = arg,
+               .nr_threads     = scrub_nproc_workqueue(ctx),
        };
        xfs_agnumber_t          agno;
-       struct workqueue        wq;
+       struct workqueue        wq_inumbers;
+       unsigned int            max_bulkstat;
        int                     ret;
 
-       ret = -workqueue_create(&wq, (struct xfs_mount *)ctx,
-                       scrub_nproc_workqueue(ctx));
+       /*
+        * The bulkstat workqueue should queue at most one inobt block's worth
+        * of inode chunk records per worker thread.  If we're running in
+        * single thread mode (nr_threads==0) then we skip the workqueues.
+        */
+       max_bulkstat = si.nr_threads * (ctx->mnt.fsgeom.blocksize / 16);
+
+       ret = -workqueue_create_bound(&si.wq_bulkstat, (struct xfs_mount *)ctx,
+                       si.nr_threads, max_bulkstat);
        if (ret) {
                str_liberror(ctx, ret, _("creating bulkstat workqueue"));
                return -1;
        }
 
+       ret = -workqueue_create(&wq_inumbers, (struct xfs_mount *)ctx,
+                       si.nr_threads);
+       if (ret) {
+               str_liberror(ctx, ret, _("creating inumbers workqueue"));
+               si.aborted = true;
+               goto kill_bulkstat;
+       }
+
        for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) {
-               ret = -workqueue_add(&wq, scan_ag_inodes, agno, &si);
+               ret = -workqueue_add(&wq_inumbers, scan_ag_inumbers, agno, &si);
                if (ret) {
                        si.aborted = true;
-                       str_liberror(ctx, ret, _("queueing bulkstat work"));
+                       str_liberror(ctx, ret, _("queueing inumbers work"));
                        break;
                }
        }
 
-       ret = -workqueue_terminate(&wq);
+       ret = -workqueue_terminate(&wq_inumbers);
+       if (ret) {
+               si.aborted = true;
+               str_liberror(ctx, ret, _("finishing inumbers work"));
+       }
+       workqueue_destroy(&wq_inumbers);
+
+kill_bulkstat:
+       ret = -workqueue_terminate(&si.wq_bulkstat);
        if (ret) {
                si.aborted = true;
                str_liberror(ctx, ret, _("finishing bulkstat work"));
        }
-       workqueue_destroy(&wq);
+       workqueue_destroy(&si.wq_bulkstat);
 
        return si.aborted ? -1 : 0;
 }
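
For scale, the bound computed in scrub_scan_all_inodes() works out as
follows (a worked example; the 4096-byte block size and eight threads
are illustrative): each inobt record describes one 64-inode chunk and
occupies 16 bytes on disk, so, ignoring btree block headers as the
code's estimate does, one inobt block holds 4096 / 16 = 256 records.
With nr_threads = 8 that gives max_bulkstat = 8 * 256 = 2048, so
workqueue_add() in scan_ag_inumbers() starts to block once 2048 chunk
items are pending, throttling the INUMBERS producers to roughly the
rate at which the BULKSTAT consumers drain the queue.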