git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blobdiff - scrub/inodes.c
xfs_scrub: handle spurious wakeups in scan_fs_tree
[thirdparty/xfsprogs-dev.git] / scrub / inodes.c
index 099489d83778dd05a189e582672c69e6bbf4fb8d..78f0914b8d91b95150cfbca231120e96b386501c 100644 (file)
 #include <sys/statvfs.h>
 #include "platform_defs.h"
 #include "xfs_arch.h"
-#include "xfs_format.h"
 #include "handle.h"
 #include "libfrog/paths.h"
 #include "libfrog/workqueue.h"
 #include "xfs_scrub.h"
 #include "common.h"
 #include "inodes.h"
+#include "descr.h"
 #include "libfrog/fsgeom.h"
 #include "libfrog/bulkstat.h"
 
@@ -50,7 +50,7 @@
 static void
 bulkstat_for_inumbers(
        struct scrub_ctx        *ctx,
-       const char              *descr,
+       struct descr            *dsc,
        const struct xfs_inumbers *inumbers,
        struct xfs_bulkstat_req *breq)
 {
@@ -66,7 +66,7 @@ bulkstat_for_inumbers(
        if (error) {
                char    errbuf[DESCR_BUFSZ];
 
-               str_info(ctx, descr, "%s",
+               str_info(ctx, descr_render(dsc), "%s",
                         strerror_r(error, errbuf, DESCR_BUFSZ));
        }
 
@@ -74,7 +74,7 @@ bulkstat_for_inumbers(
         * Check each of the stats we got back to make sure we got the inodes
         * we asked for.
         */
-       for (i = 0, bs = bstat; i < XFS_INODES_PER_CHUNK; i++) {
+       for (i = 0, bs = bstat; i < LIBFROG_BULKSTAT_CHUNKSIZE; i++) {
                if (!(inumbers->xi_allocmask & (1ULL << i)))
                        continue;
                if (bs->bs_ino == inumbers->xi_startino + i) {
@@ -96,122 +96,280 @@ bulkstat_for_inumbers(
 
 /* BULKSTAT wrapper routines. */
 struct scan_inodes {
+       struct workqueue        wq_bulkstat;
        scrub_inode_iter_fn     fn;
        void                    *arg;
+       unsigned int            nr_threads;
        bool                    aborted;
 };
 
 /*
- * Call into the filesystem for inode/bulkstat information and call our
- * iterator function.  We'll try to fill the bulkstat information in batches,
- * but we also can detect iget failures.
+ * A single unit of inode scan work.  This contains a pointer to the parent
+ * information, followed by an INUMBERS request structure, followed by a
+ * BULKSTAT request structure.  The last two are VLAs, so we can't represent
+ * them here.
+ */
+struct scan_ichunk {
+       struct scan_inodes      *si;
+};
+
+static inline struct xfs_inumbers_req *
+ichunk_to_inumbers(
+       struct scan_ichunk      *ichunk)
+{
+       char                    *p = (char *)ichunk;
+
+       return (struct xfs_inumbers_req *)(p + sizeof(struct scan_ichunk));
+}
+
+static inline struct xfs_bulkstat_req *
+ichunk_to_bulkstat(
+       struct scan_ichunk      *ichunk)
+{
+       char                    *p = (char *)ichunk_to_inumbers(ichunk);
+
+       return (struct xfs_bulkstat_req *)(p + XFS_INUMBERS_REQ_SIZE(1));
+}
+
+static inline int
+alloc_ichunk(
+       struct scan_inodes      *si,
+       uint32_t                agno,
+       uint64_t                startino,
+       struct scan_ichunk      **ichunkp)
+{
+       struct scan_ichunk      *ichunk;
+       struct xfs_inumbers_req *ireq;
+       struct xfs_bulkstat_req *breq;
+
+       ichunk = calloc(1, sizeof(struct scan_ichunk) +
+                          XFS_INUMBERS_REQ_SIZE(1) +
+                          XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE));
+       if (!ichunk)
+               return -errno;
+
+       ichunk->si = si;
+
+       ireq = ichunk_to_inumbers(ichunk);
+       ireq->hdr.icount = 1;
+       ireq->hdr.ino = startino;
+       ireq->hdr.agno = agno;
+       ireq->hdr.flags |= XFS_BULK_IREQ_AGNO;
+
+       breq = ichunk_to_bulkstat(ichunk);
+       breq->hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE;
+
+       *ichunkp = ichunk;
+       return 0;
+}
+
+static int
+render_ino_from_bulkstat(
+       struct scrub_ctx        *ctx,
+       char                    *buf,
+       size_t                  buflen,
+       void                    *data)
+{
+       struct xfs_bulkstat     *bstat = data;
+
+       return scrub_render_ino_descr(ctx, buf, buflen, bstat->bs_ino,
+                       bstat->bs_gen, NULL);
+}
+
+static int
+render_inumbers_from_agno(
+       struct scrub_ctx        *ctx,
+       char                    *buf,
+       size_t                  buflen,
+       void                    *data)
+{
+       xfs_agnumber_t          *agno = data;
+
+       return snprintf(buf, buflen, _("dev %d:%d AG %u inodes"),
+                               major(ctx->fsinfo.fs_datadev),
+                               minor(ctx->fsinfo.fs_datadev),
+                               *agno);
+}
+
+/*
+ * Call BULKSTAT for information on a single chunk's worth of inodes and call
+ * our iterator function.  We'll try to fill the bulkstat information in
+ * batches, but we also can detect iget failures.
  */
 static void
-scan_ag_inodes(
+scan_ag_bulkstat(
        struct workqueue        *wq,
        xfs_agnumber_t          agno,
        void                    *arg)
 {
-       struct xfs_handle       handle;
-       char                    descr[DESCR_BUFSZ];
-       struct xfs_inumbers_req *ireq;
-       struct xfs_bulkstat_req *breq;
-       struct scan_inodes      *si = arg;
+       struct xfs_handle       handle = { };
        struct scrub_ctx        *ctx = (struct scrub_ctx *)wq->wq_ctx;
+       struct scan_ichunk      *ichunk = arg;
+       struct xfs_inumbers_req *ireq = ichunk_to_inumbers(ichunk);
+       struct xfs_bulkstat_req *breq = ichunk_to_bulkstat(ichunk);
+       struct scan_inodes      *si = ichunk->si;
        struct xfs_bulkstat     *bs;
-       struct xfs_inumbers     *inumbers;
+       struct xfs_inumbers     *inumbers = &ireq->inumbers[0];
+       uint64_t                last_ino = 0;
        int                     i;
        int                     error;
        int                     stale_count = 0;
+       DEFINE_DESCR(dsc_bulkstat, ctx, render_ino_from_bulkstat);
+       DEFINE_DESCR(dsc_inumbers, ctx, render_inumbers_from_agno);
 
-       snprintf(descr, DESCR_BUFSZ, _("dev %d:%d AG %u inodes"),
-                               major(ctx->fsinfo.fs_datadev),
-                               minor(ctx->fsinfo.fs_datadev),
-                               agno);
+       descr_set(&dsc_inumbers, &agno);
 
        memcpy(&handle.ha_fsid, ctx->fshandle, sizeof(handle.ha_fsid));
        handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
                        sizeof(handle.ha_fid.fid_len);
        handle.ha_fid.fid_pad = 0;
 
-       error = -xfrog_bulkstat_alloc_req(XFS_INODES_PER_CHUNK, 0, &breq);
-       if (error) {
-               str_liberror(ctx, error, descr);
-               si->aborted = true;
-               return;
+retry:
+       bulkstat_for_inumbers(ctx, &dsc_inumbers, inumbers, breq);
+
+       /* Iterate all the inodes. */
+       bs = &breq->bulkstat[0];
+       for (i = 0; !si->aborted && i < inumbers->xi_alloccount; i++, bs++) {
+               uint64_t        scan_ino = bs->bs_ino;
+
+               /* ensure forward progress if we retried */
+               if (scan_ino < last_ino)
+                       continue;
+
+               descr_set(&dsc_bulkstat, bs);
+               handle.ha_fid.fid_ino = scan_ino;
+               handle.ha_fid.fid_gen = bs->bs_gen;
+               error = si->fn(ctx, &handle, bs, si->arg);
+               switch (error) {
+               case 0:
+                       break;
+               case ESTALE: {
+                       stale_count++;
+                       if (stale_count < 30) {
+                               ireq->hdr.ino = inumbers->xi_startino;
+                               error = -xfrog_inumbers(&ctx->mnt, ireq);
+                               if (error)
+                                       goto err;
+                               goto retry;
+                       }
+                       str_info(ctx, descr_render(&dsc_bulkstat),
+_("Changed too many times during scan; giving up."));
+                       si->aborted = true;
+                       goto out;
+               }
+               case ECANCELED:
+                       error = 0;
+                       fallthrough;
+               default:
+                       goto err;
+               }
+               if (scrub_excessive_errors(ctx)) {
+                       si->aborted = true;
+                       goto out;
+               }
+               last_ino = scan_ino;
        }
 
-       error = -xfrog_inumbers_alloc_req(1, 0, &ireq);
+err:
        if (error) {
-               str_liberror(ctx, error, descr);
-               free(breq);
+               str_liberror(ctx, error, descr_render(&dsc_bulkstat));
                si->aborted = true;
-               return;
        }
-       inumbers = &ireq->inumbers[0];
-       xfrog_inumbers_set_ag(ireq, agno);
+out:
+       free(ichunk);
+}
+
+/*
+ * Call INUMBERS for information about inode chunks, then queue the inumbers
+ * responses in the bulkstat workqueue.  This helps us maximize CPU parallelism
+ * if the filesystem AGs are not evenly loaded.
+ */
+static void
+scan_ag_inumbers(
+       struct workqueue        *wq,
+       xfs_agnumber_t          agno,
+       void                    *arg)
+{
+       struct scan_ichunk      *ichunk = NULL;
+       struct scan_inodes      *si = arg;
+       struct scrub_ctx        *ctx = (struct scrub_ctx *)wq->wq_ctx;
+       struct xfs_inumbers_req *ireq;
+       uint64_t                nextino = cvt_agino_to_ino(&ctx->mnt, agno, 0);
+       int                     error;
+       DEFINE_DESCR(dsc, ctx, render_inumbers_from_agno);
+
+       descr_set(&dsc, &agno);
+
+       error = alloc_ichunk(si, agno, 0, &ichunk);
+       if (error)
+               goto err;
+       ireq = ichunk_to_inumbers(ichunk);
 
        /* Find the inode chunk & alloc mask */
        error = -xfrog_inumbers(&ctx->mnt, ireq);
        while (!error && !si->aborted && ireq->hdr.ocount > 0) {
                /*
-                * We can have totally empty inode chunks on filesystems where
-                * there are more than 64 inodes per block.  Skip these.
+                * Make sure that we always make forward progress while we
+                * scan the inode btree.
                 */
-               if (inumbers->xi_alloccount == 0)
-                       goto igrp_retry;
-
-               bulkstat_for_inumbers(ctx, descr, inumbers, breq);
-
-               /* Iterate all the inodes. */
-               for (i = 0, bs = breq->bulkstat;
-                    !si->aborted && i < inumbers->xi_alloccount;
-                    i++, bs++) {
-                       handle.ha_fid.fid_ino = bs->bs_ino;
-                       handle.ha_fid.fid_gen = bs->bs_gen;
-                       error = si->fn(ctx, &handle, bs, si->arg);
-                       switch (error) {
-                       case 0:
-                               break;
-                       case ESTALE: {
-                               char    idescr[DESCR_BUFSZ];
-
-                               stale_count++;
-                               if (stale_count < 30) {
-                                       ireq->hdr.ino = inumbers->xi_startino;
-                                       goto igrp_retry;
-                               }
-                               scrub_render_ino_descr(ctx, idescr, DESCR_BUFSZ,
-                                               bs->bs_ino, bs->bs_gen, NULL);
-                               str_info(ctx, idescr,
-_("Changed too many times during scan; giving up."));
-                               break;
-                       }
-                       case ECANCELED:
-                               error = 0;
-                               /* fall thru */
-                       default:
-                               goto err;
-                       }
-                       if (xfs_scrub_excessive_errors(ctx)) {
+               if (nextino > ireq->inumbers[0].xi_startino) {
+                       str_corrupt(ctx, descr_render(&dsc),
+       _("AG %u inode btree is corrupt near agino %lu, got %lu"), agno,
+                               cvt_ino_to_agino(&ctx->mnt, nextino),
+                               cvt_ino_to_agino(&ctx->mnt,
+                                               ireq->inumbers[0].xi_startino));
+                       si->aborted = true;
+                       break;
+               }
+               nextino = ireq->hdr.ino;
+
+               if (ireq->inumbers[0].xi_alloccount == 0) {
+                       /*
+                        * We can have totally empty inode chunks on
+                        * filesystems where there are more than 64 inodes per
+                        * block.  Skip these.
+                        */
+                       ;
+               } else if (si->nr_threads > 0) {
+                       /* Queue this inode chunk on the bulkstat workqueue. */
+                       error = -workqueue_add(&si->wq_bulkstat,
+                                       scan_ag_bulkstat, agno, ichunk);
+                       if (error) {
                                si->aborted = true;
+                               str_liberror(ctx, error,
+                                               _("queueing bulkstat work"));
                                goto out;
                        }
+                       ichunk = NULL;
+               } else {
+                       /*
+                        * Only one thread, call bulkstat directly.  Remember,
+                        * ichunk is freed by the worker before returning.
+                        */
+                       scan_ag_bulkstat(wq, agno, ichunk);
+                       ichunk = NULL;
+                       if (si->aborted)
+                               break;
+               }
+
+               if (!ichunk) {
+                       error = alloc_ichunk(si, agno, nextino, &ichunk);
+                       if (error)
+                               goto err;
                }
+               ireq = ichunk_to_inumbers(ichunk);
 
-               stale_count = 0;
-igrp_retry:
                error = -xfrog_inumbers(&ctx->mnt, ireq);
        }
 
 err:
        if (error) {
-               str_liberror(ctx, error, descr);
+               str_liberror(ctx, error, descr_render(&dsc));
                si->aborted = true;
        }
 out:
-       free(ireq);
-       free(breq);
+       if (ichunk)
+               free(ichunk);
 }
 
 /*
@@ -227,33 +385,58 @@ scrub_scan_all_inodes(
        struct scan_inodes      si = {
                .fn             = fn,
                .arg            = arg,
+               .nr_threads     = scrub_nproc_workqueue(ctx),
        };
        xfs_agnumber_t          agno;
-       struct workqueue        wq;
+       struct workqueue        wq_inumbers;
+       unsigned int            max_bulkstat;
        int                     ret;
 
-       ret = -workqueue_create(&wq, (struct xfs_mount *)ctx,
-                       scrub_nproc_workqueue(ctx));
+       /*
+        * The bulkstat workqueue should queue at most one inobt block's worth
+        * of inode chunk records per worker thread.  If we're running in
+        * single thread mode (nr_threads==0) then we skip the workqueues.
+        */
+       max_bulkstat = si.nr_threads * (ctx->mnt.fsgeom.blocksize / 16);
+
+       ret = -workqueue_create_bound(&si.wq_bulkstat, (struct xfs_mount *)ctx,
+                       si.nr_threads, max_bulkstat);
        if (ret) {
                str_liberror(ctx, ret, _("creating bulkstat workqueue"));
                return -1;
        }
 
+       ret = -workqueue_create(&wq_inumbers, (struct xfs_mount *)ctx,
+                       si.nr_threads);
+       if (ret) {
+               str_liberror(ctx, ret, _("creating inumbers workqueue"));
+               si.aborted = true;
+               goto kill_bulkstat;
+       }
+
        for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) {
-               ret = -workqueue_add(&wq, scan_ag_inodes, agno, &si);
+               ret = -workqueue_add(&wq_inumbers, scan_ag_inumbers, agno, &si);
                if (ret) {
                        si.aborted = true;
-                       str_liberror(ctx, ret, _("queueing bulkstat work"));
+                       str_liberror(ctx, ret, _("queueing inumbers work"));
                        break;
                }
        }
 
-       ret = -workqueue_terminate(&wq);
+       ret = -workqueue_terminate(&wq_inumbers);
+       if (ret) {
+               si.aborted = true;
+               str_liberror(ctx, ret, _("finishing inumbers work"));
+       }
+       workqueue_destroy(&wq_inumbers);
+
+kill_bulkstat:
+       ret = -workqueue_terminate(&si.wq_bulkstat);
        if (ret) {
                si.aborted = true;
                str_liberror(ctx, ret, _("finishing bulkstat work"));
        }
-       workqueue_destroy(&wq);
+       workqueue_destroy(&si.wq_bulkstat);
 
        return si.aborted ? -1 : 0;
 }