xfs_scrub: handle spurious wakeups in scan_fs_tree
index 91632b550b48c0f4eedf27aea4bd6530100df15f..78f0914b8d91b95150cfbca231120e96b386501c 100644
--- a/scrub/inodes.c
+++ b/scrub/inodes.c
 #include <sys/statvfs.h>
 #include "platform_defs.h"
 #include "xfs_arch.h"
-#include "xfs_format.h"
 #include "handle.h"
 #include "libfrog/paths.h"
 #include "libfrog/workqueue.h"
 #include "xfs_scrub.h"
 #include "common.h"
 #include "inodes.h"
+#include "descr.h"
 #include "libfrog/fsgeom.h"
 #include "libfrog/bulkstat.h"
 
  */
 
 /*
- * Did we get exactly the inodes we expected?  If not, load them one at a
- * time (or fake it) into the bulkstat data.
+ * Run bulkstat on an entire inode allocation group, then check that we got
+ * exactly the inodes we expected.  If not, load them one at a time (or fake
+ * it) into the bulkstat data.
  */
 static void
-xfs_iterate_inodes_range_check(
+bulkstat_for_inumbers(
        struct scrub_ctx        *ctx,
-       struct xfs_inumbers     *inumbers,
-       struct xfs_bulkstat     *bstat)
+       struct descr            *dsc,
+       const struct xfs_inumbers *inumbers,
+       struct xfs_bulkstat_req *breq)
 {
+       struct xfs_bulkstat     *bstat = breq->bulkstat;
        struct xfs_bulkstat     *bs;
        int                     i;
        int                     error;
 
-       for (i = 0, bs = bstat; i < XFS_INODES_PER_CHUNK; i++) {
+       /* First we try regular bulkstat, for speed. */
+       breq->hdr.ino = inumbers->xi_startino;
+       breq->hdr.icount = inumbers->xi_alloccount;
+       error = -xfrog_bulkstat(&ctx->mnt, breq);
+       if (error) {
+               char    errbuf[DESCR_BUFSZ];
+
+               str_info(ctx, descr_render(dsc), "%s",
+                        strerror_r(error, errbuf, DESCR_BUFSZ));
+       }
+
+       /*
+        * Check each of the stats we got back to make sure we got the inodes
+        * we asked for.
+        */
+       for (i = 0, bs = bstat; i < LIBFROG_BULKSTAT_CHUNKSIZE; i++) {
                if (!(inumbers->xi_allocmask & (1ULL << i)))
                        continue;
                if (bs->bs_ino == inumbers->xi_startino + i) {
@@ -65,7 +83,7 @@ xfs_iterate_inodes_range_check(
                }
 
                /* Load the one inode. */
-               error = xfrog_bulkstat_single(&ctx->mnt,
+               error = -xfrog_bulkstat_single(&ctx->mnt,
                                inumbers->xi_startino + i, 0, bs);
                if (error || bs->bs_ino != inumbers->xi_startino + i) {
                        memset(bs, 0, sizeof(struct xfs_bulkstat));
@@ -76,201 +94,356 @@ xfs_iterate_inodes_range_check(
        }
 }
 
+/* BULKSTAT wrapper routines. */
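+
+/*
+ * Scan-wide state: the bulkstat workqueue, the caller's per-inode callback
+ * and argument, the number of worker threads, and an abort flag that is
+ * checked by all workers.
+ */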
+struct scan_inodes {
+       struct workqueue        wq_bulkstat;
+       scrub_inode_iter_fn     fn;
+       void                    *arg;
+       unsigned int            nr_threads;
+       bool                    aborted;
+};
+
 /*
- * Call into the filesystem for inode/bulkstat information and call our
- * iterator function.  We'll try to fill the bulkstat information in batches,
- * but we also can detect iget failures.
+ * A single unit of inode scan work.  This contains a pointer to the parent
+ * information, followed by an INUMBERS request structure, followed by a
+ * BULKSTAT request structure.  The last two end in flexible-length arrays,
+ * so we can't represent them here.
  */
-static bool
-xfs_iterate_inodes_ag(
-       struct scrub_ctx        *ctx,
-       const char              *descr,
-       void                    *fshandle,
+struct scan_ichunk {
+       struct scan_inodes      *si;
+};
+
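+/*
+ * Memory layout of a scan work item, as allocated by alloc_ichunk():
+ *
+ *     struct scan_ichunk
+ *     struct xfs_inumbers_req  (sized for a single record)
+ *     struct xfs_bulkstat_req  (sized for LIBFROG_BULKSTAT_CHUNKSIZE records)
+ *
+ * These helpers locate the two variable-length requests trailing the header.
+ */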
+static inline struct xfs_inumbers_req *
+ichunk_to_inumbers(
+       struct scan_ichunk      *ichunk)
+{
+       char                    *p = (char *)ichunk;
+
+       return (struct xfs_inumbers_req *)(p + sizeof(struct scan_ichunk));
+}
+
+static inline struct xfs_bulkstat_req *
+ichunk_to_bulkstat(
+       struct scan_ichunk      *ichunk)
+{
+       char                    *p = (char *)ichunk_to_inumbers(ichunk);
+
+       return (struct xfs_bulkstat_req *)(p + XFS_INUMBERS_REQ_SIZE(1));
+}
+
+static inline int
+alloc_ichunk(
+       struct scan_inodes      *si,
        uint32_t                agno,
-       xfs_inode_iter_fn       fn,
-       void                    *arg)
+       uint64_t                startino,
+       struct scan_ichunk      **ichunkp)
 {
-       struct xfs_handle       handle;
+       struct scan_ichunk      *ichunk;
        struct xfs_inumbers_req *ireq;
        struct xfs_bulkstat_req *breq;
-       char                    idescr[DESCR_BUFSZ];
+
+       ichunk = calloc(1, sizeof(struct scan_ichunk) +
+                          XFS_INUMBERS_REQ_SIZE(1) +
+                          XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE));
+       if (!ichunk)
+               return -errno;
+
+       ichunk->si = si;
+
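+       /*
+        * Ask INUMBERS for a single inobt record, starting at @startino in
+        * this AG.
+        */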
+       ireq = ichunk_to_inumbers(ichunk);
+       ireq->hdr.icount = 1;
+       ireq->hdr.ino = startino;
+       ireq->hdr.agno = agno;
+       ireq->hdr.flags |= XFS_BULK_IREQ_AGNO;
+
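+       /* Size the bulkstat request to cover an entire inode chunk. */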
+       breq = ichunk_to_bulkstat(ichunk);
+       breq->hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE;
+
+       *ichunkp = ichunk;
+       return 0;
+}
+
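+/* Render an inode number and generation for descr-based error reporting. */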
+static int
+render_ino_from_bulkstat(
+       struct scrub_ctx        *ctx,
+       char                    *buf,
+       size_t                  buflen,
+       void                    *data)
+{
+       struct xfs_bulkstat     *bstat = data;
+
+       return scrub_render_ino_descr(ctx, buf, buflen, bstat->bs_ino,
+                       bstat->bs_gen, NULL);
+}
+
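+/* Render the device and AG number for descr-based error reporting. */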
+static int
+render_inumbers_from_agno(
+       struct scrub_ctx        *ctx,
+       char                    *buf,
+       size_t                  buflen,
+       void                    *data)
+{
+       xfs_agnumber_t          *agno = data;
+
+       return snprintf(buf, buflen, _("dev %d:%d AG %u inodes"),
+                               major(ctx->fsinfo.fs_datadev),
+                               minor(ctx->fsinfo.fs_datadev),
+                               *agno);
+}
+
+/*
+ * Call BULKSTAT for information on a single chunk's worth of inodes and call
+ * our iterator function.  We'll try to fill the bulkstat information in
+ * batches, but we can also detect iget failures.
+ */
+static void
+scan_ag_bulkstat(
+       struct workqueue        *wq,
+       xfs_agnumber_t          agno,
+       void                    *arg)
+{
+       struct xfs_handle       handle = { };
+       struct scrub_ctx        *ctx = (struct scrub_ctx *)wq->wq_ctx;
+       struct scan_ichunk      *ichunk = arg;
+       struct xfs_inumbers_req *ireq = ichunk_to_inumbers(ichunk);
+       struct xfs_bulkstat_req *breq = ichunk_to_bulkstat(ichunk);
+       struct scan_inodes      *si = ichunk->si;
        struct xfs_bulkstat     *bs;
-       struct xfs_inumbers     *inumbers;
-       bool                    moveon = true;
+       struct xfs_inumbers     *inumbers = &ireq->inumbers[0];
+       uint64_t                last_ino = 0;
        int                     i;
        int                     error;
        int                     stale_count = 0;
+       DEFINE_DESCR(dsc_bulkstat, ctx, render_ino_from_bulkstat);
+       DEFINE_DESCR(dsc_inumbers, ctx, render_inumbers_from_agno);
+
+       descr_set(&dsc_inumbers, &agno);
 
-       memcpy(&handle.ha_fsid, fshandle, sizeof(handle.ha_fsid));
+       memcpy(&handle.ha_fsid, ctx->fshandle, sizeof(handle.ha_fsid));
        handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
                        sizeof(handle.ha_fid.fid_len);
        handle.ha_fid.fid_pad = 0;
 
-       breq = xfrog_bulkstat_alloc_req(XFS_INODES_PER_CHUNK, 0);
-       if (!breq) {
-               str_info(ctx, descr, _("Insufficient memory; giving up."));
-               return false;
-       }
-
-       ireq = xfrog_inumbers_alloc_req(1, 0);
-       if (!ireq) {
-               str_info(ctx, descr, _("Insufficient memory; giving up."));
-               free(breq);
-               return false;
-       }
-       inumbers = &ireq->inumbers[0];
-       xfrog_inumbers_set_ag(ireq, agno);
-
-       /* Find the inode chunk & alloc mask */
-       error = xfrog_inumbers(&ctx->mnt, ireq);
-       while (!error && ireq->hdr.ocount > 0) {
-               /*
-                * We can have totally empty inode chunks on filesystems where
-                * there are more than 64 inodes per block.  Skip these.
-                */
-               if (inumbers->xi_alloccount == 0)
-                       goto igrp_retry;
+retry:
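+       /* Stat all inodes in this inobt record; ESTALE re-enters here. */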
+       bulkstat_for_inumbers(ctx, &dsc_inumbers, inumbers, breq);
 
-               breq->hdr.ino = inumbers->xi_startino;
-               breq->hdr.icount = inumbers->xi_alloccount;
-               error = xfrog_bulkstat(&ctx->mnt, breq);
-               if (error) {
-                       char    errbuf[DESCR_BUFSZ];
+       /* Iterate all the inodes. */
+       bs = &breq->bulkstat[0];
+       for (i = 0; !si->aborted && i < inumbers->xi_alloccount; i++, bs++) {
+               uint64_t        scan_ino = bs->bs_ino;
 
-                       str_info(ctx, descr, "%s", strerror_r(error,
-                                               errbuf, DESCR_BUFSZ));
-               }
+               /* ensure forward progress if we retried */
+               if (scan_ino < last_ino)
+                       continue;
 
-               xfs_iterate_inodes_range_check(ctx, inumbers, breq->bulkstat);
-
-               /* Iterate all the inodes. */
-               for (i = 0, bs = breq->bulkstat;
-                    i < inumbers->xi_alloccount;
-                    i++, bs++) {
-                       handle.ha_fid.fid_ino = bs->bs_ino;
-                       handle.ha_fid.fid_gen = bs->bs_gen;
-                       error = fn(ctx, &handle, bs, arg);
-                       switch (error) {
-                       case 0:
-                               break;
-                       case ESTALE:
-                               stale_count++;
-                               if (stale_count < 30) {
-                                       ireq->hdr.ino = inumbers->xi_startino;
-                                       goto igrp_retry;
-                               }
-                               snprintf(idescr, DESCR_BUFSZ, "inode %"PRIu64,
-                                               (uint64_t)bs->bs_ino);
-                               str_info(ctx, idescr,
-_("Changed too many times during scan; giving up."));
-                               break;
-                       case XFS_ITERATE_INODES_ABORT:
-                               error = 0;
-                               /* fall thru */
-                       default:
-                               moveon = false;
-                               errno = error;
-                               goto err;
-                       }
-                       if (xfs_scrub_excessive_errors(ctx)) {
-                               moveon = false;
-                               goto out;
+               descr_set(&dsc_bulkstat, bs);
+               handle.ha_fid.fid_ino = scan_ino;
+               handle.ha_fid.fid_gen = bs->bs_gen;
+               error = si->fn(ctx, &handle, bs, si->arg);
+               switch (error) {
+               case 0:
+                       break;
+               case ESTALE: {
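+                       /*
+                        * The inode changed out from under us; reload this
+                        * inobt record and rescan the chunk, up to 30 times.
+                        */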
+                       stale_count++;
+                       if (stale_count < 30) {
+                               ireq->hdr.ino = inumbers->xi_startino;
+                               error = -xfrog_inumbers(&ctx->mnt, ireq);
+                               if (error)
+                                       goto err;
+                               goto retry;
                        }
+                       str_info(ctx, descr_render(&dsc_bulkstat),
+_("Changed too many times during scan; giving up."));
+                       si->aborted = true;
+                       goto out;
                }
-
-               stale_count = 0;
-igrp_retry:
-               error = xfrog_inumbers(&ctx->mnt, ireq);
+               case ECANCELED:
+                       error = 0;
+                       fallthrough;
+               default:
+                       goto err;
+               }
+               if (scrub_excessive_errors(ctx)) {
+                       si->aborted = true;
+                       goto out;
+               }
+               last_ino = scan_ino;
        }
 
 err:
        if (error) {
-               str_liberror(ctx, error, descr);
-               moveon = false;
+               str_liberror(ctx, error, descr_render(&dsc_bulkstat));
+               si->aborted = true;
        }
 out:
-       free(ireq);
-       free(breq);
-       return moveon;
+       free(ichunk);
 }
 
-/* BULKSTAT wrapper routines. */
-struct xfs_scan_inodes {
-       xfs_inode_iter_fn       fn;
-       void                    *arg;
-       bool                    moveon;
-};
-
-/* Scan all the inodes in an AG. */
+/*
+ * Call INUMBERS for information about inode chunks, then queue the inumbers
+ * responses in the bulkstat workqueue.  This helps us maximize CPU parallelism
+ * if the filesystem AGs are not evenly loaded.
+ */
 static void
-xfs_scan_ag_inodes(
+scan_ag_inumbers(
        struct workqueue        *wq,
        xfs_agnumber_t          agno,
        void                    *arg)
 {
-       struct xfs_scan_inodes  *si = arg;
+       struct scan_ichunk      *ichunk = NULL;
+       struct scan_inodes      *si = arg;
        struct scrub_ctx        *ctx = (struct scrub_ctx *)wq->wq_ctx;
-       char                    descr[DESCR_BUFSZ];
-       bool                    moveon;
+       struct xfs_inumbers_req *ireq;
+       uint64_t                nextino = cvt_agino_to_ino(&ctx->mnt, agno, 0);
+       int                     error;
+       DEFINE_DESCR(dsc, ctx, render_inumbers_from_agno);
 
-       snprintf(descr, DESCR_BUFSZ, _("dev %d:%d AG %u inodes"),
-                               major(ctx->fsinfo.fs_datadev),
-                               minor(ctx->fsinfo.fs_datadev),
-                               agno);
+       descr_set(&dsc, &agno);
 
-       moveon = xfs_iterate_inodes_ag(ctx, descr, ctx->fshandle, agno,
-                       si->fn, si->arg);
-       if (!moveon)
-               si->moveon = false;
+       error = alloc_ichunk(si, agno, 0, &ichunk);
+       if (error)
+               goto err;
+       ireq = ichunk_to_inumbers(ichunk);
+
+       /* Find the inode chunk & alloc mask */
+       error = -xfrog_inumbers(&ctx->mnt, ireq);
+       while (!error && !si->aborted && ireq->hdr.ocount > 0) {
+               /*
+                * Make sure that we always make forward progress while we
+                * scan the inode btree.
+                */
+               if (nextino > ireq->inumbers[0].xi_startino) {
+                       str_corrupt(ctx, descr_render(&dsc),
+       _("AG %u inode btree is corrupt near agino %u, got %u"), agno,
+                               cvt_ino_to_agino(&ctx->mnt, nextino),
+                               cvt_ino_to_agino(&ctx->mnt,
+                                               ireq->inumbers[0].xi_startino));
+                       si->aborted = true;
+                       break;
+               }
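+               /* The kernel advances hdr.ino past the record it returned. */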
+               nextino = ireq->hdr.ino;
+
+               if (ireq->inumbers[0].xi_alloccount == 0) {
+                       /*
+                        * We can have totally empty inode chunks on
+                        * filesystems where there are more than 64 inodes per
+                        * block.  Skip these.
+                        */
+                       ;
+               } else if (si->nr_threads > 0) {
+                       /* Queue this inode chunk on the bulkstat workqueue. */
+                       error = -workqueue_add(&si->wq_bulkstat,
+                                       scan_ag_bulkstat, agno, ichunk);
+                       if (error) {
+                               si->aborted = true;
+                               str_liberror(ctx, error,
+                                               _("queueing bulkstat work"));
+                               goto out;
+                       }
+                       ichunk = NULL;
+               } else {
+                       /*
+                        * Only one thread, call bulkstat directly.  Remember,
+                        * ichunk is freed by the worker before returning.
+                        */
+                       scan_ag_bulkstat(wq, agno, ichunk);
+                       ichunk = NULL;
+                       if (si->aborted)
+                               break;
+               }
+
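+               /*
+                * If the work item was handed off (or consumed), allocate a
+                * fresh one before requesting the next inobt record.
+                */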
+               if (!ichunk) {
+                       error = alloc_ichunk(si, agno, nextino, &ichunk);
+                       if (error)
+                               goto err;
+               }
+               ireq = ichunk_to_inumbers(ichunk);
+
+               error = -xfrog_inumbers(&ctx->mnt, ireq);
+       }
+
+err:
+       if (error) {
+               str_liberror(ctx, error, descr_render(&dsc));
+               si->aborted = true;
+       }
+out:
+       if (ichunk)
+               free(ichunk);
 }
 
-/* Scan all the inodes in a filesystem. */
-bool
-xfs_scan_all_inodes(
+/*
+ * Scan all the inodes in a filesystem.  The iterator function fn should
+ * return 0 to continue, ESTALE to reload and retry a stale inode chunk,
+ * ECANCELED to stop iterating without logging an error, or a positive errno
+ * to abort the scan.  On error, this function will log an error message and
+ * return -1.
+ */
+int
+scrub_scan_all_inodes(
        struct scrub_ctx        *ctx,
-       xfs_inode_iter_fn       fn,
+       scrub_inode_iter_fn     fn,
        void                    *arg)
 {
-       struct xfs_scan_inodes  si;
+       struct scan_inodes      si = {
+               .fn             = fn,
+               .arg            = arg,
+               .nr_threads     = scrub_nproc_workqueue(ctx),
+       };
        xfs_agnumber_t          agno;
-       struct workqueue        wq;
+       struct workqueue        wq_inumbers;
+       unsigned int            max_bulkstat;
        int                     ret;
 
-       si.moveon = true;
-       si.fn = fn;
-       si.arg = arg;
+       /*
+        * The bulkstat workqueue should queue at most one inobt block's worth
+        * of inode chunk records per worker thread.  Each inobt record is 16
+        * bytes, so blocksize / 16 approximates the number of records in one
+        * inobt block.  If we're running in single thread mode (nr_threads==0)
+        * then we skip the workqueues.
+        */
+       max_bulkstat = si.nr_threads * (ctx->mnt.fsgeom.blocksize / 16);
 
-       ret = workqueue_create(&wq, (struct xfs_mount *)ctx,
-                       scrub_nproc_workqueue(ctx));
+       ret = -workqueue_create_bound(&si.wq_bulkstat, (struct xfs_mount *)ctx,
+                       si.nr_threads, max_bulkstat);
        if (ret) {
                str_liberror(ctx, ret, _("creating bulkstat workqueue"));
-               return false;
+               return -1;
+       }
+
+       ret = -workqueue_create(&wq_inumbers, (struct xfs_mount *)ctx,
+                       si.nr_threads);
+       if (ret) {
+               str_liberror(ctx, ret, _("creating inumbers workqueue"));
+               si.aborted = true;
+               goto kill_bulkstat;
        }
 
        for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) {
-               ret = workqueue_add(&wq, xfs_scan_ag_inodes, agno, &si);
+               ret = -workqueue_add(&wq_inumbers, scan_ag_inumbers, agno, &si);
                if (ret) {
-                       si.moveon = false;
-                       str_liberror(ctx, ret, _("queueing bulkstat work"));
+                       si.aborted = true;
+                       str_liberror(ctx, ret, _("queueing inumbers work"));
                        break;
                }
        }
 
-       ret = workqueue_terminate(&wq);
+       ret = -workqueue_terminate(&wq_inumbers);
+       if (ret) {
+               si.aborted = true;
+               str_liberror(ctx, ret, _("finishing inumbers work"));
+       }
+       workqueue_destroy(&wq_inumbers);
+
+kill_bulkstat:
+       ret = -workqueue_terminate(&si.wq_bulkstat);
        if (ret) {
-               si.moveon = false;
+               si.aborted = true;
                str_liberror(ctx, ret, _("finishing bulkstat work"));
        }
-       workqueue_destroy(&wq);
+       workqueue_destroy(&si.wq_bulkstat);
 
-       return si.moveon;
+       return si.aborted ? -1 : 0;
 }
 
-/*
- * Open a file by handle, or return a negative error code.
- */
+/* Open a file by handle, returning either the fd or -1 on error. */
 int
-xfs_open_handle(
+scrub_open_handle(
        struct xfs_handle       *handle)
 {
        return open_by_fshandle(handle, sizeof(*handle),