#include <sys/statvfs.h>
#include "platform_defs.h"
#include "xfs_arch.h"
-#include "xfs_format.h"
#include "handle.h"
#include "libfrog/paths.h"
#include "libfrog/workqueue.h"
#include "xfs_scrub.h"
#include "common.h"
#include "inodes.h"
+#include "descr.h"
#include "libfrog/fsgeom.h"
#include "libfrog/bulkstat.h"
static void
bulkstat_for_inumbers(
struct scrub_ctx *ctx,
- const char *descr,
+ struct descr *dsc,
const struct xfs_inumbers *inumbers,
struct xfs_bulkstat_req *breq)
{
if (error) {
char errbuf[DESCR_BUFSZ];
- str_info(ctx, descr, "%s",
+ str_info(ctx, descr_render(dsc), "%s",
strerror_r(error, errbuf, DESCR_BUFSZ));
}
* Check each of the stats we got back to make sure we got the inodes
* we asked for.
*/
- for (i = 0, bs = bstat; i < XFS_INODES_PER_CHUNK; i++) {
+ for (i = 0, bs = bstat; i < LIBFROG_BULKSTAT_CHUNKSIZE; i++) {
if (!(inumbers->xi_allocmask & (1ULL << i)))
continue;
if (bs->bs_ino == inumbers->xi_startino + i) {
/* BULKSTAT wrapper routines. */
struct scan_inodes {
+	/* Second-stage workqueue; each item bulkstats one inode chunk. */
+	struct workqueue	wq_bulkstat;
	scrub_inode_iter_fn	fn;
	void			*arg;
+	/* Worker count; 0 means run the bulkstat step inline. */
+	unsigned int		nr_threads;
	bool			aborted;
};
/*
- * Call into the filesystem for inode/bulkstat information and call our
- * iterator function. We'll try to fill the bulkstat information in batches,
- * but we also can detect iget failures.
+ * A single unit of inode scan work. This contains a pointer to the parent
+ * information, followed by an INUMBERS request structure, followed by a
+ * BULKSTAT request structure. The last two are VLAs, so we can't represent
+ * them here.
+ */
+struct scan_ichunk {
+	/* Owning scan state; the two request VLAs follow this header. */
+	struct scan_inodes	*si;
+};
+
+/* Return the INUMBERS request stored immediately after the ichunk header. */
+static inline struct xfs_inumbers_req *
+ichunk_to_inumbers(
+	struct scan_ichunk	*ichunk)
+{
+	char	*p = (char *)ichunk;
+
+	return (struct xfs_inumbers_req *)(p + sizeof(struct scan_ichunk));
+}
+
+/* Return the BULKSTAT request stored after the one-record INUMBERS request. */
+static inline struct xfs_bulkstat_req *
+ichunk_to_bulkstat(
+	struct scan_ichunk	*ichunk)
+{
+	char	*p = (char *)ichunk_to_inumbers(ichunk);
+
+	return (struct xfs_bulkstat_req *)(p + XFS_INUMBERS_REQ_SIZE(1));
+}
+
+/*
+ * Allocate one scan work unit: the scan_ichunk header, a one-record INUMBERS
+ * request, and a chunk-sized BULKSTAT request in a single zeroed buffer.
+ * Returns 0 and sets *ichunkp on success, or -errno if allocation fails.
+ */
+static inline int
+alloc_ichunk(
+	struct scan_inodes	*si,
+	uint32_t		agno,
+	uint64_t		startino,
+	struct scan_ichunk	**ichunkp)
+{
+	struct scan_ichunk	*ichunk;
+	struct xfs_inumbers_req	*ireq;
+	struct xfs_bulkstat_req	*breq;
+
+	ichunk = calloc(1, sizeof(struct scan_ichunk) +
+			   XFS_INUMBERS_REQ_SIZE(1) +
+			   XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE));
+	if (!ichunk)
+		return -errno;
+
+	ichunk->si = si;
+
+	/* Ask for one inumbers record starting at startino within this AG. */
+	ireq = ichunk_to_inumbers(ichunk);
+	ireq->hdr.icount = 1;
+	ireq->hdr.ino = startino;
+	ireq->hdr.agno = agno;
+	ireq->hdr.flags |= XFS_BULK_IREQ_AGNO;
+
+	breq = ichunk_to_bulkstat(ichunk);
+	breq->hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE;
+
+	*ichunkp = ichunk;
+	return 0;
+}
+
+/* descr formatter: describe an inode from its bulkstat record (ino + gen). */
+static int
+render_ino_from_bulkstat(
+	struct scrub_ctx	*ctx,
+	char			*buf,
+	size_t			buflen,
+	void			*data)
+{
+	struct xfs_bulkstat	*bstat = data;
+
+	return scrub_render_ino_descr(ctx, buf, buflen, bstat->bs_ino,
+			bstat->bs_gen, NULL);
+}
+
+/* descr formatter: describe the data device and AG whose inodes we scan. */
+static int
+render_inumbers_from_agno(
+	struct scrub_ctx	*ctx,
+	char			*buf,
+	size_t			buflen,
+	void			*data)
+{
+	xfs_agnumber_t		*agno = data;
+
+	return snprintf(buf, buflen, _("dev %d:%d AG %u inodes"),
+				major(ctx->fsinfo.fs_datadev),
+				minor(ctx->fsinfo.fs_datadev),
+				*agno);
+}
+
+/*
+ * Call BULKSTAT for information on a single chunk's worth of inodes and call
+ * our iterator function.  We'll try to fill the bulkstat information in
+ * batches, but we also can detect iget failures.
 */
static void
-scan_ag_inodes(
+scan_ag_bulkstat(
	struct workqueue	*wq,
	xfs_agnumber_t		agno,
	void			*arg)
{
-	struct xfs_handle	handle;
-	char			descr[DESCR_BUFSZ];
-	struct xfs_inumbers_req	*ireq;
-	struct xfs_bulkstat_req	*breq;
-	struct scan_inodes	*si = arg;
+	struct xfs_handle	handle = { };
	struct scrub_ctx	*ctx = (struct scrub_ctx *)wq->wq_ctx;
+	struct scan_ichunk	*ichunk = arg;
+	struct xfs_inumbers_req	*ireq = ichunk_to_inumbers(ichunk);
+	struct xfs_bulkstat_req	*breq = ichunk_to_bulkstat(ichunk);
+	struct scan_inodes	*si = ichunk->si;
	struct xfs_bulkstat	*bs;
-	struct xfs_inumbers	*inumbers;
+	struct xfs_inumbers	*inumbers = &ireq->inumbers[0];
+	uint64_t		last_ino = 0;
	int			i;
	int			error;
	int			stale_count = 0;
+	DEFINE_DESCR(dsc_bulkstat, ctx, render_ino_from_bulkstat);
+	DEFINE_DESCR(dsc_inumbers, ctx, render_inumbers_from_agno);
-	snprintf(descr, DESCR_BUFSZ, _("dev %d:%d AG %u inodes"),
-				major(ctx->fsinfo.fs_datadev),
-				minor(ctx->fsinfo.fs_datadev),
-				agno);
+	descr_set(&dsc_inumbers, &agno);
	memcpy(&handle.ha_fsid, ctx->fshandle, sizeof(handle.ha_fsid));
	handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
			sizeof(handle.ha_fid.fid_len);
	handle.ha_fid.fid_pad = 0;
+	/* The producer already ran INUMBERS; inumbers[0] describes our chunk. */
+retry:
+	bulkstat_for_inumbers(ctx, &dsc_inumbers, inumbers, breq);
+
+	/* Iterate all the inodes. */
+	bs = &breq->bulkstat[0];
+	for (i = 0; !si->aborted && i < inumbers->xi_alloccount; i++, bs++) {
+		uint64_t	scan_ino = bs->bs_ino;
+
+		/* ensure forward progress if we retried */
+		if (scan_ino < last_ino)
+			continue;
+
+		descr_set(&dsc_bulkstat, bs);
+		handle.ha_fid.fid_ino = scan_ino;
+		handle.ha_fid.fid_gen = bs->bs_gen;
+		error = si->fn(ctx, &handle, bs, si->arg);
+		switch (error) {
+		case 0:
+			break;
+		case ESTALE: {
+			stale_count++;
+			if (stale_count < 30) {
+				ireq->hdr.ino = inumbers->xi_startino;
+				error = -xfrog_inumbers(&ctx->mnt, ireq);
+				if (error)
+					goto err;
+				goto retry;
+			}
+			str_info(ctx, descr_render(&dsc_bulkstat),
+_("Changed too many times during scan; giving up."));
+			si->aborted = true;
+			goto out;
+		}
+		case ECANCELED:
+			error = 0;
+			fallthrough;
+		default:
+			goto err;
+		}
+		if (scrub_excessive_errors(ctx)) {
+			si->aborted = true;
+			goto out;
+		}
+		last_ino = scan_ino;
	}
-	error = -xfrog_inumbers_alloc_req(1, 0, &ireq);
+err:
	if (error) {
-		str_liberror(ctx, error, descr);
-		free(breq);
+		str_liberror(ctx, error, descr_render(&dsc_bulkstat));
		si->aborted = true;
-		return;
	}
-	inumbers = &ireq->inumbers[0];
-	xfrog_inumbers_set_ag(ireq, agno);
+	/* This worker owns the ichunk and always frees it. */
+out:
+	free(ichunk);
+}
+
+/*
+ * Call INUMBERS for information about inode chunks, then queue the inumbers
+ * responses in the bulkstat workqueue.  This helps us maximize CPU parallelism
+ * if the filesystem AGs are not evenly loaded.
+ */
+static void
+scan_ag_inumbers(
+	struct workqueue	*wq,
+	xfs_agnumber_t		agno,
+	void			*arg)
+{
+	struct scan_ichunk	*ichunk = NULL;
+	struct scan_inodes	*si = arg;
+	struct scrub_ctx	*ctx = (struct scrub_ctx *)wq->wq_ctx;
+	struct xfs_inumbers_req	*ireq;
+	uint64_t		nextino = cvt_agino_to_ino(&ctx->mnt, agno, 0);
+	int			error;
+	DEFINE_DESCR(dsc, ctx, render_inumbers_from_agno);
+
+	descr_set(&dsc, &agno);
+
+	error = alloc_ichunk(si, agno, 0, &ichunk);
+	if (error)
+		goto err;
+	ireq = ichunk_to_inumbers(ichunk);
	/* Find the inode chunk & alloc mask */
	error = -xfrog_inumbers(&ctx->mnt, ireq);
	while (!error && !si->aborted && ireq->hdr.ocount > 0) {
		/*
-		 * We can have totally empty inode chunks on filesystems where
-		 * there are more than 64 inodes per block.  Skip these.
+		 * Make sure that we always make forward progress while we
+		 * scan the inode btree.
		 */
-		if (inumbers->xi_alloccount == 0)
-			goto igrp_retry;
-
-		bulkstat_for_inumbers(ctx, descr, inumbers, breq);
-
-		/* Iterate all the inodes. */
-		for (i = 0, bs = breq->bulkstat;
-		     !si->aborted && i < inumbers->xi_alloccount;
-		     i++, bs++) {
-			handle.ha_fid.fid_ino = bs->bs_ino;
-			handle.ha_fid.fid_gen = bs->bs_gen;
-			error = si->fn(ctx, &handle, bs, si->arg);
-			switch (error) {
-			case 0:
-				break;
-			case ESTALE: {
-				char idescr[DESCR_BUFSZ];
-
-				stale_count++;
-				if (stale_count < 30) {
-					ireq->hdr.ino = inumbers->xi_startino;
-					goto igrp_retry;
-				}
-				scrub_render_ino_descr(ctx, idescr, DESCR_BUFSZ,
-						bs->bs_ino, bs->bs_gen, NULL);
-				str_info(ctx, idescr,
-_("Changed too many times during scan; giving up."));
-				break;
-			}
-			case ECANCELED:
-				error = 0;
-				/* fall thru */
-			default:
-				goto err;
-			}
-			if (xfs_scrub_excessive_errors(ctx)) {
+		if (nextino > ireq->inumbers[0].xi_startino) {
+			/* NOTE(review): confirm %lu matches the width that cvt_ino_to_agino() returns. */
+			str_corrupt(ctx, descr_render(&dsc),
+_("AG %u inode btree is corrupt near agino %lu, got %lu"), agno,
+				cvt_ino_to_agino(&ctx->mnt, nextino),
+				cvt_ino_to_agino(&ctx->mnt,
+						ireq->inumbers[0].xi_startino));
+			si->aborted = true;
+			break;
+		}
+		nextino = ireq->hdr.ino;
+
+		if (ireq->inumbers[0].xi_alloccount == 0) {
+			/*
+			 * We can have totally empty inode chunks on
+			 * filesystems where there are more than 64 inodes per
+			 * block.  Skip these.
+			 */
+			;
+		} else if (si->nr_threads > 0) {
+			/* Queue this inode chunk on the bulkstat workqueue. */
+			error = -workqueue_add(&si->wq_bulkstat,
+					scan_ag_bulkstat, agno, ichunk);
+			if (error) {
				si->aborted = true;
+				str_liberror(ctx, error,
+						_("queueing bulkstat work"));
				goto out;
			}
+			/* Ownership of the ichunk passed to the worker. */
+			ichunk = NULL;
+		} else {
+			/*
+			 * Only one thread, call bulkstat directly.  Remember,
+			 * ichunk is freed by the worker before returning.
+			 */
+			scan_ag_bulkstat(wq, agno, ichunk);
+			ichunk = NULL;
+			if (si->aborted)
+				break;
+		}
+
+		if (!ichunk) {
+			error = alloc_ichunk(si, agno, nextino, &ichunk);
+			if (error)
+				goto err;
		}
+		ireq = ichunk_to_inumbers(ichunk);
-		stale_count = 0;
-igrp_retry:
		error = -xfrog_inumbers(&ctx->mnt, ireq);
	}
err:
	if (error) {
-		str_liberror(ctx, error, descr);
+		str_liberror(ctx, error, descr_render(&dsc));
		si->aborted = true;
	}
out:
-	free(ireq);
-	free(breq);
+	if (ichunk)
+		free(ichunk);
}
/*
	struct scan_inodes	si = {
		.fn		= fn,
		.arg		= arg,
+		.nr_threads	= scrub_nproc_workqueue(ctx),
	};
	xfs_agnumber_t		agno;
-	struct workqueue	wq;
+	struct workqueue	wq_inumbers;
+	unsigned int		max_bulkstat;
	int			ret;
-	ret = -workqueue_create(&wq, (struct xfs_mount *)ctx,
-			scrub_nproc_workqueue(ctx));
+	/*
+	 * The bulkstat workqueue should queue at most one inobt block's worth
+	 * of inode chunk records per worker thread.  If we're running in
+	 * single thread mode (nr_threads==0) then we skip the workqueues.
+	 */
+	/* NOTE(review): assumes 16 bytes per inobt record when sizing the bound — confirm. */
+	max_bulkstat = si.nr_threads * (ctx->mnt.fsgeom.blocksize / 16);
+
+	ret = -workqueue_create_bound(&si.wq_bulkstat, (struct xfs_mount *)ctx,
+			si.nr_threads, max_bulkstat);
	if (ret) {
		str_liberror(ctx, ret, _("creating bulkstat workqueue"));
		return -1;
	}
+	ret = -workqueue_create(&wq_inumbers, (struct xfs_mount *)ctx,
+			si.nr_threads);
+	if (ret) {
+		str_liberror(ctx, ret, _("creating inumbers workqueue"));
+		si.aborted = true;
+		goto kill_bulkstat;
+	}
+
	for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) {
-		ret = -workqueue_add(&wq, scan_ag_inodes, agno, &si);
+		ret = -workqueue_add(&wq_inumbers, scan_ag_inumbers, agno, &si);
		if (ret) {
			si.aborted = true;
-			str_liberror(ctx, ret, _("queueing bulkstat work"));
+			str_liberror(ctx, ret, _("queueing inumbers work"));
			break;
		}
	}
-	ret = -workqueue_terminate(&wq);
+	ret = -workqueue_terminate(&wq_inumbers);
+	if (ret) {
+		si.aborted = true;
+		str_liberror(ctx, ret, _("finishing inumbers work"));
+	}
+	workqueue_destroy(&wq_inumbers);
+
+	/* The bulkstat workqueue is drained even if inumbers setup failed. */
+kill_bulkstat:
+	ret = -workqueue_terminate(&si.wq_bulkstat);
	if (ret) {
		si.aborted = true;
		str_liberror(ctx, ret, _("finishing bulkstat work"));
	}
-	workqueue_destroy(&wq);
+	workqueue_destroy(&si.wq_bulkstat);
	return si.aborted ? -1 : 0;
}