1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (C) 2018-2024 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs.h"
7 #include <stdint.h>
8 #include <stdlib.h>
9 #include <pthread.h>
10 #include <sys/statvfs.h>
11 #include "platform_defs.h"
12 #include "xfs_arch.h"
13 #include "handle.h"
14 #include "libfrog/paths.h"
15 #include "libfrog/workqueue.h"
16 #include "xfs_scrub.h"
17 #include "common.h"
18 #include "inodes.h"
19 #include "descr.h"
20 #include "libfrog/fsgeom.h"
21 #include "libfrog/bulkstat.h"
22 #include "libfrog/handle_priv.h"
23 #include "bitops.h"
24 #include "libfrog/bitmask.h"
25
26 /*
27 * Iterate a range of inodes.
28 *
29 * This is a little more involved than repeatedly asking BULKSTAT for a
30 * buffer's worth of stat data for some number of inodes. We want to scan as
31 * many of the inodes as the inobt thinks there are, so we use the INUMBERS
32 * ioctl to walk all the inobt records in the filesystem and spawn a worker to
33 * bulkstat and iterate. The worker starts with an inumbers record that can
34 * look like this:
35 *
36 * {startino = S, allocmask = 0b11011}
37 *
38 * Given a starting inumber S and count C=64, bulkstat will return a sorted
39 * array of stat information. The bs_ino of those array elements can look like
40 * any of the following:
41 *
42 * 0. [S, S+1, S+3, S+4]
43 * 1. [S+e, S+e+1, S+e+3, S+e+4, S+e+C+1...], where e >= 0
44 * 2. [S+e+n], where n >= 0
45 * 3. []
46 * 4. [], errno == EFSCORRUPTED
47 *
48 * We know that bulkstat scanned the entire inode range between S and bs_ino of
49 * the last array element, even though it only fills out an array element for
50 * allocated inodes. Therefore, we can say in cases 0-2 that S was filled,
51 * even if there is no bstat[] record for S. In turn, we can create a bitmask
52 * of inodes that we have seen, and set bits 0 through (bstat[-1].bs_ino - S),
53 * being careful not to set any bits past S+C.
54 *
55 * In case (0) we find that the seen mask matches the inumber record
56 * exactly, so the caller can walk the stat records and move on. In case (1)
57 * this is also true, but we must be careful to reduce the array length to
58 * avoid scanning inodes that are not in the inumber chunk. In case (3) we
59 * conclude that there were no inodes left to scan and terminate.
60 *
61 * In case (2) and (4) we don't know why bulkstat returned fewer than C
62 * elements. We might have found the end of the filesystem, or the kernel
63 * might have found a corrupt inode and stopped. We must investigate this by
64 * trying to fill out the rest of the bstat array, starting with the next
65 * inumber after the last filled bstat array element and continuing until the
66 * bulkstat cursor S' goes beyond S + C or the array is full. Each time we
67 * succeed in loading new records, the kernel increases S' for us; if instead
68 * we encounter case (4), we can increment S' ourselves.
69 *
70 * Inodes that are set in the allocmask but not set in the seen mask are the
71 * corrupt inodes. For each such inode, we try to populate the bulkstat
72 * array one inode at a time. If the kernel returns a matching record we can
73 * use it; if instead we receive an error, we synthesize enough of a record
74 * to be able to run online scrub by handle.
75 *
76 * If the iteration function returns ESTALE, that means that the inode has
77 * been deleted and possibly recreated since the BULKSTAT call. We will
78 * refresh the stat information and try again up to 30 times before reporting
79 * the staleness as an error.
80 */
81
82 /*
83 * Return the inumber of the highest inode in the bulkstat data, assuming the
84 * records are sorted in inumber order.
85 */
86 static inline uint64_t last_bstat_ino(const struct xfs_bulkstat_req *b)
87 {
88 return b->hdr.ocount ? b->bulkstat[b->hdr.ocount - 1].bs_ino : 0;
89 }
90
91 /*
92 * Deduce the bitmask of the inodes in inums that were seen by bulkstat. If
93 * the inode is present in the bstat array, this is trivially true; if it is
94 * not in the array but higher inumbers are present, then it was freed.
95 */
96 static __u64
97 seen_mask_from_bulkstat(
98 const struct xfs_inumbers *inums,
99 __u64 breq_startino,
100 const struct xfs_bulkstat_req *breq)
101 {
102 const __u64 limit_ino =
103 inums->xi_startino + LIBFROG_BULKSTAT_CHUNKSIZE;
104 const __u64 last = last_bstat_ino(breq);
105 __u64 ret = 0;
106 int i, maxi;
107
108 /* Ignore the bulkstat results if they don't cover inumbers */
109 if (breq_startino > limit_ino || last < inums->xi_startino)
110 return 0;
111
112 maxi = min(LIBFROG_BULKSTAT_CHUNKSIZE, last - inums->xi_startino + 1);
113 for (i = breq_startino - inums->xi_startino; i < maxi; i++)
114 ret |= 1ULL << i;
115
116 return ret;
117 }
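
/*
 * Worked example (illustration only): for the inumbers record
 * {startino = S, allocmask = 0b11011} from the comment at the top of this
 * file, a bulkstat that starts at S and returns [S, S+1, S+3, S+4] gives
 * last == S+4, so maxi == 5 and the seen mask is 0b11111 -- every inumber
 * up to S+4 was covered, including the freed one at S+2. In that case
 * (xi_allocmask & ~seen_mask) == 0 and nothing needs single-stepping.
 */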
118
119 /*
120 * Try to fill the rest of orig_breq with bulkstat data by re-running bulkstat
121 * with increasing start_ino until we either hit the end of the inumbers info
122 * or fill up the bstat array with something. Returns a bitmask of the inodes
123 * within inums that were filled by the bulkstat requests.
124 */
125 static __u64
126 bulkstat_the_rest(
127 struct scrub_ctx *ctx,
128 const struct xfs_inumbers *inums,
129 struct xfs_bulkstat_req *orig_breq,
130 int orig_error)
131 {
132 struct xfs_bulkstat_req *new_breq;
133 struct xfs_bulkstat *old_bstat =
134 &orig_breq->bulkstat[orig_breq->hdr.ocount];
135 const __u64 limit_ino =
136 inums->xi_startino + LIBFROG_BULKSTAT_CHUNKSIZE;
137 __u64 start_ino = orig_breq->hdr.ino;
138 __u64 seen_mask = 0;
139 int error;
140
141 assert(orig_breq->hdr.ocount < orig_breq->hdr.icount);
142
143 /*
144 * If the first bulkstat returned a corruption error, that means
145 * start_ino is corrupt. Restart instead at the next inumber.
146 */
147 if (orig_error == EFSCORRUPTED)
148 start_ino++;
149 if (start_ino >= limit_ino)
150 return 0;
151
152 error = -xfrog_bulkstat_alloc_req(
153 orig_breq->hdr.icount - orig_breq->hdr.ocount,
154 start_ino, &new_breq);
155 if (error)
156 return 0; /* fall back to single-stepping the unseen inodes */
157 new_breq->hdr.flags = orig_breq->hdr.flags;
158
159 do {
160 /*
161 * Fill the new bulkstat request with stat data starting at
162 * start_ino.
163 */
164 error = -xfrog_bulkstat(&ctx->mnt, new_breq);
165 if (error == EFSCORRUPTED) {
166 /*
167 * start_ino is corrupt, increment and try the next
168 * inode.
169 */
170 start_ino++;
171 new_breq->hdr.ino = start_ino;
172 continue;
173 }
174 if (error) {
175 /*
176 * Any other error means the caller falls back to
177 * single stepping.
178 */
179 break;
180 }
181 if (new_breq->hdr.ocount == 0)
182 break;
183
184 /* Copy new results to the original bstat buffer */
185 memcpy(old_bstat, new_breq->bulkstat,
186 new_breq->hdr.ocount * sizeof(struct xfs_bulkstat));
187 orig_breq->hdr.ocount += new_breq->hdr.ocount;
188 old_bstat += new_breq->hdr.ocount;
189 seen_mask |= seen_mask_from_bulkstat(inums, start_ino,
190 new_breq);
191
192 new_breq->hdr.icount -= new_breq->hdr.ocount;
193 start_ino = new_breq->hdr.ino;
194 } while (new_breq->hdr.icount > 0 && new_breq->hdr.ino < limit_ino);
195
196 free(new_breq);
197 return seen_mask;
198 }
199
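/* Branchless three-way comparison: yields -1, 0, or 1 without overflow. */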
200 #define cmp_int(l, r) ((l > r) - (l < r))
201
202 /* Compare two bulkstat records by inumber. */
203 static int
204 compare_bstat(
205 const void *a,
206 const void *b)
207 {
208 const struct xfs_bulkstat *ba = a;
209 const struct xfs_bulkstat *bb = b;
210
211 return cmp_int(ba->bs_ino, bb->bs_ino);
212 }
213
214 /*
215 * Walk the xi_allocmask looking for set bits that aren't present in
216 * the seen mask. For each such inode, fill the entries at the end of
217 * the array with stat information one at a time, synthesizing them if
218 * necessary. At this point, (xi_allocmask & ~seen_mask) should be the
219 * corrupt inodes.
220 */
221 static void
222 bulkstat_single_step(
223 struct scrub_ctx *ctx,
224 const struct xfs_inumbers *inumbers,
225 uint64_t seen_mask,
226 struct xfs_bulkstat_req *breq)
227 {
228 struct xfs_bulkstat *bs = NULL;
229 int i;
230 int error;
231
232 for (i = 0; i < LIBFROG_BULKSTAT_CHUNKSIZE; i++) {
233 /*
234 * Don't single-step if inumbers said it wasn't allocated or
235 * bulkstat actually filled it.
236 */
237 if (!(inumbers->xi_allocmask & (1ULL << i)))
238 continue;
239 if (seen_mask & (1ULL << i))
240 continue;
241
242 assert(breq->hdr.ocount < LIBFROG_BULKSTAT_CHUNKSIZE);
243
244 if (!bs)
245 bs = &breq->bulkstat[breq->hdr.ocount];
246
247 /*
248 * We didn't get the desired stat data and we've hit the end of the
249 * returned data. We can't distinguish between the inode being
250 * freed vs. the inode being too corrupt to load, so try a
251 * bulkstat single to see if we can load the inode.
252 */
253 error = -xfrog_bulkstat_single(&ctx->mnt,
254 inumbers->xi_startino + i, breq->hdr.flags, bs);
255 switch (error) {
256 case ENOENT:
257 /*
258 * This inode wasn't found, and no results were
259 * returned. We've likely hit the end of the
260 * filesystem, but we'll move on to the next inode in
261 * the mask for the sake of caution.
262 */
263 continue;
264 case 0:
265 /*
266 * If a result was returned but it wasn't the inode
267 * we were looking for, then the missing inode was
268 * freed. Move on to the next inode in the mask.
269 */
270 if (bs->bs_ino != inumbers->xi_startino + i)
271 continue;
272 break;
273 default:
274 /*
275 * Some error happened. Synthesize a bulkstat record
276 * so that phase3 can try to see if there's a corrupt
277 * inode that needs repairing.
278 */
279 memset(bs, 0, sizeof(struct xfs_bulkstat));
280 bs->bs_ino = inumbers->xi_startino + i;
281 bs->bs_blksize = ctx->mnt_sv.f_frsize;
282 break;
283 }
284
285 breq->hdr.ocount++;
286 bs++;
287 }
288
289 /* If we added any entries, re-sort the array. */
290 if (bs)
291 qsort(breq->bulkstat, breq->hdr.ocount,
292 sizeof(struct xfs_bulkstat), compare_bstat);
293 }
294
295 /* Return the inumber of the highest allocated inode in the inumbers data. */
296 static inline uint64_t last_allocmask_ino(const struct xfs_inumbers *i)
297 {
298 return i->xi_startino + xfrog_highbit64(i->xi_allocmask);
299 }
300
301 /*
302 * Run bulkstat on an entire inode allocation group, then check that we got
303 * exactly the inodes we expected. If not, load them one at a time (or fake
304 * it) into the bulkstat data.
305 */
306 static void
307 bulkstat_for_inumbers(
308 struct scrub_ctx *ctx,
309 const struct xfs_inumbers *inumbers,
310 struct xfs_bulkstat_req *breq)
311 {
312 const uint64_t limit_ino =
313 inumbers->xi_startino + LIBFROG_BULKSTAT_CHUNKSIZE;
314 uint64_t seen_mask = 0;
315 int i;
316 int error;
317
318 assert(inumbers->xi_allocmask != 0);
319
320 /* First we try regular bulkstat, for speed. */
321 breq->hdr.ino = inumbers->xi_startino;
322 error = -xfrog_bulkstat(&ctx->mnt, breq);
323 if (!error) {
324 if (!breq->hdr.ocount)
325 return;
326 seen_mask |= seen_mask_from_bulkstat(inumbers,
327 inumbers->xi_startino, breq);
328 }
329
330 /*
331 * If the last allocated inode as reported by inumbers is higher than
332 * the last inode reported by bulkstat, two things could have happened.
333 * Either all the inodes at the high end of the cluster were freed
334 * since the inumbers call; or bulkstat encountered a corrupt inode and
335 * returned early. Try to bulkstat the rest of the array.
336 */
337 if (last_allocmask_ino(inumbers) > last_bstat_ino(breq))
338 seen_mask |= bulkstat_the_rest(ctx, inumbers, breq, error);
339
340 /*
341 * Bulkstat might return inodes beyond xi_startino + CHUNKSIZE. Reduce
342 * ocount to ignore inodes not described by the inumbers record.
343 */
344 for (i = breq->hdr.ocount - 1; i >= 0; i--) {
345 if (breq->bulkstat[i].bs_ino < limit_ino)
346 break;
347 breq->hdr.ocount--;
348 }
349
350 /*
351 * Fill in any missing inodes that are mentioned in the alloc mask but
352 * weren't previously seen by bulkstat. These are the corrupt inodes.
353 */
354 bulkstat_single_step(ctx, inumbers, seen_mask, breq);
355 }
356
357 /* BULKSTAT wrapper routines. */
358 struct scan_inodes {
359 struct workqueue wq_bulkstat;
360 scrub_inode_iter_fn fn;
361 void *arg;
362 unsigned int nr_threads;
363 bool aborted;
364 };
365
366 /*
367 * A single unit of inode scan work. This contains a pointer to the parent
368 * information, followed by an INUMBERS request structure, followed by a
369 * BULKSTAT request structure. The last two are VLAs, so we can't represent
370 * them here.
371 */
372 struct scan_ichunk {
373 struct scan_inodes *si;
374 };
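
/*
 * Layout of the buffer that alloc_ichunk() below allocates:
 *
 *   [struct scan_ichunk][XFS_INUMBERS_REQ_SIZE(1)][XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE)]
 *    ^ ichunk            ^ ichunk_to_inumbers()    ^ ichunk_to_bulkstat()
 */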
375
376 static inline struct xfs_inumbers_req *
377 ichunk_to_inumbers(
378 struct scan_ichunk *ichunk)
379 {
380 char *p = (char *)ichunk;
381
382 return (struct xfs_inumbers_req *)(p + sizeof(struct scan_ichunk));
383 }
384
385 static inline struct xfs_bulkstat_req *
386 ichunk_to_bulkstat(
387 struct scan_ichunk *ichunk)
388 {
389 char *p = (char *)ichunk_to_inumbers(ichunk);
390
391 return (struct xfs_bulkstat_req *)(p + XFS_INUMBERS_REQ_SIZE(1));
392 }
393
394 static inline int
395 alloc_ichunk(
396 struct scrub_ctx *ctx,
397 struct scan_inodes *si,
398 uint32_t agno,
399 uint64_t startino,
400 struct scan_ichunk **ichunkp)
401 {
402 struct scan_ichunk *ichunk;
403 struct xfs_inumbers_req *ireq;
404 struct xfs_bulkstat_req *breq;
405
406 ichunk = calloc(1, sizeof(struct scan_ichunk) +
407 XFS_INUMBERS_REQ_SIZE(1) +
408 XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE));
409 if (!ichunk)
410 return -errno;
411
412 ichunk->si = si;
413
414 ireq = ichunk_to_inumbers(ichunk);
415 ireq->hdr.icount = 1;
416 ireq->hdr.ino = startino;
417 ireq->hdr.agno = agno;
418 ireq->hdr.flags |= XFS_BULK_IREQ_AGNO;
419
420 breq = ichunk_to_bulkstat(ichunk);
421 breq->hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE;
422
423 /* Scan the metadata directory tree too. */
424 if (ctx->mnt.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_METADIR)
425 breq->hdr.flags |= XFS_BULK_IREQ_METADIR;
426
427 *ichunkp = ichunk;
428 return 0;
429 }
430
431 static int
432 render_ino_from_bulkstat(
433 struct scrub_ctx *ctx,
434 char *buf,
435 size_t buflen,
436 void *data)
437 {
438 struct xfs_bulkstat *bstat = data;
439
440 return scrub_render_ino_descr(ctx, buf, buflen, bstat->bs_ino,
441 bstat->bs_gen, NULL);
442 }
443
444 static int
445 render_inumbers_from_agno(
446 struct scrub_ctx *ctx,
447 char *buf,
448 size_t buflen,
449 void *data)
450 {
451 xfs_agnumber_t *agno = data;
452
453 return snprintf(buf, buflen, _("dev %d:%d AG %u inodes"),
454 major(ctx->fsinfo.fs_datadev),
455 minor(ctx->fsinfo.fs_datadev),
456 *agno);
457 }
458
459 /*
460 * Call BULKSTAT for information on a single chunk's worth of inodes and call
461 * our iterator function. We'll try to fill the bulkstat information in
462 * batches, but we can also detect iget failures.
463 */
464 static void
465 scan_ag_bulkstat(
466 struct workqueue *wq,
467 xfs_agnumber_t agno,
468 void *arg)
469 {
470 struct xfs_handle handle;
471 struct scrub_ctx *ctx = (struct scrub_ctx *)wq->wq_ctx;
472 struct scan_ichunk *ichunk = arg;
473 struct xfs_inumbers_req *ireq = ichunk_to_inumbers(ichunk);
474 struct xfs_bulkstat_req *breq = ichunk_to_bulkstat(ichunk);
475 struct scan_inodes *si = ichunk->si;
476 struct xfs_bulkstat *bs = &breq->bulkstat[0];
477 struct xfs_inumbers *inumbers = &ireq->inumbers[0];
478 uint64_t last_ino = 0;
479 int i;
480 int error;
481 int stale_count = 0;
482 DEFINE_DESCR(dsc_bulkstat, ctx, render_ino_from_bulkstat);
483 DEFINE_DESCR(dsc_inumbers, ctx, render_inumbers_from_agno);
484
485 descr_set(&dsc_inumbers, &agno);
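/*
 * Build a handle template from the filesystem handle; the per-inode part
 * is filled in from each bulkstat record in the loop below.
 */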
486 handle_from_fshandle(&handle, ctx->fshandle, ctx->fshandle_len);
487 retry:
488 bulkstat_for_inumbers(ctx, inumbers, breq);
489
490 /* Iterate all the inodes. */
491 for (i = 0; !si->aborted && i < breq->hdr.ocount; i++, bs++) {
492 uint64_t scan_ino = bs->bs_ino;
493
494 /* ensure forward progress if we retried */
495 if (scan_ino < last_ino)
496 continue;
497
498 descr_set(&dsc_bulkstat, bs);
499 handle_from_bulkstat(&handle, bs);
500 error = si->fn(ctx, &handle, bs, si->arg);
501 switch (error) {
502 case 0:
503 break;
504 case ESTALE: {
505 stale_count++;
506 if (stale_count < 30) {
507 uint64_t old_startino;
508
509 ireq->hdr.ino = old_startino =
510 inumbers->xi_startino;
511 error = -xfrog_inumbers(&ctx->mnt, ireq);
512 if (error)
513 goto err;
514 /*
515 * Retry only if inumbers returns the same
516 * inobt record as the previous record and
517 * there are allocated inodes in it.
518 */
519 if (!si->aborted &&
520 ireq->hdr.ocount > 0 &&
521 inumbers->xi_alloccount > 0 &&
522 inumbers->xi_startino == old_startino)
523 goto retry;
524 goto out;
525 }
526 str_info(ctx, descr_render(&dsc_bulkstat),
527 _("Changed too many times during scan; giving up."));
528 si->aborted = true;
529 goto out;
530 }
531 case ECANCELED:
532 error = 0;
533 fallthrough;
534 default:
535 goto err;
536 }
537 if (scrub_excessive_errors(ctx)) {
538 si->aborted = true;
539 goto out;
540 }
541 last_ino = scan_ino;
542 }
543
544 err:
545 if (error) {
546 str_liberror(ctx, error, descr_render(&dsc_bulkstat));
547 si->aborted = true;
548 }
549 out:
550 free(ichunk);
551 }
552
553 /*
554 * Call INUMBERS for information about inode chunks, then queue the inumbers
555 * responses in the bulkstat workqueue. This helps us maximize CPU parallelism
556 * if the filesystem AGs are not evenly loaded.
557 */
558 static void
559 scan_ag_inumbers(
560 struct workqueue *wq,
561 xfs_agnumber_t agno,
562 void *arg)
563 {
564 struct scan_ichunk *ichunk = NULL;
565 struct scan_inodes *si = arg;
566 struct scrub_ctx *ctx = (struct scrub_ctx *)wq->wq_ctx;
567 struct xfs_inumbers_req *ireq;
568 uint64_t nextino = cvt_agino_to_ino(&ctx->mnt, agno, 0);
569 int error;
570 DEFINE_DESCR(dsc, ctx, render_inumbers_from_agno);
571
572 descr_set(&dsc, &agno);
573
574 error = alloc_ichunk(ctx, si, agno, 0, &ichunk);
575 if (error)
576 goto err;
577 ireq = ichunk_to_inumbers(ichunk);
578
579 /* Find the inode chunk & alloc mask */
580 error = -xfrog_inumbers(&ctx->mnt, ireq);
581 while (!error && !si->aborted && ireq->hdr.ocount > 0) {
582 /*
583 * Make sure that we always make forward progress while we
584 * scan the inode btree.
585 */
586 if (nextino > ireq->inumbers[0].xi_startino) {
587 str_corrupt(ctx, descr_render(&dsc),
588 _("AG %u inode btree is corrupt near agino %lu, got %lu"), agno,
589 cvt_ino_to_agino(&ctx->mnt, nextino),
590 cvt_ino_to_agino(&ctx->mnt,
591 ireq->inumbers[0].xi_startino));
592 si->aborted = true;
593 break;
594 }
595 nextino = ireq->hdr.ino;
596
597 if (ireq->inumbers[0].xi_alloccount == 0) {
598 /*
599 * We can have totally empty inode chunks on
600 * filesystems where there are more than 64 inodes per
601 * block. Skip these.
602 */
603 ;
604 } else if (si->nr_threads > 0) {
605 /* Queue this inode chunk on the bulkstat workqueue. */
606 error = -workqueue_add(&si->wq_bulkstat,
607 scan_ag_bulkstat, agno, ichunk);
608 if (error) {
609 si->aborted = true;
610 str_liberror(ctx, error,
611 _("queueing bulkstat work"));
612 goto out;
613 }
614 ichunk = NULL;
615 } else {
616 /*
617 * Only one thread, call bulkstat directly. Remember,
618 * ichunk is freed by the worker before returning.
619 */
620 scan_ag_bulkstat(wq, agno, ichunk);
621 ichunk = NULL;
622 if (si->aborted)
623 break;
624 }
625
626 if (!ichunk) {
627 error = alloc_ichunk(ctx, si, agno, nextino, &ichunk);
628 if (error)
629 goto err;
630 }
631 ireq = ichunk_to_inumbers(ichunk);
632
633 error = -xfrog_inumbers(&ctx->mnt, ireq);
634 }
635
636 err:
637 if (error) {
638 str_liberror(ctx, error, descr_render(&dsc));
639 si->aborted = true;
640 }
641 out:
642 if (ichunk)
643 free(ichunk);
644 }
645
646 /*
647 * Scan all the inodes in a filesystem, including metadata directory files and
648 * broken files. On error, this function will log an error message and return
649 * -1.
650 */
651 int
652 scrub_scan_all_inodes(
653 struct scrub_ctx *ctx,
654 scrub_inode_iter_fn fn,
655 void *arg)
656 {
657 struct scan_inodes si = {
658 .fn = fn,
659 .arg = arg,
660 .nr_threads = scrub_nproc_workqueue(ctx),
661 };
662 xfs_agnumber_t agno;
663 struct workqueue wq_inumbers;
664 unsigned int max_bulkstat;
665 int ret;
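
/*
 * This is a two-stage scan: wq_inumbers runs one INUMBERS walker per AG,
 * and each inode chunk found is handed to wq_bulkstat, whose workers
 * bulkstat the chunk and call fn on every inode in it.
 */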
666
667 /*
668 * The bulkstat workqueue should queue at most one inobt block's worth
669 * of inode chunk records per worker thread. If we're running in
670 * single thread mode (nr_threads==0) then we skip the workqueues.
671 */
672 max_bulkstat = si.nr_threads * (ctx->mnt.fsgeom.blocksize / 16);
673
674 ret = -workqueue_create_bound(&si.wq_bulkstat, (struct xfs_mount *)ctx,
675 si.nr_threads, max_bulkstat);
676 if (ret) {
677 str_liberror(ctx, ret, _("creating bulkstat workqueue"));
678 return -1;
679 }
680
681 ret = -workqueue_create(&wq_inumbers, (struct xfs_mount *)ctx,
682 si.nr_threads);
683 if (ret) {
684 str_liberror(ctx, ret, _("creating inumbers workqueue"));
685 si.aborted = true;
686 goto kill_bulkstat;
687 }
688
689 for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) {
690 ret = -workqueue_add(&wq_inumbers, scan_ag_inumbers, agno, &si);
691 if (ret) {
692 si.aborted = true;
693 str_liberror(ctx, ret, _("queueing inumbers work"));
694 break;
695 }
696 }
697
698 ret = -workqueue_terminate(&wq_inumbers);
699 if (ret) {
700 si.aborted = true;
701 str_liberror(ctx, ret, _("finishing inumbers work"));
702 }
703 workqueue_destroy(&wq_inumbers);
704
705 kill_bulkstat:
706 ret = -workqueue_terminate(&si.wq_bulkstat);
707 if (ret) {
708 si.aborted = true;
709 str_liberror(ctx, ret, _("finishing bulkstat work"));
710 }
711 workqueue_destroy(&si.wq_bulkstat);
712
713 return si.aborted ? -1 : 0;
714 }
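
/*
 * Example caller (a sketch only; example_iter_fn and its body are
 * hypothetical and not part of this file). A scrub phase passes a
 * scrub_inode_iter_fn that receives the per-inode file handle and bulkstat
 * record, and returns 0 to keep scanning, ECANCELED to stop iterating
 * without reporting an error, or ESTALE to ask for a refresh of the
 * inode chunk and a retry.
 */
#if 0
static int
example_iter_fn(
	struct scrub_ctx	*ctx,
	struct xfs_handle	*handle,
	struct xfs_bulkstat	*bstat,
	void			*arg)
{
	int			fd;

	fd = scrub_open_handle(handle);
	if (fd < 0)
		return 0;	/* or ESTALE to force a refresh and retry */
	/* ...examine the file described by bstat... */
	close(fd);
	return 0;
}

	/* in a phase driver: */
	if (scrub_scan_all_inodes(ctx, example_iter_fn, NULL) < 0)
		return -1;
#endif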
715
716 struct user_bulkstat {
717 struct scan_inodes *si;
718
719 /* vla, must be last */
720 struct xfs_bulkstat_req breq;
721 };
722
723 /* Iterate all the user files returned by a bulkstat. */
724 static void
725 scan_user_files(
726 struct workqueue *wq,
727 xfs_agnumber_t agno,
728 void *arg)
729 {
730 struct xfs_handle handle;
731 struct scrub_ctx *ctx = (struct scrub_ctx *)wq->wq_ctx;
732 struct user_bulkstat *ureq = arg;
733 struct xfs_bulkstat *bs = &ureq->breq.bulkstat[0];
734 struct scan_inodes *si = ureq->si;
735 int i;
736 int error = 0;
737 DEFINE_DESCR(dsc_bulkstat, ctx, render_ino_from_bulkstat);
738
739 handle_from_fshandle(&handle, ctx->fshandle, ctx->fshandle_len);
740
741 for (i = 0; !si->aborted && i < ureq->breq.hdr.ocount; i++, bs++) {
742 descr_set(&dsc_bulkstat, bs);
743 handle_from_bulkstat(&handle, bs);
744 error = si->fn(ctx, &handle, bs, si->arg);
745 switch (error) {
746 case 0:
747 break;
748 case ESTALE:
749 case ECANCELED:
750 error = 0;
751 fallthrough;
752 default:
753 goto err;
754 }
755 if (scrub_excessive_errors(ctx)) {
756 si->aborted = true;
757 goto out;
758 }
759 }
760
761 err:
762 if (error) {
763 str_liberror(ctx, error, descr_render(&dsc_bulkstat));
764 si->aborted = true;
765 }
766 out:
767 free(ureq);
768 }
769
770 /*
771 * Run one step of the user files bulkstat scan and schedule background
772 * processing of the stat data returned. Returns 1 to keep going, or 0 to
773 * stop.
774 */
775 static int
776 scan_user_bulkstat(
777 struct scrub_ctx *ctx,
778 struct scan_inodes *si,
779 uint64_t *cursor)
780 {
781 struct user_bulkstat *ureq;
782 const char *what = NULL;
783 int ret;
784
785 ureq = calloc(1, sizeof(struct user_bulkstat) +
786 XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE));
787 if (!ureq) {
788 ret = ENOMEM;
789 what = _("creating bulkstat work item");
790 goto err;
791 }
792 ureq->si = si;
793 ureq->breq.hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE;
794 ureq->breq.hdr.ino = *cursor;
795
796 ret = -xfrog_bulkstat(&ctx->mnt, &ureq->breq);
797 if (ret) {
798 what = _("user files bulkstat");
799 goto err_ureq;
800 }
801 if (ureq->breq.hdr.ocount == 0) {
802 *cursor = NULLFSINO;
803 free(ureq);
804 return 0;
805 }
806
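/*
 * The kernel advanced hdr.ino past the last inode it returned, so it can be
 * used directly as the cursor for the next call.
 */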
807 *cursor = ureq->breq.hdr.ino;
808
809 /* scan_user_files frees ureq; do not access it */
810 ret = -workqueue_add(&si->wq_bulkstat, scan_user_files, 0, ureq);
811 if (ret) {
812 what = _("queueing bulkstat work");
813 goto err_ureq;
814 }
815 ureq = NULL;
816
817 return 1;
818
819 err_ureq:
820 free(ureq);
821 err:
822 si->aborted = true;
823 str_liberror(ctx, ret, what);
824 return 0;
825 }
826
827 /*
828 * Scan all the user files in a filesystem in inumber order. On error, this
829 * function will log an error message and return -1.
830 */
831 int
832 scrub_scan_user_files(
833 struct scrub_ctx *ctx,
834 scrub_inode_iter_fn fn,
835 void *arg)
836 {
837 struct scan_inodes si = {
838 .fn = fn,
839 .arg = arg,
840 .nr_threads = scrub_nproc_workqueue(ctx),
841 };
842 uint64_t ino = 0;
843 int ret;
844
845 /* Queue up to four bulkstat result sets per thread. */
846 ret = -workqueue_create_bound(&si.wq_bulkstat, (struct xfs_mount *)ctx,
847 si.nr_threads, si.nr_threads * 4);
848 if (ret) {
849 str_liberror(ctx, ret, _("creating bulkstat workqueue"));
850 return -1;
851 }
852
853 while ((ret = scan_user_bulkstat(ctx, &si, &ino)) == 1) {
854 /* empty */
855 }
856
857 ret = -workqueue_terminate(&si.wq_bulkstat);
858 if (ret) {
859 si.aborted = true;
860 str_liberror(ctx, ret, _("finishing bulkstat work"));
861 }
862 workqueue_destroy(&si.wq_bulkstat);
863
864 return si.aborted ? -1 : 0;
865 }
866
867 /* Open a file by handle, returning either the fd or -1 on error. */
868 int
869 scrub_open_handle(
870 struct xfs_handle *handle)
871 {
872 return open_by_fshandle(handle, sizeof(*handle),
873 O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NOCTTY);
874 }