xfs_scrub: don't blow away new inodes in bulkstat_single_step

author Darrick J. Wong <djwong@kernel.org>

Mon, 24 Feb 2025 18:21:43 +0000 (10:21 -0800)

committer Darrick J. Wong <djwong@kernel.org>

Tue, 25 Feb 2025 17:15:57 +0000 (09:15 -0800)
author Darrick J. Wong <djwong@kernel.org>
Mon, 24 Feb 2025 18:21:43 +0000 (10:21 -0800)
committer Darrick J. Wong <djwong@kernel.org>
Tue, 25 Feb 2025 17:15:57 +0000 (09:15 -0800)
diff --git a/scrub/inodes.c b/scrub/inodes.c

index 3b9026ce8fa2f4811a1c16b077306f499d755878..ffdf0f2ae42c17fbf8f3c959ec8ce084b30421e2 100644 (file)
--- a/scrub/inodes.c
+++ b/scrub/inodes.c
@@ -26,16 +26,41 @@
   *
   * This is a little more involved than repeatedly asking BULKSTAT for a
   * buffer's worth of stat data for some number of inodes.  We want to scan as
- * many of the inodes that the inobt thinks there are, including the ones that
- * are broken, but if we ask for n inodes starting at x, it'll skip the bad
- * ones and fill from beyond the range (x + n).
+ * many of the inodes that the inobt thinks there are, so we use the INUMBERS
+ * ioctl to walk all the inobt records in the filesystem and spawn a worker to
+ * bulkstat and iterate.  The worker starts with an inumbers record that can
+ * look like this:
   *
- * Therefore, we ask INUMBERS to return one inobt chunk's worth of inode
- * bitmap information.  Then we try to BULKSTAT only the inodes that were
- * present in that chunk, and compare what we got against what INUMBERS said
- * was there.  If there's a mismatch, we know that we have an inode that fails
- * the verifiers but we can inject the bulkstat information to force the scrub
- * code to deal with the broken inodes.
+ * {startino = S, allocmask = 0b11011}
+ *
+ * Given a starting inumber S and count C=64, bulkstat will return a sorted
+ * array of stat information.  The bs_ino of those array elements can look like
+ * any of the following:
+ *
+ * 0. [S, S+1, S+3, S+4]
+ * 1. [S+e, S+e+1, S+e+3, S+e+4, S+e+C+1...], where e >= 0
+ * 2. [S+e+n], where n >= 0
+ * 3. []
+ * 4. [], errno == EFSCORRUPTED
+ *
+ * We know that bulkstat scanned the entire inode range between S and bs_ino of
+ * the last array element, even though it only fills out an array element for
+ * allocated inodes.  Therefore, we can say in cases 0-2 that S was filled,
+ * even if there is no bstat[] record for S.  In turn, we can create a bitmask
+ * of inodes that we have seen, and set bits 0 through (bstat[-1].bs_ino - S),
+ * being careful not to set any bits past S+C.
+ *
+ * In case (0) we find that seen mask matches the inumber record
+ * exactly, so the caller can walk the stat records and move on.  In case (1)
+ * this is also true, but we must be careful to reduce the array length to
+ * avoid scanning inodes that are not in the inumber chunk.  In case (3) we
+ * conclude that there were no inodes left to scan and terminate.
+ *
+ * Inodes that are set in the allocmask but not set in the seen mask are the
+ * corrupt inodes.  For each of these cases, we try to populate the bulkstat
+ * array one inode at a time.  If the kernel returns a matching record we can
+ * use it; if instead we receive an error, we synthesize enough of a record
+ * to be able to run online scrub by handle.
   *
   * If the iteration function returns ESTALE, that means that the inode has
   * been deleted and possibly recreated since the BULKSTAT call.  We wil
@@ -43,6 +68,57 @@
   * the staleness as an error.
   */
  
+/*
+ * Return the inumber of the highest inode in the bulkstat data, assuming the
+ * records are sorted in inumber order.
+ */
+static inline uint64_t last_bstat_ino(const struct xfs_bulkstat_req *b)
+{
+       return b->hdr.ocount ? b->bulkstat[b->hdr.ocount - 1].bs_ino : 0;
+}
+
+/*
+ * Deduce the bitmask of the inodes in inums that were seen by bulkstat.  If
+ * the inode is present in the bstat array this is trivially true; or if it is
+ * not in the array but higher inumbers are present, then it was freed.
+ */
+static __u64
+seen_mask_from_bulkstat(
+       const struct xfs_inumbers       *inums,
+       __u64                           breq_startino,
+       const struct xfs_bulkstat_req   *breq)
+{
+       const __u64                     limit_ino =
+               inums->xi_startino + LIBFROG_BULKSTAT_CHUNKSIZE;
+       const __u64                     last = last_bstat_ino(breq);
+       __u64                           ret = 0;
+       int                             i, maxi;
+
+       /* Ignore the bulkstat results if they don't cover inumbers */
+       if (breq_startino > limit_ino || last < inums->xi_startino)
+               return 0;
+
+       maxi = min(LIBFROG_BULKSTAT_CHUNKSIZE, last - inums->xi_startino + 1);
+       for (i = breq_startino - inums->xi_startino; i < maxi; i++)
+               ret |= 1ULL << i;
+
+       return ret;
+}
+
+#define cmp_int(l, r)          ((l > r) - (l < r))
+
+/* Compare two bulkstat records by inumber. */
+static int
+compare_bstat(
+       const void              *a,
+       const void              *b)
+{
+       const struct xfs_bulkstat *ba = a;
+       const struct xfs_bulkstat *bb = b;
+
+       return cmp_int(ba->bs_ino, bb->bs_ino);
+}
+
  /*
   * Run bulkstat on an entire inode allocation group, then check that we got
   * exactly the inodes we expected.  If not, load them one at a time (or fake
@@ -54,10 +130,10 @@ bulkstat_for_inumbers(
         const struct xfs_inumbers       *inumbers,
         struct xfs_bulkstat_req         *breq)
  {
-       struct xfs_bulkstat             *bstat = breq->bulkstat;
-       struct xfs_bulkstat             *bs;
+       struct xfs_bulkstat             *bs = NULL;
         const uint64_t                  limit_ino =
                 inumbers->xi_startino + LIBFROG_BULKSTAT_CHUNKSIZE;
+       uint64_t                        seen_mask = 0;
         int                             i;
         int                             error;
  
@@ -66,8 +142,12 @@ bulkstat_for_inumbers(
         /* First we try regular bulkstat, for speed. */
         breq->hdr.ino = inumbers->xi_startino;
         error = -xfrog_bulkstat(&ctx->mnt, breq);
-       if (!error && !breq->hdr.ocount)
-               return;
+       if (!error) {
+               if (!breq->hdr.ocount)
+                       return;
+               seen_mask |= seen_mask_from_bulkstat(inumbers,
+                                       inumbers->xi_startino, breq);
+       }
  
         /*
          * Bulkstat might return inodes beyond xi_startino + CHUNKSIZE.  Reduce
@@ -80,18 +160,33 @@ bulkstat_for_inumbers(
         }
  
         /*
-        * Check each of the stats we got back to make sure we got the inodes
-        * we asked for.
+        * Walk the xi_allocmask looking for set bits that aren't present in
+        * the fill mask.  For each such inode, fill the entries at the end of
+        * the array with stat information one at a time, synthesizing them if
+        * necessary.  At this point, (xi_allocmask & ~seen_mask) should be the
+        * corrupt inodes.
          */
-       for (i = 0, bs = bstat; i < LIBFROG_BULKSTAT_CHUNKSIZE; i++) {
+       for (i = 0; i < LIBFROG_BULKSTAT_CHUNKSIZE; i++) {
+               /*
+                * Don't single-step if inumbers said it wasn't allocated or
+                * bulkstat actually filled it.
+                */
                 if (!(inumbers->xi_allocmask & (1ULL << i)))
                         continue;
-               if (bs->bs_ino == inumbers->xi_startino + i) {
-                       bs++;
+               if (seen_mask & (1ULL << i))
                         continue;
-               }
  
-               /* Load the one inode. */
+               assert(breq->hdr.ocount < LIBFROG_BULKSTAT_CHUNKSIZE);
+
+               if (!bs)
+                       bs = &breq->bulkstat[breq->hdr.ocount];
+
+               /*
+                * Didn't get desired stat data and we've hit the end of the
+                * returned data.  We can't distinguish between the inode being
+                * freed vs. the inode being to corrupt to load, so try a
+                * bulkstat single to see if we can load the inode.
+                */
                 error = -xfrog_bulkstat_single(&ctx->mnt,
                                 inumbers->xi_startino + i, breq->hdr.flags, bs);
                 if (error || bs->bs_ino != inumbers->xi_startino + i) {
@@ -99,8 +194,15 @@ bulkstat_for_inumbers(
                         bs->bs_ino = inumbers->xi_startino + i;
                         bs->bs_blksize = ctx->mnt_sv.f_frsize;
                 }
+
+               breq->hdr.ocount++;
                 bs++;
         }
+
+       /* If we added any entries, re-sort the array. */
+       if (bs)
+               qsort(breq->bulkstat, breq->hdr.ocount,
+                               sizeof(struct xfs_bulkstat), compare_bstat);
  }
  
  /* BULKSTAT wrapper routines. */
author	Darrick J. Wong <djwong@kernel.org>
	Mon, 24 Feb 2025 18:21:43 +0000 (10:21 -0800)
committer	Darrick J. Wong <djwong@kernel.org>
	Tue, 25 Feb 2025 17:15:57 +0000 (09:15 -0800)