| Commit | Line | Data |
|---|---|---|
| 8d318d62 | 1 | // SPDX-License-Identifier: GPL-2.0-or-later |
| 372d4ba9 | 2 | /* |
| 52520522 | 3 | * Copyright (C) 2018-2024 Oracle. All Rights Reserved. |
| 8d318d62 | 4 | * Author: Darrick J. Wong <djwong@kernel.org> |
| 372d4ba9 | 5 | */ |
| a440f877 | 6 | #include "xfs.h" |
| 372d4ba9 DW | 7 | #include <stdint.h> |
| | 8 | #include <stdlib.h> |
| | 9 | #include <pthread.h> |
| | 10 | #include <sys/statvfs.h> |
| | 11 | #include "platform_defs.h" |
| 372d4ba9 | 12 | #include "xfs_arch.h" |
| 372d4ba9 | 13 | #include "handle.h" |
| 42b4c8e8 | 14 | #include "libfrog/paths.h" |
| 56598728 | 15 | #include "libfrog/workqueue.h" |
| 372d4ba9 DW | 16 | #include "xfs_scrub.h" |
| | 17 | #include "common.h" |
| | 18 | #include "inodes.h" |
| 245c72a6 | 19 | #include "descr.h" |
| fee68490 | 20 | #include "libfrog/fsgeom.h" |
| f31b5e12 | 21 | #include "libfrog/bulkstat.h" |
| 13af0394 | 22 | #include "libfrog/handle_priv.h" |
| 7ae92e1c DW | 23 | #include "bitops.h" |
| | 24 | #include "libfrog/bitmask.h" |
| 372d4ba9 DW | 25 | |
| | 26 | /* |
| | 27 | * Iterate a range of inodes. |
| | 28 | * |
| | 29 | * This is a little more involved than repeatedly asking BULKSTAT for a |
| | 30 | * buffer's worth of stat data for some number of inodes. We want to scan as |
| 2451a997 DW | 31 | * many of the inodes as the inobt thinks there are, so we use the INUMBERS |
| | 32 | * ioctl to walk all the inobt records in the filesystem and spawn a worker to |
| | 33 | * bulkstat and iterate. The worker starts with an inumbers record that can |
| | 34 | * look like this: |
| 372d4ba9 | 35 | * |
| 2451a997 DW | 36 | * {startino = S, allocmask = 0b11011} |
| | 37 | * |
| | 38 | * Given a starting inumber S and count C=64, bulkstat will return a sorted |
| | 39 | * array of stat information. The bs_ino of those array elements can look like |
| | 40 | * any of the following: |
| | 41 | * |
| | 42 | * 0. [S, S+1, S+3, S+4] |
| | 43 | * 1. [S+e, S+e+1, S+e+3, S+e+4, S+e+C+1...], where e >= 0 |
| | 44 | * 2. [S+e+n], where n >= 0 |
| | 45 | * 3. [] |
| | 46 | * 4. [], errno == EFSCORRUPTED |
| | 47 | * |
| | 48 | * We know that bulkstat scanned the entire inode range between S and bs_ino of |
| | 49 | * the last array element, even though it only fills out an array element for |
| | 50 | * allocated inodes. Therefore, we can say in cases 0-2 that S was filled, |
| | 51 | * even if there is no bstat[] record for S. In turn, we can create a bitmask |
| | 52 | * of inodes that we have seen, and set bits 0 through (bstat[-1].bs_ino - S), |
| | 53 | * being careful not to set any bits past S+C. |
| | 54 | * |
| | 55 | * In case (0) we find that the seen mask matches the inumber record |
| | 56 | * exactly, so the caller can walk the stat records and move on. In case (1) |
| | 57 | * this is also true, but we must be careful to reduce the array length to |
| | 58 | * avoid scanning inodes that are not in the inumber chunk. In case (3) we |
| | 59 | * conclude that there were no inodes left to scan and terminate. |
| | 60 | * |
| 7ae92e1c DW | 61 | * In cases (2) and (4) we don't know why bulkstat returned fewer than C |
| | 62 | * elements. We might have found the end of the filesystem, or the kernel |
| | 63 | * might have found a corrupt inode and stopped. We must investigate this by |
| | 64 | * trying to fill out the rest of the bstat array starting with the next |
| | 65 | * inumber after the last bstat array element filled, and continuing until S' |
| | 66 | * is beyond S0 + C, or the array is full. Each time we succeed in loading |
| | 67 | * new records, the kernel increases S' for us; if instead we encounter case |
| | 68 | * (4), we can increment S' ourselves. |
| | 69 | * |
| 2451a997 DW | 70 | * Inodes that are set in the allocmask but not set in the seen mask are the |
| | 71 | * corrupt inodes. For each of these cases, we try to populate the bulkstat |
| | 72 | * array one inode at a time. If the kernel returns a matching record we can |
| | 73 | * use it; if instead we receive an error, we synthesize enough of a record |
| | 74 | * to be able to run online scrub by handle. |
| 372d4ba9 DW | 75 | * |
| | 76 | * If the iteration function returns ESTALE, that means that the inode has |
| | 77 | * been deleted and possibly recreated since the BULKSTAT call. We will |
| | 78 | * refresh the stat information and try again up to 30 times before reporting |
| | 79 | * the staleness as an error. |
| | 80 | */ |
| | 81 | |
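A worked illustration of the seen-mask logic described in the comment above (an editorial sketch with made-up numbers, not part of the original source): for case (0), suppose S = 128 and bulkstat returns bs_ino = {128, 129, 131, 132}. Inode 130 has no stat record, yet bulkstat scanned it, so it still counts as seen.

```c
#include <stdint.h>

uint64_t startino = 128;	/* S from the inumbers record */
uint64_t last_ino = 132;	/* bs_ino of the last array element */
uint64_t seen_mask = 0;
unsigned int i;

/* Set bits 0 through (last_ino - S), never going past S + C (C == 64). */
for (i = 0; i <= last_ino - startino && i < 64; i++)
	seen_mask |= 1ULL << i;

/* seen_mask == 0x1f, so allocmask 0b11011 & ~seen_mask == 0: nothing corrupt. */
```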
| 2451a997 DW | 82 | /* |
| | 83 | * Return the inumber of the highest inode in the bulkstat data, assuming the |
| | 84 | * records are sorted in inumber order. |
| | 85 | */ |
| | 86 | static inline uint64_t last_bstat_ino(const struct xfs_bulkstat_req *b) |
| | 87 | { |
| | 88 | return b->hdr.ocount ? b->bulkstat[b->hdr.ocount - 1].bs_ino : 0; |
| | 89 | } |
| | 90 | |
| | 91 | /* |
| | 92 | * Deduce the bitmask of the inodes in inums that were seen by bulkstat. If |
| | 93 | * the inode is present in the bstat array this is trivially true; or if it is |
| | 94 | * not in the array but higher inumbers are present, then it was freed. |
| | 95 | */ |
| | 96 | static __u64 |
| | 97 | seen_mask_from_bulkstat( |
| | 98 | const struct xfs_inumbers *inums, |
| | 99 | __u64 breq_startino, |
| | 100 | const struct xfs_bulkstat_req *breq) |
| | 101 | { |
| | 102 | const __u64 limit_ino = |
| | 103 | inums->xi_startino + LIBFROG_BULKSTAT_CHUNKSIZE; |
| | 104 | const __u64 last = last_bstat_ino(breq); |
| | 105 | __u64 ret = 0; |
| | 106 | int i, maxi; |
| | 107 | |
| | 108 | /* Ignore the bulkstat results if they don't cover inumbers */ |
| | 109 | if (breq_startino > limit_ino || last < inums->xi_startino) |
| | 110 | return 0; |
| | 111 | |
| | 112 | maxi = min(LIBFROG_BULKSTAT_CHUNKSIZE, last - inums->xi_startino + 1); |
| | 113 | for (i = breq_startino - inums->xi_startino; i < maxi; i++) |
| | 114 | ret |= 1ULL << i; |
| | 115 | |
| | 116 | return ret; |
| | 117 | } |
| | 118 | |
| 7ae92e1c DW | 119 | /* |
| | 120 | * Try to fill the rest of orig_breq with bulkstat data by re-running bulkstat |
| | 121 | * with increasing start_ino until we either hit the end of the inumbers info |
| | 122 | * or fill up the bstat array with something. Returns a bitmask of the inodes |
| | 123 | * within inums that were filled by the bulkstat requests. |
| | 124 | */ |
| | 125 | static __u64 |
| | 126 | bulkstat_the_rest( |
| | 127 | struct scrub_ctx *ctx, |
| | 128 | const struct xfs_inumbers *inums, |
| | 129 | struct xfs_bulkstat_req *orig_breq, |
| | 130 | int orig_error) |
| | 131 | { |
| | 132 | struct xfs_bulkstat_req *new_breq; |
| | 133 | struct xfs_bulkstat *old_bstat = |
| | 134 | &orig_breq->bulkstat[orig_breq->hdr.ocount]; |
| | 135 | const __u64 limit_ino = |
| | 136 | inums->xi_startino + LIBFROG_BULKSTAT_CHUNKSIZE; |
| | 137 | __u64 start_ino = orig_breq->hdr.ino; |
| | 138 | __u64 seen_mask = 0; |
| | 139 | int error; |
| | 140 | |
| | 141 | assert(orig_breq->hdr.ocount < orig_breq->hdr.icount); |
| | 142 | |
| | 143 | /* |
| | 144 | * If the first bulkstat returned a corruption error, that means |
| | 145 | * start_ino is corrupt. Restart instead at the next inumber. |
| | 146 | */ |
| | 147 | if (orig_error == EFSCORRUPTED) |
| | 148 | start_ino++; |
| | 149 | if (start_ino >= limit_ino) |
| | 150 | return 0; |
| | 151 | |
| | 152 | error = -xfrog_bulkstat_alloc_req( |
| | 153 | orig_breq->hdr.icount - orig_breq->hdr.ocount, |
| | 154 | start_ino, &new_breq); |
| | 155 | if (error) |
| | 156 | return error; |
| | 157 | new_breq->hdr.flags = orig_breq->hdr.flags; |
| | 158 | |
| | 159 | do { |
| | 160 | /* |
| | 161 | * Fill the new bulkstat request with stat data starting at |
| | 162 | * start_ino. |
| | 163 | */ |
| | 164 | error = -xfrog_bulkstat(&ctx->mnt, new_breq); |
| | 165 | if (error == EFSCORRUPTED) { |
| | 166 | /* |
| | 167 | * start_ino is corrupt, increment and try the next |
| | 168 | * inode. |
| | 169 | */ |
| | 170 | start_ino++; |
| | 171 | new_breq->hdr.ino = start_ino; |
| | 172 | continue; |
| | 173 | } |
| | 174 | if (error) { |
| | 175 | /* |
| | 176 | * Any other error means the caller falls back to |
| | 177 | * single stepping. |
| | 178 | */ |
| | 179 | break; |
| | 180 | } |
| | 181 | if (new_breq->hdr.ocount == 0) |
| | 182 | break; |
| | 183 | |
| | 184 | /* Copy new results to the original bstat buffer */ |
| | 185 | memcpy(old_bstat, new_breq->bulkstat, |
| | 186 | new_breq->hdr.ocount * sizeof(struct xfs_bulkstat)); |
| | 187 | orig_breq->hdr.ocount += new_breq->hdr.ocount; |
| | 188 | old_bstat += new_breq->hdr.ocount; |
| | 189 | seen_mask |= seen_mask_from_bulkstat(inums, start_ino, |
| | 190 | new_breq); |
| | 191 | |
| | 192 | new_breq->hdr.icount -= new_breq->hdr.ocount; |
| | 193 | start_ino = new_breq->hdr.ino; |
| | 194 | } while (new_breq->hdr.icount > 0 && new_breq->hdr.ino < limit_ino); |
| | 195 | |
| | 196 | free(new_breq); |
| | 197 | return seen_mask; |
| | 198 | } |
| | 199 | |
| 2451a997 DW | 200 | /* Compare two bulkstat records by inumber. */ |
| | 201 | static int |
| | 202 | compare_bstat( |
| | 203 | const void *a, |
| | 204 | const void *b) |
| | 205 | { |
| | 206 | const struct xfs_bulkstat *ba = a; |
| | 207 | const struct xfs_bulkstat *bb = b; |
| | 208 | |
| | 209 | return cmp_int(ba->bs_ino, bb->bs_ino); |
| | 210 | } |
| | 211 | |
| 372d4ba9 | 212 | /* |
| b0289f63 DW | 213 | * Walk the xi_allocmask looking for set bits that aren't present in |
| | 214 | * the seen mask. For each such inode, fill the entries at the end of |
| | 215 | * the array with stat information one at a time, synthesizing them if |
| | 216 | * necessary. At this point, (xi_allocmask & ~seen_mask) should be the |
| | 217 | * corrupt inodes. |
| 372d4ba9 DW | 218 | */ |
| | 219 | static void |
| b0289f63 | 220 | bulkstat_single_step( |
| 17429887 | 221 | struct scrub_ctx *ctx, |
| 17429887 | 222 | const struct xfs_inumbers *inumbers, |
| b0289f63 | 223 | uint64_t seen_mask, |
| 17429887 | 224 | struct xfs_bulkstat_req *breq) |
| 372d4ba9 | 225 | { |
| 2451a997 | 226 | struct xfs_bulkstat *bs = NULL; |
| 17429887 DW | 227 | int i; |
| | 228 | int error; |
| 372d4ba9 | 229 | |
| 2451a997 DW | 230 | for (i = 0; i < LIBFROG_BULKSTAT_CHUNKSIZE; i++) { |
| | 231 | /* |
| | 232 | * Don't single-step if inumbers said it wasn't allocated or |
| | 233 | * bulkstat actually filled it. |
| | 234 | */ |
| b94a69ac | 235 | if (!(inumbers->xi_allocmask & (1ULL << i))) |
| 372d4ba9 | 236 | continue; |
| 2451a997 | 237 | if (seen_mask & (1ULL << i)) |
| 372d4ba9 | 238 | continue; |
| 372d4ba9 | 239 | |
| 2451a997 DW | 240 | assert(breq->hdr.ocount < LIBFROG_BULKSTAT_CHUNKSIZE); |
| | 241 | |
| | 242 | if (!bs) |
| | 243 | bs = &breq->bulkstat[breq->hdr.ocount]; |
| | 244 | |
| | 245 | /* |
| | 246 | * We didn't get the desired stat data and we've hit the end of |
| | 247 | * the returned data. We can't distinguish between the inode being |
| | 248 | * freed vs. the inode being too corrupt to load, so try a |
| | 249 | * bulkstat single to see if we can load the inode. |
| | 250 | */ |
| e6542132 | 251 | error = -xfrog_bulkstat_single(&ctx->mnt, |
| ae497842 | 252 | inumbers->xi_startino + i, breq->hdr.flags, bs); |
| 20dbdd61 DW | 253 | switch (error) { |
| | 254 | case ENOENT: |
| | 255 | /* |
| | 256 | * This inode wasn't found, and no results were |
| | 257 | * returned. We've likely hit the end of the |
| | 258 | * filesystem, but we'll move on to the next inode in |
| | 259 | * the mask for the sake of caution. |
| | 260 | */ |
| | 261 | continue; |
| | 262 | case 0: |
| | 263 | /* |
| | 264 | * If a result was returned but it wasn't the inode |
| | 265 | * we were looking for, then the missing inode was |
| | 266 | * freed. Move on to the next inode in the mask. |
| | 267 | */ |
| | 268 | if (bs->bs_ino != inumbers->xi_startino + i) |
| | 269 | continue; |
| | 270 | break; |
| | 271 | default: |
| | 272 | /* |
| | 273 | * Some error happened. Synthesize a bulkstat record |
| | 274 | * so that phase3 can try to see if there's a corrupt |
| | 275 | * inode that needs repairing. |
| | 276 | */ |
| 4cca629d | 277 | memset(bs, 0, sizeof(struct xfs_bulkstat)); |
| b94a69ac | 278 | bs->bs_ino = inumbers->xi_startino + i; |
| 372d4ba9 | 279 | bs->bs_blksize = ctx->mnt_sv.f_frsize; |
| 20dbdd61 | 280 | break; |
| 372d4ba9 | 281 | } |
| 2451a997 DW | 282 | |
| | 283 | breq->hdr.ocount++; |
| 372d4ba9 DW | 284 | bs++; |
| | 285 | } |
| 2451a997 DW | 286 | |
| | 287 | /* If we added any entries, re-sort the array. */ |
| | 288 | if (bs) |
| | 289 | qsort(breq->bulkstat, breq->hdr.ocount, |
| | 290 | sizeof(struct xfs_bulkstat), compare_bstat); |
| 372d4ba9 DW | 291 | } |
| | 292 | |
| 7ae92e1c DW | 293 | /* Return the inumber of the highest allocated inode in the inumbers data. */ |
| | 294 | static inline uint64_t last_allocmask_ino(const struct xfs_inumbers *i) |
| | 295 | { |
| | 296 | return i->xi_startino + xfrog_highbit64(i->xi_allocmask); |
| | 297 | } |
| | 298 | |
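As a quick editorial illustration (hypothetical values, not part of the source), plugging the inumbers record from the comment at the top of the file into this helper:

```c
/* Illustrative only: the record from the file header comment. */
struct xfs_inumbers ex = {
	.xi_startino	= 128,
	.xi_allocmask	= 0x1b,		/* 0b11011 */
};

/* xfrog_highbit64(0x1b) == 4, so last_allocmask_ino(&ex) == 132. */
```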
| b0289f63 DW | 299 | /* |
| | 300 | * Run bulkstat on an entire inode allocation group, then check that we got |
| | 301 | * exactly the inodes we expected. If not, load them one at a time (or fake |
| | 302 | * it) into the bulkstat data. |
| | 303 | */ |
| | 304 | static void |
| | 305 | bulkstat_for_inumbers( |
| | 306 | struct scrub_ctx *ctx, |
| | 307 | const struct xfs_inumbers *inumbers, |
| | 308 | struct xfs_bulkstat_req *breq) |
| | 309 | { |
| | 310 | const uint64_t limit_ino = |
| | 311 | inumbers->xi_startino + LIBFROG_BULKSTAT_CHUNKSIZE; |
| | 312 | uint64_t seen_mask = 0; |
| | 313 | int i; |
| | 314 | int error; |
| | 315 | |
| | 316 | assert(inumbers->xi_allocmask != 0); |
| | 317 | |
| | 318 | /* First we try regular bulkstat, for speed. */ |
| | 319 | breq->hdr.ino = inumbers->xi_startino; |
| | 320 | error = -xfrog_bulkstat(&ctx->mnt, breq); |
| | 321 | if (!error) { |
| | 322 | if (!breq->hdr.ocount) |
| | 323 | return; |
| | 324 | seen_mask |= seen_mask_from_bulkstat(inumbers, |
| | 325 | inumbers->xi_startino, breq); |
| | 326 | } |
| | 327 | |
| 7ae92e1c DW | 328 | /* |
| | 329 | * If the last allocated inode as reported by inumbers is higher than |
| | 330 | * the last inode reported by bulkstat, two things could have happened. |
| | 331 | * Either all the inodes at the high end of the cluster were freed |
| | 332 | * since the inumbers call; or bulkstat encountered a corrupt inode and |
| | 333 | * returned early. Try to bulkstat the rest of the array. |
| | 334 | */ |
| | 335 | if (last_allocmask_ino(inumbers) > last_bstat_ino(breq)) |
| | 336 | seen_mask |= bulkstat_the_rest(ctx, inumbers, breq, error); |
| | 337 | |
| b0289f63 DW | 338 | /* |
| | 339 | * Bulkstat might return inodes beyond xi_startino + CHUNKSIZE. Reduce |
| | 340 | * ocount to ignore inodes not described by the inumbers record. |
| | 341 | */ |
| | 342 | for (i = breq->hdr.ocount - 1; i >= 0; i--) { |
| | 343 | if (breq->bulkstat[i].bs_ino < limit_ino) |
| | 344 | break; |
| | 345 | breq->hdr.ocount--; |
| | 346 | } |
| | 347 | |
| | 348 | /* |
| | 349 | * Fill in any missing inodes that are mentioned in the alloc mask but |
| 7ae92e1c | 350 | * weren't previously seen by bulkstat. These are the corrupt inodes. |
| b0289f63 DW | 351 | */ |
| | 352 | bulkstat_single_step(ctx, inumbers, seen_mask, breq); |
| | 353 | } |
| | 354 | |
| 59f79e0a DW | 355 | /* BULKSTAT wrapper routines. */ |
| | 356 | struct scan_inodes { |
| 245c72a6 | 357 | struct workqueue wq_bulkstat; |
| 59f79e0a DW | 358 | scrub_inode_iter_fn fn; |
| | 359 | void *arg; |
| 245c72a6 | 360 | unsigned int nr_threads; |
| 59f79e0a DW | 361 | bool aborted; |
| | 362 | }; |
| | 363 | |
| 372d4ba9 | 364 | /* |
| 245c72a6 DW | 365 | * A single unit of inode scan work. This contains a pointer to the parent |
| | 366 | * information, followed by an INUMBERS request structure, followed by a |
| | 367 | * BULKSTAT request structure. The last two are VLAs, so we can't represent |
| | 368 | * them here. |
| | 369 | */ |
| | 370 | struct scan_ichunk { |
| | 371 | struct scan_inodes *si; |
| | 372 | }; |
| | 373 | |
| | 374 | static inline struct xfs_inumbers_req * |
| | 375 | ichunk_to_inumbers( |
| | 376 | struct scan_ichunk *ichunk) |
| | 377 | { |
| | 378 | char *p = (char *)ichunk; |
| | 379 | |
| | 380 | return (struct xfs_inumbers_req *)(p + sizeof(struct scan_ichunk)); |
| | 381 | } |
| | 382 | |
| | 383 | static inline struct xfs_bulkstat_req * |
| | 384 | ichunk_to_bulkstat( |
| | 385 | struct scan_ichunk *ichunk) |
| | 386 | { |
| | 387 | char *p = (char *)ichunk_to_inumbers(ichunk); |
| | 388 | |
| | 389 | return (struct xfs_bulkstat_req *)(p + XFS_INUMBERS_REQ_SIZE(1)); |
| | 390 | } |
| | 391 | |
| | 392 | static inline int |
| | 393 | alloc_ichunk( |
| ae497842 | 394 | struct scrub_ctx *ctx, |
| 245c72a6 DW | 395 | struct scan_inodes *si, |
| | 396 | uint32_t agno, |
| | 397 | uint64_t startino, |
| | 398 | struct scan_ichunk **ichunkp) |
| | 399 | { |
| | 400 | struct scan_ichunk *ichunk; |
| | 401 | struct xfs_inumbers_req *ireq; |
| | 402 | struct xfs_bulkstat_req *breq; |
| | 403 | |
| | 404 | ichunk = calloc(1, sizeof(struct scan_ichunk) + |
| | 405 | XFS_INUMBERS_REQ_SIZE(1) + |
| | 406 | XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE)); |
| | 407 | if (!ichunk) |
| | 408 | return -errno; |
| | 409 | |
| | 410 | ichunk->si = si; |
| | 411 | |
| | 412 | ireq = ichunk_to_inumbers(ichunk); |
| | 413 | ireq->hdr.icount = 1; |
| | 414 | ireq->hdr.ino = startino; |
| | 415 | ireq->hdr.agno = agno; |
| | 416 | ireq->hdr.flags |= XFS_BULK_IREQ_AGNO; |
| | 417 | |
| | 418 | breq = ichunk_to_bulkstat(ichunk); |
| | 419 | breq->hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE; |
| ae497842 DW | 420 | |
| | 421 | /* Scan the metadata directory tree too. */ |
| | 422 | if (ctx->mnt.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_METADIR) |
| cd9d49b3 | 423 | breq->hdr.flags |= XFS_BULK_IREQ_METADIR; |
| 245c72a6 DW | 424 | |
| | 425 | *ichunkp = ichunk; |
| | 426 | return 0; |
| | 427 | } |
| | 428 | |
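For orientation, an editorial sketch of the single allocation that alloc_ichunk() above builds and that ichunk_to_inumbers()/ichunk_to_bulkstat() walk; the sizes are the ones passed to calloc(), and the offsets follow directly from the pointer arithmetic in those helpers:

```c
/*
 * ichunk ---> +----------------------------------------------------+
 *             | struct scan_ichunk (the si back-pointer)           |
 *             +----------------------------------------------------+  <- ichunk_to_inumbers()
 *             | XFS_INUMBERS_REQ_SIZE(1) bytes:                    |
 *             | xfs_inumbers_req header + 1 xfs_inumbers record    |
 *             +----------------------------------------------------+  <- ichunk_to_bulkstat()
 *             | XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE)  |
 *             | bytes: xfs_bulkstat_req header + 64 bulkstat slots |
 *             +----------------------------------------------------+
 */
```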
| b6fef47a | 429 | static int |
| 245c72a6 DW | 430 | render_ino_from_bulkstat( |
| | 431 | struct scrub_ctx *ctx, |
| | 432 | char *buf, |
| | 433 | size_t buflen, |
| | 434 | void *data) |
| | 435 | { |
| | 436 | struct xfs_bulkstat *bstat = data; |
| | 437 | |
| | 438 | return scrub_render_ino_descr(ctx, buf, buflen, bstat->bs_ino, |
| | 439 | bstat->bs_gen, NULL); |
| | 440 | } |
| | 441 | |
| | 442 | static int |
| | 443 | render_inumbers_from_agno( |
| | 444 | struct scrub_ctx *ctx, |
| | 445 | char *buf, |
| | 446 | size_t buflen, |
| | 447 | void *data) |
| | 448 | { |
| | 449 | xfs_agnumber_t *agno = data; |
| | 450 | |
| | 451 | return snprintf(buf, buflen, _("dev %d:%d AG %u inodes"), |
| | 452 | major(ctx->fsinfo.fs_datadev), |
| | 453 | minor(ctx->fsinfo.fs_datadev), |
| | 454 | *agno); |
| | 455 | } |
| | 456 | |
| | 457 | /* |
| | 458 | * Call BULKSTAT for information on a single chunk's worth of inodes and call |
| | 459 | * our iterator function. We'll try to fill the bulkstat information in |
| | 460 | * batches, but we can also detect iget failures. |
| 372d4ba9 | 461 | */ |
| 59f79e0a | 462 | static void |
| 245c72a6 | 463 | scan_ag_bulkstat( |
| 59f79e0a DW | 464 | struct workqueue *wq, |
| | 465 | xfs_agnumber_t agno, |
| 372d4ba9 DW | 466 | void *arg) |
| | 467 | { |
| 13af0394 | 468 | struct xfs_handle handle; |
| 59f79e0a | 469 | struct scrub_ctx *ctx = (struct scrub_ctx *)wq->wq_ctx; |
| 245c72a6 DW | 470 | struct scan_ichunk *ichunk = arg; |
| | 471 | struct xfs_inumbers_req *ireq = ichunk_to_inumbers(ichunk); |
| | 472 | struct xfs_bulkstat_req *breq = ichunk_to_bulkstat(ichunk); |
| | 473 | struct scan_inodes *si = ichunk->si; |
| c053cf87 | 474 | struct xfs_bulkstat *bs = &breq->bulkstat[0]; |
| 245c72a6 | 475 | struct xfs_inumbers *inumbers = &ireq->inumbers[0]; |
| 9f4d6358 | 476 | uint64_t last_ino = 0; |
| 372d4ba9 DW | 477 | int i; |
| | 478 | int error; |
| | 479 | int stale_count = 0; |
| 245c72a6 DW | 480 | DEFINE_DESCR(dsc_bulkstat, ctx, render_ino_from_bulkstat); |
| | 481 | DEFINE_DESCR(dsc_inumbers, ctx, render_inumbers_from_agno); |
| 372d4ba9 | 482 | |
| 245c72a6 | 483 | descr_set(&dsc_inumbers, &agno); |
| 13af0394 | 484 | handle_from_fshandle(&handle, ctx->fshandle, ctx->fshandle_len); |
| 245c72a6 | 485 | retry: |
| 3653b83e | 486 | bulkstat_for_inumbers(ctx, inumbers, breq); |
| 245c72a6 DW | 487 | |
| | 488 | /* Iterate all the inodes. */ |
| c053cf87 | 489 | for (i = 0; !si->aborted && i < breq->hdr.ocount; i++, bs++) { |
| 9f4d6358 DW | 490 | uint64_t scan_ino = bs->bs_ino; |
| | 491 | |
| | 492 | /* ensure forward progress if we retried */ |
| | 493 | if (scan_ino < last_ino) |
| | 494 | continue; |
| | 495 | |
| 245c72a6 | 496 | descr_set(&dsc_bulkstat, bs); |
| 13af0394 | 497 | handle_from_bulkstat(&handle, bs); |
| 245c72a6 DW | 498 | error = si->fn(ctx, &handle, bs, si->arg); |
| | 499 | switch (error) { |
| | 500 | case 0: |
| | 501 | break; |
| | 502 | case ESTALE: { |
| | 503 | stale_count++; |
| | 504 | if (stale_count < 30) { |
| b95546f1 DW | 505 | uint64_t old_startino; |
| | 506 | |
| | 507 | ireq->hdr.ino = old_startino = |
| | 508 | inumbers->xi_startino; |
| 245c72a6 DW | 509 | error = -xfrog_inumbers(&ctx->mnt, ireq); |
| | 510 | if (error) |
| | 511 | goto err; |
| b95546f1 DW | 512 | /* |
| | 513 | * Retry only if inumbers returns the same |
| | 514 | * inobt record as the previous record and |
| | 515 | * there are allocated inodes in it. |
| | 516 | */ |
| | 517 | if (!si->aborted && |
| | 518 | ireq->hdr.ocount > 0 && |
| | 519 | inumbers->xi_alloccount > 0 && |
| | 520 | inumbers->xi_startino == old_startino) |
| | 521 | goto retry; |
| | 522 | goto out; |
| 245c72a6 DW | 523 | } |
| | 524 | str_info(ctx, descr_render(&dsc_bulkstat), |
| | 525 | _("Changed too many times during scan; giving up.")); |
| | 526 | si->aborted = true; |
| | 527 | goto out; |
| | 528 | } |
| | 529 | case ECANCELED: |
| | 530 | error = 0; |
| | 531 | fallthrough; |
| | 532 | default: |
| | 533 | goto err; |
| | 534 | } |
| | 535 | if (scrub_excessive_errors(ctx)) { |
| | 536 | si->aborted = true; |
| | 537 | goto out; |
| | 538 | } |
| 9f4d6358 | 539 | last_ino = scan_ino; |
| 4cca629d DW | 540 | } |
| | 541 | |
| 245c72a6 | 542 | err: |
| e6542132 | 543 | if (error) { |
| 245c72a6 | 544 | str_liberror(ctx, error, descr_render(&dsc_bulkstat)); |
| 59f79e0a | 545 | si->aborted = true; |
| b94a69ac | 546 | } |
| 245c72a6 DW | 547 | out: |
| | 548 | free(ichunk); |
| | 549 | } |
| | 550 | |
| | 551 | /* |
| | 552 | * Call INUMBERS for information about inode chunks, then queue the inumbers |
| | 553 | * responses in the bulkstat workqueue. This helps us maximize CPU parallelism |
| | 554 | * if the filesystem AGs are not evenly loaded. |
| | 555 | */ |
| | 556 | static void |
| | 557 | scan_ag_inumbers( |
| | 558 | struct workqueue *wq, |
| | 559 | xfs_agnumber_t agno, |
| | 560 | void *arg) |
| | 561 | { |
| | 562 | struct scan_ichunk *ichunk = NULL; |
| | 563 | struct scan_inodes *si = arg; |
| | 564 | struct scrub_ctx *ctx = (struct scrub_ctx *)wq->wq_ctx; |
| | 565 | struct xfs_inumbers_req *ireq; |
| | 566 | uint64_t nextino = cvt_agino_to_ino(&ctx->mnt, agno, 0); |
| | 567 | int error; |
| | 568 | DEFINE_DESCR(dsc, ctx, render_inumbers_from_agno); |
| | 569 | |
| | 570 | descr_set(&dsc, &agno); |
| | 571 | |
| ae497842 | 572 | error = alloc_ichunk(ctx, si, agno, 0, &ichunk); |
| 245c72a6 DW | 573 | if (error) |
| | 574 | goto err; |
| | 575 | ireq = ichunk_to_inumbers(ichunk); |
| b94a69ac | 576 | |
| 372d4ba9 | 577 | /* Find the inode chunk & alloc mask */ |
| e6542132 | 578 | error = -xfrog_inumbers(&ctx->mnt, ireq); |
| 59f79e0a | 579 | while (!error && !si->aborted && ireq->hdr.ocount > 0) { |
| 4f546267 DW | 580 | /* |
| | 581 | * Make sure that we always make forward progress while we |
| | 582 | * scan the inode btree. |
| | 583 | */ |
| 245c72a6 DW | 584 | if (nextino > ireq->inumbers[0].xi_startino) { |
| | 585 | str_corrupt(ctx, descr_render(&dsc), |
| 4f546267 DW | 586 | _("AG %u inode btree is corrupt near agino %lu, got %lu"), agno, |
| | 587 | cvt_ino_to_agino(&ctx->mnt, nextino), |
| | 588 | cvt_ino_to_agino(&ctx->mnt, |
| | 589 | ireq->inumbers[0].xi_startino)); |
| | 590 | si->aborted = true; |
| | 591 | break; |
| | 592 | } |
| | 593 | nextino = ireq->hdr.ino; |
| | 594 | |
| 245c72a6 DW | 595 | if (ireq->inumbers[0].xi_alloccount == 0) { |
| | 596 | /* |
| | 597 | * We can have totally empty inode chunks on |
| | 598 | * filesystems where there are more than 64 inodes per |
| | 599 | * block. Skip these. |
| | 600 | */ |
| | 601 | ; |
| | 602 | } else if (si->nr_threads > 0) { |
| | 603 | /* Queue this inode chunk on the bulkstat workqueue. */ |
| | 604 | error = -workqueue_add(&si->wq_bulkstat, |
| | 605 | scan_ag_bulkstat, agno, ichunk); |
| | 606 | if (error) { |
| 59f79e0a | 607 | si->aborted = true; |
| 245c72a6 DW | 608 | str_liberror(ctx, error, |
| | 609 | _("queueing bulkstat work")); |
| 372d4ba9 DW | 610 | goto out; |
| | 611 | } |
| 245c72a6 DW | 612 | ichunk = NULL; |
| | 613 | } else { |
| | 614 | /* |
| | 615 | * Only one thread, call bulkstat directly. Remember, |
| | 616 | * ichunk is freed by the worker before returning. |
| | 617 | */ |
| | 618 | scan_ag_bulkstat(wq, agno, ichunk); |
| | 619 | ichunk = NULL; |
| | 620 | if (si->aborted) |
| | 621 | break; |
| | 622 | } |
| | 623 | |
| | 624 | if (!ichunk) { |
| ae497842 | 625 | error = alloc_ichunk(ctx, si, agno, nextino, &ichunk); |
| 245c72a6 DW | 626 | if (error) |
| | 627 | goto err; |
| 372d4ba9 | 628 | } |
| 245c72a6 | 629 | ireq = ichunk_to_inumbers(ichunk); |
| 372d4ba9 | 630 | |
| e6542132 | 631 | error = -xfrog_inumbers(&ctx->mnt, ireq); |
| 372d4ba9 DW | 632 | } |
| | 633 | |
| | 634 | err: |
| | 635 | if (error) { |
| 245c72a6 | 636 | str_liberror(ctx, error, descr_render(&dsc)); |
| 59f79e0a | 637 | si->aborted = true; |
| 372d4ba9 DW | 638 | } |
| | 639 | out: |
| 245c72a6 DW | 640 | if (ichunk) |
| | 641 | free(ichunk); |
| 372d4ba9 DW | 642 | } |
| | 643 | |
| 59f79e0a | 644 | /* |
| ae497842 DW | 645 | * Scan all the inodes in a filesystem, including metadata directory files and |
| | 646 | * broken files. On error, this function will log an error message and return |
| | 647 | * -1. |
| 59f79e0a DW | 648 | */ |
| | 649 | int |
| | 650 | scrub_scan_all_inodes( |
| 372d4ba9 | 651 | struct scrub_ctx *ctx, |
| 59f79e0a | 652 | scrub_inode_iter_fn fn, |
| 372d4ba9 DW | 653 | void *arg) |
| | 654 | { |
| 59f79e0a DW | 655 | struct scan_inodes si = { |
| | 656 | .fn = fn, |
| | 657 | .arg = arg, |
| 245c72a6 | 658 | .nr_threads = scrub_nproc_workqueue(ctx), |
| 59f79e0a | 659 | }; |
| 372d4ba9 | 660 | xfs_agnumber_t agno; |
| 245c72a6 DW | 661 | struct workqueue wq_inumbers; |
| | 662 | unsigned int max_bulkstat; |
| 372d4ba9 DW | 663 | int ret; |
| | 664 | |
| 245c72a6 DW | 665 | /* |
| | 666 | * The bulkstat workqueue should queue at most one inobt block's worth |
| | 667 | * of inode chunk records per worker thread. If we're running in |
| | 668 | * single thread mode (nr_threads==0) then we skip the workqueues. |
| | 669 | */ |
| | 670 | max_bulkstat = si.nr_threads * (ctx->mnt.fsgeom.blocksize / 16); |
| | 671 | |
| | 672 | ret = -workqueue_create_bound(&si.wq_bulkstat, (struct xfs_mount *)ctx, |
| | 673 | si.nr_threads, max_bulkstat); |
| 372d4ba9 | 674 | if (ret) { |
| 9d57cbfc | 675 | str_liberror(ctx, ret, _("creating bulkstat workqueue")); |
| 59f79e0a | 676 | return -1; |
| 372d4ba9 DW | 677 | } |
| | 678 | |
| 245c72a6 DW | 679 | ret = -workqueue_create(&wq_inumbers, (struct xfs_mount *)ctx, |
| | 680 | si.nr_threads); |
| | 681 | if (ret) { |
| | 682 | str_liberror(ctx, ret, _("creating inumbers workqueue")); |
| | 683 | si.aborted = true; |
| | 684 | goto kill_bulkstat; |
| | 685 | } |
| | 686 | |
| 3f9efb2e | 687 | for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) { |
| 245c72a6 | 688 | ret = -workqueue_add(&wq_inumbers, scan_ag_inumbers, agno, &si); |
| 372d4ba9 | 689 | if (ret) { |
| 59f79e0a | 690 | si.aborted = true; |
| 245c72a6 | 691 | str_liberror(ctx, ret, _("queueing inumbers work")); |
| 372d4ba9 DW | 692 | break; |
| | 693 | } |
| | 694 | } |
| | 695 | |
| 245c72a6 DW | 696 | ret = -workqueue_terminate(&wq_inumbers); |
| | 697 | if (ret) { |
| | 698 | si.aborted = true; |
| | 699 | str_liberror(ctx, ret, _("finishing inumbers work")); |
| | 700 | } |
| | 701 | workqueue_destroy(&wq_inumbers); |
| | 702 | |
| | 703 | kill_bulkstat: |
| | 704 | ret = -workqueue_terminate(&si.wq_bulkstat); |
| 71296cf8 | 705 | if (ret) { |
| 59f79e0a | 706 | si.aborted = true; |
| 71296cf8 DW | 707 | str_liberror(ctx, ret, _("finishing bulkstat work")); |
| | 708 | } |
| 245c72a6 | 709 | workqueue_destroy(&si.wq_bulkstat); |
| 372d4ba9 | 710 | |
| 59f79e0a | 711 | return si.aborted ? -1 : 0; |
| 372d4ba9 DW | 712 | } |
| | 713 | |
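For context, here is a minimal sketch of how a caller might drive this iterator. The callback signature is inferred from the si->fn() calls above; the function and variable names are hypothetical, and real callers live in the xfs_scrub phase code rather than here.

```c
/* Hypothetical callback: count every inode the scan visits. */
static int
count_visited_inodes(
	struct scrub_ctx	*ctx,
	struct xfs_handle	*handle,
	struct xfs_bulkstat	*bstat,
	void			*arg)
{
	uint64_t		*counter = arg;

	(*counter)++;
	return 0;	/* ECANCELED would stop the scan without an error */
}

/* ...somewhere in a scrub phase... */
uint64_t	visited = 0;

if (scrub_scan_all_inodes(ctx, count_visited_inodes, &visited) < 0)
	return -1;
```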
| 279b0d0e DW | 714 | struct user_bulkstat { |
| | 715 | struct scan_inodes *si; |
| | 716 | |
| | 717 | /* vla, must be last */ |
| | 718 | struct xfs_bulkstat_req breq; |
| | 719 | }; |
| | 720 | |
| | 721 | /* Iterate all the user files returned by a bulkstat. */ |
| | 722 | static void |
| | 723 | scan_user_files( |
| | 724 | struct workqueue *wq, |
| | 725 | xfs_agnumber_t agno, |
| | 726 | void *arg) |
| | 727 | { |
| | 728 | struct xfs_handle handle; |
| | 729 | struct scrub_ctx *ctx = (struct scrub_ctx *)wq->wq_ctx; |
| | 730 | struct user_bulkstat *ureq = arg; |
| | 731 | struct xfs_bulkstat *bs = &ureq->breq.bulkstat[0]; |
| | 732 | struct scan_inodes *si = ureq->si; |
| | 733 | int i; |
| | 734 | int error = 0; |
| | 735 | DEFINE_DESCR(dsc_bulkstat, ctx, render_ino_from_bulkstat); |
| | 736 | |
| | 737 | handle_from_fshandle(&handle, ctx->fshandle, ctx->fshandle_len); |
| | 738 | |
| | 739 | for (i = 0; !si->aborted && i < ureq->breq.hdr.ocount; i++, bs++) { |
| | 740 | descr_set(&dsc_bulkstat, bs); |
| | 741 | handle_from_bulkstat(&handle, bs); |
| | 742 | error = si->fn(ctx, &handle, bs, si->arg); |
| | 743 | switch (error) { |
| | 744 | case 0: |
| | 745 | break; |
| | 746 | case ESTALE: |
| | 747 | case ECANCELED: |
| | 748 | error = 0; |
| | 749 | fallthrough; |
| | 750 | default: |
| | 751 | goto err; |
| | 752 | } |
| | 753 | if (scrub_excessive_errors(ctx)) { |
| | 754 | si->aborted = true; |
| | 755 | goto out; |
| | 756 | } |
| | 757 | } |
| | 758 | |
| | 759 | err: |
| | 760 | if (error) { |
| | 761 | str_liberror(ctx, error, descr_render(&dsc_bulkstat)); |
| | 762 | si->aborted = true; |
| | 763 | } |
| | 764 | out: |
| | 765 | free(ureq); |
| | 766 | } |
| | 767 | |
| | 768 | /* |
| | 769 | * Run one step of the user files bulkstat scan and schedule background |
| | 770 | * processing of the stat data returned. Returns 1 to keep going, or 0 to |
| | 771 | * stop. |
| | 772 | */ |
| | 773 | static int |
| | 774 | scan_user_bulkstat( |
| | 775 | struct scrub_ctx *ctx, |
| | 776 | struct scan_inodes *si, |
| | 777 | uint64_t *cursor) |
| | 778 | { |
| | 779 | struct user_bulkstat *ureq; |
| | 780 | const char *what = NULL; |
| | 781 | int ret; |
| | 782 | |
| | 783 | ureq = calloc(1, sizeof(struct user_bulkstat) + |
| | 784 | XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE)); |
| | 785 | if (!ureq) { |
| | 786 | ret = ENOMEM; |
| | 787 | what = _("creating bulkstat work item"); |
| | 788 | goto err; |
| | 789 | } |
| | 790 | ureq->si = si; |
| | 791 | ureq->breq.hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE; |
| | 792 | ureq->breq.hdr.ino = *cursor; |
| | 793 | |
| | 794 | ret = -xfrog_bulkstat(&ctx->mnt, &ureq->breq); |
| | 795 | if (ret) { |
| | 796 | what = _("user files bulkstat"); |
| | 797 | goto err_ureq; |
| | 798 | } |
| | 799 | if (ureq->breq.hdr.ocount == 0) { |
| | 800 | *cursor = NULLFSINO; |
| | 801 | free(ureq); |
| | 802 | return 0; |
| | 803 | } |
| | 804 | |
| | 805 | *cursor = ureq->breq.hdr.ino; |
| | 806 | |
| | 807 | /* scan_user_files frees ureq; do not access it */ |
| | 808 | ret = -workqueue_add(&si->wq_bulkstat, scan_user_files, 0, ureq); |
| | 809 | if (ret) { |
| | 810 | what = _("queueing bulkstat work"); |
| | 811 | goto err_ureq; |
| | 812 | } |
| | 813 | ureq = NULL; |
| | 814 | |
| | 815 | return 1; |
| | 816 | |
| | 817 | err_ureq: |
| | 818 | free(ureq); |
| | 819 | err: |
| | 820 | si->aborted = true; |
| | 821 | str_liberror(ctx, ret, what); |
| | 822 | return 0; |
| | 823 | } |
| | 824 | |
| | 825 | /* |
| | 826 | * Scan all the user files in a filesystem in inumber order. On error, this |
| | 827 | * function will log an error message and return -1. |
| | 828 | */ |
| | 829 | int |
| | 830 | scrub_scan_user_files( |
| | 831 | struct scrub_ctx *ctx, |
| | 832 | scrub_inode_iter_fn fn, |
| | 833 | void *arg) |
| | 834 | { |
| | 835 | struct scan_inodes si = { |
| | 836 | .fn = fn, |
| | 837 | .arg = arg, |
| | 838 | .nr_threads = scrub_nproc_workqueue(ctx), |
| | 839 | }; |
| | 840 | uint64_t ino = 0; |
| | 841 | int ret; |
| | 842 | |
| | 843 | /* Queue up to four bulkstat result sets per thread. */ |
| | 844 | ret = -workqueue_create_bound(&si.wq_bulkstat, (struct xfs_mount *)ctx, |
| | 845 | si.nr_threads, si.nr_threads * 4); |
| | 846 | if (ret) { |
| | 847 | str_liberror(ctx, ret, _("creating bulkstat workqueue")); |
| | 848 | return -1; |
| | 849 | } |
| | 850 | |
| | 851 | while ((ret = scan_user_bulkstat(ctx, &si, &ino)) == 1) { |
| | 852 | /* empty */ |
| | 853 | } |
| | 854 | |
| | 855 | ret = -workqueue_terminate(&si.wq_bulkstat); |
| | 856 | if (ret) { |
| | 857 | si.aborted = true; |
| | 858 | str_liberror(ctx, ret, _("finishing bulkstat work")); |
| | 859 | } |
| | 860 | workqueue_destroy(&si.wq_bulkstat); |
| | 861 | |
| | 862 | return si.aborted ? -1 : 0; |
| | 863 | } |
| | 864 | |
| 59f79e0a | 865 | /* Open a file by handle, returning either the fd or -1 on error. */ |
| 372d4ba9 | 866 | int |
| 59f79e0a | 867 | scrub_open_handle( |
| 372d4ba9 DW | 868 | struct xfs_handle *handle) |
| | 869 | { |
| | 870 | return open_by_fshandle(handle, sizeof(*handle), |
| | 871 | O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NOCTTY); |
| | 872 | } |