// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2024 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include <stdint.h>
#include <stdlib.h>
#include <pthread.h>
#include <sys/statvfs.h>
#include "platform_defs.h"
#include "xfs_arch.h"
#include "handle.h"
#include "libfrog/paths.h"
#include "libfrog/workqueue.h"
#include "xfs_scrub.h"
#include "common.h"
#include "inodes.h"
#include "descr.h"
#include "libfrog/fsgeom.h"
#include "libfrog/bulkstat.h"
#include "libfrog/handle_priv.h"
#include "bitops.h"
#include "libfrog/bitmask.h"

/*
 * Iterate a range of inodes.
 *
 * This is a little more involved than repeatedly asking BULKSTAT for a
 * buffer's worth of stat data for some number of inodes.  We want to scan as
 * many of the inodes as the inobt thinks there are, so we use the INUMBERS
 * ioctl to walk all the inobt records in the filesystem and spawn a worker to
 * bulkstat and iterate.  The worker starts with an inumbers record that can
 * look like this:
 *
 *	{startino = S, allocmask = 0b11011}
 *
 * Given a starting inumber S and count C=64, bulkstat will return a sorted
 * array of stat information.  The bs_ino of those array elements can look
 * like any of the following:
 *
 * 0. [S, S+1, S+3, S+4]
 * 1. [S+e, S+e+1, S+e+3, S+e+4, S+e+C+1...], where e >= 0
 * 2. [S+e+n], where n >= 0
 * 3. []
 * 4. [], errno == EFSCORRUPTED
 *
 * We know that bulkstat scanned the entire inode range between S and the
 * bs_ino of the last array element, even though it only fills out an array
 * element for allocated inodes.  Therefore, we can say in cases 0-2 that S
 * was filled, even if there is no bstat[] record for S.  In turn, we can
 * create a bitmask of inodes that we have seen, and set bits 0 through
 * (bstat[-1].bs_ino - S), being careful not to set any bits past S+C.
 *
 * In case (0) we find that the seen mask matches the inumber record exactly,
 * so the caller can walk the stat records and move on.  In case (1) this is
 * also true, but we must be careful to reduce the array length to avoid
 * scanning inodes that are not in the inumber chunk.  In case (3) we conclude
 * that there were no inodes left to scan and terminate.
 *
 * In cases (2) and (4) we don't know why bulkstat returned fewer than C
 * elements.  We might have found the end of the filesystem, or the kernel
 * might have found a corrupt inode and stopped.  This we must investigate by
 * trying to fill out the rest of the bstat array starting with the next
 * inumber after the last bstat array element filled, and continuing until S'
 * is beyond S0 + C, or the array is full.  Each time we succeed in loading
 * new records, the kernel increases S' for us; if instead we encounter case
 * (4), we can increment S' ourselves.
 *
 * Inodes that are set in the allocmask but not set in the seen mask are the
 * corrupt inodes.  For each of these cases, we try to populate the bulkstat
 * array one inode at a time.  If the kernel returns a matching record we can
 * use it; if instead we receive an error, we synthesize enough of a record
 * to be able to run online scrub by handle.
 *
 * If the iteration function returns ESTALE, that means that the inode has
 * been deleted and possibly recreated since the BULKSTAT call.  We will
 * refresh the stat information and try again up to 30 times before reporting
 * the staleness as an error.
 */

/*
 * Return the inumber of the highest inode in the bulkstat data, assuming the
 * records are sorted in inumber order.
 */
static inline uint64_t last_bstat_ino(const struct xfs_bulkstat_req *b)
{
	return b->hdr.ocount ? b->bulkstat[b->hdr.ocount - 1].bs_ino : 0;
}

/*
 * Deduce the bitmask of the inodes in inums that were seen by bulkstat.  If
 * the inode is present in the bstat array this is trivially true; or if it is
 * not in the array but higher inumbers are present, then it was freed.
 */
static __u64
seen_mask_from_bulkstat(
	const struct xfs_inumbers	*inums,
	__u64				breq_startino,
	const struct xfs_bulkstat_req	*breq)
{
	const __u64	limit_ino =
		inums->xi_startino + LIBFROG_BULKSTAT_CHUNKSIZE;
	const __u64	last = last_bstat_ino(breq);
	__u64		ret = 0;
	int		i, maxi;

	/* Ignore the bulkstat results if they don't cover inumbers */
	if (breq_startino > limit_ino || last < inums->xi_startino)
		return 0;

	maxi = min(LIBFROG_BULKSTAT_CHUNKSIZE, last - inums->xi_startino + 1);
	for (i = breq_startino - inums->xi_startino; i < maxi; i++)
		ret |= 1ULL << i;

	return ret;
}

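/*
 * Worked example (illustrative only): suppose the inumbers record has
 * xi_startino = S = 128 and the first bulkstat call returns bs_ino values
 * {128, 129, 131, 132}.  The last element is 132, so the helper above sets
 * bits 0 through (132 - 128), i.e. 0b11111.  Inode 130 counts as "seen" even
 * though it has no bstat record, because bulkstat scanned it and found it
 * freed.  If xi_allocmask also had bit 5 set (inode 133 allocated) but
 * bulkstat stopped early, bit 5 stays clear in the seen mask, and
 * bulkstat_single_step() below will try to load or synthesize that record.
 */
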
/*
 * Try to fill the rest of orig_breq with bulkstat data by re-running bulkstat
 * with increasing start_ino until we either hit the end of the inumbers info
 * or fill up the bstat array with something.  Returns a bitmask of the inodes
 * within inums that were filled by the bulkstat requests.
 */
static __u64
bulkstat_the_rest(
	struct scrub_ctx		*ctx,
	const struct xfs_inumbers	*inums,
	struct xfs_bulkstat_req		*orig_breq,
	int				orig_error)
{
	struct xfs_bulkstat_req		*new_breq;
	struct xfs_bulkstat		*old_bstat =
		&orig_breq->bulkstat[orig_breq->hdr.ocount];
	const __u64			limit_ino =
		inums->xi_startino + LIBFROG_BULKSTAT_CHUNKSIZE;
	__u64				start_ino = orig_breq->hdr.ino;
	__u64				seen_mask = 0;
	int				error;

	assert(orig_breq->hdr.ocount < orig_breq->hdr.icount);

	/*
	 * If the first bulkstat returned a corruption error, that means
	 * start_ino is corrupt.  Restart instead at the next inumber.
	 */
	if (orig_error == EFSCORRUPTED)
		start_ino++;
	if (start_ino >= limit_ino)
		return 0;

	error = -xfrog_bulkstat_alloc_req(
			orig_breq->hdr.icount - orig_breq->hdr.ocount,
			start_ino, &new_breq);
	if (error)
		return error;
	new_breq->hdr.flags = orig_breq->hdr.flags;

	do {
		/*
		 * Fill the new bulkstat request with stat data starting at
		 * start_ino.
		 */
		error = -xfrog_bulkstat(&ctx->mnt, new_breq);
		if (error == EFSCORRUPTED) {
			/*
			 * start_ino is corrupt, increment and try the next
			 * inode.
			 */
			start_ino++;
			new_breq->hdr.ino = start_ino;
			continue;
		}
		if (error) {
			/*
			 * Any other error means the caller falls back to
			 * single stepping.
			 */
			break;
		}
		if (new_breq->hdr.ocount == 0)
			break;

		/* Copy new results to the original bstat buffer */
		memcpy(old_bstat, new_breq->bulkstat,
				new_breq->hdr.ocount * sizeof(struct xfs_bulkstat));
		orig_breq->hdr.ocount += new_breq->hdr.ocount;
		old_bstat += new_breq->hdr.ocount;
		seen_mask |= seen_mask_from_bulkstat(inums, start_ino,
				new_breq);

		new_breq->hdr.icount -= new_breq->hdr.ocount;
		start_ino = new_breq->hdr.ino;
	} while (new_breq->hdr.icount > 0 && new_breq->hdr.ino < limit_ino);

	free(new_breq);
	return seen_mask;
}

#define cmp_int(l, r)		((l > r) - (l < r))

/* Compare two bulkstat records by inumber. */
static int
compare_bstat(
	const void		*a,
	const void		*b)
{
	const struct xfs_bulkstat	*ba = a;
	const struct xfs_bulkstat	*bb = b;

	return cmp_int(ba->bs_ino, bb->bs_ino);
}

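/*
 * Note on the comparator above: returning (ba->bs_ino - bb->bs_ino) directly
 * would truncate a 64-bit difference to qsort's int return value and could
 * misorder widely separated inumbers; cmp_int() always yields -1, 0, or 1
 * regardless of the operands' magnitude.
 */
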
/*
 * Walk the xi_allocmask looking for set bits that aren't present in
 * the seen mask.  For each such inode, fill the entries at the end of
 * the array with stat information one at a time, synthesizing them if
 * necessary.  At this point, (xi_allocmask & ~seen_mask) should be the
 * corrupt inodes.
 */
static void
bulkstat_single_step(
	struct scrub_ctx		*ctx,
	const struct xfs_inumbers	*inumbers,
	uint64_t			seen_mask,
	struct xfs_bulkstat_req		*breq)
{
	struct xfs_bulkstat	*bs = NULL;
	int			i;
	int			error;

	for (i = 0; i < LIBFROG_BULKSTAT_CHUNKSIZE; i++) {
		/*
		 * Don't single-step if inumbers said it wasn't allocated or
		 * bulkstat actually filled it.
		 */
		if (!(inumbers->xi_allocmask & (1ULL << i)))
			continue;
		if (seen_mask & (1ULL << i))
			continue;

		assert(breq->hdr.ocount < LIBFROG_BULKSTAT_CHUNKSIZE);

		if (!bs)
			bs = &breq->bulkstat[breq->hdr.ocount];

		/*
		 * We didn't get the desired stat data and we've hit the end
		 * of the returned data.  We can't distinguish between the
		 * inode being freed and the inode being too corrupt to load,
		 * so try a bulkstat single to see if we can load the inode.
		 */
		error = -xfrog_bulkstat_single(&ctx->mnt,
				inumbers->xi_startino + i, breq->hdr.flags, bs);
		switch (error) {
		case ENOENT:
			/*
			 * This inode wasn't found, and no results were
			 * returned.  We've likely hit the end of the
			 * filesystem, but we'll move on to the next inode in
			 * the mask for the sake of caution.
			 */
			continue;
		case 0:
			/*
			 * If a result was returned but it wasn't the inode
			 * we were looking for, then the missing inode was
			 * freed.  Move on to the next inode in the mask.
			 */
			if (bs->bs_ino != inumbers->xi_startino + i)
				continue;
			break;
		default:
			/*
			 * Some error happened.  Synthesize a bulkstat record
			 * so that phase3 can try to see if there's a corrupt
			 * inode that needs repairing.
			 */
			memset(bs, 0, sizeof(struct xfs_bulkstat));
			bs->bs_ino = inumbers->xi_startino + i;
			bs->bs_blksize = ctx->mnt_sv.f_frsize;
			break;
		}

		breq->hdr.ocount++;
		bs++;
	}

	/* If we added any entries, re-sort the array. */
	if (bs)
		qsort(breq->bulkstat, breq->hdr.ocount,
				sizeof(struct xfs_bulkstat), compare_bstat);
}

/* Return the inumber of the highest allocated inode in the inumbers data. */
static inline uint64_t last_allocmask_ino(const struct xfs_inumbers *i)
{
	return i->xi_startino + xfrog_highbit64(i->xi_allocmask);
}

/*
 * Run bulkstat on an entire inode allocation group, then check that we got
 * exactly the inodes we expected.  If not, load them one at a time (or fake
 * it) into the bulkstat data.
 */
static void
bulkstat_for_inumbers(
	struct scrub_ctx		*ctx,
	const struct xfs_inumbers	*inumbers,
	struct xfs_bulkstat_req		*breq)
{
	const uint64_t		limit_ino =
		inumbers->xi_startino + LIBFROG_BULKSTAT_CHUNKSIZE;
	uint64_t		seen_mask = 0;
	int			i;
	int			error;

	assert(inumbers->xi_allocmask != 0);

	/* First we try regular bulkstat, for speed. */
	breq->hdr.ino = inumbers->xi_startino;
	error = -xfrog_bulkstat(&ctx->mnt, breq);
	if (!error) {
		if (!breq->hdr.ocount)
			return;
		seen_mask |= seen_mask_from_bulkstat(inumbers,
				inumbers->xi_startino, breq);
	}

	/*
	 * If the last allocated inode as reported by inumbers is higher than
	 * the last inode reported by bulkstat, two things could have happened.
	 * Either all the inodes at the high end of the cluster were freed
	 * since the inumbers call; or bulkstat encountered a corrupt inode and
	 * returned early.  Try to bulkstat the rest of the array.
	 */
	if (last_allocmask_ino(inumbers) > last_bstat_ino(breq))
		seen_mask |= bulkstat_the_rest(ctx, inumbers, breq, error);

	/*
	 * Bulkstat might return inodes beyond xi_startino + CHUNKSIZE.  Reduce
	 * ocount to ignore inodes not described by the inumbers record.
	 */
	for (i = breq->hdr.ocount - 1; i >= 0; i--) {
		if (breq->bulkstat[i].bs_ino < limit_ino)
			break;
		breq->hdr.ocount--;
	}

	/*
	 * Fill in any missing inodes that are mentioned in the alloc mask but
	 * weren't previously seen by bulkstat.  These are the corrupt inodes.
	 */
	bulkstat_single_step(ctx, inumbers, seen_mask, breq);
}

/* BULKSTAT wrapper routines. */
struct scan_inodes {
	struct workqueue	wq_bulkstat;
	scrub_inode_iter_fn	fn;
	void			*arg;
	unsigned int		nr_threads;
	bool			aborted;
};

/*
 * A single unit of inode scan work.  This contains a pointer to the parent
 * information, followed by an INUMBERS request structure, followed by a
 * BULKSTAT request structure.  The last two are VLAs, so we can't represent
 * them here.
 */
struct scan_ichunk {
	struct scan_inodes	*si;
};

static inline struct xfs_inumbers_req *
ichunk_to_inumbers(
	struct scan_ichunk	*ichunk)
{
	char			*p = (char *)ichunk;

	return (struct xfs_inumbers_req *)(p + sizeof(struct scan_ichunk));
}

static inline struct xfs_bulkstat_req *
ichunk_to_bulkstat(
	struct scan_ichunk	*ichunk)
{
	char			*p = (char *)ichunk_to_inumbers(ichunk);

	return (struct xfs_bulkstat_req *)(p + XFS_INUMBERS_REQ_SIZE(1));
}

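/*
 * Layout sketch (illustrative only): alloc_ichunk() below packs all three
 * objects into a single allocation, and the helpers above recover the second
 * and third by offsetting from the start of the buffer:
 *
 *	[struct scan_ichunk][xfs_inumbers_req, icount=1][xfs_bulkstat_req, icount=64]
 *	^ ichunk            ^ ichunk_to_inumbers()      ^ ichunk_to_bulkstat()
 *
 * Freeing the scan_ichunk therefore releases the request buffers as well.
 */
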
static inline int
alloc_ichunk(
	struct scrub_ctx	*ctx,
	struct scan_inodes	*si,
	uint32_t		agno,
	uint64_t		startino,
	struct scan_ichunk	**ichunkp)
{
	struct scan_ichunk	*ichunk;
	struct xfs_inumbers_req	*ireq;
	struct xfs_bulkstat_req	*breq;

	ichunk = calloc(1, sizeof(struct scan_ichunk) +
			   XFS_INUMBERS_REQ_SIZE(1) +
			   XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE));
	if (!ichunk)
		return -errno;

	ichunk->si = si;

	ireq = ichunk_to_inumbers(ichunk);
	ireq->hdr.icount = 1;
	ireq->hdr.ino = startino;
	ireq->hdr.agno = agno;
	ireq->hdr.flags |= XFS_BULK_IREQ_AGNO;

	breq = ichunk_to_bulkstat(ichunk);
	breq->hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE;

	/* Scan the metadata directory tree too. */
	if (ctx->mnt.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_METADIR)
		breq->hdr.flags |= XFS_BULK_IREQ_METADIR;

	*ichunkp = ichunk;
	return 0;
}

static int
render_ino_from_bulkstat(
	struct scrub_ctx	*ctx,
	char			*buf,
	size_t			buflen,
	void			*data)
{
	struct xfs_bulkstat	*bstat = data;

	return scrub_render_ino_descr(ctx, buf, buflen, bstat->bs_ino,
			bstat->bs_gen, NULL);
}

static int
render_inumbers_from_agno(
	struct scrub_ctx	*ctx,
	char			*buf,
	size_t			buflen,
	void			*data)
{
	xfs_agnumber_t		*agno = data;

	return snprintf(buf, buflen, _("dev %d:%d AG %u inodes"),
				major(ctx->fsinfo.fs_datadev),
				minor(ctx->fsinfo.fs_datadev),
				*agno);
}

/*
 * Call BULKSTAT for information on a single chunk's worth of inodes and call
 * our iterator function.  We'll try to fill the bulkstat information in
 * batches, but we also can detect iget failures.
 */
static void
scan_ag_bulkstat(
	struct workqueue	*wq,
	xfs_agnumber_t		agno,
	void			*arg)
{
	struct xfs_handle	handle;
	struct scrub_ctx	*ctx = (struct scrub_ctx *)wq->wq_ctx;
	struct scan_ichunk	*ichunk = arg;
	struct xfs_inumbers_req	*ireq = ichunk_to_inumbers(ichunk);
	struct xfs_bulkstat_req	*breq = ichunk_to_bulkstat(ichunk);
	struct scan_inodes	*si = ichunk->si;
	struct xfs_bulkstat	*bs = &breq->bulkstat[0];
	struct xfs_inumbers	*inumbers = &ireq->inumbers[0];
	uint64_t		last_ino = 0;
	int			i;
	int			error = 0;
	int			stale_count = 0;
	DEFINE_DESCR(dsc_bulkstat, ctx, render_ino_from_bulkstat);
	DEFINE_DESCR(dsc_inumbers, ctx, render_inumbers_from_agno);

	descr_set(&dsc_inumbers, &agno);
	handle_from_fshandle(&handle, ctx->fshandle, ctx->fshandle_len);
retry:
	bulkstat_for_inumbers(ctx, inumbers, breq);

	/* Iterate all the inodes. */
	for (i = 0; !si->aborted && i < breq->hdr.ocount; i++, bs++) {
		uint64_t	scan_ino = bs->bs_ino;

		/* ensure forward progress if we retried */
		if (scan_ino < last_ino)
			continue;

		descr_set(&dsc_bulkstat, bs);
		handle_from_bulkstat(&handle, bs);
		error = si->fn(ctx, &handle, bs, si->arg);
		switch (error) {
		case 0:
			break;
		case ESTALE: {
			stale_count++;
			if (stale_count < 30) {
				uint64_t	old_startino;

				ireq->hdr.ino = old_startino =
					inumbers->xi_startino;
				error = -xfrog_inumbers(&ctx->mnt, ireq);
				if (error)
					goto err;
				/*
				 * Retry only if inumbers returns the same
				 * inobt record as the previous record and
				 * there are allocated inodes in it.
				 */
				if (!si->aborted &&
				    ireq->hdr.ocount > 0 &&
				    inumbers->xi_alloccount > 0 &&
				    inumbers->xi_startino == old_startino)
					goto retry;
				goto out;
			}
			str_info(ctx, descr_render(&dsc_bulkstat),
					_("Changed too many times during scan; giving up."));
			si->aborted = true;
			goto out;
		}
		case ECANCELED:
			error = 0;
			fallthrough;
		default:
			goto err;
		}
		if (scrub_excessive_errors(ctx)) {
			si->aborted = true;
			goto out;
		}
		last_ino = scan_ino;
	}

err:
	if (error) {
		str_liberror(ctx, error, descr_render(&dsc_bulkstat));
		si->aborted = true;
	}
out:
	free(ichunk);
}

/*
 * Call INUMBERS for information about inode chunks, then queue the inumbers
 * responses in the bulkstat workqueue.  This helps us maximize CPU
 * parallelism if the filesystem AGs are not evenly loaded.
 */
static void
scan_ag_inumbers(
	struct workqueue	*wq,
	xfs_agnumber_t		agno,
	void			*arg)
{
	struct scan_ichunk	*ichunk = NULL;
	struct scan_inodes	*si = arg;
	struct scrub_ctx	*ctx = (struct scrub_ctx *)wq->wq_ctx;
	struct xfs_inumbers_req	*ireq;
	uint64_t		nextino = cvt_agino_to_ino(&ctx->mnt, agno, 0);
	int			error;
	DEFINE_DESCR(dsc, ctx, render_inumbers_from_agno);

	descr_set(&dsc, &agno);

	error = alloc_ichunk(ctx, si, agno, 0, &ichunk);
	if (error)
		goto err;
	ireq = ichunk_to_inumbers(ichunk);

	/* Find the inode chunk & alloc mask */
	error = -xfrog_inumbers(&ctx->mnt, ireq);
	while (!error && !si->aborted && ireq->hdr.ocount > 0) {
		/*
		 * Make sure that we always make forward progress while we
		 * scan the inode btree.
		 */
		if (nextino > ireq->inumbers[0].xi_startino) {
			str_corrupt(ctx, descr_render(&dsc),
_("AG %u inode btree is corrupt near agino %lu, got %lu"), agno,
					cvt_ino_to_agino(&ctx->mnt, nextino),
					cvt_ino_to_agino(&ctx->mnt,
						ireq->inumbers[0].xi_startino));
			si->aborted = true;
			break;
		}
		nextino = ireq->hdr.ino;

		if (ireq->inumbers[0].xi_alloccount == 0) {
			/*
			 * We can have totally empty inode chunks on
			 * filesystems where there are more than 64 inodes per
			 * block.  Skip these.
			 */
			;
		} else if (si->nr_threads > 0) {
			/* Queue this inode chunk on the bulkstat workqueue. */
			error = -workqueue_add(&si->wq_bulkstat,
					scan_ag_bulkstat, agno, ichunk);
			if (error) {
				si->aborted = true;
				str_liberror(ctx, error,
						_("queueing bulkstat work"));
				goto out;
			}
			ichunk = NULL;
		} else {
			/*
			 * Only one thread, call bulkstat directly.  Remember,
			 * ichunk is freed by the worker before returning.
			 */
			scan_ag_bulkstat(wq, agno, ichunk);
			ichunk = NULL;
			if (si->aborted)
				break;
		}

		if (!ichunk) {
			error = alloc_ichunk(ctx, si, agno, nextino, &ichunk);
			if (error)
				goto err;
		}
		ireq = ichunk_to_inumbers(ichunk);

		error = -xfrog_inumbers(&ctx->mnt, ireq);
	}

err:
	if (error) {
		str_liberror(ctx, error, descr_render(&dsc));
		si->aborted = true;
	}
out:
	if (ichunk)
		free(ichunk);
}

/*
 * Scan all the inodes in a filesystem, including metadata directory files and
 * broken files.  On error, this function will log an error message and return
 * -1.
 */
int
scrub_scan_all_inodes(
	struct scrub_ctx	*ctx,
	scrub_inode_iter_fn	fn,
	void			*arg)
{
	struct scan_inodes	si = {
		.fn		= fn,
		.arg		= arg,
		.nr_threads	= scrub_nproc_workqueue(ctx),
	};
	xfs_agnumber_t		agno;
	struct workqueue	wq_inumbers;
	unsigned int		max_bulkstat;
	int			ret;

	/*
	 * The bulkstat workqueue should queue at most one inobt block's worth
	 * of inode chunk records per worker thread.  If we're running in
	 * single thread mode (nr_threads==0) then we skip the workqueues.
	 */
	max_bulkstat = si.nr_threads * (ctx->mnt.fsgeom.blocksize / 16);

	ret = -workqueue_create_bound(&si.wq_bulkstat, (struct xfs_mount *)ctx,
			si.nr_threads, max_bulkstat);
	if (ret) {
		str_liberror(ctx, ret, _("creating bulkstat workqueue"));
		return -1;
	}

	ret = -workqueue_create(&wq_inumbers, (struct xfs_mount *)ctx,
			si.nr_threads);
	if (ret) {
		str_liberror(ctx, ret, _("creating inumbers workqueue"));
		si.aborted = true;
		goto kill_bulkstat;
	}

	for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) {
		ret = -workqueue_add(&wq_inumbers, scan_ag_inumbers, agno, &si);
		if (ret) {
			si.aborted = true;
			str_liberror(ctx, ret, _("queueing inumbers work"));
			break;
		}
	}

	ret = -workqueue_terminate(&wq_inumbers);
	if (ret) {
		si.aborted = true;
		str_liberror(ctx, ret, _("finishing inumbers work"));
	}
	workqueue_destroy(&wq_inumbers);

kill_bulkstat:
	ret = -workqueue_terminate(&si.wq_bulkstat);
	if (ret) {
		si.aborted = true;
		str_liberror(ctx, ret, _("finishing bulkstat work"));
	}
	workqueue_destroy(&si.wq_bulkstat);

	return si.aborted ? -1 : 0;
}

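/*
 * Usage sketch (illustrative only, not part of the scanner): a caller would
 * typically define a scrub_inode_iter_fn callback matching the si->fn()
 * calls above and hand it to scrub_scan_all_inodes() along with a private
 * pointer, e.g.:
 *
 *	static int
 *	count_inode(struct scrub_ctx *ctx, struct xfs_handle *handle,
 *		    struct xfs_bulkstat *bstat, void *arg)
 *	{
 *		uint64_t	*counter = arg;
 *
 *		(*counter)++;
 *		return 0;	// returning ECANCELED would stop the scan
 *	}
 *
 *	uint64_t	nfiles = 0;
 *
 *	if (scrub_scan_all_inodes(ctx, count_inode, &nfiles) < 0)
 *		return -1;
 *
 * The callback's exact prototype is declared in inodes.h; the snippet assumes
 * it matches the argument order used by the workers in this file.
 */
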
struct user_bulkstat {
	struct scan_inodes	*si;

	/* vla, must be last */
	struct xfs_bulkstat_req	breq;
};

/* Iterate all the user files returned by a bulkstat. */
static void
scan_user_files(
	struct workqueue	*wq,
	xfs_agnumber_t		agno,
	void			*arg)
{
	struct xfs_handle	handle;
	struct scrub_ctx	*ctx = (struct scrub_ctx *)wq->wq_ctx;
	struct user_bulkstat	*ureq = arg;
	struct xfs_bulkstat	*bs = &ureq->breq.bulkstat[0];
	struct scan_inodes	*si = ureq->si;
	int			i;
	int			error = 0;
	DEFINE_DESCR(dsc_bulkstat, ctx, render_ino_from_bulkstat);

	handle_from_fshandle(&handle, ctx->fshandle, ctx->fshandle_len);

	for (i = 0; !si->aborted && i < ureq->breq.hdr.ocount; i++, bs++) {
		descr_set(&dsc_bulkstat, bs);
		handle_from_bulkstat(&handle, bs);
		error = si->fn(ctx, &handle, bs, si->arg);
		switch (error) {
		case 0:
			break;
		case ESTALE:
		case ECANCELED:
			error = 0;
			fallthrough;
		default:
			goto err;
		}
		if (scrub_excessive_errors(ctx)) {
			si->aborted = true;
			goto out;
		}
	}

err:
	if (error) {
		str_liberror(ctx, error, descr_render(&dsc_bulkstat));
		si->aborted = true;
	}
out:
	free(ureq);
}

/*
 * Run one step of the user files bulkstat scan and schedule background
 * processing of the stat data returned.  Returns 1 to keep going, or 0 to
 * stop.
 */
static int
scan_user_bulkstat(
	struct scrub_ctx	*ctx,
	struct scan_inodes	*si,
	uint64_t		*cursor)
{
	struct user_bulkstat	*ureq;
	const char		*what = NULL;
	int			ret;

	ureq = calloc(1, sizeof(struct user_bulkstat) +
			XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE));
	if (!ureq) {
		ret = ENOMEM;
		what = _("creating bulkstat work item");
		goto err;
	}
	ureq->si = si;
	ureq->breq.hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE;
	ureq->breq.hdr.ino = *cursor;

	ret = -xfrog_bulkstat(&ctx->mnt, &ureq->breq);
	if (ret) {
		what = _("user files bulkstat");
		goto err_ureq;
	}
	if (ureq->breq.hdr.ocount == 0) {
		*cursor = NULLFSINO;
		free(ureq);
		return 0;
	}

	*cursor = ureq->breq.hdr.ino;

	/* scan_user_files frees ureq; do not access it */
	ret = -workqueue_add(&si->wq_bulkstat, scan_user_files, 0, ureq);
	if (ret) {
		what = _("queueing bulkstat work");
		goto err_ureq;
	}
	ureq = NULL;

	return 1;

err_ureq:
	free(ureq);
err:
	si->aborted = true;
	str_liberror(ctx, ret, what);
	return 0;
}

/*
 * Scan all the user files in a filesystem in inumber order.  On error, this
 * function will log an error message and return -1.
 */
int
scrub_scan_user_files(
	struct scrub_ctx	*ctx,
	scrub_inode_iter_fn	fn,
	void			*arg)
{
	struct scan_inodes	si = {
		.fn		= fn,
		.arg		= arg,
		.nr_threads	= scrub_nproc_workqueue(ctx),
	};
	uint64_t		ino = 0;
	int			ret;

	/* Queue up to four bulkstat result sets per thread. */
	ret = -workqueue_create_bound(&si.wq_bulkstat, (struct xfs_mount *)ctx,
			si.nr_threads, si.nr_threads * 4);
	if (ret) {
		str_liberror(ctx, ret, _("creating bulkstat workqueue"));
		return -1;
	}

	while ((ret = scan_user_bulkstat(ctx, &si, &ino)) == 1) {
		/* empty */
	}

	ret = -workqueue_terminate(&si.wq_bulkstat);
	if (ret) {
		si.aborted = true;
		str_liberror(ctx, ret, _("finishing bulkstat work"));
	}
	workqueue_destroy(&si.wq_bulkstat);

	return si.aborted ? -1 : 0;
}

/* Open a file by handle, returning either the fd or -1 on error. */
int
scrub_open_handle(
	struct xfs_handle	*handle)
{
	return open_by_fshandle(handle, sizeof(*handle),
			O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NOCTTY);
}