]>
Commit | Line | Data |
---|---|---|
8d318d62 | 1 | // SPDX-License-Identifier: GPL-2.0-or-later |
372d4ba9 | 2 | /* |
52520522 | 3 | * Copyright (C) 2018-2024 Oracle. All Rights Reserved. |
8d318d62 | 4 | * Author: Darrick J. Wong <djwong@kernel.org> |
372d4ba9 | 5 | */ |
a440f877 | 6 | #include "xfs.h" |
372d4ba9 DW |
7 | #include <stdint.h> |
8 | #include <stdlib.h> | |
9 | #include <pthread.h> | |
10 | #include <sys/statvfs.h> | |
11 | #include "platform_defs.h" | |
372d4ba9 | 12 | #include "xfs_arch.h" |
372d4ba9 | 13 | #include "handle.h" |
42b4c8e8 | 14 | #include "libfrog/paths.h" |
56598728 | 15 | #include "libfrog/workqueue.h" |
372d4ba9 DW |
16 | #include "xfs_scrub.h" |
17 | #include "common.h" | |
18 | #include "inodes.h" | |
245c72a6 | 19 | #include "descr.h" |
fee68490 | 20 | #include "libfrog/fsgeom.h" |
f31b5e12 | 21 | #include "libfrog/bulkstat.h" |
372d4ba9 DW |
22 | |
23 | /* | |
24 | * Iterate a range of inodes. | |
25 | * | |
26 | * This is a little more involved than repeatedly asking BULKSTAT for a | |
27 | * buffer's worth of stat data for some number of inodes. We want to scan as | |
28 | * many of the inodes that the inobt thinks there are, including the ones that | |
29 | * are broken, but if we ask for n inodes starting at x, it'll skip the bad | |
30 | * ones and fill from beyond the range (x + n). | |
31 | * | |
32 | * Therefore, we ask INUMBERS to return one inobt chunk's worth of inode | |
33 | * bitmap information. Then we try to BULKSTAT only the inodes that were | |
34 | * present in that chunk, and compare what we got against what INUMBERS said | |
35 | * was there. If there's a mismatch, we know that we have an inode that fails | |
36 | * the verifiers but we can inject the bulkstat information to force the scrub | |
37 | * code to deal with the broken inodes. | |
38 | * | |
39 | * If the iteration function returns ESTALE, that means that the inode has | |
40 | been deleted and possibly recreated since the BULKSTAT call.  We will | |
41 | * refresh the stat information and try again up to 30 times before reporting | |
42 | * the staleness as an error. | |
43 | */ | |
44 | ||
/*
 * Run bulkstat on an entire inode allocation group, then check that we got
 * exactly the inodes we expected.  If not, load them one at a time (or fake
 * it) into the bulkstat data.
 *
 * On return, breq->bulkstat contains one record for every bit set in
 * inumbers->xi_allocmask, in order; records that neither bulkstat call could
 * fill are zeroed except for bs_ino and bs_blksize so that the scrub code
 * still visits the (presumably broken) inode.
 */
static void
bulkstat_for_inumbers(
	struct scrub_ctx	*ctx,
	struct descr		*dsc,
	const struct xfs_inumbers *inumbers,
	struct xfs_bulkstat_req	*breq)
{
	struct xfs_bulkstat	*bstat = breq->bulkstat;
	struct xfs_bulkstat	*bs;
	int			i;
	int			error;

	/* First we try regular bulkstat, for speed. */
	breq->hdr.ino = inumbers->xi_startino;
	breq->hdr.icount = inumbers->xi_alloccount;
	/*
	 * A bulkstat failure is only logged, not fatal: the fixup loop below
	 * walks the allocmask and backfills whatever this call missed.
	 */
	error = -xfrog_bulkstat(&ctx->mnt, breq);
	if (error) {
		char	errbuf[DESCR_BUFSZ];

		str_info(ctx, descr_render(dsc), "%s",
				strerror_r(error, errbuf, DESCR_BUFSZ));
	}

	/*
	 * Check each of the stats we got back to make sure we got the inodes
	 * we asked for.
	 */
	for (i = 0, bs = bstat; i < LIBFROG_BULKSTAT_CHUNKSIZE; i++) {
		/* Skip holes in the inode chunk. */
		if (!(inumbers->xi_allocmask & (1ULL << i)))
			continue;
		/* Bulkstat already filled this slot with the right inode. */
		if (bs->bs_ino == inumbers->xi_startino + i) {
			bs++;
			continue;
		}

		/* Load the one inode. */
		error = -xfrog_bulkstat_single(&ctx->mnt,
				inumbers->xi_startino + i, 0, bs);
		if (error || bs->bs_ino != inumbers->xi_startino + i) {
			/*
			 * Still no data; synthesize a minimal record so the
			 * iterator sees the inode number anyway.
			 */
			memset(bs, 0, sizeof(struct xfs_bulkstat));
			bs->bs_ino = inumbers->xi_startino + i;
			bs->bs_blksize = ctx->mnt_sv.f_frsize;
		}
		bs++;
	}
}
96 | ||
/* BULKSTAT wrapper routines. */
struct scan_inodes {
	/* Workqueue that runs one bulkstat work item per inode chunk. */
	struct workqueue	wq_bulkstat;
	/* Caller's iterator, invoked once per bulkstat record. */
	scrub_inode_iter_fn	fn;
	/* Opaque argument passed through to fn. */
	void			*arg;
	/* Worker thread count; 0 means bulkstat runs inline, no workqueue. */
	unsigned int		nr_threads;
	/* Set (never cleared) when any worker wants the scan to stop. */
	bool			aborted;
};
105 | ||
/*
 * A single unit of inode scan work.  This contains a pointer to the parent
 * information, followed by an INUMBERS request structure, followed by a
 * BULKSTAT request structure.  The last two are VLAs, so we can't represent
 * them here.  Use ichunk_to_inumbers/ichunk_to_bulkstat to reach them.
 */
struct scan_ichunk {
	/* Back-pointer to the scan that queued this chunk of work. */
	struct scan_inodes	*si;
};
115 | ||
116 | static inline struct xfs_inumbers_req * | |
117 | ichunk_to_inumbers( | |
118 | struct scan_ichunk *ichunk) | |
119 | { | |
120 | char *p = (char *)ichunk; | |
121 | ||
122 | return (struct xfs_inumbers_req *)(p + sizeof(struct scan_ichunk)); | |
123 | } | |
124 | ||
/* Return the BULKSTAT request packed after the one-record INUMBERS request. */
static inline struct xfs_bulkstat_req *
ichunk_to_bulkstat(
	struct scan_ichunk	*ichunk)
{
	char			*inumbers_end;

	inumbers_end = (char *)ichunk_to_inumbers(ichunk) +
			XFS_INUMBERS_REQ_SIZE(1);
	return (struct xfs_bulkstat_req *)inumbers_end;
}
133 | ||
134 | static inline int | |
135 | alloc_ichunk( | |
136 | struct scan_inodes *si, | |
137 | uint32_t agno, | |
138 | uint64_t startino, | |
139 | struct scan_ichunk **ichunkp) | |
140 | { | |
141 | struct scan_ichunk *ichunk; | |
142 | struct xfs_inumbers_req *ireq; | |
143 | struct xfs_bulkstat_req *breq; | |
144 | ||
145 | ichunk = calloc(1, sizeof(struct scan_ichunk) + | |
146 | XFS_INUMBERS_REQ_SIZE(1) + | |
147 | XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE)); | |
148 | if (!ichunk) | |
149 | return -errno; | |
150 | ||
151 | ichunk->si = si; | |
152 | ||
153 | ireq = ichunk_to_inumbers(ichunk); | |
154 | ireq->hdr.icount = 1; | |
155 | ireq->hdr.ino = startino; | |
156 | ireq->hdr.agno = agno; | |
157 | ireq->hdr.flags |= XFS_BULK_IREQ_AGNO; | |
158 | ||
159 | breq = ichunk_to_bulkstat(ichunk); | |
160 | breq->hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE; | |
161 | ||
162 | *ichunkp = ichunk; | |
163 | return 0; | |
164 | } | |
165 | ||
b6fef47a | 166 | static int |
245c72a6 DW |
167 | render_ino_from_bulkstat( |
168 | struct scrub_ctx *ctx, | |
169 | char *buf, | |
170 | size_t buflen, | |
171 | void *data) | |
172 | { | |
173 | struct xfs_bulkstat *bstat = data; | |
174 | ||
175 | return scrub_render_ino_descr(ctx, buf, buflen, bstat->bs_ino, | |
176 | bstat->bs_gen, NULL); | |
177 | } | |
178 | ||
179 | static int | |
180 | render_inumbers_from_agno( | |
181 | struct scrub_ctx *ctx, | |
182 | char *buf, | |
183 | size_t buflen, | |
184 | void *data) | |
185 | { | |
186 | xfs_agnumber_t *agno = data; | |
187 | ||
188 | return snprintf(buf, buflen, _("dev %d:%d AG %u inodes"), | |
189 | major(ctx->fsinfo.fs_datadev), | |
190 | minor(ctx->fsinfo.fs_datadev), | |
191 | *agno); | |
192 | } | |
193 | ||
194 | /* | |
195 | * Call BULKSTAT for information on a single chunk's worth of inodes and call | |
196 | * our iterator function. We'll try to fill the bulkstat information in | |
197 | * batches, but we also can detect iget failures. | |
372d4ba9 | 198 | */ |
59f79e0a | 199 | static void |
245c72a6 | 200 | scan_ag_bulkstat( |
59f79e0a DW |
201 | struct workqueue *wq, |
202 | xfs_agnumber_t agno, | |
372d4ba9 DW |
203 | void *arg) |
204 | { | |
bbfbf5dd | 205 | struct xfs_handle handle = { }; |
59f79e0a | 206 | struct scrub_ctx *ctx = (struct scrub_ctx *)wq->wq_ctx; |
245c72a6 DW |
207 | struct scan_ichunk *ichunk = arg; |
208 | struct xfs_inumbers_req *ireq = ichunk_to_inumbers(ichunk); | |
209 | struct xfs_bulkstat_req *breq = ichunk_to_bulkstat(ichunk); | |
210 | struct scan_inodes *si = ichunk->si; | |
4cca629d | 211 | struct xfs_bulkstat *bs; |
245c72a6 | 212 | struct xfs_inumbers *inumbers = &ireq->inumbers[0]; |
9f4d6358 | 213 | uint64_t last_ino = 0; |
372d4ba9 DW |
214 | int i; |
215 | int error; | |
216 | int stale_count = 0; | |
245c72a6 DW |
217 | DEFINE_DESCR(dsc_bulkstat, ctx, render_ino_from_bulkstat); |
218 | DEFINE_DESCR(dsc_inumbers, ctx, render_inumbers_from_agno); | |
372d4ba9 | 219 | |
245c72a6 | 220 | descr_set(&dsc_inumbers, &agno); |
59f79e0a DW |
221 | |
222 | memcpy(&handle.ha_fsid, ctx->fshandle, sizeof(handle.ha_fsid)); | |
372d4ba9 DW |
223 | handle.ha_fid.fid_len = sizeof(xfs_fid_t) - |
224 | sizeof(handle.ha_fid.fid_len); | |
225 | handle.ha_fid.fid_pad = 0; | |
226 | ||
245c72a6 DW |
227 | retry: |
228 | bulkstat_for_inumbers(ctx, &dsc_inumbers, inumbers, breq); | |
229 | ||
230 | /* Iterate all the inodes. */ | |
231 | bs = &breq->bulkstat[0]; | |
232 | for (i = 0; !si->aborted && i < inumbers->xi_alloccount; i++, bs++) { | |
9f4d6358 DW |
233 | uint64_t scan_ino = bs->bs_ino; |
234 | ||
235 | /* ensure forward progress if we retried */ | |
236 | if (scan_ino < last_ino) | |
237 | continue; | |
238 | ||
245c72a6 | 239 | descr_set(&dsc_bulkstat, bs); |
9f4d6358 | 240 | handle.ha_fid.fid_ino = scan_ino; |
245c72a6 DW |
241 | handle.ha_fid.fid_gen = bs->bs_gen; |
242 | error = si->fn(ctx, &handle, bs, si->arg); | |
243 | switch (error) { | |
244 | case 0: | |
245 | break; | |
246 | case ESTALE: { | |
247 | stale_count++; | |
248 | if (stale_count < 30) { | |
249 | ireq->hdr.ino = inumbers->xi_startino; | |
250 | error = -xfrog_inumbers(&ctx->mnt, ireq); | |
251 | if (error) | |
252 | goto err; | |
253 | goto retry; | |
254 | } | |
255 | str_info(ctx, descr_render(&dsc_bulkstat), | |
256 | _("Changed too many times during scan; giving up.")); | |
257 | si->aborted = true; | |
258 | goto out; | |
259 | } | |
260 | case ECANCELED: | |
261 | error = 0; | |
262 | fallthrough; | |
263 | default: | |
264 | goto err; | |
265 | } | |
266 | if (scrub_excessive_errors(ctx)) { | |
267 | si->aborted = true; | |
268 | goto out; | |
269 | } | |
9f4d6358 | 270 | last_ino = scan_ino; |
4cca629d DW |
271 | } |
272 | ||
245c72a6 | 273 | err: |
e6542132 | 274 | if (error) { |
245c72a6 | 275 | str_liberror(ctx, error, descr_render(&dsc_bulkstat)); |
59f79e0a | 276 | si->aborted = true; |
b94a69ac | 277 | } |
245c72a6 DW |
278 | out: |
279 | free(ichunk); | |
280 | } | |
281 | ||
282 | /* | |
283 | * Call INUMBERS for information about inode chunks, then queue the inumbers | |
284 | * responses in the bulkstat workqueue. This helps us maximize CPU parallelism | |
285 | * if the filesystem AGs are not evenly loaded. | |
286 | */ | |
287 | static void | |
288 | scan_ag_inumbers( | |
289 | struct workqueue *wq, | |
290 | xfs_agnumber_t agno, | |
291 | void *arg) | |
292 | { | |
293 | struct scan_ichunk *ichunk = NULL; | |
294 | struct scan_inodes *si = arg; | |
295 | struct scrub_ctx *ctx = (struct scrub_ctx *)wq->wq_ctx; | |
296 | struct xfs_inumbers_req *ireq; | |
297 | uint64_t nextino = cvt_agino_to_ino(&ctx->mnt, agno, 0); | |
298 | int error; | |
299 | DEFINE_DESCR(dsc, ctx, render_inumbers_from_agno); | |
300 | ||
301 | descr_set(&dsc, &agno); | |
302 | ||
303 | error = alloc_ichunk(si, agno, 0, &ichunk); | |
304 | if (error) | |
305 | goto err; | |
306 | ireq = ichunk_to_inumbers(ichunk); | |
b94a69ac | 307 | |
372d4ba9 | 308 | /* Find the inode chunk & alloc mask */ |
e6542132 | 309 | error = -xfrog_inumbers(&ctx->mnt, ireq); |
59f79e0a | 310 | while (!error && !si->aborted && ireq->hdr.ocount > 0) { |
4f546267 DW |
311 | /* |
312 | * Make sure that we always make forward progress while we | |
313 | * scan the inode btree. | |
314 | */ | |
245c72a6 DW |
315 | if (nextino > ireq->inumbers[0].xi_startino) { |
316 | str_corrupt(ctx, descr_render(&dsc), | |
4f546267 DW |
317 | _("AG %u inode btree is corrupt near agino %lu, got %lu"), agno, |
318 | cvt_ino_to_agino(&ctx->mnt, nextino), | |
319 | cvt_ino_to_agino(&ctx->mnt, | |
320 | ireq->inumbers[0].xi_startino)); | |
321 | si->aborted = true; | |
322 | break; | |
323 | } | |
324 | nextino = ireq->hdr.ino; | |
325 | ||
245c72a6 DW |
326 | if (ireq->inumbers[0].xi_alloccount == 0) { |
327 | /* | |
328 | * We can have totally empty inode chunks on | |
329 | * filesystems where there are more than 64 inodes per | |
330 | * block. Skip these. | |
331 | */ | |
332 | ; | |
333 | } else if (si->nr_threads > 0) { | |
334 | /* Queue this inode chunk on the bulkstat workqueue. */ | |
335 | error = -workqueue_add(&si->wq_bulkstat, | |
336 | scan_ag_bulkstat, agno, ichunk); | |
337 | if (error) { | |
59f79e0a | 338 | si->aborted = true; |
245c72a6 DW |
339 | str_liberror(ctx, error, |
340 | _("queueing bulkstat work")); | |
372d4ba9 DW |
341 | goto out; |
342 | } | |
245c72a6 DW |
343 | ichunk = NULL; |
344 | } else { | |
345 | /* | |
346 | * Only one thread, call bulkstat directly. Remember, | |
347 | * ichunk is freed by the worker before returning. | |
348 | */ | |
349 | scan_ag_bulkstat(wq, agno, ichunk); | |
350 | ichunk = NULL; | |
351 | if (si->aborted) | |
352 | break; | |
353 | } | |
354 | ||
355 | if (!ichunk) { | |
356 | error = alloc_ichunk(si, agno, nextino, &ichunk); | |
357 | if (error) | |
358 | goto err; | |
372d4ba9 | 359 | } |
245c72a6 | 360 | ireq = ichunk_to_inumbers(ichunk); |
372d4ba9 | 361 | |
e6542132 | 362 | error = -xfrog_inumbers(&ctx->mnt, ireq); |
372d4ba9 DW |
363 | } |
364 | ||
365 | err: | |
366 | if (error) { | |
245c72a6 | 367 | str_liberror(ctx, error, descr_render(&dsc)); |
59f79e0a | 368 | si->aborted = true; |
372d4ba9 DW |
369 | } |
370 | out: | |
245c72a6 DW |
371 | if (ichunk) |
372 | free(ichunk); | |
372d4ba9 DW |
373 | } |
374 | ||
/*
 * Scan all the inodes in a filesystem.  On error, this function will log
 * an error message and return -1.
 *
 * One INUMBERS worker runs per AG; each feeds inode-chunk work items into
 * the (bounded) bulkstat workqueue, which invokes @fn per inode.
 */
int
scrub_scan_all_inodes(
	struct scrub_ctx	*ctx,
	scrub_inode_iter_fn	fn,
	void			*arg)
{
	struct scan_inodes	si = {
		.fn		= fn,
		.arg		= arg,
		.nr_threads	= scrub_nproc_workqueue(ctx),
	};
	xfs_agnumber_t		agno;
	struct workqueue	wq_inumbers;
	unsigned int		max_bulkstat;
	int			ret;

	/*
	 * The bulkstat workqueue should queue at most one inobt block's worth
	 * of inode chunk records per worker thread.  If we're running in
	 * single thread mode (nr_threads==0) then we skip the workqueues.
	 */
	max_bulkstat = si.nr_threads * (ctx->mnt.fsgeom.blocksize / 16);

	ret = -workqueue_create_bound(&si.wq_bulkstat, (struct xfs_mount *)ctx,
			si.nr_threads, max_bulkstat);
	if (ret) {
		str_liberror(ctx, ret, _("creating bulkstat workqueue"));
		return -1;
	}

	ret = -workqueue_create(&wq_inumbers, (struct xfs_mount *)ctx,
			si.nr_threads);
	if (ret) {
		str_liberror(ctx, ret, _("creating inumbers workqueue"));
		si.aborted = true;
		goto kill_bulkstat;
	}

	/* Queue one INUMBERS scan per AG. */
	for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) {
		ret = -workqueue_add(&wq_inumbers, scan_ag_inumbers, agno, &si);
		if (ret) {
			si.aborted = true;
			str_liberror(ctx, ret, _("queueing inumbers work"));
			break;
		}
	}

	/*
	 * Drain the inumbers workqueue (the producer) before the bulkstat
	 * workqueue it feeds, so no worker adds to a dead queue.
	 */
	ret = -workqueue_terminate(&wq_inumbers);
	if (ret) {
		si.aborted = true;
		str_liberror(ctx, ret, _("finishing inumbers work"));
	}
	workqueue_destroy(&wq_inumbers);

kill_bulkstat:
	ret = -workqueue_terminate(&si.wq_bulkstat);
	if (ret) {
		si.aborted = true;
		str_liberror(ctx, ret, _("finishing bulkstat work"));
	}
	workqueue_destroy(&si.wq_bulkstat);

	return si.aborted ? -1 : 0;
}
443 | ||
59f79e0a | 444 | /* Open a file by handle, returning either the fd or -1 on error. */ |
372d4ba9 | 445 | int |
59f79e0a | 446 | scrub_open_handle( |
372d4ba9 DW |
447 | struct xfs_handle *handle) |
448 | { | |
449 | return open_by_fshandle(handle, sizeof(*handle), | |
450 | O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NOCTTY); | |
451 | } |