// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2024 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include <stdint.h>
#include <stdlib.h>
#include <pthread.h>
#include <sys/statvfs.h>
#include "platform_defs.h"
#include "xfs_arch.h"
#include "handle.h"
#include "libfrog/paths.h"
#include "libfrog/workqueue.h"
#include "xfs_scrub.h"
#include "common.h"
#include "inodes.h"
#include "descr.h"
#include "libfrog/fsgeom.h"
#include "libfrog/bulkstat.h"

/*
 * Iterate a range of inodes.
 *
 * This is a little more involved than repeatedly asking BULKSTAT for a
 * buffer's worth of stat data for some number of inodes. We want to scan as
 * many of the inodes as the inobt thinks there are, including the ones that
 * are broken, but if we ask for n inodes starting at x, it'll skip the bad
 * ones and fill from beyond the range (x + n).
 *
 * Therefore, we ask INUMBERS to return one inobt chunk's worth of inode
 * bitmap information. Then we try to BULKSTAT only the inodes that were
 * present in that chunk, and compare what we got against what INUMBERS said
 * was there. If there's a mismatch, we know that we have an inode that fails
 * the verifiers, but we can inject the bulkstat information to force the
 * scrub code to deal with the broken inodes.
 *
 * If the iteration function returns ESTALE, that means that the inode has
 * been deleted and possibly recreated since the BULKSTAT call. We will
 * refresh the stat information and try again up to 30 times before reporting
 * the staleness as an error.
 */

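/*
 * The following is an illustrative sketch (not called by the scrub code) of
 * the INUMBERS -> BULKSTAT pattern described above, using the libfrog request
 * helpers. It omits the mismatch detection and bulkstat injection that
 * bulkstat_for_inumbers() below performs; "example_walk_one_chunk" is a
 * made-up name for this example only.
 */
static int
example_walk_one_chunk(
	struct xfs_fd		*xfd,
	uint64_t		startino)
{
	struct xfs_inumbers_req	*ireq;
	struct xfs_bulkstat_req	*breq;
	int			error;

	/* Ask INUMBERS for one inobt record's worth of allocation bitmap. */
	error = -xfrog_inumbers_alloc_req(1, startino, &ireq);
	if (error)
		return error;
	error = -xfrog_inumbers(xfd, ireq);
	if (error || ireq->hdr.ocount == 0)
		goto out_ireq;

	/* Bulkstat only the inodes that INUMBERS said were allocated. */
	error = -xfrog_bulkstat_alloc_req(LIBFROG_BULKSTAT_CHUNKSIZE,
			ireq->inumbers[0].xi_startino, &breq);
	if (error)
		goto out_ireq;
	breq->hdr.icount = ireq->inumbers[0].xi_alloccount;
	error = -xfrog_bulkstat(xfd, breq);

	/* Compare breq->bulkstat against ireq->inumbers[0] here. */
	free(breq);
out_ireq:
	free(ireq);
	return error;
}
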
/*
 * Run bulkstat on an entire inode chunk, then check that we got exactly the
 * inodes we expected. If not, load them one at a time (or fake it) into the
 * bulkstat data.
 */
static void
bulkstat_for_inumbers(
	struct scrub_ctx	*ctx,
	struct descr		*dsc,
	const struct xfs_inumbers *inumbers,
	struct xfs_bulkstat_req	*breq)
{
	struct xfs_bulkstat	*bstat = breq->bulkstat;
	struct xfs_bulkstat	*bs;
	int			i;
	int			error;

	/* First we try regular bulkstat, for speed. */
	breq->hdr.ino = inumbers->xi_startino;
	breq->hdr.icount = inumbers->xi_alloccount;
	error = -xfrog_bulkstat(&ctx->mnt, breq);
	if (error) {
		char	errbuf[DESCR_BUFSZ];

		str_info(ctx, descr_render(dsc), "%s",
				strerror_r(error, errbuf, DESCR_BUFSZ));
	}

	/*
	 * Check each of the stats we got back to make sure we got the inodes
	 * we asked for.
	 */
	for (i = 0, bs = bstat; i < LIBFROG_BULKSTAT_CHUNKSIZE; i++) {
		if (!(inumbers->xi_allocmask & (1ULL << i)))
			continue;
		if (bs->bs_ino == inumbers->xi_startino + i) {
			bs++;
			continue;
		}

		/* Load the one inode. */
		error = -xfrog_bulkstat_single(&ctx->mnt,
				inumbers->xi_startino + i, 0, bs);
		if (error || bs->bs_ino != inumbers->xi_startino + i) {
			memset(bs, 0, sizeof(struct xfs_bulkstat));
			bs->bs_ino = inumbers->xi_startino + i;
			bs->bs_blksize = ctx->mnt_sv.f_frsize;
		}
		bs++;
	}
}

/* BULKSTAT wrapper routines. */
struct scan_inodes {
	struct workqueue	wq_bulkstat;
	scrub_inode_iter_fn	fn;
	void			*arg;
	unsigned int		nr_threads;
	bool			aborted;
};

/*
 * A single unit of inode scan work. This contains a pointer to the parent
 * information, followed by an INUMBERS request structure, followed by a
 * BULKSTAT request structure. The last two are VLAs, so we can't represent
 * them here.
 */
struct scan_ichunk {
	struct scan_inodes	*si;
};

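/*
 * Layout of the single allocation created by alloc_ichunk() below; the two
 * helpers that follow recover pointers to the trailing request structures:
 *
 *   +--------------------------+  offset 0
 *   | struct scan_ichunk       |
 *   +--------------------------+  offset sizeof(struct scan_ichunk)
 *   | struct xfs_inumbers_req  |  XFS_INUMBERS_REQ_SIZE(1) bytes
 *   +--------------------------+
 *   | struct xfs_bulkstat_req  |  XFS_BULKSTAT_REQ_SIZE(
 *   |                          |      LIBFROG_BULKSTAT_CHUNKSIZE) bytes
 *   +--------------------------+
 */
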
static inline struct xfs_inumbers_req *
ichunk_to_inumbers(
	struct scan_ichunk	*ichunk)
{
	char			*p = (char *)ichunk;

	return (struct xfs_inumbers_req *)(p + sizeof(struct scan_ichunk));
}

static inline struct xfs_bulkstat_req *
ichunk_to_bulkstat(
	struct scan_ichunk	*ichunk)
{
	char			*p = (char *)ichunk_to_inumbers(ichunk);

	return (struct xfs_bulkstat_req *)(p + XFS_INUMBERS_REQ_SIZE(1));
}

static inline int
alloc_ichunk(
	struct scan_inodes	*si,
	uint32_t		agno,
	uint64_t		startino,
	struct scan_ichunk	**ichunkp)
{
	struct scan_ichunk	*ichunk;
	struct xfs_inumbers_req	*ireq;
	struct xfs_bulkstat_req	*breq;

	ichunk = calloc(1, sizeof(struct scan_ichunk) +
			   XFS_INUMBERS_REQ_SIZE(1) +
			   XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE));
	if (!ichunk)
		return -errno;

	ichunk->si = si;

	ireq = ichunk_to_inumbers(ichunk);
	ireq->hdr.icount = 1;
	ireq->hdr.ino = startino;
	ireq->hdr.agno = agno;
	ireq->hdr.flags |= XFS_BULK_IREQ_AGNO;

	breq = ichunk_to_bulkstat(ichunk);
	breq->hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE;

	*ichunkp = ichunk;
	return 0;
}

static int
render_ino_from_bulkstat(
	struct scrub_ctx	*ctx,
	char			*buf,
	size_t			buflen,
	void			*data)
{
	struct xfs_bulkstat	*bstat = data;

	return scrub_render_ino_descr(ctx, buf, buflen, bstat->bs_ino,
			bstat->bs_gen, NULL);
}

static int
render_inumbers_from_agno(
	struct scrub_ctx	*ctx,
	char			*buf,
	size_t			buflen,
	void			*data)
{
	xfs_agnumber_t		*agno = data;

	return snprintf(buf, buflen, _("dev %d:%d AG %u inodes"),
				major(ctx->fsinfo.fs_datadev),
				minor(ctx->fsinfo.fs_datadev),
				*agno);
}

/*
 * Call BULKSTAT for information on a single chunk's worth of inodes and call
 * our iterator function. We'll try to fill the bulkstat information in
 * batches, but we also can detect iget failures.
 */
static void
scan_ag_bulkstat(
	struct workqueue	*wq,
	xfs_agnumber_t		agno,
	void			*arg)
{
	struct xfs_handle	handle = { };
	struct scrub_ctx	*ctx = (struct scrub_ctx *)wq->wq_ctx;
	struct scan_ichunk	*ichunk = arg;
	struct xfs_inumbers_req	*ireq = ichunk_to_inumbers(ichunk);
	struct xfs_bulkstat_req	*breq = ichunk_to_bulkstat(ichunk);
	struct scan_inodes	*si = ichunk->si;
	struct xfs_bulkstat	*bs;
	struct xfs_inumbers	*inumbers = &ireq->inumbers[0];
	uint64_t		last_ino = 0;
	int			i;
	int			error;
	int			stale_count = 0;
	DEFINE_DESCR(dsc_bulkstat, ctx, render_ino_from_bulkstat);
	DEFINE_DESCR(dsc_inumbers, ctx, render_inumbers_from_agno);

	descr_set(&dsc_inumbers, &agno);

	memcpy(&handle.ha_fsid, ctx->fshandle, sizeof(handle.ha_fsid));
	handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
			sizeof(handle.ha_fid.fid_len);
	handle.ha_fid.fid_pad = 0;

retry:
	bulkstat_for_inumbers(ctx, &dsc_inumbers, inumbers, breq);

	/* Iterate all the inodes. */
	bs = &breq->bulkstat[0];
	for (i = 0; !si->aborted && i < inumbers->xi_alloccount; i++, bs++) {
		uint64_t	scan_ino = bs->bs_ino;

		/* ensure forward progress if we retried */
		if (scan_ino < last_ino)
			continue;

		descr_set(&dsc_bulkstat, bs);
		handle.ha_fid.fid_ino = scan_ino;
		handle.ha_fid.fid_gen = bs->bs_gen;
		error = si->fn(ctx, &handle, bs, si->arg);
		switch (error) {
		case 0:
			break;
		case ESTALE: {
			stale_count++;
			if (stale_count < 30) {
				ireq->hdr.ino = inumbers->xi_startino;
				error = -xfrog_inumbers(&ctx->mnt, ireq);
				if (error)
					goto err;
				goto retry;
			}
			str_info(ctx, descr_render(&dsc_bulkstat),
_("Changed too many times during scan; giving up."));
			si->aborted = true;
			goto out;
		}
		case ECANCELED:
			error = 0;
			fallthrough;
		default:
			goto err;
		}
		if (scrub_excessive_errors(ctx)) {
			si->aborted = true;
			goto out;
		}
		last_ino = scan_ino;
	}

err:
	if (error) {
		str_liberror(ctx, error, descr_render(&dsc_bulkstat));
		si->aborted = true;
	}
out:
	free(ichunk);
}

/*
 * Call INUMBERS for information about inode chunks, then queue the inumbers
 * responses in the bulkstat workqueue. This helps us maximize CPU parallelism
 * if the filesystem AGs are not evenly loaded.
 */
static void
scan_ag_inumbers(
	struct workqueue	*wq,
	xfs_agnumber_t		agno,
	void			*arg)
{
	struct scan_ichunk	*ichunk = NULL;
	struct scan_inodes	*si = arg;
	struct scrub_ctx	*ctx = (struct scrub_ctx *)wq->wq_ctx;
	struct xfs_inumbers_req	*ireq;
	uint64_t		nextino = cvt_agino_to_ino(&ctx->mnt, agno, 0);
	int			error;
	DEFINE_DESCR(dsc, ctx, render_inumbers_from_agno);

	descr_set(&dsc, &agno);

	error = alloc_ichunk(si, agno, 0, &ichunk);
	if (error)
		goto err;
	ireq = ichunk_to_inumbers(ichunk);

	/* Find the inode chunk & alloc mask */
	error = -xfrog_inumbers(&ctx->mnt, ireq);
	while (!error && !si->aborted && ireq->hdr.ocount > 0) {
		/*
		 * Make sure that we always make forward progress while we
		 * scan the inode btree.
		 */
		if (nextino > ireq->inumbers[0].xi_startino) {
			str_corrupt(ctx, descr_render(&dsc),
	_("AG %u inode btree is corrupt near agino %lu, got %lu"), agno,
					cvt_ino_to_agino(&ctx->mnt, nextino),
					cvt_ino_to_agino(&ctx->mnt,
						ireq->inumbers[0].xi_startino));
			si->aborted = true;
			break;
		}
		nextino = ireq->hdr.ino;

		if (ireq->inumbers[0].xi_alloccount == 0) {
			/*
			 * We can have totally empty inode chunks on
			 * filesystems where there are more than 64 inodes per
			 * block. Skip these.
			 */
			;
		} else if (si->nr_threads > 0) {
			/* Queue this inode chunk on the bulkstat workqueue. */
			error = -workqueue_add(&si->wq_bulkstat,
					scan_ag_bulkstat, agno, ichunk);
			if (error) {
				si->aborted = true;
				str_liberror(ctx, error,
						_("queueing bulkstat work"));
				goto out;
			}
			ichunk = NULL;
		} else {
			/*
			 * Only one thread, call bulkstat directly. Remember,
			 * ichunk is freed by the worker before returning.
			 */
			scan_ag_bulkstat(wq, agno, ichunk);
			ichunk = NULL;
			if (si->aborted)
				break;
		}

		if (!ichunk) {
			error = alloc_ichunk(si, agno, nextino, &ichunk);
			if (error)
				goto err;
		}
		ireq = ichunk_to_inumbers(ichunk);

		error = -xfrog_inumbers(&ctx->mnt, ireq);
	}

err:
	if (error) {
		str_liberror(ctx, error, descr_render(&dsc));
		si->aborted = true;
	}
out:
	if (ichunk)
		free(ichunk);
}

/*
 * Scan all the inodes in a filesystem. On error, this function will log
 * an error message and return -1.
 */
int
scrub_scan_all_inodes(
	struct scrub_ctx	*ctx,
	scrub_inode_iter_fn	fn,
	void			*arg)
{
	struct scan_inodes	si = {
		.fn		= fn,
		.arg		= arg,
		.nr_threads	= scrub_nproc_workqueue(ctx),
	};
	xfs_agnumber_t		agno;
	struct workqueue	wq_inumbers;
	unsigned int		max_bulkstat;
	int			ret;

	/*
	 * The bulkstat workqueue should queue at most one inobt block's worth
	 * of inode chunk records per worker thread. Since an inobt record is
	 * 16 bytes, that works out to blocksize / 16 records per block. If
	 * we're running in single thread mode (nr_threads==0) then we skip
	 * the workqueues.
	 */
	max_bulkstat = si.nr_threads * (ctx->mnt.fsgeom.blocksize / 16);

	ret = -workqueue_create_bound(&si.wq_bulkstat, (struct xfs_mount *)ctx,
			si.nr_threads, max_bulkstat);
	if (ret) {
		str_liberror(ctx, ret, _("creating bulkstat workqueue"));
		return -1;
	}

	ret = -workqueue_create(&wq_inumbers, (struct xfs_mount *)ctx,
			si.nr_threads);
	if (ret) {
		str_liberror(ctx, ret, _("creating inumbers workqueue"));
		si.aborted = true;
		goto kill_bulkstat;
	}

	for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) {
		ret = -workqueue_add(&wq_inumbers, scan_ag_inumbers, agno, &si);
		if (ret) {
			si.aborted = true;
			str_liberror(ctx, ret, _("queueing inumbers work"));
			break;
		}
	}

	ret = -workqueue_terminate(&wq_inumbers);
	if (ret) {
		si.aborted = true;
		str_liberror(ctx, ret, _("finishing inumbers work"));
	}
	workqueue_destroy(&wq_inumbers);

kill_bulkstat:
	ret = -workqueue_terminate(&si.wq_bulkstat);
	if (ret) {
		si.aborted = true;
		str_liberror(ctx, ret, _("finishing bulkstat work"));
	}
	workqueue_destroy(&si.wq_bulkstat);

	return si.aborted ? -1 : 0;
}

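/*
 * Illustrative sketch (not part of the scrub code) of how a caller drives
 * scrub_scan_all_inodes(): count the regular files in the filesystem. The
 * callback matches the scrub_inode_iter_fn signature invoked via si->fn()
 * above; "count_regfiles" and "count_all_regfiles" are made-up names, and
 * S_ISREG is assumed to be available through the existing includes.
 */
static int
count_regfiles(
	struct scrub_ctx	*ctx,
	struct xfs_handle	*handle,
	struct xfs_bulkstat	*bstat,
	void			*arg)
{
	uint64_t		*nr_regfiles = arg;

	if (S_ISREG(bstat->bs_mode))
		(*nr_regfiles)++;
	return 0;
}

static int
count_all_regfiles(
	struct scrub_ctx	*ctx,
	uint64_t		*nr_regfiles)
{
	*nr_regfiles = 0;

	/* Returns -1 if the scan failed; errors were already logged. */
	return scrub_scan_all_inodes(ctx, count_regfiles, nr_regfiles);
}
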
/* Open a file by handle, returning either the fd or -1 on error. */
int
scrub_open_handle(
	struct xfs_handle	*handle)
{
	return open_by_fshandle(handle, sizeof(*handle),
			O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NOCTTY);
}
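
/*
 * Illustrative sketch (not part of the scrub code): a scrub_inode_iter_fn
 * callback that opens each inode via scrub_open_handle(). Returning ESTALE
 * when the open fails with ESTALE asks scan_ag_bulkstat() above to refresh
 * the bulkstat data and retry, per the comment at the top of this file.
 * "open_and_examine" is a made-up name; errno and close() are assumed to be
 * available through the existing includes.
 */
static int
open_and_examine(
	struct scrub_ctx	*ctx,
	struct xfs_handle	*handle,
	struct xfs_bulkstat	*bstat,
	void			*arg)
{
	int			fd;

	fd = scrub_open_handle(handle);
	if (fd < 0)
		return errno == ESTALE ? ESTALE : 0;

	/* ... examine the open file here ... */

	close(fd);
	return 0;
}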