]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blame - scrub/phase6.c
xfs_scrub: only call read_verify_force_io once per pool
[thirdparty/xfsprogs-dev.git] / scrub / phase6.c
CommitLineData
959ef981 1// SPDX-License-Identifier: GPL-2.0+
b364a9c0
DW
2/*
3 * Copyright (C) 2018 Oracle. All Rights Reserved.
b364a9c0 4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
b364a9c0 5 */
a440f877 6#include "xfs.h"
b364a9c0 7#include <stdint.h>
b364a9c0
DW
8#include <dirent.h>
9#include <sys/statvfs.h>
b364a9c0 10#include "handle.h"
42b4c8e8 11#include "libfrog/paths.h"
56598728 12#include "libfrog/workqueue.h"
b364a9c0
DW
13#include "xfs_scrub.h"
14#include "common.h"
a58400ed 15#include "libfrog/bitmap.h"
b364a9c0
DW
16#include "disk.h"
17#include "filemap.h"
ed60d210 18#include "fscounters.h"
b364a9c0
DW
19#include "inodes.h"
20#include "read_verify.h"
21#include "spacemap.h"
22#include "vfs.h"
23
/*
 * Phase 6: Verify data file integrity.
 *
 * Identify potential data block extents with GETFSMAP, then feed those
 * extents to the read-verify pool to get the verify commands batched,
 * issued, and (if there are problems) reported back to us.  If there
 * are errors, we'll record the bad regions and (if available) use rmap
 * to tell us if metadata are now corrupt.  Otherwise, we'll scan the
 * whole directory tree looking for files that overlap the bad regions
 * and report the paths of the now corrupt files.
 */
35
f1bb1696
DW
36/* Verify disk blocks with GETFSMAP */
37
/*
 * State shared by the media verification pass: one read-verify pool per
 * device plus the bitmaps that remember which byte ranges failed.
 */
struct media_verify_state {
	/* Read-verify pool for the data device. */
	struct read_verify_pool	*rvp_data;
	/* Read-verify pool for the log device. */
	struct read_verify_pool	*rvp_log;
	/* Read-verify pool for the realtime device. */
	struct read_verify_pool	*rvp_realtime;
	/* Bad ranges found on the data device, in bytes. */
	struct bitmap		*d_bad;
	/* Bad ranges found on the realtime device, in bytes. */
	struct bitmap		*r_bad;
};
45
b364a9c0 46/* Find the fd for a given device identifier. */
f1bb1696
DW
47static struct read_verify_pool *
48xfs_dev_to_pool(
49 struct scrub_ctx *ctx,
557f98d7 50 struct media_verify_state *vs,
f1bb1696 51 dev_t dev)
b364a9c0
DW
52{
53 if (dev == ctx->fsinfo.fs_datadev)
557f98d7 54 return vs->rvp_data;
b364a9c0 55 else if (dev == ctx->fsinfo.fs_logdev)
557f98d7 56 return vs->rvp_log;
b364a9c0 57 else if (dev == ctx->fsinfo.fs_rtdev)
557f98d7 58 return vs->rvp_realtime;
b364a9c0
DW
59 abort();
60}
61
62/* Find the device major/minor for a given file descriptor. */
63static dev_t
64xfs_disk_to_dev(
65 struct scrub_ctx *ctx,
66 struct disk *disk)
67{
68 if (disk == ctx->datadev)
69 return ctx->fsinfo.fs_datadev;
70 else if (disk == ctx->logdev)
71 return ctx->fsinfo.fs_logdev;
72 else if (disk == ctx->rtdev)
73 return ctx->fsinfo.fs_rtdev;
74 abort();
75}
76
/* Maps one special fsmap owner code to a human-readable description. */
struct owner_decode {
	uint64_t	owner;	/* special owner code */
	const char	*descr;	/* description; NULL ends a table */
};
81
82static const struct owner_decode special_owners[] = {
83 {XFS_FMR_OWN_FREE, "free space"},
84 {XFS_FMR_OWN_UNKNOWN, "unknown owner"},
85 {XFS_FMR_OWN_FS, "static FS metadata"},
86 {XFS_FMR_OWN_LOG, "journalling log"},
87 {XFS_FMR_OWN_AG, "per-AG metadata"},
88 {XFS_FMR_OWN_INOBT, "inode btree blocks"},
89 {XFS_FMR_OWN_INODES, "inodes"},
90 {XFS_FMR_OWN_REFC, "refcount btree"},
91 {XFS_FMR_OWN_COW, "CoW staging"},
92 {XFS_FMR_OWN_DEFECTIVE, "bad blocks"},
93 {0, NULL},
94};
95
96/* Decode a special owner. */
97static const char *
98xfs_decode_special_owner(
99 uint64_t owner)
100{
101 const struct owner_decode *od = special_owners;
102
103 while (od->descr) {
104 if (od->owner == owner)
105 return od->descr;
106 od++;
107 }
108
109 return NULL;
110}
111
112/* Routines to translate bad physical extents into file paths and offsets. */
113
b364a9c0
DW
114/* Report if this extent overlaps a bad region. */
115static bool
116xfs_report_verify_inode_bmap(
117 struct scrub_ctx *ctx,
118 const char *descr,
119 int fd,
120 int whichfork,
121 struct fsxattr *fsx,
122 struct xfs_bmap *bmap,
123 void *arg)
124{
ed5f9cc7 125 struct media_verify_state *vs = arg;
b364a9c0
DW
126 struct bitmap *bmp;
127
128 /* Only report errors for real extents. */
129 if (bmap->bm_flags & (BMV_OF_PREALLOC | BMV_OF_DELALLOC))
130 return true;
131
132 if (fsx->fsx_xflags & FS_XFLAG_REALTIME)
ed5f9cc7 133 bmp = vs->r_bad;
b364a9c0 134 else
ed5f9cc7 135 bmp = vs->d_bad;
b364a9c0
DW
136
137 if (!bitmap_test(bmp, bmap->bm_physical, bmap->bm_length))
138 return true;
139
140 str_error(ctx, descr,
141_("offset %llu failed read verification."), bmap->bm_offset);
142 return true;
143}
144
145/* Iterate the extent mappings of a file to report errors. */
146static bool
147xfs_report_verify_fd(
148 struct scrub_ctx *ctx,
149 const char *descr,
150 int fd,
151 void *arg)
152{
153 struct xfs_bmap key = {0};
154 bool moveon;
155
156 /* data fork */
157 moveon = xfs_iterate_filemaps(ctx, descr, fd, XFS_DATA_FORK, &key,
158 xfs_report_verify_inode_bmap, arg);
159 if (!moveon)
160 return false;
161
162 /* attr fork */
163 moveon = xfs_iterate_filemaps(ctx, descr, fd, XFS_ATTR_FORK, &key,
164 xfs_report_verify_inode_bmap, arg);
165 if (!moveon)
166 return false;
167 return true;
168}
169
170/* Report read verify errors in unlinked (but still open) files. */
171static int
172xfs_report_verify_inode(
173 struct scrub_ctx *ctx,
174 struct xfs_handle *handle,
4cca629d 175 struct xfs_bulkstat *bstat,
b364a9c0
DW
176 void *arg)
177{
178 char descr[DESCR_BUFSZ];
b364a9c0
DW
179 bool moveon;
180 int fd;
181 int error;
182
183 snprintf(descr, DESCR_BUFSZ, _("inode %"PRIu64" (unlinked)"),
184 (uint64_t)bstat->bs_ino);
185
186 /* Ignore linked files and things we can't open. */
187 if (bstat->bs_nlink != 0)
188 return 0;
189 if (!S_ISREG(bstat->bs_mode) && !S_ISDIR(bstat->bs_mode))
190 return 0;
191
192 /* Try to open the inode. */
193 fd = xfs_open_handle(handle);
194 if (fd < 0) {
195 error = errno;
196 if (error == ESTALE)
197 return error;
198
bb5dbd06
DW
199 str_info(ctx, descr,
200_("Disappeared during read error reporting."));
b364a9c0
DW
201 return error;
202 }
203
204 /* Go find the badness. */
205 moveon = xfs_report_verify_fd(ctx, descr, fd, arg);
6c05cc5d
DW
206 error = close(fd);
207 if (error)
208 str_errno(ctx, descr);
b364a9c0
DW
209
210 return moveon ? 0 : XFS_ITERATE_INODES_ABORT;
211}
212
/*
 * Scan a directory for matches in the read verify error list.  The
 * directory's own extent mappings are checked through its open fd.
 */
static bool
xfs_report_verify_dir(
	struct scrub_ctx	*ctx,
	const char		*path,
	int			dir_fd,
	void			*arg)
{
	bool			keep_going;

	keep_going = xfs_report_verify_fd(ctx, path, dir_fd, arg);
	return keep_going;
}
223
224/*
225 * Scan the inode associated with a directory entry for matches with
226 * the read verify error list.
227 */
228static bool
229xfs_report_verify_dirent(
230 struct scrub_ctx *ctx,
231 const char *path,
232 int dir_fd,
233 struct dirent *dirent,
234 struct stat *sb,
235 void *arg)
236{
237 bool moveon;
238 int fd;
6c05cc5d 239 int error;
b364a9c0
DW
240
241 /* Ignore things we can't open. */
242 if (!S_ISREG(sb->st_mode) && !S_ISDIR(sb->st_mode))
243 return true;
244
245 /* Ignore . and .. */
246 if (!strcmp(".", dirent->d_name) || !strcmp("..", dirent->d_name))
247 return true;
248
249 /*
250 * If we were given a dirent, open the associated file under
251 * dir_fd for badblocks scanning. If dirent is NULL, then it's
252 * the directory itself we want to scan.
253 */
254 fd = openat(dir_fd, dirent->d_name,
255 O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NOCTTY);
256 if (fd < 0)
257 return true;
258
259 /* Go find the badness. */
260 moveon = xfs_report_verify_fd(ctx, path, fd, arg);
261 if (moveon)
262 goto out;
263
264out:
6c05cc5d
DW
265 error = close(fd);
266 if (error)
267 str_errno(ctx, path);
b364a9c0
DW
268 return moveon;
269}
270
271/* Given bad extent lists for the data & rtdev, find bad files. */
272static bool
273xfs_report_verify_errors(
274 struct scrub_ctx *ctx,
ed5f9cc7 275 struct media_verify_state *vs)
b364a9c0 276{
b364a9c0
DW
277 bool moveon;
278
b364a9c0
DW
279 /* Scan the directory tree to get file paths. */
280 moveon = scan_fs_tree(ctx, xfs_report_verify_dir,
ed5f9cc7 281 xfs_report_verify_dirent, vs);
b364a9c0
DW
282 if (!moveon)
283 return false;
284
285 /* Scan for unlinked files. */
ed5f9cc7 286 return xfs_scan_all_inodes(ctx, xfs_report_verify_inode, vs);
b364a9c0
DW
287}
288
b364a9c0
DW
289/* Report an IO error resulting from read-verify based off getfsmap. */
290static bool
291xfs_check_rmap_error_report(
292 struct scrub_ctx *ctx,
293 const char *descr,
294 struct fsmap *map,
295 void *arg)
296{
297 const char *type;
298 char buf[32];
299 uint64_t err_physical = *(uint64_t *)arg;
300 uint64_t err_off;
301
302 if (err_physical > map->fmr_physical)
303 err_off = err_physical - map->fmr_physical;
304 else
305 err_off = 0;
306
307 snprintf(buf, 32, _("disk offset %"PRIu64),
308 (uint64_t)BTOBB(map->fmr_physical + err_off));
309
310 if (map->fmr_flags & FMR_OF_SPECIAL_OWNER) {
311 type = xfs_decode_special_owner(map->fmr_owner);
312 str_error(ctx, buf,
313_("%s failed read verification."),
314 type);
315 }
316
317 /*
318 * XXX: If we had a getparent() call we could report IO errors
319 * efficiently. Until then, we'll have to scan the dir tree
320 * to find the bad file's pathname.
321 */
322
323 return true;
324}
325
326/*
327 * Remember a read error for later, and see if rmap will tell us about the
328 * owner ahead of time.
329 */
330static void
331xfs_check_rmap_ioerr(
332 struct scrub_ctx *ctx,
333 struct disk *disk,
334 uint64_t start,
335 uint64_t length,
336 int error,
337 void *arg)
338{
339 struct fsmap keys[2];
340 char descr[DESCR_BUFSZ];
557f98d7 341 struct media_verify_state *vs = arg;
b364a9c0
DW
342 struct bitmap *tree;
343 dev_t dev;
233fabee 344 int ret;
b364a9c0
DW
345
346 dev = xfs_disk_to_dev(ctx, disk);
347
348 /*
349 * If we don't have parent pointers, save the bad extent for
350 * later rescanning.
351 */
352 if (dev == ctx->fsinfo.fs_datadev)
557f98d7 353 tree = vs->d_bad;
b364a9c0 354 else if (dev == ctx->fsinfo.fs_rtdev)
557f98d7 355 tree = vs->r_bad;
b364a9c0
DW
356 else
357 tree = NULL;
358 if (tree) {
233fabee
DW
359 ret = bitmap_set(tree, start, length);
360 if (ret)
361 str_liberror(ctx, ret, _("setting bad block bitmap"));
b364a9c0
DW
362 }
363
364 snprintf(descr, DESCR_BUFSZ, _("dev %d:%d ioerr @ %"PRIu64":%"PRIu64" "),
365 major(dev), minor(dev), start, length);
366
367 /* Go figure out which blocks are bad from the fsmap. */
368 memset(keys, 0, sizeof(struct fsmap) * 2);
369 keys->fmr_device = dev;
370 keys->fmr_physical = start;
371 (keys + 1)->fmr_device = dev;
372 (keys + 1)->fmr_physical = start + length - 1;
373 (keys + 1)->fmr_owner = ULLONG_MAX;
374 (keys + 1)->fmr_offset = ULLONG_MAX;
375 (keys + 1)->fmr_flags = UINT_MAX;
376 xfs_iterate_fsmap(ctx, descr, keys, xfs_check_rmap_error_report,
377 &start);
378}
379
380/* Schedule a read-verify of a (data block) extent. */
381static bool
382xfs_check_rmap(
383 struct scrub_ctx *ctx,
384 const char *descr,
385 struct fsmap *map,
386 void *arg)
387{
557f98d7 388 struct media_verify_state *vs = arg;
f1bb1696 389 struct read_verify_pool *rvp;
8cab77d3 390 int ret;
f1bb1696 391
557f98d7 392 rvp = xfs_dev_to_pool(ctx, vs, map->fmr_device);
b364a9c0
DW
393
394 dbg_printf("rmap dev %d:%d phys %"PRIu64" owner %"PRId64
395 " offset %"PRIu64" len %"PRIu64" flags 0x%x\n",
396 major(map->fmr_device), minor(map->fmr_device),
397 (uint64_t)map->fmr_physical, (int64_t)map->fmr_owner,
398 (uint64_t)map->fmr_offset, (uint64_t)map->fmr_length,
399 map->fmr_flags);
400
401 /* "Unknown" extents should be verified; they could be data. */
402 if ((map->fmr_flags & FMR_OF_SPECIAL_OWNER) &&
403 map->fmr_owner == XFS_FMR_OWN_UNKNOWN)
404 map->fmr_flags &= ~FMR_OF_SPECIAL_OWNER;
405
406 /*
407 * We only care about read-verifying data extents that have been
408 * written to disk. This means we can skip "special" owners
409 * (metadata), xattr blocks, unwritten extents, and extent maps.
410 * These should all get checked elsewhere in the scrubber.
411 */
412 if (map->fmr_flags & (FMR_OF_PREALLOC | FMR_OF_ATTR_FORK |
413 FMR_OF_EXTENT_MAP | FMR_OF_SPECIAL_OWNER))
22d658ec 414 return true;
b364a9c0
DW
415
416 /* XXX: Filter out directory data blocks. */
417
418 /* Schedule the read verify command for (eventual) running. */
8cab77d3
DW
419 ret = read_verify_schedule_io(rvp, map->fmr_physical, map->fmr_length,
420 vs);
421 if (ret) {
422 str_liberror(ctx, ret, descr);
423 return false;
424 }
b364a9c0 425
b364a9c0
DW
426 return true;
427}
428
/*
 * Wait for read/verify actions to finish, then add the number of bytes
 * the pool checked to *bytes_checked.
 *
 * A NULL pool is a no-op that returns 0.  Otherwise the pool is always
 * destroyed before returning, on failure as well as on success, so the
 * caller must not touch it afterwards.  Returns 0 or the error code from
 * the first read-verify call that failed; *bytes_checked is only updated
 * on success.
 */
static int
clean_pool(
	struct read_verify_pool	*rvp,
	unsigned long long	*bytes_checked)
{
	uint64_t		pool_checked;
	int			ret;

	if (!rvp)
		return 0;

	ret = read_verify_force_io(rvp);
	if (ret) {
		/*
		 * The caller never frees the pool after we fail, so
		 * returning here without destroying it would leak it.
		 * Abort before destroying, as the error paths in
		 * xfs_scan_blocks do.
		 */
		read_verify_pool_abort(rvp);
		goto out_destroy;
	}

	ret = read_verify_pool_flush(rvp);
	if (ret)
		goto out_destroy;

	ret = read_verify_bytes(rvp, &pool_checked);
	if (ret)
		goto out_destroy;

	*bytes_checked += pool_checked;
out_destroy:
	read_verify_pool_destroy(rvp);
	return ret;
}
458
b364a9c0
DW
459/*
460 * Read verify all the file data blocks in a filesystem. Since XFS doesn't
461 * do data checksums, we trust that the underlying storage will pass back
462 * an IO error if it can't retrieve whatever we previously stored there.
463 * If we hit an IO error, we'll record the bad blocks in a bitmap and then
464 * scan the extent maps of the entire fs tree to figure (and the unlinked
465 * inodes) out which files are now broken.
466 */
467bool
468xfs_scan_blocks(
469 struct scrub_ctx *ctx)
470{
557f98d7 471 struct media_verify_state vs = { NULL };
93ab49dd 472 bool moveon = false;
233fabee 473 int ret;
b364a9c0 474
233fabee
DW
475 ret = bitmap_alloc(&vs.d_bad);
476 if (ret) {
477 str_liberror(ctx, ret, _("creating datadev badblock bitmap"));
41c08606 478 goto out;
b364a9c0
DW
479 }
480
233fabee
DW
481 ret = bitmap_alloc(&vs.r_bad);
482 if (ret) {
483 str_liberror(ctx, ret, _("creating realtime badblock bitmap"));
b364a9c0
DW
484 goto out_dbad;
485 }
486
8cab77d3 487 ret = read_verify_pool_alloc(ctx, ctx->datadev,
3f9efb2e 488 ctx->mnt.fsgeom.blocksize, xfs_check_rmap_ioerr,
8cab77d3
DW
489 scrub_nproc(ctx), &vs.rvp_data);
490 if (ret) {
491 str_liberror(ctx, ret, _("creating datadev media verifier"));
b364a9c0
DW
492 goto out_rbad;
493 }
f1bb1696 494 if (ctx->logdev) {
8cab77d3 495 ret = read_verify_pool_alloc(ctx, ctx->logdev,
3f9efb2e 496 ctx->mnt.fsgeom.blocksize, xfs_check_rmap_ioerr,
8cab77d3
DW
497 scrub_nproc(ctx), &vs.rvp_log);
498 if (ret) {
499 str_liberror(ctx, ret,
500 _("creating logdev media verifier"));
f1bb1696
DW
501 goto out_datapool;
502 }
503 }
504 if (ctx->rtdev) {
8cab77d3 505 ret = read_verify_pool_alloc(ctx, ctx->rtdev,
3f9efb2e 506 ctx->mnt.fsgeom.blocksize, xfs_check_rmap_ioerr,
8cab77d3
DW
507 scrub_nproc(ctx), &vs.rvp_realtime);
508 if (ret) {
509 str_liberror(ctx, ret,
510 _("creating rtdev media verifier"));
f1bb1696
DW
511 goto out_logpool;
512 }
513 }
557f98d7 514 moveon = xfs_scan_all_spacemaps(ctx, xfs_check_rmap, &vs);
b364a9c0 515 if (!moveon)
f1bb1696 516 goto out_rtpool;
8cab77d3
DW
517
518 ret = clean_pool(vs.rvp_data, &ctx->bytes_checked);
519 if (ret) {
520 str_liberror(ctx, ret, _("flushing datadev verify pool"));
521 moveon = false;
522 }
523
524 ret = clean_pool(vs.rvp_log, &ctx->bytes_checked);
525 if (ret) {
526 str_liberror(ctx, ret, _("flushing logdev verify pool"));
527 moveon = false;
528 }
529
530 ret = clean_pool(vs.rvp_realtime, &ctx->bytes_checked);
531 if (ret) {
532 str_liberror(ctx, ret, _("flushing rtdev verify pool"));
533 moveon = false;
534 }
b364a9c0
DW
535
536 /* Scan the whole dir tree to see what matches the bad extents. */
8cab77d3 537 if (moveon && (!bitmap_empty(vs.d_bad) || !bitmap_empty(vs.r_bad)))
ed5f9cc7 538 moveon = xfs_report_verify_errors(ctx, &vs);
b364a9c0 539
557f98d7
DW
540 bitmap_free(&vs.r_bad);
541 bitmap_free(&vs.d_bad);
b364a9c0
DW
542 return moveon;
543
f1bb1696 544out_rtpool:
7668d01d 545 if (vs.rvp_realtime) {
4cd869e5 546 read_verify_pool_abort(vs.rvp_realtime);
557f98d7 547 read_verify_pool_destroy(vs.rvp_realtime);
7668d01d 548 }
f1bb1696 549out_logpool:
7668d01d 550 if (vs.rvp_log) {
4cd869e5 551 read_verify_pool_abort(vs.rvp_log);
557f98d7 552 read_verify_pool_destroy(vs.rvp_log);
7668d01d 553 }
f1bb1696 554out_datapool:
4cd869e5 555 read_verify_pool_abort(vs.rvp_data);
557f98d7 556 read_verify_pool_destroy(vs.rvp_data);
b364a9c0 557out_rbad:
557f98d7 558 bitmap_free(&vs.r_bad);
b364a9c0 559out_dbad:
557f98d7 560 bitmap_free(&vs.d_bad);
41c08606 561out:
b364a9c0
DW
562 return moveon;
563}
ed60d210
DW
564
565/* Estimate how much work we're going to do. */
566bool
567xfs_estimate_verify_work(
568 struct scrub_ctx *ctx,
569 uint64_t *items,
570 unsigned int *nr_threads,
571 int *rshift)
572{
573 unsigned long long d_blocks;
574 unsigned long long d_bfree;
575 unsigned long long r_blocks;
576 unsigned long long r_bfree;
577 unsigned long long f_files;
578 unsigned long long f_free;
579 bool moveon;
580
581 moveon = xfs_scan_estimate_blocks(ctx, &d_blocks, &d_bfree,
582 &r_blocks, &r_bfree, &f_files, &f_free);
583 if (!moveon)
584 return moveon;
585
a749451c
DW
586 *items = cvt_off_fsb_to_b(&ctx->mnt,
587 (d_blocks - d_bfree) + (r_blocks - r_bfree));
ed60d210
DW
588 *nr_threads = disk_heads(ctx->datadev);
589 *rshift = 20;
590 return moveon;
591}