1 // SPDX-License-Identifier: GPL-2.0+
3 * Copyright (C) 2018 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
9 #include <sys/statvfs.h>
11 #include "libfrog/paths.h"
12 #include "libfrog/workqueue.h"
13 #include "xfs_scrub.h"
15 #include "libfrog/bitmap.h"
18 #include "fscounters.h"
20 #include "read_verify.h"
25 * Phase 6: Verify data file integrity.
27 * Identify potential data block extents with GETFSMAP, then feed those
28 * extents to the read-verify pool to get the verify commands batched,
29 * issued, and (if there are problems) reported back to us. If there
30 * are errors, we'll record the bad regions and (if available) use rmap
31 * to tell us if metadata are now corrupt. Otherwise, we'll scan the
32 * whole directory tree looking for files that overlap the bad regions
33 * and report the paths of the now corrupt files.
36 /* Verify disk blocks with GETFSMAP */
38 struct media_verify_state
{
39 struct read_verify_pool
*rvp_data
;
40 struct read_verify_pool
*rvp_log
;
41 struct read_verify_pool
*rvp_realtime
;
42 struct bitmap
*d_bad
; /* bytes */
43 struct bitmap
*r_bad
; /* bytes */
46 /* Find the read-verify pool for a given device identifier. */
47 static struct read_verify_pool
*
49 struct scrub_ctx
*ctx
,
50 struct media_verify_state
*vs
,
53 if (dev
== ctx
->fsinfo
.fs_datadev
)
55 else if (dev
== ctx
->fsinfo
.fs_logdev
)
57 else if (dev
== ctx
->fsinfo
.fs_rtdev
)
58 return vs
->rvp_realtime
;
62 /* Find the device major/minor for a given disk. */
65 struct scrub_ctx
*ctx
,
68 if (disk
== ctx
->datadev
)
69 return ctx
->fsinfo
.fs_datadev
;
70 else if (disk
== ctx
->logdev
)
71 return ctx
->fsinfo
.fs_logdev
;
72 else if (disk
== ctx
->rtdev
)
73 return ctx
->fsinfo
.fs_rtdev
;
77 /* Find the incore bad blocks bitmap for a given disk. */
78 static struct bitmap
*
80 struct scrub_ctx
*ctx
,
82 struct media_verify_state
*vs
)
84 dev_t dev
= disk_to_dev(ctx
, disk
);
86 if (dev
== ctx
->fsinfo
.fs_datadev
)
88 else if (dev
== ctx
->fsinfo
.fs_rtdev
)
93 struct disk_ioerr_report
{
94 struct scrub_ctx
*ctx
;
103 static const struct owner_decode special_owners
[] = {
104 {XFS_FMR_OWN_FREE
, "free space"},
105 {XFS_FMR_OWN_UNKNOWN
, "unknown owner"},
106 {XFS_FMR_OWN_FS
, "static FS metadata"},
107 {XFS_FMR_OWN_LOG
, "journalling log"},
108 {XFS_FMR_OWN_AG
, "per-AG metadata"},
109 {XFS_FMR_OWN_INOBT
, "inode btree blocks"},
110 {XFS_FMR_OWN_INODES
, "inodes"},
111 {XFS_FMR_OWN_REFC
, "refcount btree"},
112 {XFS_FMR_OWN_COW
, "CoW staging"},
113 {XFS_FMR_OWN_DEFECTIVE
, "bad blocks"},
117 /* Decode a special owner. */
119 decode_special_owner(
122 const struct owner_decode
*od
= special_owners
;
125 if (od
->owner
== owner
)
133 /* Routines to translate bad physical extents into file paths and offsets. */
135 struct badfile_report
{
136 struct scrub_ctx
*ctx
;
138 struct media_verify_state
*vs
;
139 struct file_bmap
*bmap
;
142 /* Report on bad extents found during a media scan. */
149 struct badfile_report
*br
= arg
;
150 unsigned long long bad_offset
;
151 unsigned long long bad_length
;
153 /* Clamp the bad region to the file mapping. */
154 if (start
< br
->bmap
->bm_physical
) {
155 length
-= br
->bmap
->bm_physical
- start
;
156 start
= br
->bmap
->bm_physical
;
158 length
= min(length
, br
->bmap
->bm_length
);
160 /* Figure out how far into the bmap the bad mapping starts and report it. */
161 bad_offset
= start
- br
->bmap
->bm_physical
;
162 bad_length
= min(start
+ length
,
163 br
->bmap
->bm_physical
+ br
->bmap
->bm_length
) - start
;
165 str_unfixable_error(br
->ctx
, br
->descr
,
166 _("media error at data offset %llu length %llu."),
167 br
->bmap
->bm_offset
+ bad_offset
, bad_length
);
171 /* Report if this extent overlaps a bad region. */
174 struct scrub_ctx
*ctx
,
178 struct file_bmap
*bmap
,
181 struct badfile_report
*br
= arg
;
182 struct media_verify_state
*vs
= br
->vs
;
187 /* Only report errors for real extents. */
188 if (bmap
->bm_flags
& (BMV_OF_PREALLOC
| BMV_OF_DELALLOC
))
191 if (fsx
->fsx_xflags
& FS_XFLAG_REALTIME
)
196 return bitmap_iterate_range(bmp
, bmap
->bm_physical
, bmap
->bm_length
,
200 /* Report if the extended attribute data overlaps a bad region. */
203 struct scrub_ctx
*ctx
,
207 struct file_bmap
*bmap
,
210 struct badfile_report
*br
= arg
;
211 struct media_verify_state
*vs
= br
->vs
;
212 struct bitmap
*bmp
= vs
->d_bad
;
214 /* Complain about attr fork extents that don't look right. */
215 if (bmap
->bm_flags
& (BMV_OF_PREALLOC
| BMV_OF_DELALLOC
)) {
216 str_info(ctx
, br
->descr
,
217 _("found unexpected unwritten/delalloc attr fork extent."));
221 if (fsx
->fsx_xflags
& FS_XFLAG_REALTIME
) {
222 str_info(ctx
, br
->descr
,
223 _("found unexpected realtime attr fork extent."));
227 if (bitmap_test(bmp
, bmap
->bm_physical
, bmap
->bm_length
))
228 str_corrupt(ctx
, br
->descr
,
229 _("media error in extended attribute data."));
234 /* Iterate the extent mappings of a file to report errors. */
237 struct scrub_ctx
*ctx
,
242 struct badfile_report br
= {
247 struct file_bmap key
= {0};
251 ret
= scrub_iterate_filemaps(ctx
, fd
, XFS_DATA_FORK
, &key
,
252 report_data_loss
, &br
);
254 str_liberror(ctx
, ret
, descr
);
259 ret
= scrub_iterate_filemaps(ctx
, fd
, XFS_ATTR_FORK
, &key
,
260 report_attr_loss
, &br
);
262 str_liberror(ctx
, ret
, descr
);
269 /* Report read verify errors in unlinked (but still open) files. */
272 struct scrub_ctx
*ctx
,
273 struct xfs_handle
*handle
,
274 struct xfs_bulkstat
*bstat
,
277 char descr
[DESCR_BUFSZ
];
281 /* Ignore linked files and things we can't open. */
282 if (bstat
->bs_nlink
!= 0)
284 if (!S_ISREG(bstat
->bs_mode
) && !S_ISDIR(bstat
->bs_mode
))
287 scrub_render_ino_descr(ctx
, descr
, DESCR_BUFSZ
,
288 bstat
->bs_ino
, bstat
->bs_gen
, _("(unlinked)"));
290 /* Try to open the inode. */
291 fd
= scrub_open_handle(handle
);
298 _("Disappeared during read error reporting."));
302 /* Go find the badness. */
303 error
= report_fd_loss(ctx
, descr
, fd
, arg
);
307 str_errno(ctx
, descr
);
312 /* Scan a directory for matches in the read verify error list. */
315 struct scrub_ctx
*ctx
,
320 return report_fd_loss(ctx
, path
, dir_fd
, arg
);
324 * Scan the inode associated with a directory entry for matches with
325 * the read verify error list.
329 struct scrub_ctx
*ctx
,
332 struct dirent
*dirent
,
339 /* Ignore things we can't open. */
340 if (!S_ISREG(sb
->st_mode
) && !S_ISDIR(sb
->st_mode
))
343 /* Ignore . and .. */
344 if (!strcmp(".", dirent
->d_name
) || !strcmp("..", dirent
->d_name
))
348 * If we were given a dirent, open the associated file under
349 * dir_fd for badblocks scanning. If dirent is NULL, then it's
350 * the directory itself we want to scan.
352 fd
= openat(dir_fd
, dirent
->d_name
,
353 O_RDONLY
| O_NOATIME
| O_NOFOLLOW
| O_NOCTTY
);
357 str_errno(ctx
, path
);
361 /* Go find the badness. */
362 error
= report_fd_loss(ctx
, path
, fd
, arg
);
366 str_errno(ctx
, path
);
373 /* Use a fsmap to report metadata lost to a media error. */
376 struct scrub_ctx
*ctx
,
381 char buf
[DESCR_BUFSZ
];
382 uint64_t err_physical
= *(uint64_t *)arg
;
385 /* Don't care about unwritten extents. */
386 if (map
->fmr_flags
& FMR_OF_PREALLOC
)
389 if (err_physical
> map
->fmr_physical
)
390 err_off
= err_physical
- map
->fmr_physical
;
394 /* Report special owners */
395 if (map
->fmr_flags
& FMR_OF_SPECIAL_OWNER
) {
396 snprintf(buf
, DESCR_BUFSZ
, _("disk offset %"PRIu64
),
397 (uint64_t)map
->fmr_physical
+ err_off
);
398 type
= decode_special_owner(map
->fmr_owner
);
399 str_corrupt(ctx
, buf
, _("media error in %s."), type
);
402 /* Report extent maps */
403 if (map
->fmr_flags
& FMR_OF_EXTENT_MAP
) {
404 bool attr
= (map
->fmr_flags
& FMR_OF_ATTR_FORK
);
406 scrub_render_ino_descr(ctx
, buf
, DESCR_BUFSZ
,
407 map
->fmr_owner
, 0, " %s",
408 attr
? _("extended attribute") :
410 str_corrupt(ctx
, buf
, _("media error in extent map"));
414 * XXX: If we had a getparent() call we could report IO errors
415 * efficiently. Until then, we'll have to scan the dir tree
416 * to find the bad file's pathname.
423 * For a range of bad blocks, visit each space mapping that overlaps the bad
424 * range so that we can report lost metadata.
432 struct fsmap keys
[2];
433 struct disk_ioerr_report
*dioerr
= arg
;
436 dev
= disk_to_dev(dioerr
->ctx
, dioerr
->disk
);
438 /* Go figure out which blocks are bad from the fsmap. */
439 memset(keys
, 0, sizeof(struct fsmap
) * 2);
440 keys
->fmr_device
= dev
;
441 keys
->fmr_physical
= start
;
442 (keys
+ 1)->fmr_device
= dev
;
443 (keys
+ 1)->fmr_physical
= start
+ length
- 1;
444 (keys
+ 1)->fmr_owner
= ULLONG_MAX
;
445 (keys
+ 1)->fmr_offset
= ULLONG_MAX
;
446 (keys
+ 1)->fmr_flags
= UINT_MAX
;
447 return scrub_iterate_fsmap(dioerr
->ctx
, keys
, report_ioerr_fsmap
,
451 /* Report all the media errors found on a disk. */
454 struct scrub_ctx
*ctx
,
456 struct media_verify_state
*vs
)
458 struct disk_ioerr_report dioerr
= {
466 tree
= bitmap_for_disk(ctx
, disk
, vs
);
469 return bitmap_iterate(tree
, report_ioerr
, &dioerr
);
472 /* Given bad extent lists for the data & rtdev, find bad files. */
474 report_all_media_errors(
475 struct scrub_ctx
*ctx
,
476 struct media_verify_state
*vs
)
480 ret
= report_disk_ioerrs(ctx
, ctx
->datadev
, vs
);
482 str_liberror(ctx
, ret
, _("walking datadev io errors"));
486 ret
= report_disk_ioerrs(ctx
, ctx
->rtdev
, vs
);
488 str_liberror(ctx
, ret
, _("walking rtdev io errors"));
492 /* Scan the directory tree to get file paths. */
493 ret
= scan_fs_tree(ctx
, report_dir_loss
, report_dirent_loss
, vs
);
497 /* Scan for unlinked files. */
498 return scrub_scan_all_inodes(ctx
, report_inode_loss
, vs
);
501 /* Schedule a read-verify of a (data block) extent. */
504 struct scrub_ctx
*ctx
,
508 struct media_verify_state
*vs
= arg
;
509 struct read_verify_pool
*rvp
;
512 rvp
= dev_to_pool(ctx
, vs
, map
->fmr_device
);
514 dbg_printf("rmap dev %d:%d phys %"PRIu64
" owner %"PRId64
515 " offset %"PRIu64
" len %"PRIu64
" flags 0x%x\n",
516 major(map
->fmr_device
), minor(map
->fmr_device
),
517 (uint64_t)map
->fmr_physical
, (int64_t)map
->fmr_owner
,
518 (uint64_t)map
->fmr_offset
, (uint64_t)map
->fmr_length
,
521 /* "Unknown" extents should be verified; they could be data. */
522 if ((map
->fmr_flags
& FMR_OF_SPECIAL_OWNER
) &&
523 map
->fmr_owner
== XFS_FMR_OWN_UNKNOWN
)
524 map
->fmr_flags
&= ~FMR_OF_SPECIAL_OWNER
;
527 * We only care about read-verifying data extents that have been
528 * written to disk. This means we can skip "special" owners
529 * (metadata), xattr blocks, unwritten extents, and extent maps.
530 * These should all get checked elsewhere in the scrubber.
532 if (map
->fmr_flags
& (FMR_OF_PREALLOC
| FMR_OF_ATTR_FORK
|
533 FMR_OF_EXTENT_MAP
| FMR_OF_SPECIAL_OWNER
))
536 /* XXX: Filter out directory data blocks. */
538 /* Schedule the read verify command for (eventual) running. */
539 ret
= read_verify_schedule_io(rvp
, map
->fmr_physical
, map
->fmr_length
,
542 str_liberror(ctx
, ret
, _("scheduling media verify command"));
549 /* Wait for read/verify actions to finish, then return # bytes checked. */
552 struct read_verify_pool
*rvp
,
553 unsigned long long *bytes_checked
)
555 uint64_t pool_checked
;
561 ret
= read_verify_force_io(rvp
);
565 ret
= read_verify_pool_flush(rvp
);
569 ret
= read_verify_bytes(rvp
, &pool_checked
);
573 *bytes_checked
+= pool_checked
;
575 read_verify_pool_destroy(rvp
);
579 /* Remember a media error for later. */
582 struct scrub_ctx
*ctx
,
589 struct media_verify_state
*vs
= arg
;
593 tree
= bitmap_for_disk(ctx
, disk
, vs
);
595 str_liberror(ctx
, ENOENT
, _("finding bad block bitmap"));
599 ret
= bitmap_set(tree
, start
, length
);
601 str_liberror(ctx
, ret
, _("setting bad block bitmap"));
605 * Read verify all the file data blocks in a filesystem. Since XFS doesn't
606 * do data checksums, we trust that the underlying storage will pass back
607 * an IO error if it can't retrieve whatever we previously stored there.
608 * If we hit an IO error, we'll record the bad blocks in a bitmap and then
609 * scan the extent maps of the entire fs tree (and the unlinked inodes) to
610 * figure out which files are now broken.
614 struct scrub_ctx
*ctx
)
616 struct media_verify_state vs
= { NULL
};
619 ret
= bitmap_alloc(&vs
.d_bad
);
621 str_liberror(ctx
, ret
, _("creating datadev badblock bitmap"));
625 ret
= bitmap_alloc(&vs
.r_bad
);
627 str_liberror(ctx
, ret
, _("creating realtime badblock bitmap"));
631 ret
= read_verify_pool_alloc(ctx
, ctx
->datadev
,
632 ctx
->mnt
.fsgeom
.blocksize
, remember_ioerr
,
633 scrub_nproc(ctx
), &vs
.rvp_data
);
635 str_liberror(ctx
, ret
, _("creating datadev media verifier"));
639 ret
= read_verify_pool_alloc(ctx
, ctx
->logdev
,
640 ctx
->mnt
.fsgeom
.blocksize
, remember_ioerr
,
641 scrub_nproc(ctx
), &vs
.rvp_log
);
643 str_liberror(ctx
, ret
,
644 _("creating logdev media verifier"));
649 ret
= read_verify_pool_alloc(ctx
, ctx
->rtdev
,
650 ctx
->mnt
.fsgeom
.blocksize
, remember_ioerr
,
651 scrub_nproc(ctx
), &vs
.rvp_realtime
);
653 str_liberror(ctx
, ret
,
654 _("creating rtdev media verifier"));
658 ret
= scrub_scan_all_spacemaps(ctx
, check_rmap
, &vs
);
662 ret
= clean_pool(vs
.rvp_data
, &ctx
->bytes_checked
);
664 str_liberror(ctx
, ret
, _("flushing datadev verify pool"));
666 ret2
= clean_pool(vs
.rvp_log
, &ctx
->bytes_checked
);
668 str_liberror(ctx
, ret2
, _("flushing logdev verify pool"));
670 ret3
= clean_pool(vs
.rvp_realtime
, &ctx
->bytes_checked
);
672 str_liberror(ctx
, ret3
, _("flushing rtdev verify pool"));
675 * If the verify flush didn't work or we found no bad blocks, we're
676 * done! No errors detected.
678 if (ret
|| ret2
|| ret3
)
680 if (bitmap_empty(vs
.d_bad
) && bitmap_empty(vs
.r_bad
))
683 /* Scan the whole dir tree to see what matches the bad extents. */
684 ret
= report_all_media_errors(ctx
, &vs
);
686 bitmap_free(&vs
.r_bad
);
687 bitmap_free(&vs
.d_bad
);
691 if (vs
.rvp_realtime
) {
692 read_verify_pool_abort(vs
.rvp_realtime
);
693 read_verify_pool_destroy(vs
.rvp_realtime
);
697 read_verify_pool_abort(vs
.rvp_log
);
698 read_verify_pool_destroy(vs
.rvp_log
);
701 read_verify_pool_abort(vs
.rvp_data
);
702 read_verify_pool_destroy(vs
.rvp_data
);
704 bitmap_free(&vs
.r_bad
);
706 bitmap_free(&vs
.d_bad
);
712 struct scrub_ctx
*ctx
)
714 return phase6_func(ctx
) == 0;
717 /* Estimate how much work we're going to do. */
720 struct scrub_ctx
*ctx
,
722 unsigned int *nr_threads
,
725 unsigned long long d_blocks
;
726 unsigned long long d_bfree
;
727 unsigned long long r_blocks
;
728 unsigned long long r_bfree
;
729 unsigned long long f_files
;
730 unsigned long long f_free
;
733 ret
= scrub_scan_estimate_blocks(ctx
, &d_blocks
, &d_bfree
,
734 &r_blocks
, &r_bfree
, &f_files
, &f_free
);
736 str_liberror(ctx
, ret
, _("estimating verify work"));
740 *items
= cvt_off_fsb_to_b(&ctx
->mnt
,
741 (d_blocks
- d_bfree
) + (r_blocks
- r_bfree
));
742 *nr_threads
= disk_heads(ctx
->datadev
);
748 xfs_estimate_verify_work(
749 struct scrub_ctx
*ctx
,
751 unsigned int *nr_threads
,
754 return phase6_estimate(ctx
, items
, nr_threads
, rshift
) == 0;