1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * Copyright (C) 2018-2024 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
9 #include <sys/statvfs.h>
10 #include <linux/fsmap.h>
12 #include "libfrog/paths.h"
13 #include "libfrog/workqueue.h"
14 #include "xfs_scrub.h"
16 #include "libfrog/bitmap.h"
19 #include "fscounters.h"
21 #include "read_verify.h"
25 #include "libfrog/bulkstat.h"
28 * Phase 6: Verify data file integrity.
30 * Identify potential data block extents with GETFSMAP, then feed those
31 * extents to the read-verify pool to get the verify commands batched,
32 * issued, and (if there are problems) reported back to us. If there
33 * are errors, we'll record the bad regions and (if available) use rmap
34 * to tell us if metadata are now corrupt. Otherwise, we'll scan the
35 * whole directory tree looking for files that overlap the bad regions
36 * and report the paths of the now corrupt files.
39 /* Verify disk blocks with GETFSMAP */
41 struct media_verify_state
{
42 struct read_verify_pool
*rvp_data
;
43 struct read_verify_pool
*rvp_log
;
44 struct read_verify_pool
*rvp_realtime
;
45 struct bitmap
*d_bad
; /* bytes */
46 struct bitmap
*r_bad
; /* bytes */
52 /* Find the read-verify pool for a given device identifier. */
53 static struct read_verify_pool
*
55 struct scrub_ctx
*ctx
,
56 struct media_verify_state
*vs
,
59 if (ctx
->mnt
.fsgeom
.rtstart
) {
60 if (dev
== XFS_DEV_DATA
)
62 if (dev
== XFS_DEV_LOG
)
64 if (dev
== XFS_DEV_RT
)
65 return vs
->rvp_realtime
;
67 if (dev
== ctx
->fsinfo
.fs_datadev
)
69 if (dev
== ctx
->fsinfo
.fs_logdev
)
71 if (dev
== ctx
->fsinfo
.fs_rtdev
)
72 return vs
->rvp_realtime
;
77 /* Find the device major/minor for a given disk. */
80 struct scrub_ctx
*ctx
,
83 if (ctx
->mnt
.fsgeom
.rtstart
) {
84 if (disk
== ctx
->datadev
)
86 if (disk
== ctx
->logdev
)
88 if (disk
== ctx
->rtdev
)
91 if (disk
== ctx
->datadev
)
92 return ctx
->fsinfo
.fs_datadev
;
93 if (disk
== ctx
->logdev
)
94 return ctx
->fsinfo
.fs_logdev
;
95 if (disk
== ctx
->rtdev
)
96 return ctx
->fsinfo
.fs_rtdev
;
101 /* Find the incore bad blocks bitmap for a given disk. */
102 static struct bitmap
*
104 struct scrub_ctx
*ctx
,
106 struct media_verify_state
*vs
)
108 if (disk
== ctx
->datadev
)
110 if (disk
== ctx
->rtdev
)
115 struct disk_ioerr_report
{
116 struct scrub_ctx
*ctx
;
120 struct owner_decode
{
125 static const struct owner_decode special_owners
[] = {
126 {XFS_FMR_OWN_FREE
, "free space"},
127 {XFS_FMR_OWN_UNKNOWN
, "unknown owner"},
128 {XFS_FMR_OWN_FS
, "static FS metadata"},
129 {XFS_FMR_OWN_LOG
, "journalling log"},
130 {XFS_FMR_OWN_AG
, "per-AG metadata"},
131 {XFS_FMR_OWN_INOBT
, "inode btree blocks"},
132 {XFS_FMR_OWN_INODES
, "inodes"},
133 {XFS_FMR_OWN_REFC
, "refcount btree"},
134 {XFS_FMR_OWN_COW
, "CoW staging"},
135 {XFS_FMR_OWN_DEFECTIVE
, "bad blocks"},
139 /* Decode a special owner. */
141 decode_special_owner(
144 const struct owner_decode
*od
= special_owners
;
147 if (od
->owner
== owner
)
155 /* Routines to translate bad physical extents into file paths and offsets. */
157 struct badfile_report
{
158 struct scrub_ctx
*ctx
;
160 struct media_verify_state
*vs
;
161 struct file_bmap
*bmap
;
164 /* Report on bad extents found during a media scan. */
171 struct badfile_report
*br
= arg
;
172 unsigned long long bad_offset
;
173 unsigned long long bad_length
;
175 /* Clamp the bad region to the file mapping. */
176 if (start
< br
->bmap
->bm_physical
) {
177 length
-= br
->bmap
->bm_physical
- start
;
178 start
= br
->bmap
->bm_physical
;
180 length
= min(length
, br
->bmap
->bm_length
);
182 /* Figure out how far into the bmap the bad mapping is and report it. */
183 bad_offset
= start
- br
->bmap
->bm_physical
;
184 bad_length
= min(start
+ length
,
185 br
->bmap
->bm_physical
+ br
->bmap
->bm_length
) - start
;
187 str_unfixable_error(br
->ctx
, br
->descr
,
188 _("media error at data offset %llu length %llu."),
189 br
->bmap
->bm_offset
+ bad_offset
, bad_length
);
193 /* Report if this extent overlaps a bad region. */
196 struct scrub_ctx
*ctx
,
200 struct file_bmap
*bmap
,
203 struct badfile_report
*br
= arg
;
204 struct media_verify_state
*vs
= br
->vs
;
209 /* Only report errors for real extents. */
210 if (bmap
->bm_flags
& (BMV_OF_PREALLOC
| BMV_OF_DELALLOC
))
213 if (fsx
->fsx_xflags
& FS_XFLAG_REALTIME
)
218 return -bitmap_iterate_range(bmp
, bmap
->bm_physical
, bmap
->bm_length
,
222 /* Report if the extended attribute data overlaps a bad region. */
225 struct scrub_ctx
*ctx
,
229 struct file_bmap
*bmap
,
232 struct badfile_report
*br
= arg
;
233 struct media_verify_state
*vs
= br
->vs
;
234 struct bitmap
*bmp
= vs
->d_bad
;
236 /* Complain about attr fork extents that don't look right. */
237 if (bmap
->bm_flags
& (BMV_OF_PREALLOC
| BMV_OF_DELALLOC
)) {
238 str_info(ctx
, br
->descr
,
239 _("found unexpected unwritten/delalloc attr fork extent."));
243 if (fsx
->fsx_xflags
& FS_XFLAG_REALTIME
) {
244 str_info(ctx
, br
->descr
,
245 _("found unexpected realtime attr fork extent."));
249 if (bitmap_test(bmp
, bmap
->bm_physical
, bmap
->bm_length
))
250 str_corrupt(ctx
, br
->descr
,
251 _("media error in extended attribute data."));
256 /* Iterate the extent mappings of a file to report errors. */
259 struct scrub_ctx
*ctx
,
264 struct badfile_report br
= {
269 struct file_bmap key
= {0};
273 ret
= scrub_iterate_filemaps(ctx
, fd
, XFS_DATA_FORK
, &key
,
274 report_data_loss
, &br
);
276 str_liberror(ctx
, ret
, descr
);
281 ret
= scrub_iterate_filemaps(ctx
, fd
, XFS_ATTR_FORK
, &key
,
282 report_attr_loss
, &br
);
284 str_liberror(ctx
, ret
, descr
);
291 /* Report read verify errors in unlinked (but still open) files. */
294 struct scrub_ctx
*ctx
,
295 struct xfs_handle
*handle
,
296 struct xfs_bulkstat
*bstat
,
299 char descr
[DESCR_BUFSZ
];
303 /* Ignore linked files and things we can't open. */
304 if (bstat
->bs_nlink
!= 0)
306 if (!S_ISREG(bstat
->bs_mode
) && !S_ISDIR(bstat
->bs_mode
))
309 scrub_render_ino_descr(ctx
, descr
, DESCR_BUFSZ
,
310 bstat
->bs_ino
, bstat
->bs_gen
, _("(unlinked)"));
312 /* Try to open the inode. */
313 fd
= scrub_open_handle(handle
);
315 /* Handle is stale, try again. */
319 str_error(ctx
, descr
,
320 _("Could not open to report read errors: %s."),
325 /* Go find the badness. */
326 error
= report_fd_loss(ctx
, descr
, fd
, arg
);
330 str_errno(ctx
, descr
);
335 /* Scan a directory for matches in the read verify error list. */
338 struct scrub_ctx
*ctx
,
343 return report_fd_loss(ctx
, path
, dir_fd
, arg
);
347 * Scan the inode associated with a directory entry for matches with
348 * the read verify error list.
352 struct scrub_ctx
*ctx
,
355 struct dirent
*dirent
,
362 /* Ignore things we can't open. */
363 if (!S_ISREG(sb
->st_mode
) && !S_ISDIR(sb
->st_mode
))
366 /* Ignore . and .. */
367 if (!strcmp(".", dirent
->d_name
) || !strcmp("..", dirent
->d_name
))
371 * If we were given a dirent, open the associated file under
372 * dir_fd for badblocks scanning. If dirent is NULL, then it's
373 * the directory itself we want to scan.
375 fd
= openat(dir_fd
, dirent
->d_name
,
376 O_RDONLY
| O_NOATIME
| O_NOFOLLOW
| O_NOCTTY
);
378 char descr
[PATH_MAX
+ 1];
383 snprintf(descr
, PATH_MAX
, "%s/%s", path
, dirent
->d_name
);
386 str_error(ctx
, descr
,
387 _("Could not open to report read errors: %s."),
392 /* Go find the badness. */
393 error
= report_fd_loss(ctx
, path
, fd
, arg
);
397 str_errno(ctx
, path
);
404 struct ioerr_filerange
{
410 * If reverse mapping and parent pointers are enabled, we can map media errors
411 * directly back to a filename and a file position without needing to walk the
416 const struct scrub_ctx
*ctx
)
418 return (ctx
->mnt
.fsgeom
.flags
& XFS_FSOP_GEOM_FLAGS_PARENT
) &&
419 (ctx
->mnt
.fsgeom
.flags
& XFS_FSOP_GEOM_FLAGS_RMAPBT
);
422 /* Use a fsmap to report metadata lost to a media error. */
425 struct scrub_ctx
*ctx
,
430 struct xfs_bulkstat bs
= { };
431 char buf
[DESCR_BUFSZ
];
432 struct ioerr_filerange
*fr
= arg
;
436 /* Don't care about unwritten extents. */
437 if (map
->fmr_flags
& FMR_OF_PREALLOC
)
440 if (fr
->physical
> map
->fmr_physical
)
441 err_off
= fr
->physical
- map
->fmr_physical
;
445 /* Report special owners */
446 if (map
->fmr_flags
& FMR_OF_SPECIAL_OWNER
) {
447 snprintf(buf
, DESCR_BUFSZ
, _("disk offset %"PRIu64
),
448 (uint64_t)map
->fmr_physical
+ err_off
);
449 type
= decode_special_owner(map
->fmr_owner
);
451 * On filesystems that don't store reverse mappings, the
452 * GETFSMAP call returns OWNER_UNKNOWN for allocated space.
453 * We'll have to let the directory tree walker find the file
456 if (!(ctx
->mnt
.fsgeom
.flags
& XFS_FSOP_GEOM_FLAGS_RMAPBT
) &&
457 map
->fmr_owner
== XFS_FMR_OWN_UNKNOWN
) {
458 str_info(ctx
, buf
, _("media error detected."));
460 str_corrupt(ctx
, buf
, _("media error in %s."), type
);
464 if (can_use_pptrs(ctx
)) {
465 ret
= -xfrog_bulkstat_single(&ctx
->mnt
, map
->fmr_owner
, 0, &bs
);
467 str_liberror(ctx
, ret
,
468 _("bulkstat for media error report"));
471 /* Report extent maps */
472 if (map
->fmr_flags
& FMR_OF_EXTENT_MAP
) {
473 bool attr
= (map
->fmr_flags
& FMR_OF_ATTR_FORK
);
475 scrub_render_ino_descr(ctx
, buf
, DESCR_BUFSZ
,
476 map
->fmr_owner
, bs
.bs_gen
, " %s",
477 attr
? _("extended attribute") :
479 str_corrupt(ctx
, buf
, _("media error in extent map"));
483 * If directory parent pointers are available, use that to find the
484 * pathname to a file, and report that path as having lost its
485 * extended attributes, or the precise offset of the lost file data.
487 if (!can_use_pptrs(ctx
))
490 scrub_render_ino_descr(ctx
, buf
, DESCR_BUFSZ
, map
->fmr_owner
,
493 if (map
->fmr_flags
& FMR_OF_ATTR_FORK
) {
494 str_corrupt(ctx
, buf
, _("media error in extended attributes"));
498 str_unfixable_error(ctx
, buf
,
499 _("media error at data offset %llu length %llu."),
500 err_off
, fr
->length
);
505 * For a range of bad blocks, visit each space mapping that overlaps the bad
506 * range so that we can report lost metadata.
514 struct fsmap keys
[2] = { };
515 struct ioerr_filerange fr
= {
519 struct disk_ioerr_report
*dioerr
= arg
;
521 /* Go figure out which blocks are bad from the fsmap. */
522 keys
[0].fmr_device
= disk_to_dev(dioerr
->ctx
, dioerr
->disk
);
523 keys
[0].fmr_physical
= start
;
524 keys
[1].fmr_device
= keys
[0].fmr_device
;
525 keys
[1].fmr_physical
= start
+ length
- 1;
526 keys
[1].fmr_owner
= ULLONG_MAX
;
527 keys
[1].fmr_offset
= ULLONG_MAX
;
528 keys
[1].fmr_flags
= UINT_MAX
;
529 return -scrub_iterate_fsmap(dioerr
->ctx
, keys
, report_ioerr_fsmap
,
533 /* Report all the media errors found on a disk. */
536 struct scrub_ctx
*ctx
,
538 struct media_verify_state
*vs
)
540 struct disk_ioerr_report dioerr
= {
548 tree
= bitmap_for_disk(ctx
, disk
, vs
);
551 return -bitmap_iterate(tree
, report_ioerr
, &dioerr
);
554 /* Given bad extent lists for the data & rtdev, find bad files. */
556 report_all_media_errors(
557 struct scrub_ctx
*ctx
,
558 struct media_verify_state
*vs
)
563 str_corrupt(ctx
, ctx
->mntpoint
, _("data device truncated"));
565 str_corrupt(ctx
, ctx
->mntpoint
, _("log device truncated"));
567 str_corrupt(ctx
, ctx
->mntpoint
, _("rt device truncated"));
569 ret
= report_disk_ioerrs(ctx
, ctx
->datadev
, vs
);
571 str_liberror(ctx
, ret
, _("walking datadev io errors"));
575 ret
= report_disk_ioerrs(ctx
, ctx
->rtdev
, vs
);
577 str_liberror(ctx
, ret
, _("walking rtdev io errors"));
582 * Scan the directory tree to get file paths if we didn't already use
583 * directory parent pointers to report the loss. If parent pointers
584 * are enabled, report_ioerr_fsmap will have already reported file
585 * paths that have lost file data and xattrs.
587 if (can_use_pptrs(ctx
))
590 ret
= scan_fs_tree(ctx
, report_dir_loss
, report_dirent_loss
, vs
);
594 /* Scan for unlinked files. */
595 return scrub_scan_user_files(ctx
, report_inode_loss
, vs
);
598 /* Schedule a read-verify of a (data block) extent. */
601 struct scrub_ctx
*ctx
,
605 struct media_verify_state
*vs
= arg
;
606 struct read_verify_pool
*rvp
;
609 rvp
= dev_to_pool(ctx
, vs
, map
->fmr_device
);
611 dbg_printf("rmap dev %d:%d phys %"PRIu64
" owner %"PRId64
612 " offset %"PRIu64
" len %"PRIu64
" flags 0x%x\n",
613 major(map
->fmr_device
), minor(map
->fmr_device
),
614 (uint64_t)map
->fmr_physical
, (int64_t)map
->fmr_owner
,
615 (uint64_t)map
->fmr_offset
, (uint64_t)map
->fmr_length
,
618 /* "Unknown" extents should be verified; they could be data. */
619 if ((map
->fmr_flags
& FMR_OF_SPECIAL_OWNER
) &&
620 map
->fmr_owner
== XFS_FMR_OWN_UNKNOWN
)
621 map
->fmr_flags
&= ~FMR_OF_SPECIAL_OWNER
;
624 * We only care about read-verifying data extents that have been
625 * written to disk. This means we can skip "special" owners
626 * (metadata), xattr blocks, unwritten extents, and extent maps.
627 * These should all get checked elsewhere in the scrubber.
629 if (map
->fmr_flags
& (FMR_OF_PREALLOC
| FMR_OF_ATTR_FORK
|
630 FMR_OF_EXTENT_MAP
| FMR_OF_SPECIAL_OWNER
))
633 /* XXX: Filter out directory data blocks. */
635 /* Schedule the read verify command for (eventual) running. */
636 ret
= read_verify_schedule_io(rvp
, map
->fmr_physical
, map
->fmr_length
,
639 str_liberror(ctx
, ret
, _("scheduling media verify command"));
646 /* Wait for read/verify actions to finish, then return # bytes checked. */
649 struct read_verify_pool
*rvp
,
650 unsigned long long *bytes_checked
)
652 uint64_t pool_checked
;
658 ret
= read_verify_force_io(rvp
);
662 ret
= read_verify_pool_flush(rvp
);
666 ret
= read_verify_bytes(rvp
, &pool_checked
);
670 *bytes_checked
+= pool_checked
;
672 read_verify_pool_destroy(rvp
);
676 /* Remember a media error for later. */
679 struct scrub_ctx
*ctx
,
686 struct media_verify_state
*vs
= arg
;
691 if (disk
== ctx
->datadev
)
693 else if (disk
== ctx
->logdev
)
695 else if (disk
== ctx
->rtdev
)
700 tree
= bitmap_for_disk(ctx
, disk
, vs
);
702 str_liberror(ctx
, ENOENT
, _("finding bad block bitmap"));
706 ret
= -bitmap_set(tree
, start
, length
);
708 str_liberror(ctx
, ret
, _("setting bad block bitmap"));
712 * Read verify all the file data blocks in a filesystem. Since XFS doesn't
713 * do data checksums, we trust that the underlying storage will pass back
714 * an IO error if it can't retrieve whatever we previously stored there.
715 * If we hit an IO error, we'll record the bad blocks in a bitmap and then
716 * scan the extent maps of the entire fs tree (and the unlinked inodes) to
717 * figure out which files are now broken.
721 struct scrub_ctx
*ctx
)
723 struct media_verify_state vs
= { NULL
};
726 ret
= -bitmap_alloc(&vs
.d_bad
);
728 str_liberror(ctx
, ret
, _("creating datadev badblock bitmap"));
732 ret
= -bitmap_alloc(&vs
.r_bad
);
734 str_liberror(ctx
, ret
, _("creating realtime badblock bitmap"));
738 ret
= read_verify_pool_alloc(ctx
, ctx
->datadev
,
739 ctx
->mnt
.fsgeom
.blocksize
, remember_ioerr
,
740 scrub_nproc(ctx
), &vs
.rvp_data
);
742 str_liberror(ctx
, ret
, _("creating datadev media verifier"));
746 ret
= read_verify_pool_alloc(ctx
, ctx
->logdev
,
747 ctx
->mnt
.fsgeom
.blocksize
, remember_ioerr
,
748 scrub_nproc(ctx
), &vs
.rvp_log
);
750 str_liberror(ctx
, ret
,
751 _("creating logdev media verifier"));
756 ret
= read_verify_pool_alloc(ctx
, ctx
->rtdev
,
757 ctx
->mnt
.fsgeom
.blocksize
, remember_ioerr
,
758 scrub_nproc(ctx
), &vs
.rvp_realtime
);
760 str_liberror(ctx
, ret
,
761 _("creating rtdev media verifier"));
765 ret
= scrub_scan_all_spacemaps(ctx
, check_rmap
, &vs
);
769 ret
= clean_pool(vs
.rvp_data
, &ctx
->bytes_checked
);
771 str_liberror(ctx
, ret
, _("flushing datadev verify pool"));
773 ret2
= clean_pool(vs
.rvp_log
, &ctx
->bytes_checked
);
775 str_liberror(ctx
, ret2
, _("flushing logdev verify pool"));
777 ret3
= clean_pool(vs
.rvp_realtime
, &ctx
->bytes_checked
);
779 str_liberror(ctx
, ret3
, _("flushing rtdev verify pool"));
782 * If the verify flush didn't work or we found no bad blocks, we're
783 * done! No errors detected.
785 if (ret
|| ret2
|| ret3
)
787 if (bitmap_empty(vs
.d_bad
) && bitmap_empty(vs
.r_bad
))
790 /* Scan the whole dir tree to see what matches the bad extents. */
791 ret
= report_all_media_errors(ctx
, &vs
);
793 bitmap_free(&vs
.r_bad
);
794 bitmap_free(&vs
.d_bad
);
798 if (vs
.rvp_realtime
) {
799 read_verify_pool_abort(vs
.rvp_realtime
);
800 read_verify_pool_destroy(vs
.rvp_realtime
);
804 read_verify_pool_abort(vs
.rvp_log
);
805 read_verify_pool_destroy(vs
.rvp_log
);
808 read_verify_pool_abort(vs
.rvp_data
);
809 read_verify_pool_destroy(vs
.rvp_data
);
811 bitmap_free(&vs
.r_bad
);
813 bitmap_free(&vs
.d_bad
);
817 /* Estimate how much work we're going to do. */
820 struct scrub_ctx
*ctx
,
822 unsigned int *nr_threads
,
825 unsigned long long d_blocks
;
826 unsigned long long d_bfree
;
827 unsigned long long r_blocks
;
828 unsigned long long r_bfree
;
829 unsigned long long dontcare
;
832 ret
= scrub_scan_estimate_blocks(ctx
, &d_blocks
, &d_bfree
, &r_blocks
,
833 &r_bfree
, &dontcare
);
835 str_liberror(ctx
, ret
, _("estimating verify work"));
839 *items
= cvt_off_fsb_to_b(&ctx
->mnt
,
840 (d_blocks
- d_bfree
) + (r_blocks
- r_bfree
));
843 * Each read-verify pool starts a thread pool, and each worker thread
844 * can contribute to the progress counter. Hence we need to set
845 * nr_threads appropriately to handle that many threads.
847 *nr_threads
= disk_heads(ctx
->datadev
);
849 *nr_threads
+= disk_heads(ctx
->rtdev
);
851 *nr_threads
+= disk_heads(ctx
->logdev
);