1 // SPDX-License-Identifier: GPL-2.0+
3 * Copyright (C) 2018 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
9 #include <sys/statvfs.h>
11 #include "libfrog/paths.h"
12 #include "libfrog/workqueue.h"
13 #include "xfs_scrub.h"
15 #include "libfrog/bitmap.h"
18 #include "fscounters.h"
20 #include "read_verify.h"
25 * Phase 6: Verify data file integrity.
27 * Identify potential data block extents with GETFSMAP, then feed those
28 * extents to the read-verify pool to get the verify commands batched,
29 * issued, and (if there are problems) reported back to us. If there
30 * are errors, we'll record the bad regions and (if available) use rmap
31 * to tell us if metadata are now corrupt. Otherwise, we'll scan the
32 * whole directory tree looking for files that overlap the bad regions
33 * and report the paths of the now corrupt files.
36 /* Verify disk blocks with GETFSMAP */
38 struct media_verify_state
{
39 struct read_verify_pool
*rvp_data
;
40 struct read_verify_pool
*rvp_log
;
41 struct read_verify_pool
*rvp_realtime
;
42 struct bitmap
*d_bad
; /* bytes */
43 struct bitmap
*r_bad
; /* bytes */
46 /* Find the read-verify pool for a given device identifier. */
47 static struct read_verify_pool
*
49 struct scrub_ctx
*ctx
,
50 struct media_verify_state
*vs
,
53 if (dev
== ctx
->fsinfo
.fs_datadev
)
55 else if (dev
== ctx
->fsinfo
.fs_logdev
)
57 else if (dev
== ctx
->fsinfo
.fs_rtdev
)
58 return vs
->rvp_realtime
;
62 /* Find the device major/minor for a given disk. */
65 struct scrub_ctx
*ctx
,
68 if (disk
== ctx
->datadev
)
69 return ctx
->fsinfo
.fs_datadev
;
70 else if (disk
== ctx
->logdev
)
71 return ctx
->fsinfo
.fs_logdev
;
72 else if (disk
== ctx
->rtdev
)
73 return ctx
->fsinfo
.fs_rtdev
;
77 /* Find the incore bad blocks bitmap for a given disk. */
78 static struct bitmap
*
80 struct scrub_ctx
*ctx
,
82 struct media_verify_state
*vs
)
84 dev_t dev
= xfs_disk_to_dev(ctx
, disk
);
86 if (dev
== ctx
->fsinfo
.fs_datadev
)
88 else if (dev
== ctx
->fsinfo
.fs_rtdev
)
93 struct disk_ioerr_report
{
94 struct scrub_ctx
*ctx
;
103 static const struct owner_decode special_owners
[] = {
104 {XFS_FMR_OWN_FREE
, "free space"},
105 {XFS_FMR_OWN_UNKNOWN
, "unknown owner"},
106 {XFS_FMR_OWN_FS
, "static FS metadata"},
107 {XFS_FMR_OWN_LOG
, "journalling log"},
108 {XFS_FMR_OWN_AG
, "per-AG metadata"},
109 {XFS_FMR_OWN_INOBT
, "inode btree blocks"},
110 {XFS_FMR_OWN_INODES
, "inodes"},
111 {XFS_FMR_OWN_REFC
, "refcount btree"},
112 {XFS_FMR_OWN_COW
, "CoW staging"},
113 {XFS_FMR_OWN_DEFECTIVE
, "bad blocks"},
117 /* Decode a special owner. */
119 xfs_decode_special_owner(
122 const struct owner_decode
*od
= special_owners
;
125 if (od
->owner
== owner
)
133 /* Routines to translate bad physical extents into file paths and offsets. */
135 struct badfile_report
{
136 struct scrub_ctx
*ctx
;
138 struct media_verify_state
*vs
;
139 struct file_bmap
*bmap
;
142 /* Report on bad extents found during a media scan. */
149 struct badfile_report
*br
= arg
;
150 unsigned long long bad_offset
;
151 unsigned long long bad_length
;
153 /* Clamp the bad region to the file mapping. */
154 if (start
< br
->bmap
->bm_physical
) {
155 length
-= br
->bmap
->bm_physical
- start
;
156 start
= br
->bmap
->bm_physical
;
158 length
= min(length
, br
->bmap
->bm_length
);
160 /* Figure out how far into the bmap the bad mapping is and report it. */
161 bad_offset
= start
- br
->bmap
->bm_physical
;
162 bad_length
= min(start
+ length
,
163 br
->bmap
->bm_physical
+ br
->bmap
->bm_length
) - start
;
165 str_unfixable_error(br
->ctx
, br
->descr
,
166 _("media error at data offset %llu length %llu."),
167 br
->bmap
->bm_offset
+ bad_offset
, bad_length
);
171 /* Report if this extent overlaps a bad region. */
174 struct scrub_ctx
*ctx
,
178 struct file_bmap
*bmap
,
181 struct badfile_report
*br
= arg
;
182 struct media_verify_state
*vs
= br
->vs
;
187 /* Only report errors for real extents. */
188 if (bmap
->bm_flags
& (BMV_OF_PREALLOC
| BMV_OF_DELALLOC
))
191 if (fsx
->fsx_xflags
& FS_XFLAG_REALTIME
)
196 return bitmap_iterate_range(bmp
, bmap
->bm_physical
, bmap
->bm_length
,
200 /* Report if the extended attribute data overlaps a bad region. */
203 struct scrub_ctx
*ctx
,
207 struct file_bmap
*bmap
,
210 struct badfile_report
*br
= arg
;
211 struct media_verify_state
*vs
= br
->vs
;
212 struct bitmap
*bmp
= vs
->d_bad
;
214 /* Complain about attr fork extents that don't look right. */
215 if (bmap
->bm_flags
& (BMV_OF_PREALLOC
| BMV_OF_DELALLOC
)) {
216 str_info(ctx
, br
->descr
,
217 _("found unexpected unwritten/delalloc attr fork extent."));
221 if (fsx
->fsx_xflags
& FS_XFLAG_REALTIME
) {
222 str_info(ctx
, br
->descr
,
223 _("found unexpected realtime attr fork extent."));
227 if (bitmap_test(bmp
, bmap
->bm_physical
, bmap
->bm_length
))
228 str_corrupt(ctx
, br
->descr
,
229 _("media error in extended attribute data."));
234 /* Iterate the extent mappings of a file to report errors. */
236 xfs_report_verify_fd(
237 struct scrub_ctx
*ctx
,
242 struct badfile_report br
= {
247 struct file_bmap key
= {0};
251 ret
= scrub_iterate_filemaps(ctx
, fd
, XFS_DATA_FORK
, &key
,
252 report_data_loss
, &br
);
254 str_liberror(ctx
, ret
, descr
);
259 ret
= scrub_iterate_filemaps(ctx
, fd
, XFS_ATTR_FORK
, &key
,
260 report_attr_loss
, &br
);
262 str_liberror(ctx
, ret
, descr
);
268 /* Report read verify errors in unlinked (but still open) files. */
270 xfs_report_verify_inode(
271 struct scrub_ctx
*ctx
,
272 struct xfs_handle
*handle
,
273 struct xfs_bulkstat
*bstat
,
276 char descr
[DESCR_BUFSZ
];
281 /* Ignore linked files and things we can't open. */
282 if (bstat
->bs_nlink
!= 0)
284 if (!S_ISREG(bstat
->bs_mode
) && !S_ISDIR(bstat
->bs_mode
))
287 scrub_render_ino_descr(ctx
, descr
, DESCR_BUFSZ
,
288 bstat
->bs_ino
, bstat
->bs_gen
, _("(unlinked)"));
290 /* Try to open the inode. */
291 fd
= scrub_open_handle(handle
);
298 _("Disappeared during read error reporting."));
302 /* Go find the badness. */
303 moveon
= xfs_report_verify_fd(ctx
, descr
, fd
, arg
);
306 str_errno(ctx
, descr
);
308 return moveon
? 0 : XFS_ITERATE_INODES_ABORT
;
311 /* Scan a directory for matches in the read verify error list. */
313 xfs_report_verify_dir(
314 struct scrub_ctx
*ctx
,
321 moveon
= xfs_report_verify_fd(ctx
, path
, dir_fd
, arg
);
322 return moveon
? 0 : -1;
326 * Scan the inode associated with a directory entry for matches with
327 * the read verify error list.
330 xfs_report_verify_dirent(
331 struct scrub_ctx
*ctx
,
334 struct dirent
*dirent
,
342 /* Ignore things we can't open. */
343 if (!S_ISREG(sb
->st_mode
) && !S_ISDIR(sb
->st_mode
))
346 /* Ignore . and .. */
347 if (!strcmp(".", dirent
->d_name
) || !strcmp("..", dirent
->d_name
))
351 * If we were given a dirent, open the associated file under
352 * dir_fd for badblocks scanning. If dirent is NULL, then it's
353 * the directory itself we want to scan.
355 fd
= openat(dir_fd
, dirent
->d_name
,
356 O_RDONLY
| O_NOATIME
| O_NOFOLLOW
| O_NOCTTY
);
360 str_errno(ctx
, path
);
364 /* Go find the badness. */
365 moveon
= xfs_report_verify_fd(ctx
, path
, fd
, arg
);
372 str_errno(ctx
, path
);
373 return moveon
? 0 : -1;
376 /* Use a fsmap to report metadata lost to a media error. */
379 struct scrub_ctx
*ctx
,
385 char buf
[DESCR_BUFSZ
];
386 uint64_t err_physical
= *(uint64_t *)arg
;
389 /* Don't care about unwritten extents. */
390 if (map
->fmr_flags
& FMR_OF_PREALLOC
)
393 if (err_physical
> map
->fmr_physical
)
394 err_off
= err_physical
- map
->fmr_physical
;
398 /* Report special owners */
399 if (map
->fmr_flags
& FMR_OF_SPECIAL_OWNER
) {
400 snprintf(buf
, DESCR_BUFSZ
, _("disk offset %"PRIu64
),
401 (uint64_t)map
->fmr_physical
+ err_off
);
402 type
= xfs_decode_special_owner(map
->fmr_owner
);
403 str_corrupt(ctx
, buf
, _("media error in %s."), type
);
406 /* Report extent maps */
407 if (map
->fmr_flags
& FMR_OF_EXTENT_MAP
) {
408 bool attr
= (map
->fmr_flags
& FMR_OF_ATTR_FORK
);
410 scrub_render_ino_descr(ctx
, buf
, DESCR_BUFSZ
,
411 map
->fmr_owner
, 0, " %s",
412 attr
? _("extended attribute") :
414 str_corrupt(ctx
, buf
, _("media error in extent map"));
418 * XXX: If we had a getparent() call we could report IO errors
419 * efficiently. Until then, we'll have to scan the dir tree
420 * to find the bad file's pathname.
427 * For a range of bad blocks, visit each space mapping that overlaps the bad
428 * range so that we can report lost metadata.
436 struct fsmap keys
[2];
437 char descr
[DESCR_BUFSZ
];
438 struct disk_ioerr_report
*dioerr
= arg
;
441 dev
= xfs_disk_to_dev(dioerr
->ctx
, dioerr
->disk
);
443 snprintf(descr
, DESCR_BUFSZ
,
444 _("dev %d:%d ioerr @ %"PRIu64
":%"PRIu64
" "),
445 major(dev
), minor(dev
), start
, length
);
447 /* Go figure out which blocks are bad from the fsmap. */
448 memset(keys
, 0, sizeof(struct fsmap
) * 2);
449 keys
->fmr_device
= dev
;
450 keys
->fmr_physical
= start
;
451 (keys
+ 1)->fmr_device
= dev
;
452 (keys
+ 1)->fmr_physical
= start
+ length
- 1;
453 (keys
+ 1)->fmr_owner
= ULLONG_MAX
;
454 (keys
+ 1)->fmr_offset
= ULLONG_MAX
;
455 (keys
+ 1)->fmr_flags
= UINT_MAX
;
456 xfs_iterate_fsmap(dioerr
->ctx
, descr
, keys
, report_ioerr_fsmap
,
461 /* Report all the media errors found on a disk. */
464 struct scrub_ctx
*ctx
,
466 struct media_verify_state
*vs
)
468 struct disk_ioerr_report dioerr
= {
476 tree
= bitmap_for_disk(ctx
, disk
, vs
);
479 return bitmap_iterate(tree
, report_ioerr
, &dioerr
);
482 /* Given bad extent lists for the data & rtdev, find bad files. */
484 report_all_media_errors(
485 struct scrub_ctx
*ctx
,
486 struct media_verify_state
*vs
)
490 ret
= report_disk_ioerrs(ctx
, ctx
->datadev
, vs
);
492 str_liberror(ctx
, ret
, _("walking datadev io errors"));
496 ret
= report_disk_ioerrs(ctx
, ctx
->rtdev
, vs
);
498 str_liberror(ctx
, ret
, _("walking rtdev io errors"));
502 /* Scan the directory tree to get file paths. */
503 ret
= scan_fs_tree(ctx
, xfs_report_verify_dir
,
504 xfs_report_verify_dirent
, vs
);
508 /* Scan for unlinked files. */
509 ret
= scrub_scan_all_inodes(ctx
, xfs_report_verify_inode
, vs
);
513 /* Schedule a read-verify of a (data block) extent. */
516 struct scrub_ctx
*ctx
,
521 struct media_verify_state
*vs
= arg
;
522 struct read_verify_pool
*rvp
;
525 rvp
= xfs_dev_to_pool(ctx
, vs
, map
->fmr_device
);
527 dbg_printf("rmap dev %d:%d phys %"PRIu64
" owner %"PRId64
528 " offset %"PRIu64
" len %"PRIu64
" flags 0x%x\n",
529 major(map
->fmr_device
), minor(map
->fmr_device
),
530 (uint64_t)map
->fmr_physical
, (int64_t)map
->fmr_owner
,
531 (uint64_t)map
->fmr_offset
, (uint64_t)map
->fmr_length
,
534 /* "Unknown" extents should be verified; they could be data. */
535 if ((map
->fmr_flags
& FMR_OF_SPECIAL_OWNER
) &&
536 map
->fmr_owner
== XFS_FMR_OWN_UNKNOWN
)
537 map
->fmr_flags
&= ~FMR_OF_SPECIAL_OWNER
;
540 * We only care about read-verifying data extents that have been
541 * written to disk. This means we can skip "special" owners
542 * (metadata), xattr blocks, unwritten extents, and extent maps.
543 * These should all get checked elsewhere in the scrubber.
545 if (map
->fmr_flags
& (FMR_OF_PREALLOC
| FMR_OF_ATTR_FORK
|
546 FMR_OF_EXTENT_MAP
| FMR_OF_SPECIAL_OWNER
))
549 /* XXX: Filter out directory data blocks. */
551 /* Schedule the read verify command for (eventual) running. */
552 ret
= read_verify_schedule_io(rvp
, map
->fmr_physical
, map
->fmr_length
,
555 str_liberror(ctx
, ret
, descr
);
562 /* Wait for read/verify actions to finish, then return # bytes checked. */
565 struct read_verify_pool
*rvp
,
566 unsigned long long *bytes_checked
)
568 uint64_t pool_checked
;
574 ret
= read_verify_force_io(rvp
);
578 ret
= read_verify_pool_flush(rvp
);
582 ret
= read_verify_bytes(rvp
, &pool_checked
);
586 *bytes_checked
+= pool_checked
;
588 read_verify_pool_destroy(rvp
);
592 /* Remember a media error for later. */
595 struct scrub_ctx
*ctx
,
602 struct media_verify_state
*vs
= arg
;
606 tree
= bitmap_for_disk(ctx
, disk
, vs
);
608 str_liberror(ctx
, ENOENT
, _("finding bad block bitmap"));
612 ret
= bitmap_set(tree
, start
, length
);
614 str_liberror(ctx
, ret
, _("setting bad block bitmap"));
618 * Read verify all the file data blocks in a filesystem. Since XFS doesn't
619 * do data checksums, we trust that the underlying storage will pass back
620 * an IO error if it can't retrieve whatever we previously stored there.
621 * If we hit an IO error, we'll record the bad blocks in a bitmap and then
622 * scan the extent maps of the entire fs tree (and the unlinked inodes)
623 * to figure out which files are now broken.
627 struct scrub_ctx
*ctx
)
629 struct media_verify_state vs
= { NULL
};
633 ret
= bitmap_alloc(&vs
.d_bad
);
635 str_liberror(ctx
, ret
, _("creating datadev badblock bitmap"));
639 ret
= bitmap_alloc(&vs
.r_bad
);
641 str_liberror(ctx
, ret
, _("creating realtime badblock bitmap"));
645 ret
= read_verify_pool_alloc(ctx
, ctx
->datadev
,
646 ctx
->mnt
.fsgeom
.blocksize
, remember_ioerr
,
647 scrub_nproc(ctx
), &vs
.rvp_data
);
649 str_liberror(ctx
, ret
, _("creating datadev media verifier"));
653 ret
= read_verify_pool_alloc(ctx
, ctx
->logdev
,
654 ctx
->mnt
.fsgeom
.blocksize
, remember_ioerr
,
655 scrub_nproc(ctx
), &vs
.rvp_log
);
657 str_liberror(ctx
, ret
,
658 _("creating logdev media verifier"));
663 ret
= read_verify_pool_alloc(ctx
, ctx
->rtdev
,
664 ctx
->mnt
.fsgeom
.blocksize
, remember_ioerr
,
665 scrub_nproc(ctx
), &vs
.rvp_realtime
);
667 str_liberror(ctx
, ret
,
668 _("creating rtdev media verifier"));
672 moveon
= xfs_scan_all_spacemaps(ctx
, xfs_check_rmap
, &vs
);
676 ret
= clean_pool(vs
.rvp_data
, &ctx
->bytes_checked
);
678 str_liberror(ctx
, ret
, _("flushing datadev verify pool"));
682 ret
= clean_pool(vs
.rvp_log
, &ctx
->bytes_checked
);
684 str_liberror(ctx
, ret
, _("flushing logdev verify pool"));
688 ret
= clean_pool(vs
.rvp_realtime
, &ctx
->bytes_checked
);
690 str_liberror(ctx
, ret
, _("flushing rtdev verify pool"));
694 /* Scan the whole dir tree to see what matches the bad extents. */
695 if (moveon
&& (!bitmap_empty(vs
.d_bad
) || !bitmap_empty(vs
.r_bad
)))
696 moveon
= report_all_media_errors(ctx
, &vs
);
698 bitmap_free(&vs
.r_bad
);
699 bitmap_free(&vs
.d_bad
);
703 if (vs
.rvp_realtime
) {
704 read_verify_pool_abort(vs
.rvp_realtime
);
705 read_verify_pool_destroy(vs
.rvp_realtime
);
709 read_verify_pool_abort(vs
.rvp_log
);
710 read_verify_pool_destroy(vs
.rvp_log
);
713 read_verify_pool_abort(vs
.rvp_data
);
714 read_verify_pool_destroy(vs
.rvp_data
);
716 bitmap_free(&vs
.r_bad
);
718 bitmap_free(&vs
.d_bad
);
723 /* Estimate how much work we're going to do. */
725 xfs_estimate_verify_work(
726 struct scrub_ctx
*ctx
,
728 unsigned int *nr_threads
,
731 unsigned long long d_blocks
;
732 unsigned long long d_bfree
;
733 unsigned long long r_blocks
;
734 unsigned long long r_bfree
;
735 unsigned long long f_files
;
736 unsigned long long f_free
;
739 ret
= scrub_scan_estimate_blocks(ctx
, &d_blocks
, &d_bfree
,
740 &r_blocks
, &r_bfree
, &f_files
, &f_free
);
742 str_liberror(ctx
, ret
, _("estimating verify work"));
746 *items
= cvt_off_fsb_to_b(&ctx
->mnt
,
747 (d_blocks
- d_bfree
) + (r_blocks
- r_bfree
));
748 *nr_threads
= disk_heads(ctx
->datadev
);