1 // SPDX-License-Identifier: GPL-2.0+
3 * Copyright (C) 2018 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
11 #include <sys/resource.h>
12 #include <sys/statvfs.h>
13 #include "platform_defs.h"
16 #include "xfs_scrub.h"
22 * XFS Online Metadata Scrub (and Repair)
24 * The XFS scrubber uses custom XFS ioctls to probe more deeply into the
25 * internals of the filesystem. It takes advantage of scrubbing ioctls
26 * to check all the records stored in a metadata object and to
27 * cross-reference those records against the other filesystem metadata.
29 * After the program gathers command line arguments to figure out
30 * exactly what the program is going to do, scrub execution is split up
31 * into several separate phases:
33 * The "find geometry" phase queries XFS for the filesystem geometry.
34 * The block devices for the data, realtime, and log devices are opened.
35 * Kernel ioctls are test-queried to see if they actually work (the scrub
36 * ioctl in particular), and any other filesystem-specific information is discovered.
39 * In the "check internal metadata" phase, we call the metadata scrub
40 * ioctl to check the filesystem's internal per-AG btrees. This
41 * includes the AG superblock, AGF, AGFL, and AGI headers, freespace
42 * btrees, the regular and free inode btrees, the reverse mapping
43 * btrees, and the reference counting btrees. If the realtime device is
44 * enabled, the realtime bitmap and reverse mapping btrees are checked.
45 * Quotas, if enabled, are also checked in this phase.
47 * Each AG (and the realtime device) has its metadata checked in a
48 * separate thread for better performance. Errors in the internal
49 * metadata can be fixed here prior to the inode scan; refer to the
50 * section about the "repair filesystem" phase for more information.
52 * The "scan all inodes" phase uses BULKSTAT to scan all the inodes in
53 * an AG in disk order. The BULKSTAT information provides enough
54 * information to construct a file handle that is used to check the
55 * following parts of every file:
58 * - All three block forks (data, attr, CoW)
59 * - If it's a symlink, the symlink target.
60 * - If it's a directory, the directory entries.
61 * - All extended attributes
62 * - The parent pointer
64 * Multiple threads are started to check the inodes of each AG in
65 * parallel. Errors in file metadata can be fixed here; see the section
66 * about the "repair filesystem" phase for more information.
68 * Next comes the (configurable) "repair filesystem" phase. The user
69 * can instruct this program to fix all problems encountered; to fix
70 * only optimality problems and leave the corruptions; or not to touch
71 * the filesystem at all. Any metadata repairs that did not succeed in
72 * the previous two phases are retried here; if there are uncorrectable
73 * errors, xfs_scrub stops here.
75 * To perform the actual repairs (or optimizations), we iterate all the
76 * items on the per-AG action item list and ask the kernel to repair
77 * them. Items which are successfully repaired are removed from the
78 * list. If an item is not acted upon successfully (or the kernel asks us
79 * to try again), we retry the actions until there is nothing left to
80 * fix or we fail to make forward progress. In that event, the
81 * unfinished items are recorded as errors. If there are no errors at
82 * this point, we call FSTRIM on the filesystem.
84 * The next phase is the "check directory tree" phase. In this phase,
85 * every directory is opened (via file handle) to confirm that each
86 * directory is connected to the root. Directory entries are checked
87 * for ambiguous Unicode normalization mappings, which is to say that we
88 * look for pairs of entries whose utf-8 strings normalize to the same
89 * code point sequence and map to different inodes, because that could
90 * be used to trick a user into opening the wrong file. The names of
91 * extended attributes are checked for Unicode normalization collisions.
93 * In the "verify data file integrity" phase, we employ GETFSMAP to read
94 * the reverse-mappings of all AGs and issue direct-reads of the
95 * underlying disk blocks. We rely on the underlying storage to have
96 * checksummed the data blocks appropriately. Multiple threads are
97 * started to check each AG in parallel; a separate thread pool is used
98 * to handle the direct reads.
100 * In the "check summary counters" phase, we use GETFSMAP to tally up the
101 * blocks and BULKSTAT to tally up the inodes we saw and compare that to
102 * the statfs output. This gives the user a rough estimate of how
103 * thorough the scrub was.
107 * Known debug tweaks (pass -d and set the environment variable):
108 * XFS_SCRUB_FORCE_ERROR -- pretend all metadata is corrupt
109 * XFS_SCRUB_FORCE_REPAIR -- repair all metadata even if it's ok
110 * XFS_SCRUB_NO_KERNEL -- pretend there is no kernel ioctl
111 * XFS_SCRUB_NO_SCSI_VERIFY -- disable SCSI VERIFY (if present)
112 * XFS_SCRUB_PHASE -- run only this scrub phase
113 * XFS_SCRUB_THREADS -- start exactly this number of threads
115 * Available even in non-debug mode:
116 * SERVICE_MODE -- compress all error codes to 1 for LSB
117 * service action compliance
120 /* Program name; needed for libfrog error reports. */
121 char *progname
= "xfs_scrub";
123 /* Debug level; higher values mean more verbosity. */
126 /* Display resource usage at the end of each phase? */
127 static bool display_rusage
;
129 /* Background mode; higher values insert more pauses between scrub calls. */
130 unsigned int bg_mode
;
132 /* Maximum number of processors available to us. */
135 /* Number of threads we're allowed to use. */
136 unsigned int nr_threads
;
138 /* Verbosity; higher values print more information. */
141 /* Should we scrub the data blocks? */
142 static bool scrub_data
;
144 /* Size of a memory page. */
147 /* Should we FSTRIM after a successful run? */
148 bool want_fstrim
= true;
150 /* If stdout/stderr are ttys, we can use richer terminal control. */
155 * If we are running as a service, we need to be careful about what
156 * error codes we return to the calling process.
/*
 * Exit status values.  The non-zero ones are distinct bits: the code
 * later in this file accumulates them into one status with "ret |= ...",
 * so several conditions can be reported at once.
 */
160 #define SCRUB_RET_SUCCESS (0) /* no problems left behind */
161 #define SCRUB_RET_CORRUPT (1) /* corruption remains on fs */
162 #define SCRUB_RET_UNOPTIMIZED (2) /* fs could be optimized */
163 #define SCRUB_RET_OPERROR (4) /* operational problems */
164 #define SCRUB_RET_SYNTAX (8) /* cmdline args rejected */
/*
 * Print the command-line synopsis and the option summary to stderr, using
 * progname in the synopsis line, then terminate the process with exit
 * status SCRUB_RET_SYNTAX.  Declared noreturn because exit() never
 * returns control to the caller.
 */
166 static void __attribute__((noreturn
))
169 fprintf(stderr
, _("Usage: %s [OPTIONS] mountpoint\n"), progname
);
170 fprintf(stderr
, "\n");
171 fprintf(stderr
, _("Options:\n"));
172 fprintf(stderr
, _(" -a count Stop after this many errors are found.\n"));
173 fprintf(stderr
, _(" -b Background mode.\n"));
174 fprintf(stderr
, _(" -C fd Print progress information to this fd.\n"));
175 fprintf(stderr
, _(" -e behavior What to do if errors are found.\n"));
176 fprintf(stderr
, _(" -k Do not FITRIM the free space.\n"));
177 fprintf(stderr
, _(" -m path Path to /etc/mtab.\n"));
178 fprintf(stderr
, _(" -n Dry run. Do not modify anything.\n"));
179 fprintf(stderr
, _(" -T Display timing/usage information.\n"));
180 fprintf(stderr
, _(" -v Verbose output.\n"));
181 fprintf(stderr
, _(" -V Print version.\n"));
182 fprintf(stderr
, _(" -x Scrub file data too.\n"));
184 exit(SCRUB_RET_SYNTAX
);
/*
 * NOTE(review): presumably a fallback definition for platforms whose
 * headers lack RUSAGE_BOTH; the surrounding preprocessor guard is not
 * visible in this view -- confirm.
 */
188 # define RUSAGE_BOTH (-2)
191 /* Get resource usage for ourselves and all children. */
/*
 * Fills *usage from getrusage().  Three queries appear below, in order:
 * RUSAGE_BOTH into *usage, RUSAGE_SELF into *usage, and RUSAGE_CHILDREN
 * into the local cusage, whose counters are then added to *usage.
 * NOTE(review): the tests on err between the calls and the return
 * statements are not visible in this view; presumably the SELF+CHILDREN
 * pair is the fallback used when RUSAGE_BOTH is rejected -- confirm.
 */
194 struct rusage
*usage
)
196 struct rusage cusage
;
199 err
= getrusage(RUSAGE_BOTH
, usage
);
203 err
= getrusage(RUSAGE_SELF
, usage
);
207 err
= getrusage(RUSAGE_CHILDREN
, &cusage
);
/*
 * Fold the children's counters into the caller's totals: page faults,
 * swaps, block I/O, messages, signals and context switches.
 */
211 usage
->ru_minflt
+= cusage
.ru_minflt
;
212 usage
->ru_majflt
+= cusage
.ru_majflt
;
213 usage
->ru_nswap
+= cusage
.ru_nswap
;
214 usage
->ru_inblock
+= cusage
.ru_inblock
;
215 usage
->ru_oublock
+= cusage
.ru_oublock
;
216 usage
->ru_msgsnd
+= cusage
.ru_msgsnd
;
217 usage
->ru_msgrcv
+= cusage
.ru_msgrcv
;
218 usage
->ru_nsignals
+= cusage
.ru_nsignals
;
219 usage
->ru_nvcsw
+= cusage
.ru_nvcsw
;
220 usage
->ru_nivcsw
+= cusage
.ru_nivcsw
;
225 * Scrub Phase Dispatch
227 * The operations of the scrub program are split up into several
228 * different phases. Each phase builds upon the metadata checked in the
229 * previous phase, which is to say that we may skip phase (X + 1) if our
230 * scans in phase (X) reveal corruption. A phase may be skipped
234 /* Resource usage for each phase. */
/*
 * Snapshot taken when a phase begins and compared against when it ends.
 * Besides verified_bytes, the code below stores into ->ruse (the
 * getrusage result), ->brk_start (the value of sbrk(0)) and ->time (the
 * gettimeofday result); those member declarations are not visible in
 * this view.
 */
235 struct phase_rusage
{
238 unsigned long long verified_bytes
;
243 /* Operations for each phase. */
/*
 * DATASCAN_DUMMY_FN and REPAIR_DUMMY_FN are sentinel values for ->fn, not
 * callable addresses.  The phase loop compares ->fn against them and
 * either substitutes xfs_scan_blocks / xfs_repair_fs or leaves the
 * sentinel in place, in which case the phase is skipped.
 */
244 #define DATASCAN_DUMMY_FN ((void *)1)
245 #define REPAIR_DUMMY_FN ((void *)2)
/* Phase body; the phase loop stores its result in the moveon flag. */
248 bool (*fn
)(struct scrub_ctx
*);
/*
 * Optional work estimator, called only when non-NULL.  The phase loop
 * passes the addresses of max_work and nr_threads (the final argument is
 * not visible in this view) and hands the results to
 * progress_init_phase().
 */
249 bool (*estimate_work
)(struct scrub_ctx
*, uint64_t *,
250 unsigned int *, int *);
254 /* Start tracking resource usage for a phase. */
/*
 * Zeroes *pi, then records in it the starting getrusage() counters
 * (via scrub_getrusage), the current program break (sbrk(0)) and the
 * wall-clock time (gettimeofday).  A failure of either call is reported
 * with perror().  When verbose or display_rusage is set and descr is
 * non-NULL, the phase number and description are printed on stdout.
 * NOTE(review): the return statements are not visible in this view;
 * callers assign the result to their moveon flag, so it presumably
 * reports whether setup succeeded -- confirm.
 */
257 struct phase_rusage
*pi
,
263 memset(pi
, 0, sizeof(*pi
));
264 error
= scrub_getrusage(&pi
->ruse
);
266 perror(_("getrusage"));
269 pi
->brk_start
= sbrk(0);
271 error
= gettimeofday(&pi
->time
, NULL
);
273 perror(_("gettimeofday"));
278 if ((verbose
|| display_rusage
) && descr
) {
279 fprintf(stdout
, _("Phase %u: %s\n"), phase
, descr
);
285 /* Report usage stats. */
/*
 * Takes a second time/rusage sample and prints, relative to the snapshot
 * in *pi: memory use, elapsed/user/system time, the amount of block I/O
 * in each direction, and the corresponding I/O rates.  Each printed line
 * is prefixed with phasebuf.  Failures of gettimeofday() or
 * scrub_getrusage() are reported with perror().
 * NOTE(review): the condition that decides whether anything is printed
 * and the return statements are not visible in this view.
 */
288 struct phase_rusage
*pi
,
291 struct rusage ruse_now
;
293 struct mallinfo mall_now
;
295 struct timeval time_now
;
296 char phasebuf
[DESCR_BUFSZ
];
298 unsigned long long in
, out
;
299 unsigned long long io
;
301 double din
, dout
, dtot
;
302 char *iu
, *ou
, *tu
, *dinu
, *doutu
, *dtotu
;
308 error
= gettimeofday(&time_now
, NULL
);
310 perror(_("gettimeofday"));
/* Elapsed time since phase_start(); used below as the rate divisor. */
313 dt
= timeval_subtract(&time_now
, &pi
->time
);
315 error
= scrub_getrusage(&ruse_now
);
317 perror(_("getrusage"));
322 snprintf(phasebuf
, DESCR_BUFSZ
, _("Phase %u: "), phase
);
/* Round a byte count up to whole units of 1024 bytes. */
326 #define kbytes(x) (((unsigned long)(x) + 1023) / 1024)
/* Memory report based on the allocator statistics from mallinfo(). */
329 mall_now
= mallinfo();
330 fprintf(stdout
, _("%sMemory used: %luk/%luk (%luk/%luk), "),
332 kbytes(mall_now
.arena
), kbytes(mall_now
.hblkhd
),
333 kbytes(mall_now
.uordblks
), kbytes(mall_now
.fordblks
));
/*
 * Alternative memory report: growth of the program break since
 * phase_start().  NOTE(review): presumably selected instead of the
 * mallinfo() variant by a preprocessor conditional that is not visible
 * in this view -- confirm.
 */
335 fprintf(stdout
, _("%sMemory used: %luk, "),
337 (unsigned long) kbytes(((char *) sbrk(0)) -
338 ((char *) pi
->brk_start
)));
/* Wall-clock, user-CPU and system-CPU seconds consumed by the phase. */
342 fprintf(stdout
, _("time: %5.2f/%5.2f/%5.2fs\n"),
343 timeval_subtract(&time_now
, &pi
->time
),
344 timeval_subtract(&ruse_now
.ru_utime
, &pi
->ruse
.ru_utime
),
345 timeval_subtract(&ruse_now
.ru_stime
, &pi
->ruse
.ru_stime
));
/*
 * Block I/O performed during the phase: the change in ru_inblock and
 * ru_oublock, shifted left by BBSHIFT (presumably converting basic
 * blocks to bytes -- confirm).
 */
348 in
= ((unsigned long long)ruse_now
.ru_inblock
-
349 pi
->ruse
.ru_inblock
) << BBSHIFT
;
350 out
= ((unsigned long long)ruse_now
.ru_oublock
-
351 pi
->ruse
.ru_oublock
) << BBSHIFT
;
/* Scale each total to a printable magnitude plus unit string. */
354 i
= auto_space_units(in
, &iu
);
355 o
= auto_space_units(out
, &ou
);
356 t
= auto_space_units(io
, &tu
);
/*
 * The same quantities divided by the elapsed time to give rates.
 * NOTE(review): no guard against dt being zero is visible in these
 * lines -- confirm one exists before the division.
 */
357 din
= auto_space_units(in
/ dt
, &dinu
);
358 dout
= auto_space_units(out
/ dt
, &doutu
);
359 dtot
= auto_space_units(io
/ dt
, &dtotu
);
361 _("%sI/O: %.1f%s in, %.1f%s out, %.1f%s tot\n"),
362 phasebuf
, i
, iu
, o
, ou
, t
, tu
);
364 _("%sI/O rate: %.1f%s/s in, %.1f%s/s out, %.1f%s/s tot\n"),
365 phasebuf
, din
, dinu
, dout
, doutu
, dtot
, dtotu
);
372 /* Run all the phases of the scrubber. */
/*
 * Walks the phases[] table in order, numbering phases from 1, until an
 * entry with a NULL ->fn is reached.  For each entry it:
 *  - swaps the DATASCAN/REPAIR sentinels for xfs_scan_blocks or
 *    xfs_repair_fs when scrub_data or SCRUB_MODE_REPAIR asks for them
 *    (entries still holding a sentinel are skipped);
 *  - honours the XFS_SCRUB_PHASE debug tweak, except for entries marked
 *    must_run;
 *  - brackets the phase body with phase_start()/phase_end() and the
 *    progress_init_phase()/progress_end_phase() calls;
 *  - consults xfs_scrub_excessive_errors().
 * NOTE(review): the statements that test moveon and leave the loop, and
 * the final return, are not visible in this view.
 */
375 struct scrub_ctx
*ctx
,
378 struct phase_ops phases
[] =
381 .descr
= _("Find filesystem geometry."),
386 .descr
= _("Check internal metadata."),
387 .fn
= xfs_scan_metadata
,
388 .estimate_work
= xfs_estimate_metadata_work
,
391 .descr
= _("Scan all inodes."),
392 .fn
= xfs_scan_inodes
,
393 .estimate_work
= xfs_estimate_inodes_work
,
396 .descr
= _("Defer filesystem repairs."),
397 .fn
= REPAIR_DUMMY_FN
,
398 .estimate_work
= xfs_estimate_repair_work
,
401 .descr
= _("Check directory tree."),
402 .fn
= xfs_scan_connections
,
403 .estimate_work
= xfs_estimate_inodes_work
,
406 .descr
= _("Verify data file integrity."),
407 .fn
= DATASCAN_DUMMY_FN
,
408 .estimate_work
= xfs_estimate_verify_work
,
411 .descr
= _("Check summary counters."),
412 .fn
= xfs_scan_summary
,
419 struct phase_rusage pi
;
420 struct phase_ops
*sp
;
423 unsigned int debug_phase
= 0;
/*
 * NOTE(review): this local has the same name as the file-scope
 * nr_threads declared earlier in the file and hides it inside this
 * function.
 */
425 unsigned int nr_threads
;
/*
 * atoi() gives no error indication; an XFS_SCRUB_PHASE value with no
 * leading digits yields 0, which the check further down treats the same
 * as "no phase forced".
 */
428 if (debug_tweak_on("XFS_SCRUB_PHASE"))
429 debug_phase
= atoi(getenv("XFS_SCRUB_PHASE"));
431 /* Run all phases of the scrub tool. */
432 for (phase
= 1, sp
= phases
; sp
->fn
; sp
++, phase
++) {
433 /* Turn on certain phases if user said to. */
434 if (sp
->fn
== DATASCAN_DUMMY_FN
&& scrub_data
) {
435 sp
->fn
= xfs_scan_blocks
;
436 } else if (sp
->fn
== REPAIR_DUMMY_FN
&&
437 ctx
->mode
== SCRUB_MODE_REPAIR
) {
438 sp
->descr
= _("Repair filesystem.");
439 sp
->fn
= xfs_repair_fs
;
443 /* Skip certain phases unless they're turned on. */
444 if (sp
->fn
== REPAIR_DUMMY_FN
||
445 sp
->fn
== DATASCAN_DUMMY_FN
)
448 /* Allow debug users to force a particular phase. */
449 if (debug_phase
&& phase
!= debug_phase
&& !sp
->must_run
)
452 /* Run this phase. */
453 moveon
= phase_start(&pi
, phase
, sp
->descr
);
/*
 * With an estimator, progress reporting is set up from its results;
 * otherwise it is initialised with a NULL stream and zero work.
 */
456 if (sp
->estimate_work
) {
457 moveon
= sp
->estimate_work(ctx
, &max_work
, &nr_threads
,
461 moveon
= progress_init_phase(ctx
, progress_fp
, phase
,
462 max_work
, rshift
, nr_threads
);
464 moveon
= progress_init_phase(ctx
, NULL
, phase
, 0, 0, 0);
468 moveon
= sp
->fn(ctx
);
470 str_info(ctx
, ctx
->mntpoint
,
471 _("Scrub aborted after phase %d."),
475 progress_end_phase();
476 moveon
= phase_end(&pi
, phase
);
480 /* Too many errors? */
481 moveon
= !xfs_scrub_excessive_errors(ctx
);
/*
 * Summarise the changes made to the filesystem mounted at ctx->mntpoint.
 * Nothing is reported when both ctx->repairs and ctx->preens are zero.
 * Otherwise one of three messages is chosen: repairs and optimizations
 * together, repairs only, or optimizations only.
 * NOTE(review): the output calls that take these format strings are not
 * visible in this view, so the destination stream is unconfirmed.
 */
490 report_modifications(
491 struct scrub_ctx
*ctx
)
493 if (ctx
->repairs
== 0 && ctx
->preens
== 0)
496 if (ctx
->repairs
&& ctx
->preens
)
498 _("%s: repairs made: %llu; optimizations made: %llu.\n"),
499 ctx
->mntpoint
, ctx
->repairs
, ctx
->preens
);
500 else if (ctx
->preens
== 0)
502 _("%s: repairs made: %llu.\n"),
503 ctx
->mntpoint
, ctx
->repairs
);
504 else if (ctx
->repairs
== 0)
506 _("%s: optimizations made: %llu.\n"),
507 ctx
->mntpoint
, ctx
->preens
);
/*
 * Print the final tally for the scrub of ctx->mntpoint.  The error count
 * is ctx->errors_found plus ctx->runtime_errors.  With no errors and no
 * warnings, "No errors found." is logged.  Otherwise the counts of
 * warnings, errors, or both are written to stderr and also logged via
 * log_warn()/log_err().  Finally, when setup succeeded and at least one
 * error was counted, the user is told what to run next.
 * NOTE(review): the early return after the "No errors found." case is
 * not visible in this view.
 */
512 struct scrub_ctx
*ctx
)
514 unsigned long long total_errors
;
516 total_errors
= ctx
->errors_found
+ ctx
->runtime_errors
;
518 if (total_errors
== 0 && ctx
->warnings_found
== 0) {
519 log_info(ctx
, _("No errors found."));
523 if (total_errors
== 0) {
524 fprintf(stderr
, _("%s: warnings found: %llu\n"), ctx
->mntpoint
,
525 ctx
->warnings_found
);
526 log_warn(ctx
, _("warnings found: %llu"), ctx
->warnings_found
);
527 } else if (ctx
->warnings_found
== 0) {
528 fprintf(stderr
, _("%s: errors found: %llu\n"), ctx
->mntpoint
,
530 log_err(ctx
, _("errors found: %llu"), total_errors
);
532 fprintf(stderr
, _("%s: errors found: %llu; warnings found: %llu\n"),
533 ctx
->mntpoint
, total_errors
,
534 ctx
->warnings_found
);
535 log_err(ctx
, _("errors found: %llu; warnings found: %llu"),
536 total_errors
, ctx
->warnings_found
);
540 * Don't advise the user to run repair unless we were successful in
541 * setting up the scrub and we actually saw corruptions. Warnings
542 * are not corruptions.
544 if (ctx
->scrub_setup_succeeded
&& total_errors
> 0) {
/*
 * A dry run suggests re-running without -n; any other mode suggests
 * xfs_repair.  msg is always one of these two fixed, translated
 * format strings, each taking the mount point as its only argument.
 */
547 if (ctx
->mode
== SCRUB_MODE_DRY_RUN
)
548 msg
= _("%s: Re-run xfs_scrub without -n.\n");
550 msg
= _("%s: Unmount and run xfs_repair.\n");
552 fprintf(stderr
, msg
, ctx
->mntpoint
);
561 struct scrub_ctx ctx
= {0};
562 struct phase_rusage all_pi
;
564 FILE *progress_fp
= NULL
;
569 int ret
= SCRUB_RET_SUCCESS
;
571 fprintf(stdout
, "EXPERIMENTAL xfs_scrub program in use! Use at your own risk!\n");
573 progname
= basename(argv
[0]);
574 setlocale(LC_ALL
, "");
575 bindtextdomain(PACKAGE
, LOCALEDIR
);
578 pthread_mutex_init(&ctx
.lock
, NULL
);
579 ctx
.mode
= SCRUB_MODE_REPAIR
;
580 ctx
.error_action
= ERRORS_CONTINUE
;
581 while ((c
= getopt(argc
, argv
, "a:bC:de:km:nTvxV")) != EOF
) {
584 ctx
.max_errors
= cvt_u64(optarg
, 10);
596 fd
= cvt_u32(optarg
, 10);
601 progress_fp
= fdopen(fd
, "w");
611 if (!strcmp("continue", optarg
))
612 ctx
.error_action
= ERRORS_CONTINUE
;
613 else if (!strcmp("shutdown", optarg
))
614 ctx
.error_action
= ERRORS_SHUTDOWN
;
617 _("Unknown error behavior \"%s\".\n"),
629 ctx
.mode
= SCRUB_MODE_DRY_RUN
;
632 display_rusage
= true;
638 fprintf(stdout
, _("%s version %s\n"), progname
,
641 return SCRUB_RET_SUCCESS
;
652 /* Override thread count if debugger */
653 if (debug_tweak_on("XFS_SCRUB_THREADS")) {
656 x
= cvt_u32(getenv("XFS_SCRUB_THREADS"), 10);
658 perror("nr_threads");
664 if (optind
!= argc
- 1)
667 ctx
.mntpoint
= argv
[optind
];
669 stdout_isatty
= isatty(STDOUT_FILENO
);
670 stderr_isatty
= isatty(STDERR_FILENO
);
672 /* If interactive, start the progress bar. */
673 if (stdout_isatty
&& !progress_fp
)
674 progress_fp
= fdopen(1, "w+");
676 if (getenv("SERVICE_MODE"))
679 /* Initialize overall phase stats. */
680 moveon
= phase_start(&all_pi
, 0, NULL
);
682 return SCRUB_RET_OPERROR
;
684 /* Find the mount record for the passed-in argument. */
685 if (stat(argv
[optind
], &ctx
.mnt_sb
) < 0) {
687 _("%s: could not stat: %s: %s\n"),
688 progname
, argv
[optind
], strerror(errno
));
689 ctx
.runtime_errors
++;
694 * If the user did not specify an explicit mount table, try to use
695 * /proc/mounts if it is available, else /etc/mtab. We prefer
696 * /proc/mounts because it is kernel controlled, while /etc/mtab
697 * may contain garbage that userspace tools like pam_mounts wrote
701 if (access(_PATH_PROC_MOUNTS
, R_OK
) == 0)
702 mtab
= _PATH_PROC_MOUNTS
;
704 mtab
= _PATH_MOUNTED
;
707 fs_table_initialise(0, NULL
, 0, NULL
);
708 fsp
= fs_table_lookup_mount(ctx
.mntpoint
);
710 fprintf(stderr
, _("%s: Not a XFS mount point.\n"),
712 ret
|= SCRUB_RET_SYNTAX
;
715 memcpy(&ctx
.fsinfo
, fsp
, sizeof(struct fs_path
));
718 nproc
= sysconf(_SC_NPROCESSORS_ONLN
);
722 /* Set up a page-aligned buffer for read verification. */
723 page_size
= sysconf(_SC_PAGESIZE
);
725 str_errno(&ctx
, ctx
.mntpoint
);
729 if (debug_tweak_on("XFS_SCRUB_FORCE_REPAIR"))
730 ctx
.mode
= SCRUB_MODE_REPAIR
;
732 /* Scrub a filesystem. */
733 moveon
= run_scrub_phases(&ctx
, progress_fp
);
734 if (!moveon
&& ctx
.runtime_errors
== 0)
735 ctx
.runtime_errors
++;
738 * Excessive errors will cause the scrub phases to bail out early.
739 * We don't want every thread yelling that into the output, so check
740 * if we hit the threshold and tell the user *once*.
742 if (xfs_scrub_excessive_errors(&ctx
))
743 str_info(&ctx
, ctx
.mntpoint
, _("Too many errors; aborting."));
745 if (debug_tweak_on("XFS_SCRUB_FORCE_ERROR"))
746 str_error(&ctx
, ctx
.mntpoint
, _("Injecting error."));
748 /* Clean up scan data. */
749 moveon
= xfs_cleanup_fs(&ctx
);
750 if (!moveon
&& ctx
.runtime_errors
== 0)
751 ctx
.runtime_errors
++;
754 report_modifications(&ctx
);
755 report_outcome(&ctx
);
757 if (ctx
.errors_found
) {
758 if (ctx
.error_action
== ERRORS_SHUTDOWN
)
759 xfs_shutdown_fs(&ctx
);
760 ret
|= SCRUB_RET_CORRUPT
;
762 if (ctx
.warnings_found
)
763 ret
|= SCRUB_RET_UNOPTIMIZED
;
764 if (ctx
.runtime_errors
)
765 ret
|= SCRUB_RET_OPERROR
;
766 phase_end(&all_pi
, 0);
771 * If we're being run as a service, the return code must fit the LSB
772 * init script action error guidelines, which is to say that we
773 * compress all errors to 1 ("generic or unspecified error", LSB 5.0
774 * section 22.2) and hope the admin will scan the log for what
777 * We have to sleep 2 seconds here because journald uses the pid to
778 * connect our log messages to the systemd service. This is critical
779 * for capturing all the log messages if the scrub fails, because the
780 * fail service uses the service name to gather log messages for the
785 if (ret
!= SCRUB_RET_SUCCESS
)