2 * Copyright (C) 2018 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
11 * This program is distributed in the hope that it would be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
25 #include <sys/resource.h>
26 #include <sys/statvfs.h>
27 #include "platform_defs.h"
32 #include "xfs_scrub.h"
38 * XFS Online Metadata Scrub (and Repair)
40 * The XFS scrubber uses custom XFS ioctls to probe more deeply into the
41 * internals of the filesystem. It takes advantage of scrubbing ioctls
42 * to check all the records stored in a metadata object and to
43 * cross-reference those records against the other filesystem metadata.
45 * After the program gathers command line arguments to figure out
46 * exactly what the program is going to do, scrub execution is split up
47 * into several separate phases:
49 * The "find geometry" phase queries XFS for the filesystem geometry.
50 * The block devices for the data, realtime, and log devices are opened.
51 * Kernel ioctls are test-queried to see if they actually work (the scrub
52 * ioctl in particular), and any other filesystem-specific information is gathered.
55 * In the "check internal metadata" phase, we call the metadata scrub
56 * ioctl to check the filesystem's internal per-AG btrees. This
57 * includes the AG superblock, AGF, AGFL, and AGI headers, freespace
58 * btrees, the regular and free inode btrees, the reverse mapping
59 * btrees, and the reference counting btrees. If the realtime device is
60 * enabled, the realtime bitmap and reverse mapping btrees are checked.
61 * Quotas, if enabled, are also checked in this phase.
63 * Each AG (and the realtime device) has its metadata checked in a
64 * separate thread for better performance. Errors in the internal
65 * metadata can be fixed here prior to the inode scan; refer to the
66 * section about the "repair filesystem" phase for more information.
68 * The "scan all inodes" phase uses BULKSTAT to scan all the inodes in
69 * an AG in disk order. The BULKSTAT information provides enough
70 * information to construct a file handle that is used to check the
71 * following parts of every file:
74 * - All three block forks (data, attr, CoW)
75 * - If it's a symlink, the symlink target.
76 * - If it's a directory, the directory entries.
77 * - All extended attributes
78 * - The parent pointer
80 * Multiple threads are started to check the inodes of each AG in
81 * parallel. Errors in file metadata can be fixed here; see the section
82 * about the "repair filesystem" phase for more information.
84 * Next comes the (configurable) "repair filesystem" phase. The user
85 * can instruct this program to fix all problems encountered; to fix
86 * only optimality problems and leave the corruptions; or not to touch
87 * the filesystem at all. Any metadata repairs that did not succeed in
88 * the previous two phases are retried here; if there are uncorrectable
89 * errors, xfs_scrub stops here.
91 * The next phase is the "check directory tree" phase. In this phase,
92 * every directory is opened (via file handle) to confirm that each
93 * directory is connected to the root. Directory entries are checked
94 * for ambiguous Unicode normalization mappings, which is to say that we
95 * look for pairs of entries whose utf-8 strings normalize to the same
96 * code point sequence and map to different inodes, because that could
97 * be used to trick a user into opening the wrong file. The names of
98 * extended attributes are checked for Unicode normalization collisions.
100 * In the "verify data file integrity" phase, we employ GETFSMAP to read
101 * the reverse-mappings of all AGs and issue direct-reads of the
102 * underlying disk blocks. We rely on the underlying storage to have
103 * checksummed the data blocks appropriately. Multiple threads are
104 * started to check each AG in parallel; a separate thread pool is used
105 * to handle the direct reads.
107 * In the "check summary counters" phase, we use GETFSMAP to tally up the
108 * blocks and BULKSTAT to tally up the inodes we saw and compare that to
109 * the statfs output. This gives the user a rough estimate of how
110 * thorough the scrub was.
114 * Known debug tweaks (pass -d and set the environment variable):
115 * XFS_SCRUB_FORCE_ERROR -- pretend all metadata is corrupt
116 * XFS_SCRUB_FORCE_REPAIR -- repair all metadata even if it's ok
117 * XFS_SCRUB_NO_KERNEL -- pretend there is no kernel ioctl
118 * XFS_SCRUB_NO_SCSI_VERIFY -- disable SCSI VERIFY (if present)
119 * XFS_SCRUB_PHASE -- run only this scrub phase
120 * XFS_SCRUB_THREADS -- start exactly this number of threads
122 * Available even in non-debug mode:
123 * SERVICE_MODE -- compress all error codes to 1 for LSB
124 * service action compliance
127 /* Program name; needed for libfrog error reports. */
128 char *progname
= "xfs_scrub";
130 /* Debug level; higher values mean more verbosity. */
133 /* Display resource usage at the end of each phase? */
134 static bool display_rusage
;
136 /* Background mode; higher values insert more pauses between scrub calls. */
137 unsigned int bg_mode
;
139 /* Maximum number of processors available to us. */
142 /* Number of threads we're allowed to use. */
143 unsigned int nr_threads
;
145 /* Verbosity; higher values print more information. */
148 /* Should we scrub the data blocks? */
149 static bool scrub_data
;
151 /* Size of a memory page. */
154 /* Should we FSTRIM after a successful run? */
155 bool want_fstrim
= true;
157 /* If stdout/stderr are ttys, we can use richer terminal control. */
162 * If we are running as a service, we need to be careful about what
163 * error codes we return to the calling process.
/*
 * Exit status bits.  main() starts with SCRUB_RET_SUCCESS and ORs in one
 * bit for each condition that applies, so the non-zero values are distinct
 * powers of two and several of them may be set in the same exit status.
 */
167 #define SCRUB_RET_SUCCESS (0) /* no problems left behind */
168 #define SCRUB_RET_CORRUPT (1) /* corruption remains on fs */
169 #define SCRUB_RET_UNOPTIMIZED (2) /* fs could be optimized */
170 #define SCRUB_RET_OPERROR (4) /* operational problems */
171 #define SCRUB_RET_SYNTAX (8) /* cmdline args rejected */
173 static void __attribute__((noreturn
))
176 fprintf(stderr
, _("Usage: %s [OPTIONS] mountpoint | device\n"), progname
);
177 fprintf(stderr
, "\n");
178 fprintf(stderr
, _("Options:\n"));
179 fprintf(stderr
, _(" -a count Stop after this many errors are found.\n"));
180 fprintf(stderr
, _(" -b Background mode.\n"));
181 fprintf(stderr
, _(" -C fd Print progress information to this fd.\n"));
182 fprintf(stderr
, _(" -e behavior What to do if errors are found.\n"));
183 fprintf(stderr
, _(" -k Do not FITRIM the free space.\n"));
184 fprintf(stderr
, _(" -m path Path to /etc/mtab.\n"));
185 fprintf(stderr
, _(" -n Dry run. Do not modify anything.\n"));
186 fprintf(stderr
, _(" -T Display timing/usage information.\n"));
187 fprintf(stderr
, _(" -v Verbose output.\n"));
188 fprintf(stderr
, _(" -V Print version.\n"));
189 fprintf(stderr
, _(" -x Scrub file data too.\n"));
191 exit(SCRUB_RET_SYNTAX
);
195 # define RUSAGE_BOTH (-2)
198 /* Get resource usage for ourselves and all children. */
201 struct rusage
*usage
)
203 struct rusage cusage
;
206 err
= getrusage(RUSAGE_BOTH
, usage
);
210 err
= getrusage(RUSAGE_SELF
, usage
);
214 err
= getrusage(RUSAGE_CHILDREN
, &cusage
);
218 usage
->ru_minflt
+= cusage
.ru_minflt
;
219 usage
->ru_majflt
+= cusage
.ru_majflt
;
220 usage
->ru_nswap
+= cusage
.ru_nswap
;
221 usage
->ru_inblock
+= cusage
.ru_inblock
;
222 usage
->ru_oublock
+= cusage
.ru_oublock
;
223 usage
->ru_msgsnd
+= cusage
.ru_msgsnd
;
224 usage
->ru_msgrcv
+= cusage
.ru_msgrcv
;
225 usage
->ru_nsignals
+= cusage
.ru_nsignals
;
226 usage
->ru_nvcsw
+= cusage
.ru_nvcsw
;
227 usage
->ru_nivcsw
+= cusage
.ru_nivcsw
;
232 * Scrub Phase Dispatch
234 * The operations of the scrub program are split up into several
235 * different phases. Each phase builds upon the metadata checked in the
236 * previous phase, which is to say that we may skip phase (X + 1) if our
237 * scans in phase (X) reveal corruption. A phase may be skipped entirely.
241 /* Resource usage for each phase. */
242 struct phase_rusage
{
245 unsigned long long verified_bytes
;
250 /* Operations for each phase. */
/*
 * Placeholder values stored in a phase's ->fn slot; they are markers, not
 * callable functions.  The phase dispatch loop replaces DATASCAN_DUMMY_FN
 * with xfs_scan_blocks when scrub_data is set, and REPAIR_DUMMY_FN with
 * xfs_repair_fs when the scrub mode is SCRUB_MODE_REPAIR.  A phase whose
 * ->fn is still one of these placeholders after that substitution is
 * treated as a phase that has not been turned on.
 */
251 #define DATASCAN_DUMMY_FN ((void *)1)
252 #define REPAIR_DUMMY_FN ((void *)2)
255 bool (*fn
)(struct scrub_ctx
*);
256 bool (*estimate_work
)(struct scrub_ctx
*, uint64_t *,
257 unsigned int *, int *);
261 /* Start tracking resource usage for a phase. */
264 struct phase_rusage
*pi
,
270 memset(pi
, 0, sizeof(*pi
));
271 error
= scrub_getrusage(&pi
->ruse
);
273 perror(_("getrusage"));
276 pi
->brk_start
= sbrk(0);
278 error
= gettimeofday(&pi
->time
, NULL
);
280 perror(_("gettimeofday"));
285 if ((verbose
|| display_rusage
) && descr
) {
286 fprintf(stdout
, _("Phase %u: %s\n"), phase
, descr
);
292 /* Report usage stats. */
295 struct phase_rusage
*pi
,
298 struct rusage ruse_now
;
300 struct mallinfo mall_now
;
302 struct timeval time_now
;
303 char phasebuf
[DESCR_BUFSZ
];
305 unsigned long long in
, out
;
306 unsigned long long io
;
308 double din
, dout
, dtot
;
309 char *iu
, *ou
, *tu
, *dinu
, *doutu
, *dtotu
;
315 error
= gettimeofday(&time_now
, NULL
);
317 perror(_("gettimeofday"));
320 dt
= timeval_subtract(&time_now
, &pi
->time
);
322 error
= scrub_getrusage(&ruse_now
);
324 perror(_("getrusage"));
329 snprintf(phasebuf
, DESCR_BUFSZ
, _("Phase %u: "), phase
);
333 #define kbytes(x) (((unsigned long)(x) + 1023) / 1024)
336 mall_now
= mallinfo();
337 fprintf(stdout
, _("%sMemory used: %luk/%luk (%luk/%luk), "),
339 kbytes(mall_now
.arena
), kbytes(mall_now
.hblkhd
),
340 kbytes(mall_now
.uordblks
), kbytes(mall_now
.fordblks
));
342 fprintf(stdout
, _("%sMemory used: %luk, "),
344 (unsigned long) kbytes(((char *) sbrk(0)) -
345 ((char *) pi
->brk_start
)));
349 fprintf(stdout
, _("time: %5.2f/%5.2f/%5.2fs\n"),
350 timeval_subtract(&time_now
, &pi
->time
),
351 timeval_subtract(&ruse_now
.ru_utime
, &pi
->ruse
.ru_utime
),
352 timeval_subtract(&ruse_now
.ru_stime
, &pi
->ruse
.ru_stime
));
355 in
= ((unsigned long long)ruse_now
.ru_inblock
-
356 pi
->ruse
.ru_inblock
) << BBSHIFT
;
357 out
= ((unsigned long long)ruse_now
.ru_oublock
-
358 pi
->ruse
.ru_oublock
) << BBSHIFT
;
361 i
= auto_space_units(in
, &iu
);
362 o
= auto_space_units(out
, &ou
);
363 t
= auto_space_units(io
, &tu
);
364 din
= auto_space_units(in
/ dt
, &dinu
);
365 dout
= auto_space_units(out
/ dt
, &doutu
);
366 dtot
= auto_space_units(io
/ dt
, &dtotu
);
368 _("%sI/O: %.1f%s in, %.1f%s out, %.1f%s tot\n"),
369 phasebuf
, i
, iu
, o
, ou
, t
, tu
);
371 _("%sI/O rate: %.1f%s/s in, %.1f%s/s out, %.1f%s/s tot\n"),
372 phasebuf
, din
, dinu
, dout
, doutu
, dtot
, dtotu
);
379 /* Run all the phases of the scrubber. */
382 struct scrub_ctx
*ctx
,
385 struct phase_ops phases
[] =
388 .descr
= _("Find filesystem geometry."),
393 .descr
= _("Check internal metadata."),
394 .fn
= xfs_scan_metadata
,
395 .estimate_work
= xfs_estimate_metadata_work
,
398 .descr
= _("Scan all inodes."),
399 .fn
= xfs_scan_inodes
,
400 .estimate_work
= xfs_estimate_inodes_work
,
403 .descr
= _("Defer filesystem repairs."),
404 .fn
= REPAIR_DUMMY_FN
,
405 .estimate_work
= xfs_estimate_repair_work
,
408 .descr
= _("Check directory tree."),
409 .fn
= xfs_scan_connections
,
410 .estimate_work
= xfs_estimate_inodes_work
,
413 .descr
= _("Verify data file integrity."),
414 .fn
= DATASCAN_DUMMY_FN
,
415 .estimate_work
= xfs_estimate_verify_work
,
418 .descr
= _("Check summary counters."),
419 .fn
= xfs_scan_summary
,
426 struct phase_rusage pi
;
427 struct phase_ops
*sp
;
430 unsigned int debug_phase
= 0;
432 unsigned int nr_threads
;
435 if (debug
&& debug_tweak_on("XFS_SCRUB_PHASE"))
436 debug_phase
= atoi(getenv("XFS_SCRUB_PHASE"));
438 /* Run all phases of the scrub tool. */
439 for (phase
= 1, sp
= phases
; sp
->fn
; sp
++, phase
++) {
440 /* Turn on certain phases if user said to. */
441 if (sp
->fn
== DATASCAN_DUMMY_FN
&& scrub_data
) {
442 sp
->fn
= xfs_scan_blocks
;
443 } else if (sp
->fn
== REPAIR_DUMMY_FN
&&
444 ctx
->mode
== SCRUB_MODE_REPAIR
) {
445 sp
->descr
= _("Repair filesystem.");
446 sp
->fn
= xfs_repair_fs
;
450 /* Skip certain phases unless they're turned on. */
451 if (sp
->fn
== REPAIR_DUMMY_FN
||
452 sp
->fn
== DATASCAN_DUMMY_FN
)
455 /* Allow debug users to force a particular phase. */
456 if (debug_phase
&& phase
!= debug_phase
&& !sp
->must_run
)
459 /* Run this phase. */
460 moveon
= phase_start(&pi
, phase
, sp
->descr
);
463 if (sp
->estimate_work
) {
464 moveon
= sp
->estimate_work(ctx
, &max_work
, &nr_threads
,
468 moveon
= progress_init_phase(ctx
, progress_fp
, phase
,
469 max_work
, rshift
, nr_threads
);
471 moveon
= progress_init_phase(ctx
, NULL
, phase
, 0, 0, 0);
475 moveon
= sp
->fn(ctx
);
477 str_info(ctx
, ctx
->mntpoint
,
478 _("Scrub aborted after phase %d."),
482 progress_end_phase();
483 moveon
= phase_end(&pi
, phase
);
487 /* Too many errors? */
488 moveon
= !xfs_scrub_excessive_errors(ctx
);
498 struct scrub_ctx
*ctx
)
500 unsigned long long total_errors
;
502 total_errors
= ctx
->errors_found
+ ctx
->runtime_errors
;
504 if (total_errors
== 0 && ctx
->warnings_found
== 0) {
505 log_info(ctx
, _("No errors found."));
509 if (total_errors
== 0) {
510 fprintf(stderr
, _("%s: warnings found: %llu\n"), ctx
->mntpoint
,
511 ctx
->warnings_found
);
512 log_warn(ctx
, _("warnings found: %llu"), ctx
->warnings_found
);
513 } else if (ctx
->warnings_found
== 0) {
514 fprintf(stderr
, _("%s: errors found: %llu\n"), ctx
->mntpoint
,
516 log_err(ctx
, _("errors found: %llu"), total_errors
);
518 fprintf(stderr
, _("%s: errors found: %llu; warnings found: %llu\n"),
519 ctx
->mntpoint
, total_errors
,
520 ctx
->warnings_found
);
521 log_err(ctx
, _("errors found: %llu; warnings found: %llu"),
522 total_errors
, ctx
->warnings_found
);
526 * Don't advise the user to run repair unless we were successful in
527 * setting up the scrub and we actually saw corruptions. Warnings
528 * are not corruptions.
530 if (ctx
->scrub_setup_succeeded
&& total_errors
> 0)
531 fprintf(stderr
, _("%s: Unmount and run xfs_repair.\n"),
540 struct scrub_ctx ctx
= {0};
541 struct phase_rusage all_pi
;
543 FILE *progress_fp
= NULL
;
548 int ret
= SCRUB_RET_SUCCESS
;
550 fprintf(stdout
, "EXPERIMENTAL xfs_scrub program in use! Use at your own risk!\n");
552 progname
= basename(argv
[0]);
553 setlocale(LC_ALL
, "");
554 bindtextdomain(PACKAGE
, LOCALEDIR
);
557 pthread_mutex_init(&ctx
.lock
, NULL
);
558 ctx
.mode
= SCRUB_MODE_REPAIR
;
559 ctx
.error_action
= ERRORS_CONTINUE
;
560 while ((c
= getopt(argc
, argv
, "a:bC:de:km:nTvxV")) != EOF
) {
563 ctx
.max_errors
= cvt_u64(optarg
, 10);
575 fd
= cvt_u32(optarg
, 10);
580 progress_fp
= fdopen(fd
, "w");
590 if (!strcmp("continue", optarg
))
591 ctx
.error_action
= ERRORS_CONTINUE
;
592 else if (!strcmp("shutdown", optarg
))
593 ctx
.error_action
= ERRORS_SHUTDOWN
;
596 _("Unknown error behavior \"%s\".\n"),
608 ctx
.mode
= SCRUB_MODE_DRY_RUN
;
611 display_rusage
= true;
617 fprintf(stdout
, _("%s version %s\n"), progname
,
620 return SCRUB_RET_SUCCESS
;
631 /* Override the thread count if the debug tweak is set. */
632 if (debug_tweak_on("XFS_SCRUB_THREADS")) {
635 x
= cvt_u32(getenv("XFS_SCRUB_THREADS"), 10);
637 perror("nr_threads");
643 if (optind
!= argc
- 1)
646 ctx
.mntpoint
= strdup(argv
[optind
]);
648 stdout_isatty
= isatty(STDOUT_FILENO
);
649 stderr_isatty
= isatty(STDERR_FILENO
);
651 /* If interactive, start the progress bar. */
652 if (stdout_isatty
&& !progress_fp
)
653 progress_fp
= fdopen(1, "w+");
655 if (getenv("SERVICE_MODE"))
658 /* Initialize overall phase stats. */
659 moveon
= phase_start(&all_pi
, 0, NULL
);
661 return SCRUB_RET_OPERROR
;
663 /* Find the mount record for the passed-in argument. */
664 if (stat(argv
[optind
], &ctx
.mnt_sb
) < 0) {
666 _("%s: could not stat: %s: %s\n"),
667 progname
, argv
[optind
], strerror(errno
));
668 ctx
.runtime_errors
++;
673 * If the user did not specify an explicit mount table, try to use
674 * /proc/mounts if it is available, else /etc/mtab. We prefer
675 * /proc/mounts because it is kernel controlled, while /etc/mtab
676 * may contain garbage that userspace tools like pam_mounts wrote into it.
680 if (access(_PATH_PROC_MOUNTS
, R_OK
) == 0)
681 mtab
= _PATH_PROC_MOUNTS
;
683 mtab
= _PATH_MOUNTED
;
686 ismnt
= find_mountpoint(mtab
, &ctx
);
689 _("%s: Not a XFS mount point or block device.\n"),
691 ret
|= SCRUB_RET_SYNTAX
;
696 nproc
= sysconf(_SC_NPROCESSORS_ONLN
);
700 /* Set up a page-aligned buffer for read verification. */
701 page_size
= sysconf(_SC_PAGESIZE
);
703 str_errno(&ctx
, ctx
.mntpoint
);
707 if (debug_tweak_on("XFS_SCRUB_FORCE_REPAIR"))
708 ctx
.mode
= SCRUB_MODE_REPAIR
;
710 /* Scrub a filesystem. */
711 moveon
= run_scrub_phases(&ctx
, progress_fp
);
712 if (!moveon
&& ctx
.runtime_errors
== 0)
713 ctx
.runtime_errors
++;
716 * Excessive errors will cause the scrub phases to bail out early.
717 * We don't want every thread yelling that into the output, so check
718 * if we hit the threshold and tell the user *once*.
720 if (xfs_scrub_excessive_errors(&ctx
))
721 str_info(&ctx
, ctx
.mntpoint
, _("Too many errors; aborting."));
723 if (debug_tweak_on("XFS_SCRUB_FORCE_ERROR"))
724 str_error(&ctx
, ctx
.mntpoint
, _("Injecting error."));
726 /* Clean up scan data. */
727 moveon
= xfs_cleanup_fs(&ctx
);
728 if (!moveon
&& ctx
.runtime_errors
== 0)
729 ctx
.runtime_errors
++;
732 report_outcome(&ctx
);
734 if (ctx
.errors_found
) {
735 if (ctx
.error_action
== ERRORS_SHUTDOWN
)
736 xfs_shutdown_fs(&ctx
);
737 ret
|= SCRUB_RET_CORRUPT
;
739 if (ctx
.warnings_found
)
740 ret
|= SCRUB_RET_UNOPTIMIZED
;
741 if (ctx
.runtime_errors
)
742 ret
|= SCRUB_RET_OPERROR
;
743 phase_end(&all_pi
, 0);
750 * If we're being run as a service, the return code must fit the LSB
751 * init script action error guidelines, which is to say that we
752 * compress all errors to 1 ("generic or unspecified error", LSB 5.0
753 * section 22.2) and hope the admin will scan the log for what actually happened.
756 * We have to sleep 2 seconds here because journald uses the pid to
757 * connect our log messages to the systemd service. This is critical
758 * for capturing all the log messages if the scrub fails, because the
759 * fail service uses the service name to gather log messages for the error report.
764 if (ret
!= SCRUB_RET_SUCCESS
)