]>
git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blob - scrub/xfs_scrub.c
2 * Copyright (C) 2018 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
11 * This program is distributed in the hope that it would be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
25 #include <sys/resource.h>
26 #include <sys/statvfs.h>
27 #include "platform_defs.h"
32 #include "xfs_scrub.h"
37 * XFS Online Metadata Scrub (and Repair)
39 * The XFS scrubber uses custom XFS ioctls to probe more deeply into the
40 * internals of the filesystem. It takes advantage of scrubbing ioctls
41 * to check all the records stored in a metadata object and to
42 * cross-reference those records against the other filesystem metadata.
44 * After the program gathers command line arguments to figure out
45 * exactly what the program is going to do, scrub execution is split up
46 * into several separate phases:
48 * The "find geometry" phase queries XFS for the filesystem geometry.
49 * The block devices for the data, realtime, and log devices are opened.
50 * Kernel ioctls are test-queried to see if they actually work (the scrub
51 * ioctl in particular), and any other filesystem-specific information is gathered.
54 * In the "check internal metadata" phase, we call the metadata scrub
55 * ioctl to check the filesystem's internal per-AG btrees. This
56 * includes the AG superblock, AGF, AGFL, and AGI headers, freespace
57 * btrees, the regular and free inode btrees, the reverse mapping
58 * btrees, and the reference counting btrees. If the realtime device is
59 * enabled, the realtime bitmap and reverse mapping btrees are checked.
60 * Quotas, if enabled, are also checked in this phase.
62 * Each AG (and the realtime device) has its metadata checked in a
63 * separate thread for better performance. Errors in the internal
64 * metadata can be fixed here prior to the inode scan; refer to the
65 * section about the "repair filesystem" phase for more information.
67 * The "scan all inodes" phase uses BULKSTAT to scan all the inodes in
68 * an AG in disk order. The BULKSTAT information provides enough
69 * information to construct a file handle that is used to check the
70 * following parts of every file:
73 * - All three block forks (data, attr, CoW)
74 * - If it's a symlink, the symlink target.
75 * - If it's a directory, the directory entries.
76 * - All extended attributes
77 * - The parent pointer
79 * Multiple threads are started to check the inodes of each AG in
80 * parallel. Errors in file metadata can be fixed here; see the section
81 * about the "repair filesystem" phase for more information.
83 * Next comes the (configurable) "repair filesystem" phase. The user
84 * can instruct this program to fix all problems encountered; to fix
85 * only optimality problems and leave the corruptions; or not to touch
86 * the filesystem at all. Any metadata repairs that did not succeed in
87 * the previous two phases are retried here; if there are uncorrectable
88 * errors, xfs_scrub stops here.
90 * The next phase is the "check directory tree" phase. In this phase,
91 * every directory is opened (via file handle) to confirm that each
92 * directory is connected to the root. Directory entries are checked
93 * for ambiguous Unicode normalization mappings, which is to say that we
94 * look for pairs of entries whose utf-8 strings normalize to the same
95 * code point sequence and map to different inodes, because that could
96 * be used to trick a user into opening the wrong file. The names of
97 * extended attributes are checked for Unicode normalization collisions.
99 * In the "verify data file integrity" phase, we employ GETFSMAP to read
100 * the reverse-mappings of all AGs and issue direct-reads of the
101 * underlying disk blocks. We rely on the underlying storage to have
102 * checksummed the data blocks appropriately. Multiple threads are
103 * started to check each AG in parallel; a separate thread pool is used
104 * to handle the direct reads.
106 * In the "check summary counters" phase, we use GETFSMAP to tally up the
107 * blocks and BULKSTAT to tally up the inodes we saw and compare that to
108 * the statfs output. This gives the user a rough estimate of how
109 * thorough the scrub was.
113 * Known debug tweaks (pass -d and set the environment variable):
114 * XFS_SCRUB_FORCE_ERROR -- pretend all metadata is corrupt
115 * XFS_SCRUB_FORCE_REPAIR -- repair all metadata even if it's ok
116 * XFS_SCRUB_NO_KERNEL -- pretend there is no kernel ioctl
117 * XFS_SCRUB_NO_SCSI_VERIFY -- disable SCSI VERIFY (if present)
118 * XFS_SCRUB_PHASE -- run only this scrub phase
119 * XFS_SCRUB_THREADS -- start exactly this number of threads
122 /* Program name; needed for libfrog error reports. */
123 char *progname
= "xfs_scrub";
125 /* Debug level; higher values mean more verbosity. */
128 /* Display resource usage at the end of each phase? */
129 static bool display_rusage
;
131 /* Background mode; higher values insert more pauses between scrub calls. */
132 unsigned int bg_mode
;
134 /* Maximum number of processors available to us. */
137 /* Number of threads we're allowed to use. */
138 unsigned int nr_threads
;
140 /* Verbosity; higher values print more information. */
143 /* Should we scrub the data blocks? */
144 static bool scrub_data
;
146 /* Size of a memory page. */
/*
 * Status bits.  The code below ORs the applicable flags into `ret`
 * (which starts out as SCRUB_RET_SUCCESS), so more than one condition can
 * be recorded at once; the usage message path passes SCRUB_RET_SYNTAX
 * straight to exit().
 */
149 #define SCRUB_RET_SUCCESS (0) /* no problems left behind */
150 #define SCRUB_RET_CORRUPT (1) /* corruption remains on fs */
151 #define SCRUB_RET_UNOPTIMIZED (2) /* fs could be optimized */
152 #define SCRUB_RET_OPERROR (4) /* operational problems */
153 #define SCRUB_RET_SYNTAX (8) /* cmdline args rejected */
155 static void __attribute__((noreturn
))
158 fprintf(stderr
, _("Usage: %s [OPTIONS] mountpoint | device\n"), progname
);
159 fprintf(stderr
, "\n");
160 fprintf(stderr
, _("Options:\n"));
161 fprintf(stderr
, _(" -a count Stop after this many errors are found.\n"));
162 fprintf(stderr
, _(" -b Background mode.\n"));
163 fprintf(stderr
, _(" -e behavior What to do if errors are found.\n"));
164 fprintf(stderr
, _(" -m path Path to /etc/mtab.\n"));
165 fprintf(stderr
, _(" -n Dry run. Do not modify anything.\n"));
166 fprintf(stderr
, _(" -T Display timing/usage information.\n"));
167 fprintf(stderr
, _(" -v Verbose output.\n"));
168 fprintf(stderr
, _(" -V Print version.\n"));
169 fprintf(stderr
, _(" -x Scrub file data too.\n"));
170 fprintf(stderr
, _(" -y Repair all errors.\n"));
172 exit(SCRUB_RET_SYNTAX
);
176 # define RUSAGE_BOTH (-2)
179 /* Get resource usage for ourselves and all children. */
182 struct rusage
*usage
)
184 struct rusage cusage
;
187 err
= getrusage(RUSAGE_BOTH
, usage
);
191 err
= getrusage(RUSAGE_SELF
, usage
);
195 err
= getrusage(RUSAGE_CHILDREN
, &cusage
);
199 usage
->ru_minflt
+= cusage
.ru_minflt
;
200 usage
->ru_majflt
+= cusage
.ru_majflt
;
201 usage
->ru_nswap
+= cusage
.ru_nswap
;
202 usage
->ru_inblock
+= cusage
.ru_inblock
;
203 usage
->ru_oublock
+= cusage
.ru_oublock
;
204 usage
->ru_msgsnd
+= cusage
.ru_msgsnd
;
205 usage
->ru_msgrcv
+= cusage
.ru_msgrcv
;
206 usage
->ru_nsignals
+= cusage
.ru_nsignals
;
207 usage
->ru_nvcsw
+= cusage
.ru_nvcsw
;
208 usage
->ru_nivcsw
+= cusage
.ru_nivcsw
;
213 * Scrub Phase Dispatch
215 * The operations of the scrub program are split up into several
216 * different phases. Each phase builds upon the metadata checked in the
217 * previous phase, which is to say that we may skip phase (X + 1) if our
218 * scans in phase (X) reveal corruption. A phase may be skipped entirely.
222 /* Resource usage for each phase. */
223 struct phase_rusage
{
226 unsigned long long verified_bytes
;
231 /* Operations for each phase. */
/*
 * Sentinel values for a phase's ->fn pointer.  The phase table assigns
 * them to the "Verify data file integrity." and "Defer filesystem
 * repairs." entries, and the phase loop compares sp->fn against them
 * when deciding whether those phases should be skipped.
 */
232 #define DATASCAN_DUMMY_FN ((void *)1)
233 #define REPAIR_DUMMY_FN ((void *)2)
236 bool (*fn
)(struct scrub_ctx
*);
240 /* Start tracking resource usage for a phase. */
243 struct phase_rusage
*pi
,
249 memset(pi
, 0, sizeof(*pi
));
250 error
= scrub_getrusage(&pi
->ruse
);
252 perror(_("getrusage"));
255 pi
->brk_start
= sbrk(0);
257 error
= gettimeofday(&pi
->time
, NULL
);
259 perror(_("gettimeofday"));
264 if ((verbose
|| display_rusage
) && descr
) {
265 fprintf(stdout
, _("Phase %u: %s\n"), phase
, descr
);
271 /* Report usage stats. */
274 struct phase_rusage
*pi
,
277 struct rusage ruse_now
;
279 struct mallinfo mall_now
;
281 struct timeval time_now
;
282 char phasebuf
[DESCR_BUFSZ
];
284 unsigned long long in
, out
;
285 unsigned long long io
;
287 double din
, dout
, dtot
;
288 char *iu
, *ou
, *tu
, *dinu
, *doutu
, *dtotu
;
294 error
= gettimeofday(&time_now
, NULL
);
296 perror(_("gettimeofday"));
299 dt
= timeval_subtract(&time_now
, &pi
->time
);
301 error
= scrub_getrusage(&ruse_now
);
303 perror(_("getrusage"));
308 snprintf(phasebuf
, DESCR_BUFSZ
, _("Phase %u: "), phase
);
/* Divide x by 1024, rounding up; used to print the memory figures in "k". */
312 #define kbytes(x) (((unsigned long)(x) + 1023) / 1024)
315 mall_now
= mallinfo();
316 fprintf(stdout
, _("%sMemory used: %luk/%luk (%luk/%luk), "),
318 kbytes(mall_now
.arena
), kbytes(mall_now
.hblkhd
),
319 kbytes(mall_now
.uordblks
), kbytes(mall_now
.fordblks
));
321 fprintf(stdout
, _("%sMemory used: %luk, "),
323 (unsigned long) kbytes(((char *) sbrk(0)) -
324 ((char *) pi
->brk_start
)));
328 fprintf(stdout
, _("time: %5.2f/%5.2f/%5.2fs\n"),
329 timeval_subtract(&time_now
, &pi
->time
),
330 timeval_subtract(&ruse_now
.ru_utime
, &pi
->ruse
.ru_utime
),
331 timeval_subtract(&ruse_now
.ru_stime
, &pi
->ruse
.ru_stime
));
334 in
= ((unsigned long long)ruse_now
.ru_inblock
-
335 pi
->ruse
.ru_inblock
) << BBSHIFT
;
336 out
= ((unsigned long long)ruse_now
.ru_oublock
-
337 pi
->ruse
.ru_oublock
) << BBSHIFT
;
340 i
= auto_space_units(in
, &iu
);
341 o
= auto_space_units(out
, &ou
);
342 t
= auto_space_units(io
, &tu
);
343 din
= auto_space_units(in
/ dt
, &dinu
);
344 dout
= auto_space_units(out
/ dt
, &doutu
);
345 dtot
= auto_space_units(io
/ dt
, &dtotu
);
347 _("%sI/O: %.1f%s in, %.1f%s out, %.1f%s tot\n"),
348 phasebuf
, i
, iu
, o
, ou
, t
, tu
);
350 _("%sI/O rate: %.1f%s/s in, %.1f%s/s out, %.1f%s/s tot\n"),
351 phasebuf
, din
, dinu
, dout
, doutu
, dtot
, dtotu
);
358 /* Run all the phases of the scrubber. */
361 struct scrub_ctx
*ctx
)
363 struct phase_ops phases
[] =
366 .descr
= _("Find filesystem geometry."),
371 .descr
= _("Check internal metadata."),
372 .fn
= xfs_scan_metadata
,
375 .descr
= _("Scan all inodes."),
376 .fn
= xfs_scan_inodes
,
379 .descr
= _("Defer filesystem repairs."),
380 .fn
= REPAIR_DUMMY_FN
,
383 .descr
= _("Check directory tree."),
384 .fn
= xfs_scan_connections
,
387 .descr
= _("Verify data file integrity."),
388 .fn
= DATASCAN_DUMMY_FN
,
391 .descr
= _("Check summary counters."),
397 struct phase_rusage pi
;
398 struct phase_ops
*sp
;
400 unsigned int debug_phase
= 0;
403 if (debug
&& debug_tweak_on("XFS_SCRUB_PHASE"))
404 debug_phase
= atoi(getenv("XFS_SCRUB_PHASE"));
406 /* Run all phases of the scrub tool. */
407 for (phase
= 1, sp
= phases
; sp
->fn
; sp
++, phase
++) {
408 /* Skip certain phases unless they're turned on. */
409 if (sp
->fn
== REPAIR_DUMMY_FN
||
410 sp
->fn
== DATASCAN_DUMMY_FN
)
413 /* Allow debug users to force a particular phase. */
414 if (debug_phase
&& phase
!= debug_phase
&& !sp
->must_run
)
417 /* Run this phase. */
418 moveon
= phase_start(&pi
, phase
, sp
->descr
);
421 moveon
= sp
->fn(ctx
);
423 str_info(ctx
, ctx
->mntpoint
,
424 _("Scrub aborted after phase %d."),
428 moveon
= phase_end(&pi
, phase
);
432 /* Too many errors? */
433 moveon
= !xfs_scrub_excessive_errors(ctx
);
446 struct scrub_ctx ctx
= {0};
447 struct phase_rusage all_pi
;
449 char *repairstr
= "";
450 unsigned long long total_errors
;
454 int ret
= SCRUB_RET_SUCCESS
;
456 fprintf(stdout
, "EXPERIMENTAL xfs_scrub program in use! Use at your own risk!\n");
457 return SCRUB_RET_OPERROR
;
459 progname
= basename(argv
[0]);
460 setlocale(LC_ALL
, "");
461 bindtextdomain(PACKAGE
, LOCALEDIR
);
464 pthread_mutex_init(&ctx
.lock
, NULL
);
465 ctx
.mode
= SCRUB_MODE_DEFAULT
;
466 ctx
.error_action
= ERRORS_CONTINUE
;
467 while ((c
= getopt(argc
, argv
, "a:bde:m:nTvxVy")) != EOF
) {
470 ctx
.max_errors
= cvt_u64(optarg
, 10);
484 if (!strcmp("continue", optarg
))
485 ctx
.error_action
= ERRORS_CONTINUE
;
486 else if (!strcmp("shutdown", optarg
))
487 ctx
.error_action
= ERRORS_SHUTDOWN
;
490 _("Unknown error behavior \"%s\".\n"),
499 if (ctx
.mode
!= SCRUB_MODE_DEFAULT
) {
501 _("Only one of the options -n or -y may be specified.\n"));
504 ctx
.mode
= SCRUB_MODE_DRY_RUN
;
507 display_rusage
= true;
513 fprintf(stdout
, _("%s version %s\n"), progname
,
516 return SCRUB_RET_SUCCESS
;
521 if (ctx
.mode
!= SCRUB_MODE_DEFAULT
) {
523 _("Only one of the options -n or -y may be specified.\n"));
526 ctx
.mode
= SCRUB_MODE_REPAIR
;
535 /* Override the thread count if the XFS_SCRUB_THREADS debug tweak is set. */
536 if (debug_tweak_on("XFS_SCRUB_THREADS")) {
539 x
= cvt_u32(getenv("XFS_SCRUB_THREADS"), 10);
541 perror("nr_threads");
547 if (optind
!= argc
- 1)
550 ctx
.mntpoint
= strdup(argv
[optind
]);
552 /* Find the mount record for the passed-in argument. */
553 if (stat(argv
[optind
], &ctx
.mnt_sb
) < 0) {
555 _("%s: could not stat: %s: %s\n"),
556 progname
, argv
[optind
], strerror(errno
));
557 ctx
.runtime_errors
++;
562 * If the user did not specify an explicit mount table, try to use
563 * /proc/mounts if it is available, else /etc/mtab. We prefer
564 * /proc/mounts because it is kernel controlled, while /etc/mtab
565 * may contain garbage that userspace tools like pam_mounts wrote into it.
569 if (access(_PATH_PROC_MOUNTS
, R_OK
) == 0)
570 mtab
= _PATH_PROC_MOUNTS
;
572 mtab
= _PATH_MOUNTED
;
575 /* Initialize overall phase stats. */
576 moveon
= phase_start(&all_pi
, 0, NULL
);
580 ismnt
= find_mountpoint(mtab
, &ctx
);
583 _("%s: Not a XFS mount point or block device.\n"),
585 ret
|= SCRUB_RET_SYNTAX
;
590 nproc
= sysconf(_SC_NPROCESSORS_ONLN
);
594 /* Set up a page-aligned buffer for read verification. */
595 page_size
= sysconf(_SC_PAGESIZE
);
597 str_errno(&ctx
, ctx
.mntpoint
);
601 if (debug_tweak_on("XFS_SCRUB_FORCE_REPAIR"))
602 ctx
.mode
= SCRUB_MODE_REPAIR
;
604 /* Scrub a filesystem. */
605 moveon
= run_scrub_phases(&ctx
);
606 if (!moveon
&& ctx
.runtime_errors
== 0)
607 ctx
.runtime_errors
++;
610 * Excessive errors will cause the scrub phases to bail out early.
611 * We don't want every thread yelling that into the output, so check
612 * if we hit the threshold and tell the user *once*.
614 if (xfs_scrub_excessive_errors(&ctx
))
615 str_info(&ctx
, ctx
.mntpoint
, _("Too many errors; aborting."));
617 if (debug_tweak_on("XFS_SCRUB_FORCE_ERROR"))
618 str_error(&ctx
, ctx
.mntpoint
, _("Injecting error."));
620 /* Clean up scan data. */
621 moveon
= xfs_cleanup_fs(&ctx
);
622 if (!moveon
&& ctx
.runtime_errors
== 0)
623 ctx
.runtime_errors
++;
626 total_errors
= ctx
.errors_found
+ ctx
.runtime_errors
;
628 repairstr
= _(" Unmount and run xfs_repair.");
629 if (total_errors
&& ctx
.warnings_found
)
631 _("%s: %llu errors and %llu warnings found.%s\n"),
632 ctx
.mntpoint
, total_errors
, ctx
.warnings_found
,
634 else if (total_errors
&& ctx
.warnings_found
== 0)
636 _("%s: %llu errors found.%s\n"),
637 ctx
.mntpoint
, total_errors
, repairstr
);
638 else if (total_errors
== 0 && ctx
.warnings_found
)
640 _("%s: %llu warnings found.\n"),
641 ctx
.mntpoint
, ctx
.warnings_found
);
642 if (ctx
.errors_found
) {
643 if (ctx
.error_action
== ERRORS_SHUTDOWN
)
644 xfs_shutdown_fs(&ctx
);
645 ret
|= SCRUB_RET_CORRUPT
;
647 if (ctx
.warnings_found
)
648 ret
|= SCRUB_RET_UNOPTIMIZED
;
649 if (ctx
.runtime_errors
)
650 ret
|= SCRUB_RET_OPERROR
;
651 phase_end(&all_pi
, 0);