]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blob - scrub/xfs_scrub.c
xfs_scrub: warn about normalized Unicode name collisions
[thirdparty/xfsprogs-dev.git] / scrub / xfs_scrub.c
1 /*
2 * Copyright (C) 2018 Oracle. All Rights Reserved.
3 *
4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it would be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
19 */
20 #include <stdio.h>
21 #include <pthread.h>
22 #include <stdbool.h>
23 #include <stdlib.h>
24 #include <sys/time.h>
25 #include <sys/resource.h>
26 #include <sys/statvfs.h>
27 #include "platform_defs.h"
28 #include "xfs.h"
29 #include "xfs_fs.h"
30 #include "input.h"
31 #include "path.h"
32 #include "xfs_scrub.h"
33 #include "common.h"
34 #include "unicrash.h"
35
36 /*
37 * XFS Online Metadata Scrub (and Repair)
38 *
39 * The XFS scrubber uses custom XFS ioctls to probe more deeply into the
40 * internals of the filesystem. It takes advantage of scrubbing ioctls
41 * to check all the records stored in a metadata object and to
42 * cross-reference those records against the other filesystem metadata.
43 *
44 * After the program gathers command line arguments to figure out
45 * exactly what the program is going to do, scrub execution is split up
46 * into several separate phases:
47 *
48 * The "find geometry" phase queries XFS for the filesystem geometry.
49 * The block devices for the data, realtime, and log devices are opened.
50 * Kernel ioctls are test-queried to see if they actually work (the scrub
51 * ioctl in particular), and any other filesystem-specific information
52 * is gathered.
53 *
54 * In the "check internal metadata" phase, we call the metadata scrub
55 * ioctl to check the filesystem's internal per-AG btrees. This
56 * includes the AG superblock, AGF, AGFL, and AGI headers, freespace
57 * btrees, the regular and free inode btrees, the reverse mapping
58 * btrees, and the reference counting btrees. If the realtime device is
59 * enabled, the realtime bitmap and reverse mapping btrees are checked.
60 * Quotas, if enabled, are also checked in this phase.
61 *
62 * Each AG (and the realtime device) has its metadata checked in a
63 * separate thread for better performance. Errors in the internal
64 * metadata can be fixed here prior to the inode scan; refer to the
65 * section about the "repair filesystem" phase for more information.
66 *
67 * The "scan all inodes" phase uses BULKSTAT to scan all the inodes in
68 * an AG in disk order. The BULKSTAT information provides enough
69 * information to construct a file handle that is used to check the
70 * following parts of every file:
71 *
72 * - The inode record
73 * - All three block forks (data, attr, CoW)
74 * - If it's a symlink, the symlink target.
75 * - If it's a directory, the directory entries.
76 * - All extended attributes
77 * - The parent pointer
78 *
 * Multiple threads are started to check the inodes of each AG in
80 * parallel. Errors in file metadata can be fixed here; see the section
81 * about the "repair filesystem" phase for more information.
82 *
83 * Next comes the (configurable) "repair filesystem" phase. The user
84 * can instruct this program to fix all problems encountered; to fix
85 * only optimality problems and leave the corruptions; or not to touch
86 * the filesystem at all. Any metadata repairs that did not succeed in
87 * the previous two phases are retried here; if there are uncorrectable
88 * errors, xfs_scrub stops here.
89 *
90 * The next phase is the "check directory tree" phase. In this phase,
91 * every directory is opened (via file handle) to confirm that each
92 * directory is connected to the root. Directory entries are checked
93 * for ambiguous Unicode normalization mappings, which is to say that we
94 * look for pairs of entries whose utf-8 strings normalize to the same
95 * code point sequence and map to different inodes, because that could
96 * be used to trick a user into opening the wrong file. The names of
97 * extended attributes are checked for Unicode normalization collisions.
98 *
99 * In the "verify data file integrity" phase, we employ GETFSMAP to read
100 * the reverse-mappings of all AGs and issue direct-reads of the
101 * underlying disk blocks. We rely on the underlying storage to have
102 * checksummed the data blocks appropriately. Multiple threads are
103 * started to check each AG in parallel; a separate thread pool is used
104 * to handle the direct reads.
105 *
106 * In the "check summary counters" phase, use GETFSMAP to tally up the
107 * blocks and BULKSTAT to tally up the inodes we saw and compare that to
108 * the statfs output. This gives the user a rough estimate of how
109 * thorough the scrub was.
110 */
111
112 /*
113 * Known debug tweaks (pass -d and set the environment variable):
114 * XFS_SCRUB_FORCE_ERROR -- pretend all metadata is corrupt
115 * XFS_SCRUB_FORCE_REPAIR -- repair all metadata even if it's ok
116 * XFS_SCRUB_NO_KERNEL -- pretend there is no kernel ioctl
117 * XFS_SCRUB_NO_SCSI_VERIFY -- disable SCSI VERIFY (if present)
118 * XFS_SCRUB_PHASE -- run only this scrub phase
119 * XFS_SCRUB_THREADS -- start exactly this number of threads
120 */
121
/* Name this program reports itself as; libfrog error reports need it. */
char				*progname = "xfs_scrub";

/* Debugging level; each increment asks for more verbosity. */
unsigned int			debug;

/* When set, resource usage is printed at the end of each phase. */
static bool			display_rusage;

/* Background mode level; larger values add more pauses between scrub calls. */
unsigned int			bg_mode;

/* Upper bound on the number of processors we may use. */
int				nproc;

/* How many threads this program is permitted to start. */
unsigned int			nr_threads;

/* When set, more information is printed. */
bool				verbose;

/* When set, file data blocks are scrubbed as well. */
static bool			scrub_data;

/* Size, in bytes, of one memory page. */
long				page_size;
148
/*
 * Process exit status.  Apart from SCRUB_RET_SUCCESS, each value is a
 * distinct bit so that main() can OR several conditions together.
 */
#define SCRUB_RET_SUCCESS	(0)		/* no problems left behind */
#define SCRUB_RET_CORRUPT	(1 << 0)	/* corruption remains on fs */
#define SCRUB_RET_UNOPTIMIZED	(1 << 1)	/* fs could be optimized */
#define SCRUB_RET_OPERROR	(1 << 2)	/* operational problems */
#define SCRUB_RET_SYNTAX	(1 << 3)	/* cmdline args rejected */
154
155 static void __attribute__((noreturn))
156 usage(void)
157 {
158 fprintf(stderr, _("Usage: %s [OPTIONS] mountpoint | device\n"), progname);
159 fprintf(stderr, "\n");
160 fprintf(stderr, _("Options:\n"));
161 fprintf(stderr, _(" -a count Stop after this many errors are found.\n"));
162 fprintf(stderr, _(" -b Background mode.\n"));
163 fprintf(stderr, _(" -e behavior What to do if errors are found.\n"));
164 fprintf(stderr, _(" -m path Path to /etc/mtab.\n"));
165 fprintf(stderr, _(" -n Dry run. Do not modify anything.\n"));
166 fprintf(stderr, _(" -T Display timing/usage information.\n"));
167 fprintf(stderr, _(" -v Verbose output.\n"));
168 fprintf(stderr, _(" -V Print version.\n"));
169 fprintf(stderr, _(" -x Scrub file data too.\n"));
170 fprintf(stderr, _(" -y Repair all errors.\n"));
171
172 exit(SCRUB_RET_SYNTAX);
173 }
174
#ifndef RUSAGE_BOTH
# define RUSAGE_BOTH		(-2)
#endif

/*
 * Collect resource usage for this process together with its children.
 *
 * A single RUSAGE_BOTH query is attempted first and its answer is used if
 * the call succeeds.  Otherwise the RUSAGE_SELF figures are fetched into
 * @usage and the page fault, swap, block I/O, message, signal and context
 * switch counters reported for RUSAGE_CHILDREN are added on top.  In that
 * fallback path all other fields of @usage keep their RUSAGE_SELF values.
 *
 * Returns 0 on success, or the nonzero getrusage() return value on failure.
 */
static int
scrub_getrusage(
	struct rusage		*usage)
{
	struct rusage		kids;
	int			ret;

	/* Nothing else to do if the combined query works. */
	if (getrusage(RUSAGE_BOTH, usage) == 0)
		return 0;

	ret = getrusage(RUSAGE_SELF, usage);
	if (ret != 0)
		return ret;

	ret = getrusage(RUSAGE_CHILDREN, &kids);
	if (ret != 0)
		return ret;

	/* Fold the children's counters into our own. */
	usage->ru_minflt	+= kids.ru_minflt;
	usage->ru_majflt	+= kids.ru_majflt;
	usage->ru_nswap		+= kids.ru_nswap;
	usage->ru_inblock	+= kids.ru_inblock;
	usage->ru_oublock	+= kids.ru_oublock;
	usage->ru_msgsnd	+= kids.ru_msgsnd;
	usage->ru_msgrcv	+= kids.ru_msgrcv;
	usage->ru_nsignals	+= kids.ru_nsignals;
	usage->ru_nvcsw		+= kids.ru_nvcsw;
	usage->ru_nivcsw	+= kids.ru_nivcsw;

	return 0;
}
211
212 /*
213 * Scrub Phase Dispatch
214 *
215 * The operations of the scrub program are split up into several
216 * different phases. Each phase builds upon the metadata checked in the
217 * previous phase, which is to say that we may skip phase (X + 1) if our
218 * scans in phase (X) reveal corruption. A phase may be skipped
219 * entirely.
220 */
221
/*
 * Snapshot taken by phase_start() and compared against the current state
 * by phase_end() to report what a phase consumed.
 */
struct phase_rusage {
	struct rusage		ruse;		/* getrusage data at start */
	struct timeval		time;		/* wall clock time at start */
	unsigned long long	verified_bytes;
	void			*brk_start;	/* sbrk(0) at start */
	const char		*descr;		/* phase description, or NULL */
};
230
/*
 * Description of one scrub phase.
 *
 * The two *_DUMMY_FN values are sentinels stored in ->fn in place of a
 * real function; run_scrub_phases() recognises them and skips the phase.
 */
#define DATASCAN_DUMMY_FN	((void *)1)
#define REPAIR_DUMMY_FN		((void *)2)
struct phase_ops {
	/* Human-readable name of the phase. */
	char		*descr;

	/* Phase body; returns true if the scrub may move on. */
	bool		(*fn)(struct scrub_ctx *);

	/* Run this phase even when a debug user selected another one. */
	bool		must_run;
};
239
240 /* Start tracking resource usage for a phase. */
241 static bool
242 phase_start(
243 struct phase_rusage *pi,
244 unsigned int phase,
245 const char *descr)
246 {
247 int error;
248
249 memset(pi, 0, sizeof(*pi));
250 error = scrub_getrusage(&pi->ruse);
251 if (error) {
252 perror(_("getrusage"));
253 return false;
254 }
255 pi->brk_start = sbrk(0);
256
257 error = gettimeofday(&pi->time, NULL);
258 if (error) {
259 perror(_("gettimeofday"));
260 return false;
261 }
262
263 pi->descr = descr;
264 if ((verbose || display_rusage) && descr) {
265 fprintf(stdout, _("Phase %u: %s\n"), phase, descr);
266 fflush(stdout);
267 }
268 return true;
269 }
270
271 /* Report usage stats. */
272 static bool
273 phase_end(
274 struct phase_rusage *pi,
275 unsigned int phase)
276 {
277 struct rusage ruse_now;
278 #ifdef HAVE_MALLINFO
279 struct mallinfo mall_now;
280 #endif
281 struct timeval time_now;
282 char phasebuf[DESCR_BUFSZ];
283 double dt;
284 unsigned long long in, out;
285 unsigned long long io;
286 double i, o, t;
287 double din, dout, dtot;
288 char *iu, *ou, *tu, *dinu, *doutu, *dtotu;
289 int error;
290
291 if (!display_rusage)
292 return true;
293
294 error = gettimeofday(&time_now, NULL);
295 if (error) {
296 perror(_("gettimeofday"));
297 return false;
298 }
299 dt = timeval_subtract(&time_now, &pi->time);
300
301 error = scrub_getrusage(&ruse_now);
302 if (error) {
303 perror(_("getrusage"));
304 return false;
305 }
306
307 if (phase)
308 snprintf(phasebuf, DESCR_BUFSZ, _("Phase %u: "), phase);
309 else
310 phasebuf[0] = 0;
311
312 #define kbytes(x) (((unsigned long)(x) + 1023) / 1024)
313 #ifdef HAVE_MALLINFO
314
315 mall_now = mallinfo();
316 fprintf(stdout, _("%sMemory used: %luk/%luk (%luk/%luk), "),
317 phasebuf,
318 kbytes(mall_now.arena), kbytes(mall_now.hblkhd),
319 kbytes(mall_now.uordblks), kbytes(mall_now.fordblks));
320 #else
321 fprintf(stdout, _("%sMemory used: %luk, "),
322 phasebuf,
323 (unsigned long) kbytes(((char *) sbrk(0)) -
324 ((char *) pi->brk_start)));
325 #endif
326 #undef kbytes
327
328 fprintf(stdout, _("time: %5.2f/%5.2f/%5.2fs\n"),
329 timeval_subtract(&time_now, &pi->time),
330 timeval_subtract(&ruse_now.ru_utime, &pi->ruse.ru_utime),
331 timeval_subtract(&ruse_now.ru_stime, &pi->ruse.ru_stime));
332
333 /* I/O usage */
334 in = ((unsigned long long)ruse_now.ru_inblock -
335 pi->ruse.ru_inblock) << BBSHIFT;
336 out = ((unsigned long long)ruse_now.ru_oublock -
337 pi->ruse.ru_oublock) << BBSHIFT;
338 io = in + out;
339 if (io) {
340 i = auto_space_units(in, &iu);
341 o = auto_space_units(out, &ou);
342 t = auto_space_units(io, &tu);
343 din = auto_space_units(in / dt, &dinu);
344 dout = auto_space_units(out / dt, &doutu);
345 dtot = auto_space_units(io / dt, &dtotu);
346 fprintf(stdout,
347 _("%sI/O: %.1f%s in, %.1f%s out, %.1f%s tot\n"),
348 phasebuf, i, iu, o, ou, t, tu);
349 fprintf(stdout,
350 _("%sI/O rate: %.1f%s/s in, %.1f%s/s out, %.1f%s/s tot\n"),
351 phasebuf, din, dinu, dout, doutu, dtot, dtotu);
352 }
353 fflush(stdout);
354
355 return true;
356 }
357
358 /* Run all the phases of the scrubber. */
359 static bool
360 run_scrub_phases(
361 struct scrub_ctx *ctx)
362 {
363 struct phase_ops phases[] =
364 {
365 {
366 .descr = _("Find filesystem geometry."),
367 .fn = xfs_setup_fs,
368 .must_run = true,
369 },
370 {
371 .descr = _("Check internal metadata."),
372 .fn = xfs_scan_metadata,
373 },
374 {
375 .descr = _("Scan all inodes."),
376 .fn = xfs_scan_inodes,
377 },
378 {
379 .descr = _("Defer filesystem repairs."),
380 .fn = REPAIR_DUMMY_FN,
381 },
382 {
383 .descr = _("Check directory tree."),
384 .fn = xfs_scan_connections,
385 },
386 {
387 .descr = _("Verify data file integrity."),
388 .fn = DATASCAN_DUMMY_FN,
389 },
390 {
391 .descr = _("Check summary counters."),
392 },
393 {
394 NULL
395 },
396 };
397 struct phase_rusage pi;
398 struct phase_ops *sp;
399 bool moveon = true;
400 unsigned int debug_phase = 0;
401 unsigned int phase;
402
403 if (debug && debug_tweak_on("XFS_SCRUB_PHASE"))
404 debug_phase = atoi(getenv("XFS_SCRUB_PHASE"));
405
406 /* Run all phases of the scrub tool. */
407 for (phase = 1, sp = phases; sp->fn; sp++, phase++) {
408 /* Skip certain phases unless they're turned on. */
409 if (sp->fn == REPAIR_DUMMY_FN ||
410 sp->fn == DATASCAN_DUMMY_FN)
411 continue;
412
413 /* Allow debug users to force a particular phase. */
414 if (debug_phase && phase != debug_phase && !sp->must_run)
415 continue;
416
417 /* Run this phase. */
418 moveon = phase_start(&pi, phase, sp->descr);
419 if (!moveon)
420 break;
421 moveon = sp->fn(ctx);
422 if (!moveon) {
423 str_info(ctx, ctx->mntpoint,
424 _("Scrub aborted after phase %d."),
425 phase);
426 break;
427 }
428 moveon = phase_end(&pi, phase);
429 if (!moveon)
430 break;
431
432 /* Too many errors? */
433 moveon = !xfs_scrub_excessive_errors(ctx);
434 if (!moveon)
435 break;
436 }
437
438 return moveon;
439 }
440
441 int
442 main(
443 int argc,
444 char **argv)
445 {
446 struct scrub_ctx ctx = {0};
447 struct phase_rusage all_pi;
448 char *mtab = NULL;
449 char *repairstr = "";
450 unsigned long long total_errors;
451 bool moveon = true;
452 bool ismnt;
453 int c;
454 int ret = SCRUB_RET_SUCCESS;
455
456 fprintf(stdout, "EXPERIMENTAL xfs_scrub program in use! Use at your own risk!\n");
457 return SCRUB_RET_OPERROR;
458
459 progname = basename(argv[0]);
460 setlocale(LC_ALL, "");
461 bindtextdomain(PACKAGE, LOCALEDIR);
462 textdomain(PACKAGE);
463
464 pthread_mutex_init(&ctx.lock, NULL);
465 ctx.mode = SCRUB_MODE_DEFAULT;
466 ctx.error_action = ERRORS_CONTINUE;
467 while ((c = getopt(argc, argv, "a:bde:m:nTvxVy")) != EOF) {
468 switch (c) {
469 case 'a':
470 ctx.max_errors = cvt_u64(optarg, 10);
471 if (errno) {
472 perror(optarg);
473 usage();
474 }
475 break;
476 case 'b':
477 nr_threads = 1;
478 bg_mode++;
479 break;
480 case 'd':
481 debug++;
482 break;
483 case 'e':
484 if (!strcmp("continue", optarg))
485 ctx.error_action = ERRORS_CONTINUE;
486 else if (!strcmp("shutdown", optarg))
487 ctx.error_action = ERRORS_SHUTDOWN;
488 else {
489 fprintf(stderr,
490 _("Unknown error behavior \"%s\".\n"),
491 optarg);
492 usage();
493 }
494 break;
495 case 'm':
496 mtab = optarg;
497 break;
498 case 'n':
499 if (ctx.mode != SCRUB_MODE_DEFAULT) {
500 fprintf(stderr,
501 _("Only one of the options -n or -y may be specified.\n"));
502 usage();
503 }
504 ctx.mode = SCRUB_MODE_DRY_RUN;
505 break;
506 case 'T':
507 display_rusage = true;
508 break;
509 case 'v':
510 verbose = true;
511 break;
512 case 'V':
513 fprintf(stdout, _("%s version %s\n"), progname,
514 VERSION);
515 fflush(stdout);
516 return SCRUB_RET_SUCCESS;
517 case 'x':
518 scrub_data = true;
519 break;
520 case 'y':
521 if (ctx.mode != SCRUB_MODE_DEFAULT) {
522 fprintf(stderr,
523 _("Only one of the options -n or -y may be specified.\n"));
524 usage();
525 }
526 ctx.mode = SCRUB_MODE_REPAIR;
527 break;
528 case '?':
529 /* fall through */
530 default:
531 usage();
532 }
533 }
534
535 /* Override thread count if debugger */
536 if (debug_tweak_on("XFS_SCRUB_THREADS")) {
537 unsigned int x;
538
539 x = cvt_u32(getenv("XFS_SCRUB_THREADS"), 10);
540 if (errno) {
541 perror("nr_threads");
542 usage();
543 }
544 nr_threads = x;
545 }
546
547 if (optind != argc - 1)
548 usage();
549
550 ctx.mntpoint = strdup(argv[optind]);
551
552 /* Find the mount record for the passed-in argument. */
553 if (stat(argv[optind], &ctx.mnt_sb) < 0) {
554 fprintf(stderr,
555 _("%s: could not stat: %s: %s\n"),
556 progname, argv[optind], strerror(errno));
557 ctx.runtime_errors++;
558 goto out;
559 }
560
561 /*
562 * If the user did not specify an explicit mount table, try to use
563 * /proc/mounts if it is available, else /etc/mtab. We prefer
564 * /proc/mounts because it is kernel controlled, while /etc/mtab
565 * may contain garbage that userspace tools like pam_mounts wrote
566 * into it.
567 */
568 if (!mtab) {
569 if (access(_PATH_PROC_MOUNTS, R_OK) == 0)
570 mtab = _PATH_PROC_MOUNTS;
571 else
572 mtab = _PATH_MOUNTED;
573 }
574
575 /* Initialize overall phase stats. */
576 moveon = phase_start(&all_pi, 0, NULL);
577 if (!moveon)
578 goto out;
579
580 ismnt = find_mountpoint(mtab, &ctx);
581 if (!ismnt) {
582 fprintf(stderr,
583 _("%s: Not a XFS mount point or block device.\n"),
584 ctx.mntpoint);
585 ret |= SCRUB_RET_SYNTAX;
586 goto out;
587 }
588
589 /* How many CPUs? */
590 nproc = sysconf(_SC_NPROCESSORS_ONLN);
591 if (nproc < 1)
592 nproc = 1;
593
594 /* Set up a page-aligned buffer for read verification. */
595 page_size = sysconf(_SC_PAGESIZE);
596 if (page_size < 0) {
597 str_errno(&ctx, ctx.mntpoint);
598 goto out;
599 }
600
601 if (debug_tweak_on("XFS_SCRUB_FORCE_REPAIR"))
602 ctx.mode = SCRUB_MODE_REPAIR;
603
604 /* Scrub a filesystem. */
605 moveon = run_scrub_phases(&ctx);
606 if (!moveon && ctx.runtime_errors == 0)
607 ctx.runtime_errors++;
608
609 /*
610 * Excessive errors will cause the scrub phases to bail out early.
611 * We don't want every thread yelling that into the output, so check
612 * if we hit the threshold and tell the user *once*.
613 */
614 if (xfs_scrub_excessive_errors(&ctx))
615 str_info(&ctx, ctx.mntpoint, _("Too many errors; aborting."));
616
617 if (debug_tweak_on("XFS_SCRUB_FORCE_ERROR"))
618 str_error(&ctx, ctx.mntpoint, _("Injecting error."));
619
620 /* Clean up scan data. */
621 moveon = xfs_cleanup_fs(&ctx);
622 if (!moveon && ctx.runtime_errors == 0)
623 ctx.runtime_errors++;
624
625 out:
626 total_errors = ctx.errors_found + ctx.runtime_errors;
627 if (ctx.need_repair)
628 repairstr = _(" Unmount and run xfs_repair.");
629 if (total_errors && ctx.warnings_found)
630 fprintf(stderr,
631 _("%s: %llu errors and %llu warnings found.%s\n"),
632 ctx.mntpoint, total_errors, ctx.warnings_found,
633 repairstr);
634 else if (total_errors && ctx.warnings_found == 0)
635 fprintf(stderr,
636 _("%s: %llu errors found.%s\n"),
637 ctx.mntpoint, total_errors, repairstr);
638 else if (total_errors == 0 && ctx.warnings_found)
639 fprintf(stderr,
640 _("%s: %llu warnings found.\n"),
641 ctx.mntpoint, ctx.warnings_found);
642 if (ctx.errors_found) {
643 if (ctx.error_action == ERRORS_SHUTDOWN)
644 xfs_shutdown_fs(&ctx);
645 ret |= SCRUB_RET_CORRUPT;
646 }
647 if (ctx.warnings_found)
648 ret |= SCRUB_RET_UNOPTIMIZED;
649 if (ctx.runtime_errors)
650 ret |= SCRUB_RET_OPERROR;
651 phase_end(&all_pi, 0);
652 free(ctx.blkdev);
653 free(ctx.mntpoint);
654
655 return ret;
656 }