repair/xfs_repair.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   4  * All Rights Reserved.
   5  */
   6
   7 #include "libxfs.h"
   8 #include "libxlog.h"
   9 #include <sys/resource.h>
  10 #include "xfs_multidisk.h"
  11 #include "avl.h"
  12 #include "libfrog/avl64.h"
  13 #include "globals.h"
  14 #include "versions.h"
  15 #include "agheader.h"
  16 #include "protos.h"
  17 #include "incore.h"
  18 #include "err_protos.h"
  19 #include "prefetch.h"
  20 #include "threads.h"
  21 #include "progress.h"
  22 #include "dinode.h"
  23 #include "slab.h"
  24 #include "rmap.h"
  25 #include "libfrog/fsgeom.h"
  26
  27 /*
  28  * option tables for getsubopt calls
  29  */
  30
  31 /*
  32  * -o: user-supplied override options
  33  */
  34 enum o_opt_nums {
  35         ASSUME_XFS = 0,
  36         IHASH_SIZE,
  37         BHASH_SIZE,
  38         AG_STRIDE,
  39         FORCE_GEO,
  40         PHASE2_THREADS,
  41         O_MAX_OPTS,
  42 };
  43
  44 static char *o_opts[] = {
  45         [ASSUME_XFS]            = "assume_xfs",
  46         [IHASH_SIZE]            = "ihash",
  47         [BHASH_SIZE]            = "bhash",
  48         [AG_STRIDE]             = "ag_stride",
  49         [FORCE_GEO]             = "force_geometry",
  50         [PHASE2_THREADS]        = "phase2_threads",
  51         [O_MAX_OPTS]            = NULL,
  52 };
  53
  54 /*
  55  * -c: conversion options
  56  */
  57 enum c_opt_nums {
  58         CONVERT_LAZY_COUNT = 0,
  59         C_MAX_OPTS,
  60 };
  61
  62 static char *c_opts[] = {
  63         [CONVERT_LAZY_COUNT]    = "lazycount",
  64         [C_MAX_OPTS]            = NULL,
  65 };
  66
  67
  68 static int      bhash_option_used;
  69 static long     max_mem_specified;      /* in megabytes */
  70 static int      phase2_threads = 32;
  71 static bool     report_corrected;
  72
  73 static void
  74 usage(void)
  75 {
  76         do_warn(_(
  77 "Usage: %s [options] device\n"
  78 "\n"
  79 "Options:\n"
  80 "  -f           The device is a file\n"
  81 "  -L           Force log zeroing. Do this as a last resort.\n"
  82 "  -l logdev    Specifies the device where the external log resides.\n"
  83 "  -m maxmem    Maximum amount of memory to be used in megabytes.\n"
  84 "  -n           No modify mode, just checks the filesystem for damage.\n"
  85 "               (Cannot be used together with -e.)\n"
  86 "  -P           Disables prefetching.\n"
  87 "  -r rtdev     Specifies the device where the realtime section resides.\n"
  88 "  -v           Verbose output.\n"
  89 "  -c subopts   Change filesystem parameters - use xfs_admin.\n"
  90 "  -o subopts   Override default behaviour, refer to man page.\n"
  91 "  -t interval  Reporting interval in seconds.\n"
  92 "  -d           Repair dangerously.\n"
  93 "  -e           Exit with a non-zero code if any errors were repaired.\n"
  94 "               (Cannot be used together with -n.)\n"
  95 "  -V           Reports version and exits.\n"), progname);
  96         exit(1);
  97 }
  98
  99 char *
 100 err_string(int err_code)
 101 {
 102         static char *err_message[XR_BAD_ERR_CODE];
 103         static int done;
 104
 105         if (!done) {
 106                 err_message[XR_OK] = _("no error");
 107                 err_message[XR_BAD_MAGIC] = _("bad magic number");
 108                 err_message[XR_BAD_BLOCKSIZE] = _("bad blocksize field");
 109                 err_message[XR_BAD_BLOCKLOG] = _("bad blocksize log field");
 110                 err_message[XR_BAD_VERSION] = _("bad or unsupported version");
 111                 err_message[XR_BAD_INPROGRESS] =
 112                         _("filesystem mkfs-in-progress bit set");
 113                 err_message[XR_BAD_FS_SIZE_DATA] =
 114                         _("inconsistent filesystem geometry information");
 115                 err_message[XR_BAD_INO_SIZE_DATA] =
 116         _("bad inode size or inconsistent with number of inodes/block"),
 117                 err_message[XR_BAD_SECT_SIZE_DATA] = _("bad sector size");
 118                 err_message[XR_AGF_GEO_MISMATCH] =
 119         _("AGF geometry info conflicts with filesystem geometry");
 120                 err_message[XR_AGI_GEO_MISMATCH] =
 121         _("AGI geometry info conflicts with filesystem geometry");
 122                 err_message[XR_SB_GEO_MISMATCH] =
 123         _("AG superblock geometry info conflicts with filesystem geometry");
 124                 err_message[XR_EOF] = _("attempted to perform I/O beyond EOF");
 125                 err_message[XR_BAD_RT_GEO_DATA] =
 126         _("inconsistent filesystem geometry in realtime filesystem component");
 127                 err_message[XR_BAD_INO_MAX_PCT] =
 128                         _("maximum indicated percentage of inodes > 100%");
 129                 err_message[XR_BAD_INO_ALIGN] =
 130                         _("inconsistent inode alignment value");
 131                 err_message[XR_INSUFF_SEC_SB] =
 132         _("not enough secondary superblocks with matching geometry");
 133                 err_message[XR_BAD_SB_UNIT] =
 134                         _("bad stripe unit in superblock");
 135                 err_message[XR_BAD_SB_WIDTH] =
 136                         _("bad stripe width in superblock");
 137                 err_message[XR_BAD_SVN] =
 138                         _("bad shared version number in superblock");
 139                 err_message[XR_BAD_CRC] =
 140                         _("bad CRC in superblock");
 141                 err_message[XR_BAD_DIR_SIZE_DATA] =
 142                         _("inconsistent directory geometry information");
 143                 err_message[XR_BAD_LOG_GEOMETRY] =
 144                         _("inconsistent log geometry information");
 145                 done = 1;
 146         }
 147
 148         if (err_code < XR_OK || err_code >= XR_BAD_ERR_CODE)
 149                 do_abort(_("bad error code - %d\n"), err_code);
 150
 151         return(err_message[err_code]);
 152 }
 153
 154 static void
 155 noval(char opt, char *tbl[], int idx)
 156 {
 157         do_warn(_("-%c %s option cannot have a value\n"), opt, tbl[idx]);
 158         usage();
 159 }
 160
 161 static void
 162 respec(char opt, char *tbl[], int idx)
 163 {
 164         do_warn("-%c ", opt);
 165         if (tbl)
 166                 do_warn("%s ", tbl[idx]);
 167         do_warn(_("option respecified\n"));
 168         usage();
 169 }
 170
 171 static void
 172 unknown(char opt, char *s)
 173 {
 174         do_warn(_("unknown option -%c %s\n"), opt, s);
 175         usage();
 176 }
 177
 178 /*
 179  * sets only the global argument flags and variables
 180  */
 181 static void
 182 process_args(int argc, char **argv)
 183 {
 184         char *p;
 185         int c;
 186
 187         log_spec = 0;
 188         fs_is_dirty = 0;
 189         verbose = 0;
 190         no_modify = 0;
 191         dangerously = 0;
 192         isa_file = 0;
 193         zap_log = 0;
 194         dumpcore = 0;
 195         full_ino_ex_data = 0;
 196         force_geo = 0;
 197         assume_xfs = 0;
 198         copied_sunit = 0;
 199         sb_inoalignmt = 0;
 200         sb_unit = 0;
 201         sb_width = 0;
 202         ag_stride = 0;
 203         thread_count = 1;
 204         report_interval = PROG_RPT_DEFAULT;
 205         report_corrected = false;
 206
 207         /*
 208          * XXX have to add suboption processing here
 209          * attributes, quotas, nlinks, aligned_inos, sb_fbits
 210          */
 211         while ((c = getopt(argc, argv, "c:o:fl:m:r:LnDvVdPet:")) != EOF)  {
 212                 switch (c) {
 213                 case 'D':
 214                         dumpcore = 1;
 215                         break;
 216                 case 'o':
 217                         p = optarg;
 218                         while (*p != '\0')  {
 219                                 char *val;
 220
 221                                 switch (getsubopt(&p, o_opts, &val))  {
 222                                 case ASSUME_XFS:
 223                                         if (val)
 224                                                 noval('o', o_opts, ASSUME_XFS);
 225                                         if (assume_xfs)
 226                                                 respec('o', o_opts, ASSUME_XFS);
 227                                         assume_xfs = 1;
 228                                         break;
 229                                 case IHASH_SIZE:
 230                                         do_warn(
 231                 _("-o ihash option has been removed and will be ignored\n"));
 232                                         break;
 233                                 case BHASH_SIZE:
 234                                         if (max_mem_specified)
 235                                                 do_abort(
 236                 _("-o bhash option cannot be used with -m option\n"));
 237                                         if (!val)
 238                                                 do_abort(
 239                 _("-o bhash requires a parameter\n"));
 240                                         libxfs_bhash_size = (int)strtol(val, NULL, 0);
 241                                         bhash_option_used = 1;
 242                                         break;
 243                                 case AG_STRIDE:
 244                                         if (!val)
 245                                                 do_abort(
 246                 _("-o ag_stride requires a parameter\n"));
 247                                         ag_stride = (int)strtol(val, NULL, 0);
 248                                         break;
 249                                 case FORCE_GEO:
 250                                         if (val)
 251                                                 noval('o', o_opts, FORCE_GEO);
 252                                         if (force_geo)
 253                                                 respec('o', o_opts, FORCE_GEO);
 254                                         force_geo = 1;
 255                                         break;
 256                                 case PHASE2_THREADS:
 257                                         if (!val)
 258                                                 do_abort(
 259                 _("-o phase2_threads requires a parameter\n"));
 260                                         phase2_threads = (int)strtol(val, NULL, 0);
 261                                         break;
 262                                 default:
 263                                         unknown('o', val);
 264                                         break;
 265                                 }
 266                         }
 267                         break;
 268                 case 'c':
 269                         p = optarg;
 270                         while (*p) {
 271                                 char *val;
 272
 273                                 switch (getsubopt(&p, c_opts, &val)) {
 274                                 case CONVERT_LAZY_COUNT:
 275                                         if (!val)
 276                                                 do_abort(
 277                 _("-c lazycount requires a parameter\n"));
 278                                         lazy_count = (int)strtol(val, NULL, 0);
 279                                         convert_lazy_count = 1;
 280                                         break;
 281                                 default:
 282                                         unknown('c', val);
 283                                         break;
 284                                 }
 285                         }
 286                         break;
 287                 case 'l':
 288                         log_name = optarg;
 289                         log_spec = 1;
 290                         break;
 291                 case 'r':
 292                         rt_name = optarg;
 293                         rt_spec = 1;
 294                         break;
 295                 case 'f':
 296                         isa_file = 1;
 297                         break;
 298                 case 'm':
 299                         if (bhash_option_used)
 300                                 do_abort(_("-m option cannot be used with "
 301                                                 "-o bhash option\n"));
 302                         max_mem_specified = strtol(optarg, NULL, 0);
 303                         break;
 304                 case 'L':
 305                         zap_log = 1;
 306                         break;
 307                 case 'n':
 308                         no_modify = 1;
 309                         break;
 310                 case 'd':
 311                         dangerously = 1;
 312                         break;
 313                 case 'v':
 314                         verbose++;
 315                         break;
 316                 case 'V':
 317                         printf(_("%s version %s\n"), progname, VERSION);
 318                         exit(0);
 319                 case 'P':
 320                         do_prefetch = 0;
 321                         break;
 322                 case 't':
 323                         report_interval = (int)strtol(optarg, NULL, 0);
 324                         break;
 325                 case 'e':
 326                         report_corrected = true;
 327                         break;
 328                 case '?':
 329                         usage();
 330                 }
 331         }
 332
 333         if (argc - optind != 1)
 334                 usage();
 335
 336         if ((fs_name = argv[optind]) == NULL)
 337                 usage();
 338
 339         if (report_corrected && no_modify)
 340                 usage();
 341 }
 342
 343 void __attribute__((noreturn))
 344 do_error(char const *msg, ...)
 345 {
 346         va_list args;
 347
 348         fprintf(stderr, _("\nfatal error -- "));
 349
 350         va_start(args, msg);
 351         vfprintf(stderr, msg, args);
 352         if (dumpcore)
 353                 abort();
 354         exit(1);
 355 }
 356
 357 /*
 358  * like do_error, only the error is internal, no system
 359  * error so no oserror processing
 360  */
 361 void __attribute__((noreturn))
 362 do_abort(char const *msg, ...)
 363 {
 364         va_list args;
 365
 366         va_start(args, msg);
 367         vfprintf(stderr, msg, args);
 368         if (dumpcore)
 369                 abort();
 370         exit(1);
 371 }
 372
 373 void
 374 do_warn(char const *msg, ...)
 375 {
 376         va_list args;
 377
 378         fs_is_dirty = 1;
 379
 380         va_start(args, msg);
 381         vfprintf(stderr, msg, args);
 382         va_end(args);
 383 }
 384
 385 /* no formatting */
 386
 387 void
 388 do_log(char const *msg, ...)
 389 {
 390         va_list args;
 391
 392         va_start(args, msg);
 393         vfprintf(stderr, msg, args);
 394         va_end(args);
 395 }
 396
 397 static void
 398 calc_mkfs(xfs_mount_t *mp)
 399 {
 400         xfs_agblock_t   fino_bno;
 401         int             do_inoalign;
 402
 403         do_inoalign = M_IGEO(mp)->ialloc_align;
 404
 405         /*
 406          * Pre-calculate the geometry of ag 0. We know what it looks like
 407          * because we know what mkfs does: 2 allocation btree roots (by block
 408          * and by size), the inode allocation btree root, the free inode
 409          * allocation btree root (if enabled) and some number of blocks to
 410          * prefill the agfl.
 411          *
 412          * Because the current shape of the btrees may differ from the current
 413          * shape, we open code the mkfs freelist block count here. mkfs creates
 414          * single level trees, so the calculation is pertty straight forward for
 415          * the trees that use the AGFL.
 416          */
 417         bnobt_root = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
 418         bcntbt_root = bnobt_root + 1;
 419         inobt_root = bnobt_root + 2;
 420         fino_bno = inobt_root + (2 * min(2, mp->m_ag_maxlevels)) + 1;
 421         if (xfs_sb_version_hasfinobt(&mp->m_sb))
 422                 fino_bno++;
 423         if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
 424                 fino_bno += min(2, mp->m_rmap_maxlevels); /* agfl blocks */
 425                 fino_bno++;
 426         }
 427         if (xfs_sb_version_hasreflink(&mp->m_sb))
 428                 fino_bno++;
 429
 430         /*
 431          * If the log is allocated in the first allocation group we need to
 432          * add the number of blocks used by the log to the above calculation.
 433          *
 434          * This can happens with filesystems that only have a single
 435          * allocation group, or very odd geometries created by old mkfs
 436          * versions on very small filesystems.
 437          */
 438         if (mp->m_sb.sb_logstart &&
 439             XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0) {
 440
 441                 /*
 442                  * XXX(hch): verify that sb_logstart makes sense?
 443                  */
 444                  fino_bno += mp->m_sb.sb_logblocks;
 445         }
 446
 447         /*
 448          * ditto the location of the first inode chunks in the fs ('/')
 449          */
 450         if (xfs_sb_version_hasdalign(&mp->m_sb) && do_inoalign)  {
 451                 first_prealloc_ino = XFS_AGB_TO_AGINO(mp, roundup(fino_bno,
 452                                         mp->m_sb.sb_unit));
 453         } else if (xfs_sb_version_hasalign(&mp->m_sb) &&
 454                                         mp->m_sb.sb_inoalignmt > 1)  {
 455                 first_prealloc_ino = XFS_AGB_TO_AGINO(mp,
 456                                         roundup(fino_bno,
 457                                                 mp->m_sb.sb_inoalignmt));
 458         } else  {
 459                 first_prealloc_ino = XFS_AGB_TO_AGINO(mp, fino_bno);
 460         }
 461
 462         ASSERT(M_IGEO(mp)->ialloc_blks > 0);
 463
 464         if (M_IGEO(mp)->ialloc_blks > 1)
 465                 last_prealloc_ino = first_prealloc_ino + XFS_INODES_PER_CHUNK;
 466         else
 467                 last_prealloc_ino = XFS_AGB_TO_AGINO(mp, fino_bno + 1);
 468
 469         /*
 470          * now the first 3 inodes in the system
 471          */
 472         if (mp->m_sb.sb_rootino != first_prealloc_ino)  {
 473                 do_warn(
 474 _("sb root inode value %" PRIu64 " %sinconsistent with calculated value %u\n"),
 475                         mp->m_sb.sb_rootino,
 476                         (mp->m_sb.sb_rootino == NULLFSINO ? "(NULLFSINO) ":""),
 477                         first_prealloc_ino);
 478
 479                 if (!no_modify)
 480                         do_warn(
 481                 _("resetting superblock root inode pointer to %u\n"),
 482                                 first_prealloc_ino);
 483                 else
 484                         do_warn(
 485                 _("would reset superblock root inode pointer to %u\n"),
 486                                 first_prealloc_ino);
 487
 488                 /*
 489                  * just set the value -- safe since the superblock
 490                  * doesn't get flushed out if no_modify is set
 491                  */
 492                 mp->m_sb.sb_rootino = first_prealloc_ino;
 493         }
 494
 495         if (mp->m_sb.sb_rbmino != first_prealloc_ino + 1)  {
 496                 do_warn(
 497 _("sb realtime bitmap inode %" PRIu64 " %sinconsistent with calculated value %u\n"),
 498                         mp->m_sb.sb_rbmino,
 499                         (mp->m_sb.sb_rbmino == NULLFSINO ? "(NULLFSINO) ":""),
 500                         first_prealloc_ino + 1);
 501
 502                 if (!no_modify)
 503                         do_warn(
 504                 _("resetting superblock realtime bitmap ino pointer to %u\n"),
 505                                 first_prealloc_ino + 1);
 506                 else
 507                         do_warn(
 508                 _("would reset superblock realtime bitmap ino pointer to %u\n"),
 509                                 first_prealloc_ino + 1);
 510
 511                 /*
 512                  * just set the value -- safe since the superblock
 513                  * doesn't get flushed out if no_modify is set
 514                  */
 515                 mp->m_sb.sb_rbmino = first_prealloc_ino + 1;
 516         }
 517
 518         if (mp->m_sb.sb_rsumino != first_prealloc_ino + 2)  {
 519                 do_warn(
 520 _("sb realtime summary inode %" PRIu64 " %sinconsistent with calculated value %u\n"),
 521                         mp->m_sb.sb_rsumino,
 522                         (mp->m_sb.sb_rsumino == NULLFSINO ? "(NULLFSINO) ":""),
 523                         first_prealloc_ino + 2);
 524
 525                 if (!no_modify)
 526                         do_warn(
 527                 _("resetting superblock realtime summary ino pointer to %u\n"),
 528                                 first_prealloc_ino + 2);
 529                 else
 530                         do_warn(
 531                 _("would reset superblock realtime summary ino pointer to %u\n"),
 532                                 first_prealloc_ino + 2);
 533
 534                 /*
 535                  * just set the value -- safe since the superblock
 536                  * doesn't get flushed out if no_modify is set
 537                  */
 538                 mp->m_sb.sb_rsumino = first_prealloc_ino + 2;
 539         }
 540
 541 }
 542
 543 /*
 544  * v5 superblock metadata track the LSN of last modification and thus require
 545  * that the current LSN is always moving forward. The current LSN is reset if
 546  * the log has been cleared, which puts the log behind parts of the filesystem
 547  * on-disk and can disrupt log recovery.
 548  *
 549  * We have tracked the maximum LSN of every piece of metadata that has been read
 550  * in via the read verifiers. Compare the max LSN with the log and if the log is
 551  * behind, bump the cycle number and reformat the log.
 552  */
 553 static void
 554 format_log_max_lsn(
 555         struct xfs_mount        *mp)
 556 {
 557         struct xlog             *log = mp->m_log;
 558         int                     max_cycle;
 559         int                     max_block;
 560         int                     new_cycle;
 561         xfs_daddr_t             logstart;
 562         xfs_daddr_t             logblocks;
 563         int                     logversion;
 564
 565         if (!xfs_sb_version_hascrc(&mp->m_sb))
 566                 return;
 567
 568         /*
 569          * If the log is ahead of the highest metadata LSN we've seen, we're
 570          * safe and there's nothing to do.
 571          */
 572         max_cycle = CYCLE_LSN(libxfs_max_lsn);
 573         max_block = BLOCK_LSN(libxfs_max_lsn);
 574         if (max_cycle < log->l_curr_cycle ||
 575             (max_cycle == log->l_curr_cycle && max_block < log->l_curr_block))
 576                 return;
 577
 578         /*
 579          * Going to the next cycle should be sufficient but we bump by a few
 580          * counts to help cover any metadata LSNs we could have missed.
 581          */
 582         new_cycle = max_cycle + 3;
 583         logstart = XFS_FSB_TO_DADDR(mp, mp->m_sb.sb_logstart);
 584         logblocks = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
 585         logversion = xfs_sb_version_haslogv2(&mp->m_sb) ? 2 : 1;
 586
 587         do_warn(_("Maximum metadata LSN (%d:%d) is ahead of log (%d:%d).\n"),
 588                 max_cycle, max_block, log->l_curr_cycle, log->l_curr_block);
 589
 590         if (no_modify) {
 591                 do_warn(_("Would format log to cycle %d.\n"), new_cycle);
 592                 return;
 593         }
 594
 595         do_warn(_("Format log to cycle %d.\n"), new_cycle);
 596         libxfs_log_clear(log->l_dev, NULL, logstart, logblocks,
 597                          &mp->m_sb.sb_uuid, logversion, mp->m_sb.sb_logsunit,
 598                          XLOG_FMT, new_cycle, true);
 599 }
 600
 601 /*
 602  * mkfs increases the AG count for "multidisk" configurations, we want
 603  * to target these for an increase in thread count. Hence check the superlock
 604  * geometry information to determine if mkfs considered this a multidisk
 605  * configuration.
 606  */
 607 static bool
 608 is_multidisk_filesystem(
 609         struct xfs_mount        *mp)
 610 {
 611         struct xfs_sb           *sbp = &mp->m_sb;
 612
 613         /* High agcount filesystems are always considered "multidisk" */
 614         if (sbp->sb_agcount >= XFS_MULTIDISK_AGCOUNT)
 615                 return true;
 616
 617         /*
 618          * If it doesn't have a sunit/swidth, mkfs didn't consider it a
 619          * multi-disk array, so we don't either.
 620          */
 621         if (!sbp->sb_unit)
 622                 return false;
 623
 624         ASSERT(sbp->sb_width);
 625         return true;
 626 }
 627
 628 /*
 629  * if the sector size of the filesystem we are trying to repair is
 630  * smaller than that of the underlying filesystem (i.e. we are repairing
 631  * an image), the we have to turn off direct IO because we cannot do IO
 632  * smaller than the host filesystem's sector size.
 633  */
 634 static void
 635 check_fs_vs_host_sectsize(
 636         struct xfs_sb   *sb)
 637 {
 638         int     fd, ret;
 639         long    old_flags;
 640         struct xfs_fsop_geom    geom = { 0 };
 641
 642         fd = libxfs_device_to_fd(x.ddev);
 643
 644         ret = xfrog_geometry(fd, &geom);
 645         if (ret) {
 646                 do_log(_("Cannot get host filesystem geometry.\n"
 647         "Repair may fail if there is a sector size mismatch between\n"
 648         "the image and the host filesystem.\n"));
 649                 geom.sectsize = BBSIZE;
 650         }
 651
 652         if (sb->sb_sectsize < geom.sectsize) {
 653                 old_flags = fcntl(fd, F_GETFL, 0);
 654                 if (fcntl(fd, F_SETFL, old_flags & ~O_DIRECT) < 0) {
 655                         do_warn(_(
 656         "Sector size on host filesystem larger than image sector size.\n"
 657         "Cannot turn off direct IO, so exiting.\n"));
 658                         exit(1);
 659                 }
 660         }
 661 }
 662
 663 int
 664 main(int argc, char **argv)
 665 {
 666         xfs_mount_t     *temp_mp;
 667         xfs_mount_t     *mp;
 668         xfs_dsb_t       *dsb;
 669         xfs_buf_t       *sbp;
 670         xfs_mount_t     xfs_m;
 671         struct xlog     log = {0};
 672         char            *msgbuf;
 673         struct xfs_sb   psb;
 674         int             rval;
 675         struct xfs_ino_geometry *igeo;
 676
 677         progname = basename(argv[0]);
 678         setlocale(LC_ALL, "");
 679         bindtextdomain(PACKAGE, LOCALEDIR);
 680         textdomain(PACKAGE);
 681         dinode_bmbt_translation_init();
 682
 683         temp_mp = &xfs_m;
 684         setbuf(stdout, NULL);
 685
 686         process_args(argc, argv);
 687         xfs_init(&x);
 688
 689         msgbuf = malloc(DURATION_BUF_SIZE);
 690
 691         timestamp(PHASE_START, 0, NULL);
 692         timestamp(PHASE_END, 0, NULL);
 693
 694         /* -f forces this, but let's be nice and autodetect it, as well. */
 695         if (!isa_file) {
 696                 int             fd = libxfs_device_to_fd(x.ddev);
 697                 struct stat     statbuf;
 698
 699                 if (fstat(fd, &statbuf) < 0)
 700                         do_warn(_("%s: couldn't stat \"%s\"\n"),
 701                                 progname, fs_name);
 702                 else if (S_ISREG(statbuf.st_mode))
 703                         isa_file = 1;
 704         }
 705
 706         if (isa_file) {
 707                 /* Best effort attempt to validate fs vs host sector size */
 708                 rval = get_sb(&psb, 0, XFS_MAX_SECTORSIZE, 0);
 709                 if (rval == XR_OK)
 710                         check_fs_vs_host_sectsize(&psb);
 711         }
 712
 713         /* do phase1 to make sure we have a superblock */
 714         phase1(temp_mp);
 715         timestamp(PHASE_END, 1, NULL);
 716
 717         if (no_modify && primary_sb_modified)  {
 718                 do_warn(_("Primary superblock would have been modified.\n"
 719                           "Cannot proceed further in no_modify mode.\n"
 720                           "Exiting now.\n"));
 721                 exit(1);
 722         }
 723
 724         rval = get_sb(&psb, 0, XFS_MAX_SECTORSIZE, 0);
 725         if (rval != XR_OK) {
 726                 do_warn(_("Primary superblock bad after phase 1!\n"
 727                           "Exiting now.\n"));
 728                 exit(1);
 729         }
 730
 731         /*
 732          * Now that we have completely validated the superblock, geometry may
 733          * have changed; re-check geometry vs the host filesystem geometry
 734          */
 735         if (isa_file)
 736                 check_fs_vs_host_sectsize(&psb);
 737
 738         /*
 739          * Prepare the mount structure. Point the log reference to our local
 740          * copy so it's available to the various phases. The log bits are
 741          * initialized in phase 2.
 742          */
 743         memset(&xfs_m, 0, sizeof(xfs_mount_t));
 744         mp = libxfs_mount(&xfs_m, &psb, x.ddev, x.logdev, x.rtdev, 0);
 745
 746         if (!mp)  {
 747                 fprintf(stderr,
 748                         _("%s: cannot repair this filesystem.  Sorry.\n"),
 749                         progname);
 750                 exit(1);
 751         }
 752         mp->m_log = &log;
 753         igeo = M_IGEO(mp);
 754
 755         /* Spit out function & line on these corruption macros */
 756         if (verbose > 2)
 757                 mp->m_flags |= LIBXFS_MOUNT_WANT_CORRUPTED;
 758
 759         /*
 760          * set XFS-independent status vars from the mount/sb structure
 761          */
 762         glob_agcount = mp->m_sb.sb_agcount;
 763
 764         chunks_pblock = mp->m_sb.sb_inopblock / XFS_INODES_PER_CHUNK;
 765         max_symlink_blocks = libxfs_symlink_blocks(mp, XFS_SYMLINK_MAXLEN);
 766         inodes_per_cluster = max(mp->m_sb.sb_inopblock,
 767                         igeo->inode_cluster_size >> mp->m_sb.sb_inodelog);
 768
 769         /*
 770          * Automatic striding for high agcount filesystems.
 771          *
 772          * More AGs indicates that the filesystem is either large or can handle
 773          * more IO parallelism. Either way, we should try to process multiple
 774          * AGs at a time in such a configuration to try to saturate the
 775          * underlying storage and speed the repair process. Only do this if
 776          * prefetching is enabled.
 777          *
 778          * Given mkfs defaults for 16AGs for "multidisk" configurations, we want
 779          * to target these for an increase in thread count. Hence a stride value
 780          * of 15 is chosen to ensure we get at least 2 AGs being scanned at once
 781          * on such filesystems.
 782          *
 783          * Limit the maximum thread count based on the available CPU power that
 784          * is available. If we use too many threads, we might run out of memory
 785          * and CPU power before we run out of IO concurrency. We limit to 8
 786          * threads/CPU as this is enough threads to saturate a CPU on fast
 787          * devices, yet few enough that it will saturate but won't overload slow
 788          * devices.
 789          *
 790          * Multidisk filesystems can handle more IO parallelism so we should try
 791          * to process multiple AGs at a time in such a configuration to try to
 792          * saturate the underlying storage and speed the repair process. Only do
 793          * this if prefetching is enabled.
 794          */
 795         if (!ag_stride && do_prefetch && is_multidisk_filesystem(mp)) {
 796                 /*
 797                  * For small agcount multidisk systems, just double the
 798                  * parallelism. For larger AG count filesystems (32 and above)
 799                  * use more parallelism, and linearly increase the parallelism
 800                  * with the number of AGs.
 801                  */
 802                 ag_stride = min(glob_agcount, XFS_MULTIDISK_AGCOUNT / 2) - 1;
 803         }
 804
 805         if (ag_stride) {
 806                 int max_threads = platform_nproc() * 8;
 807
 808                 thread_count = (glob_agcount + ag_stride - 1) / ag_stride;
 809                 while (thread_count > max_threads) {
 810                         ag_stride *= 2;
 811                         thread_count = (glob_agcount + ag_stride - 1) /
 812                                                                 ag_stride;
 813                 }
 814                 if (thread_count > 0)
 815                         thread_init();
 816                 else {
 817                         thread_count = 1;
 818                         ag_stride = 0;
 819                 }
 820         }
 821
 822         if (ag_stride && report_interval) {
 823                 init_progress_rpt();
 824                 if (msgbuf) {
 825                         do_log(_("        - reporting progress in intervals of %s\n"),
 826                         duration(report_interval, msgbuf));
 827                 }
 828         }
 829
 830         /*
 831          * Adjust libxfs cache sizes based on system memory,
 832          * filesystem size and inode count.
 833          *
 834          * We'll set the cache size based on 3/4s the memory minus
 835          * space used by the inode AVL tree and block usage map.
 836          *
 837          * Inode AVL tree space is approximately 4 bytes per inode,
 838          * block usage map is currently 1 byte for 2 blocks.
 839          *
 840          * We assume most blocks will be inode clusters.
 841          *
 842          * Calculations are done in kilobyte units.
 843          */
 844
 845         if (!bhash_option_used || max_mem_specified) {
 846                 unsigned long   mem_used;
 847                 unsigned long   max_mem;
 848                 struct rlimit   rlim;
 849
 850                 libxfs_bcache_purge();
 851                 cache_destroy(libxfs_bcache);
 852
 853                 mem_used = (mp->m_sb.sb_icount >> (10 - 2)) +
 854                                         (mp->m_sb.sb_dblocks >> (10 + 1)) +
 855                                         50000;  /* rough estimate of 50MB overhead */
 856                 max_mem = max_mem_specified ? max_mem_specified * 1024 :
 857                                                 libxfs_physmem() * 3 / 4;
 858
 859                 if (getrlimit(RLIMIT_AS, &rlim) != -1 &&
 860                                         rlim.rlim_cur != RLIM_INFINITY) {
 861                         rlim.rlim_cur = rlim.rlim_max;
 862                         setrlimit(RLIMIT_AS, &rlim);
 863                         /* use approximately 80% of rlimit to avoid overrun */
 864                         max_mem = min(max_mem, rlim.rlim_cur / 1280);
 865                 } else
 866                         max_mem = min(max_mem, (LONG_MAX >> 10) + 1);
 867
 868                 if (verbose > 1)
 869                         do_log(
 870         _("        - max_mem = %lu, icount = %" PRIu64 ", imem = %" PRIu64 ", dblock = %" PRIu64 ", dmem = %" PRIu64 "\n"),
 871                                 max_mem, mp->m_sb.sb_icount,
 872                                 mp->m_sb.sb_icount >> (10 - 2),
 873                                 mp->m_sb.sb_dblocks,
 874                                 mp->m_sb.sb_dblocks >> (10 + 1));
 875
 876                 if (max_mem <= mem_used) {
 877                         if (max_mem_specified) {
 878                                 do_abort(
 879         _("Required memory for repair is greater that the maximum specified\n"
 880           "with the -m option. Please increase it to at least %lu.\n"),
 881                                         mem_used / 1024);
 882                         }
 883                         do_log(
 884         _("Memory available for repair (%luMB) may not be sufficient.\n"
 885           "At least %luMB is needed to repair this filesystem efficiently\n"
 886           "If repair fails due to lack of memory, please\n"),
 887                                 max_mem / 1024, mem_used / 1024);
 888                         if (do_prefetch)
 889                                 do_log(
 890         _("turn prefetching off (-P) to reduce the memory footprint.\n"));
 891                         else
 892                                 do_log(
 893         _("increase system RAM and/or swap space to at least %luMB.\n"),
 894                         mem_used * 2 / 1024);
 895
 896                         max_mem = mem_used;
 897                 }
 898
 899                 max_mem -= mem_used;
 900                 if (max_mem >= (1 << 30))
 901                         max_mem = 1 << 30;
 902                 libxfs_bhash_size = max_mem / (HASH_CACHE_RATIO *
 903                                 (igeo->inode_cluster_size >> 10));
 904                 if (libxfs_bhash_size < 512)
 905                         libxfs_bhash_size = 512;
 906
 907                 if (verbose)
 908                         do_log(_("        - block cache size set to %d entries\n"),
 909                                 libxfs_bhash_size * HASH_CACHE_RATIO);
 910
 911                 libxfs_bcache = cache_init(0, libxfs_bhash_size,
 912                                                 &libxfs_bcache_operations);
 913         }
 914
 915         /*
 916          * calculate what mkfs would do to this filesystem
 917          */
 918         calc_mkfs(mp);
 919
 920         /*
 921          * initialize block alloc map
 922          */
 923         init_bmaps(mp);
 924         incore_ino_init(mp);
 925         incore_ext_init(mp);
 926         rmaps_init(mp);
 927
 928         /* initialize random globals now that we know the fs geometry */
 929         inodes_per_block = mp->m_sb.sb_inopblock;
 930
 931         if (parse_sb_version(&mp->m_sb))  {
 932                 do_warn(
 933         _("Found unsupported filesystem features.  Exiting now.\n"));
 934                 return(1);
 935         }
 936
 937         /* make sure the per-ag freespace maps are ok so we can mount the fs */
 938         phase2(mp, phase2_threads);
 939         timestamp(PHASE_END, 2, NULL);
 940
 941         if (do_prefetch)
 942                 init_prefetch(mp);
 943
 944         phase3(mp, phase2_threads);
 945         timestamp(PHASE_END, 3, NULL);
 946
 947         phase4(mp);
 948         timestamp(PHASE_END, 4, NULL);
 949
 950         if (no_modify)
 951                 printf(_("No modify flag set, skipping phase 5\n"));
 952         else {
 953                 phase5(mp);
 954         }
 955         timestamp(PHASE_END, 5, NULL);
 956
 957         /*
 958          * Done with the block usage maps, toss them...
 959          */
 960         rmaps_free(mp);
 961         free_bmaps(mp);
 962
 963         if (!bad_ino_btree)  {
 964                 phase6(mp);
 965                 timestamp(PHASE_END, 6, NULL);
 966
 967                 phase7(mp, phase2_threads);
 968                 timestamp(PHASE_END, 7, NULL);
 969         } else  {
 970                 do_warn(
 971 _("Inode allocation btrees are too corrupted, skipping phases 6 and 7\n"));
 972         }
 973
 974         if (lost_quotas && !have_uquotino && !have_gquotino && !have_pquotino) {
 975                 if (!no_modify)  {
 976                         do_warn(
 977 _("Warning:  no quota inodes were found.  Quotas disabled.\n"));
 978                 } else  {
 979                         do_warn(
 980 _("Warning:  no quota inodes were found.  Quotas would be disabled.\n"));
 981                 }
 982         } else if (lost_quotas)  {
 983                 if (!no_modify)  {
 984                         do_warn(
 985 _("Warning:  quota inodes were cleared.  Quotas disabled.\n"));
 986                 } else  {
 987                         do_warn(
 988 _("Warning:  quota inodes would be cleared.  Quotas would be disabled.\n"));
 989                 }
 990         } else  {
 991                 if (lost_uquotino)  {
 992                         if (!no_modify)  {
 993                                 do_warn(
 994 _("Warning:  user quota information was cleared.\n"
 995   "User quotas can not be enforced until limit information is recreated.\n"));
 996                         } else  {
 997                                 do_warn(
 998 _("Warning:  user quota information would be cleared.\n"
 999   "User quotas could not be enforced until limit information was recreated.\n"));
1000                         }
1001                 }
1002
1003                 if (lost_gquotino)  {
1004                         if (!no_modify)  {
1005                                 do_warn(
1006 _("Warning:  group quota information was cleared.\n"
1007   "Group quotas can not be enforced until limit information is recreated.\n"));
1008                         } else  {
1009                                 do_warn(
1010 _("Warning:  group quota information would be cleared.\n"
1011   "Group quotas could not be enforced until limit information was recreated.\n"));
1012                         }
1013                 }
1014
1015                 if (lost_pquotino)  {
1016                         if (!no_modify)  {
1017                                 do_warn(
1018 _("Warning:  project quota information was cleared.\n"
1019   "Project quotas can not be enforced until limit information is recreated.\n"));
1020                         } else  {
1021                                 do_warn(
1022 _("Warning:  project quota information would be cleared.\n"
1023   "Project quotas could not be enforced until limit information was recreated.\n"));
1024                         }
1025                 }
1026         }
1027
1028         if (ag_stride && report_interval)
1029                 stop_progress_rpt();
1030
1031         if (no_modify)  {
1032                 /*
1033                  * Warn if the current LSN is problematic and the log requires a
1034                  * reformat.
1035                  */
1036                 format_log_max_lsn(mp);
1037
1038                 do_log(
1039         _("No modify flag set, skipping filesystem flush and exiting.\n"));
1040                 if (verbose)
1041                         summary_report();
1042                 if (fs_is_dirty)
1043                         return(1);
1044
1045                 return(0);
1046         }
1047
1048         /*
1049          * Clear the quota flags if they're on.
1050          */
1051         sbp = libxfs_getsb(mp);
1052         if (!sbp)
1053                 do_error(_("couldn't get superblock\n"));
1054
1055         dsb = XFS_BUF_TO_SBP(sbp);
1056
1057         if (be16_to_cpu(dsb->sb_qflags) & XFS_ALL_QUOTA_CHKD) {
1058                 do_warn(_("Note - quota info will be regenerated on next "
1059                         "quota mount.\n"));
1060                 dsb->sb_qflags &= cpu_to_be16(~XFS_ALL_QUOTA_CHKD);
1061         }
1062
1063         if (copied_sunit) {
1064                 do_warn(
1065 _("Note - stripe unit (%d) and width (%d) were copied from a backup superblock.\n"
1066   "Please reset with mount -o sunit=<value>,swidth=<value> if necessary\n"),
1067                         be32_to_cpu(dsb->sb_unit), be32_to_cpu(dsb->sb_width));
1068         }
1069
1070         libxfs_writebuf(sbp, 0);
1071
1072         /*
1073          * Done. Flush all cached buffers and inodes first to ensure all
1074          * verifiers are run (where we discover the max metadata LSN), reformat
1075          * the log if necessary and unmount.
1076          */
1077         libxfs_bcache_flush();
1078         format_log_max_lsn(mp);
1079         libxfs_umount(mp);
1080
1081         if (x.rtdev)
1082                 libxfs_device_close(x.rtdev);
1083         if (x.logdev && x.logdev != x.ddev)
1084                 libxfs_device_close(x.logdev);
1085         libxfs_device_close(x.ddev);
1086         libxfs_destroy();
1087
1088         if (verbose)
1089                 summary_report();
1090         do_log(_("done\n"));
1091
1092         if (dangerously && !no_modify)
1093                 do_warn(
1094 _("Repair of readonly mount complete.  Immediate reboot encouraged.\n"));
1095
1096         pftrace_done();
1097
1098         free(msgbuf);
1099
1100         if (fs_is_dirty && report_corrected)
1101                 return (4);
1102         return (0);
1103 }