repair/xfs_repair.c

   1 /*
   2  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19 #include "libxfs.h"
  20 #include "libxlog.h"
  21 #include <sys/resource.h>
  22 #include "avl.h"
  23 #include "avl64.h"
  24 #include "globals.h"
  25 #include "versions.h"
  26 #include "agheader.h"
  27 #include "protos.h"
  28 #include "incore.h"
  29 #include "err_protos.h"
  30 #include "prefetch.h"
  31 #include "threads.h"
  32 #include "progress.h"
  33 #include "dinode.h"
  34
  35 #define rounddown(x, y) (((x)/(y))*(y))
  36
  37 #define         XR_MAX_SECT_SIZE        (64 * 1024)
  38
  39 /*
  40  * option tables for getsubopt calls
  41  */
  42
  43 /*
  44  * -o: user-supplied override options
  45  */
  46 static char *o_opts[] = {
  47 #define ASSUME_XFS      0
  48         "assume_xfs",
  49 #define PRE_65_BETA     1
  50         "fs_is_pre_65_beta",
  51 #define IHASH_SIZE      2
  52         "ihash",
  53 #define BHASH_SIZE      3
  54         "bhash",
  55 #define AG_STRIDE       4
  56         "ag_stride",
  57 #define FORCE_GEO       5
  58         "force_geometry",
  59 #define PHASE2_THREADS  6
  60         "phase2_threads",
  61         NULL
  62 };
  63
  64 /*
  65  * -c: conversion options
  66  */
  67 static char *c_opts[] = {
  68 #define CONVERT_LAZY_COUNT      0
  69         "lazycount",
  70         NULL
  71 };
  72
  73
  74 static int      bhash_option_used;
  75 static long     max_mem_specified;      /* in megabytes */
  76 static int      phase2_threads = 32;
  77
  78 static void
  79 usage(void)
  80 {
  81         do_warn(_(
  82 "Usage: %s [options] device\n"
  83 "\n"
  84 "Options:\n"
  85 "  -f           The device is a file\n"
  86 "  -L           Force log zeroing. Do this as a last resort.\n"
  87 "  -l logdev    Specifies the device where the external log resides.\n"
  88 "  -m maxmem    Maximum amount of memory to be used in megabytes.\n"
  89 "  -n           No modify mode, just checks the filesystem for damage.\n"
  90 "  -P           Disables prefetching.\n"
  91 "  -r rtdev     Specifies the device where the realtime section resides.\n"
  92 "  -v           Verbose output.\n"
  93 "  -c subopts   Change filesystem parameters - use xfs_admin.\n"
  94 "  -o subopts   Override default behaviour, refer to man page.\n"
  95 "  -t interval  Reporting interval in seconds.\n"
  96 "  -d           Repair dangerously.\n"
  97 "  -V           Reports version and exits.\n"), progname);
  98         exit(1);
  99 }
 100
 101 char *
 102 err_string(int err_code)
 103 {
 104         static char *err_message[XR_BAD_ERR_CODE];
 105         static int done;
 106
 107         if (!done) {
 108                 err_message[XR_OK] = _("no error");
 109                 err_message[XR_BAD_MAGIC] = _("bad magic number");
 110                 err_message[XR_BAD_BLOCKSIZE] = _("bad blocksize field");
 111                 err_message[XR_BAD_BLOCKLOG] = _("bad blocksize log field");
 112                 err_message[XR_BAD_VERSION] = _("bad or unsupported version");
 113                 err_message[XR_BAD_INPROGRESS] =
 114                         _("filesystem mkfs-in-progress bit set");
 115                 err_message[XR_BAD_FS_SIZE_DATA] =
 116                         _("inconsistent filesystem geometry information");
 117                 err_message[XR_BAD_INO_SIZE_DATA] =
 118         _("bad inode size or inconsistent with number of inodes/block"),
 119                 err_message[XR_BAD_SECT_SIZE_DATA] = _("bad sector size");
 120                 err_message[XR_AGF_GEO_MISMATCH] =
 121         _("AGF geometry info conflicts with filesystem geometry");
 122                 err_message[XR_AGI_GEO_MISMATCH] =
 123         _("AGI geometry info conflicts with filesystem geometry");
 124                 err_message[XR_SB_GEO_MISMATCH] =
 125         _("AG superblock geometry info conflicts with filesystem geometry");
 126                 err_message[XR_EOF] = _("attempted to perform I/O beyond EOF");
 127                 err_message[XR_BAD_RT_GEO_DATA] =
 128         _("inconsistent filesystem geometry in realtime filesystem component");
 129                 err_message[XR_BAD_INO_MAX_PCT] =
 130                         _("maximum indicated percentage of inodes > 100%");
 131                 err_message[XR_BAD_INO_ALIGN] =
 132                         _("inconsistent inode alignment value");
 133                 err_message[XR_INSUFF_SEC_SB] =
 134         _("not enough secondary superblocks with matching geometry");
 135                 err_message[XR_BAD_SB_UNIT] =
 136                         _("bad stripe unit in superblock");
 137                 err_message[XR_BAD_SB_WIDTH] =
 138                         _("bad stripe width in superblock");
 139                 err_message[XR_BAD_SVN] =
 140                         _("bad shared version number in superblock");
 141                 err_message[XR_BAD_CRC] =
 142                         _("bad CRC in superblock");
 143                 done = 1;
 144         }
 145
 146         if (err_code < XR_OK || err_code >= XR_BAD_ERR_CODE)
 147                 do_abort(_("bad error code - %d\n"), err_code);
 148
 149         return(err_message[err_code]);
 150 }
 151
 152 static void
 153 noval(char opt, char *tbl[], int idx)
 154 {
 155         do_warn(_("-%c %s option cannot have a value\n"), opt, tbl[idx]);
 156         usage();
 157 }
 158
 159 static void
 160 respec(char opt, char *tbl[], int idx)
 161 {
 162         do_warn("-%c ", opt);
 163         if (tbl)
 164                 do_warn("%s ", tbl[idx]);
 165         do_warn(_("option respecified\n"));
 166         usage();
 167 }
 168
 169 static void
 170 unknown(char opt, char *s)
 171 {
 172         do_warn(_("unknown option -%c %s\n"), opt, s);
 173         usage();
 174 }
 175
 176 /*
 177  * sets only the global argument flags and variables
 178  */
 179 static void
 180 process_args(int argc, char **argv)
 181 {
 182         char *p;
 183         int c;
 184
 185         log_spec = 0;
 186         fs_is_dirty = 0;
 187         verbose = 0;
 188         no_modify = 0;
 189         dangerously = 0;
 190         isa_file = 0;
 191         zap_log = 0;
 192         dumpcore = 0;
 193         full_ino_ex_data = 0;
 194         delete_attr_ok = 1;
 195         force_geo = 0;
 196         assume_xfs = 0;
 197         copied_sunit = 0;
 198         sb_inoalignmt = 0;
 199         sb_unit = 0;
 200         sb_width = 0;
 201         fs_attributes_allowed = 1;
 202         fs_attributes2_allowed = 1;
 203         fs_quotas_allowed = 1;
 204         fs_aligned_inodes_allowed = 1;
 205         fs_sb_feature_bits_allowed = 1;
 206         fs_has_extflgbit_allowed = 1;
 207         pre_65_beta = 0;
 208         fs_shared_allowed = 1;
 209         ag_stride = 0;
 210         thread_count = 1;
 211         report_interval = PROG_RPT_DEFAULT;
 212
 213         /*
 214          * XXX have to add suboption processing here
 215          * attributes, quotas, nlinks, aligned_inos, sb_fbits
 216          */
 217         while ((c = getopt(argc, argv, "c:o:fl:m:r:LnDvVdPt:")) != EOF)  {
 218                 switch (c) {
 219                 case 'D':
 220                         dumpcore = 1;
 221                         break;
 222                 case 'o':
 223                         p = optarg;
 224                         while (*p != '\0')  {
 225                                 char *val;
 226
 227                                 switch (getsubopt(&p, (constpp)o_opts, &val))  {
 228                                 case ASSUME_XFS:
 229                                         if (val)
 230                                                 noval('o', o_opts, ASSUME_XFS);
 231                                         if (assume_xfs)
 232                                                 respec('o', o_opts, ASSUME_XFS);
 233                                         assume_xfs = 1;
 234                                         break;
 235                                 case PRE_65_BETA:
 236                                         if (val)
 237                                                 noval('o', o_opts, PRE_65_BETA);
 238                                         if (pre_65_beta)
 239                                                 respec('o', o_opts,
 240                                                         PRE_65_BETA);
 241                                         pre_65_beta = 1;
 242                                         break;
 243                                 case IHASH_SIZE:
 244                                         do_warn(
 245                 _("-o ihash option has been removed and will be ignored\n"));
 246                                         break;
 247                                 case BHASH_SIZE:
 248                                         if (max_mem_specified)
 249                                                 do_abort(
 250                 _("-o bhash option cannot be used with -m option\n"));
 251                                         libxfs_bhash_size = (int)strtol(val, NULL, 0);
 252                                         bhash_option_used = 1;
 253                                         break;
 254                                 case AG_STRIDE:
 255                                         ag_stride = (int)strtol(val, NULL, 0);
 256                                         break;
 257                                 case FORCE_GEO:
 258                                         if (val)
 259                                                 noval('o', o_opts, FORCE_GEO);
 260                                         if (force_geo)
 261                                                 respec('o', o_opts, FORCE_GEO);
 262                                         force_geo = 1;
 263                                         break;
 264                                 case PHASE2_THREADS:
 265                                         phase2_threads = (int)strtol(val, NULL, 0);
 266                                         break;
 267                                 default:
 268                                         unknown('o', val);
 269                                         break;
 270                                 }
 271                         }
 272                         break;
 273                 case 'c':
 274                         p = optarg;
 275                         while (*p) {
 276                                 char *val;
 277
 278                                 switch (getsubopt(&p, (constpp)c_opts, &val)) {
 279                                 case CONVERT_LAZY_COUNT:
 280                                         lazy_count = (int)strtol(val, NULL, 0);
 281                                         convert_lazy_count = 1;
 282                                         break;
 283                                 default:
 284                                         unknown('c', val);
 285                                         break;
 286                                 }
 287                         }
 288                         break;
 289                 case 'l':
 290                         log_name = optarg;
 291                         log_spec = 1;
 292                         break;
 293                 case 'r':
 294                         rt_name = optarg;
 295                         rt_spec = 1;
 296                         break;
 297                 case 'f':
 298                         isa_file = 1;
 299                         break;
 300                 case 'm':
 301                         if (bhash_option_used)
 302                                 do_abort(_("-m option cannot be used with "
 303                                                 "-o bhash option\n"));
 304                         max_mem_specified = strtol(optarg, NULL, 0);
 305                         break;
 306                 case 'L':
 307                         zap_log = 1;
 308                         break;
 309                 case 'n':
 310                         no_modify = 1;
 311                         break;
 312                 case 'd':
 313                         dangerously = 1;
 314                         break;
 315                 case 'v':
 316                         verbose++;
 317                         break;
 318                 case 'V':
 319                         printf(_("%s version %s\n"), progname, VERSION);
 320                         exit(0);
 321                 case 'P':
 322                         do_prefetch = 0;
 323                         break;
 324                 case 't':
 325                         report_interval = (int)strtol(optarg, NULL, 0);
 326                         break;
 327                 case '?':
 328                         usage();
 329                 }
 330         }
 331
 332         if (argc - optind != 1)
 333                 usage();
 334
 335         if ((fs_name = argv[optind]) == NULL)
 336                 usage();
 337 }
 338
 339 void __attribute__((noreturn))
 340 do_error(char const *msg, ...)
 341 {
 342         va_list args;
 343
 344         fprintf(stderr, _("\nfatal error -- "));
 345
 346         va_start(args, msg);
 347         vfprintf(stderr, msg, args);
 348         if (dumpcore)
 349                 abort();
 350         exit(1);
 351 }
 352
 353 /*
 354  * like do_error, only the error is internal, no system
 355  * error so no oserror processing
 356  */
 357 void __attribute__((noreturn))
 358 do_abort(char const *msg, ...)
 359 {
 360         va_list args;
 361
 362         va_start(args, msg);
 363         vfprintf(stderr, msg, args);
 364         if (dumpcore)
 365                 abort();
 366         exit(1);
 367 }
 368
 369 void
 370 do_warn(char const *msg, ...)
 371 {
 372         va_list args;
 373
 374         fs_is_dirty = 1;
 375
 376         va_start(args, msg);
 377         vfprintf(stderr, msg, args);
 378         va_end(args);
 379 }
 380
 381 /* no formatting */
 382
 383 void
 384 do_log(char const *msg, ...)
 385 {
 386         va_list args;
 387
 388         va_start(args, msg);
 389         vfprintf(stderr, msg, args);
 390         va_end(args);
 391 }
 392
 393 static void
 394 calc_mkfs(xfs_mount_t *mp)
 395 {
 396         xfs_agblock_t   fino_bno;
 397         int             do_inoalign;
 398
 399         do_inoalign = mp->m_sinoalign;
 400
 401         /*
 402          * Pre-calculate the geometry of ag 0. We know what it looks like
 403          * because we know what mkfs does: 2 allocation btree roots (by block
 404          * and by size), the inode allocation btree root, the free inode
 405          * allocation btree root (if enabled) and some number of blocks to
 406          * prefill the agfl.
 407          *
 408          * Because the current shape of the btrees may differ from the current
 409          * shape, we open code the mkfs freelist block count here. mkfs creates
 410          * single level trees, so the calculation is pertty straight forward for
 411          * the two trees that use the AGFL.
 412          */
 413         bnobt_root = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
 414         bcntbt_root = bnobt_root + 1;
 415         inobt_root = bnobt_root + 2;
 416         fino_bno = inobt_root + (2 * min(2, mp->m_ag_maxlevels)) + 1;
 417         if (xfs_sb_version_hasfinobt(&mp->m_sb))
 418                 fino_bno++;
 419
 420         /*
 421          * If the log is allocated in the first allocation group we need to
 422          * add the number of blocks used by the log to the above calculation.
 423          *
 424          * This can happens with filesystems that only have a single
 425          * allocation group, or very odd geometries created by old mkfs
 426          * versions on very small filesystems.
 427          */
 428         if (mp->m_sb.sb_logstart &&
 429             XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0) {
 430
 431                 /*
 432                  * XXX(hch): verify that sb_logstart makes sense?
 433                  */
 434                  fino_bno += mp->m_sb.sb_logblocks;
 435         }
 436
 437         /*
 438          * ditto the location of the first inode chunks in the fs ('/')
 439          */
 440         if (xfs_sb_version_hasdalign(&mp->m_sb) && do_inoalign)  {
 441                 first_prealloc_ino = XFS_OFFBNO_TO_AGINO(mp, roundup(fino_bno,
 442                                         mp->m_sb.sb_unit), 0);
 443         } else if (xfs_sb_version_hasalign(&mp->m_sb) &&
 444                                         mp->m_sb.sb_inoalignmt > 1)  {
 445                 first_prealloc_ino = XFS_OFFBNO_TO_AGINO(mp,
 446                                         roundup(fino_bno,
 447                                                 mp->m_sb.sb_inoalignmt),
 448                                         0);
 449         } else  {
 450                 first_prealloc_ino = XFS_OFFBNO_TO_AGINO(mp, fino_bno, 0);
 451         }
 452
 453         ASSERT(mp->m_ialloc_blks > 0);
 454
 455         if (mp->m_ialloc_blks > 1)
 456                 last_prealloc_ino = first_prealloc_ino + XFS_INODES_PER_CHUNK;
 457         else
 458                 last_prealloc_ino = XFS_OFFBNO_TO_AGINO(mp, fino_bno + 1, 0);
 459
 460         /*
 461          * now the first 3 inodes in the system
 462          */
 463         if (mp->m_sb.sb_rootino != first_prealloc_ino)  {
 464                 do_warn(
 465 _("sb root inode value %" PRIu64 " %sinconsistent with calculated value %u\n"),
 466                         mp->m_sb.sb_rootino,
 467                         (mp->m_sb.sb_rootino == NULLFSINO ? "(NULLFSINO) ":""),
 468                         first_prealloc_ino);
 469
 470                 if (!no_modify)
 471                         do_warn(
 472                 _("resetting superblock root inode pointer to %u\n"),
 473                                 first_prealloc_ino);
 474                 else
 475                         do_warn(
 476                 _("would reset superblock root inode pointer to %u\n"),
 477                                 first_prealloc_ino);
 478
 479                 /*
 480                  * just set the value -- safe since the superblock
 481                  * doesn't get flushed out if no_modify is set
 482                  */
 483                 mp->m_sb.sb_rootino = first_prealloc_ino;
 484         }
 485
 486         if (mp->m_sb.sb_rbmino != first_prealloc_ino + 1)  {
 487                 do_warn(
 488 _("sb realtime bitmap inode %" PRIu64 " %sinconsistent with calculated value %u\n"),
 489                         mp->m_sb.sb_rbmino,
 490                         (mp->m_sb.sb_rbmino == NULLFSINO ? "(NULLFSINO) ":""),
 491                         first_prealloc_ino + 1);
 492
 493                 if (!no_modify)
 494                         do_warn(
 495                 _("resetting superblock realtime bitmap ino pointer to %u\n"),
 496                                 first_prealloc_ino + 1);
 497                 else
 498                         do_warn(
 499                 _("would reset superblock realtime bitmap ino pointer to %u\n"),
 500                                 first_prealloc_ino + 1);
 501
 502                 /*
 503                  * just set the value -- safe since the superblock
 504                  * doesn't get flushed out if no_modify is set
 505                  */
 506                 mp->m_sb.sb_rbmino = first_prealloc_ino + 1;
 507         }
 508
 509         if (mp->m_sb.sb_rsumino != first_prealloc_ino + 2)  {
 510                 do_warn(
 511 _("sb realtime summary inode %" PRIu64 " %sinconsistent with calculated value %u\n"),
 512                         mp->m_sb.sb_rsumino,
 513                         (mp->m_sb.sb_rsumino == NULLFSINO ? "(NULLFSINO) ":""),
 514                         first_prealloc_ino + 2);
 515
 516                 if (!no_modify)
 517                         do_warn(
 518                 _("resetting superblock realtime summary ino pointer to %u\n"),
 519                                 first_prealloc_ino + 2);
 520                 else
 521                         do_warn(
 522                 _("would reset superblock realtime summary ino pointer to %u\n"),
 523                                 first_prealloc_ino + 2);
 524
 525                 /*
 526                  * just set the value -- safe since the superblock
 527                  * doesn't get flushed out if no_modify is set
 528                  */
 529                 mp->m_sb.sb_rsumino = first_prealloc_ino + 2;
 530         }
 531
 532 }
 533
 534 /*
 535  * v5 superblock metadata track the LSN of last modification and thus require
 536  * that the current LSN is always moving forward. The current LSN is reset if
 537  * the log has been cleared, which puts the log behind parts of the filesystem
 538  * on-disk and can disrupt log recovery.
 539  *
 540  * We have tracked the maximum LSN of every piece of metadata that has been read
 541  * in via the read verifiers. Compare the max LSN with the log and if the log is
 542  * behind, bump the cycle number and reformat the log.
 543  */
 544 static void
 545 format_log_max_lsn(
 546         struct xfs_mount        *mp)
 547 {
 548         struct xlog             *log = mp->m_log;
 549         int                     max_cycle;
 550         int                     max_block;
 551         int                     new_cycle;
 552         xfs_daddr_t             logstart;
 553         xfs_daddr_t             logblocks;
 554         int                     logversion;
 555
 556         if (!xfs_sb_version_hascrc(&mp->m_sb))
 557                 return;
 558
 559         /*
 560          * If the log is ahead of the highest metadata LSN we've seen, we're
 561          * safe and there's nothing to do.
 562          */
 563         max_cycle = CYCLE_LSN(libxfs_max_lsn);
 564         max_block = BLOCK_LSN(libxfs_max_lsn);
 565         if (max_cycle < log->l_curr_cycle ||
 566             (max_cycle == log->l_curr_cycle && max_block < log->l_curr_block))
 567                 return;
 568
 569         /*
 570          * Going to the next cycle should be sufficient but we bump by a few
 571          * counts to help cover any metadata LSNs we could have missed.
 572          */
 573         new_cycle = max_cycle + 3;
 574         logstart = XFS_FSB_TO_DADDR(mp, mp->m_sb.sb_logstart);
 575         logblocks = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
 576         logversion = xfs_sb_version_haslogv2(&mp->m_sb) ? 2 : 1;
 577
 578         do_warn(_("Maximum metadata LSN (%d:%d) is ahead of log (%d:%d).\n"),
 579                 max_cycle, max_block, log->l_curr_cycle, log->l_curr_block);
 580
 581         if (no_modify) {
 582                 do_warn(_("Would format log to cycle %d.\n"), new_cycle);
 583                 return;
 584         }
 585
 586         do_warn(_("Format log to cycle %d.\n"), new_cycle);
 587         libxfs_log_clear(log->l_dev, NULL, logstart, logblocks,
 588                          &mp->m_sb.sb_uuid, logversion, mp->m_sb.sb_logsunit,
 589                          XLOG_FMT, new_cycle, true);
 590 }
 591
 592 int
 593 main(int argc, char **argv)
 594 {
 595         xfs_mount_t     *temp_mp;
 596         xfs_mount_t     *mp;
 597         xfs_dsb_t       *dsb;
 598         xfs_buf_t       *sbp;
 599         xfs_mount_t     xfs_m;
 600         struct xlog     log = {0};
 601         char            *msgbuf;
 602         struct xfs_sb   psb;
 603         int             rval;
 604
 605         progname = basename(argv[0]);
 606         setlocale(LC_ALL, "");
 607         bindtextdomain(PACKAGE, LOCALEDIR);
 608         textdomain(PACKAGE);
 609         dinode_bmbt_translation_init();
 610
 611         temp_mp = &xfs_m;
 612         setbuf(stdout, NULL);
 613
 614         process_args(argc, argv);
 615         xfs_init(&x);
 616
 617         msgbuf = malloc(DURATION_BUF_SIZE);
 618
 619         timestamp(PHASE_START, 0, NULL);
 620         timestamp(PHASE_END, 0, NULL);
 621
 622         /* do phase1 to make sure we have a superblock */
 623         phase1(temp_mp);
 624         timestamp(PHASE_END, 1, NULL);
 625
 626         if (no_modify && primary_sb_modified)  {
 627                 do_warn(_("Primary superblock would have been modified.\n"
 628                           "Cannot proceed further in no_modify mode.\n"
 629                           "Exiting now.\n"));
 630                 exit(1);
 631         }
 632
 633         rval = get_sb(&psb, 0, XFS_MAX_SECTORSIZE, 0);
 634         if (rval != XR_OK) {
 635                 do_warn(_("Primary superblock bad after phase 1!\n"
 636                           "Exiting now.\n"));
 637                 exit(1);
 638         }
 639
 640         /* -f forces this, but let's be nice and autodetect it, as well. */
 641         if (!isa_file) {
 642                 int             fd = libxfs_device_to_fd(x.ddev);
 643                 struct stat64   statbuf;
 644
 645                 if (fstat64(fd, &statbuf) < 0)
 646                         do_warn(_("%s: couldn't stat \"%s\"\n"),
 647                                 progname, fs_name);
 648                 else if (S_ISREG(statbuf.st_mode))
 649                         isa_file = 1;
 650         }
 651
 652         /*
 653          * if the sector size of the filesystem we are trying to repair is
 654          * smaller than that of the underlying filesystem (i.e. we are repairing
 655          * an image), the we have to turn off direct IO because we cannot do IO
 656          * smaller than the host filesystem's sector size.
 657          */
 658         if (isa_file) {
 659                 int     fd = libxfs_device_to_fd(x.ddev);
 660                 struct xfs_fsop_geom_v1 geom = { 0 };
 661
 662                 if (ioctl(fd, XFS_IOC_FSGEOMETRY_V1, &geom) < 0) {
 663                         do_warn(_("Cannot get host filesystem geometry.\n"
 664                 "Repair may fail if there is a sector size mismatch between\n"
 665                 "the image and the host filesystem.\n"));
 666                         geom.sectsize = BBSIZE;
 667                 }
 668
 669                 if (psb.sb_sectsize < geom.sectsize) {
 670                         long    old_flags;
 671
 672                         old_flags = fcntl(fd, F_GETFL, 0);
 673                         if (fcntl(fd, F_SETFL, old_flags & ~O_DIRECT) < 0) {
 674                                 do_warn(_(
 675                 "Sector size on host filesystem larger than image sector size.\n"
 676                 "Cannot turn off direct IO, so exiting.\n"));
 677                                 exit(1);
 678                         }
 679                 }
 680         }
 681
 682         /*
 683          * Prepare the mount structure. Point the log reference to our local
 684          * copy so it's available to the various phases. The log bits are
 685          * initialized in phase 2.
 686          */
 687         memset(&xfs_m, 0, sizeof(xfs_mount_t));
 688         mp = libxfs_mount(&xfs_m, &psb, x.ddev, x.logdev, x.rtdev, 0);
 689
 690         if (!mp)  {
 691                 fprintf(stderr,
 692                         _("%s: cannot repair this filesystem.  Sorry.\n"),
 693                         progname);
 694                 exit(1);
 695         }
 696         mp->m_log = &log;
 697
 698         /* Spit out function & line on these corruption macros */
 699         if (verbose > 2)
 700                 mp->m_flags |= LIBXFS_MOUNT_WANT_CORRUPTED;
 701
 702         /*
 703          * set XFS-independent status vars from the mount/sb structure
 704          */
 705         glob_agcount = mp->m_sb.sb_agcount;
 706
 707         chunks_pblock = mp->m_sb.sb_inopblock / XFS_INODES_PER_CHUNK;
 708         max_symlink_blocks = libxfs_symlink_blocks(mp, MAXPATHLEN);
 709         inodes_per_cluster = MAX(mp->m_sb.sb_inopblock,
 710                         mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog);
 711
 712         /*
 713          * Automatic striding for high agcount filesystems.
 714          *
 715          * More AGs indicates that the filesystem is either large or can handle
 716          * more IO parallelism. Either way, we should try to process multiple
 717          * AGs at a time in such a configuration to try to saturate the
 718          * underlying storage and speed the repair process. Only do this if
 719          * prefetching is enabled.
 720          *
 721          * Given mkfs defaults for 16AGs for "multidisk" configurations, we want
 722          * to target these for an increase in thread count. Hence a stride value
 723          * of 15 is chosen to ensure we get at least 2 AGs being scanned at once
 724          * on such filesystems.
 725          *
 726          * Limit the maximum thread count based on the available CPU power that
 727          * is available. If we use too many threads, we might run out of memory
 728          * and CPU power before we run out of IO concurrency. We limit to 8
 729          * threads/CPU as this is enough threads to saturate a CPU on fast
 730          * devices, yet few enough that it will saturate but won't overload slow
 731          * devices.
 732          */
 733         if (!ag_stride && glob_agcount >= 16 && do_prefetch)
 734                 ag_stride = 15;
 735
 736         if (ag_stride) {
 737                 int max_threads = platform_nproc() * 8;
 738
 739                 thread_count = (glob_agcount + ag_stride - 1) / ag_stride;
 740                 while (thread_count > max_threads) {
 741                         ag_stride *= 2;
 742                         thread_count = (glob_agcount + ag_stride - 1) /
 743                                                                 ag_stride;
 744                 }
 745                 if (thread_count > 0)
 746                         thread_init();
 747                 else {
 748                         thread_count = 1;
 749                         ag_stride = 0;
 750                 }
 751         }
 752
 753         if (ag_stride && report_interval) {
 754                 init_progress_rpt();
 755                 if (msgbuf) {
 756                         do_log(_("        - reporting progress in intervals of %s\n"),
 757                         duration(report_interval, msgbuf));
 758                 }
 759         }
 760
 761         /*
 762          * Adjust libxfs cache sizes based on system memory,
 763          * filesystem size and inode count.
 764          *
 765          * We'll set the cache size based on 3/4s the memory minus
 766          * space used by the inode AVL tree and block usage map.
 767          *
 768          * Inode AVL tree space is approximately 4 bytes per inode,
 769          * block usage map is currently 1 byte for 2 blocks.
 770          *
 771          * We assume most blocks will be inode clusters.
 772          *
 773          * Calculations are done in kilobyte units.
 774          */
 775
 776         if (!bhash_option_used || max_mem_specified) {
 777                 unsigned long   mem_used;
 778                 unsigned long   max_mem;
 779                 struct rlimit   rlim;
 780
 781                 libxfs_bcache_purge();
 782                 cache_destroy(libxfs_bcache);
 783
 784                 mem_used = (mp->m_sb.sb_icount >> (10 - 2)) +
 785                                         (mp->m_sb.sb_dblocks >> (10 + 1)) +
 786                                         50000;  /* rough estimate of 50MB overhead */
 787                 max_mem = max_mem_specified ? max_mem_specified * 1024 :
 788                                                 libxfs_physmem() * 3 / 4;
 789
 790                 if (getrlimit(RLIMIT_AS, &rlim) != -1 &&
 791                                         rlim.rlim_cur != RLIM_INFINITY) {
 792                         rlim.rlim_cur = rlim.rlim_max;
 793                         setrlimit(RLIMIT_AS, &rlim);
 794                         /* use approximately 80% of rlimit to avoid overrun */
 795                         max_mem = MIN(max_mem, rlim.rlim_cur / 1280);
 796                 } else
 797                         max_mem = MIN(max_mem, (LONG_MAX >> 10) + 1);
 798
 799                 if (verbose > 1)
 800                         do_log(
 801         _("        - max_mem = %lu, icount = %" PRIu64 ", imem = %" PRIu64 ", dblock = %" PRIu64 ", dmem = %" PRIu64 "\n"),
 802                                 max_mem, mp->m_sb.sb_icount,
 803                                 mp->m_sb.sb_icount >> (10 - 2),
 804                                 mp->m_sb.sb_dblocks,
 805                                 mp->m_sb.sb_dblocks >> (10 + 1));
 806
 807                 if (max_mem <= mem_used) {
 808                         if (max_mem_specified) {
 809                                 do_abort(
 810         _("Required memory for repair is greater that the maximum specified\n"
 811           "with the -m option. Please increase it to at least %lu.\n"),
 812                                         mem_used / 1024);
 813                         }
 814                         do_warn(
 815         _("Memory available for repair (%luMB) may not be sufficient.\n"
 816           "At least %luMB is needed to repair this filesystem efficiently\n"
 817           "If repair fails due to lack of memory, please\n"),
 818                                 max_mem / 1024, mem_used / 1024);
 819                         if (do_prefetch)
 820                                 do_warn(
 821         _("turn prefetching off (-P) to reduce the memory footprint.\n"));
 822                         else
 823                                 do_warn(
 824         _("increase system RAM and/or swap space to at least %luMB.\n"),
 825                         mem_used * 2 / 1024);
 826
 827                         max_mem = mem_used;
 828                 }
 829
 830                 max_mem -= mem_used;
 831                 if (max_mem >= (1 << 30))
 832                         max_mem = 1 << 30;
 833                 libxfs_bhash_size = max_mem / (HASH_CACHE_RATIO *
 834                                 (mp->m_inode_cluster_size >> 10));
 835                 if (libxfs_bhash_size < 512)
 836                         libxfs_bhash_size = 512;
 837
 838                 if (verbose)
 839                         do_log(_("        - block cache size set to %d entries\n"),
 840                                 libxfs_bhash_size * HASH_CACHE_RATIO);
 841
 842                 libxfs_bcache = cache_init(0, libxfs_bhash_size,
 843                                                 &libxfs_bcache_operations);
 844         }
 845
 846         /*
 847          * calculate what mkfs would do to this filesystem
 848          */
 849         calc_mkfs(mp);
 850
 851         /*
 852          * initialize block alloc map
 853          */
 854         init_bmaps(mp);
 855         incore_ino_init(mp);
 856         incore_ext_init(mp);
 857
 858         /* initialize random globals now that we know the fs geometry */
 859         inodes_per_block = mp->m_sb.sb_inopblock;
 860
 861         if (parse_sb_version(&mp->m_sb))  {
 862                 do_warn(
 863         _("Found unsupported filesystem features.  Exiting now.\n"));
 864                 return(1);
 865         }
 866
 867         /* make sure the per-ag freespace maps are ok so we can mount the fs */
 868         phase2(mp, phase2_threads);
 869         timestamp(PHASE_END, 2, NULL);
 870
 871         if (do_prefetch)
 872                 init_prefetch(mp);
 873
 874         phase3(mp, phase2_threads);
 875         timestamp(PHASE_END, 3, NULL);
 876
 877         phase4(mp);
 878         timestamp(PHASE_END, 4, NULL);
 879
 880         if (no_modify)
 881                 printf(_("No modify flag set, skipping phase 5\n"));
 882         else {
 883                 phase5(mp);
 884         }
 885         timestamp(PHASE_END, 5, NULL);
 886
 887         /*
 888          * Done with the block usage maps, toss them...
 889          */
 890         free_bmaps(mp);
 891
 892         if (!bad_ino_btree)  {
 893                 phase6(mp);
 894                 timestamp(PHASE_END, 6, NULL);
 895
 896                 phase7(mp, phase2_threads);
 897                 timestamp(PHASE_END, 7, NULL);
 898         } else  {
 899                 do_warn(
 900 _("Inode allocation btrees are too corrupted, skipping phases 6 and 7\n"));
 901         }
 902
 903         if (lost_quotas && !have_uquotino && !have_gquotino && !have_pquotino) {
 904                 if (!no_modify)  {
 905                         do_warn(
 906 _("Warning:  no quota inodes were found.  Quotas disabled.\n"));
 907                 } else  {
 908                         do_warn(
 909 _("Warning:  no quota inodes were found.  Quotas would be disabled.\n"));
 910                 }
 911         } else if (lost_quotas)  {
 912                 if (!no_modify)  {
 913                         do_warn(
 914 _("Warning:  quota inodes were cleared.  Quotas disabled.\n"));
 915                 } else  {
 916                         do_warn(
 917 _("Warning:  quota inodes would be cleared.  Quotas would be disabled.\n"));
 918                 }
 919         } else  {
 920                 if (lost_uquotino)  {
 921                         if (!no_modify)  {
 922                                 do_warn(
 923 _("Warning:  user quota information was cleared.\n"
 924   "User quotas can not be enforced until limit information is recreated.\n"));
 925                         } else  {
 926                                 do_warn(
 927 _("Warning:  user quota information would be cleared.\n"
 928   "User quotas could not be enforced until limit information was recreated.\n"));
 929                         }
 930                 }
 931
 932                 if (lost_gquotino)  {
 933                         if (!no_modify)  {
 934                                 do_warn(
 935 _("Warning:  group quota information was cleared.\n"
 936   "Group quotas can not be enforced until limit information is recreated.\n"));
 937                         } else  {
 938                                 do_warn(
 939 _("Warning:  group quota information would be cleared.\n"
 940   "Group quotas could not be enforced until limit information was recreated.\n"));
 941                         }
 942                 }
 943
 944                 if (lost_pquotino)  {
 945                         if (!no_modify)  {
 946                                 do_warn(
 947 _("Warning:  project quota information was cleared.\n"
 948   "Project quotas can not be enforced until limit information is recreated.\n"));
 949                         } else  {
 950                                 do_warn(
 951 _("Warning:  project quota information would be cleared.\n"
 952   "Project quotas could not be enforced until limit information was recreated.\n"));
 953                         }
 954                 }
 955         }
 956
 957         if (ag_stride && report_interval)
 958                 stop_progress_rpt();
 959
 960         if (no_modify)  {
 961                 /*
 962                  * Warn if the current LSN is problematic and the log requires a
 963                  * reformat.
 964                  */
 965                 format_log_max_lsn(mp);
 966
 967                 do_log(
 968         _("No modify flag set, skipping filesystem flush and exiting.\n"));
 969                 if (verbose)
 970                         summary_report();
 971                 if (fs_is_dirty)
 972                         return(1);
 973
 974                 return(0);
 975         }
 976
 977         /*
 978          * Clear the quota flags if they're on.
 979          */
 980         sbp = libxfs_getsb(mp, 0);
 981         if (!sbp)
 982                 do_error(_("couldn't get superblock\n"));
 983
 984         dsb = XFS_BUF_TO_SBP(sbp);
 985
 986         if (be16_to_cpu(dsb->sb_qflags) & XFS_ALL_QUOTA_CHKD) {
 987                 do_warn(_("Note - quota info will be regenerated on next "
 988                         "quota mount.\n"));
 989                 dsb->sb_qflags &= cpu_to_be16(~XFS_ALL_QUOTA_CHKD);
 990         }
 991
 992         if (copied_sunit) {
 993                 do_warn(
 994 _("Note - stripe unit (%d) and width (%d) were copied from a backup superblock.\n"
 995   "Please reset with mount -o sunit=<value>,swidth=<value> if necessary\n"),
 996                         be32_to_cpu(dsb->sb_unit), be32_to_cpu(dsb->sb_width));
 997         }
 998
 999         libxfs_writebuf(sbp, 0);
1000
1001         /*
1002          * Done. Flush all cached buffers and inodes first to ensure all
1003          * verifiers are run (where we discover the max metadata LSN), reformat
1004          * the log if necessary and unmount.
1005          */
1006         libxfs_bcache_flush();
1007         format_log_max_lsn(mp);
1008         libxfs_umount(mp);
1009
1010         if (x.rtdev)
1011                 libxfs_device_close(x.rtdev);
1012         if (x.logdev && x.logdev != x.ddev)
1013                 libxfs_device_close(x.logdev);
1014         libxfs_device_close(x.ddev);
1015
1016         if (verbose)
1017                 summary_report();
1018         do_log(_("done\n"));
1019
1020         if (dangerously && !no_modify)
1021                 do_warn(
1022 _("Repair of readonly mount complete.  Immediate reboot encouraged.\n"));
1023
1024         pftrace_done();
1025
1026         free(msgbuf);
1027
1028         return (0);
1029 }