* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include <xfs/libxlog.h>
+#include "libxfs.h"
+#include "libxlog.h"
#include <sys/resource.h>
+#include "xfs_multidisk.h"
#include "avl.h"
#include "avl64.h"
#include "globals.h"
#include "threads.h"
#include "progress.h"
#include "dinode.h"
+#include "slab.h"
+#include "rmap.h"
#define rounddown(x, y) (((x)/(y))*(y))
static int bhash_option_used;
static long max_mem_specified; /* in megabytes */
static int phase2_threads = 32;
+static bool report_corrected;
static void
usage(void)
" -l logdev Specifies the device where the external log resides.\n"
" -m maxmem Maximum amount of memory to be used in megabytes.\n"
" -n No modify mode, just checks the filesystem for damage.\n"
+" (Cannot be used together with -e.)\n"
" -P Disables prefetching.\n"
" -r rtdev Specifies the device where the realtime section resides.\n"
" -v Verbose output.\n"
" -o subopts Override default behaviour, refer to man page.\n"
" -t interval Reporting interval in seconds.\n"
" -d Repair dangerously.\n"
+" -e Exit with a non-zero code if any errors were repaired.\n"
+" (Cannot be used together with -n.)\n"
" -V Reports version and exits.\n"), progname);
exit(1);
}
_("bad stripe width in superblock");
err_message[XR_BAD_SVN] =
_("bad shared version number in superblock");
+ err_message[XR_BAD_CRC] =
+ _("bad CRC in superblock");
+ err_message[XR_BAD_DIR_SIZE_DATA] =
+ _("inconsistent directory geometry information");
done = 1;
}
delete_attr_ok = 1;
force_geo = 0;
assume_xfs = 0;
- clear_sunit = 0;
+ copied_sunit = 0;
sb_inoalignmt = 0;
sb_unit = 0;
sb_width = 0;
- fs_attributes_allowed = 1;
- fs_attributes2_allowed = 1;
- fs_inode_nlink_allowed = 1;
- fs_quotas_allowed = 1;
- fs_aligned_inodes_allowed = 1;
- fs_sb_feature_bits_allowed = 1;
- fs_has_extflgbit_allowed = 1;
pre_65_beta = 0;
fs_shared_allowed = 1;
ag_stride = 0;
thread_count = 1;
report_interval = PROG_RPT_DEFAULT;
+ report_corrected = false;
/*
* XXX have to add suboption processing here
* attributes, quotas, nlinks, aligned_inos, sb_fbits
*/
- while ((c = getopt(argc, argv, "c:o:fl:m:r:LnDvVdPt:")) != EOF) {
+ while ((c = getopt(argc, argv, "c:o:fl:m:r:LnDvVdPet:")) != EOF) {
switch (c) {
case 'D':
dumpcore = 1;
while (*p != '\0') {
char *val;
- switch (getsubopt(&p, (constpp)o_opts, &val)) {
+ switch (getsubopt(&p, o_opts, &val)) {
case ASSUME_XFS:
if (val)
noval('o', o_opts, ASSUME_XFS);
while (*p) {
char *val;
- switch (getsubopt(&p, (constpp)c_opts, &val)) {
+ switch (getsubopt(&p, c_opts, &val)) {
case CONVERT_LAZY_COUNT:
lazy_count = (int)strtol(val, NULL, 0);
convert_lazy_count = 1;
case 't':
report_interval = (int)strtol(optarg, NULL, 0);
break;
+ case 'e':
+ report_corrected = true;
+ break;
case '?':
usage();
}
if ((fs_name = argv[optind]) == NULL)
usage();
+
+ if (report_corrected && no_modify)
+ usage();
}
void __attribute__((noreturn))
do_inoalign = mp->m_sinoalign;
/*
- * pre-calculate geometry of ag 0. We know what it looks
- * like because we know what mkfs does -- 3 btree roots,
- * and some number of blocks to prefill the agfl.
+ * Pre-calculate the geometry of ag 0. We know what it looks like
+ * because we know what mkfs does: 2 allocation btree roots (by block
+ * and by size), the inode allocation btree root, the free inode
+ * allocation btree root (if enabled) and some number of blocks to
+ * prefill the agfl.
+ *
+ * Because the current shape of the btrees may differ from the shape mkfs
+ * originally created, we open code the mkfs freelist block count here. mkfs
+ * creates single level trees, so the calculation is pretty straightforward
+ * for the trees that use the AGFL.
*/
bnobt_root = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
bcntbt_root = bnobt_root + 1;
inobt_root = bnobt_root + 2;
- fino_bno = inobt_root + XFS_MIN_FREELIST_RAW(1, 1, mp) + 1;
+ fino_bno = inobt_root + (2 * min(2, mp->m_ag_maxlevels)) + 1;
+ if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ fino_bno++;
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ fino_bno += min(2, mp->m_rmap_maxlevels); /* agfl blocks */
+ fino_bno++;
+ }
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ fino_bno++;
/*
* If the log is allocated in the first allocation group we need to
first_prealloc_ino = XFS_OFFBNO_TO_AGINO(mp, fino_bno, 0);
}
- ASSERT(XFS_IALLOC_BLOCKS(mp) > 0);
+ ASSERT(mp->m_ialloc_blks > 0);
- if (XFS_IALLOC_BLOCKS(mp) > 1)
+ if (mp->m_ialloc_blks > 1)
last_prealloc_ino = first_prealloc_ino + XFS_INODES_PER_CHUNK;
else
last_prealloc_ino = XFS_OFFBNO_TO_AGINO(mp, fino_bno + 1, 0);
}
+/*
+ * Metadata on v5 (CRC enabled) superblock filesystems records the LSN of its
+ * last modification, so the current LSN of the log must always move forward.
+ * Clearing the log resets the current LSN, which can leave the log behind
+ * parts of the on-disk filesystem and disrupt log recovery.
+ *
+ * The read verifiers have tracked the maximum LSN of every piece of metadata
+ * that was read in (libxfs_max_lsn). Compare that maximum against the log's
+ * current position; if the log is not strictly ahead, warn and, unless running
+ * in no-modify mode, reformat the log at a higher cycle number.
+ */
+static void
+format_log_max_lsn(
+	struct xfs_mount	*mp)
+{
+	struct xlog		*log = mp->m_log;
+	xfs_daddr_t		logstart;
+	xfs_daddr_t		logblocks;
+	int			logversion;
+	int			max_cycle;
+	int			max_block;
+	int			new_cycle;
+
+	/* Only filesystems with the CRC feature are checked here. */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	max_cycle = CYCLE_LSN(libxfs_max_lsn);
+	max_block = BLOCK_LSN(libxfs_max_lsn);
+
+	/*
+	 * Nothing to do when the log is already ahead of the highest metadata
+	 * LSN seen: either it is on a later cycle, or on the same cycle at a
+	 * later block.
+	 */
+	if (max_cycle < log->l_curr_cycle)
+		return;
+	if (max_cycle == log->l_curr_cycle && max_block < log->l_curr_block)
+		return;
+
+	/*
+	 * Moving to the next cycle should be enough, but bump by a few counts
+	 * to help cover any metadata LSNs that could have been missed.
+	 */
+	new_cycle = max_cycle + 3;
+
+	do_warn(_("Maximum metadata LSN (%d:%d) is ahead of log (%d:%d).\n"),
+		max_cycle, max_block, log->l_curr_cycle, log->l_curr_block);
+
+	/* In no-modify mode only report what would have been done. */
+	if (no_modify) {
+		do_warn(_("Would format log to cycle %d.\n"), new_cycle);
+		return;
+	}
+
+	do_warn(_("Format log to cycle %d.\n"), new_cycle);
+
+	/* Log geometry, taken from the superblock, for the reformat. */
+	logstart = XFS_FSB_TO_DADDR(mp, mp->m_sb.sb_logstart);
+	logblocks = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
+	if (xfs_sb_version_haslogv2(&mp->m_sb))
+		logversion = 2;
+	else
+		logversion = 1;
+
+	libxfs_log_clear(log->l_dev, NULL, logstart, logblocks,
+			 &mp->m_sb.sb_uuid, logversion, mp->m_sb.sb_logsunit,
+			 XLOG_FMT, new_cycle, true);
+}
+
+/*
+ * mkfs increases the AG count for "multidisk" configurations, and those are
+ * the filesystems we want to target for an increase in thread count. Inspect
+ * the superblock geometry information to decide whether mkfs considered this
+ * a multidisk configuration.
+ *
+ * Returns true when the AG count is at least XFS_MULTIDISK_AGCOUNT or when a
+ * stripe unit is set in the superblock; returns false otherwise.
+ */
+static bool
+is_multidisk_filesystem(
+	struct xfs_mount	*mp)
+{
+	struct xfs_sb		*sb = &mp->m_sb;
+
+	/*
+	 * High agcount filesystems are always considered "multidisk"; only
+	 * smaller ones need the stripe geometry looked at.
+	 */
+	if (sb->sb_agcount < XFS_MULTIDISK_AGCOUNT) {
+		/*
+		 * Without a sunit/swidth, mkfs didn't consider it a
+		 * multi-disk array, so we don't either.
+		 */
+		if (sb->sb_unit == 0)
+			return false;
+
+		/* A stripe unit is expected to come with a stripe width. */
+		ASSERT(sb->sb_width);
+	}
+
+	return true;
+}
+
+/*
+ * If the sector size of the filesystem we are trying to repair is smaller
+ * than that of the underlying filesystem (i.e. we are repairing an image),
+ * then we have to turn off direct IO because we cannot do IO smaller than
+ * the host filesystem's sector size.
+ *
+ * sb: superblock of the filesystem being repaired; only sb_sectsize is read.
+ *
+ * If the host geometry cannot be queried, a message is logged and the host
+ * sector size is taken to be BBSIZE. If O_DIRECT has to be cleared but the
+ * descriptor's status flags cannot be read or updated, a warning is printed
+ * and the program exits with status 1.
+ */
+static void
+check_fs_vs_host_sectsize(
+	struct xfs_sb	*sb)
+{
+	int	fd;
+	long	old_flags;
+	struct xfs_fsop_geom_v1	geom = { 0 };
+
+	fd = libxfs_device_to_fd(x.ddev);
+
+	if (ioctl(fd, XFS_IOC_FSGEOMETRY_V1, &geom) < 0) {
+		do_log(_("Cannot get host filesystem geometry.\n"
+ "Repair may fail if there is a sector size mismatch between\n"
+ "the image and the host filesystem.\n"));
+		geom.sectsize = BBSIZE;
+	}
+
+	if (sb->sb_sectsize < geom.sectsize) {
+		/*
+		 * F_GETFL returns -1 on failure. Masking O_DIRECT out of -1
+		 * and handing the result to F_SETFL would request almost
+		 * every other status flag, so a failed read of the flags is
+		 * treated the same as a failure to clear O_DIRECT.
+		 */
+		old_flags = fcntl(fd, F_GETFL, 0);
+		if (old_flags < 0 ||
+		    fcntl(fd, F_SETFL, old_flags & ~O_DIRECT) < 0) {
+			do_warn(_(
+ "Sector size on host filesystem larger than image sector size.\n"
+ "Cannot turn off direct IO, so exiting.\n"));
+			exit(1);
+		}
+	}
+}
+
int
main(int argc, char **argv)
{
xfs_dsb_t *dsb;
xfs_buf_t *sbp;
xfs_mount_t xfs_m;
+ struct xlog log = {0};
char *msgbuf;
+ struct xfs_sb psb;
+ int rval;
progname = basename(argv[0]);
setlocale(LC_ALL, "");
timestamp(PHASE_START, 0, NULL);
timestamp(PHASE_END, 0, NULL);
+ /* -f forces this, but let's be nice and autodetect it, as well. */
+ if (!isa_file) {
+ int fd = libxfs_device_to_fd(x.ddev);
+ struct stat statbuf;
+
+ if (fstat(fd, &statbuf) < 0)
+ do_warn(_("%s: couldn't stat \"%s\"\n"),
+ progname, fs_name);
+ else if (S_ISREG(statbuf.st_mode))
+ isa_file = 1;
+ }
+
+ if (isa_file) {
+ /* Best effort attempt to validate fs vs host sector size */
+ rval = get_sb(&psb, 0, XFS_MAX_SECTORSIZE, 0);
+ if (rval == XR_OK)
+ check_fs_vs_host_sectsize(&psb);
+ }
+
/* do phase1 to make sure we have a superblock */
phase1(temp_mp);
timestamp(PHASE_END, 1, NULL);
exit(1);
}
- /* prepare the mount structure */
- memset(&xfs_m, 0, sizeof(xfs_mount_t));
- libxfs_buftarg_init(&xfs_m, x.ddev, x.logdev, x.rtdev);
- sbp = libxfs_readbuf(xfs_m.m_ddev_targp, XFS_SB_DADDR,
- 1 << (XFS_MAX_SECTORSIZE_LOG - BBSHIFT), 0,
- &xfs_sb_buf_ops);
- libxfs_sb_from_disk(&xfs_m.m_sb, XFS_BUF_TO_SBP(sbp));
+ rval = get_sb(&psb, 0, XFS_MAX_SECTORSIZE, 0);
+ if (rval != XR_OK) {
+ do_warn(_("Primary superblock bad after phase 1!\n"
+ "Exiting now.\n"));
+ exit(1);
+ }
/*
- * if the sector size of the filesystem we are trying to repair is
- * smaller than that of the underlying filesystem (i.e. we are repairing
- * an image), the we have to turn off direct IO because we cannot do IO
- * smaller than the host filesystem's sector size.
+ * Now that we have completely validated the superblock, geometry may
+ * have changed; re-check geometry vs the host filesystem geometry
*/
- if (isa_file) {
- int fd = libxfs_device_to_fd(x.ddev);
- struct xfs_fsop_geom_v1 geom = { 0 };
-
- if (ioctl(fd, XFS_IOC_FSGEOMETRY_V1, &geom) < 0) {
- do_warn(_("Cannot get host filesystem geometry.\n"
- "Repair may fail if there is a sector size mismatch between\n"
- "the image and the host filesystem.\n"));
- geom.sectsize = BBSIZE;
- }
+ if (isa_file)
+ check_fs_vs_host_sectsize(&psb);
- if (xfs_m.m_sb.sb_sectsize < geom.sectsize) {
- long old_flags;
-
- old_flags = fcntl(fd, F_GETFL, 0);
- if (fcntl(fd, F_SETFL, old_flags & ~O_DIRECT) < 0) {
- do_warn(_(
- "Sector size on host filesystem larger than image sector size.\n"
- "Cannot turn off direct IO, so exiting.\n"));
- exit(1);
- }
- }
- }
- mp = libxfs_mount(&xfs_m, &xfs_m.m_sb, x.ddev, x.logdev, x.rtdev, 0);
+ /*
+ * Prepare the mount structure. Point the log reference to our local
+ * copy so it's available to the various phases. The log bits are
+ * initialized in phase 2.
+ */
+ memset(&xfs_m, 0, sizeof(xfs_mount_t));
+ mp = libxfs_mount(&xfs_m, &psb, x.ddev, x.logdev, x.rtdev, 0);
if (!mp) {
fprintf(stderr,
progname);
exit(1);
}
- libxfs_putbuf(sbp);
- libxfs_purgebuf(sbp);
+ mp->m_log = &log;
+
+ /* Spit out function & line on these corruption macros */
+ if (verbose > 2)
+ mp->m_flags |= LIBXFS_MOUNT_WANT_CORRUPTED;
/*
* set XFS-independent status vars from the mount/sb structure
glob_agcount = mp->m_sb.sb_agcount;
chunks_pblock = mp->m_sb.sb_inopblock / XFS_INODES_PER_CHUNK;
- max_symlink_blocks = libxfs_symlink_blocks(mp, MAXPATHLEN);
+ max_symlink_blocks = libxfs_symlink_blocks(mp, XFS_SYMLINK_MAXLEN);
inodes_per_cluster = MAX(mp->m_sb.sb_inopblock,
- XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
+ mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog);
/*
* Automatic striding for high agcount filesystems.
* threads/CPU as this is enough threads to saturate a CPU on fast
* devices, yet few enough that it will saturate but won't overload slow
* devices.
+ *
+ * Multidisk filesystems can handle more IO parallelism so we should try
+ * to process multiple AGs at a time in such a configuration to try to
+ * saturate the underlying storage and speed the repair process. Only do
+ * this if prefetching is enabled.
*/
- if (!ag_stride && glob_agcount >= 16 && do_prefetch)
- ag_stride = 15;
+ if (!ag_stride && do_prefetch && is_multidisk_filesystem(mp)) {
+ /*
+ * For small agcount multidisk systems, just double the
+ * parallelism. For larger AG count filesystems (32 and above)
+ * use more parallelism, and linearly increase the parallelism
+ * with the number of AGs.
+ */
+ ag_stride = min(glob_agcount, XFS_MULTIDISK_AGCOUNT / 2) - 1;
+ }
if (ag_stride) {
int max_threads = platform_nproc() * 8;
"with the -m option. Please increase it to at least %lu.\n"),
mem_used / 1024);
}
- do_warn(
+ do_log(
_("Memory available for repair (%luMB) may not be sufficient.\n"
"At least %luMB is needed to repair this filesystem efficiently\n"
"If repair fails due to lack of memory, please\n"),
max_mem / 1024, mem_used / 1024);
if (do_prefetch)
- do_warn(
+ do_log(
_("turn prefetching off (-P) to reduce the memory footprint.\n"));
else
- do_warn(
+ do_log(
_("increase system RAM and/or swap space to at least %luMB.\n"),
mem_used * 2 / 1024);
init_bmaps(mp);
incore_ino_init(mp);
incore_ext_init(mp);
+ rmaps_init(mp);
/* initialize random globals now that we know the fs geometry */
inodes_per_block = mp->m_sb.sb_inopblock;
if (do_prefetch)
init_prefetch(mp);
- phase3(mp);
+ phase3(mp, phase2_threads);
timestamp(PHASE_END, 3, NULL);
phase4(mp);
/*
* Done with the block usage maps, toss them...
*/
+ rmaps_free(mp);
free_bmaps(mp);
if (!bad_ino_btree) {
phase6(mp);
timestamp(PHASE_END, 6, NULL);
- phase7(mp);
+ phase7(mp, phase2_threads);
timestamp(PHASE_END, 7, NULL);
} else {
do_warn(
stop_progress_rpt();
if (no_modify) {
+ /*
+ * Warn if the current LSN is problematic and the log requires a
+ * reformat.
+ */
+ format_log_max_lsn(mp);
+
do_log(
_("No modify flag set, skipping filesystem flush and exiting.\n"));
if (verbose)
dsb->sb_qflags &= cpu_to_be16(~XFS_ALL_QUOTA_CHKD);
}
- if (clear_sunit) {
+ if (copied_sunit) {
do_warn(
-_("Note - stripe unit (%d) and width (%d) fields have been reset.\n"
- "Please set with mount -o sunit=<value>,swidth=<value>\n"),
+_("Note - stripe unit (%d) and width (%d) were copied from a backup superblock.\n"
+ "Please reset with mount -o sunit=<value>,swidth=<value> if necessary\n"),
be32_to_cpu(dsb->sb_unit), be32_to_cpu(dsb->sb_width));
- dsb->sb_unit = 0;
- dsb->sb_width = 0;
}
libxfs_writebuf(sbp, 0);
/*
- * Done, flush all cached buffers and inodes.
+ * Done. Flush all cached buffers and inodes first to ensure all
+ * verifiers are run (where we discover the max metadata LSN), reformat
+ * the log if necessary and unmount.
*/
libxfs_bcache_flush();
-
+ format_log_max_lsn(mp);
libxfs_umount(mp);
+
if (x.rtdev)
libxfs_device_close(x.rtdev);
if (x.logdev && x.logdev != x.ddev)
libxfs_device_close(x.logdev);
libxfs_device_close(x.ddev);
+ libxfs_destroy();
if (verbose)
summary_report();
pftrace_done();
+ free(msgbuf);
+
+ if (fs_is_dirty && report_corrected)
+ return (4);
return (0);
}