From 79ba1e15d80eba3aff4396f44629eb8960722d36 Mon Sep 17 00:00:00 2001 From: Srikanth C S Date: Tue, 13 Dec 2022 22:45:43 +0530 Subject: [PATCH] fsck.xfs: mount/umount xfs fs to replay log before running xfs_repair After a recent data center crash, we had to recover root filesystems on several thousand VMs via a boot-time fsck. Since these machines are remotely manageable, support can inject the kernel command line with 'fsck.mode=force fsck.repair=yes' to kick off xfs_repair if the machine won't come up or if they suspect there might be deeper issues with latent errors in the fs metadata, which is what they did to try to get everyone running ASAP while anticipating any future problems. However, fsck.xfs does not handle journal replay after a crash. fsck.xfs runs xfs_repair -e if fsck.mode=force is set. It is possible that when the machine crashes, the fs is in an inconsistent state with the journal log not yet replayed. This can drop the machine into the rescue shell because xfs_fsck.sh does not know how to clean the log. Since the administrator told us to force repairs, address the deficiency by replaying the log (via a mount/umount cycle) and rerunning xfs_repair. Run xfs_repair -e when fsck.mode=force and fsck.repair=auto or yes. Replay the log only if fsck.mode=force and fsck.repair=yes. For the other options, -fa and -f, drop to the rescue shell if repair detects any corruption. Signed-off-by: Srikanth C S Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fsck/xfs_fsck.sh | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/fsck/xfs_fsck.sh b/fsck/xfs_fsck.sh index 6af0f224d..62a1e0b39 100755 --- a/fsck/xfs_fsck.sh +++ b/fsck/xfs_fsck.sh @@ -31,10 +31,12 @@ repair2fsck_code() { AUTO=false FORCE=false +REPAIR=false while getopts ":aApyf" c do case $c in - a|A|p|y) AUTO=true;; + a|A|p) AUTO=true;; + y) REPAIR=true;; f) FORCE=true;; esac done @@ -64,7 +66,32 @@ fi if $FORCE; then xfs_repair -e $DEV - repair2fsck_code $? + error=$? 
+ if [ $error -eq 2 ] && [ $REPAIR = true ]; then + echo "Replaying log for $DEV" + mkdir -p /tmp/repair_mnt || exit 1 + for x in $(cat /proc/cmdline); do + case $x in + root=*) + ROOT="${x#root=}" + ;; + rootflags=*) + ROOTFLAGS="-o ${x#rootflags=}" + ;; + esac + done + test -b "$ROOT" || ROOT=$(blkid -t "$ROOT" -o device) + if [ $(basename $DEV) = $(basename $ROOT) ]; then + mount $DEV /tmp/repair_mnt $ROOTFLAGS || exit 1 + else + mount $DEV /tmp/repair_mnt || exit 1 + fi + umount /tmp/repair_mnt + xfs_repair -e $DEV + error=$? + rm -d /tmp/repair_mnt + fi + repair2fsck_code $error exit $? fi -- 2.39.2