]> git.ipfire.org Git - ipfire-2.x.git/blobdiff - src/scripts/repair-mdraid
core168: Add script to automatically repair MDRAID arrays
[ipfire-2.x.git] / src / scripts / repair-mdraid
diff --git a/src/scripts/repair-mdraid b/src/scripts/repair-mdraid
new file mode 100644 (file)
index 0000000..a622ff7
--- /dev/null
@@ -0,0 +1,169 @@
+#!/bin/bash
+###############################################################################
+#                                                                             #
+# IPFire.org - A linux based firewall                                         #
+# Copyright (C) 2022 IPFire Team  <info@ipfire.org>                           #
+#                                                                             #
+# This program is free software: you can redistribute it and/or modify        #
+# it under the terms of the GNU General Public License as published by        #
+# the Free Software Foundation, either version 3 of the License, or           #
+# (at your option) any later version.                                         #
+#                                                                             #
+# This program is distributed in the hope that it will be useful,             #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of              #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the               #
+# GNU General Public License for more details.                                #
+#                                                                             #
+# You should have received a copy of the GNU General Public License           #
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.       #
+#                                                                             #
+###############################################################################
+#
+# This script is supposed to repair any broken RAID installations
+# where the system has been booted from only one of the RAID devices
+# without the software RAID being activated first.
+#
+# This script does as follows:
+#
+# * It tries to find an inactive RAID called "ipfire:0"
+# * It will then destroy any devices that are still part of this RAID.
+#   This is required because if the RAID is being assembled correctly,
+#   data from the disk that has NOT been mounted will be replicated
+#   back to the device that has been changed. That causes that any
+#   data that has been written to the mounted disk will be lost.
+#   To avoid this, we will partially destroy the RAID.
+# * We will then erase any partition tables and destroy any filesystems
+#   on the devices so that they do not get accidentially mounted again.
+# * The system will then need to be rebooted where the RAID will be
+#   mounted again in a degraded state which might take some extra
+#   time at boot (the system stands still for about a minute).
+# * After the system has been booted up correctly, we will re-add
+#   the devices back to the RAID which will resync and the system
+#   will be back to its intended configuration.
+
+find_inactive_raid() {
+       local status
+       local device
+       local arg
+       local args
+
+       while read -r status device args; do
+               if [ "${status}" = "INACTIVE-ARRAY" ]; then
+                       for arg in ${args}; do
+                               case "${arg}" in
+                                       name=ipfire:0)
+                                               echo "${device}"
+                                               return 0
+                                               ;;
+                               esac
+                       done
+               fi
+       done <<< "$(mdadm --detail --scan)"
+
+       return 1
+}
+
+find_root() {
+       local device
+       local mp
+       local fs
+       local args
+
+       while read -r device mp fs args; do
+               if [ "${mp}" = "/" ]; then
+                       echo "${device:0:-1}"
+                       return 0
+               fi
+       done < /proc/mounts
+
+       return 1
+}
+
+find_raid_devices() {
+       local raid="${1}"
+
+       local IFS=,
+
+       local device
+       for device in $(mdadm -v --detail --scan "${raid}" | awk -F= '/^[ ]+devices/ { print $2 }'); do
+               echo "${device}"
+       done
+
+       return 0
+}
+
+destroy_everything() {
+       local device="${1}"
+       local part
+
+       # Destroy the RAID superblock
+       mdadm --zero-superblock "${device}"
+
+       # Wipe the partition table
+       wipefs -a "${device}"
+
+       # Wipe any partition signatures
+       for part in ${device}*; do
+               wipefs -a "${part}"
+       done
+}
+
+raid_rebuild() {
+       local devices=( "$@" )
+
+       cat > /etc/rc.d/rcsysinit.d/S99fix-raid <<EOF
+#!/bin/bash
+
+case "\${1}" in
+       start)
+               if [ -e "/dev/md/ipfire:0" ]; then
+                       for device in ${devices[@]}; do
+                               mdadm --add "/dev/md/ipfire:0" "\${device}"
+                       done
+
+                       # Delete this script
+                       rm "\${0}"
+               fi
+               ;;
+esac
+EOF
+
+       chmod a+x /etc/rc.d/rcsysinit.d/S99fix-raid
+}
+
+main() {
+       local raid="$(find_inactive_raid)"
+
+       # Nothing to do if no RAID device found
+       if [ -z "${raid}" ]; then
+               return 0
+       fi
+
+       echo "Fixing RAID ${raid}..."
+
+       local root="$(find_root)"
+
+       # Finding any devices in this RAID
+       local devices=(
+               $(find_raid_devices "${raid}")
+       )
+
+       # Stop the RAID
+       mdadm --stop "${raid}" &>/dev/null
+
+       # Destroy any useful data on all remaining RAID devices
+       local device
+       for device in ${devices[@]}; do
+               # Skip root
+               [ "${device}" = "${root}" ] && continue
+
+               destroy_everything "${device}"
+       done &>/dev/null
+
+       # Re-add devices to the RAID
+       raid_rebuild "${device}"
+
+       return 0
+}
+
+main "$@" || return $?