git.ipfire.org Git - thirdparty/xfsprogs-dev.git/commitdiff
xfs_scrub_all: survive systemd restarts when waiting for services
author Darrick J. Wong <djwong@kernel.org>
Fri, 12 Jan 2024 02:07:06 +0000 (18:07 -0800)
committer Darrick J. Wong <djwong@kernel.org>
Fri, 12 Jan 2024 02:08:47 +0000 (18:08 -0800)
If xfs_scrub_all detects a running systemd, it will use it to invoke
xfs_scrub subprocesses in a sandboxed and resource-controlled
environment.  Unfortunately, if you happen to restart dbus or systemd
while xfs_scrub_all is running, you get this:

systemd[1]: Reexecuting.
xfs_scrub_all[9958]: Warning! D-Bus connection terminated.
xfs_scrub_all[9956]: Warning! D-Bus connection terminated.
xfs_scrub_all[9956]: Failed to wait for response: Connection reset by peer
xfs_scrub_all[9958]: Failed to wait for response: Connection reset by peer
xfs_scrub_all[9930]: Scrubbing / done, (err=1)
xfs_scrub_all[9930]: Scrubbing /storage done, (err=1)

The xfs_scrub units themselves are still running; it's just that the
`systemctl start' command that xfs_scrub_all uses to start and wait for
the unit lost its connection to dbus and hence is no longer monitoring
sub-services.

When this happens, we don't have great options -- systemctl doesn't have
a command to wait on an activating (aka running) unit.  Instead, emulate
the waiting behavior that `systemctl start' normally provides by polling
the unit's failed/active status.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
scrub/xfs_scrub_all.in

index 671d588177a858c52a60d90393fc6e2e5fc4781d..ab9b491fb4e7d1e8d531b4ae0a72af97e7de25e1 100644 (file)
@@ -14,6 +14,7 @@ import time
 import sys
 import os
 import argparse
+from io import TextIOWrapper
 
 retcode = 0
 terminate = False
@@ -58,12 +59,18 @@ def find_mounts():
 
        return fs
 
-def kill_systemd(unit, proc):
-       '''Kill systemd unit.'''
-       proc.terminate()
-       cmd=['systemctl', 'stop', unit]
-       x = subprocess.Popen(cmd)
-       x.wait()
+def backtick(cmd):
+       '''Generator function that yields lines of a program's stdout.'''
+       p = subprocess.Popen(cmd, stdout = subprocess.PIPE)
+       for line in TextIOWrapper(p.stdout, encoding="utf-8"):
+               yield line.strip()
+
+def remove_killfunc(killfuncs, fn):
+       '''Ensure fn is not in killfuncs.'''
+       try:
+               killfuncs.remove(fn)
+       except:
+               pass
 
 def run_killable(cmd, stdout, killfuncs, kill_fn):
        '''Run a killable program.  Returns program retcode or -1 if we can't start it.'''
@@ -72,10 +79,7 @@ def run_killable(cmd, stdout, killfuncs, kill_fn):
                real_kill_fn = lambda: kill_fn(proc)
                killfuncs.add(real_kill_fn)
                proc.wait()
-               try:
-                       killfuncs.remove(real_kill_fn)
-               except:
-                       pass
+               remove_killfunc(killfuncs, real_kill_fn)
                return proc.returncode
        except:
                return -1
@@ -96,6 +100,56 @@ def path_to_serviceunit(path):
        except:
                return None
 
+def systemctl_stop(unitname):
+       '''Stop a systemd unit.'''
+       cmd = ['systemctl', 'stop', unitname]
+       x = subprocess.Popen(cmd)
+       x.wait()
+
+def systemctl_start(unitname, killfuncs):
+       '''Start a systemd unit and wait for it to complete.'''
+       stop_fn = None
+       cmd = ['systemctl', 'start', unitname]
+       try:
+               proc = subprocess.Popen(cmd, stdout = DEVNULL())
+               stop_fn = lambda: systemctl_stop(unitname)
+               killfuncs.add(stop_fn)
+               proc.wait()
+               ret = proc.returncode
+       except:
+               if stop_fn is not None:
+                       remove_killfunc(killfuncs, stop_fn)
+               return -1
+
+       if ret != 1:
+               remove_killfunc(killfuncs, stop_fn)
+               return ret
+
+       # If systemctl-start returns 1, it's possible that the service failed
+       # or that dbus/systemd restarted and the client program lost its
+       # connection -- according to the systemctl man page, 1 means "unit not
+       # failed".
+       #
+       # Either way, we switch to polling the service status to try to wait
+       # for the service to end.  As of systemd 249, the is-active command
+       # returns any of the following states: active, reloading, inactive,
+       # failed, activating, deactivating, or maintenance.  Apparently these
+       # strings are not localized.
+       while True:
+               try:
+                       for l in backtick(['systemctl', 'is-active', unitname]):
+                               if l == 'failed':
+                                       remove_killfunc(killfuncs, stop_fn)
+                                       return 1
+                               if l == 'inactive':
+                                       remove_killfunc(killfuncs, stop_fn)
+                                       return 0
+               except:
+                       remove_killfunc(killfuncs, stop_fn)
+                       return -1
+
+               time.sleep(1)
+
 def run_scrub(mnt, cond, running_devs, mntdevs, killfuncs):
        '''Run a scrub process.'''
        global retcode, terminate
@@ -110,9 +164,7 @@ def run_scrub(mnt, cond, running_devs, mntdevs, killfuncs):
                # Try it the systemd way
                unitname = path_to_serviceunit(path)
                if unitname is not None:
-                       cmd=['systemctl', 'start', unitname]
-                       ret = run_killable(cmd, DEVNULL(), killfuncs, \
-                                       lambda proc: kill_systemd(unitname, proc))
+                       ret = systemctl_start(unitname, killfuncs)
                        if ret == 0 or ret == 1:
                                print("Scrubbing %s done, (err=%d)" % (mnt, ret))
                                sys.stdout.flush()