]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/commitdiff
xfs_scrub_all: fix termination signal handling
authorDarrick J. Wong <djwong@kernel.org>
Fri, 12 Jan 2024 02:07:07 +0000 (18:07 -0800)
committerDarrick J. Wong <djwong@kernel.org>
Fri, 12 Jan 2024 02:08:47 +0000 (18:08 -0800)
Currently, xfs_scrub_all does not handle termination signals well.
SIGTERM and SIGINT are left to their default handlers, which are
immediate termination of the process group in the case of SIGTERM and
raising KeyboardInterrupt in the case of SIGINT.

Terminating the process group is fine when the xfs_scrub processes are
direct children, but this completely doesn't work if we're farming the
work out to systemd services since we don't terminate the child service.
Instead, they keep going.

Raising KeyboardInterrupt doesn't work because once the main thread
calls sys.exit at the bottom of main(), it blocks in the python runtime
waiting for child threads to terminate.  There's no longer any context
to handle an exception, so the signal is ignored and no child processes
are killed.

In other words, if you try to kill a running xfs_scrub_all, chances are
good it won't kill the child xfs_scrub processes.  This is undesirable
and egregious since we actually have the ability to track and kill all
the subprocesses that we create.

Solve the subproblem of getting stuck in the python runtime by calling
it repeatedly until we no longer have subprocesses.  This means that the
main thread loops until all threads have exited.

Solve the subproblem of the signals doing the wrong thing by setting up
our own signal handler that can wake up the main thread and initiate
subprocess shutdown, no matter whether the subprocesses are systemd
services or directly fork/exec'd.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
scrub/xfs_scrub_all.in

index 2c20b91fdbe5efd182ad0a8756aa30e4dc80db47..d0ab27fd30602735c385f2493a511dc3ef67ea5b 100644 (file)
@@ -14,6 +14,7 @@ import time
 import sys
 import os
 import argparse
+import signal
 from io import TextIOWrapper
 
 retcode = 0
@@ -196,6 +197,45 @@ def run_scrub(mnt, cond, running_devs, mntdevs, killfuncs):
                cond.notify()
                cond.release()
 
+def signal_scrubs(signum, cond):
+       '''Handle termination signals by killing xfs_scrub children.'''
+       global debug, terminate
+
+       if debug:
+               print('Signal handler called with signal', signum)
+               sys.stdout.flush()
+
+       terminate = True
+       cond.acquire()
+       cond.notify()
+       cond.release()
+
+def wait_for_termination(cond, killfuncs):
+       '''Wait for a child thread to terminate.  Returns True if we should
+       abort the program, False otherwise.'''
+       global debug, terminate
+
+       if debug:
+               print('waiting for threads to terminate')
+               sys.stdout.flush()
+
+       cond.acquire()
+       try:
+               cond.wait()
+       except KeyboardInterrupt:
+               terminate = True
+       cond.release()
+
+       if not terminate:
+               return False
+
+       print("Terminating...")
+       sys.stdout.flush()
+       while len(killfuncs) > 0:
+               fn = killfuncs.pop()
+               fn()
+       return True
+
 def main():
        '''Find mounts, schedule scrub runs.'''
        def thr(mnt, devs):
@@ -231,6 +271,10 @@ def main():
        running_devs = set()
        killfuncs = set()
        cond = threading.Condition()
+
+       signal.signal(signal.SIGINT, lambda s, f: signal_scrubs(s, cond))
+       signal.signal(signal.SIGTERM, lambda s, f: signal_scrubs(s, cond))
+
        while len(fs) > 0:
                if len(running_devs) == 0:
                        mnt, devs = fs.popitem()
@@ -250,18 +294,14 @@ def main():
                                thr(mnt, devs)
                for p in poppers:
                        fs.pop(p)
-               cond.acquire()
-               try:
-                       cond.wait()
-               except KeyboardInterrupt:
-                       terminate = True
-                       print("Terminating...")
-                       sys.stdout.flush()
-                       while len(killfuncs) > 0:
-                               fn = killfuncs.pop()
-                               fn()
-                       fs = []
-               cond.release()
+
+               # Wait for one thread to finish
+               if wait_for_termination(cond, killfuncs):
+                       break
+
+       # Wait for the rest of the threads to finish
+       while len(killfuncs) > 0:
+               wait_for_termination(cond, killfuncs)
 
        if journalthread is not None:
                journalthread.terminate()