]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blame - scrub/xfs_scrub_all.in
xfs_scrub_all: survive systemd restarts when waiting for services
[thirdparty/xfsprogs-dev.git] / scrub / xfs_scrub_all.in
CommitLineData
9d50331a 1#!/usr/bin/python3
f1dca11c 2
# SPDX-License-Identifier: GPL-2.0-or-later
# Copyright (C) 2018-2024 Oracle.  All rights reserved.
#
# Author: Darrick J. Wong <djwong@kernel.org>

# Run online scrubbers in parallel, but avoid thrashing.
f1dca11c
DW
9
10import subprocess
11import json
12import threading
13import time
14import sys
824b5807 15import os
3dd91472 16import argparse
3abc6a0c 17from io import TextIOWrapper
f1dca11c
DW
18
# Process exit status.  Each scrub worker ORs its result into this value,
# and main() passes it to sys.exit().
retcode = 0
# Set to True when the user interrupts the program; workers check it before
# starting (or falling back to) a scrub so that no new work begins.
terminate = False
21
824b5807
DW
def DEVNULL():
    '''Return /dev/null in subprocess writable format.

    The subprocess.DEVNULL constant is used when the subprocess module
    provides it; otherwise a binary, write-mode handle on os.devnull is
    opened and returned instead.
    '''
    sink = getattr(subprocess, 'DEVNULL', None)
    if sink is None:
        sink = open(os.devnull, 'wb')
    return sink
29
f1dca11c
DW
def find_mounts():
    '''Map mountpoints to physical disks.

    Runs lsblk in JSON mode and walks the device tree it reports.  Returns a
    dict mapping the mountpoint of every mounted XFS filesystem to the set of
    kernel device names (KNAME) of the top-level lsblk entries it was found
    under.  The dict is empty when lsblk exits with a nonzero status.
    '''
    def find_xfs_mounts(bdev, fs, lastdisk):
        '''Attach lastdisk to each fs found under bdev.'''
        if bdev['fstype'] == 'xfs' and bdev['mountpoint'] is not None:
            fs.setdefault(bdev['mountpoint'], set()).add(lastdisk)
        for child in bdev.get('children', ()):
            find_xfs_mounts(child, fs, lastdisk)

    fs = {}
    cmd = ['lsblk', '-o', 'NAME,KNAME,TYPE,FSTYPE,MOUNTPOINT', '-J']
    result = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    # communicate() drains the pipe while it waits for the child.  Calling
    # wait() before reading can deadlock once lsblk has written enough to
    # fill the pipe buffer, and it leaves the pipe open afterwards.
    raw, _ = result.communicate()
    if result.returncode != 0:
        return fs
    bdevdata = json.loads(raw.decode(sys.stdout.encoding))

    # The lsblk output had better be in disks-then-partitions order
    for bdev in bdevdata['blockdevices']:
        find_xfs_mounts(bdev, fs, bdev['kname'])

    return fs
61
3abc6a0c
DW
def backtick(cmd):
    '''Generator function that yields lines of a program's stdout.

    cmd is an argument list for subprocess.Popen.  Each line is decoded as
    UTF-8 and yielded with surrounding whitespace stripped.

    The child is run inside a Popen context manager, so its pipe is closed
    and the process is reaped once the generator is exhausted or closed.
    Callers that poll in a loop therefore do not pile up zombie processes
    or open descriptors.
    '''
    with subprocess.Popen(cmd, stdout=subprocess.PIPE) as p:
        for line in TextIOWrapper(p.stdout, encoding="utf-8"):
            yield line.strip()
67
def remove_killfunc(killfuncs, fn):
    '''Ensure fn is not in killfuncs.

    Removing an entry that is already gone is not an error: the interrupt
    handler may have popped it first.  Only the "not present" exceptions
    (KeyError for sets, ValueError for lists) are ignored so that genuine
    programming errors still surface.
    '''
    try:
        killfuncs.remove(fn)
    except (KeyError, ValueError):
        pass
824b5807 74
f1dca11c
DW
def run_killable(cmd, stdout, killfuncs, kill_fn):
    '''Run a killable program.  Returns program retcode or -1 if we can't start it.

    cmd and stdout are handed to subprocess.Popen.  While the program runs,
    a zero-argument callable that invokes kill_fn(proc) is kept in the
    killfuncs set so that another thread can stop the program.  The callable
    is always taken out of the set again before this function returns, even
    if waiting for the program fails.
    '''
    real_kill_fn = None
    try:
        proc = subprocess.Popen(cmd, stdout=stdout)
        real_kill_fn = lambda: kill_fn(proc)
        killfuncs.add(real_kill_fn)
        return proc.wait()
    except Exception:
        return -1
    finally:
        # discard() tolerates the entry having been popped by the
        # interrupt handler in the meantime.
        if real_kill_fn is not None:
            killfuncs.discard(real_kill_fn)
86
# systemd doesn't like unit instance names with slashes in them, so it
# replaces them with dashes when it invokes the service.  Filesystem paths
# need a special --path argument so that dashes do not get mangled.
def path_to_serviceunit(path):
    '''Convert a pathname into a systemd service unit name.

    Asks systemd-escape to instantiate the scrub service template for path.
    Returns the first line of its output with surrounding whitespace
    stripped, or None if the command cannot be run, exits with a nonzero
    status, or prints nothing.
    '''
    cmd = ['systemd-escape', '--template', '@scrub_svcname@',
           '--path', path]
    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        # communicate() reads the pipe to EOF, reaps the child and closes
        # the descriptor.  wait() followed by a read leaves the pipe open.
        raw, _ = proc.communicate()
        if proc.returncode != 0:
            return None
        for line in raw.splitlines():
            return line.decode(sys.stdout.encoding).strip()
    except Exception:
        return None
    return None
07c6fd59 102
3abc6a0c
DW
def systemctl_stop(unitname):
    '''Stop a systemd unit.

    Runs "systemctl stop" on unitname and blocks until that command exits.
    The command's exit status is not examined.
    '''
    subprocess.Popen(['systemctl', 'stop', unitname]).wait()
108
def systemctl_start(unitname, killfuncs):
    '''Start a systemd unit and wait for it to complete.

    While the unit is being waited on, a callable that stops the unit is
    kept in killfuncs; it is removed again before returning.

    Returns the exit status of "systemctl start" when that is anything other
    than 1.  On status 1 the unit state is polled once a second and the
    result is 1 once the unit reports "failed" or 0 once it reports
    "inactive".  Returns -1 if running or polling systemctl raised.
    '''
    stop_fn = None
    try:
        starter = subprocess.Popen(['systemctl', 'start', unitname],
                                   stdout=DEVNULL())
        stop_fn = lambda: systemctl_stop(unitname)
        killfuncs.add(stop_fn)
        starter.wait()
        ret = starter.returncode
    except BaseException:
        if stop_fn is not None:
            remove_killfunc(killfuncs, stop_fn)
        return -1

    def finish(code):
        '''Unregister the stop callback and hand back code.'''
        remove_killfunc(killfuncs, stop_fn)
        return code

    if ret != 1:
        return finish(ret)

    # If systemctl-start returns 1, it's possible that the service failed
    # or that dbus/systemd restarted and the client program lost its
    # connection -- according to the systemctl man page, 1 means "unit not
    # failed".
    #
    # Either way, we switch to polling the service status to try to wait
    # for the service to end.  As of systemd 249, the is-active command
    # returns any of the following states: active, reloading, inactive,
    # failed, activating, deactivating, or maintenance.  Apparently these
    # strings are not localized.
    final_states = {'failed': 1, 'inactive': 0}
    while True:
        try:
            for state in backtick(['systemctl', 'is-active', unitname]):
                if state in final_states:
                    return finish(final_states[state])
        except BaseException:
            return finish(-1)

        time.sleep(1)
152
f1dca11c
DW
def run_scrub(mnt, cond, running_devs, mntdevs, killfuncs):
    '''Run a scrub process.

    Scrubs the filesystem mounted at mnt, first through its systemd service
    unit and, if that yields neither 0 nor 1, by invoking xfs_scrub
    directly.  The result is ORed into the global retcode.  Nothing new is
    started once the global terminate flag is set.

    On exit, for whatever reason, mntdevs is removed from running_devs and
    cond is notified so the scheduler can hand out more work.  killfuncs is
    passed through to the helpers that register stop callbacks in it.
    '''
    global retcode, terminate

    print("Scrubbing %s..." % mnt)
    sys.stdout.flush()

    try:
        if terminate:
            return

        # Try it the systemd way.  The unit name is derived from the
        # mountpoint we were given.
        unitname = path_to_serviceunit(mnt)
        if unitname is not None:
            ret = systemctl_start(unitname, killfuncs)
            if ret == 0 or ret == 1:
                print("Scrubbing %s done, (err=%d)" % (mnt, ret))
                sys.stdout.flush()
                retcode |= ret
                return

        if terminate:
            return

        # Invoke xfs_scrub manually
        cmd = ['@sbindir@/xfs_scrub']
        cmd += '@scrub_args@'.split()
        cmd += [mnt]
        ret = run_killable(cmd, None, killfuncs,
                           lambda proc: proc.terminate())
        if ret >= 0:
            print("Scrubbing %s done, (err=%d)" % (mnt, ret))
            sys.stdout.flush()
            retcode |= ret
            return

        if terminate:
            return

        print("Unable to start scrub tool.")
        sys.stdout.flush()
    finally:
        # Release our devices first so the scheduler sees them as free
        # when it wakes up.
        running_devs.difference_update(mntdevs)
        with cond:
            cond.notify()
199
def main():
    '''Find mounts, schedule scrub runs.

    Parses the command line (-V prints the version and exits), discovers
    mounted XFS filesystems, and starts one run_scrub thread per filesystem
    while never running two scrubs that share a device at the same time.
    Unless SERVICE_MODE is set in the environment, journalctl is started to
    tail the scrub service output while we work.

    A KeyboardInterrupt sets the global terminate flag, calls every
    registered kill function and stops scheduling.  The process exit status
    is the accumulated retcode, collapsed to 0/1 in service mode.
    '''
    global retcode

    parser = argparse.ArgumentParser(
        description="Scrub all mounted XFS filesystems.")
    parser.add_argument("-V", help="Report version and exit.",
                        action="store_true")
    args = parser.parse_args()

    if args.V:
        print("xfs_scrub_all version @pkg_version@")
        sys.exit(0)

    fs = find_mounts()

    # Tail the journal if we ourselves aren't a service...
    journalthread = None
    if 'SERVICE_MODE' not in os.environ:
        try:
            cmd = ['journalctl', '--no-pager', '-q', '-S', 'now',
                   '-f', '-u', 'xfs_scrub@*', '-o',
                   'cat']
            journalthread = subprocess.Popen(cmd)
        except OSError:
            # Best effort only: scrubbing works without the journal tail.
            pass

    running_devs = set()
    killfuncs = set()
    cond = threading.Condition()
    workers = []

    def start_scrub(mnt, devs):
        '''Claim devs and launch run_scrub for mnt on its own thread.'''
        running_devs.update(devs)
        worker = threading.Thread(
            target=run_scrub,
            args=(mnt, cond, running_devs, devs, killfuncs))
        workers.append(worker)
        worker.start()

    def kill_all():
        '''Flag termination and stop every scrub that is still running.'''
        global terminate
        terminate = True
        print("Terminating...")
        sys.stdout.flush()
        while killfuncs:
            killfuncs.pop()()

    # Schedule scrub jobs...
    #
    # The condition lock is held for the whole scheduling pass and only
    # given up inside cond.wait().  Workers must take the lock to notify,
    # so a worker that finishes while we are still scheduling blocks until
    # we are actually waiting, and its wakeup cannot be lost.
    with cond:
        while fs:
            if not running_devs:
                start_scrub(*fs.popitem())
            started = []
            for mnt, devs in fs.items():
                # start_scrub() claims devs right away, so later mounts in
                # this same pass see them as busy.
                if running_devs.isdisjoint(devs):
                    start_scrub(mnt, devs)
                    started.append(mnt)
            for mnt in started:
                del fs[mnt]
            try:
                cond.wait()
            except KeyboardInterrupt:
                kill_all()
                break

    # Wait for every scrub that was started.  Otherwise the exit status
    # below could be computed, and the journal tail stopped, while scrubs
    # are still running.
    for worker in workers:
        while worker.is_alive():
            try:
                worker.join()
            except KeyboardInterrupt:
                kill_all()

    if journalthread is not None:
        journalthread.terminate()

    # See the service mode comments in xfs_scrub.c for why we do this.
    if 'SERVICE_MODE' in os.environ:
        time.sleep(2)
        if retcode != 0:
            retcode = 1

    sys.exit(retcode)
277
# Run the scheduler only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()