From: Vasek Sraier Date: Sun, 5 Sep 2021 18:41:55 +0000 (+0200) Subject: systemd: resetting failed kresd units on startup X-Git-Tag: v6.0.0a1~127 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d18421859478da7524df2d172c379aeebdf9da1d;p=thirdparty%2Fknot-resolver.git systemd: resetting failed kresd units on startup fix #16 --- diff --git a/manager/knot_resolver_manager/kres_manager.py b/manager/knot_resolver_manager/kres_manager.py index 5acddc959..3253ba510 100644 --- a/manager/knot_resolver_manager/kres_manager.py +++ b/manager/knot_resolver_manager/kres_manager.py @@ -192,7 +192,10 @@ class KresManager: return self._last_used_config_strict async def _instability_handler(self) -> None: - logger.error("Instability callback invoked. No idea how to react, performing suicide. See you later!") + logger.error( + "Instability callback invoked. Something is wrong, no idea how to react." + " Performing suicide. See you later!" + ) sys.exit(1) async def _watchdog(self) -> None: diff --git a/manager/knot_resolver_manager/kresd_controller/systemd/__init__.py b/manager/knot_resolver_manager/kresd_controller/systemd/__init__.py index 6fb456730..507df2055 100644 --- a/manager/knot_resolver_manager/kresd_controller/systemd/__init__.py +++ b/manager/knot_resolver_manager/kresd_controller/systemd/__init__.py @@ -128,12 +128,21 @@ class SystemdSubprocessController(SubprocessController): async def get_all_running_instances(self) -> Iterable[Subprocess]: res: List[SystemdSubprocess] = [] - units = await compat.asyncio.to_thread(systemd.list_unit_names, self._systemd_type) + units = await compat.asyncio.to_thread(systemd.list_units, self._systemd_type) for unit in units: - u: str = unit - if u.startswith("kresd") and u.endswith(".service"): - iden = u.replace("kresd", "")[1:].replace(".service", "") - persistance_type = SystemdPersistanceType.PERSISTENT if "@" in u else SystemdPersistanceType.TRANSIENT + if unit.name.startswith("kresd") and 
unit.name.endswith(".service"): + iden = unit.name.replace("kresd", "")[1:].replace(".service", "") + persistance_type = ( + SystemdPersistanceType.PERSISTENT if "@" in unit.name else SystemdPersistanceType.TRANSIENT + ) + + if unit.state == "failed": + # if a unit is failed, remove it from the system by resetting its state + # should work for both transient and persistent units + logger.warning("Unit '%s' is already failed, resetting its state and ignoring it", unit.name) + await compat.asyncio.to_thread(systemd.reset_failed_unit, self._systemd_type, unit.name) + continue + res.append( SystemdSubprocess( SubprocessType.KRESD, @@ -142,7 +151,10 @@ class SystemdSubprocessController(SubprocessController): persistance_type, ) ) - elif u == "kres-cache-gc.service": + elif unit.name == "kres-cache-gc.service": + # we can't easily check whether the unit is transient or not without an additional systemd call + # we ignore it for now and assume the default persistence state. It shouldn't cause any + # problems, because interactions with the process are done the same way in all cases res.append(SystemdSubprocess(SubprocessType.GC, alloc(), self._systemd_type)) return res diff --git a/manager/knot_resolver_manager/kresd_controller/systemd/dbus_api.py b/manager/knot_resolver_manager/kresd_controller/systemd/dbus_api.py index 8b0e364a5..8fa3e95e6 100644 --- a/manager/knot_resolver_manager/kresd_controller/systemd/dbus_api.py +++ b/manager/knot_resolver_manager/kresd_controller/systemd/dbus_api.py @@ -111,8 +111,9 @@ def list_unit_names(type_: SystemdType) -> List[str]: return [str(u[0]) for u in _list_units_internal(type_)] -def list_failed_unit_names(type_: SystemdType) -> List[str]: - return [str(u[0]) for u in _list_units_internal(type_) if str(u[3]) == "failed"] +def reset_failed_unit(typ: SystemdType, unit_name: str): + systemd = _create_manager_proxy(typ) + systemd.ResetFailedUnit(unit_name) def restart_unit(type_: SystemdType, unit_name: str):