git.ipfire.org Git - thirdparty/knot-resolver.git/commitdiff
systemd: resetting failed kresd units on startup
author Vasek Sraier <git@vakabus.cz>
Sun, 5 Sep 2021 18:41:55 +0000 (20:41 +0200)
committer Aleš Mrázek <ales.mrazek@nic.cz>
Fri, 8 Apr 2022 14:17:52 +0000 (16:17 +0200)
fix #16

manager/knot_resolver_manager/kres_manager.py
manager/knot_resolver_manager/kresd_controller/systemd/__init__.py
manager/knot_resolver_manager/kresd_controller/systemd/dbus_api.py

index 5acddc959e0ab1ad9a81de54bf4aedc58e47447c..3253ba51037ea4f31173900e8c5f52e51e98b99e 100644 (file)
@@ -192,7 +192,10 @@ class KresManager:
         return self._last_used_config_strict
 
     async def _instability_handler(self) -> None:
-        logger.error("Instability callback invoked. No idea how to react, performing suicide. See you later!")
+        logger.error(
+            "Instability callback invoked. Something is wrong, no idea how to react."
+            " Performing suicide. See you later!"
+        )
         sys.exit(1)
 
     async def _watchdog(self) -> None:
index 6fb456730393ac70148de37500ad687425a2aad5..507df20551dad736454820812b7beb3a567ca6b1 100644 (file)
@@ -128,12 +128,21 @@ class SystemdSubprocessController(SubprocessController):
 
     async def get_all_running_instances(self) -> Iterable[Subprocess]:
         res: List[SystemdSubprocess] = []
-        units = await compat.asyncio.to_thread(systemd.list_unit_names, self._systemd_type)
+        units = await compat.asyncio.to_thread(systemd.list_units, self._systemd_type)
         for unit in units:
-            u: str = unit
-            if u.startswith("kresd") and u.endswith(".service"):
-                iden = u.replace("kresd", "")[1:].replace(".service", "")
-                persistance_type = SystemdPersistanceType.PERSISTENT if "@" in u else SystemdPersistanceType.TRANSIENT
+            if unit.name.startswith("kresd") and unit.name.endswith(".service"):
+                iden = unit.name.replace("kresd", "")[1:].replace(".service", "")
+                persistance_type = (
+                    SystemdPersistanceType.PERSISTENT if "@" in unit.name else SystemdPersistanceType.TRANSIENT
+                )
+
+                if unit.state == "failed":
+                    # if a unit is failed, remove it from the system by resetting its state
+                    # should work for both transient and persistent units
+                    logger.warning("Unit '%s' is already failed, resetting its state and ignoring it", unit.name)
+                    await compat.asyncio.to_thread(systemd.reset_failed_unit, self._systemd_type, unit.name)
+                    continue
+
                 res.append(
                     SystemdSubprocess(
                         SubprocessType.KRESD,
@@ -142,7 +151,10 @@ class SystemdSubprocessController(SubprocessController):
                         persistance_type,
                     )
                 )
-            elif u == "kres-cache-gc.service":
+            elif unit.name == "kres-cache-gc.service":
+                # we can't easily check, if the unit is transient or not without additional systemd call
+                # we ignore it for now and assume the default persistency state. It shouldn't cause any
+                # problems, because interactions with the process are done the same way in all cases
                 res.append(SystemdSubprocess(SubprocessType.GC, alloc(), self._systemd_type))
         return res
 
index 8b0e364a598b2d48d34aabc602e292bb709b7595..8fa3e95e63c295185fdd846ff4c681ceb269c24c 100644 (file)
@@ -111,8 +111,9 @@ def list_unit_names(type_: SystemdType) -> List[str]:
     return [str(u[0]) for u in _list_units_internal(type_)]
 
 
-def list_failed_unit_names(type_: SystemdType) -> List[str]:
-    return [str(u[0]) for u in _list_units_internal(type_) if str(u[3]) == "failed"]
+def reset_failed_unit(typ: SystemdType, unit_name: str):
+    systemd = _create_manager_proxy(typ)
+    systemd.ResetFailedUnit(unit_name)
 
 
 def restart_unit(type_: SystemdType, unit_name: str):