From: Daan De Meyer Date: Fri, 17 Jun 2022 14:16:18 +0000 (-0400) Subject: machine: Add retries for ssh X-Git-Tag: v13~15^2~1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=05f07b3476a27b646262d1eaa51266cf812145e2;p=thirdparty%2Fmkosi.git machine: Add retries for ssh We've been seeing quite a few "connection refused" errors in CI. These are likely happening because sshd hasn't finished starting yet. The proper fix for this is to add notify socket support for systemd running qemu VMs via virtio sockets, but even if that's added, it will be a very long time before we can rely on it. For now, let's add a retry mechanism for SSH connections to make our CI setup more reliable. --- diff --git a/mkosi/machine.py b/mkosi/machine.py index 956cc03d3..3133f8b33 100644 --- a/mkosi/machine.py +++ b/mkosi/machine.py @@ -7,6 +7,7 @@ import os import signal import subprocess import sys +import time import unittest from textwrap import dedent from typing import Any, Iterator, Optional, Sequence, TextIO, Union @@ -170,7 +171,20 @@ class Machine: else: cmdline = run_shell_cmdline(self.args, pipe=True, commands=commands) - return run(cmdline, check=check, stdout=stdout, stderr=stderr, text=True, timeout=timeout) + # The retry logic only applies when running commands against a VM. + + for _ in range(0, 30): + try: + return run(cmdline, check=check, stdout=stdout, stderr=stderr, text=True, timeout=timeout) + except subprocess.CalledProcessError as e: + # Return code 255 is used for connection errors by ssh. + if self.args.verb != Verb.qemu or e.returncode != 255: + raise + + time.sleep(1) + + die("Failed to establish SSH connection") + def kill(self) -> None: self.__exit__(None, None, None)