From: Daan De Meyer Date: Fri, 3 May 2024 21:33:58 +0000 (+0200) Subject: qemu: Use systemd-run to allocate scopes X-Git-Tag: v23.1~75^2~2 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4d6ddaca962f9a820a5d14e9ae8a633e9192bac4;p=thirdparty%2Fmkosi.git qemu: Use systemd-run to allocate scopes This doesn't drastically change behavior, but will open the way for adding a RuntimeProperties= setting to allow configuring various properties of the scope unit. Since allocating a scope with systemd-run involves communicating with a daemon running on the host, there's no point in running it from the sandbox so we run it from the host instead. Because systemd-run needs to run as the uid that started mkosi, we can't use preexec_fn anymore to allocate the user namespace for virtiofsd. Instead, we reimplement what become_root() does on top of unshare and chain execute into that which then itself chain executes virtiofsd. --- diff --git a/mkosi/config.py b/mkosi/config.py index b1d01dd0d..806da3d51 100644 --- a/mkosi/config.py +++ b/mkosi/config.py @@ -1704,6 +1704,7 @@ class Config: scripts: Optional[Path] = None, mounts: Sequence[Mount] = (), options: Sequence[PathString] = (), + setup: Sequence[PathString] = (), extra: Sequence[PathString] = (), ) -> AbstractContextManager[list[PathString]]: mounts = [ @@ -1729,6 +1730,7 @@ class Config: tools=self.tools() if tools else Path("/"), mounts=mounts, options=options, + setup=setup, extra=extra, ) diff --git a/mkosi/qemu.py b/mkosi/qemu.py index 54121daf2..d89dd3e0e 100644 --- a/mkosi/qemu.py +++ b/mkosi/qemu.py @@ -45,7 +45,7 @@ from mkosi.run import SD_LISTEN_FDS_START, AsyncioThread, find_binary, fork_and_ from mkosi.sandbox import Mount from mkosi.tree import copy_tree, rmtree from mkosi.types import PathString -from mkosi.user import INVOKING_USER, become_root +from mkosi.user import INVOKING_USER, become_root, become_root_cmd from mkosi.util import StrEnum, flock, flock_or_die, try_or from 
mkosi.versioncomp import GenericVersion @@ -254,9 +254,18 @@ def find_ovmf_firmware(config: Config, firmware: QemuFirmware) -> Optional[OvmfC def start_swtpm(config: Config) -> Iterator[Path]: with tempfile.TemporaryDirectory(prefix="mkosi-swtpm") as state: # swtpm_setup is noisy and doesn't have a --quiet option so we pipe it's stdout to /dev/null. - run(["swtpm_setup", "--tpm-state", state, "--tpm2", "--pcr-banks", "sha256", "--config", "/dev/null"], - sandbox=config.sandbox(binary="swtpm_setup", mounts=[Mount(state, state)]), - stdout=None if ARG_DEBUG.get() else subprocess.DEVNULL) + run( + ["swtpm_setup", "--tpm-state", state, "--tpm2", "--pcr-banks", "sha256", "--config", "/dev/null"], + sandbox=config.sandbox( + binary="swtpm_setup", + mounts=[Mount(state, state)], + ), + scope=scope_cmd( + name=f"mkosi-swtpm-{config.machine_or_name()}", + description=f"swtpm for {config.machine_or_name()}", + ), + stdout=None if ARG_DEBUG.get() else subprocess.DEVNULL, + ) cmdline = ["swtpm", "socket", "--tpm2", "--tpmstate", f"dir={state}"] @@ -274,12 +283,6 @@ def start_swtpm(config: Config) -> Iterator[Path]: pass_fds=(sock.fileno(),), sandbox=config.sandbox(binary="swtpm", mounts=[Mount(state, state)]), ) as (proc, innerpid): - allocate_scope( - config, - name=f"mkosi-swtpm-{config.machine_or_name()}", - pid=innerpid, - description=f"swtpm for {config.machine_or_name()}", - ) yield path kill(proc, innerpid, signal.SIGTERM) @@ -338,6 +341,14 @@ def start_virtiofsd(config: Config, directory: PathString, *, name: str, selinux cmdline += ["--fd", str(SD_LISTEN_FDS_START)] + uid = gid = None + runas = [] + if uidmap and os.getuid() != INVOKING_USER.uid: + uid = INVOKING_USER.uid + gid = INVOKING_USER.gid + elif not uidmap and os.getuid() != 0: + runas = become_root_cmd() + with spawn( cmdline, pass_fds=(sock.fileno(),), @@ -345,21 +356,21 @@ def start_virtiofsd(config: Config, directory: PathString, *, name: str, selinux # in the user namespace it spawns, so by specifying 
--uid 0 --gid 0 we'll get a userns with the current # uid/gid mapped to root in the userns. --cap-add=all is required to make virtiofsd work. Since it drops # capabilities itself, we don't bother figuring out the exact set of capabilities it needs. - user=INVOKING_USER.uid if uidmap else None, - group=INVOKING_USER.gid if uidmap else None, - preexec_fn=become_root if not uidmap else None, + user=uid, + group=gid, sandbox=config.sandbox( binary=virtiofsd, mounts=[Mount(directory, directory)], options=["--uid", "0", "--gid", "0", "--cap-add", "all"], + setup=runas, ), - ) as (proc, innerpid): - allocate_scope( - config, + scope=scope_cmd( name=f"mkosi-virtiofsd-{name}", - pid=innerpid, description=f"virtiofsd for {directory}", - ) + user=uid, + group=gid, + ), + ) as (proc, innerpid): yield path kill(proc, innerpid, signal.SIGTERM) @@ -462,16 +473,14 @@ def start_journal_remote(config: Config, sockfd: int) -> Iterator[None]: Mount(f.name, "/etc/systemd/journal-remote.conf"), ], ), - user=config.forward_journal.parent.stat().st_uid if INVOKING_USER.invoked_as_root else None, - group=config.forward_journal.parent.stat().st_gid if INVOKING_USER.invoked_as_root else None, - foreground=False, - ) as (proc, innerpid): - allocate_scope( - config, + scope=scope_cmd( name=f"mkosi-journal-remote-{config.machine_or_name()}", - pid=innerpid, description=f"mkosi systemd-journal-remote for {config.machine_or_name()}", - ) + user=config.forward_journal.parent.stat().st_uid if INVOKING_USER.invoked_as_root else None, + group=config.forward_journal.parent.stat().st_gid if INVOKING_USER.invoked_as_root else None, + ), + foreground=False, + ) as (proc, innerpid): yield kill(proc, innerpid, signal.SIGTERM) @@ -676,47 +685,23 @@ def finalize_state(config: Config, cid: int) -> Iterator[None]: p.unlink(missing_ok=True) -def allocate_scope(config: Config, *, name: str, pid: int, description: str) -> None: - if os.getuid() != 0 and "DBUS_SESSION_BUS_ADDRESS" not in os.environ: - return - - 
def scope_cmd(
    name: str,
    description: str,
    user: Optional[int] = None,
    group: Optional[int] = None,
) -> list[str]:
    """Build the systemd-run argv prefix that wraps a command in a transient scope.

    The returned list is meant to be placed in front of the real command line.

    name:        value passed to systemd-run's --unit option.
    description: value passed to systemd-run's --description option.
    user:        if not None, passed (stringified) via --uid.
    group:       if not None, passed (stringified) via --gid.
    """
    argv = ["systemd-run"]

    # Select the manager by the uid we are currently running as.
    if os.getuid() == 0:
        argv.append("--system")
    else:
        argv.append("--user")

    # Only let systemd-run print its own messages when debugging is enabled.
    if not ARG_DEBUG.get():
        argv.append("--quiet")

    argv += [
        "--unit", name,
        "--description", description,
        "--scope",
        "--collect",
    ]

    if user is not None:
        argv += ["--uid", str(user)]
    if group is not None:
        argv += ["--gid", str(group)]

    return argv
for fd in qemu_device_fds.values(): os.close(fd) - name = f"mkosi-{config.machine_or_name().replace('_', '-')}" - allocate_scope( - config, - name=name, - pid=innerpid, - description=f"mkosi Virtual Machine {name}", - ) register_machine(config, innerpid, fname) if proc.wait() == 0 and (status := int(notifications.get("EXIT_STATUS", 0))): diff --git a/mkosi/run.py b/mkosi/run.py index 064db0690..2a4bf3ae5 100644 --- a/mkosi/run.py +++ b/mkosi/run.py @@ -140,6 +140,7 @@ def run( preexec_fn: Optional[Callable[[], None]] = None, success_exit_status: Sequence[int] = (0,), sandbox: AbstractContextManager[Sequence[PathString]] = contextlib.nullcontext([]), + scope: Sequence[str] = (), ) -> CompletedProcess: if input is not None: assert stdin is None # stdin and input cannot be specified together @@ -161,6 +162,7 @@ def run( preexec_fn=preexec_fn, success_exit_status=success_exit_status, sandbox=sandbox, + scope=scope, innerpid=False, ) as (process, _): out, err = process.communicate(input) @@ -187,6 +189,7 @@ def spawn( preexec_fn: Optional[Callable[[], None]] = None, success_exit_status: Sequence[int] = (0,), sandbox: AbstractContextManager[Sequence[PathString]] = contextlib.nullcontext([]), + scope: Sequence[str] = (), innerpid: bool = True, ) -> Iterator[tuple[Popen, int]]: assert sorted(set(pass_fds)) == list(pass_fds) @@ -215,6 +218,20 @@ def spawn( if "TMPDIR" in os.environ: env["TMPDIR"] = os.environ["TMPDIR"] + if scope: + if not find_binary("systemd-run"): + scope = [] + elif os.getuid() != 0 and "DBUS_SESSION_BUS_ADDRESS" in os.environ and "XDG_RUNTIME_DIR" in os.environ: + env["DBUS_SESSION_BUS_ADDRESS"] = os.environ["DBUS_SESSION_BUS_ADDRESS"] + env["XDG_RUNTIME_DIR"] = os.environ["XDG_RUNTIME_DIR"] + elif os.getuid() == 0 and "DBUS_SYSTEM_ADDRESS" in os.environ: + env["DBUS_SYSTEM_ADDRESS"] = os.environ["DBUS_SYSTEM_ADDRESS"] + else: + scope = [] + + if scope: + user = group = None + for e in ("SYSTEMD_LOG_LEVEL", "SYSTEMD_LOG_LOCATION"): if e in os.environ: 
def become_root_cmd() -> list[str]:
    """Return an unshare(1) command prefix that maps the caller to root in a new user namespace.

    Command-line counterpart of become_root(): the result is prepended to another
    command instead of changing the current process. Returns an empty list when
    already running as uid 0.
    """
    if os.getuid() == 0:
        return []

    subuid = read_subrange(Path("/etc/subuid"))
    subgid = read_subrange(Path("/etc/subgid"))

    # First ID above the block mapped from the sub-ID range; the invoking user/group is mapped to it.
    # NOTE(review): assumes read_subrange() and SUBRANGE are integers - confirm in mkosi/user.py.
    pivot = SUBRANGE - 100

    def mappings(option: str, base: int, current: int) -> list[str]:
        # Three ranges: [0, pivot) from the sub-ID range, pivot from the current ID,
        # and the 99 IDs following pivot from the sub-ID range again.
        return [
            option, f"0:{base}:{pivot}",
            option, f"{pivot}:{current}:1",
            option, f"{pivot + 1}:{base + pivot + 1}:99",
        ]

    return [
        "unshare",
        "--setuid", "0",
        "--setgid", "0",
        *mappings("--map-users", subuid, os.getuid()),
        *mappings("--map-groups", subgid, os.getgid()),
        "--keep-caps",
    ]