git.ipfire.org Git - thirdparty/mkosi.git/commitdiff
qemu: Use systemd-run to allocate scopes
author Daan De Meyer <daan.j.demeyer@gmail.com>
Fri, 3 May 2024 21:33:58 +0000 (23:33 +0200)
committer Daan De Meyer <daan.j.demeyer@gmail.com>
Fri, 3 May 2024 22:33:04 +0000 (00:33 +0200)
This doesn't drastically change behavior, but will open the way for
adding a RuntimeProperties= setting to allow configuring various
properties of the scope unit.

Since allocating a scope with systemd-run involves communicating with
a daemon running on the host, there's no point in running it from the
sandbox, so we run it from the host instead.

Because systemd-run needs to run as the uid that started mkosi, we can't
use preexec_fn anymore to allocate the user namespace for virtiofsd.
Instead, we reimplement what become_root() does on top of unshare and
chain-execute into it, which in turn chain-executes virtiofsd.

mkosi/config.py
mkosi/qemu.py
mkosi/run.py
mkosi/sandbox.py
mkosi/user.py

index b1d01dd0d66d845da4bad44e7994fd9c8934e668..806da3d516d63ffdd9c5a9ba7c90470b54ad84d0 100644 (file)
@@ -1704,6 +1704,7 @@ class Config:
         scripts: Optional[Path] = None,
         mounts: Sequence[Mount] = (),
         options: Sequence[PathString] = (),
+        setup: Sequence[PathString] = (),
         extra: Sequence[PathString] = (),
     ) -> AbstractContextManager[list[PathString]]:
         mounts = [
@@ -1729,6 +1730,7 @@ class Config:
             tools=self.tools() if tools else Path("/"),
             mounts=mounts,
             options=options,
+            setup=setup,
             extra=extra,
         )
 
index 54121daf2b3333141f0eeaf19204bc012f0aa870..d89dd3e0e2df3c72255437f73157572292292adb 100644 (file)
@@ -45,7 +45,7 @@ from mkosi.run import SD_LISTEN_FDS_START, AsyncioThread, find_binary, fork_and_
 from mkosi.sandbox import Mount
 from mkosi.tree import copy_tree, rmtree
 from mkosi.types import PathString
-from mkosi.user import INVOKING_USER, become_root
+from mkosi.user import INVOKING_USER, become_root, become_root_cmd
 from mkosi.util import StrEnum, flock, flock_or_die, try_or
 from mkosi.versioncomp import GenericVersion
 
@@ -254,9 +254,18 @@ def find_ovmf_firmware(config: Config, firmware: QemuFirmware) -> Optional[OvmfC
 def start_swtpm(config: Config) -> Iterator[Path]:
     with tempfile.TemporaryDirectory(prefix="mkosi-swtpm") as state:
         # swtpm_setup is noisy and doesn't have a --quiet option so we pipe it's stdout to /dev/null.
-        run(["swtpm_setup", "--tpm-state", state, "--tpm2", "--pcr-banks", "sha256", "--config", "/dev/null"],
-            sandbox=config.sandbox(binary="swtpm_setup", mounts=[Mount(state, state)]),
-            stdout=None if ARG_DEBUG.get() else subprocess.DEVNULL)
+        run(
+            ["swtpm_setup", "--tpm-state", state, "--tpm2", "--pcr-banks", "sha256", "--config", "/dev/null"],
+            sandbox=config.sandbox(
+                binary="swtpm_setup",
+                mounts=[Mount(state, state)],
+            ),
+            scope=scope_cmd(
+                name=f"mkosi-swtpm-{config.machine_or_name()}",
+                description=f"swtpm for {config.machine_or_name()}",
+            ),
+            stdout=None if ARG_DEBUG.get() else subprocess.DEVNULL,
+        )
 
         cmdline = ["swtpm", "socket", "--tpm2", "--tpmstate", f"dir={state}"]
 
@@ -274,12 +283,6 @@ def start_swtpm(config: Config) -> Iterator[Path]:
                 pass_fds=(sock.fileno(),),
                 sandbox=config.sandbox(binary="swtpm", mounts=[Mount(state, state)]),
             ) as (proc, innerpid):
-                allocate_scope(
-                    config,
-                    name=f"mkosi-swtpm-{config.machine_or_name()}",
-                    pid=innerpid,
-                    description=f"swtpm for {config.machine_or_name()}",
-                )
                 yield path
                 kill(proc, innerpid, signal.SIGTERM)
 
@@ -338,6 +341,14 @@ def start_virtiofsd(config: Config, directory: PathString, *, name: str, selinux
 
         cmdline += ["--fd", str(SD_LISTEN_FDS_START)]
 
+        uid = gid = None
+        runas = []
+        if uidmap and os.getuid() != INVOKING_USER.uid:
+            uid = INVOKING_USER.uid
+            gid = INVOKING_USER.gid
+        elif not uidmap and os.getuid() != 0:
+            runas = become_root_cmd()
+
         with spawn(
             cmdline,
             pass_fds=(sock.fileno(),),
@@ -345,21 +356,21 @@ def start_virtiofsd(config: Config, directory: PathString, *, name: str, selinux
             # in the user namespace it spawns, so by specifying --uid 0 --gid 0 we'll get a userns with the current
             # uid/gid mapped to root in the userns. --cap-add=all is required to make virtiofsd work. Since it drops
             # capabilities itself, we don't bother figuring out the exact set of capabilities it needs.
-            user=INVOKING_USER.uid if uidmap else None,
-            group=INVOKING_USER.gid if uidmap else None,
-            preexec_fn=become_root if not uidmap else None,
+            user=uid,
+            group=gid,
             sandbox=config.sandbox(
                 binary=virtiofsd,
                 mounts=[Mount(directory, directory)],
                 options=["--uid", "0", "--gid", "0", "--cap-add", "all"],
+                setup=runas,
             ),
-        ) as (proc, innerpid):
-            allocate_scope(
-                config,
+            scope=scope_cmd(
                 name=f"mkosi-virtiofsd-{name}",
-                pid=innerpid,
                 description=f"virtiofsd for {directory}",
-            )
+                user=uid,
+                group=gid,
+            ),
+        ) as (proc, innerpid):
             yield path
             kill(proc, innerpid, signal.SIGTERM)
 
@@ -462,16 +473,14 @@ def start_journal_remote(config: Config, sockfd: int) -> Iterator[None]:
                     Mount(f.name, "/etc/systemd/journal-remote.conf"),
                 ],
             ),
-            user=config.forward_journal.parent.stat().st_uid if INVOKING_USER.invoked_as_root else None,
-            group=config.forward_journal.parent.stat().st_gid if INVOKING_USER.invoked_as_root else None,
-            foreground=False,
-        ) as (proc, innerpid):
-            allocate_scope(
-                config,
+            scope=scope_cmd(
                 name=f"mkosi-journal-remote-{config.machine_or_name()}",
-                pid=innerpid,
                 description=f"mkosi systemd-journal-remote for {config.machine_or_name()}",
-            )
+                user=config.forward_journal.parent.stat().st_uid if INVOKING_USER.invoked_as_root else None,
+                group=config.forward_journal.parent.stat().st_gid if INVOKING_USER.invoked_as_root else None,
+            ),
+            foreground=False,
+        ) as (proc, innerpid):
             yield
             kill(proc, innerpid, signal.SIGTERM)
 
@@ -676,47 +685,23 @@ def finalize_state(config: Config, cid: int) -> Iterator[None]:
             p.unlink(missing_ok=True)
 
 
-def allocate_scope(config: Config, *, name: str, pid: int, description: str) -> None:
-    if os.getuid() != 0 and "DBUS_SESSION_BUS_ADDRESS" not in os.environ:
-        return
-
-    if (
-        os.getuid() == 0 and
-        "DBUS_SYSTEM_ADDRESS" not in os.environ and
-        not Path("/run/dbus/system_bus_socket").exists()
-    ):
-        return
-
-    scope = run(
-        ["systemd-escape", "--mangle", f"{name}.scope"],
-        stdout=subprocess.PIPE,
-        foreground=False,
-    ).stdout.strip()
-
-    run(
-        [
-            "busctl",
-            "call",
-            "--system" if os.getuid() == 0 else "--user",
-            "--quiet",
-            "org.freedesktop.systemd1",
-            "/org/freedesktop/systemd1",
-            "org.freedesktop.systemd1.Manager",
-            "StartTransientUnit",
-            "ssa(sv)a(sa(sv))",
-            scope,
-            "fail",
-            "4",
-            "Description", "s", description,
-            "CollectMode", "s", "inactive-or-failed",
-            "PIDs", "au", "1", str(pid),
-            "AddRef", "b", "1",
-            "0",
-        ],
-        foreground=False,
-        env=os.environ | config.environment,
-        sandbox=config.sandbox(binary="busctl", relaxed=True),
-    )
+def scope_cmd(
+    name: str,
+    description: str,
+    user: Optional[int] = None,
+    group: Optional[int] = None,
+) -> list[str]:
+    return [
+        "systemd-run",
+        "--system" if os.getuid() == 0 else "--user",
+        *(["--quiet"] if not ARG_DEBUG.get() else []),
+        "--unit", name,
+        "--description", description,
+        "--scope",
+        "--collect",
+        *(["--uid", str(user)] if user is not None else []),
+        *(["--gid", str(group)] if group is not None else []),
+    ]
 
 
 def register_machine(config: Config, pid: int, fname: Path) -> None:
@@ -1147,6 +1132,7 @@ def run_qemu(args: Args, config: Config) -> None:
             sys.stderr.fileno(),
         )
 
+        name = f"mkosi-{config.machine_or_name().replace('_', '-')}"
         with spawn(
             cmdline,
             stdin=stdin,
@@ -1157,18 +1143,12 @@ def run_qemu(args: Args, config: Config) -> None:
             log=False,
             foreground=True,
             sandbox=config.sandbox(binary=None, network=True, devices=True, relaxed=True),
+            scope=scope_cmd(name=name, description=f"mkosi Virtual Machine {name}"),
         ) as (proc, innerpid):
             # We have to close these before we wait for qemu otherwise we'll deadlock as qemu will never exit.
             for fd in qemu_device_fds.values():
                 os.close(fd)
 
-            name = f"mkosi-{config.machine_or_name().replace('_', '-')}"
-            allocate_scope(
-                config,
-                name=name,
-                pid=innerpid,
-                description=f"mkosi Virtual Machine {name}",
-            )
             register_machine(config, innerpid, fname)
 
             if proc.wait() == 0 and (status := int(notifications.get("EXIT_STATUS", 0))):
index 064db069031460089c24e8439ffe8e03c6be7fa6..2a4bf3ae558bfdc13e85b2f246914af759818aa4 100644 (file)
@@ -140,6 +140,7 @@ def run(
     preexec_fn: Optional[Callable[[], None]] = None,
     success_exit_status: Sequence[int] = (0,),
     sandbox: AbstractContextManager[Sequence[PathString]] = contextlib.nullcontext([]),
+    scope: Sequence[str] = (),
 ) -> CompletedProcess:
     if input is not None:
         assert stdin is None  # stdin and input cannot be specified together
@@ -161,6 +162,7 @@ def run(
             preexec_fn=preexec_fn,
             success_exit_status=success_exit_status,
             sandbox=sandbox,
+            scope=scope,
             innerpid=False,
         ) as (process, _):
             out, err = process.communicate(input)
@@ -187,6 +189,7 @@ def spawn(
     preexec_fn: Optional[Callable[[], None]] = None,
     success_exit_status: Sequence[int] = (0,),
     sandbox: AbstractContextManager[Sequence[PathString]] = contextlib.nullcontext([]),
+    scope: Sequence[str] = (),
     innerpid: bool = True,
 ) -> Iterator[tuple[Popen, int]]:
     assert sorted(set(pass_fds)) == list(pass_fds)
@@ -215,6 +218,20 @@ def spawn(
     if "TMPDIR" in os.environ:
         env["TMPDIR"] = os.environ["TMPDIR"]
 
+    if scope:
+        if not find_binary("systemd-run"):
+            scope = []
+        elif os.getuid() != 0 and "DBUS_SESSION_BUS_ADDRESS" in os.environ and "XDG_RUNTIME_DIR" in os.environ:
+            env["DBUS_SESSION_BUS_ADDRESS"] = os.environ["DBUS_SESSION_BUS_ADDRESS"]
+            env["XDG_RUNTIME_DIR"] = os.environ["XDG_RUNTIME_DIR"]
+        elif os.getuid() == 0 and "DBUS_SYSTEM_ADDRESS" in os.environ:
+            env["DBUS_SYSTEM_ADDRESS"] = os.environ["DBUS_SYSTEM_ADDRESS"]
+        else:
+            scope = []
+
+    if scope:
+        user = group = None
+
     for e in ("SYSTEMD_LOG_LEVEL", "SYSTEMD_LOG_LOCATION"):
         if e in os.environ:
             env[e] = os.environ[e]
@@ -299,7 +316,7 @@ def spawn(
 
         try:
             with subprocess.Popen(
-                prefix + cmdline,
+                [*scope, *prefix, *cmdline],
                 stdin=stdin,
                 stdout=stdout,
                 stderr=stderr,
@@ -335,7 +352,7 @@ def spawn(
                         log_process_failure(prefix, cmdline, returncode)
                     if ARG_DEBUG_SHELL.get():
                         subprocess.run(
-                            [*prefix, "bash"],
+                            [*scope, *prefix, "bash"],
                             check=False,
                             stdin=sys.stdin,
                             text=True,
index a1c59dbb92503c99a0931f403cab1c75beee7fd9..c983948e4b0be15dccca87279f66052c8321651c 100644 (file)
@@ -46,7 +46,7 @@ class SandboxProtocol(Protocol):
     def __call__(
         self,
         *,
-        binary: Optional[PathString], 
+        binary: Optional[PathString],
         mounts: Sequence[Mount] = ()
     ) -> AbstractContextManager[list[PathString]]: ...
 
@@ -120,6 +120,7 @@ def sandbox_cmd(
     relaxed: bool = False,
     mounts: Sequence[Mount] = (),
     options: Sequence[PathString] = (),
+    setup: Sequence[PathString] = (),
     extra: Sequence[PathString] = (),
 ) -> Iterator[list[PathString]]:
     cmdline: list[PathString] = []
@@ -132,6 +133,7 @@ def sandbox_cmd(
         vartmp = None
 
     cmdline += [
+        *setup,
         "bwrap",
         *(
             ["--unshare-net"]
index 4489c216671b877769e3e228d5d7737544f5db2c..e94958504004dfda01cd1ceaaf55f9d894cbcc00 100644 (file)
@@ -205,3 +205,26 @@ def become_root() -> None:
 
     INVOKING_USER.uid = SUBRANGE - 100
     INVOKING_USER.gid = SUBRANGE - 100
+
+
+def become_root_cmd() -> list[str]:
+    if os.getuid() == 0:
+        return []
+
+    subuid = read_subrange(Path("/etc/subuid"))
+    subgid = read_subrange(Path("/etc/subgid"))
+
+    cmd = [
+        "unshare",
+        "--setuid", "0",
+        "--setgid", "0",
+        "--map-users",  f"0:{subuid}:{SUBRANGE - 100}",
+        "--map-users",  f"{SUBRANGE - 100}:{os.getuid()}:1",
+        "--map-users",  f"{SUBRANGE - 100 + 1}:{subuid + SUBRANGE - 100 + 1}:99",
+        "--map-groups", f"0:{subgid}:{SUBRANGE - 100}",
+        "--map-groups", f"{SUBRANGE - 100}:{os.getgid()}:1",
+        "--map-groups", f"{SUBRANGE - 100 + 1}:{subgid + SUBRANGE - 100 + 1}:99",
+        "--keep-caps",
+    ]
+
+    return [str(x) for x in cmd]