]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
nspawn: support forwarding FDs from payloads to managers
authorLuca Boccassi <luca.boccassi@gmail.com>
Fri, 1 May 2026 13:06:11 +0000 (14:06 +0100)
committerLuca Boccassi <luca.boccassi@gmail.com>
Fri, 15 May 2026 12:46:08 +0000 (13:46 +0100)
When there is a NOTIFY_SOCKET, and FDs are received from the
payload following the FD Store protocol, forward them up the
chain to the service manager that is managing nspawn.

This allows FD Store persistence across container restarts,
and can chain up for user managers as well to survive restarting
those, or reexecs, and in the future reboots too via LUO.

Add a new test case to exercise the PID1 -> user session -> nspawn -> payload
chain.

man/systemd-nspawn.xml
mkosi/mkosi.extra/usr/lib/systemd/system/systemd-nspawn@.service.d/fdstore.conf [new file with mode: 0644]
mkosi/mkosi.extra/usr/lib/systemd/system/user@.service.d/fdstore.conf [new file with mode: 0644]
mkosi/mkosi.images/minimal-base/mkosi.postinst
mkosi/mkosi.initrd.conf/mkosi.extra/usr/lib/systemd/system/systemd-nspawn@.service.d/fdstore.conf [new file with mode: 0644]
mkosi/mkosi.initrd.conf/mkosi.extra/usr/lib/systemd/system/user@.service.d/fdstore.conf [new file with mode: 0644]
src/nspawn/nspawn.c
src/test/meson.build
src/test/test-fdstore.c [new file with mode: 0644]
test/units/TEST-13-NSPAWN.unpriv.sh

index c93fad377627e406bad04c96376f3799a0a060ae..f8b82a4a6ef521b9eac8e0961ec2f46e3bf00906 100644 (file)
         <citerefentry><refentrytitle>systemd-vmspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>
         that defaults to true.)</para>
 
+        <para>If <command>systemd-nspawn</command> itself is invoked with a <varname>$NOTIFY_SOCKET</varname>
+        set in its environment (i.e. it is itself supervised by a service manager that uses the
+        <citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>
+        protocol), <literal>FDSTORE=1</literal> and <literal>FDSTOREREMOVE=1</literal> messages received
+        from the container payload (along with any accompanying file descriptors and
+        <literal>FDNAME=</literal> tag) are forwarded one level up to the enveloping service manager. This
+        allows the file descriptor store of services running inside the container to be preserved across
+        container restarts (and, transitively, across restarts, re-execs, soft-reboots and LUO-based kexecs
+        of any outer service manager), provided
+        <varname>FileDescriptorStoreMax=</varname>/<varname>FileDescriptorStorePreserve=yes</varname> are
+        configured on the unit running <command>systemd-nspawn</command>. See the
+        <ulink url="https://systemd.io/FILE_DESCRIPTOR_STORE">File Descriptor Store</ulink> documentation
+        for details.</para>
+
         <xi:include href="version-info.xml" xpointer="v231"/></listitem>
       </varlistentry>
 
diff --git a/mkosi/mkosi.extra/usr/lib/systemd/system/systemd-nspawn@.service.d/fdstore.conf b/mkosi/mkosi.extra/usr/lib/systemd/system/systemd-nspawn@.service.d/fdstore.conf
new file mode 100644 (file)
index 0000000..3b023f7
--- /dev/null
@@ -0,0 +1,3 @@
+[Service]
+FileDescriptorStoreMax=16
+FileDescriptorStorePreserve=yes
diff --git a/mkosi/mkosi.extra/usr/lib/systemd/system/user@.service.d/fdstore.conf b/mkosi/mkosi.extra/usr/lib/systemd/system/user@.service.d/fdstore.conf
new file mode 100644 (file)
index 0000000..8a0b417
--- /dev/null
@@ -0,0 +1,5 @@
+# For tests exercising the fd store we need the unit in the rootfs to have these
+# settings, or the fdstore content will be dropped in the initrd -> rootfs transition
+[Service]
+FileDescriptorStoreMax=20
+FileDescriptorStorePreserve=yes
index 6feaebc19a33fca1a4cd818641165f6ce0b7fc6a..ba3f4aec31d425450ef0697c30f6e6ea6596b501 100755 (executable)
@@ -16,3 +16,8 @@ chmod +x "$BUILDROOT/sbin/init"
 if [ ! -e "$BUILDROOT/etc/os-release" ]; then
     ln -s ../usr/lib/os-release "$BUILDROOT/etc/os-release"
 fi
+
+# For use in the minimal containers, only needs libsystemd and libc
+if [[ -x "$BUILDDIR/test-fdstore" ]]; then
+    cp "$BUILDDIR/test-fdstore" "$BUILDROOT/usr/bin/test-fdstore"
+fi
diff --git a/mkosi/mkosi.initrd.conf/mkosi.extra/usr/lib/systemd/system/systemd-nspawn@.service.d/fdstore.conf b/mkosi/mkosi.initrd.conf/mkosi.extra/usr/lib/systemd/system/systemd-nspawn@.service.d/fdstore.conf
new file mode 100644 (file)
index 0000000..3b023f7
--- /dev/null
@@ -0,0 +1,3 @@
+[Service]
+FileDescriptorStoreMax=16
+FileDescriptorStorePreserve=yes
diff --git a/mkosi/mkosi.initrd.conf/mkosi.extra/usr/lib/systemd/system/user@.service.d/fdstore.conf b/mkosi/mkosi.initrd.conf/mkosi.extra/usr/lib/systemd/system/user@.service.d/fdstore.conf
new file mode 100644 (file)
index 0000000..311fedd
--- /dev/null
@@ -0,0 +1,5 @@
+# For tests exercising the FD store we need the unit in the initrd to have these
+# settings, or the fdstore content will be dropped in the initrd -> rootfs transition
+[Service]
+FileDescriptorStoreMax=20
+FileDescriptorStorePreserve=yes
index 0e532cf7b069910f571d0f473ed8d5f4b4775a4e..16ea48eaf885027e5791f32e7313102895247690 100644 (file)
@@ -3782,9 +3782,13 @@ static int setup_notify_child(const void *directory) {
         if (r < 0)
                 log_debug_errno(r, "Failed to enable SO_PASSPIDFD, ignoring: %m");
 
-        r = setsockopt_int(fd, SOL_SOCKET, SO_PASSRIGHTS, false);
-        if (r < 0)
-                log_debug_errno(r, "Failed to turn off SO_PASSRIGHTS, ignoring: %m");
+        /* Only allow the container payload to pass file descriptors to us if we ourselves are
+         * supervised by a service manager that enabled the FD store. */
+        if (!fdstore_detected()) {
+                r = setsockopt_int(fd, SOL_SOCKET, SO_PASSRIGHTS, false);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to turn off SO_PASSRIGHTS, ignoring: %m");
+        }
 
         return TAKE_FD(fd);
 }
@@ -4613,6 +4617,76 @@ static int setup_uid_map(
         return 0;
 }
 
+/* Forwards fd-store related notifications received from the container payload to our own service
+ * manager.
+ *
+ * 'tags' is the list of "KEY=VALUE" assignments of one notification message, 'fds' the file descriptors
+ * that were attached to it (may be empty). Only FDSTOREREMOVE=1 and FDSTORE=1 messages are acted upon,
+ * everything else is silently ignored.
+ *
+ * Returns 0 if nothing had to be done or the message was forwarded, and a negative errno-style value if
+ * the message was malformed or forwarding failed. All failures are logged here, so callers may treat
+ * this as best-effort and ignore the return value. */
+static int forward_fd_store(char **tags, FDSet *fds) {
+        int r;
+
+        /* Forward fd-store related messages to our own service manager, so that file descriptors stored
+         * by the inner payload propagate up the chain and are preserved across restarts. Skip entirely
+         * if we have no upstream supervisor (no NOTIFY_SOCKET) or no fd store available (no FDSTORE).
+         *
+         * Forwarded entries are namespaced with a "payload-" prefix on their FDNAME so that they
+         * cannot collide with fd-store entries that nspawn itself might want to push to its own
+         * upstream supervisor (the container payload and nspawn share a single upstream fdstore
+         * namespace, since there's only one init system per container).
+         *
+         * Names are always validated in their prefixed form, i.e. exactly as they are sent upstream:
+         * FDNAME= values are limited in length, hence a name that is valid on its own may become
+         * invalid once the prefix is added, in which case the upstream manager would ignore it and the
+         * namespacing would be lost. */
+        if (!getenv("NOTIFY_SOCKET") || !fdstore_detected())
+                return 0;
+
+        if (strv_contains(tags, "FDSTOREREMOVE=1")) {
+                const char *fdname = strv_find_startswith(tags, "FDNAME=");
+                if (!fdname)
+                        return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                 "Got FDSTOREREMOVE=1 from container payload without FDNAME=, ignoring.");
+
+                _cleanup_free_ char *upstream_name = NULL;
+                if (asprintf(&upstream_name, "payload-%s", fdname) < 0)
+                        return log_oom();
+
+                if (!fdname_is_valid(upstream_name))
+                        return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                 "Got FDSTOREREMOVE=1 from container payload with invalid FDNAME='%s', ignoring.",
+                                                 fdname);
+
+                r = sd_notifyf(/* unset_environment= */ false,
+                               "FDSTOREREMOVE=1\nFDNAME=%s", upstream_name);
+                if (r < 0)
+                        return log_warning_errno(r, "Failed to forward FDSTOREREMOVE upstream, ignoring: %m");
+        } else if (strv_contains(tags, "FDSTORE=1")) {
+                if (fdset_isempty(fds)) {
+                        log_debug("Got FDSTORE=1 from container payload without any attached file descriptors, ignoring.");
+                        return 0;
+                }
+
+                _cleanup_free_ int *fds_array = NULL;
+                int n;
+
+                n = fdset_to_array(fds, &fds_array);
+                if (n < 0)
+                        return log_warning_errno(n, "Failed to convert fdset to array, ignoring FDSTORE forward: %m");
+
+                /* If the payload sent no name, or one that is not usable upstream, fall back to a
+                 * generic name that still carries our prefix. */
+                const char *fdname = strv_find_startswith(tags, "FDNAME="), *name = "payload-stored";
+                _cleanup_free_ char *upstream_name = NULL;
+
+                if (fdname) {
+                        if (asprintf(&upstream_name, "payload-%s", fdname) < 0)
+                                return log_oom();
+
+                        if (fdname_is_valid(upstream_name))
+                                name = upstream_name;
+                        else
+                                log_warning("Got FDSTORE=1 from container payload with invalid FDNAME='%s', ignoring name.", fdname);
+                }
+
+                /* Pass FDPOLL=0 along if the payload requested it */
+                bool fdpoll_off = strv_contains(tags, "FDPOLL=0");
+                _cleanup_free_ char *msg = NULL;
+
+                if (asprintf(&msg, "FDSTORE=1\nFDNAME=%s%s",
+                             name,
+                             fdpoll_off ? "\nFDPOLL=0" : "") < 0)
+                        return log_oom();
+
+                r = sd_pid_notify_with_fds(
+                                0,
+                                /* unset_environment= */ false,
+                                msg,
+                                fds_array,
+                                (unsigned) n);
+                if (r < 0)
+                        return log_warning_errno(r, "Failed to forward FDSTORE upstream, ignoring: %m");
+        }
+
+        return 0;
+}
+
 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
         PidRef *inner_child_pid = ASSERT_PTR(userdata);
         int r;
@@ -4621,7 +4695,8 @@ static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t r
 
         _cleanup_(pidref_done) PidRef sender_pid = PIDREF_NULL;
         _cleanup_strv_free_ char **tags = NULL;
-        r = notify_recv_strv(fd, &tags, /* ret_ucred= */ NULL, &sender_pid);
+        _cleanup_(fdset_freep) FDSet *fds = NULL;
+        r = notify_recv_with_fds_strv(fd, &tags, /* ret_ucred= */ NULL, &sender_pid, &fds);
         if (r == -EAGAIN)
                 return 0;
         if (r < 0)
@@ -4656,6 +4731,8 @@ static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t r
                         (void) sd_notifyf(/* unset_environment= */ false, "STATUS=Container running.");
         }
 
+        (void) forward_fd_store(tags, fds);
+
         return 0;
 }
 
index faf075c10326ed15faa62dfbf3ae0abcda46d1d2..587a5c6159cd0bf98d355497f6ab91ad41fa2218 100644 (file)
@@ -323,6 +323,11 @@ executables += [
                 'sources' : files('test-fd-util.c'),
                 'dependencies' : libseccomp_cflags,
         },
+        test_template + {
+                'sources' : files('test-fdstore.c'),
+                'link_with' : libsystemd,
+                'type' : 'manual',
+        },
         test_template + {
                 'sources' : files(
                         'test-hashmap.c',
diff --git a/src/test/test-fdstore.c b/src/test/test-fdstore.c
new file mode 100644 (file)
index 0000000..6469d39
--- /dev/null
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* In 'store' mode pushes a couple of memfds with known content into the supervisor's fd store via FDSTORE=1
+ * sd_notify() messages. In 'check' mode reads back the fds passed via LISTEN_FDS and verifies the content
+ * matches what was pushed.
+ *
+ * This binary is intentionally linked against libsystemd only so that it can go in the minimal image. */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "sd-daemon.h"
+
+#define DATA_A "fdstore-data-a"
+#define DATA_B "fdstore-data-b"
+
+#define _cleanup_(f) __attribute__((cleanup(f)))
+
+/* Cleanup handler for use with _cleanup_(closep): closes the referenced file descriptor if it is valid
+ * and resets it to -EBADF afterwards. A NULL pointer or a negative fd is left untouched. */
+static void closep(int *fd) {
+        if (fd && *fd >= 0) {
+                close(*fd);
+                *fd = -EBADF;
+        }
+}
+
+/* Creates a memfd named 'fdname', fills it with 'content' and pushes it into the supervisor's fd store
+ * via an "FDSTORE=1\nFDNAME=<fdname>" notification.
+ *
+ * Returns 0 on success and a negative errno-style value on failure (-ENOENT if $NOTIFY_SOCKET is not
+ * set, -EIO on a short write). Our own copy of the fd is closed on return in either case. */
+static int push_one(const char *fdname, const char *content) {
+        _cleanup_(closep) int fd = -EBADF;
+        int r;
+
+        assert(fdname);
+        assert(content);
+
+        fd = memfd_create(fdname, MFD_CLOEXEC | MFD_ALLOW_SEALING);
+        if (fd < 0) {
+                /* Save errno before logging, fprintf() may clobber it */
+                r = -errno;
+                fprintf(stderr, "memfd_create(%s) failed: %m\n", fdname);
+                return r;
+        }
+
+        size_t len = strlen(content);
+        ssize_t n = write(fd, content, len);
+        if (n < 0) {
+                r = -errno;
+                fprintf(stderr, "write(%s) failed: %m\n", fdname);
+                return r;
+        }
+        if ((size_t) n != len) {
+                /* A short write does not set errno, hence report it explicitly instead of returning
+                 * whatever (possibly zero) value errno happens to hold. */
+                fprintf(stderr, "write(%s) failed: short write (%zd of %zu bytes)\n", fdname, n, len);
+                return -EIO;
+        }
+
+        char msg[256];
+        r = snprintf(msg, sizeof(msg), "FDSTORE=1\nFDNAME=%s", fdname);
+        if (r < 0 || (size_t) r >= sizeof(msg)) {
+                fprintf(stderr, "FDSTORE message for fdname=%s did not fit in buffer\n", fdname);
+                return -ENOBUFS;
+        }
+
+        r = sd_pid_notify_with_fds(0, /* unset_environment= */ 0, msg, &fd, 1);
+        if (r < 0) {
+                errno = -r;
+                fprintf(stderr, "sd_pid_notify_with_fds(%s) failed: %m\n", fdname);
+                return r;
+        }
+        if (r == 0) {
+                /* Zero means no notification was sent because there is nobody to send it to */
+                fprintf(stderr, "NOTIFY_SOCKET not set\n");
+                return -ENOENT;
+        }
+
+        return 0;
+}
+
+/* Implements the 'store' verb: pushes two memfds (named "test-fd-a" and "test-fd-b", containing DATA_A
+ * and DATA_B respectively) into the supervisor's fd store. Returns EXIT_SUCCESS or EXIT_FAILURE. */
+static int do_store(void) {
+        if (push_one("test-fd-a", DATA_A) < 0 ||
+            push_one("test-fd-b", DATA_B) < 0)
+                return EXIT_FAILURE;
+
+        /* Do not exit before our supervisor has actually processed the FDSTORE messages: once we are
+         * gone its cgroup-based pidref to unit lookup may fail. */
+        int r = sd_notify_barrier(0, 5 * 1000 * 1000);
+        if (r >= 0)
+                return EXIT_SUCCESS;
+
+        errno = -r;
+        fprintf(stderr, "sd_notify_barrier failed: %m\n");
+        return EXIT_FAILURE;
+}
+
+/* Implements the 'check' verb: reads every fd passed in via LISTEN_FDS from its start and verifies that
+ * both DATA_A and DATA_B show up among their contents. Fds with other content are reported but do not
+ * fail the check by themselves. Returns EXIT_SUCCESS or EXIT_FAILURE. */
+static int do_check(void) {
+        int n_fds = sd_listen_fds(/* unset_environment= */ 0);
+        if (n_fds < 0) {
+                errno = -n_fds;
+                fprintf(stderr, "sd_listen_fds failed: %m\n");
+                return EXIT_FAILURE;
+        }
+        if (n_fds < 2) {
+                fprintf(stderr, "Expected at least 2 fds via LISTEN_FDS, got %d\n", n_fds);
+                return EXIT_FAILURE;
+        }
+
+        bool found_a = false, found_b = false;
+
+        for (int fd = SD_LISTEN_FDS_START; fd < SD_LISTEN_FDS_START + n_fds; fd++) {
+                char content[256] = {};
+
+                /* Rewind first, the file offset might not be at the beginning of the file */
+                if (lseek(fd, 0, SEEK_SET) < 0) {
+                        fprintf(stderr, "lseek(fd=%d) failed: %m\n", fd);
+                        return EXIT_FAILURE;
+                }
+
+                /* Leave room for the terminating NUL byte */
+                ssize_t n_read = read(fd, content, sizeof(content) - 1);
+                if (n_read < 0) {
+                        fprintf(stderr, "read(fd=%d) failed: %m\n", fd);
+                        return EXIT_FAILURE;
+                }
+                content[n_read] = 0;
+
+                if (strcmp(content, DATA_A) == 0)
+                        found_a = true;
+                else if (strcmp(content, DATA_B) == 0)
+                        found_b = true;
+                else
+                        fprintf(stderr, "Unexpected fd content: '%s'\n", content);
+        }
+
+        if (!(found_a && found_b)) {
+                fprintf(stderr, "Missing expected fds: seen_a=%d seen_b=%d\n", found_a, found_b);
+                return EXIT_FAILURE;
+        }
+
+        printf("Payload received both preserved fds with matching content.\n");
+        return EXIT_SUCCESS;
+}
+
+/* Entry point: dispatches on the verb given as first argument ("store" or "check"). If the verb fails
+ * its exit status is returned; if it succeeds the process replaces itself with "sleep infinity". */
+int main(int argc, char *argv[]) {
+        if (argc < 2) {
+                fprintf(stderr, "Usage: %s store|check\n", argv[0]);
+                return EXIT_FAILURE;
+        }
+
+        const char *verb = argv[1];
+        int r;
+
+        if (strcmp(verb, "store") == 0)
+                r = do_store();
+        else if (strcmp(verb, "check") == 0)
+                r = do_check();
+        else {
+                fprintf(stderr, "Unknown verb: %s\n", verb);
+                return EXIT_FAILURE;
+        }
+
+        if (r == EXIT_SUCCESS) {
+                /* On success, become sleep so if we are a container payload it can stay alive. */
+                execlp("sleep", "sleep", "infinity", (char *) NULL);
+
+                /* Only reached if the exec itself failed */
+                fprintf(stderr, "execlp(sleep) failed: %m\n");
+                return EXIT_FAILURE;
+        }
+
+        return r;
+}
index e0516449c70b00ce4c1e55ba663c0c2cbf882931..19b1d445c89d97dc27080dd52bca88ec6249bbd3 100755 (executable)
@@ -22,14 +22,17 @@ at_exit() {
     rm -rf /home/testuser/.local/state/machines/inodetest ||:
     rm -rf /home/testuser/.local/state/machines/inodetest2 ||:
     rm -rf /home/testuser/.local/state/machines/mangletest ||:
+    rm -rf /home/testuser/.local/state/machines/fdstore ||:
     machinectl terminate zurps ||:
     machinectl terminate exfiltrate ||:
     systemctl --user --machine testuser@ stop exfiltrate.service ||:
+    systemctl --user --machine testuser@ stop systemd-nspawn@fdstore.service ||:
     rm -f /etc/polkit-1/rules.d/registermachinetest.rules
     machinectl terminate nurps ||:
     machinectl terminate kurps ||:
     machinectl terminate wumms ||:
     machinectl terminate wamms ||:
+    machinectl terminate fdstore ||:
     rm -f /usr/share/polkit-1/rules.d/registermachinetest.rules
     rm -rf /var/tmp/mangletest
     rm -f /var/tmp/mangletest.tar.gz
@@ -307,4 +310,85 @@ tar -C /var/tmp/mangletest/ -cvzf /var/tmp/mangletest.tar.gz mangletest-0.1
 run0 --pipe -u testuser importctl -m --user import-tar /var/tmp/mangletest.tar.gz
 cmp /var/tmp/mangletest/mangletest-0.1/usr/lib/os-release /home/testuser/.local/state/machines/mangletest/usr/lib/os-release
 
+# Verify the fd-store preservation chain works end-to-end across:
+#   payload (inside container) -> systemd-nspawn (user manager) -> user manager
+#   -> system PID 1 (user@<UID>.service fd store)
+# Then restart the nspawn service and verify the inner payload actually
+# receives the preserved fds back via LISTEN_FDS, with their original content.
+create_dummy_container /home/testuser/.local/state/machines/fdstore
+# The container init execs the helper directly so the FDSTORE notification is
+# sent from PID 1 (nspawn rejects notify messages from anyone but the inner
+# payload's init). The helper itself execs sleep on success to keep the
+# container alive, and on failure it exits non-zero making the systemd-nspawn
+# service fail.
+# The init script picks the helper's verb based on $LISTEN_FDS: 'store' on the
+# first boot (no fds passed in), 'check' once fds are handed back to it.
+cat >/home/testuser/.local/state/machines/fdstore/sbin/init <<'EOF'
+#!/usr/bin/env bash
+set -e
+if [[ "${LISTEN_FDS:-0}" -gt 0 ]]; then
+    exec /usr/bin/test-fdstore check
+else
+    exec /usr/bin/test-fdstore store
+fi
+EOF
+chmod +x /home/testuser/.local/state/machines/fdstore/sbin/init
+systemd-dissect --shift /home/testuser/.local/state/machines/fdstore foreign
+
+# Per-machine settings file for the 'fdstore' container
+run0 -u testuser mkdir -p .config/systemd/nspawn/
+run0 -u testuser -i "cat >.config/systemd/nspawn/fdstore.nspawn <<EOF
+[Exec]
+KillSignal=SIGKILL
+EOF"
+
+# Enable a preserved fd store on the user-side nspawn unit, so that fds
+# forwarded by nspawn have somewhere to be stored
+run0 -u testuser mkdir -p ".config/systemd/user/systemd-nspawn@fdstore.service.d/"
+run0 -u testuser -i "cat >.config/systemd/user/systemd-nspawn@fdstore.service.d/fdstore.conf <<EOF
+[Service]
+FileDescriptorStoreMax=8
+FileDescriptorStorePreserve=yes
+EOF"
+run0 -u testuser systemctl --user daemon-reload
+
+# Start the container and wait (for at most 30s) until at least two fds showed
+# up in the fd store of the user-side nspawn unit
+run0 -u testuser systemctl start --user systemd-nspawn@fdstore.service
+timeout 30s bash -c \
+    "until [[ \"\$(run0 -u testuser systemctl --user show -P NFileDescriptorStore systemd-nspawn@fdstore.service)\" -ge 2 ]]; do sleep 0.5; done"
+
+# 1) Payload -> nspawn (user-side systemd-nspawn@fdstore.service fd store)
+n_nspawn_fds=$(run0 -u testuser systemctl --user show -P NFileDescriptorStore systemd-nspawn@fdstore.service)
+test "${n_nspawn_fds}" -ge 2
+
+# 2) nspawn -> user manager -> system PID 1 (user@<UID>.service fd store)
+TESTUSER_UID=$(id -u testuser)
+timeout 30s bash -c \
+    "until [[ \"\$(systemctl show -P NFileDescriptorStore user@${TESTUSER_UID}.service)\" -ge 2 ]]; do sleep 0.5; done"
+n_user_at_fds=$(systemctl show -P NFileDescriptorStore "user@${TESTUSER_UID}.service")
+test "${n_user_at_fds}" -ge 2
+
+# 3) Stop the nspawn service: payload is gone but FileDescriptorStorePreserve=yes
+# must keep the fds in the user-side fdstore (and propagated copy in PID 1).
+run0 -u testuser systemctl --user stop systemd-nspawn@fdstore.service
+n_nspawn_fds=$(run0 -u testuser systemctl --user show -P NFileDescriptorStore systemd-nspawn@fdstore.service)
+test "${n_nspawn_fds}" -ge 2
+
+# 4) Restart the service: nspawn must receive the preserved fds via LISTEN_FDS
+# and forward them into the inner payload, which verifies the content matches.
+# (With LISTEN_FDS set the init script runs 'test-fdstore check', which exits
+# non-zero if the expected content is not found.)
+run0 -u testuser systemctl start --user systemd-nspawn@fdstore.service
+run0 -u testuser systemctl is-active --user systemd-nspawn@fdstore.service
+
+# 5) Stop the nspawn service and the user session: at least two fds must
+# remain both in the user-side fd store of the nspawn unit and in the fd store
+# of user@<UID>.service after each of them was stopped.
+run0 -u testuser systemctl --user stop systemd-nspawn@fdstore.service
+n_nspawn_fds=$(run0 -u testuser systemctl --user show -P NFileDescriptorStore systemd-nspawn@fdstore.service)
+test "${n_nspawn_fds}" -ge 2
+systemctl stop "user@${TESTUSER_UID}.service"
+n_user_at_fds=$(systemctl show -P NFileDescriptorStore "user@${TESTUSER_UID}.service")
+test "${n_user_at_fds}" -ge 2
+
+# 6) Restart the user session and container payload: wait (for at most 30s)
+# for the user manager to be active again, then the nspawn service must start
+# and be active.
+systemctl start "user@${TESTUSER_UID}.service"
+timeout 30s bash -c \
+    "until systemctl is-active 'user@${TESTUSER_UID}.service' >/dev/null; do sleep 0.5; done"
+run0 -u testuser systemctl --user start systemd-nspawn@fdstore.service
+run0 -u testuser systemctl is-active --user systemd-nspawn@fdstore.service
+
+# Final cleanup: stop the service, and terminate the machine in case it is
+# still registered (failure to do so is ignored)
+run0 -u testuser systemctl --user stop systemd-nspawn@fdstore.service
+machinectl terminate fdstore 2>/dev/null || true
+
+
 loginctl disable-linger testuser