<citerefentry><refentrytitle>systemd-vmspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>
that defaults to true.)</para>
+ <para>If <command>systemd-nspawn</command> itself is invoked with a <varname>$NOTIFY_SOCKET</varname>
+ set in its environment (i.e. it is itself supervised by a service manager that uses the
+ <citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>
+ protocol), <literal>FDSTORE=1</literal> and <literal>FDSTOREREMOVE=1</literal> messages received
+ from the container payload (along with any accompanying file descriptors and
+ <literal>FDNAME=</literal> tag) are forwarded one level up to the enveloping service manager, with
+ the file descriptor name prefixed with <literal>payload-</literal>. This
+ allows the file descriptor store of services running inside the container to be preserved across
+ container restarts (and, transitively, across restarts, re-execs, soft-reboots and LUO-based kexecs
+ of any outer service manager), provided
+ <varname>FileDescriptorStoreMax=</varname>/<varname>FileDescriptorStorePreserve=yes</varname> are
+ configured on the unit running <command>systemd-nspawn</command>. See the
+ <ulink url="https://systemd.io/FILE_DESCRIPTOR_STORE">File Descriptor Store</ulink> documentation
+ for details.</para>
+
<xi:include href="version-info.xml" xpointer="v231"/></listitem>
</varlistentry>
--- /dev/null
+[Service]
+FileDescriptorStoreMax=16
+FileDescriptorStorePreserve=yes
--- /dev/null
+# For tests exercising the fd store we need the unit in the rootfs to have these
+# settings, or the fdstore content will be dropped in the initrd -> rootfs transition
+[Service]
+FileDescriptorStoreMax=20
+FileDescriptorStorePreserve=yes
if [ ! -e "$BUILDROOT/etc/os-release" ]; then
ln -s ../usr/lib/os-release "$BUILDROOT/etc/os-release"
fi
+
+# For use in the minimal containers, only needs libsystemd and libc
+if [[ -x "$BUILDDIR/test-fdstore" ]]; then
+ cp "$BUILDDIR/test-fdstore" "$BUILDROOT/usr/bin/test-fdstore"
+fi
--- /dev/null
+[Service]
+FileDescriptorStoreMax=16
+FileDescriptorStorePreserve=yes
--- /dev/null
+# For tests exercising the fd store we need the unit in the initrd to have these
+# settings, or the fdstore content will be dropped in the initrd -> rootfs transition
+[Service]
+FileDescriptorStoreMax=20
+FileDescriptorStorePreserve=yes
if (r < 0)
log_debug_errno(r, "Failed to enable SO_PASSPIDFD, ignoring: %m");
- r = setsockopt_int(fd, SOL_SOCKET, SO_PASSRIGHTS, false);
- if (r < 0)
- log_debug_errno(r, "Failed to turn off SO_PASSRIGHTS, ignoring: %m");
+ /* Only allow the container payload to pass file descriptors to us if we ourselves are
+ * supervised by a service manager that enabled the FD store. */
+ if (!fdstore_detected()) {
+ r = setsockopt_int(fd, SOL_SOCKET, SO_PASSRIGHTS, false);
+ if (r < 0)
+ log_debug_errno(r, "Failed to turn off SO_PASSRIGHTS, ignoring: %m");
+ }
return TAKE_FD(fd);
}
return 0;
}
+/* Forwards fd store notifications received from the container payload to our own service manager.
+ *
+ * tags: the notification message from the payload, split into individual assignments.
+ * fds:  the file descriptors that came attached to the message (may be empty).
+ *
+ * Returns 0 if the message was forwarded or there was nothing to do, a negative errno-style value
+ * if forwarding failed. All failures are logged here already. */
+static int forward_fd_store(char **tags, FDSet *fds) {
+        int r;
+
+        /* Forward fd-store related messages to our own service manager, so that file descriptors stored
+         * by the inner payload propagate up the chain and are preserved across restarts. Skip entirely
+         * if we have no upstream supervisor (no NOTIFY_SOCKET) or no fd store was detected.
+         *
+         * Forwarded entries are namespaced with a "payload-" prefix on their FDNAME so that they
+         * cannot collide with fd-store entries that nspawn itself might want to push to its own
+         * upstream supervisor (the container payload and nspawn share a single upstream fdstore
+         * namespace, since there's only one init system per container). */
+        if (!getenv("NOTIFY_SOCKET") || !fdstore_detected())
+                return 0;
+
+        /* A message carrying both tags is treated as a removal only. */
+        if (strv_contains(tags, "FDSTOREREMOVE=1")) {
+                _cleanup_free_ char *upstream_name = NULL;
+
+                const char *fdname = strv_find_startswith(tags, "FDNAME=");
+                if (!fdname)
+                        return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                 "Got FDSTOREREMOVE=1 from container payload without FDNAME=, ignoring.");
+                if (!fdname_is_valid(fdname))
+                        return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                 "Got FDSTOREREMOVE=1 from container payload with invalid FDNAME='%s', ignoring.",
+                                                 fdname);
+
+                /* The prefix makes the name longer, so a name that is fine on its own may no longer
+                 * pass validation once prefixed. Never send a name upstream that we would reject
+                 * ourselves. */
+                if (asprintf(&upstream_name, "payload-%s", fdname) < 0)
+                        return log_oom();
+                if (!fdname_is_valid(upstream_name))
+                        return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                 "Got FDSTOREREMOVE=1 from container payload with FDNAME='%s' that is not valid once prefixed, ignoring.",
+                                                 fdname);
+
+                r = sd_notifyf(/* unset_environment= */ false,
+                               "FDSTOREREMOVE=1\nFDNAME=%s", upstream_name);
+                if (r < 0)
+                        return log_warning_errno(r, "Failed to forward FDSTOREREMOVE upstream, ignoring: %m");
+
+        } else if (strv_contains(tags, "FDSTORE=1")) {
+                if (fdset_isempty(fds)) {
+                        log_debug("Got FDSTORE=1 from container payload without any attached file descriptors, ignoring.");
+                        return 0;
+                }
+
+                _cleanup_free_ int *fds_array = NULL;
+                int n;
+
+                n = fdset_to_array(fds, &fds_array);
+                if (n < 0)
+                        return log_warning_errno(n, "Failed to convert fdset to array, ignoring FDSTORE forward: %m");
+
+                const char *fdname = strv_find_startswith(tags, "FDNAME=");
+                bool fdpoll_off = strv_contains(tags, "FDPOLL=0");
+                _cleanup_free_ char *upstream_name = NULL, *msg = NULL;
+
+                /* Used whenever the payload supplied no usable name. */
+                const char *forward_name = "payload-stored";
+
+                if (fdname && !fdname_is_valid(fdname)) {
+                        log_warning("Got FDSTORE=1 from container payload with invalid FDNAME='%s', ignoring name.", fdname);
+                        fdname = NULL;
+                }
+
+                if (fdname) {
+                        if (asprintf(&upstream_name, "payload-%s", fdname) < 0)
+                                return log_oom();
+
+                        /* Same reasoning as for removal: validate the name we actually send. */
+                        if (fdname_is_valid(upstream_name))
+                                forward_name = upstream_name;
+                        else
+                                log_warning("Got FDSTORE=1 from container payload with FDNAME='%s' that is not valid once prefixed, ignoring name.", fdname);
+                }
+
+                if (asprintf(&msg, "FDSTORE=1\nFDNAME=%s%s",
+                             forward_name,
+                             fdpoll_off ? "\nFDPOLL=0" : "") < 0)
+                        return log_oom();
+
+                r = sd_pid_notify_with_fds(
+                                0,
+                                /* unset_environment= */ false,
+                                msg,
+                                fds_array,
+                                (unsigned) n);
+                if (r < 0)
+                        return log_warning_errno(r, "Failed to forward FDSTORE upstream, ignoring: %m");
+        }
+
+        return 0;
+}
+
+
static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
PidRef *inner_child_pid = ASSERT_PTR(userdata);
int r;
_cleanup_(pidref_done) PidRef sender_pid = PIDREF_NULL;
_cleanup_strv_free_ char **tags = NULL;
- r = notify_recv_strv(fd, &tags, /* ret_ucred= */ NULL, &sender_pid);
+ _cleanup_(fdset_freep) FDSet *fds = NULL;
+ r = notify_recv_with_fds_strv(fd, &tags, /* ret_ucred= */ NULL, &sender_pid, &fds);
if (r == -EAGAIN)
return 0;
if (r < 0)
(void) sd_notifyf(/* unset_environment= */ false, "STATUS=Container running.");
}
+ (void) forward_fd_store(tags, fds);
+
return 0;
}
'sources' : files('test-fd-util.c'),
'dependencies' : libseccomp_cflags,
},
+ test_template + {
+ 'sources' : files('test-fdstore.c'),
+ 'link_with' : libsystemd,
+ 'type' : 'manual',
+ },
test_template + {
'sources' : files(
'test-hashmap.c',
--- /dev/null
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* In 'store' mode pushes a couple of memfds with known content into the supervisor's fd store via FDSTORE=1
+ * sd_notify() messages. In 'check' mode reads back the fds passed via LISTEN_FDS and verifies the content
+ * matches what was pushed.
+ *
+ * This binary is intentionally linked against libsystemd only so that it can go in the minimal image. */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "sd-daemon.h"
+
+#define DATA_A "fdstore-data-a"
+#define DATA_B "fdstore-data-b"
+
+#define _cleanup_(f) __attribute__((cleanup(f)))
+
+/* Cleanup handler for use with _cleanup_(): closes the descriptor that fd points to, if there is
+ * one, and marks the variable as invalid afterwards. A NULL pointer or a negative descriptor is a
+ * no-op. */
+static void closep(int *fd) {
+        if (fd && *fd >= 0) {
+                close(*fd);
+                *fd = -EBADF;
+        }
+}
+
+/* Creates a memfd named after fdname, fills it with content and pushes it to the supervisor via an
+ * FDSTORE=1 notification carrying FDNAME=<fdname>.
+ *
+ * Returns 0 on success and a negative errno-style value on failure. A diagnostic is printed to
+ * stderr for every failure. */
+static int push_one(const char *fdname, const char *content) {
+        _cleanup_(closep) int fd = -EBADF;
+        int r;
+
+        assert(fdname);
+        assert(content);
+
+        fd = memfd_create(fdname, MFD_CLOEXEC | MFD_ALLOW_SEALING);
+        if (fd < 0) {
+                /* Pick up errno before printing, fprintf() is free to clobber it. */
+                r = -errno;
+                fprintf(stderr, "memfd_create(%s) failed: %m\n", fdname);
+                return r;
+        }
+
+        size_t len = strlen(content);
+        ssize_t written = write(fd, content, len);
+        if (written < 0 || (size_t) written != len) {
+                /* A short write does not set errno; without an explicit error here we could end
+                 * up returning 0 and report success. */
+                if (written >= 0)
+                        errno = EIO;
+                r = -errno;
+                fprintf(stderr, "write(%s) failed: %m\n", fdname);
+                return r;
+        }
+
+        char msg[256];
+        r = snprintf(msg, sizeof(msg), "FDSTORE=1\nFDNAME=%s", fdname);
+        if (r < 0 || (size_t) r >= sizeof(msg)) {
+                /* Truncation is not an error as far as snprintf() is concerned, so errno is only
+                 * meaningful if the call itself failed. */
+                r = (r < 0 && errno > 0) ? -errno : -ENOBUFS;
+                fprintf(stderr, "FDSTORE message for fdname=%s did not fit in buffer\n", fdname);
+                return r;
+        }
+
+        r = sd_pid_notify_with_fds(0, /* unset_environment= */ 0, msg, &fd, 1);
+        if (r < 0) {
+                errno = -r;
+                fprintf(stderr, "sd_pid_notify_with_fds(%s) failed: %m\n", fdname);
+                return r;
+        }
+        if (r == 0) {
+                /* Nobody to notify, hence nothing was stored. */
+                fprintf(stderr, "NOTIFY_SOCKET not set\n");
+                return -ENOENT;
+        }
+
+        return 0;
+}
+
+/* Implements the 'store' verb: pushes both test fds to the supervisor and waits until it has
+ * processed them. Returns EXIT_SUCCESS or EXIT_FAILURE. */
+static int do_store(void) {
+        static const struct {
+                const char *name;
+                const char *content;
+        } entries[] = {
+                { "test-fd-a", DATA_A },
+                { "test-fd-b", DATA_B },
+        };
+
+        /* Push in table order, giving up on the first failure. */
+        for (size_t i = 0; i < sizeof(entries) / sizeof(entries[0]); i++)
+                if (push_one(entries[i].name, entries[i].content) < 0)
+                        return EXIT_FAILURE;
+
+        /* Wait for our supervisor to actually process the FDSTORE messages before we exit, otherwise
+         * the cgroup-based pidref to unit lookup may fail once we're gone. */
+        int r = sd_notify_barrier(0, 5 * 1000 * 1000);
+        if (r >= 0)
+                return EXIT_SUCCESS;
+
+        errno = -r;
+        fprintf(stderr, "sd_notify_barrier failed: %m\n");
+        return EXIT_FAILURE;
+}
+
+/* Implements the 'check' verb: reads every fd passed in via LISTEN_FDS from its start and verifies
+ * that both expected payloads are among them. Fds with other content are reported but do not cause
+ * a failure on their own. Returns EXIT_SUCCESS or EXIT_FAILURE. */
+static int do_check(void) {
+        bool seen_a = false, seen_b = false;
+
+        int n_fds = sd_listen_fds(/* unset_environment= */ 0);
+        if (n_fds < 0) {
+                errno = -n_fds;
+                fprintf(stderr, "sd_listen_fds failed: %m\n");
+                return EXIT_FAILURE;
+        }
+        if (n_fds < 2) {
+                fprintf(stderr, "Expected at least 2 fds via LISTEN_FDS, got %d\n", n_fds);
+                return EXIT_FAILURE;
+        }
+
+        for (int fd = SD_LISTEN_FDS_START; fd < SD_LISTEN_FDS_START + n_fds; fd++) {
+                char content[256] = {};
+
+                /* Rewind before reading, the offset is wherever a previous user left it. */
+                if (lseek(fd, 0, SEEK_SET) < 0) {
+                        fprintf(stderr, "lseek(fd=%d) failed: %m\n", fd);
+                        return EXIT_FAILURE;
+                }
+
+                ssize_t n_read = read(fd, content, sizeof(content) - 1);
+                if (n_read < 0) {
+                        fprintf(stderr, "read(fd=%d) failed: %m\n", fd);
+                        return EXIT_FAILURE;
+                }
+                content[n_read] = 0;
+
+                if (!strcmp(content, DATA_A)) {
+                        seen_a = true;
+                        continue;
+                }
+                if (!strcmp(content, DATA_B)) {
+                        seen_b = true;
+                        continue;
+                }
+
+                fprintf(stderr, "Unexpected fd content: '%s'\n", content);
+        }
+
+        if (seen_a && seen_b) {
+                printf("Payload received both preserved fds with matching content.\n");
+                return EXIT_SUCCESS;
+        }
+
+        fprintf(stderr, "Missing expected fds: seen_a=%d seen_b=%d\n", seen_a, seen_b);
+        return EXIT_FAILURE;
+}
+
+/* Entry point: dispatches on the verb in argv[1] ('store' or 'check'). If the verb fails its exit
+ * status is returned right away; on success the process replaces itself with 'sleep infinity'. */
+int main(int argc, char *argv[]) {
+        if (argc < 2) {
+                fprintf(stderr, "Usage: %s store|check\n", argv[0]);
+                return EXIT_FAILURE;
+        }
+
+        const char *verb = argv[1];
+        int status;
+
+        if (!strcmp(verb, "store"))
+                status = do_store();
+        else if (!strcmp(verb, "check"))
+                status = do_check();
+        else {
+                fprintf(stderr, "Unknown verb: %s\n", verb);
+                return EXIT_FAILURE;
+        }
+
+        if (status == EXIT_SUCCESS) {
+                /* On success, become sleep so if we are a container payload it can stay alive. */
+                execlp("sleep", "sleep", "infinity", (char *) NULL);
+
+                /* Only reached if the exec itself failed. */
+                fprintf(stderr, "execlp(sleep) failed: %m\n");
+                return EXIT_FAILURE;
+        }
+
+        return status;
+}
rm -rf /home/testuser/.local/state/machines/inodetest ||:
rm -rf /home/testuser/.local/state/machines/inodetest2 ||:
rm -rf /home/testuser/.local/state/machines/mangletest ||:
+ rm -rf /home/testuser/.local/state/machines/fdstore ||:
machinectl terminate zurps ||:
machinectl terminate exfiltrate ||:
systemctl --user --machine testuser@ stop exfiltrate.service ||:
+ systemctl --user --machine testuser@ stop systemd-nspawn@fdstore.service ||:
rm -f /etc/polkit-1/rules.d/registermachinetest.rules
machinectl terminate nurps ||:
machinectl terminate kurps ||:
machinectl terminate wumms ||:
machinectl terminate wamms ||:
+ machinectl terminate fdstore ||:
rm -f /usr/share/polkit-1/rules.d/registermachinetest.rules
rm -rf /var/tmp/mangletest
rm -f /var/tmp/mangletest.tar.gz
run0 --pipe -u testuser importctl -m --user import-tar /var/tmp/mangletest.tar.gz
cmp /var/tmp/mangletest/mangletest-0.1/usr/lib/os-release /home/testuser/.local/state/machines/mangletest/usr/lib/os-release
+# Verify the fd-store preservation chain works end-to-end across:
+# payload (inside container) -> systemd-nspawn (user manager) -> user manager
+# -> system PID 1 (user@<UID>.service fd store)
+# Then restart the nspawn service and verify the inner payload actually
+# receives the preserved fds back via LISTEN_FDS, with their original content.
+create_dummy_container /home/testuser/.local/state/machines/fdstore
+# The container init execs the helper directly so the FDSTORE notification is
+# sent from PID 1 (nspawn rejects notify messages from anyone but the inner
+# payload's init). The helper itself execs sleep on success to keep the
+# container alive, and on failure it exits non-zero making the systemd-nspawn
+# service fail.
+cat >/home/testuser/.local/state/machines/fdstore/sbin/init <<'EOF'
+#!/usr/bin/env bash
+set -e
+if [[ "${LISTEN_FDS:-0}" -gt 0 ]]; then
+ exec /usr/bin/test-fdstore check
+else
+ exec /usr/bin/test-fdstore store
+fi
+EOF
+chmod +x /home/testuser/.local/state/machines/fdstore/sbin/init
+systemd-dissect --shift /home/testuser/.local/state/machines/fdstore foreign
+
+run0 -u testuser mkdir -p .config/systemd/nspawn/
+run0 -u testuser -i "cat >.config/systemd/nspawn/fdstore.nspawn <<EOF
+[Exec]
+KillSignal=SIGKILL
+EOF"
+
+run0 -u testuser mkdir -p ".config/systemd/user/systemd-nspawn@fdstore.service.d/"
+run0 -u testuser -i "cat >.config/systemd/user/systemd-nspawn@fdstore.service.d/fdstore.conf <<EOF
+[Service]
+FileDescriptorStoreMax=8
+FileDescriptorStorePreserve=yes
+EOF"
+run0 -u testuser systemctl --user daemon-reload
+
+run0 -u testuser systemctl start --user systemd-nspawn@fdstore.service
+timeout 30s bash -c \
+ "until [[ \"\$(run0 -u testuser systemctl --user show -P NFileDescriptorStore systemd-nspawn@fdstore.service)\" -ge 2 ]]; do sleep 0.5; done"
+
+# 1) Payload -> nspawn (user-side systemd-nspawn@fdstore.service fd store)
+n_nspawn_fds=$(run0 -u testuser systemctl --user show -P NFileDescriptorStore systemd-nspawn@fdstore.service)
+test "${n_nspawn_fds}" -ge 2
+
+# 2) nspawn -> user manager -> system PID 1 (user@<UID>.service fd store)
+TESTUSER_UID=$(id -u testuser)
+timeout 30s bash -c \
+ "until [[ \"\$(systemctl show -P NFileDescriptorStore user@${TESTUSER_UID}.service)\" -ge 2 ]]; do sleep 0.5; done"
+n_user_at_fds=$(systemctl show -P NFileDescriptorStore "user@${TESTUSER_UID}.service")
+test "${n_user_at_fds}" -ge 2
+
+# 3) Stop the nspawn service: payload is gone but FileDescriptorStorePreserve=yes
+# must keep the fds in the user-side fdstore (and propagated copy in PID 1).
+run0 -u testuser systemctl --user stop systemd-nspawn@fdstore.service
+n_nspawn_fds=$(run0 -u testuser systemctl --user show -P NFileDescriptorStore systemd-nspawn@fdstore.service)
+test "${n_nspawn_fds}" -ge 2
+
+# 4) Restart the service: nspawn must receive the preserved fds via LISTEN_FDS
+# and forward them into the inner payload, which verifies the content matches.
+run0 -u testuser systemctl start --user systemd-nspawn@fdstore.service
+run0 -u testuser systemctl is-active --user systemd-nspawn@fdstore.service
+
+# 5) Stop the nspawn service and the user session
+run0 -u testuser systemctl --user stop systemd-nspawn@fdstore.service
+n_nspawn_fds=$(run0 -u testuser systemctl --user show -P NFileDescriptorStore systemd-nspawn@fdstore.service)
+test "${n_nspawn_fds}" -ge 2
+systemctl stop "user@${TESTUSER_UID}.service"
+n_user_at_fds=$(systemctl show -P NFileDescriptorStore "user@${TESTUSER_UID}.service")
+test "${n_user_at_fds}" -ge 2
+
+# 6) Restart the user session and container payload
+systemctl start "user@${TESTUSER_UID}.service"
+timeout 30s bash -c \
+ "until systemctl is-active 'user@${TESTUSER_UID}.service' >/dev/null; do sleep 0.5; done"
+run0 -u testuser systemctl --user start systemd-nspawn@fdstore.service
+run0 -u testuser systemctl is-active --user systemd-nspawn@fdstore.service
+
+run0 -u testuser systemctl --user stop systemd-nspawn@fdstore.service
+machinectl terminate fdstore 2>/dev/null || true
+
loginctl disable-linger testuser