]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: propagate FDs from store from user to system manager
authorLuca Boccassi <luca.boccassi@gmail.com>
Fri, 1 May 2026 13:19:33 +0000 (14:19 +0100)
committerLuca Boccassi <luca.boccassi@gmail.com>
Fri, 15 May 2026 12:46:08 +0000 (13:46 +0100)
In order to allow FD Stores of user units to survive a user
session restart, propagate FDs received via the protocol up one
level from user to system manager via sd_notify.

Conversely, propagate them back down via LISTEN_FDS, tagging them
with the unit name so that the child manager can inject them into
the appropriate unit.

Ensure units that are dead or not loaded can get FDs added to
their stores, and that they are correctly propagated once the
unit is started or loaded. When the unit is not loaded we don't
know what the FD max limit is, so simply increase it for each FD
injected, and then when the unit is realised prune it down to
match the unit's now available config in case the limit is lower
than the number of FDs in the store.

Each FD sent up or down is assigned a monotonic index, and the manager
also sends a JSON map that associates the index with the original
unit and FDNAME:

 {
   "unit-name.service": [
     { "name": "fdname1", "index": 1 },
     { "name": "fdname2", "index": 2 }
   ],
   ...
 }

This allows the manager to assign back the FDs to the appropriate
unit using the appropriate name, given the FDNAMEs are not unique.

25 files changed:
docs/FILE_DESCRIPTOR_STORE.md
man/systemd.service.xml
src/analyze/analyze-condition.c
src/analyze/analyze-security.c
src/analyze/analyze-verify-util.c
src/core/main.c
src/core/manager-serialize.c
src/core/manager.c
src/core/manager.h
src/core/service.c
src/core/service.h
src/core/unit.h
src/shared/daemon-util.c
src/shared/daemon-util.h
src/test/test-bpf-firewall.c
src/test/test-bpf-foreign-programs.c
src/test/test-bpf-restrict-fs.c
src/test/test-cgroup-mask.c
src/test/test-engine.c
src/test/test-execute.c
src/test/test-load-fragment.c
src/test/test-path.c
src/test/test-sched-prio.c
src/test/test-socket-bind.c
src/test/test-watch-pid.c

index 231af87c912d4b03e1f074c40c07da1aa368993f..8fa2ae0127c9625d19282c8592ae497b1b003181 100644 (file)
@@ -198,6 +198,27 @@ The soft reboot cycle transition and the initrd→host transition are
 semantically very similar, hence similar rules apply, and in both cases it is
 recommended to use the fdstore if pinned resources shall be passed over.
 
+## Propagation Across Manager Boundaries
+
+When a service that has `FileDescriptorStorePreserve=yes` set is itself running
+under another service manager, for example a service of the per-user manager
+(`user@.service`), or a payload running inside a
+[`systemd-nspawn`](https://www.freedesktop.org/software/systemd/man/latest/systemd-nspawn.html)
+container, fds pushed into its fdstore are automatically forwarded one level up
+the supervisor chain via the enveloping manager's `$NOTIFY_SOCKET`. This allows
+the fdstore contents of inner services to be preserved across restarts, re-execs,
+soft-reboots, etc. of the *outer* manager, even when the inner manager (or the
+container payload) is itself restarted along the way. On the way up, each fd is
+tagged with its originating unit id and the original `FDNAME=…` value, so that
+when the fds are eventually handed back down (via the regular
+`$LISTEN_FDS`/`$LISTEN_FDNAMES` protocol), each manager along the chain can
+route them back to the correct unit's fdstore. `FDSTOREREMOVE=1` notifications
+are forwarded the same way, so that explicit removals propagate all the way up too.
+
+For this to work the enveloping unit must itself enable the fdstore (i.e. set
+`FileDescriptorStoreMax=` to a sufficiently large value and
+`FileDescriptorStorePreserve=yes`).
+
 ## Debugging
 
 The
index d4a19785230112d7c15e397701ccc30b81e8559c..b25f1a90aabe0ce903adafea2c3b5ab6d278e902 100644 (file)
@@ -1255,6 +1255,17 @@ RestartMaxDelaySec=160s</programlisting>
         is removed, the service manager exits, or the file descriptors get <constant>EPOLLHUP</constant> or
         <constant>EPOLLERR</constant>.</para>
 
+        <para>When set to <constant>yes</constant>, and the service is itself running under another service
+        manager (e.g. a service of <filename>user@.service</filename>, or a payload inside
+        <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>),
+        file descriptors pushed into the store are also forwarded one level up via the enveloping manager's
+        <varname>$NOTIFY_SOCKET</varname>, tagged with the originating unit id, so that they are preserved
+        across restarts of the inner manager and handed back to the originating unit when it is started
+        again. For this to take effect, the enveloping unit must itself enable
+        <varname>FileDescriptorStoreMax=</varname> and <varname>FileDescriptorStorePreserve=yes</varname>.
+        See the <ulink url="https://systemd.io/FILE_DESCRIPTOR_STORE">File Descriptor Store</ulink>
+        overview for details.</para>
+
         <para>Use <command>systemctl clean --what=fdstore …</command> to release the file descriptor store
         explicitly.</para>
 
index a928f84ef4e9522dbf4b5b0597e8db9f4423c532..5c5177c3bea6a0f9416815a046692cd9dc948b0b 100644 (file)
@@ -98,7 +98,7 @@ static int verify_conditions(char **lines, RuntimeScope scope, const char *unit,
                 return log_error_errno(r, "Failed to initialize manager: %m");
 
         log_debug("Starting manager...");
-        r = manager_startup(m, /* serialization= */ NULL, /* fds= */ NULL, root);
+        r = manager_startup(m, /* serialization= */ NULL, /* fds= */ NULL, /* named_listen_fds= */ NULL, root);
         if (r < 0)
                 return r;
 
index 08bff2cf9e9d2e3a1e5ad3abafb578ef1c31b6ef..fdeaf69e174465f5e8ec9fc2eb11548fda0113de 100644 (file)
@@ -2713,7 +2713,7 @@ static int offline_security_checks(
 
         log_debug("Starting manager...");
 
-        r = manager_startup(m, /* serialization= */ NULL, /* fds= */ NULL, root);
+        r = manager_startup(m, /* serialization= */ NULL, /* fds= */ NULL, /* named_listen_fds= */ NULL, root);
         if (r < 0)
                 return r;
 
index e7ffae5a287873cb72def8293f61396822a32a70..ad553078d4833878db993e88b12b6899b58481cc 100644 (file)
@@ -315,7 +315,7 @@ int verify_units(
 
         log_debug("Starting manager...");
 
-        r = manager_startup(m, /* serialization= */ NULL, /* fds= */ NULL, root);
+        r = manager_startup(m, /* serialization= */ NULL, /* fds= */ NULL, /* named_listen_fds= */ NULL, root);
         if (r < 0)
                 return r;
 
index c6d66c9ec783b624ff91e794e0b294377f1fa254..8d5ab57e3df5485e3429520c208c4ab85ba23c4e 100644 (file)
@@ -15,6 +15,7 @@
 
 #include "sd-bus.h"
 #include "sd-daemon.h"
+#include "sd-json.h"
 #include "sd-messages.h"
 
 #include "alloc-util.h"
 #include "emergency-action.h"
 #include "env-util.h"
 #include "escape.h"
+#include "extract-word.h"
 #include "fd-util.h"
 #include "fdset.h"
 #include "fileio.h"
 #include "format-table.h"
 #include "format-util.h"
 #include "glyph-util.h"
+#include "hash-funcs.h"
+#include "hashmap.h"
 #include "help-util.h"
 #include "hexdecoct.h"
 #include "hostname-setup.h"
@@ -59,6 +63,7 @@
 #include "initrd-util.h"
 #include "io-util.h"
 #include "ipe-setup.h"
+#include "json-util.h"
 #include "killall.h"
 #include "kmod-setup.h"
 #include "label-util.h"
@@ -82,6 +87,7 @@
 #include "parse-argument.h"
 #include "parse-util.h"
 #include "path-util.h"
+#include "pidfd-util.h"
 #include "proc-cmdline.h"
 #include "process-util.h"
 #include "random-util.h"
@@ -91,6 +97,7 @@
 #include "selinux-setup.h"
 #include "selinux-util.h"
 #include "serialize.h"
+#include "service.h"
 #include "set.h"
 #include "signal-util.h"
 #include "smack-setup.h"
@@ -3069,10 +3076,254 @@ static int initialize_security(
         return 0;
 }
 
-static int collect_fds(FDSet **ret_fds, const char **ret_error_message) {
+static int parse_listen_fds_env(unsigned *ret_n_fds, char ***ret_names) {
+        _cleanup_strv_free_ char **names = NULL;
+        const char *e;
+        unsigned n_fds;
+        int r;
+
+        assert(ret_n_fds);
+        assert(ret_names);
+
+        /* Parse and validate the LISTEN_PID=/LISTEN_PIDFDID=/LISTEN_FDS=/LISTEN_FDNAMES= environment
+         * variables. */
+
+        e = secure_getenv("LISTEN_PID");
+        if (!e)
+                return -ENXIO;
+
+        pid_t pid;
+        r = parse_pid(e, &pid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to parse LISTEN_PID=%s: %m", e);
+        if (pid != getpid_cached())
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "LISTEN_PID=%s does not match our own PID " PID_FMT ", ignoring.",
+                                       e,
+                                       getpid_cached());
+
+        e = secure_getenv("LISTEN_PIDFDID");
+        if (e) {
+                uint64_t own_pidfdid, pidfdid;
+
+                r = safe_atou64(e, &pidfdid);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to parse LISTEN_PIDFDID=%s: %m", e);
+
+                if (pidfd_get_inode_id_self_cached(&own_pidfdid) >= 0 && pidfdid != own_pidfdid)
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "LISTEN_PIDFDID=%s does not match our own pidfdid %" PRIu64 ", ignoring.",
+                                               e,
+                                               own_pidfdid);
+        }
+
+        e = secure_getenv("LISTEN_FDS");
+        if (!e)
+                return -ENXIO;
+
+        r = safe_atou(e, &n_fds);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to parse LISTEN_FDS= value '%s': %m", e);
+        if (n_fds == 0)
+                return -ENXIO;
+        if (n_fds > INT_MAX - SD_LISTEN_FDS_START)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid number of fds in LISTEN_FDS= value '%s'", e);
+
+        e = secure_getenv("LISTEN_FDNAMES");
+        if (!e)
+                return -ENXIO;
+
+        r = strv_split_full(&names, e, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to parse LISTEN_FDNAMES=%s: %m", e);
+        if (strv_length(names) != (size_t) n_fds)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Mismatch between number of LISTEN_FDS= and LISTEN_FDNAMES= entries: %u vs %zu",
+                                       n_fds, strv_length(names));
+
+        *ret_n_fds = n_fds;
+        *ret_names = TAKE_PTR(names);
+        return 0;
+}
+
+DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
+                index_to_tag_hash_ops,
+                uint64_t, uint64_hash_func, uint64_compare_func,
+                ListenFDsTag, listen_fds_tag_free);
+
+static int parse_listen_fds_mapping(int mapping_fd, Hashmap **ret_index_to_tag) {
+        _cleanup_(sd_json_variant_unrefp) sd_json_variant *root = NULL;
+        _cleanup_hashmap_free_ Hashmap *index_to_tag = NULL;
+        const char *unit_id;
+        sd_json_variant *fds_json;
+        int r;
+
+        assert(mapping_fd >= 0);
+        assert(ret_index_to_tag);
+
+        /* Parse the JSON mapping memfd that the downstream manager pushed alongside the indexed fds:
+         *   { "unit-name.service": [ { "name": "fdname1", "index": 1 }, ... ], ... }
+         * Returns a hashmap keyed by the numeric (uint64_t) index, with ListenFDsTag* values
+         * carrying the resolved (unit_id, original fdname, upstream index). */
+
+        _cleanup_fclose_ FILE *f = NULL;
+        r = fdopen_independent(mapping_fd, "r", &f);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to open fdstore-mapping memfd: %m");
+
+        r = sd_json_parse_file(f, "fdstore-mapping", SD_JSON_PARSE_MUST_BE_OBJECT, &root,
+                               /* reterr_line= */ NULL, /* reterr_column= */ NULL);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to parse fdstore-mapping JSON: %m");
+
+        JSON_VARIANT_OBJECT_FOREACH(unit_id, fds_json, root) {
+                sd_json_variant *entry;
+
+                if (!unit_name_is_valid(unit_id, UNIT_NAME_ANY)) {
+                        log_warning("fdstore-mapping has invalid unit name '%s', skipping.", unit_id);
+                        continue;
+                }
+
+                JSON_VARIANT_ARRAY_FOREACH(entry, fds_json) {
+                        struct {
+                                const char *name;
+                                uint64_t index;
+                        } p = { };
+
+                        static const sd_json_dispatch_field dispatch_table[] = {
+                                { "name",  SD_JSON_VARIANT_STRING,        sd_json_dispatch_const_string, voffsetof(p, name),  SD_JSON_MANDATORY },
+                                { "index", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint64,       voffsetof(p, index), SD_JSON_MANDATORY },
+                                {}
+                        };
+
+                        r = sd_json_dispatch(entry, dispatch_table, SD_JSON_ALLOW_EXTENSIONS|SD_JSON_LOG|SD_JSON_WARNING, &p);
+                        if (r < 0)
+                                continue;
+
+                        if (p.index == 0) {
+                                log_warning("fdstore-mapping entry for unit '%s' name '%s' has zero index, skipping.", unit_id, p.name);
+                                continue;
+                        }
+
+                        _cleanup_(listen_fds_tag_freep) ListenFDsTag *t = new(ListenFDsTag, 1);
+                        if (!t)
+                                return log_oom();
+
+                        *t = (ListenFDsTag) {
+                                .index = p.index,
+                        };
+
+                        t->unit_id = strdup(unit_id);
+                        t->fdname = strdup(p.name);
+                        if (!t->unit_id || !t->fdname)
+                                return log_oom();
+
+                        /* Key points into the value struct, so freeing the value frees the key. */
+                        r = hashmap_ensure_put(&index_to_tag, &index_to_tag_hash_ops, &t->index, t);
+                        if (r < 0)
+                                return log_warning_errno(r, "Failed to insert fdstore-mapping entry into hashmap: %m");
+                        if (r > 0)
+                                TAKE_PTR(t);
+                }
+        }
+
+        *ret_index_to_tag = TAKE_PTR(index_to_tag);
+        return 0;
+}
+
+static int collect_listen_fds_named(FDSet *fds, Hashmap **ret_named_fds) {
+        _cleanup_hashmap_free_ Hashmap *named_fds = NULL, *index_to_tag = NULL;
+        _cleanup_strv_free_ char **names = NULL;
+        unsigned n_fds;
+        int r;
+
+        assert(fds);
+        assert(ret_named_fds);
+
+        /* Pull entries from the LISTEN_FDS=/LISTEN_FDNAMES= protocol out of 'fds' into a hashmap
+         * keyed by fd. Two flavours of named entries are recognized:
+         *
+         *   - A single mapping memfd whose fdname matches SERVICE_FDSTORE_MAPPING_FDNAME, which
+         *     contains a JSON map pairing numeric indices to (unit-id, original-fdname).
+         *   - SERVICE_FDSTORE_SUB_FDNAME_PREFIX + numeric index (matching a mapping entry) for actual fds.
+         *
+         * The hashmap owns the fds (closed via destructor on cleanup) so any entries the dispatcher
+         * does not consume are correctly cleaned up. */
+
+        r = parse_listen_fds_env(&n_fds, &names);
+        if (r < 0) {
+                /* Fail gracefully here, just warn and ignore but otherwise proceed on parsing failure */
+                if (r != -ENXIO)
+                        log_warning_errno(r, "Failed to parse LISTEN_FDS environment, ignoring: %m");
+                *ret_named_fds = NULL;
+                return 0;
+        }
+
+        /* First pass: locate and parse the mapping memfd, if any. */
+        for (unsigned i = 0; i < n_fds; i++) {
+                int fd = SD_LISTEN_FDS_START + i;
+
+                if (!streq(names[i], SERVICE_FDSTORE_MAPPING_FDNAME))
+                        continue;
+
+                if (!fdset_contains(fds, fd))
+                        continue;
+
+                (void) parse_listen_fds_mapping(fd, &index_to_tag);
+
+                /* The mapping memfd itself is not routed to any unit; close it and remove from fds
+                 * so it doesn't get redistributed */
+                assert_se(fdset_remove(fds, fd) == fd);
+                safe_close(fd);
+                break;
+        }
+
+        /* Second pass: route fds whose name matches an entry in the mapping. */
+        for (unsigned i = 0; i < n_fds; i++) {
+                int fd = SD_LISTEN_FDS_START + i;
+                const char *name = names[i], *suffix;
+                ListenFDsTag *t;
+                uint64_t idx;
+
+                if (!fdset_contains(fds, fd))
+                        continue;
+
+                if (!index_to_tag)
+                        continue;
+
+                suffix = startswith(name, SERVICE_FDSTORE_SUB_FDNAME_PREFIX);
+                if (!suffix || safe_atou64(suffix, &idx) < 0)
+                        continue;
+
+                /* Steal the matching mapping entry — we transfer ownership of the parsed
+                 * (unit_id, fdname, index) struct into the per-fd hashmap that the manager
+                 * will consume. */
+                t = hashmap_remove(index_to_tag, &idx);
+                if (!t)
+                        continue;
+
+                _cleanup_(listen_fds_tag_freep) ListenFDsTag *t_owned = t;
+
+                r = hashmap_ensure_put(&named_fds, &fd_to_listen_fds_tag_hash_ops, FD_TO_PTR(fd), t_owned);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to insert named fd into hashmap: %m");
+                if (r == 0)
+                        continue; /* fd already inserted, cannot really happen */
+
+                TAKE_PTR(t_owned);
+
+                assert_se(fdset_remove(fds, fd) == fd);
+        }
+
+        *ret_named_fds = TAKE_PTR(named_fds);
+        return 1;
+}
+
+static int collect_fds(FDSet **ret_fds, Hashmap **ret_named_fds, const char **ret_error_message) {
         int r;
 
         assert(ret_fds);
+        assert(ret_named_fds);
         assert(ret_error_message);
 
         /* Pick up all fds passed to us. We apply a filter here: we only take the fds that have O_CLOEXEC
@@ -3094,6 +3345,8 @@ static int collect_fds(FDSet **ret_fds, const char **ret_error_message) {
         /* The serialization fd should have O_CLOEXEC turned on already, let's verify that we didn't pick it up here */
         assert_se(!arg_serialization || !fdset_contains(*ret_fds, fileno(arg_serialization)));
 
+        (void) collect_listen_fds_named(*ret_fds, ret_named_fds);
+
         return 0;
 }
 
@@ -3151,6 +3404,7 @@ int main(int argc, char *argv[]) {
                                                                           * for the two that indicate whether
                                                                           * these fields are initialized! */
         bool skip_setup, loaded_policy = false, queue_default_job = false, first_boot = false;
+        _cleanup_hashmap_free_ Hashmap *named_listen_fds = NULL;
         char *switch_root_dir = NULL, *switch_root_init = NULL;
         usec_t before_startup, after_startup;
         static char systemd[] = "systemd";
@@ -3396,7 +3650,7 @@ int main(int argc, char *argv[]) {
                 log_close();
 
                 /* Remember open file descriptors for later deserialization */
-                r = collect_fds(&fds, &error_message);
+                r = collect_fds(&fds, &named_listen_fds, &error_message);
                 if (r < 0)
                         goto finish;
 
@@ -3471,7 +3725,7 @@ int main(int argc, char *argv[]) {
 
         before_startup = now(CLOCK_MONOTONIC);
 
-        r = manager_startup(m, arg_serialization, fds, /* root= */ NULL);
+        r = manager_startup(m, arg_serialization, fds, named_listen_fds, /* root= */ NULL);
         if (r < 0) {
                 error_message = "Failed to start up manager";
                 goto finish;
index 6bc41f15f93bfeab295b8a7aba1e671678d00a3f..bef4021771e3feac8c7bdf03d3097c4a05a1be01 100644 (file)
@@ -124,6 +124,7 @@ int manager_serialize(
 
         (void) serialize_item(f, "previous-objective", manager_objective_to_string(m->objective));
         (void) serialize_item_format(f, "soft-reboots-count", "%u", m->soft_reboots_count);
+        (void) serialize_item_format(f, "fd-store-upstream-next-index", "%" PRIu64, m->fd_store_upstream_next_index);
 
         for (ManagerTimestamp q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) {
                 _cleanup_free_ char *joined = NULL;
@@ -757,6 +758,9 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
                                 log_notice("Failed to parse soft reboots counter '%s', ignoring.", val);
                         else
                                 m->soft_reboots_count = n;
+                } else if ((val = startswith(l, "fd-store-upstream-next-index="))) {
+                        if (safe_atou64(val, &m->fd_store_upstream_next_index) < 0)
+                                log_notice("Failed to parse fd-store-upstream-next-index '%s', ignoring.", val);
                 } else if ((val = startswith(l, "previous-objective="))) {
                         ManagerObjective objective;
 
index 038690a808a00f71fa796aba61c2ddc89a5551cd..1bd35a7d21b53e8289275d8bb9d3092ae06339ba 100644 (file)
@@ -76,6 +76,7 @@
 #include "rm-rf.h"
 #include "selinux-util.h"
 #include "serialize.h"
+#include "service.h"
 #include "set.h"
 #include "signal-util.h"
 #include "socket-util.h"
@@ -1878,6 +1879,93 @@ static void manager_catchup(Manager *m) {
         }
 }
 
+ListenFDsTag* listen_fds_tag_free(ListenFDsTag *t) {
+        if (!t)
+                return NULL;
+
+        free(t->unit_id);
+        free(t->fdname);
+        return mfree(t);
+}
+
+DEFINE_HASH_OPS_FULL(
+                fd_to_listen_fds_tag_hash_ops,
+                void, trivial_hash_func, trivial_compare_func, close_fd_ptr,
+                ListenFDsTag, listen_fds_tag_free);
+
+int manager_dispatch_external_fd_to_unit(
+                Manager *m,
+                const char *unit_id,
+                const char *fdname,
+                uint64_t index,
+                int fd_in,
+                const char *log_context) {
+
+        _cleanup_close_ int fd = ASSERT_FD(fd_in);
+        Unit *u = NULL;
+        int r;
+
+        assert(m);
+        assert(unit_id);
+        assert(fdname);
+        assert(log_context);
+
+        /* Load the unit eagerly: if the unit file exists this brings it into UNIT_LOADED, otherwise it
+         * lands in UNIT_NOT_FOUND. In both cases we want to attach the fd so it's preserved until the
+         * unit is fully stopped (or its file appears via daemon-reload). */
+        r = manager_load_unit(m, unit_id, /* path= */ NULL, /* e= */ NULL, &u);
+        if (r < 0)
+                return log_warning_errno(r, "%s: failed to load unit '%s', closing fd '%s': %m",
+                                         log_context, unit_id, fdname);
+
+        if (!UNIT_VTABLE(u)->attach_external_fd_to_fdstore)
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+                                         "%s: unit '%s' does not support fd restoration, closing fd '%s'.",
+                                         log_context, unit_id, fdname);
+
+        r = UNIT_VTABLE(u)->attach_external_fd_to_fdstore(u, TAKE_FD(fd), fdname, index);
+        if (r < 0)
+                return log_unit_warning_errno(u, r, "%s: failed to attach fd '%s' to fd store: %m",
+                                              log_context, fdname);
+
+        return 1; /* fd consumed */
+}
+
+static int manager_distribute_listen_fds_named(Manager *m, Hashmap *named_listen_fds) {
+        assert(m);
+
+        /* Route fds whose LISTEN_FDNAMES name was a numeric index into the matching unit's fd store.
+         * The hashmap is built and owned by main.c's collect_fds(), keyed by fd, with ListenFDsTag* values
+         * that already carry the parsed unit-id, original fdname and index (resolved against the
+         * upstream-pushed fdstore-mapping memfd). We steal entries here so any leftover (skipped) entries
+         * are still cleaned up by the hashmap's destructor on the caller side. */
+
+        if (MANAGER_IS_TEST_RUN(m))
+                return 0;
+
+        for (;;) {
+                _cleanup_(listen_fds_tag_freep) ListenFDsTag *t = NULL;
+                _cleanup_close_ int fd = -EBADF;
+                void *key;
+
+                t = hashmap_steal_first_key_and_value(named_listen_fds, &key);
+                if (!t)
+                        break;
+
+                fd = PTR_TO_FD(key);
+
+                if (!t->unit_id || !t->fdname)
+                        continue;
+
+                if (!unit_name_is_valid(t->unit_id, UNIT_NAME_ANY))
+                        continue;
+
+                (void) manager_dispatch_external_fd_to_unit(m, t->unit_id, t->fdname, t->index, TAKE_FD(fd), "LISTEN_FDS");
+        }
+
+        return 0;
+}
+
 static void manager_distribute_fds(Manager *m, FDSet *fds) {
         Unit *u;
 
@@ -2034,7 +2122,7 @@ static int manager_make_runtime_dir(Manager *m) {
         return 0;
 }
 
-int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *root) {
+int manager_startup(Manager *m, FILE *serialization, FDSet *fds, Hashmap *named_listen_fds, const char *root) {
         int r;
 
         assert(m);
@@ -2103,6 +2191,11 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *roo
                 if (m->previous_objective == MANAGER_SOFT_REBOOT)
                         m->soft_reboots_count++;
 
+                /* Pick up fds passed via the LISTEN_FDS=/LISTEN_FDNAMES= protocol that were matched to a
+                 * unit id and fdname through the fdstore-mapping JSON memfd, and route them into that unit's
+                 * fd store. Unmatched fds remain in 'fds' and are handed to socket units below as before. */
+                (void) manager_distribute_listen_fds_named(m, named_listen_fds);
+
                 /* Any fds left? Find some unit which wants them. This is useful to allow container managers to pass
                  * some file descriptors to us pre-initialized. This enables socket-based activation of entire
                  * containers. */
@@ -2142,6 +2235,10 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *roo
                 /* Clean up runtime objects */
                 manager_vacuum(m);
 
+                /* After deserialization, refresh the upstream JSON mapping memfd so the supervisor's
+                 * view of our fd store stays consistent with the indices we just restored. */
+                (void) service_propagate_fd_store_mapping_upstream(m);
+
                 if (serialization)
                         /* Let's wait for the UnitNew/JobNew messages being sent, before we notify that the
                          * reload is finished */
index d17693369ab5a4937dff09b8349f7c2df566a1c3..4695112c041ffe8b0864a29f39f4207c1a4431e1 100644 (file)
@@ -513,6 +513,11 @@ typedef struct Manager {
         /* The number of successfully completed configuration reloads. */
         uint64_t reload_count;
 
+        /* Monotonic counter for fdstore entries propagated to a NOTIFY_SOCKET supervisor. Each propagated
+         * fd is sent upstream using this index as the FDNAME. The mapping (index -> unit_id + original fdname)
+         * is pushed alongside as a JSON memfd named "systemd-fdstore-mapping". */
+        uint64_t fd_store_upstream_next_index;
+
         /* Original ambient capabilities when we were initialized */
         uint64_t saved_ambient_set;
 } Manager;
@@ -542,7 +547,20 @@ int manager_new(RuntimeScope scope, ManagerTestRunFlags test_run_flags, Manager
 Manager* manager_free(Manager *m);
 DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free);
 
-int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *root);
+/* One entry parsed out of the upstream "systemd-fdstore-mapping" memfd. Pairs the numeric index from the
+ * JSON map to the (unit-id, original fdname) the fd was originally stored as. */
+typedef struct ListenFDsTag {
+        char *unit_id;
+        char *fdname;
+        uint64_t index;
+} ListenFDsTag;
+
+ListenFDsTag* listen_fds_tag_free(ListenFDsTag *t);
+DEFINE_TRIVIAL_CLEANUP_FUNC(ListenFDsTag*, listen_fds_tag_free);
+
+extern const struct hash_ops fd_to_listen_fds_tag_hash_ops;
+
+int manager_startup(Manager *m, FILE *serialization, FDSet *fds, Hashmap *named_listen_fds, const char *root);
 
 Job *manager_get_job(Manager *m, uint32_t id);
 Unit *manager_get_unit(Manager *m, const char *name);
@@ -552,6 +570,7 @@ int manager_get_job_from_dbus_path(Manager *m, const char *s, Job **_j);
 bool manager_unit_cache_should_retry_load(Unit *u);
 int manager_load_unit_prepare(Manager *m, const char *name, const char *path, sd_bus_error *e, Unit **ret);
 int manager_load_unit(Manager *m, const char *name, const char *path, sd_bus_error *e, Unit **ret);
+int manager_dispatch_external_fd_to_unit(Manager *m, const char *unit_id, const char *fdname, uint64_t index, int fd, const char *log_context);
 int manager_load_startable_unit_or_warn(Manager *m, const char *name, const char *path, Unit **ret);
 int manager_load_unit_from_dbus_path(Manager *m, const char *s, sd_bus_error *e, Unit **_u);
 
index 445812f9522d53098cc2378784ea8aebec7a6ec1..6a02ca8d6f9833dfd7f8dfcd2ec4c1becf409375 100644 (file)
@@ -6,6 +6,7 @@
 #include <unistd.h>
 
 #include "sd-bus.h"
+#include "sd-json.h"
 #include "sd-messages.h"
 
 #include "alloc-util.h"
@@ -15,6 +16,7 @@
 #include "bus-util.h"
 #include "cgroup.h"
 #include "chase.h"
+#include "daemon-util.h"
 #include "dbus-service.h"
 #include "dbus-unit.h"
 #include "devnum-util.h"
@@ -33,6 +35,7 @@
 #include "image-policy.h"
 #include "log.h"
 #include "manager.h"
+#include "memfd-util.h"
 #include "mount-util.h"
 #include "namespace.h"
 #include "open-file.h"
@@ -462,12 +465,27 @@ static void service_override_watchdog_timeout(Service *s, usec_t watchdog_overri
         log_unit_debug(UNIT(s), "watchdog_override_usec="USEC_FMT, s->watchdog_override_usec);
 }
 
-static ServiceFDStore* service_fd_store_unlink(ServiceFDStore *fs) {
+static ServiceFDStore* service_fd_store_unlink_full(ServiceFDStore *fs, bool propagate_upstream) {
         if (!fs)
                 return NULL;
 
         if (fs->service) {
                 assert(fs->service->n_fd_store > 0);
+
+                /* If we previously propagated this fd to an enveloping service/container manager via
+                 * the FDSTORE=1 protocol on its NOTIFY_SOCKET (only done when persistence is on),
+                 * tell that supervisor to drop it now too, so the upstream fd store stays in sync.
+                 * Only do this for explicit removals (EPOLLHUP/EPOLLERR or app FDSTOREREMOVE), not
+                 * for local cleanup like service shutdown or fdstore-limit truncation: in those
+                 * cases we want the upstream copy to survive so it can be handed back to us later. */
+                if (propagate_upstream && fs->index > 0) {
+                        (void) notify_remove_fd_warnf(SERVICE_FDSTORE_SUB_FDNAME_PREFIX "%" PRIu64, fs->index);
+                        fs->index = 0;
+                        /* Refresh the upstream JSON mapping so the supervisor's view stays in sync
+                         * with what fds are actually still around. */
+                        (void) service_propagate_fd_store_mapping_upstream(UNIT(fs->service)->manager);
+                }
+
                 LIST_REMOVE(fd_store, fs->service->fd_store, fs);
                 fs->service->n_fd_store--;
         }
@@ -479,6 +497,10 @@ static ServiceFDStore* service_fd_store_unlink(ServiceFDStore *fs) {
         return mfree(fs);
 }
 
+static ServiceFDStore* service_fd_store_unlink(ServiceFDStore *fs) {
+        return service_fd_store_unlink_full(fs, /* propagate_upstream= */ false);
+}
+
 DEFINE_TRIVIAL_CLEANUP_FUNC(ServiceFDStore*, service_fd_store_unlink);
 
 static void service_release_fd_store(Service *s) {
@@ -495,6 +517,21 @@ static void service_release_fd_store(Service *s) {
         assert(s->n_fd_store == 0);
 }
 
+static void service_truncate_fd_store(Service *s) {
+        assert(s);
+
+        /* Drop fds that exceed the (possibly newly lowered) n_fd_store_max, e.g. after the fragment was
+         * parsed and FileDescriptorStoreMax= shrunk the configured limit. Newest entries are at the head
+         * of the list, so drop from the head (newest first). */
+
+        while (s->n_fd_store > s->n_fd_store_max) {
+                ServiceFDStore *fs = ASSERT_PTR(s->fd_store);
+                log_unit_debug(UNIT(s), "Dropping stored fd '%s' to honor FileDescriptorStoreMax=%u.",
+                               strna(fs->fdname), s->n_fd_store_max);
+                service_fd_store_unlink(fs);
+        }
+}
+
 static void service_release_extra_fds(Service *s) {
         assert(s);
 
@@ -512,6 +549,15 @@ static void service_release_extra_fds(Service *s) {
         s->n_extra_fds = 0;
 }
 
+ServiceExtraFD* service_extra_fd_free(ServiceExtraFD *fd) {
+        if (!fd)
+                return NULL;
+
+        safe_close(fd->fd);
+        free(fd->fdname);
+        return mfree(fd);
+}
+
 static void service_release_stdio_fd(Service *s) {
         assert(s);
 
@@ -585,7 +631,7 @@ static int on_fd_store_io(sd_event_source *e, int fd, uint32_t revents, void *us
                        "Received %s on stored fd %d (%s), closing.",
                        revents & EPOLLERR ? "EPOLLERR" : "EPOLLHUP",
                        fs->fd, strna(fs->fdname));
-        service_fd_store_unlink(fs);
+        service_fd_store_unlink_full(fs, /* propagate_upstream= */ true);
 
         if (s->state == SERVICE_DEAD_RESOURCES_PINNED && !SERVICE_FD_STORE_POPULATED(s))
                 service_set_state(s, SERVICE_DEAD);
@@ -593,7 +639,7 @@ static int on_fd_store_io(sd_event_source *e, int fd, uint32_t revents, void *us
         return 0;
 }
 
-static int service_add_fd_store(Service *s, int fd_in, const char *name, bool do_poll) {
+int service_add_fd_store(Service *s, int fd_in, const char *name, bool do_poll, bool propagate_upstream) {
         _cleanup_(service_fd_store_unlinkp) ServiceFDStore *fs = NULL;
         _cleanup_(asynchronous_closep) int fd = ASSERT_FD(fd_in);
         struct stat st;
@@ -647,14 +693,43 @@ static int service_add_fd_store(Service *s, int fd_in, const char *name, bool do
 
         log_unit_debug(UNIT(s), "Added fd %i (%s) to fd store.", fs->fd, fs->fdname);
 
+        /* If fd-store persistence is enabled and we have an enveloping service/container manager (i.e.
+         * NOTIFY_SOCKET is set), forward the fd to it via sd_notify(FDSTORE=1) tagged with a fresh
+         * incrementing index, and (re-)push the JSON mapping memfd that pairs the index back to this
+         * unit and the original fdname. This way fdstore persistence chains all the way up to whichever
+         * entity is ultimately responsible for surviving across kexec/restart, regardless of fdname
+         * length or charset constraints. */
+        if (propagate_upstream && s->fd_store_preserve_mode == EXEC_PRESERVE_YES) {
+                Manager *m = ASSERT_PTR(UNIT(s)->manager);
+                char idx_str[STRLEN(SERVICE_FDSTORE_SUB_FDNAME_PREFIX) + DECIMAL_STR_MAX(uint64_t)];
+
+                assert(m->fd_store_upstream_next_index < UINT64_MAX);
+                uint64_t idx = ++m->fd_store_upstream_next_index;
+
+                xsprintf(idx_str, SERVICE_FDSTORE_SUB_FDNAME_PREFIX "%" PRIu64, idx);
+
+                r = notify_push_fd(fs->fd, idx_str);
+                if (r < 0)
+                        log_unit_debug_errno(UNIT(s), r,
+                                             "Failed to propagate fd '%s' to upstream supervisor as index %" PRIu64 ", ignoring: %m",
+                                             fs->fdname, idx);
+                else
+                        fs->index = idx;
+        }
+
         fs->service = s;
         LIST_PREPEND(fd_store, s->fd_store, TAKE_PTR(fs));
         s->n_fd_store++;
 
+        if (propagate_upstream && s->fd_store_preserve_mode == EXEC_PRESERVE_YES)
+                /* Refresh the JSON mapping memfd so the supervisor can resolve the new index. Do this
+                 * after LIST_PREPEND so the new entry is visible to the helper. */
+                (void) service_propagate_fd_store_mapping_upstream(UNIT(s)->manager);
+
         return 1; /* fd newly stored */
 }
 
-static int service_add_fd_store_set(Service *s, FDSet *fds, const char *name, bool do_poll) {
+static int service_add_fd_store_set(Service *s, FDSet *fds, const char *name, bool do_poll, bool propagate_upstream) {
         int r;
 
         assert(s);
@@ -666,7 +741,7 @@ static int service_add_fd_store_set(Service *s, FDSet *fds, const char *name, bo
                 if (fd < 0)
                         break;
 
-                r = service_add_fd_store(s, fd, name, do_poll);
+                r = service_add_fd_store(s, fd, name, do_poll, propagate_upstream);
                 if (r == -EXFULL)
                         return log_unit_warning_errno(UNIT(s), r,
                                                       "Cannot store more fds than FileDescriptorStoreMax=%u, closing remaining.",
@@ -678,6 +753,139 @@ static int service_add_fd_store_set(Service *s, FDSet *fds, const char *name, bo
         return 0;
 }
 
+int service_propagate_fd_store_mapping_upstream(Manager *m) {
+        _cleanup_(sd_json_variant_unrefp) sd_json_variant *root = NULL;
+        _cleanup_close_ int fd = -EBADF;
+        _cleanup_free_ char *text = NULL;
+        Unit *u;
+        int r;
+
+        assert(m);
+
+        /* Build a JSON object listing all fdstore entries that have been propagated upstream:
+         *
+         *   {
+         *     "unit-name.service": [
+         *       { "name": "fdname1", "index": 1 },
+         *       { "name": "fdname2", "index": 2 }
+         *     ],
+         *     ...
+         *   }
+         *
+         * Push it as a sealed memfd to the upstream supervisor under a fixed FDNAME so it can resolve
+         * the per-fd numeric indices back to (unit_id, original fdname) at startup. The mapping is
+         * regenerated and re-pushed after every add/remove, so the supervisor's view stays in sync. */
+        HASHMAP_FOREACH(u, m->units) {
+                _cleanup_(sd_json_variant_unrefp) sd_json_variant *entries = NULL;
+                Service *s;
+
+                if (u->type != UNIT_SERVICE)
+                        continue;
+
+                s = SERVICE(u);
+                if (!s->fd_store)
+                        continue;
+
+                LIST_FOREACH(fd_store, fs, s->fd_store) {
+                        if (fs->index == 0)
+                                continue;
+
+                        r = sd_json_variant_append_arraybo(
+                                        &entries,
+                                        SD_JSON_BUILD_PAIR_STRING("name", fs->fdname),
+                                        SD_JSON_BUILD_PAIR_UNSIGNED("index", fs->index));
+                        if (r < 0)
+                                return log_warning_errno(r, "Failed to build fdstore-mapping JSON entry: %m");
+                }
+
+                if (!entries)
+                        continue;
+
+                r = sd_json_variant_set_field(&root, u->id, entries);
+                if (r < 0)
+                        return log_warning_errno(r, "Failed to add unit to fdstore-mapping JSON: %m");
+        }
+
+        if (!root) {
+                /* Nothing to map: tell the supervisor to drop any previously-pushed mapping memfd
+                 * so it doesn't keep stale entries around. Only do this if we have actually pushed
+                 * one in the past (i.e. we ever assigned an upstream index, either in this
+                 * incarnation or in a previous one whose counter we deserialized), otherwise we
+                 * might inadvertently remove a mapping that was just handed back to us via
+                 * LISTEN_FDS during a fresh manager startup. */
+                if (m->fd_store_upstream_next_index > 0)
+                        (void) notify_remove_fd_warn(SERVICE_FDSTORE_MAPPING_FDNAME);
+                return 0;
+        }
+
+        r = sd_json_variant_format(root, /* flags= */ 0, &text);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to format fdstore-mapping JSON: %m");
+
+        fd = memfd_new_and_seal_string(SERVICE_FDSTORE_MAPPING_FDNAME, text);
+        if (fd < 0)
+                return log_warning_errno(fd, "Failed to create fdstore-mapping memfd: %m");
+
+        r = notify_push_fd(fd, SERVICE_FDSTORE_MAPPING_FDNAME);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to propagate fdstore-mapping to upstream supervisor: %m");
+
+        return 0;
+}
+
+static int service_attach_external_fd_to_fdstore(Unit *u, int fd, const char *fdname, uint64_t index) {
+        Service *s = ASSERT_PTR(SERVICE(u));
+        int r;
+
+        assert(u->type == UNIT_SERVICE);
+
+        /* If the unit file is absent, bump the limit by one and force preserve so the fd is
+         * accepted and pins the unit until a daemon-reload picks up the unit file or it is
+         * explicitly stopped. */
+        if (u->load_state == UNIT_NOT_FOUND) {
+                s->fd_store_preserve_mode = EXEC_PRESERVE_YES;
+                s->n_fd_store_max++;
+        }
+
+        /* Don't propagate upstream: the fd just came back from upstream, forwarding it would loop. */
+        r = service_add_fd_store(s, fd, fdname, /* do_poll= */ true, /* propagate_upstream= */ false);
+        if (r <= 0 && u->load_state == UNIT_NOT_FOUND)
+                s->n_fd_store_max--;
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "Failed to add LUO fd '%s' to fd store: %m", fdname);
+
+        /* If the fd was previously propagated to an upstream supervisor under a numeric index,
+         * preserve that index on the freshly-added entry so that future FDSTOREREMOVE messages
+         * (and the fdstore-mapping memfd we re-push to the supervisor) reference the same index
+         * the supervisor already knows about. service_add_fd_store() does LIST_PREPEND() on
+         * success, so the new entry is at the head. Also keep the manager's allocator counter
+         * past the highest restored index, to avoid collisions with newly allocated indices. */
+        if (r > 0 && index > 0 && s->fd_store) {
+                Manager *m = ASSERT_PTR(u->manager);
+
+                s->fd_store->index = index;
+                if (index > m->fd_store_upstream_next_index)
+                        m->fd_store_upstream_next_index = index;
+        }
+
+        /* If the unit is otherwise inactive (typical for LUO/upstream restore), pin its resources so it
+         * isn't garbage-collected before something explicitly stops it. Only flip the state when both
+         * runtime and deserialized state agree on DEAD, to avoid clobbering a just-deserialized live
+         * state (e.g. SERVICE_RUNNING after daemon-reload, where service_coldplug() will set the proper
+         * state later). */
+        if (r > 0 &&
+            s->state == SERVICE_DEAD &&
+            s->deserialized_state == SERVICE_DEAD &&
+            s->fd_store_preserve_mode == EXEC_PRESERVE_YES) {
+                service_set_state(s, SERVICE_DEAD_RESOURCES_PINNED);
+                s->deserialized_state = SERVICE_DEAD_RESOURCES_PINNED;
+        }
+
+        if (r > 0)
+                log_unit_debug(u, "Restored fd '%s'.", fdname);
+        return r;
+}
+
 static void service_remove_fd_store(Service *s, const char *name) {
         assert(s);
         assert(name);
@@ -687,7 +895,7 @@ static void service_remove_fd_store(Service *s, const char *name) {
                         continue;
 
                 log_unit_debug(UNIT(s), "Got explicit request to remove fd %i (%s), closing.", fs->fd, name);
-                service_fd_store_unlink(fs);
+                service_fd_store_unlink_full(fs, /* propagate_upstream= */ true);
         }
 }
 
@@ -952,6 +1160,11 @@ static int service_load(Unit *u) {
         if (u->load_state != UNIT_LOADED)
                 return 0;
 
+        /* The fragment may have lowered FileDescriptorStoreMax= below the number of fds currently in the
+         * store (e.g. fds were restored from LUO into a synthesized UNIT_NOT_FOUND service, and the real
+         * fragment that it just got via lazy reload now disables the fd store). */
+        service_truncate_fd_store(s);
+
         /* This is a new unit? Then let's add in some extras */
         r = service_add_extras(s);
         if (r < 0)
@@ -1437,7 +1650,8 @@ static int service_coldplug(Unit *u) {
         int r;
 
         assert(s);
-        assert(s->state == SERVICE_DEAD);
+        /* Fds may be inserted into the fd store before coldplug (e.g. at boot), which pins a dead unit */
+        assert(IN_SET(s->state, SERVICE_DEAD, SERVICE_DEAD_RESOURCES_PINNED));
 
         if (s->deserialized_state == s->state)
                 return 0;
@@ -3470,7 +3684,8 @@ static int service_serialize(Unit *u, FILE *f, FDSet *fds) {
                 if (!c)
                         return log_oom();
 
-                (void) serialize_item_format(f, "fd-store-fd", "%i \"%s\" %s", copy, c, one_zero(fs->do_poll));
+                (void) serialize_item_format(f, "fd-store-fd", "%i \"%s\" %s %" PRIu64,
+                                             copy, c, one_zero(fs->do_poll), fs->index);
         }
 
         FOREACH_ARRAY(i, s->extra_fds, s->n_extra_fds) {
@@ -3740,12 +3955,13 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value,
                 s->socket_fd = deserialize_fd(fds, value);
 
         } else if (streq(key, "fd-store-fd")) {
-                _cleanup_free_ char *fdv = NULL, *fdn = NULL, *fdp = NULL;
+                _cleanup_free_ char *fdv = NULL, *fdn = NULL, *fdp = NULL, *fdi = NULL;
                 _cleanup_close_ int fd = -EBADF;
                 int do_poll;
+                uint64_t index = 0;
 
-                r = extract_many_words(&value, " ", EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE, &fdv, &fdn, &fdp);
-                if (r < 2 || r > 3) {
+                r = extract_many_words(&value, " ", EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE, &fdv, &fdn, &fdp, &fdi);
+                if (r < 2 || r > 4) {
                         log_unit_debug(u, "Failed to deserialize fd-store-fd, ignoring: %s", value);
                         return 0;
                 }
@@ -3754,19 +3970,45 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value,
                 if (fd < 0)
                         return 0;
 
-                do_poll = r == 3 ? parse_boolean(fdp) : true;
+                do_poll = r >= 3 ? parse_boolean(fdp) : true;
                 if (do_poll < 0) {
                         log_unit_debug_errno(u, do_poll,
                                              "Failed to deserialize fd-store-fd do_poll, ignoring: %s", fdp);
                         return 0;
                 }
 
-                r = service_add_fd_store(s, TAKE_FD(fd), fdn, do_poll);
+                if (r == 4 && safe_atou64(fdi, &index) < 0) {
+                        log_unit_debug(u, "Failed to parse fd-store-fd index '%s', ignoring.", fdi);
+                        index = 0;
+                }
+
+                /* If the unit file is currently absent (e.g. after switch-root, before the unit file is
+                 * available in the new root), the synthesized service has n_fd_store_max=0 and
+                 * preserve_mode=NO, which would reject the fd. Grow the limit by one per fd so it matches
+                 * exactly what was handed back, and force EXEC_PRESERVE_YES, so the fd survives until
+                 * either a daemon-reload picks up the unit file or the service is explicitly stopped.
+                 * Same logic as in luo_dispatch_fd(). */
+                if (u->load_state == UNIT_NOT_FOUND) {
+                        s->fd_store_preserve_mode = EXEC_PRESERVE_YES;
+                        s->n_fd_store_max++;
+                }
+
+                /* Don't propagate upstream during deserialization: the upstream supervisor (if any)
+                 * already has these fds from when they were originally pushed. */
+                r = service_add_fd_store(s, TAKE_FD(fd), fdn, do_poll, /* propagate_upstream= */ false);
+                if (r <= 0 && u->load_state == UNIT_NOT_FOUND)
+                        /* The fd was not actually stored, roll back the limit bump. */
+                        s->n_fd_store_max--;
                 if (r < 0) {
                         log_unit_debug_errno(u, r,
                                              "Failed to store deserialized fd '%s', ignoring: %m", fdn);
                         return 0;
                 }
+                /* A non-zero index means this fd was propagated upstream when it was first pushed.
+                 * Restore the index so future removals can be forwarded upstream and the JSON mapping
+                 * memfd can be regenerated. */
+                if (r > 0 && s->fd_store && index > 0)
+                        s->fd_store->index = index;
         } else if (streq(key, "extra-fd")) {
                 _cleanup_free_ char *fdv = NULL, *fdn = NULL;
                 _cleanup_close_ int fd = -EBADF;
@@ -5280,7 +5522,7 @@ static void service_notify_message(
                         name = NULL;
                 }
 
-                (void) service_add_fd_store_set(s, fds, name, !strv_contains(tags, "FDPOLL=0"));
+                (void) service_add_fd_store_set(s, fds, name, !strv_contains(tags, "FDPOLL=0"), /* propagate_upstream= */ fdstore_detected());
         }
 
         /* Notify clients about changed status or main pid */
@@ -6178,6 +6420,8 @@ const UnitVTable service_vtable = {
         .serialize = service_serialize,
         .deserialize_item = service_deserialize_item,
 
+        .attach_external_fd_to_fdstore = service_attach_external_fd_to_fdstore,
+
         .active_state = service_active_state,
         .sub_state_to_string = service_sub_state_to_string,
 
index 9750b19ce285ffcf137a3274ba0b4e0424b980ea..b57634cdb0f41c7ff151ff0371a68c9c4d0b338e 100644 (file)
@@ -8,6 +8,18 @@
 #include "pidref.h"
 #include "unit.h"
 
+/* FDNAME used to push the JSON mapping memfd that pairs upstream-propagated fdstore indices with
+ * (unit-id, original fdname) tuples. The receiving manager looks for this fdname in LISTEN_FDNAMES
+ * to find the mapping document. */
+#define SERVICE_FDSTORE_MAPPING_FDNAME "systemd-fdstore-mapping"
+
+/* Prefix for the upstream FDNAME used when forwarding individual fd-store entries to a parent
+ * supervisor: the entries are exposed as "sub-fdstore-<index>" so the supervisor's own fd-store
+ * namespace doesn't collide with names a downstream service manager assigns. The trailing index
+ * is matched up with an entry in the SERVICE_FDSTORE_MAPPING_FDNAME memfd to recover the original
+ * (unit, fdname) pair. */
+#define SERVICE_FDSTORE_SUB_FDNAME_PREFIX "sub-fdstore-"
+
 typedef enum ServiceRestart {
         SERVICE_RESTART_NO,
         SERVICE_RESTART_ON_SUCCESS,
@@ -112,6 +124,10 @@ typedef struct ServiceFDStore {
         char *fdname;
         sd_event_source *event_source;
         bool do_poll;
+        /* If non-zero, this fd was forwarded to the NOTIFY_SOCKET supervisor via FDSTORE=1, with
+         * SERVICE_FDSTORE_SUB_FDNAME_PREFIX followed by the stringified index as its FDNAME. The originating
+         * unit-id and original fdname are recorded in a JSON mapping memfd that is also pushed upstream. */
+        uint64_t index;
 
         LIST_FIELDS(struct ServiceFDStore, fd_store);
 } ServiceFDStore;
@@ -279,6 +295,12 @@ extern const UnitVTable service_vtable;
 int service_set_socket_fd(Service *s, int fd, struct Socket *socket, struct SocketPeer *peer, bool selinux_context_net);
 void service_release_socket_fd(Service *s);
 
+int service_add_fd_store(Service *s, int fd_in, const char *name, bool do_poll, bool propagate_upstream);
+
+int service_propagate_fd_store_mapping_upstream(Manager *m);
+
+ServiceExtraFD* service_extra_fd_free(ServiceExtraFD *fd);
+
 usec_t service_restart_usec_next(const Service *s) _pure_;
 
 int service_determine_exec_selinux_label(Service *s, char **ret);
index d20e46ab57927f548afd9ed5c60ff8c082c0fde0..f96ae279362ccf207e6a1ea34705814a12d6640d 100644 (file)
@@ -618,6 +618,13 @@ typedef struct UnitVTable {
         /* Try to match up fds with what we need for this unit */
         void (*distribute_fds)(Unit *u, FDSet *fds);
 
+        /* Restore one file descriptor that PID 1 retrieved from a Live Update Orchestrator session into the
+         * unit's per-instance state (e.g. fd store). Always consumes 'fd', even on failure. If the fd
+         * was previously propagated to an upstream NOTIFY_SOCKET supervisor under a numeric index,
+         * 'index' carries that index so it can be re-claimed (avoiding collisions with newly allocated
+         * indices and keeping FDSTOREREMOVE messages routable). Pass 0 to indicate no preserved index. */
+        int (*attach_external_fd_to_fdstore)(Unit *u, int fd, const char *fdname, uint64_t index);
+
         /* Boils down the more complex internal state of this unit to
          * a simpler one that the engine can understand */
         UnitActiveState (*active_state)(Unit *u);
index 321a9e58dafc37ed8f4d87e4e78dcdbed487369b..d85e52bcb68c3d6d89bf2af8f36adb5368b8190d 100644 (file)
@@ -7,6 +7,7 @@
 #include "errno-util.h"
 #include "fd-util.h"
 #include "log.h"
+#include "parse-util.h"
 #include "string-util.h"
 #include "time-util.h"
 
@@ -84,6 +85,27 @@ int notify_push_fdf(int fd, const char *format, ...) {
         return notify_push_fd(fd, name);
 }
 
+bool fdstore_detected(void) {
+        static int cached = -1;
+        int r;
+
+        if (cached >= 0)
+                return cached;
+
+        const char *e = getenv("FDSTORE");
+        if (isempty(e))
+                return (cached = 0);
+
+        unsigned u;
+        r = safe_atou(e, &u);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to parse 'FDSTORE=%s', ignoring: %m", e);
+                return (cached = 0);
+        }
+
+        return (cached = u > 0);
+}
+
 int notify_reloading_full(const char *status) {
         int r;
 
index 708a32985c6c521a521890560cbc11ca806a6322..089e418f7b809eb373b60c07094660fd23aa5375 100644 (file)
@@ -27,6 +27,8 @@ int close_and_notify_warn(int fd, const char *name);
 int notify_push_fd(int fd, const char *name);
 int notify_push_fdf(int fd, const char *format, ...) _printf_(2, 3);
 
+bool fdstore_detected(void);
+
 int notify_reloading_full(const char *status);
 static inline int notify_reloading(void) {
         return notify_reloading_full("Reloading configuration...");
index c3d8e7d5d54b82c1a0c1187fc69cc18cc86437f3..e591d0d5ea60ee5e7ba7ba129ddde8a1b167a6d5 100644 (file)
@@ -75,7 +75,7 @@ int main(int argc, char *argv[]) {
         /* The simple tests succeeded. Now let's try full unit-based use-case. */
 
         ASSERT_OK(manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_BASIC, &m));
-        ASSERT_OK(manager_startup(m, NULL, NULL, NULL));
+        ASSERT_OK(manager_startup(m, NULL, NULL, NULL, NULL));
 
         ASSERT_NOT_NULL(u = unit_new(m, sizeof(Service)));
         ASSERT_EQ(unit_add_name(u, "foo.service"), 0);
index bf56451d2501c47fbaaedd1fb4aa6eea693157f7..9d9093bee76471a61ca9a64c4d5f17d388b71a5d 100644 (file)
@@ -303,7 +303,7 @@ int main(int argc, char *argv[]) {
         assert_se(runtime_dir = setup_fake_runtime_dir());
 
         ASSERT_OK(manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_BASIC, &m));
-        ASSERT_OK(manager_startup(m, NULL, NULL, NULL));
+        ASSERT_OK(manager_startup(m, NULL, NULL, NULL, NULL));
 
         ASSERT_OK(test_bpf_cgroup_programs(m,
                                 "single_prog.service", single_prog, ELEMENTSOF(single_prog)));
index 9d54ad033760edb2a45358b0780c32cd65135fda..8e41537f21ff7fe0a61f69cfd7e045985318e130 100644 (file)
@@ -87,7 +87,7 @@ int main(int argc, char *argv[]) {
         ASSERT_NOT_NULL((runtime_dir = setup_fake_runtime_dir()));
 
         ASSERT_OK(manager_new(RUNTIME_SCOPE_SYSTEM, MANAGER_TEST_RUN_BASIC, &m));
-        ASSERT_OK(manager_startup(m, NULL, NULL, NULL));
+        ASSERT_OK(manager_startup(m, NULL, NULL, NULL, NULL));
 
         /* We need to enable access to the filesystem where the binary is so we
          * add @common-block and @application */
index 6b123f8761b4786a64fa98fd8bc6474fee4de9fa..967341508053efd9ea9834a360a4d663d747d1aa 100644 (file)
@@ -51,7 +51,7 @@ TEST_RET(cgroup_mask, .sd_booted = true) {
                 m->defaults.tasks_accounting = false;
         m->defaults.tasks_max = CGROUP_TASKS_MAX_UNSET;
 
-        assert_se(manager_startup(m, NULL, NULL, NULL) >= 0);
+        assert_se(manager_startup(m, NULL, NULL, NULL, NULL) >= 0);
 
         /* Load units and verify hierarchy. */
         ASSERT_OK(manager_load_startable_unit_or_warn(m, "parent.slice", NULL, &parent));
index e1a2f7ea0482333da167a31ecd551c2a74aeef54..5a6a1a42044060deaddcdbe3a20e4b0e73620dad 100644 (file)
@@ -97,7 +97,7 @@ int main(int argc, char *argv[]) {
         if (manager_errno_skip_test(r))
                 return log_tests_skipped_errno(r, "manager_new");
         assert_se(r >= 0);
-        assert_se(manager_startup(m, NULL, NULL, NULL) >= 0);
+        assert_se(manager_startup(m, NULL, NULL, NULL, NULL) >= 0);
 
         printf("Load1:\n");
         assert_se(manager_load_startable_unit_or_warn(m, "a.service", NULL, &a) >= 0);
index e14205bdf86a3bf1e6e7aab017d7ac9786718a6d..3a124f967431ff35003d95502dd42108c6200efd 100644 (file)
@@ -1435,7 +1435,7 @@ static void run_tests(RuntimeScope scope, char **patterns) {
         ASSERT_OK(r);
 
         m->defaults.std_output = EXEC_OUTPUT_INHERIT; /* don't rely on host journald */
-        ASSERT_OK(manager_startup(m, NULL, NULL, NULL));
+        ASSERT_OK(manager_startup(m, NULL, NULL, NULL, NULL));
 
         /* Uncomment below if you want to make debugging logs stored to journal. */
         //manager_override_log_target(m, LOG_TARGET_AUTO);
index 3e318a36c91d3e9c33c99fb2ab42338ab87fb3f0..259bd142e4efb23fafb35b8ea338708cfa79d771 100644 (file)
@@ -108,7 +108,7 @@ TEST(config_parse_exec) {
         }
 
         ASSERT_OK(r);
-        ASSERT_OK(manager_startup(m, NULL, NULL, NULL));
+        ASSERT_OK(manager_startup(m, NULL, NULL, NULL, NULL));
 
         ASSERT_NOT_NULL(u = unit_new(m, sizeof(Service)));
 
@@ -430,7 +430,7 @@ TEST(config_parse_log_extra_fields) {
         }
 
         ASSERT_OK(r);
-        ASSERT_OK(manager_startup(m, NULL, NULL, NULL));
+        ASSERT_OK(manager_startup(m, NULL, NULL, NULL, NULL));
 
         ASSERT_NOT_NULL(u = unit_new(m, sizeof(Service)));
 
@@ -788,7 +788,7 @@ TEST(config_parse_unit_env_file) {
         }
 
         ASSERT_OK(r);
-        ASSERT_OK(manager_startup(m, NULL, NULL, NULL));
+        ASSERT_OK(manager_startup(m, NULL, NULL, NULL, NULL));
 
         ASSERT_NOT_NULL(u = unit_new(m, sizeof(Service)));
         ASSERT_OK_ZERO(unit_add_name(u, "foobar.service"));
@@ -912,7 +912,7 @@ TEST(unit_is_recursive_template_dependency) {
         }
 
         ASSERT_OK(r);
-        ASSERT_OK(manager_startup(m, NULL, NULL, NULL));
+        ASSERT_OK(manager_startup(m, NULL, NULL, NULL, NULL));
 
         ASSERT_NOT_NULL(u = unit_new(m, sizeof(Service)));
         ASSERT_OK_ZERO(unit_add_name(u, "foobar@1.service"));
@@ -1006,7 +1006,7 @@ TEST(config_parse_open_file) {
         }
 
         ASSERT_OK(r);
-        ASSERT_OK(manager_startup(m, NULL, NULL, NULL));
+        ASSERT_OK(manager_startup(m, NULL, NULL, NULL, NULL));
 
         ASSERT_NOT_NULL(u = unit_new(m, sizeof(Service)));
         ASSERT_OK_ZERO(unit_add_name(u, "foobar.service"));
@@ -1065,7 +1065,7 @@ TEST(config_parse_service_refresh_on_reload) {
         }
 
         ASSERT_OK(r);
-        ASSERT_OK(manager_startup(m, NULL, NULL, NULL));
+        ASSERT_OK(manager_startup(m, NULL, NULL, NULL, NULL));
 
         ASSERT_NOT_NULL(u = unit_new(m, sizeof(Service)));
         ASSERT_OK_ZERO(unit_add_name(u, "foobar.service"));
index 8b02f5d0fffa4542a999f01b564dfb85ab2587f4..512eb96ead60f94b6650cb887b191a762227d5f6 100644 (file)
@@ -34,7 +34,7 @@ static int setup_test(Manager **m) {
         if (manager_errno_skip_test(r))
                 return log_tests_skipped_errno(r, "manager_new");
         assert_se(r >= 0);
-        assert_se(manager_startup(tmp, NULL, NULL, NULL) >= 0);
+        assert_se(manager_startup(tmp, NULL, NULL, NULL, NULL) >= 0);
 
         STRV_FOREACH(test_path, tests_path) {
                 _cleanup_free_ char *p = NULL;
index c1305ac4abd21bfd5f29bcced460829a7998001f..a523c01e8f01b1f174bbdadf4c79affd5fdcc458 100644 (file)
@@ -33,7 +33,7 @@ int main(int argc, char *argv[]) {
         if (manager_errno_skip_test(r))
                 return log_tests_skipped_errno(r, "manager_new");
         assert_se(r >= 0);
-        assert_se(manager_startup(m, NULL, NULL, NULL) >= 0);
+        assert_se(manager_startup(m, NULL, NULL, NULL, NULL) >= 0);
 
         /* load idle ok */
         assert_se(manager_load_startable_unit_or_warn(m, "sched_idle_ok.service", NULL, &idle_ok) >= 0);
index 4e4fdbedd0fa9b62ceb4da8e0572e9b6204e6316..63d249d6b4efe60194b9f352c22077653774835d 100644 (file)
@@ -133,7 +133,7 @@ int main(int argc, char *argv[]) {
         assert_se(runtime_dir = setup_fake_runtime_dir());
 
         assert_se(manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_BASIC, &m) >= 0);
-        assert_se(manager_startup(m, NULL, NULL, NULL) >= 0);
+        assert_se(manager_startup(m, NULL, NULL, NULL, NULL) >= 0);
 
         assert_se(test_socket_bind(m, "socket_bind_test.service", netcat_path, "2000", STRV_MAKE("2000"), STRV_MAKE("any")) >= 0);
         assert_se(test_socket_bind(m, "socket_bind_test.service", netcat_path, "2000", STRV_MAKE("ipv6:2001-2002"), STRV_MAKE("any")) >= 0);
index 0a325dfb0880a40ffd82f31fe97245baea3e474c..59f043f49ca33e9a5c3d5c49e8f78ea7a4a94802 100644 (file)
@@ -21,7 +21,7 @@ TEST(watch_pid) {
         ASSERT_NOT_NULL(runtime_dir = setup_fake_runtime_dir());
 
         ASSERT_OK(manager_new(RUNTIME_SCOPE_USER, MANAGER_TEST_RUN_BASIC, &m));
-        ASSERT_OK(manager_startup(m, NULL, NULL, NULL));
+        ASSERT_OK(manager_startup(m, NULL, NULL, NULL, NULL));
 
         ASSERT_NOT_NULL(a = unit_new(m, sizeof(Service)));
         ASSERT_OK(unit_add_name(a, "a.service"));