From c10c2674f39929998009e735ff85ed7b8378ec04 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Wed, 15 Apr 2026 13:11:30 +0100 Subject: [PATCH] LUO: add support for preserving third party sessions LUO sessions cannot be nested under other sessions. This means we need to handle them explicitly, and hold them open in the shutdown binary like we do with our own internal session, to allow services to create their own. The requirement to support third party sessions comes from VMMs that wish to preserve VM(s) state(s) across kexec, as some file descriptors (KVM's vmfd from the KVM_CREATE_VM ioctl) cannot be transferred between processes via SCM_RIGHTS, so they cannot be stashed in the FD Store directly. Also some file descriptors have to be handled all together or not at all, again to do with KVM and devices that are all part of the same VM. --- docs/FILE_DESCRIPTOR_STORE.md | 8 +++ src/core/luo.c | 42 +++++++++++++- src/shared/luo-util.c | 67 ++++++++++++++++++++- src/shared/luo-util.h | 6 ++ src/test/test-luo.c | 99 +++++++++++++++++++++++++++++--- test/units/TEST-91-LIVEUPDATE.sh | 23 ++++---- 6 files changed, 220 insertions(+), 25 deletions(-) diff --git a/docs/FILE_DESCRIPTOR_STORE.md b/docs/FILE_DESCRIPTOR_STORE.md index e6141a4e01b..c3336453781 100644 --- a/docs/FILE_DESCRIPTOR_STORE.md +++ b/docs/FILE_DESCRIPTOR_STORE.md @@ -206,6 +206,14 @@ Adding a `FDNAME=…` string identifying the fd is also highly recommended, otherwise in case multiple fds are stored, it will be impossible to distinguish them, as they will all carry the default name (`stored`). +Services that need to preserve additional kernel state may also create their +own LUO sessions by opening `/dev/liveupdate` directly (see the kernel +documentation linked above) and pushing the obtained session fd into their +fdstore (it is recommended to use a `FDNAME=…` string, as above).
systemd +detects such fds and arranges for them to survive the kexec as well, so that +the session, and any supported file descriptors preserved inside it, is +handed back to the service on the other side of the reboot. + ## Initrd Transitions The fdstore may also be used to pass file descriptors for resources from the diff --git a/src/core/luo.c b/src/core/luo.c index 4b5632c6dba..9dc892d5d69 100644 --- a/src/core/luo.c +++ b/src/core/luo.c @@ -104,6 +104,7 @@ int manager_luo_restore_fd_stores(Manager *m) { const char *type; const char *name; uint64_t token; + const char *session_name; } p = { .token = UINT64_MAX, }; @@ -112,6 +113,7 @@ int manager_luo_restore_fd_stores(Manager *m) { { "type", SD_JSON_VARIANT_STRING, sd_json_dispatch_const_string, voffsetof(p, type), SD_JSON_MANDATORY }, { "name", SD_JSON_VARIANT_STRING, sd_json_dispatch_const_string, voffsetof(p, name), SD_JSON_MANDATORY }, { "token", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint64, voffsetof(p, token), 0 }, + { "sessionName", SD_JSON_VARIANT_STRING, sd_json_dispatch_const_string, voffsetof(p, session_name), 0 }, {} }; @@ -137,6 +139,20 @@ int manager_luo_restore_fd_stores(Manager *m) { unit_id, p.name, p.token); continue; } + } else if (streq(p.type, "luo_session")) { + if (!p.session_name) { + log_warning("LUO mapping for unit '%s' fd '%s': missing sessionName.", unit_id, p.name); + continue; + } + + fd = luo_retrieve_session(device_fd, p.session_name); + if (fd < 0) { + log_warning_errno(fd, "Failed to retrieve LUO session '%s' for unit '%s' name '%s': %m", + p.session_name, unit_id, p.name); + continue; + } + + log_debug("Retrieved LUO session '%s' for unit fd store '%s'.", p.session_name, p.name); } else { log_warning("LUO mapping for unit '%s' fd '%s': unknown type '%s', skipping.", unit_id, p.name, p.type); @@ -177,7 +193,8 @@ int manager_luo_serialize_fd_stores(Manager *m, FILE **ret_f, FDSet **ret_fds) { if (!fds) return log_oom(); - /* Build a JSON object: { "unit_id": [ { 
"type": "fd", "name": "...", "fd": N }, ... ], ... } + /* Build a JSON object: { "unit_id": [ { "type": "fd", "name": "...", "fd": N }, + * { "type": "luo_session", "name": "...", "fd": N, "sessionName": "..." } ], ... } * This is passed to systemd-shutdown which will create a LUO session and preserve the fds. */ HASHMAP_FOREACH(u, m->units) { _cleanup_(sd_json_variant_unrefp) sd_json_variant *entries = NULL; @@ -195,17 +212,36 @@ int manager_luo_serialize_fd_stores(Manager *m, FILE **ret_f, FDSet **ret_fds) { continue; LIST_FOREACH(fd_store, fs, s->fd_store) { + _cleanup_free_ char *session_name = NULL; int copy; + /* Check if this fd is itself a LUO session, as those cannot be nested and need + * special handling */ + r = fd_get_luo_session_name(fs->fd, &session_name); + if (r < 0 && r != -EMEDIUMTYPE) { + log_warning_errno(r, "Failed to check if fd '%s' of unit '%s' is a LUO session, skipping: %m", + fs->fdname, u->id); + continue; + } + + /* Ensure nobody tries to hijack our session, as we will create this later before + * kexec */ + if (streq_ptr(session_name, LUO_SESSION_NAME)) { + log_warning("Skipping fd '%s' of unit '%s' for LUO serialization, as the session name '%s' infringes systemd's namespace.", + fs->fdname, u->id, session_name); + continue; + } + copy = fdset_put_dup(fds, fs->fd); if (copy < 0) return log_error_errno(copy, "Failed to duplicate fd for LUO serialization: %m"); r = sd_json_variant_append_arraybo( &entries, - SD_JSON_BUILD_PAIR_STRING("type", "fd"), + SD_JSON_BUILD_PAIR_STRING("type", session_name ? 
"luo_session" : "fd"), SD_JSON_BUILD_PAIR_STRING("name", fs->fdname), - SD_JSON_BUILD_PAIR_INTEGER("fd", copy)); + SD_JSON_BUILD_PAIR_INTEGER("fd", copy), + JSON_BUILD_PAIR_STRING_NON_EMPTY("sessionName", session_name)); if (r < 0) return log_error_errno(r, "Failed to build JSON for LUO serialization: %m"); diff --git a/src/shared/luo-util.c b/src/shared/luo-util.c index c1a579a0a00..898e51efa52 100644 --- a/src/shared/luo-util.c +++ b/src/shared/luo-util.c @@ -2,7 +2,9 @@ #include #include +#include #include +#include #include "sd-json.h" @@ -15,6 +17,7 @@ #include "luo-util.h" #include "memfd-util.h" #include "parse-util.h" +#include "stat-util.h" #include "string-util.h" /* Kernel API defined at https://docs.kernel.org/userspace-api/liveupdate.html The /dev/liveupdate is a @@ -219,9 +222,12 @@ int luo_preserve_fd_stores(sd_json_variant *serialization, int *ret_session_fd) return log_error_errno(session_fd, "Failed to create LUO session '%s': %m", LUO_SESSION_NAME); /* Build the mapping JSON for the new kernel's PID 1 and preserve each fd. - * JSON format: { "unit_id": [ {"type": "fd", "name": "...", "token": N}, ... ], ... } + * JSON format: { "unit_id": [ {"type": "fd", "name": "...", "token": N}, + * {"type": "luo_session", "name": "...", "sessionName": "..."} ], ... } * - * For regular fds: type=fd, preserved in the systemd session with the given LUO token. */ + * For regular fds: type=fd, preserved in the systemd session with the given LUO token. + * For LUO session fds: type=luo_session, the session survives kexec independently, as it cannot be + * nested. 
*/ JSON_VARIANT_OBJECT_FOREACH(unit_id, entries, serialization) { _cleanup_(sd_json_variant_unrefp) sd_json_variant *fd_list = NULL; sd_json_variant *entry; @@ -270,6 +276,23 @@ int luo_preserve_fd_stores(sd_json_variant *serialization, int *ret_session_fd) return log_error_errno(r, "Failed to build LUO mapping: %m"); ++token; + } else if (streq(p.type, "luo_session")) { + if (!p.session_name) { + log_warning("LUO mapping for unit '%s' fd '%s': missing sessionName, skipping.", unit_id, p.name); + continue; + } + + /* Remember the FDStore name to session name mapping */ + r = sd_json_variant_append_arraybo( + &fd_list, + SD_JSON_BUILD_PAIR_STRING("type", "luo_session"), + SD_JSON_BUILD_PAIR_STRING("name", p.name), + SD_JSON_BUILD_PAIR_STRING("sessionName", p.session_name)); + if (r < 0) + return log_error_errno(r, "Failed to build LUO mapping for session fd: %m"); + + log_debug("LUO session fd '%s' (session '%s') recorded in mapping.", + p.name, p.session_name); } else log_warning("Unknown fd type '%s' for unit '%s' fd '%s', skipping.", p.type, unit_id, p.name); } @@ -311,3 +334,43 @@ int luo_preserve_fd_stores(sd_json_variant *serialization, int *ret_session_fd) *ret_session_fd = TAKE_FD(session_fd); return 1; } + +int fd_get_luo_session_name(int fd, char **ret) { + _cleanup_free_ char *path = NULL; + int r; + + assert(fd >= 0); + + // TODO: switch to LUO specific inode magic once available + r = fd_is_fs_type(fd, ANON_INODE_FS_MAGIC); + if (r < 0) + return r; + if (r == 0) + return -EMEDIUMTYPE; + + r = fd_get_path(fd, &path); + if (r < 0) + return r; + + /* Path is "anon_inode:[luo_session] " */ + const char *suffix = startswith(path, "anon_inode:[luo_session] "); + if (isempty(suffix)) + return -EMEDIUMTYPE; + + if (ret) + return strdup_to(ret, suffix); + + return 0; +} + +int fd_is_luo_session(int fd) { + int r; + + r = fd_get_luo_session_name(fd, /* ret= */ NULL); + if (r == -EMEDIUMTYPE) + return false; + if (r < 0) + return r; + + return true; +} diff --git 
a/src/shared/luo-util.h b/src/shared/luo-util.h index 9f6c061910a..1ca7ca6d712 100644 --- a/src/shared/luo-util.h +++ b/src/shared/luo-util.h @@ -13,6 +13,7 @@ * "unit-name.service": [ * { "type": "fd", "name": "fdname1", "token": 1 }, * { "type": "fd", "name": "fdname2", "token": 2 }, + * { "type": "luo_session", "name": "fdname3", "sessionName": "unit.service/myapp" } * ], * "other-unit.service": [ * { "type": "fd", "name": "stored", "token": 3 } @@ -20,6 +21,8 @@ * } * * type=fd: the fd was preserved in the "systemd" LUO session with the given token. + * type=luo_session: a service-owned LUO session that survives kexec independently, + * retrieved by session_name on the next boot. */ #define LUO_MAPPING_INDEX UINT64_C(0) @@ -32,3 +35,6 @@ int luo_session_finish(int session_fd); int luo_parse_serialization(sd_json_variant **ret, int **ret_fds, size_t *ret_n_fds); int luo_preserve_fd_stores(sd_json_variant *serialization, int *ret_session_fd); + +int fd_is_luo_session(int fd); +int fd_get_luo_session_name(int fd, char **ret); diff --git a/src/test/test-luo.c b/src/test/test-luo.c index 95ccc84b1a6..97ba66459d2 100644 --- a/src/test/test-luo.c +++ b/src/test/test-luo.c @@ -1,11 +1,12 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ /* Helper for TEST-91-LIVEUPDATE: creates memfds and stores them in the fd store, - * or verifies that inherited fd store entries contain the expected content. + * creates a LUO session directly via /dev/liveupdate and stores a memfd in it, + * or verifies everything after kexec. 
* * Usage: - * test-luo store - create memfds with test data and push them to the fd store - * test-luo check - verify fd store content matches expectations + * test-luo store - create memfds and a LUO session, push all to the fd store + * test-luo check - verify fd store content and LUO session memfd after kexec */ #include @@ -16,6 +17,7 @@ #include "fd-util.h" #include "log.h" +#include "luo-util.h" #include "main-func.h" #include "memfd-util.h" #include "parse-util.h" @@ -25,8 +27,10 @@ #define TEST_DATA_1 "liveupdate-test-data-1" #define TEST_DATA_2 "liveupdate-test-data-2" +#define SESSION_MEMFD_DATA "luo-session-memfd-test-data" +#define SESSION_MEMFD_TOKEN UINT64_C(42) -static int do_store(void) { +static int do_store(const char *prefix) { _cleanup_close_ int fd1 = -EBADF, fd2 = -EBADF; int r; @@ -48,6 +52,33 @@ static int do_store(void) { log_info("Stored 2 memfds in fd store."); + /* Create a LUO session directly via /dev/liveupdate, put a memfd in it, and store the session fd */ + _cleanup_close_ int device_fd = -EBADF, session_fd = -EBADF, session_memfd = -EBADF; + const char *session_name = strjoina(prefix, "-direct"); + + device_fd = luo_open_device(); + if (device_fd < 0) + return log_error_errno(device_fd, "Failed to open /dev/liveupdate: %m"); + + session_fd = luo_create_session(device_fd, session_name); + if (session_fd < 0) + return log_error_errno(session_fd, "Failed to create LUO session '%s': %m", session_name); + + session_memfd = memfd_new_and_seal("session-test", SESSION_MEMFD_DATA, strlen(SESSION_MEMFD_DATA)); + if (session_memfd < 0) + return log_error_errno(session_memfd, "Failed to create session memfd: %m"); + + r = luo_session_preserve_fd(session_fd, session_memfd, SESSION_MEMFD_TOKEN); + if (r < 0) + return log_error_errno(r, "Failed to preserve memfd in session: %m"); + + r = sd_pid_notifyf_with_fds(0, false, &session_fd, 1, "FDSTORE=1\nFDNAME=%s-direct", prefix); + if (r < 0) + return log_error_errno(r, "Failed to store session fd 
in fd store: %m"); + TAKE_FD(session_fd); + + log_info("Stored LUO session '%s' with memfd in fd store.", session_name); + /* Wait for PID 1 to actually process all our FDSTORE notifications before we exit, otherwise * the cgroup-based pidref to unit lookup may fail once we're gone, and the fds end up closed. */ r = sd_notify_barrier(0, 5 * USEC_PER_SEC); @@ -57,9 +88,10 @@ static int do_store(void) { return 0; } -static int do_check(void) { +static int do_check(const char *prefix) { const char *e; _cleanup_strv_free_ char **names = NULL; + const char *session_fdname = strjoina(prefix, "-direct"); size_t n_fds; int r; @@ -146,6 +178,53 @@ static int do_check(void) { log_info("All fd store checks passed."); + /* Verify the LUO session fd survived and its memfd content is intact */ + int session_fd = -EBADF; + size_t idx = 0; + STRV_FOREACH(name, names) { + if (idx >= n_fds) + break; + if (streq(*name, session_fdname)) { + session_fd = SD_LISTEN_FDS_START + idx; + break; + } + idx++; + } + + if (session_fd < 0) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), + "LUO session fd '%s' not found in fd store", session_fdname); + + r = fd_is_luo_session(session_fd); + if (r < 0) + return log_error_errno(r, "Failed to check if fd is LUO session: %m"); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), + "fd '%s' is not a LUO session!", session_fdname); + + _cleanup_close_ int session_memfd = luo_session_retrieve_fd(session_fd, SESSION_MEMFD_TOKEN); + if (session_memfd < 0) + return log_error_errno(session_memfd, "Failed to retrieve memfd from session: %m"); + + char sbuf[256]; + ssize_t sn = pread(session_memfd, sbuf, sizeof(sbuf) - 1, 0); + if (sn < 0) + return log_error_errno(errno, "Failed to read session memfd: %m"); + sbuf[sn] = '\0'; + + if (!streq(sbuf, SESSION_MEMFD_DATA)) + return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), + "Session memfd content mismatch: expected '%s', got '%s'", + SESSION_MEMFD_DATA, sbuf); + + /* Remove the LUO session fd from 
the fd store as well. */ + r = sd_pid_notifyf(0, /* unset_environment= */ false, + "FDSTOREREMOVE=1\nFDNAME=%s", session_fdname); + if (r < 0) + return log_error_errno(r, "Failed to remove fd '%s' from fd store: %m", session_fdname); + + log_info("Verified LUO session memfd content matches."); + /* Wait for PID 1 to actually process all our FDSTORE notifications before we exit, otherwise * the cgroup-based pidref to unit lookup may fail once we're gone, and the fds end up closed. */ r = sd_notify_barrier(0, 5 * USEC_PER_SEC); @@ -156,13 +235,15 @@ static int do_check(void) { } static int run(int argc, char *argv[]) { - if (argc != 2) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Usage: %s store|check", argv[0]); + if (argc < 2 || argc > 3) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Usage: %s store|check [PREFIX]", argv[0]); + + const char *prefix = argc > 2 ? argv[2] : "luosession"; if (streq(argv[1], "store")) - return do_store(); + return do_store(prefix); if (streq(argv[1], "check")) - return do_check(); + return do_check(prefix); return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown command: %s", argv[1]); } diff --git a/test/units/TEST-91-LIVEUPDATE.sh b/test/units/TEST-91-LIVEUPDATE.sh index f51f2f39065..1d2df255cf2 100755 --- a/test/units/TEST-91-LIVEUPDATE.sh +++ b/test/units/TEST-91-LIVEUPDATE.sh @@ -69,13 +69,13 @@ if grep -qw luo_nboot=1 /proc/cmdline; then # Verify that the user manager also preserved its FD store n_user_at_fds=$(systemctl show -P NFileDescriptorStore "${TESTUSER_USER_SVC}") - test "${n_user_at_fds}" -ge 2 + test "${n_user_at_fds}" -ge 3 write_late_unit user TEST-91-LIVEUPDATE-user-late \ "/usr/lib/systemd/tests/unit-tests/manual/test-luo check user-late" systemctl restart "${TESTUSER_USER_SVC}" timeout 30s bash -c "until systemctl is-active --quiet '${TESTUSER_USER_SVC}'; do sleep 0.5; done" n_user_unit_fds=$(run0 -u testuser systemctl --user show -P NFileDescriptorStore TEST-91-LIVEUPDATE-user-late.service) - test 
"${n_user_unit_fds}" -eq 2 + test "${n_user_unit_fds}" -eq 3 run0 -u testuser systemctl --user start TEST-91-LIVEUPDATE-user-late.service # nspawn fdstore variant: after kexec, PID 1 propagated the @@ -102,22 +102,22 @@ EOF # late.service: rewrite the fragment with the second-boot ExecStart and # exercise the daemon-reload + daemon-reexec preservation paths. write_late_unit system TEST-91-LIVEUPDATE-late \ - "/usr/lib/systemd/tests/unit-tests/manual/test-luo check" + "/usr/lib/systemd/tests/unit-tests/manual/test-luo check late" n_fds=$(systemctl show -P NFileDescriptorStore TEST-91-LIVEUPDATE-late.service) - test "$n_fds" -eq 2 + test "$n_fds" -eq 3 systemctl daemon-reload # Verify the late unit doesn't get GC'ed during daemon-reload n_fds=$(systemctl show -P NFileDescriptorStore TEST-91-LIVEUPDATE-late.service) - test "$n_fds" -eq 2 + test "$n_fds" -eq 3 systemctl daemon-reexec # Verify the late unit doesn't get GC'ed during daemon-reexec n_fds=$(systemctl show -P NFileDescriptorStore TEST-91-LIVEUPDATE-late.service) - test "$n_fds" -eq 2 + test "$n_fds" -eq 3 systemctl start TEST-91-LIVEUPDATE-late.service @@ -127,7 +127,7 @@ EOF write_late_unit system TEST-91-LIVEUPDATE-late-noreload \ "/usr/lib/systemd/tests/unit-tests/manual/test-luo check late-noreload" n_fds=$(systemctl show -P NFileDescriptorStore TEST-91-LIVEUPDATE-late-noreload.service) - test "$n_fds" -eq 2 + test "$n_fds" -eq 3 systemctl start TEST-91-LIVEUPDATE-late-noreload.service # Zero-fds variant: fragment on second boot sets FileDescriptorStoreMax=0, @@ -140,6 +140,7 @@ EOF systemctl start TEST-91-LIVEUPDATE-late-zerofds.service else # Create memfds with known content and push them to our fd store. + # Also request a LUO session, store a memfd in it, and push the session fd to the fd store. 
/usr/lib/systemd/tests/unit-tests/manual/test-luo store # Exercise the user manager FD preservation across kexec too @@ -149,9 +150,9 @@ else "/usr/lib/systemd/tests/unit-tests/manual/test-luo store user-late" run0 -u testuser systemctl --user start TEST-91-LIVEUPDATE-user-late.service n_user_unit_fds=$(run0 -u testuser systemctl --user show -P NFileDescriptorStore TEST-91-LIVEUPDATE-user-late.service) - test "${n_user_unit_fds}" -eq 2 + test "${n_user_unit_fds}" -eq 3 n_user_at_fds=$(systemctl show -P NFileDescriptorStore "${TESTUSER_USER_SVC}") - test "${n_user_at_fds}" -ge 2 + test "${n_user_at_fds}" -ge 3 # Exercise the FD-store preservation chain across a kexec for a privileged # nspawn container managed as a system service: @@ -180,11 +181,11 @@ EOF # to avoid collisions in the LUO session namespace. for variant in late late-noreload late-zerofds; do write_late_unit system "TEST-91-LIVEUPDATE-${variant}" \ - "/usr/lib/systemd/tests/unit-tests/manual/test-luo store" + "/usr/lib/systemd/tests/unit-tests/manual/test-luo store ${variant}" systemctl start "TEST-91-LIVEUPDATE-${variant}.service" n_fds=$(systemctl show -P NFileDescriptorStore "TEST-91-LIVEUPDATE-${variant}.service") - test "$n_fds" -eq 2 + test "$n_fds" -eq 3 done # 'systemctl kexec' auto-loads the default boot entry (i.e. the booted UKI, -- 2.47.3