From: Lennart Poettering Date: Mon, 24 Nov 2025 21:23:41 +0000 (+0100) Subject: pid1: introduce RootMStack= for using an mstack as root dir for a service X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=32b88dd25a5c726d1d8dd0b8472455109cb8b215;p=thirdparty%2Fsystemd.git pid1: introduce RootMStack= for using an mstack as root dir for a service --- diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index fbed12c4be9..f0a7ccc6d6e 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -3137,6 +3137,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b RootEphemeral = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s RootMStack = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as ExtensionDirectories = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly a(sba(ss)) ExtensionImages = [...]; @@ -4496,6 +4498,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -4938,6 +4942,7 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { MountImages ExtensionImages ExtensionDirectories + RootMStack see systemd.exec(5) for their meaning. 
MemoryAvailable takes into account unit's and parents' MemoryMax @@ -5392,6 +5397,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b RootEphemeral = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s RootMStack = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as ExtensionDirectories = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly a(sba(ss)) ExtensionImages = [...]; @@ -6741,6 +6748,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -7461,6 +7470,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b RootEphemeral = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s RootMStack = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as ExtensionDirectories = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly a(sba(ss)) ExtensionImages = [...]; @@ -8642,6 +8653,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -9495,6 +9508,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b RootEphemeral = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s RootMStack = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as ExtensionDirectories = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly a(sba(ss)) ExtensionImages = [...]; @@ -10640,6 +10655,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -12520,9 +12537,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ManagedOOMKills, ExecReloadPost, and ExecReloadPostEx were added in version 259. - BindNetworkInterface, - MemoryTHP, and - RefreshOnReload were added in version 260. 
+ BindNetworkInterface, MemoryTHP, + RefreshOnReload, and RootMStack were added in version 260. Socket Unit Objects @@ -12591,8 +12607,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ UserNamespacePath, OOMKills, and ManagedOOMKills were added in 259. - BindNetworkInterface, and - MemoryTHP were added in version 260. + BindNetworkInterface, MemoryTHP, and + RootMStack were added in version 260. Mount Unit Objects @@ -12656,8 +12672,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ UserNamespacePath, OOMKills, and ManagedOOMKills were added in 259. - BindNetworkInterface, and - MemoryTHP were added in version 260. + BindNetworkInterface, MemoryTHP, and + RootMStack were added in version 260. Swap Unit Objects @@ -12719,8 +12735,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ UserNamespacePath, OOMKills, and ManagedOOMKills were added in 259. - BindNetworkInterface, and - MemoryTHP were added in version 260. + BindNetworkInterface, MemoryTHP, and + RootMStack were added in version 260. 
Slice Unit Objects diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 6fb90604151..2bd7b1c07ee 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -1258,6 +1258,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("RootHashSignaturePath", "s", NULL, offsetof(ExecContext, root_hash_sig_path), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RootVerity", "s", NULL, offsetof(ExecContext, root_verity), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RootEphemeral", "b", bus_property_get_bool, offsetof(ExecContext, root_ephemeral), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootMStack", "s", NULL, offsetof(ExecContext, root_mstack), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ExtensionDirectories", "as", NULL, offsetof(ExecContext, extension_directories), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ExtensionImages", "a(sba(ss))", property_get_extension_images, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("MountImages", "a(ssba(ss))", property_get_mount_images, 0, SD_BUS_VTABLE_PROPERTY_CONST), @@ -1882,6 +1883,9 @@ int bus_exec_context_set_transient_property( if (streq(name, "RootImage")) return bus_set_transient_path(u, name, &c->root_image, message, flags, reterr_error); + if (streq(name, "RootMStack")) + return bus_set_transient_path(u, name, &c->root_mstack, message, flags, reterr_error); + if (streq(name, "RootImageOptions")) { _cleanup_(mount_options_free_allp) MountOptions *options = NULL; _cleanup_free_ char *format_str = NULL; diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c index 3ffedf58db7..560df952874 100644 --- a/src/core/execute-serialize.c +++ b/src/core/execute-serialize.c @@ -1633,6 +1633,10 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) { if (r < 0) return r; + r = serialize_item_escaped(f, "exec-context-root-mstack", c->root_mstack); + if (r < 0) + return r; + r = serialize_item_format(f, "exec-context-umask", "%04o", c->umask); if (r < 0) 
return r; @@ -2568,6 +2572,14 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { if (r < 0) return r; c->root_ephemeral = r; + } else if ((val = startswith(l, "exec-context-root-mstack="))) { + ssize_t k; + char *p; + + k = cunescape(val, 0, &p); + if (k < 0) + return k; + free_and_replace(c->root_mstack, p); } else if ((val = startswith(l, "exec-context-umask="))) { r = parse_mode(val, &c->umask); if (r < 0) diff --git a/src/core/execute.c b/src/core/execute.c index 8e5796b92ac..6a7fc65f202 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -263,7 +263,8 @@ bool exec_needs_mount_namespace( assert(context); - if (context->root_image) + if (context->root_image || + context->root_mstack) return true; if (context->root_directory_as_fd) @@ -684,6 +685,7 @@ void exec_context_done(ExecContext *c) { iovec_done(&c->root_hash_sig); c->root_hash_sig_path = mfree(c->root_hash_sig_path); c->root_verity = mfree(c->root_verity); + c->root_mstack = mfree(c->root_mstack); c->tty_path = mfree(c->tty_path); c->syslog_identifier = mfree(c->syslog_identifier); c->user = mfree(c->user); @@ -1203,6 +1205,9 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { if (c->root_verity) fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity); + if (c->root_mstack) + fprintf(f, "%sRootMStack: %s\n", prefix, c->root_mstack); + STRV_FOREACH(e, c->environment) fprintf(f, "%sEnvironment: %s\n", prefix, *e); @@ -2058,9 +2063,9 @@ bool exec_context_restrict_filesystems_set(const ExecContext *c) { bool exec_context_with_rootfs(const ExecContext *c) { assert(c); - /* Checks if RootDirectory=, RootImage= or RootDirectoryFileDescriptor= are used */ + /* Checks if RootDirectory=, RootImage=, RootMStack= or RootDirectoryFileDescriptor= are used */ - return !empty_or_root(c->root_directory) || c->root_image || c->root_directory_as_fd; + return !empty_or_root(c->root_directory) || c->root_image || c->root_directory_as_fd || c->root_mstack; } bool 
exec_context_with_rootfs_strict(const ExecContext *c) { @@ -2070,7 +2075,7 @@ bool exec_context_with_rootfs_strict(const ExecContext *c) { * true in more cases: when a root directory is explicitly configured, even if it's our usual * root. */ - return c->root_directory || c->root_image || c->root_directory_as_fd; + return c->root_directory || c->root_image || c->root_directory_as_fd || c->root_mstack; } int exec_context_has_vpicked_extensions(const ExecContext *context) { diff --git a/src/core/execute.h b/src/core/execute.h index 6fe0d5e5707..c5f39883716 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -199,9 +199,12 @@ typedef struct ExecContext { char **unset_environment; struct rlimit *rlimit[_RLIMIT_MAX]; - char *working_directory, *root_directory, *root_image, *root_verity, *root_hash_path, *root_hash_sig_path; + char *working_directory; + char *root_directory; + char *root_image, *root_verity, *root_hash_path, *root_hash_sig_path; struct iovec root_hash, root_hash_sig; MountOptions *root_image_options; + char *root_mstack; bool root_ephemeral; bool working_directory_missing_ok:1; bool working_directory_home:1; diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index 5605445c34e..60e616a03f6 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -11,6 +11,7 @@ {{type}}.RootHashSignature, config_parse_exec_root_hash_sig, 0, offsetof({{type}}, exec_context) {{type}}.RootVerity, config_parse_unit_path_printf, true, offsetof({{type}}, exec_context.root_verity) {{type}}.RootEphemeral, config_parse_bool, 0, offsetof({{type}}, exec_context.root_ephemeral) +{{type}}.RootMStack, config_parse_unit_path_printf, true, offsetof({{type}}, exec_context.root_mstack) {{type}}.ExtensionDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.extension_directories) {{type}}.ExtensionImages, config_parse_extension_images, 0, offsetof({{type}}, exec_context) 
{{type}}.ExtensionImagePolicy, config_parse_image_policy, 0, offsetof({{type}}, exec_context.extension_image_policy) diff --git a/src/core/namespace.c b/src/core/namespace.c index 0ed686f9ece..6927845b9c3 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -34,6 +34,7 @@ #include "mkdir-label.h" #include "mount-util.h" #include "mountpoint-util.h" +#include "mstack.h" #include "namespace.h" #include "namespace-util.h" #include "nsflags.h" @@ -1296,6 +1297,13 @@ static int create_temporary_mount_point(RuntimeScope scope, char **ret) { return 0; } +static bool namespace_with_rootfs(const NamespaceParameters *p) { + /* Returns true, if we have a root dir, root image or root mstack, and hence the root mount is + * changed */ + + return p->root_image || p->root_directory || p->root_directory_fd >= 0 || p->root_mstack; +} + static int mount_private_dev(const MountEntry *m, const NamespaceParameters *p) { static const char devnodes[] = "/dev/null\0" @@ -1360,7 +1368,7 @@ static int mount_private_dev(const MountEntry *m, const NamespaceParameters *p) /* We assume /run/systemd/journal/ is available if not changing root, which isn't entirely accurate * but shouldn't matter, as either way the user would get ENOENT when accessing /dev/log */ - if ((!p->root_image && !p->root_directory && p->root_directory_fd < 0) || p->bind_log_sockets) { + if (!namespace_with_rootfs(p) || p->bind_log_sockets) { const char *devlog = strjoina(temporary_mount, "/dev/log"); if (symlink("/run/systemd/journal/dev-log", devlog) < 0) log_debug_errno(errno, @@ -2497,10 +2505,22 @@ static bool home_read_only( return false; } +static bool namespace_read_only(const NamespaceParameters *p) { + assert(p); + + return root_read_only(p->read_only_paths, + p->protect_system) && + home_read_only(p->read_only_paths, p->inaccessible_paths, p->empty_directories, + p->bind_mounts, p->n_bind_mounts, p->temporary_filesystems, p->n_temporary_filesystems, + p->protect_home) && + 
strv_isempty(p->read_write_paths); +} + int setup_namespace(const NamespaceParameters *p, char **reterr_path) { _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; + _cleanup_(mstack_freep) MStack *mstack = NULL; _cleanup_strv_free_ char **hierarchies = NULL; _cleanup_(mount_list_done) MountList ml = {}; _cleanup_close_ int userns_fd = -EBADF; @@ -2518,6 +2538,7 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) { DISSECT_IMAGE_PIN_PARTITION_DEVICES | DISSECT_IMAGE_ALLOW_USERSPACE_VERITY | DISSECT_IMAGE_VERITY_SHARE; + MStackFlags mstack_flags = 0; int r; assert(p); @@ -2531,12 +2552,7 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) { if (p->root_image) { /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */ - if (root_read_only(p->read_only_paths, - p->protect_system) && - home_read_only(p->read_only_paths, p->inaccessible_paths, p->empty_directories, - p->bind_mounts, p->n_bind_mounts, p->temporary_filesystems, p->n_temporary_filesystems, - p->protect_home) && - strv_isempty(p->read_write_paths)) + if (namespace_read_only(p)) dissect_image_flags |= DISSECT_IMAGE_READ_ONLY; SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, p->verity && p->verity->data_path); @@ -2620,6 +2636,24 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) { return r; } } + + } else if (p->root_mstack) { + if (namespace_read_only(p)) + mstack_flags |= MSTACK_RDONLY; + + r = mstack_load(p->root_mstack, /* dir_fd= */ -EBADF, &mstack); + if (r < 0) + return r; + + if (p->runtime_scope != RUNTIME_SCOPE_SYSTEM) { + userns_fd = namespace_open_by_type(NAMESPACE_USER); + if (userns_fd < 0) + return log_debug_errno(userns_fd, "Failed to open our own user namespace: %m"); + } + + r = mstack_open_images(mstack, userns_fd, p->root_image_policy, /* image_filter= */ NULL, mstack_flags); + if 
(r < 0) + return r; } if (p->root_directory) @@ -3014,6 +3048,15 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) { return r; } + } else if (p->root_mstack) { + r = mstack_make_mounts(mstack, root, mstack_flags); + if (r < 0) + return r; + + r = mstack_bind_mounts(mstack, root, /* where_fd= */ -EBADF, mstack_flags, /* ret_root_fd= */ NULL); + if (r < 0) + return r; + } else { /* Let's mount the main root directory to the root directory to use */ r = mount_nofollow_verbose(LOG_DEBUG, "/", root, NULL, MS_BIND|MS_REC, NULL); @@ -3022,7 +3065,7 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) { } /* Try to set up the new root directory before mounting anything else there. */ - if (p->root_image || p->root_directory || p->root_directory_fd >= 0) + if (namespace_with_rootfs(p)) (void) base_filesystem_create(root, UID_INVALID, GID_INVALID); /* Now make the magic happen */ diff --git a/src/core/namespace.h b/src/core/namespace.h index b96e7b4372e..4f5e6546bd8 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -130,6 +130,7 @@ typedef struct NamespaceParameters { int root_directory_fd; const char *root_directory; const char *root_image; + const char *root_mstack; const MountOptions *root_image_options; const ImagePolicy *root_image_policy; diff --git a/src/core/service.c b/src/core/service.c index aa133a57db2..aa3690e92a0 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -5887,8 +5887,9 @@ int service_determine_exec_selinux_label(Service *s, char **ret) { if (s->exec_context.root_image || s->exec_context.n_extension_images > 0 || - !strv_isempty(s->exec_context.extension_directories)) /* We cannot chase paths through images */ - return log_unit_debug_errno(UNIT(s), SYNTHETIC_ERRNO(ENODATA), "Service with RootImage=, ExtensionImages= or ExtensionDirectories= set, cannot determine socket SELinux label before activation, ignoring."); + !strv_isempty(s->exec_context.extension_directories) || + 
s->exec_context.root_mstack) /* We cannot chase paths through images */ + return log_unit_debug_errno(UNIT(s), SYNTHETIC_ERRNO(ENODATA), "Service with RootImage=, ExtensionImages=, ExtensionDirectories= or RootMStack= set cannot determine socket SELinux label before activation, ignoring."); ExecCommand *c = s->exec_command[SERVICE_EXEC_START]; if (!c) diff --git a/src/core/unit.c b/src/core/unit.c index 0f6ea877561..d468f1303d2 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -1266,6 +1266,12 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) { return r; } + if (c->root_mstack) { + r = unit_add_mounts_for(u, c->root_mstack, UNIT_DEPENDENCY_FILE, UNIT_MOUNT_WANTS); + if (r < 0) + return r; + } + for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) { if (!u->manager->prefix[dt]) continue; @@ -1322,9 +1328,9 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) { return r; } - if (c->root_image) { + if (c->root_image || c->root_mstack) { /* We need to wait for /dev/loopX to appear when doing RootImage=, hence let's add an - * implicit dependency on udev */ + * implicit dependency on udev. (And for RootMStack= we might need it) */ r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_UDEVD_SERVICE, true, UNIT_DEPENDENCY_FILE); if (r < 0) @@ -4461,9 +4467,10 @@ int unit_patch_contexts(Unit *u) { /* Only add these if needed, as they imply that everything else is blocked. */ if (cgroup_context_has_device_policy(cc)) { - if (ec->root_image || ec->mount_images) { + if (ec->root_image || ec->mount_images || ec->root_mstack) { - /* When RootImage= or MountImages= is specified, the following devices are touched. */ + /* When RootImage= or MountImages= is specified, the following devices are + * touched. For RootMStack= there's the possibility they are touched. 
*/ FOREACH_STRING(p, "/dev/loop-control", "/dev/mapper/control") { r = cgroup_context_add_device_allow(cc, p, CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE); if (r < 0) diff --git a/src/core/varlink-execute.c b/src/core/varlink-execute.c index 35ea1b1330f..e6efd598959 100644 --- a/src/core/varlink-execute.c +++ b/src/core/varlink-execute.c @@ -794,6 +794,7 @@ int unit_exec_context_build_json(sd_json_variant **ret, const char *name, void * JSON_BUILD_PAIR_CALLBACK_NON_NULL("WorkingDirectory", working_directory_build_json, c), JSON_BUILD_PAIR_STRING_NON_EMPTY("RootDirectory", c->root_directory), JSON_BUILD_PAIR_STRING_NON_EMPTY("RootImage", c->root_image), + JSON_BUILD_PAIR_STRING_NON_EMPTY("RootMStack", c->root_mstack), JSON_BUILD_PAIR_CALLBACK_NON_NULL("RootImageOptions", root_image_options_build_json, c->root_image_options), SD_JSON_BUILD_PAIR_BOOLEAN("RootEphemeral", c->root_ephemeral), JSON_BUILD_PAIR_BASE64_NON_EMPTY("RootHash", c->root_hash.iov_base, c->root_hash.iov_len), diff --git a/src/shared/varlink-io.systemd.Unit.c b/src/shared/varlink-io.systemd.Unit.c index a7d56bb5e97..a008b506e9b 100644 --- a/src/shared/varlink-io.systemd.Unit.c +++ b/src/shared/varlink-io.systemd.Unit.c @@ -409,6 +409,8 @@ static SD_VARLINK_DEFINE_STRUCT_TYPE( SD_VARLINK_DEFINE_FIELD(RootDirectory, SD_VARLINK_STRING, SD_VARLINK_NULLABLE), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man"PROJECT_VERSION_STR"systemd.exec.html#RootImage="), SD_VARLINK_DEFINE_FIELD(RootImage, SD_VARLINK_STRING, SD_VARLINK_NULLABLE), + SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man"PROJECT_VERSION_STR"systemd.exec.html#RootMStack="), + SD_VARLINK_DEFINE_FIELD(RootMStack, SD_VARLINK_STRING, SD_VARLINK_NULLABLE), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man"PROJECT_VERSION_STR"systemd.exec.html#RootImageOptions="), SD_VARLINK_DEFINE_FIELD_BY_TYPE(RootImageOptions, PartitionMountOptions, SD_VARLINK_ARRAY|SD_VARLINK_NULLABLE), 
SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man"PROJECT_VERSION_STR"systemd.exec.html#RootEphemeral="),