From 9e26ced9809bcac96a34dc89825cdcee4e17a078 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Tue, 28 Oct 2025 23:47:26 +0100 Subject: [PATCH] core: Add RootDirectoryFileDescriptor= RootDirectory= but via an open_tree() file descriptor. This allows setting up the execution environment for a service by the client in a mount namespace and then starting a transient unit in that execution environment using the new property. We also add --root-directory= and --same-root-dir to systemd-run to have it run services within the given root directory. As systemd-run might be invoked from a different mount namespace than what systemd is running in, systemd-run opens the given path with open_tree() and then sends it to systemd using the new RootDirectoryFileDescriptor= property. --- man/systemd-run.xml | 26 +++++++++++++++++ src/core/dbus-service.c | 3 ++ src/core/exec-invoke.c | 14 ++++++++-- src/core/execute-serialize.c | 23 +++++++++++++++ src/core/execute.c | 10 +++++-- src/core/execute.h | 3 ++ src/core/fuzz-execute-serialize.c | 1 + src/core/namespace.c | 17 ++++++++++-- src/core/namespace.h | 1 + src/core/service.c | 26 ++++++++++++++++- src/core/service.h | 3 ++ src/run/run.c | 40 ++++++++++++++++++++++++++- src/test/test-namespace.c | 1 + src/test/test-ns.c | 1 + test/units/TEST-50-DISSECT.dissect.sh | 3 ++ 15 files changed, 162 insertions(+), 10 deletions(-) diff --git a/man/systemd-run.xml b/man/systemd-run.xml index 7d4b7011c58..d18b80faa8a 100644 --- a/man/systemd-run.xml +++ b/man/systemd-run.xml @@ -291,6 +291,32 @@ + + + + Runs the service process with the specified root directory. Also see + RootDirectory= in + systemd.exec(5). + + Note that the path is looked up inside the file system namespace that systemd-run is running + in, which might be different from the file system namespace the manager process is running in. Use + the RootDirectory= property directly if you want the path to be looked up in the + manager process's file system namespace. 
+ + + + + + + + + + Similar to --root-directory=, but uses the root directory of the + systemd-run process as the root directory to execute the service in. + + + + diff --git a/src/core/dbus-service.c b/src/core/dbus-service.c index b3c3c32cea7..2ff6272bd4c 100644 --- a/src/core/dbus-service.c +++ b/src/core/dbus-service.c @@ -799,6 +799,9 @@ static int bus_service_set_transient_property( return 1; } + if (streq(name, "RootDirectoryFileDescriptor")) + return bus_set_transient_exec_context_fd(u, &s->root_directory_fd, &s->exec_context.root_directory_as_fd, message, flags, error); + return 0; } diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 14e15cb8a66..92d6223beb5 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -3470,7 +3470,7 @@ static bool insist_on_sandboxing( if (context->n_temporary_filesystems > 0) return true; - if (root_dir || root_image) + if (root_dir || root_image || context->root_directory_as_fd) return true; if (context->n_mount_images > 0) @@ -3506,6 +3506,7 @@ static int setup_ephemeral( int r; assert(context); + assert(!context->root_directory_as_fd); assert(runtime); assert(root_image); assert(root_directory); @@ -3645,6 +3646,7 @@ static int pick_versions( int r; assert(context); + assert(!context->root_directory_as_fd); assert(params); assert(ret_root_image); assert(ret_root_directory); @@ -3733,7 +3735,7 @@ static int apply_mount_namespace( CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many); - if (params->flags & EXEC_APPLY_CHROOT) { + if (params->flags & EXEC_APPLY_CHROOT && !context->root_directory_as_fd) { r = pick_versions( context, params, @@ -3855,6 +3857,7 @@ static int apply_mount_namespace( .root_directory = root_dir, .root_image = root_image, + .root_directory_fd = params->flags & EXEC_APPLY_CHROOT ? 
params->root_directory_fd : -EBADF, .root_image_options = context->root_image_options, .root_image_policy = context->root_image_policy ?: &image_policy_service, @@ -4495,6 +4498,7 @@ static bool exec_needs_cap_sys_admin(const ExecContext *context, const ExecParam context->n_bind_mounts > 0 || context->n_temporary_filesystems > 0 || context->root_directory || + context->root_directory_as_fd || !strv_isempty(context->extension_directories) || context->root_image || context->n_mount_images > 0 || @@ -5136,6 +5140,12 @@ int exec_invoke( } #endif + r = add_shifted_fd(&keep_fds, &n_keep_fds, &params->root_directory_fd); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_error_errno(r, "Failed to collect shifted fd: %m"); + } + r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds); if (r < 0) { *exit_status = EXIT_FDS; diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c index 8033ecdb8e1..575d34ff24c 100644 --- a/src/core/execute-serialize.c +++ b/src/core/execute-serialize.c @@ -1177,6 +1177,10 @@ static int exec_parameters_serialize(const ExecParameters *p, const ExecContext if (r < 0) return r; + r = serialize_fd(f, fds, "exec-parameters-root-directory-fd", p->root_directory_fd); + if (r < 0) + return r; + r = serialize_fd(f, fds, "exec-parameters-exec-fd", p->exec_fd); if (r < 0) return r; @@ -1422,6 +1426,16 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { continue; close_and_replace(p->stderr_fd, fd); + + } else if ((val = startswith(l, "exec-parameters-root-directory-fd="))) { + int fd; + + fd = deserialize_fd(fds, val); + if (fd < 0) + continue; + + close_and_replace(p->root_directory_fd, fd); + } else if ((val = startswith(l, "exec-parameters-exec-fd="))) { int fd; @@ -1994,6 +2008,10 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) { if (r < 0) return r; + r = serialize_bool_elide(f, "exec-context-root-directory-as-fd", c->root_directory_as_fd); + if (r < 0) + return 
r; + switch (c->std_input) { case EXEC_INPUT_NAMED_FD: r = serialize_item(f, "exec-context-std-input-fd-name", c->stdio_fdname[STDIN_FILENO]); @@ -3000,6 +3018,11 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { if (r < 0) return r; c->stdio_as_fds = r; + } else if ((val = startswith(l, "exec-context-root-directory-as-fd="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->root_directory_as_fd = r; } else if ((val = startswith(l, "exec-context-std-input-fd-name="))) { r = free_and_strdup(&c->stdio_fdname[STDIN_FILENO], val); if (r < 0) diff --git a/src/core/execute.c b/src/core/execute.c index 5b01758733a..5d4c26934dd 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -267,6 +267,9 @@ bool exec_needs_mount_namespace( if (context->root_image) return true; + if (context->root_directory_as_fd) + return true; + if (!strv_isempty(context->read_write_paths) || !strv_isempty(context->read_only_paths) || !strv_isempty(context->inaccessible_paths) || @@ -354,7 +357,7 @@ const char* exec_get_private_notify_socket_path(const ExecContext *context, cons if (!needs_sandboxing) return NULL; - if (!context->root_directory && !context->root_image) + if (!context->root_directory && !context->root_image && !context->root_directory_as_fd) return NULL; if (!exec_context_get_effective_mount_apivfs(context)) @@ -2045,9 +2048,9 @@ bool exec_context_restrict_filesystems_set(const ExecContext *c) { bool exec_context_with_rootfs(const ExecContext *c) { assert(c); - /* Checks if RootDirectory= or RootImage= are used */ + /* Checks if RootDirectory=, RootImage= or RootDirectoryFileDescriptor= are used */ - return !empty_or_root(c->root_directory) || c->root_image; + return !empty_or_root(c->root_directory) || c->root_image || c->root_directory_as_fd; } int exec_context_has_vpicked_extensions(const ExecContext *context) { @@ -2846,6 +2849,7 @@ void exec_params_deep_clear(ExecParameters *p) { p->stdin_fd = safe_close(p->stdin_fd); p->stdout_fd = 
safe_close(p->stdout_fd); p->stderr_fd = safe_close(p->stderr_fd); + p->root_directory_fd = safe_close(p->root_directory_fd); p->notify_socket = mfree(p->notify_socket); diff --git a/src/core/execute.h b/src/core/execute.h index 2c79a37d54a..1ce78af6afe 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -218,6 +218,7 @@ typedef struct ExecContext { /* At least one of stdin/stdout/stderr was initialized from an fd passed in. This boolean survives * the fds being closed. This only makes sense for transient units. */ bool stdio_as_fds; + bool root_directory_as_fd; char *stdio_fdname[3]; char *stdio_file[3]; @@ -418,6 +419,7 @@ typedef struct ExecParameters { int stdin_fd; int stdout_fd; int stderr_fd; + int root_directory_fd; /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done. */ int exec_fd; @@ -449,6 +451,7 @@ typedef struct ExecParameters { .stdin_fd = -EBADF, \ .stdout_fd = -EBADF, \ .stderr_fd = -EBADF, \ + .root_directory_fd = -EBADF, \ .exec_fd = -EBADF, \ .bpf_restrict_fs_map_fd = -EBADF, \ .user_lookup_fd = -EBADF, \ diff --git a/src/core/fuzz-execute-serialize.c b/src/core/fuzz-execute-serialize.c index 1e72918a39e..8b8267f3c75 100644 --- a/src/core/fuzz-execute-serialize.c +++ b/src/core/fuzz-execute-serialize.c @@ -58,6 +58,7 @@ static void exec_fuzz_one(FILE *f, FDSet *fdset) { params.stdin_fd = -EBADF; params.stdout_fd = -EBADF; params.stderr_fd = -EBADF; + params.root_directory_fd = -EBADF; params.exec_fd = -EBADF; params.user_lookup_fd = -EBADF; params.bpf_restrict_fs_map_fd = -EBADF; diff --git a/src/core/namespace.c b/src/core/namespace.c index 6cf4f8b0a17..0e12a16592b 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1359,7 +1359,7 @@ static int mount_private_dev(const MountEntry *m, const NamespaceParameters *p) /* We assume /run/systemd/journal/ is available if not changing root, which isn't entirely accurate * but shouldn't matter, as either way the user would get ENOENT when 
accessing /dev/log */ - if ((!p->root_image && !p->root_directory) || p->bind_log_sockets) { + if ((!p->root_image && !p->root_directory && p->root_directory_fd < 0) || p->bind_log_sockets) { const char *devlog = strjoina(temporary_mount, "/dev/log"); if (symlink("/run/systemd/journal/dev-log", devlog) < 0) log_debug_errno(errno, @@ -2948,7 +2948,18 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) { if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) return log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m"); - if (p->root_image) { + if (p->root_directory_fd >= 0) { + + if (move_mount(p->root_directory_fd, "", AT_FDCWD, root, MOVE_MOUNT_F_EMPTY_PATH) < 0) + return log_debug_errno(errno, "Failed to move detached mount to '%s': %m", root); + + /* We just remounted / as slave, but that didn't affect the detached mount that we just + * mounted, so remount that one as slave recursive as well now. */ + + if (mount(NULL, root, NULL, MS_SLAVE|MS_REC, NULL) < 0) + return log_debug_errno(errno, "Failed to remount '%s' as SLAVE: %m", root); + + } else if (p->root_image) { /* A root image is specified, mount it to the right place */ r = dissected_image_mount( dissected_image, @@ -2992,7 +3003,7 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) { } /* Try to set up the new root directory before mounting anything else there. 
*/ - if (p->root_image || p->root_directory) + if (p->root_image || p->root_directory || p->root_directory_fd >= 0) (void) base_filesystem_create(root, UID_INVALID, GID_INVALID); /* Now make the magic happen */ diff --git a/src/core/namespace.h b/src/core/namespace.h index 66a88ae3c1e..86c09ec2315 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -128,6 +128,7 @@ typedef struct MountImage { typedef struct NamespaceParameters { RuntimeScope runtime_scope; + int root_directory_fd; const char *root_directory; const char *root_image; const MountOptions *root_image_options; diff --git a/src/core/service.c b/src/core/service.c index 3d7861f377c..f45d0c48011 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -165,6 +165,7 @@ static void service_init(Unit *u) { s->type = _SERVICE_TYPE_INVALID; s->socket_fd = -EBADF; s->stdin_fd = s->stdout_fd = s->stderr_fd = -EBADF; + s->root_directory_fd = -EBADF; s->guess_main_pid = true; s->main_pid = PIDREF_NULL; s->control_pid = PIDREF_NULL; @@ -542,6 +543,7 @@ static void service_done(Unit *u) { service_release_stdio_fd(s); service_release_fd_store(s); service_release_extra_fds(s); + s->root_directory_fd = asynchronous_close(s->root_directory_fd); s->mount_request = sd_bus_message_unref(s->mount_request); } @@ -1108,6 +1110,9 @@ static void service_dump(Unit *u, FILE *f, const char *prefix) { f, prefix); + if (s->root_directory_fd >= 0) + (void) service_dump_fd(s->root_directory_fd, "Root Directory File Descriptor", "", f, prefix); + if (s->open_files) LIST_FOREACH(open_files, of, s->open_files) { _cleanup_free_ char *ofs = NULL; @@ -1925,6 +1930,7 @@ static int service_spawn_internal( exec_params.stdin_fd = s->stdin_fd; exec_params.stdout_fd = s->stdout_fd; exec_params.stderr_fd = s->stderr_fd; + exec_params.root_directory_fd = s->root_directory_fd; r = exec_spawn(UNIT(s), c, @@ -2834,6 +2840,7 @@ static void service_enter_refresh_extensions(Service *s) { .n_extension_images = 
s->exec_context.n_extension_images, .extension_directories = s->exec_context.extension_directories, .extension_image_policy = s->exec_context.extension_image_policy, + .root_directory_fd = -EBADF, }; /* Only reload confext, and not sysext as they also typically contain the executable(s) used @@ -3226,13 +3233,19 @@ static int service_serialize(Unit *u, FILE *f, FDSet *fds) { r = serialize_fd(f, fds, "stdin-fd", s->stdin_fd); if (r < 0) return r; + r = serialize_fd(f, fds, "stdout-fd", s->stdout_fd); if (r < 0) return r; + r = serialize_fd(f, fds, "stderr-fd", s->stderr_fd); if (r < 0) return r; + r = serialize_fd(f, fds, "root-directory-fd", s->root_directory_fd); + if (r < 0) + return r; + if (s->exec_fd_event_source) { r = serialize_fd(f, fds, "exec-fd", sd_event_source_get_io_fd(s->exec_fd_event_source)); if (r < 0) @@ -3637,6 +3650,13 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value, if (s->stderr_fd >= 0) s->exec_context.stdio_as_fds = true; + } else if (streq(key, "root-directory-fd")) { + + asynchronous_close(s->root_directory_fd); + s->root_directory_fd = deserialize_fd(fds, value); + if (s->root_directory_fd >= 0) + s->exec_context.root_directory_as_fd = true; + } else if (streq(key, "exec-fd")) { _cleanup_close_ int fd = -EBADF; @@ -5589,6 +5609,7 @@ static void service_release_resources(Unit *u) { service_release_socket_fd(s); service_release_stdio_fd(s); service_release_extra_fds(s); + s->root_directory_fd = asynchronous_close(s->root_directory_fd); if (s->fd_store_preserve_mode != EXEC_PRESERVE_YES) service_release_fd_store(s); @@ -5622,7 +5643,10 @@ int service_determine_exec_selinux_label(Service *s, char **ret) { return -ENODATA; _cleanup_free_ char *path = NULL; - r = chase(c->path, s->exec_context.root_directory, CHASE_PREFIX_ROOT|CHASE_TRIGGER_AUTOFS, &path, NULL); + if (s->exec_context.root_directory_as_fd) + r = chaseat(s->root_directory_fd, c->path, CHASE_AT_RESOLVE_IN_ROOT|CHASE_TRIGGER_AUTOFS, &path, NULL); 
+ else + r = chase(c->path, s->exec_context.root_directory, CHASE_PREFIX_ROOT|CHASE_TRIGGER_AUTOFS, &path, NULL); if (r < 0) { log_unit_debug_errno(UNIT(s), r, "Failed to resolve service binary '%s', ignoring.", c->path); return -ENODATA; diff --git a/src/core/service.h b/src/core/service.h index c81b5b7637f..b69f3008de1 100644 --- a/src/core/service.h +++ b/src/core/service.h @@ -224,6 +224,9 @@ typedef struct Service { int stdout_fd; int stderr_fd; + /* File descriptor received from RootDirectoryFileDescriptor= */ + int root_directory_fd; + /* If service spawned from transient unit, extra file descriptors can be passed via dbus API */ ServiceExtraFD *extra_fds; size_t n_extra_fds; diff --git a/src/run/run.c b/src/run/run.c index b5030b9cb72..47757d19d0d 100644 --- a/src/run/run.c +++ b/src/run/run.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -106,6 +107,7 @@ static bool arg_quiet = false; static bool arg_verbose = false; static bool arg_aggressive_gc = false; static char *arg_working_directory = NULL; +static char *arg_root_directory = NULL; static bool arg_shell = false; static JobMode arg_job_mode = JOB_FAIL; static char **arg_cmdline = NULL; @@ -168,6 +170,8 @@ static int help(void) { " --nice=NICE Nice level\n" " --working-directory=PATH Set working directory\n" " -d --same-dir Inherit working directory from caller\n" + " --root-directory=PATH Set root directory\n" + " -R --same-root-dir Inherit root directory from caller\n" " -E --setenv=NAME[=VALUE] Set environment variable\n" " -t --pty Run service on pseudo TTY as STDIN/STDOUT/\n" " STDERR\n" @@ -326,6 +330,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_NO_ASK_PASSWORD, ARG_WAIT, ARG_WORKING_DIRECTORY, + ARG_ROOT_DIRECTORY, ARG_SHELL, ARG_JOB_MODE, ARG_IGNORE_FAILURE, @@ -379,6 +384,8 @@ static int parse_argv(int argc, char *argv[]) { { "collect", no_argument, NULL, 'G' }, { "working-directory", required_argument, NULL, ARG_WORKING_DIRECTORY }, { 
"same-dir", no_argument, NULL, 'd' }, + { "root-directory", required_argument, NULL, ARG_ROOT_DIRECTORY }, + { "same-root-dir", no_argument, NULL, 'R' }, { "shell", no_argument, NULL, 'S' }, { "job-mode", required_argument, NULL, ARG_JOB_MODE }, { "ignore-failure", no_argument, NULL, ARG_IGNORE_FAILURE }, @@ -388,7 +395,7 @@ static int parse_argv(int argc, char *argv[]) { {}, }; - bool with_trigger = false; + bool with_trigger = false, same_dir = false; int r, c; assert(argc >= 0); @@ -653,6 +660,7 @@ static int parse_argv(int argc, char *argv[]) { if (r < 0) return r; + same_dir = false; break; case 'd': { @@ -666,9 +674,25 @@ static int parse_argv(int argc, char *argv[]) { arg_working_directory = mfree(arg_working_directory); else free_and_replace(arg_working_directory, p); + + same_dir = true; break; } + case ARG_ROOT_DIRECTORY: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_root_directory); + if (r < 0) + return r; + + break; + + case 'R': + r = free_and_strdup_warn(&arg_root_directory, "/"); + if (r < 0) + return r; + + break; + case 'G': arg_aggressive_gc = true; break; @@ -842,6 +866,10 @@ static int parse_argv(int argc, char *argv[]) { "--wait may not be combined with --scope."); } + if (same_dir && arg_root_directory && !path_equal(arg_root_directory, "/")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--same-dir cannot be used with a root directory other than '/'"); + return 1; } @@ -1406,6 +1434,16 @@ static int transient_service_set_properties(sd_bus_message *m, const char *pty_p return bus_log_create_error(r); } + if (arg_root_directory) { + _cleanup_close_ int fd = open_tree(AT_FDCWD, arg_root_directory, OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_RECURSIVE); + if (fd < 0) + return log_error_errno(errno, "Failed to clone mount tree at '%s': %m", arg_root_directory); + + r = sd_bus_message_append(m, "(sv)", "RootDirectoryFileDescriptor", "h", fd); + if (r < 0) + return bus_log_create_error(r); + } + if (pty_path) { r = 
sd_bus_message_append(m, "(sv)(sv)(sv)(sv)", "TTYPath", "s", pty_path, diff --git a/src/test/test-namespace.c b/src/test/test-namespace.c index 7b66fb472d6..c8b4b181ba7 100644 --- a/src/test/test-namespace.c +++ b/src/test/test-namespace.c @@ -201,6 +201,7 @@ TEST(protect_kernel_logs) { static const NamespaceParameters p = { .runtime_scope = RUNTIME_SCOPE_SYSTEM, .protect_kernel_logs = true, + .root_directory_fd = -EBADF, }; pid_t pid; diff --git a/src/test/test-ns.c b/src/test/test-ns.c index 245bf345825..c6d6f2e4232 100644 --- a/src/test/test-ns.c +++ b/src/test/test-ns.c @@ -81,6 +81,7 @@ int main(int argc, char *argv[]) { .runtime_scope = RUNTIME_SCOPE_SYSTEM, .root_directory = root_directory, + .root_directory_fd = -EBADF, .read_write_paths = (char**) writable, .read_only_paths = (char**) readonly, diff --git a/test/units/TEST-50-DISSECT.dissect.sh b/test/units/TEST-50-DISSECT.dissect.sh index 7daf7236ee4..f6e5c07bc93 100755 --- a/test/units/TEST-50-DISSECT.dissect.sh +++ b/test/units/TEST-50-DISSECT.dissect.sh @@ -890,6 +890,9 @@ systemctl stop test-root-ephemeral timeout 10 bash -c 'until test -z "$(ls -A /var/lib/systemd/ephemeral-trees)"; do sleep .5; done' test ! -f /tmp/img/abc +# Test RootDirectoryFileDescriptor= +systemd-run --wait --pipe --root-directory=/tmp/img -- grep -q 'MARKER=1' /usr/lib/os-release + systemd-dissect --mtree /tmp/img >/dev/null systemd-dissect --list /tmp/img >/dev/null -- 2.47.3