]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
nspawn: allow to run unpriv from dir
author Lennart Poettering <lennart@poettering.net>
Mon, 11 Nov 2024 16:09:06 +0000 (17:09 +0100)
committer Lennart Poettering <lennart@poettering.net>
Thu, 23 Jan 2025 20:48:02 +0000 (21:48 +0100)
This simply calls into mountfsd to acquire the root mount and uses it as
root for the container.

Note that this also makes one more change: previously we ran containers
directly off their backing directory. Except when we didn't, and there
were a variety of exceptions: if we had no privs, if we ran off a disk
image, if the directory was the host's root dir, and some others.

This simplifies the logic a bit: we now simply always create a temporary
directory in /tmp/ and bind mount everything there, in all code paths.
After all, in order to control propagation we need to turn the root into
a mount point anyway, hence we might as well do it in one place for all
cases.

src/nspawn/nspawn.c

index 5ab04b121a61e10746ac6218c68710ee435c06b4..b7622b8db43ecc07471a4bfe11981b7e0fcadf25 100644 (file)
@@ -3934,6 +3934,7 @@ static DissectImageFlags determine_dissect_image_flags(void) {
 static int outer_child(
                 Barrier *barrier,
                 const char *directory,
+                int mount_fd,
                 DissectedImage *dissected_image,
                 int fd_outer_socket,
                 int fd_inner_socket,
@@ -3987,7 +3988,23 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        if (dissected_image) {
+        /* Put the root dir into the target directory now. One of three mechanisms is provided: either we
+         * have a single mount fd (typically unprivileged --directory= mode) or we have a fully dissected
+         * image (--image= mode), or we have a regular path. */
+        if (mount_fd >= 0) {
+                assert(arg_directory);
+                assert(!arg_image);
+
+                if (move_mount(mount_fd, "", AT_FDCWD, directory, MOVE_MOUNT_F_EMPTY_PATH) < 0)
+                        return log_error_errno(errno, "Failed to attach root directory: %m");
+
+                mount_fd = safe_close(mount_fd);
+                log_debug("Successfully attached root directory to '%s'.", directory);
+
+        } else if (dissected_image) {
+                assert(!arg_directory);
+                assert(arg_image);
+
                 /* If we are operating on a disk image, then mount its root directory now, but leave out the
                  * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
                  * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
@@ -4004,6 +4021,13 @@ static int outer_child(
                                 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
                 if (r < 0)
                         return r;
+        } else {
+                assert(arg_directory);
+                assert(!arg_image);
+
+                r = mount_nofollow_verbose(LOG_ERR, arg_directory, directory, /* fstype= */ NULL, MS_BIND|MS_REC, /* options= */ NULL);
+                if (r < 0)
+                        return r;
         }
 
         r = determine_uid_shift(directory);
@@ -4047,27 +4071,6 @@ static int outer_child(
                          "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
         }
 
-        if (path_equal(directory, "/")) {
-                /* If the directory we shall boot is the host, let's operate on a bind mount at a different
-                 * place, so that we can make changes to its mount structure (for example, to implement
-                 * --volatile=) without this interfering with our ability to access files such as
-                 * /etc/localtime to copy into the container. Note that we use a fixed place for this
-                 * (instead of a temporary directory, since we are living in our own mount namespace here
-                 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
-                (void) mkdir_p("/run/systemd/nspawn-root", 0755);
-
-                r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
-                if (r < 0)
-                        return r;
-
-                directory = "/run/systemd/nspawn-root";
-        }
-
-        /* Make sure we always have a mount that we can move to root later on. */
-        r = make_mount_point(directory);
-        if (r < 0)
-                return r;
-
         /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
          * mount namespace. For the directory we are going to run our container let's turn this off, so that
          * we'll live in our own little world from now on, and propagation from the host may only happen via
@@ -5161,6 +5164,8 @@ static int load_oci_bundle(void) {
 }
 
 static int run_container(
+                const char *directory,
+                int mount_fd,
                 DissectedImage *dissected_image,
                 int userns_fd,
                 FDSet *fds,
@@ -5298,7 +5303,8 @@ static int run_container(
                 (void) reset_signal_mask();
 
                 r = outer_child(&barrier,
-                                arg_directory,
+                                directory,
+                                mount_fd,
                                 dissected_image,
                                 fd_outer_socket_pair[1],
                                 fd_inner_socket_pair[1],
@@ -5915,14 +5921,14 @@ static int cant_be_in_netns(void) {
 }
 
 static int run(int argc, char *argv[]) {
-        bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
-        _cleanup_close_ int master = -EBADF, userns_fd = -EBADF;
+        bool remove_directory = false, remove_image = false, veth_created = false;
+        _cleanup_close_ int master = -EBADF, userns_fd = -EBADF, mount_fd = -EBADF;
         _cleanup_fdset_free_ FDSet *fds = NULL;
         int r, n_fd_passed, ret = EXIT_SUCCESS;
         char veth_name[IFNAMSIZ] = "";
         struct ExposeArgs expose_args = {};
         _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
-        char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
+        _cleanup_(rmdir_and_freep) char *rootdir = NULL;
         _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
         _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
         _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
@@ -6023,14 +6029,25 @@ static int run(int argc, char *argv[]) {
         if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
                 arg_quiet = true;
 
-        if (arg_directory) {
-                assert(!arg_image);
+        if (!arg_privileged) {
+                /* if we are unprivileged, let's allocate a 64K userns first */
 
-                if (!arg_privileged) {
-                        r = log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Invoking container from plain directory tree is currently not supported if called without privileges.");
+                _cleanup_free_ char *userns_name = NULL;
+                if (asprintf(&userns_name, "nspawn-" PID_FMT "-%s", getpid_cached(), arg_machine) < 0) {
+                        r = log_oom();
                         goto finish;
                 }
 
+                userns_fd = nsresource_allocate_userns(userns_name, UINT64_C(0x10000));
+                if (userns_fd < 0) {
+                        r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m");
+                        goto finish;
+                }
+        }
+
+        if (arg_directory) {
+                assert(!arg_image);
+
                 /* Safety precaution: let's not allow running images from the live host OS image, as long as
                  * /var from the host will propagate into container dynamically (because bad things happen if
                  * two systems write to the same /var). Let's allow it for the special cases where /var is
@@ -6200,6 +6217,15 @@ static int run(int argc, char *argv[]) {
                         }
                 }
 
+                if (!arg_privileged) {
+                        r = mountfsd_mount_directory(
+                                        arg_directory,
+                                        userns_fd,
+                                        determine_dissect_image_flags(),
+                                        &mount_fd);
+                        if (r < 0)
+                                goto finish;
+                }
         } else {
                 DissectImageFlags dissect_image_flags =
                         determine_dissect_image_flags();
@@ -6274,19 +6300,6 @@ static int run(int argc, char *argv[]) {
                                 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
                 }
 
-                if (!mkdtemp(tmprootdir)) {
-                        r = log_error_errno(errno, "Failed to create temporary directory: %m");
-                        goto finish;
-                }
-
-                remove_tmprootdir = true;
-
-                arg_directory = strdup(tmprootdir);
-                if (!arg_directory) {
-                        r = log_oom();
-                        goto finish;
-                }
-
                 if (arg_privileged) {
                         r = loop_device_make_by_path(
                                         arg_image,
@@ -6339,19 +6352,6 @@ static int run(int argc, char *argv[]) {
                         if (r < 0)
                                 goto finish;
                 } else {
-                        _cleanup_free_ char *userns_name = strjoin("nspawn-", arg_machine);
-                        if (!userns_name) {
-                                r = log_oom();
-                                goto finish;
-                        }
-
-                        /* if we are unprivileged, let's allocate a 64K userns first */
-                        userns_fd = nsresource_allocate_userns(userns_name, UINT64_C(0x10000));
-                        if (userns_fd < 0) {
-                                r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m");
-                                goto finish;
-                        }
-
                         r = mountfsd_mount_image(
                                         arg_image,
                                         userns_fd,
@@ -6370,7 +6370,14 @@ static int run(int argc, char *argv[]) {
                         arg_architecture = dissected_image_architecture(dissected_image);
         }
 
-        r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
+        /* Create a temporary place to mount stuff. */
+        r = mkdtemp_malloc("/tmp/nspawn-root-XXXXXX", &rootdir);
+        if (r < 0) {
+                log_error_errno(r, "Failed to create temporary directory: %m");
+                goto finish;
+        }
+
+        r = custom_mount_prepare_all(rootdir, arg_custom_mounts, arg_n_custom_mounts);
         if (r < 0)
                 goto finish;
 
@@ -6405,6 +6412,8 @@ static int run(int argc, char *argv[]) {
         }
         for (;;) {
                 r = run_container(
+                                rootdir,
+                                mount_fd,
                                 dissected_image,
                                 userns_fd,
                                 fds,
@@ -6447,11 +6456,6 @@ finish:
                         log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
         }
 
-        if (remove_tmprootdir) {
-                if (rmdir(tmprootdir) < 0)
-                        log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
-        }
-
         if (arg_machine && arg_privileged) {
                 const char *p;