mstack: introduce "mstack" concept

author Lennart Poettering <lennart@amutable.com>

Sun, 9 Nov 2025 20:16:44 +0000 (21:16 +0100)

committer Lennart Poettering <lennart@amutable.com>

Thu, 19 Feb 2026 14:05:15 +0000 (15:05 +0100)
author Lennart Poettering <lennart@amutable.com>
Sun, 9 Nov 2025 20:16:44 +0000 (21:16 +0100)
committer Lennart Poettering <lennart@amutable.com>
Thu, 19 Feb 2026 14:05:15 +0000 (15:05 +0100)
diff --git a/README b/README

index d85295b10e234d58f185b1906cf95a4017b6789d..0b2d53de1c895994a0673dbdb2cdb1867d3df09e 100644 (file)
--- a/README
+++ b/README
@@ -77,7 +77,8 @@ REQUIREMENTS:
                       ≥ 6.10 for fcntl(F_DUPFD_QUERY), unprivileged linkat(AT_EMPTY_PATH),
                                  and block device 'partscan' sysfs attribute
                       ≥ 6.12 for AT_HANDLE_MNT_ID_UNIQUE
-                     ≥ 6.13 for PIDFD_GET_INFO and {set,remove}xattrat()
+                     ≥ 6.13 for PIDFD_GET_INFO and {set,remove}xattrat() and
+                                FSCONFIG_SET_FD support for overlayfs layers
                       ≥ 6.16 for coredump pattern '%F' (pidfd) specifier and SO_PASSRIGHTS
  
          ✅ systemd utilizes several new kernel APIs, but will fall back gracefully
diff --git a/src/shared/meson.build b/src/shared/meson.build

index 8b49ad9a52d8afcc160c955b1c1e9dd04d91808c..5becbf3b4fd2d22208432baa8b8bc0ae9c7468d4 100644 (file)
--- a/src/shared/meson.build
+++ b/src/shared/meson.build
@@ -132,6 +132,7 @@ shared_sources = files(
          'module-util.c',
          'mount-setup.c',
          'mount-util.c',
+        'mstack.c',
          'net-condition.c',
          'netif-naming-scheme.c',
          'netif-sriov.c',
diff --git a/src/shared/mstack.c b/src/shared/mstack.c

new file mode 100644 (file)

index 0000000..8b0beb2
--- /dev/null
+++ b/src/shared/mstack.c
@@ -0,0 +1,1187 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <linux/loop.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "chase.h"
+#include "dissect-image.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "log.h"
+#include "loop-util.h"
+#include "macro.h"
+#include "mount-util.h"
+#include "mstack.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "recurse-dir.h"
+#include "rm-rf.h"
+#include "sort-util.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "tmpfile-util.h"
+#include "uid-classification.h"
+#include "unit-name.h"
+#include "vpick.h"
+
+static void mstack_mount_done(MStackMount *m) {
+        assert(m);
+
+        m->where = mfree(m->where);
+        m->what = mfree(m->what);
+        m->what_fd = safe_close(m->what_fd);
+        m->mount_fd = safe_close(m->mount_fd);
+        m->sort_key = mfree(m->sort_key);
+        m->dissected_image = dissected_image_unref(m->dissected_image);
+}
+
+static void mstack_done(MStack *mstack) {
+        assert(mstack);
+
+        FOREACH_ARRAY(m, mstack->mounts, mstack->n_mounts)
+                mstack_mount_done(m);
+
+        mstack->mounts = mfree(mstack->mounts);
+        mstack->n_mounts = 0;
+        mstack->root_mount = NULL;
+        mstack->has_tmpfs_root = mstack->has_overlayfs = false;
+        mstack->path = mfree(mstack->path);
+        safe_close(mstack->root_mount_fd);
+        safe_close(mstack->usr_mount_fd);
+}
+
+MStack *mstack_free(MStack *mstack) {
+        if (!mstack)
+                return NULL;
+
+        mstack_done(mstack);
+
+        return mfree(mstack);
+}
+
+static int validate_prefix_name(const char *name, const char *prefix, char **ret_parameter) {
+        _cleanup_free_ char *p = NULL;
+
+        assert(name);
+        assert(prefix);
+
+        const char *a = startswith(name, prefix);
+        if (isempty(a)) {
+                if (ret_parameter)
+                        *ret_parameter = NULL;
+
+                return false;
+        }
+
+        p = strdup(a);
+        if (!p)
+                return -ENOMEM;
+
+        if (ret_parameter)
+                *ret_parameter = TAKE_PTR(p);
+
+        return true;
+}
+
+static MStackMount *mstack_find(MStack *mstack, MStackMountType t, const char *sort_key, const char *where) {
+        assert(mstack);
+
+        FOREACH_ARRAY(m, mstack->mounts, mstack->n_mounts) {
+
+                if (t >= 0 && m->mount_type != t)
+                        continue;
+
+                if (sort_key && !streq_ptr(m->sort_key, sort_key))
+                        continue;
+
+                if (where && !path_equal(m->where, where))
+                        continue;
+
+                return m;
+        }
+
+        return NULL;
+}
+
+static int mstack_load_one(MStack *mstack, const char *dir, int dir_fd, const char *fname) {
+        int r;
+
+        assert(mstack);
+        assert(dir_fd >= 0);
+        assert(fname);
+
+        _cleanup_close_ int what_fd = openat(dir_fd, fname, O_PATH|O_CLOEXEC);
+        if (what_fd < 0)
+                return log_debug_errno(errno, "Failed to open %s/%s: %m", dir, fname);
+
+        struct stat st;
+        if (fstat(what_fd, &st) < 0)
+                return log_debug_errno(errno, "Failed to stat %s/%s: %m", dir, fname);
+
+        ImageType image_type = _IMAGE_TYPE_INVALID;
+        _cleanup_free_ char *what = NULL, *unsuffixed = NULL;
+        if (S_ISDIR(st.st_mode)) {
+
+                const char *dotv = endswith(fname, ".v");
+                if (dotv) {
+                        const char *dotrawv = endswith(fname, ".raw.v");
+
+                        PickFilter filter = {
+                                .type_mask = dotrawv ? (1U << DT_REG) : ((1U << DT_DIR) | (1U << DT_BLK)),
+                                .suffix = dotrawv ? ".raw" : NULL,
+                                .architecture = _ARCHITECTURE_INVALID,
+                        };
+
+                        _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
+                        r = path_pick(dir, dir_fd, fname, &filter, /* n_filters= */ 1, PICK_ARCHITECTURE, &result);
+                        if (r < 0)
+                                return log_debug_errno(r, "Failed to resolve '%s' directory: %m", fname);
+                        if (r == 0)
+                                return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "Found no suitable entry in '%s': %m", fname);
+
+                        what = TAKE_PTR(result.path);
+                        close_and_replace(what_fd, result.fd);
+                        st = result.st;
+
+                        unsuffixed = strndup(fname, (dotrawv ?: dotv) - fname);
+                        if (!unsuffixed)
+                                return log_oom();
+
+                        image_type = S_ISDIR(st.st_mode) ? IMAGE_DIRECTORY :
+                                     S_ISREG(st.st_mode) ? IMAGE_RAW :
+                                     S_ISBLK(st.st_mode) ? IMAGE_BLOCK : _IMAGE_TYPE_INVALID;
+
+                        assert(image_type >= 0);
+                } else
+                        image_type = IMAGE_DIRECTORY;
+
+        } else if (S_ISREG(st.st_mode)) {
+                const char *e = endswith(fname, ".raw");
+                if (!e)
+                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected suffix of '%s/%s', refusing.", dir, fname);
+
+                unsuffixed = strndup(fname, e - fname);
+                if (!unsuffixed)
+                        return -ENOMEM;
+
+                image_type = IMAGE_RAW;
+
+        } else if (S_ISBLK(st.st_mode))
+                image_type = IMAGE_BLOCK;
+        else
+                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected inode type of '%s/%s', refusing.", dir, fname);
+
+        if (!what) {
+                what = strdup(fname);
+                if (!what)
+                        return -ENOMEM;
+        }
+
+        if (!unsuffixed) {
+                unsuffixed = strdup(what);
+                if (!unsuffixed)
+                        return -ENOMEM;
+        }
+
+        if (!GREEDY_REALLOC(mstack->mounts, mstack->n_mounts+1))
+                return -ENOMEM;
+
+        MStackMount *m = mstack->mounts + mstack->n_mounts;
+
+        _cleanup_free_ char *parameter = NULL;
+        r = validate_prefix_name(unsuffixed, "layer@", &parameter);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to check prefix of %s/%s: %m", dir, fname);
+        if (r > 0) {
+                /* Paranoia: let's refuse two layers that have the same sort key. Howe can that happen?
+                 * People might have a .raw layer and one dir layer with the same name. Or one with .v and
+                 * one without. */
+                if (mstack_find(mstack, MSTACK_LAYER, parameter, /* where= */ NULL))
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENOTUNIQ), "Duplicate layer '%s', refusing.", parameter);
+
+                *m = (MStackMount) {
+                        .mount_type = MSTACK_LAYER,
+                        .what = TAKE_PTR(what),
+                        .what_fd = TAKE_FD(what_fd),
+                        .mount_fd = -EBADF,
+                        .sort_key = TAKE_PTR(parameter),
+                        .image_type = image_type,
+                };
+
+                mstack->n_mounts++;
+                log_debug("Found mstack layer '%s' ('%s', owned by UID " UID_FMT ")", m->sort_key, m->what, st.st_uid);
+                return 0;
+        }
+
+        if (streq(unsuffixed, "rw")) {
+                if (mstack_find(mstack, MSTACK_RW, /* sort_key= */ NULL, /* where= */ NULL))
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENOTUNIQ), "Duplicate rw entry, refusing.");
+
+                *m = (MStackMount) {
+                        .mount_type = MSTACK_RW,
+                        .what = TAKE_PTR(what),
+                        .what_fd = TAKE_FD(what_fd),
+                        .mount_fd = -EBADF,
+                        .image_type = image_type,
+                };
+
+                mstack->n_mounts++;
+                log_debug("Found mstack rw layer ('%s')", m->what);
+                return 0;
+        }
+
+        MStackMountType bind_type = _MSTACK_MOUNT_TYPE_INVALID;
+        r = validate_prefix_name(unsuffixed, "bind@", &parameter);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to check prefix of %s/%s: %m", dir, fname);
+        if (r > 0)
+                bind_type = MSTACK_BIND;
+        else {
+                r = validate_prefix_name(unsuffixed, "robind@", &parameter);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to check prefix of %s/%s: %m", dir, fname);
+                if (r > 0)
+                        bind_type = MSTACK_ROBIND;
+        }
+        if (bind_type >= 0) {
+                _cleanup_free_ char *where = NULL;
+                r = unit_name_path_unescape(parameter, &where);
+                if (r < 0)
+                        return log_debug_errno(r, "Cannot unescape path '%s' of '%s/%s'", parameter, dir, fname);
+
+                if (mstack_find(mstack, MSTACK_BIND, /* sort_key= */ NULL, /* where= */ where) ||
+                    mstack_find(mstack, MSTACK_ROBIND, /* sort_key= */ NULL, /* where= */ where))
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENOTUNIQ), "Duplicate bind entry, refusing");
+
+                *m = (MStackMount) {
+                        .mount_type = bind_type,
+                        .what = TAKE_PTR(what),
+                        .what_fd = TAKE_FD(what_fd),
+                        .mount_fd = -EBADF,
+                        .where = TAKE_PTR(where),
+                        .image_type = image_type,
+                };
+
+                mstack->n_mounts++;
+                log_debug("Found mstack bind layer '%s' ('%s')", empty_to_root(m->where), m->what);
+                return 0;
+        }
+
+        if (streq(unsuffixed, "root")) {
+                if (mstack_find(mstack, MSTACK_ROOT, /* sort_key= */ NULL, /* where= */ NULL))
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENOTUNIQ), "Duplicate root entry, refusing");
+
+                *m = (MStackMount) {
+                        .mount_type = MSTACK_ROOT,
+                        .what = TAKE_PTR(what),
+                        .what_fd = TAKE_FD(what_fd),
+                        .mount_fd = -EBADF,
+                        .image_type = image_type,
+                };
+
+                mstack->n_mounts++;
+                log_debug("Found mstack root layer ('%s')", m->what);
+                return 0;
+        }
+
+        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Unrecognized entry '%s/%s', refusing", dir, fname);
+}
+
+static int mount_compare_func(const MStackMount *a, const MStackMount *b) {
+        int r;
+
+        assert(a);
+        assert(b);
+
+        /* If we apply this mstack in read-only mode then we'll convert the 'rw' layer which normally is an
+         * upperdir into the topmost lowerdir. When sorting the mstack it is hence essential, that the "rw"
+         * layer ends up *after* the regular layers. Enforce this here via a compile-time check. */
+        assert_cc(MSTACK_RW > MSTACK_LAYER);
+
+        r = CMP(a->mount_type, b->mount_type);
+        if (r != 0)
+                return r;
+
+        r = path_compare(a->where, b->where);
+        if (r != 0)
+                return r;
+
+        r = strverscmp_improved(a->sort_key, b->sort_key);
+        if (r != 0)
+                return r;
+
+        return 0;
+}
+
+static void mstack_remove(MStack *mstack, MStackMountType t) {
+        assert(mstack);
+
+        size_t z = 0;
+        FOREACH_ARRAY(m, mstack->mounts, mstack->n_mounts) {
+                if (m->mount_type == t)
+                        mstack_mount_done(m);
+                else
+                        mstack->mounts[z++] = *m;
+        }
+
+        mstack->n_mounts = z;
+}
+
+static int mstack_normalize(MStack *mstack) {
+        int r;
+
+        assert(mstack);
+
+        typesafe_qsort(mstack->mounts, mstack->n_mounts, mount_compare_func);
+
+        size_t n_layers = 0;
+        bool has_rw = false, has_root_bind = false, has_usr_bind = false, has_root = false;
+        FOREACH_ARRAY(m, mstack->mounts, mstack->n_mounts) {
+                switch (m->mount_type) {
+                case MSTACK_LAYER:
+                        n_layers++;
+                        break;
+
+                case MSTACK_RW:
+                        assert(!has_rw);
+                        has_rw = true;
+                        break;
+
+                case MSTACK_BIND:
+                case MSTACK_ROBIND:
+                        if (empty_or_root(m->where))
+                                has_root_bind = true;
+                        else if (path_equal(m->where, "/usr"))
+                                has_usr_bind = true;
+                        break;
+
+                case MSTACK_ROOT:
+                        assert(!has_root);
+                        has_root = true;
+                        break;
+
+                default:
+                        assert_not_reached();
+                }
+        }
+
+        /* If the overlayfs stack is fully obstructed, kill it */
+        if (has_root_bind || (has_root && has_usr_bind)) {
+                mstack_remove(mstack, MSTACK_LAYER);
+                mstack_remove(mstack, MSTACK_RW);
+
+                n_layers = 0;
+                has_rw = false;
+        }
+
+        /* Only a single read-only or read-write layer? Turn into bind mount! */
+        if (n_layers + has_rw == 1) {
+                FOREACH_ARRAY(m, mstack->mounts, mstack->n_mounts) {
+                        if (m->mount_type == MSTACK_LAYER)
+                                m->mount_type = MSTACK_ROBIND;
+                        else if (m->mount_type == MSTACK_RW)
+                                m->mount_type = MSTACK_BIND;
+                        else
+                                continue;
+
+                        if (has_root) {
+                                /* If there's a root dir, let's only bind mount the /usr/ subdir */
+                                _cleanup_close_ int usr_fd = openat(m->what_fd, "usr", O_CLOEXEC|O_PATH|O_NOFOLLOW|O_DIRECTORY);
+                                if (usr_fd < 0)
+                                        return log_debug_errno(errno, "Failed to open /usr/ subdir: %m");
+
+                                _cleanup_free_ char *usr = path_join(m->what, "usr");
+                                if (!usr)
+                                        return log_oom();
+
+                                r = free_and_strdup_warn(&m->where, "/usr");
+                                if (r < 0)
+                                        return r;
+
+                                close_and_replace(m->what_fd, usr_fd);
+                                free_and_replace(m->what, usr);
+                        } else {
+                                r = free_and_strdup_warn(&m->where, "/");
+                                if (r < 0)
+                                        return r;
+
+                                has_root_bind = true;
+                        }
+                }
+
+                n_layers = 0;
+                has_rw = false;
+        }
+
+        /* If the root dir is overmounted, we can drop the original root */
+        if (has_root_bind) {
+                mstack_remove(mstack, MSTACK_ROOT);
+                has_root = false;
+        }
+
+        /* After converting, let's sort things again */
+        typesafe_qsort(mstack->mounts, mstack->n_mounts, mount_compare_func);
+
+        /* Find root mount (unless it's the overlayfs stack) */
+        FOREACH_ARRAY(m, mstack->mounts, mstack->n_mounts)
+                if ((m->mount_type == MSTACK_ROOT) ||
+                    (IN_SET(m->mount_type, MSTACK_BIND, MSTACK_ROBIND) && empty_or_root(m->where))) {
+                        assert(!mstack->root_mount);
+                        mstack->root_mount = m;
+                }
+        assert((has_root || has_root_bind) == !!mstack->root_mount);
+
+        mstack->has_tmpfs_root = n_layers == 0 && !has_rw && !has_root_bind && !has_root;
+        mstack->has_overlayfs = n_layers > 0 || has_rw;
+        return 0;
+}
+
+static int mstack_load_now(MStack *mstack, const char *dir, int dir_fd, MStackFlags flags) {
+        _cleanup_close_ int _dir_fd = -EBADF;
+        int r;
+
+        assert(mstack);
+
+        r = free_and_strdup_warn(&mstack->path, dir);
+        if (r < 0)
+                return r;
+
+        /* Expects dir_fd already opened. If not, then we'll open it based on 'dir' */
+        if (dir_fd < 0) {
+                _dir_fd = openat(AT_FDCWD, isempty(dir) ? "." : dir, O_DIRECTORY|O_CLOEXEC);
+                if (_dir_fd < 0)
+                        return log_debug_errno(errno, "Failed to to open '%s': %m", dir);
+
+                dir_fd = _dir_fd;
+        } else {
+                /* Possibly convert an O_PATH fd to a real one */
+                dir_fd = fd_reopen_condition(dir_fd, O_DIRECTORY|O_CLOEXEC, O_PATH|O_DIRECTORY, &_dir_fd);
+                if (dir_fd < 0)
+                        return log_debug_errno(dir_fd, "Failed to reopen '%s': %m", dir);
+        }
+
+        _cleanup_free_ DirectoryEntries *de = NULL;
+        r = readdir_all(dir_fd, RECURSE_DIR_IGNORE_DOT, &de);
+        if (r < 0)
+                return r;
+
+        FOREACH_ARRAY(i, de->entries, de->n_entries) {
+                r = mstack_load_one(mstack, dir, dir_fd, (*i)->d_name);
+                if (r < 0)
+                        return r;
+        }
+
+        return mstack_normalize(mstack);
+}
+
+static int mount_get_fd(MStackMount *m) {
+        assert(m);
+
+        if (m->dissected_image) {
+                assert(m->dissected_image->partitions[PARTITION_ROOT].found);
+                return ASSERT_FD(m->dissected_image->partitions[PARTITION_ROOT].fsmount_fd);
+        }
+
+        if (m->mount_fd >= 0)
+                return m->mount_fd;
+
+        return m->what_fd;
+}
+
+static bool mount_is_ro(MStackMount *m, MStackFlags flags) {
+        assert(m);
+
+        return FLAGS_SET(flags, MSTACK_RDONLY) ||
+                IN_SET(m->mount_type, MSTACK_LAYER, MSTACK_ROBIND);
+}
+
+static const char *mount_name(MStackMount *m) {
+        assert(m);
+
+        /* Returns some vaguely useful identifier for this layer, for showing in debug output */
+
+        if (m->sort_key)
+                return m->sort_key;
+
+        if (m->where)
+                return m->where;
+
+        return mstack_mount_type_to_string(m->mount_type);
+}
+
+int mstack_open_images(
+                MStack *mstack,
+                int userns_fd,
+                const ImagePolicy *image_policy,
+                const ImageFilter *image_filter,
+                MStackFlags flags) {
+
+        int r;
+
+        assert(mstack);
+
+        FOREACH_ARRAY(m, mstack->mounts, mstack->n_mounts) {
+
+                DissectImageFlags dissect_image_flags =
+                        DISSECT_IMAGE_DISCARD|
+                        DISSECT_IMAGE_GENERIC_ROOT|
+                        DISSECT_IMAGE_REQUIRE_ROOT|
+                        DISSECT_IMAGE_MOUNT_ROOT_ONLY|
+                        DISSECT_IMAGE_FSCK|
+                        DISSECT_IMAGE_USR_NO_ROOT|
+                        DISSECT_IMAGE_GROWFS|
+                        DISSECT_IMAGE_ADD_PARTITION_DEVICES|
+                        DISSECT_IMAGE_PIN_PARTITION_DEVICES|
+                        DISSECT_IMAGE_ALLOW_USERSPACE_VERITY;
+
+                SET_FLAG(dissect_image_flags, DISSECT_IMAGE_READ_ONLY, mount_is_ro(m, flags));
+                SET_FLAG(dissect_image_flags, DISSECT_IMAGE_FOREIGN_UID, userns_fd >= 0);
+
+                switch (m->image_type) {
+
+                case IMAGE_RAW:
+                case IMAGE_BLOCK:
+                        assert(!m->dissected_image);
+
+                        if (userns_fd >= 0) {
+                                r = mountfsd_mount_image_fd(
+                                                m->what_fd,
+                                                userns_fd,
+                                                /* options= */ NULL,
+                                                image_policy,
+                                                /* verity= */ NULL,
+                                                dissect_image_flags,
+                                                &m->dissected_image);
+                                if (r < 0)
+                                        return r;
+                        } else {
+                                _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
+                                _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
+
+                                r = loop_device_make(
+                                                m->what_fd,
+                                                FLAGS_SET(flags, MSTACK_RDONLY) ? O_RDONLY : O_RDWR,
+                                                /* offset= */ 0,
+                                                /* size= */ UINT64_MAX,
+                                                /* sector_size= */ UINT32_MAX,
+                                                LO_FLAGS_PARTSCAN,
+                                                LOCK_SH,
+                                                &loop_device);
+                                if (r < 0)
+                                        return log_debug_errno(r, "Failed to allocate loopback device for '%s': %m", m->what);
+
+                                _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
+                                r = dissect_loop_device_and_warn(
+                                                loop_device,
+                                                &verity,
+                                                /* mount_options= */ NULL,
+                                                image_policy,
+                                                image_filter,
+                                                dissect_image_flags,
+                                                &dissected_image);
+                                if (r < 0)
+                                        return r;
+
+                                if (!dissected_image->partitions[PARTITION_ROOT].found)
+                                        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Currently images withot root partition are not supported: %m");
+
+                                r = dissected_image_load_verity_sig_partition(
+                                                dissected_image,
+                                                loop_device->fd,
+                                                &verity);
+                                if (r < 0)
+                                        return log_debug_errno(r, "Failed to load Verity signature partition of '%s': %m", m->what);
+
+                                r = dissected_image_guess_verity_roothash(
+                                                dissected_image,
+                                                &verity);
+                                if (r < 0)
+                                        return log_debug_errno(r, "Failed to guess Verity root hash of '%s': %m", m->what);
+
+                                r = dissected_image_decrypt(
+                                                dissected_image,
+                                                /* root= */ NULL,
+                                                /* passphrase= */ NULL,
+                                                &verity,
+                                                image_policy,
+                                                dissect_image_flags);
+                                if (r < 0)
+                                        return log_debug_errno(r, "Failed to decrypt image '%s': %m", m->what);
+
+                                r = dissected_image_mount(
+                                                dissected_image,
+                                                /* where= */ NULL,               /* allocate as mount fds, do not attach anywhere */
+                                                /* uid_shift= */ UID_INVALID,
+                                                /* uid_range= */ UID_INVALID,
+                                                /* userns_fd = */ -EBADF,
+                                                dissect_image_flags);
+                                if (r < 0)
+                                        return log_debug_errno(r, "Failed to mount image '%s': %m", m->what);
+
+                                r = loop_device_flock(loop_device, LOCK_UN);
+                                if (r < 0)
+                                        return log_debug_errno(r, "Failed to unlock loopback block device: %m");
+
+                                r = dissected_image_relinquish(dissected_image);
+                                if (r < 0)
+                                        return log_debug_errno(r, "Failed to relinquish DM and loopback block devices: %m");
+
+                                m->dissected_image = TAKE_PTR(dissected_image);
+                        }
+
+                        log_debug("Acquired mstack DDI layer '%s'", mount_name(m));
+                        break;
+
+                case IMAGE_DIRECTORY:
+                case IMAGE_SUBVOLUME:
+                        assert(m->mount_fd < 0);
+
+                        if (userns_fd >= 0) {
+                                r = mountfsd_mount_directory_fd(
+                                                m->what_fd,
+                                                userns_fd,
+                                                dissect_image_flags,
+                                                &m->mount_fd);
+                                if (r < 0)
+                                        return r;
+                        } else {
+                                m->mount_fd = open_tree_attr_with_fallback(
+                                                mount_get_fd(m),
+                                                /* path= */ "",
+                                                OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_EMPTY_PATH,
+                                                &(struct mount_attr) {
+                                                        .attr_set = mount_is_ro(m, flags) ? MOUNT_ATTR_RDONLY : 0,
+                                                        .attr_clr = mount_is_ro(m, flags) ? 0 : MOUNT_ATTR_RDONLY,
+                                                        .propagation = MS_PRIVATE, /* disconnect us from bind mount source */
+                                                });
+                                if (m->mount_fd < 0)
+                                        return log_debug_errno(m->mount_fd, "Failed to create bind mount inode '%s': %m", m->where);
+                        }
+
+                        log_debug("Acquired bind mount for layer '%s'.", mount_name(m));
+                        break;
+
+                default:
+                        assert_not_reached();
+                }
+        }
+
+        return 0;
+}
+
+static int mstack_has_writable_layers(MStack *mstack, MStackFlags flags) {
+        assert(mstack);
+
+        if (FLAGS_SET(flags, MSTACK_RDONLY))
+                return false;
+
+        FOREACH_ARRAY(m, mstack->mounts, mstack->n_mounts)
+                if (m->mount_type == MSTACK_RW)
+                        return true;
+
+        return false;
+}
+
+static int fsconfig_add_layer(int sb_fd, const char *key, int layer_fd) {
+        int r;
+
+        assert(sb_fd >= 0);
+        assert(key);
+        assert(layer_fd >= 0);
+
+        if (DEBUG_LOGGING) {
+                _cleanup_free_ char *pretty = NULL;
+                (void) fd_get_path(layer_fd, &pretty);
+                log_debug("Adding '%s' as layer '%s' to overlayfs.", key, pretty);
+        }
+
+        r = RET_NERRNO(fsconfig(sb_fd, FSCONFIG_SET_FD, key, /* value= */ NULL, layer_fd));
+        if (r != -EBADF && !ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                return r;
+
+        /* overlayfs learnt support for FSCONFIG_SET_FD only with linux 6.13, hence provide a fallback here via /proc/self/fd/ */
+
+        // FIXME: This compatibility code path shall be removed once kernel 6.13
+        //        becomes the new minimal baseline
+
+        const char *layer_path = FORMAT_PROC_FD_PATH(layer_fd);
+        log_debug_errno(r, "FSCONFIG_SET_FD for layer '%s' failed, falling back to FSCONFIG_SET with '%s': %m", key, layer_path);
+        return RET_NERRNO(fsconfig(sb_fd, FSCONFIG_SET_STRING, key, layer_path, /* aux= */ 0));
+}
+
+static int mstack_make_overlayfs(
+                MStack *mstack,
+                const char *temp_mount_dir,
+                MStackFlags flags,
+                int *ret_overlayfs_mnt_fd) {
+
+        int r;
+
+        assert(mstack);
+        assert(temp_mount_dir);
+        assert(ret_overlayfs_mnt_fd);
+
+        if (!mstack->has_overlayfs) {
+                *ret_overlayfs_mnt_fd = -EBADF;
+                return 0;
+        }
+
+        bool writable = mstack_has_writable_layers(mstack, flags);
+
+        _cleanup_close_ int sb_fd = fsopen("overlay", FSOPEN_CLOEXEC);
+        if (sb_fd < 0)
+                return log_debug_errno(errno, "Failed to create overlayfs: %m");
+
+        _cleanup_close_pair_ int errno_pipe_fds[2] = EBADF_PAIR;
+        if (pipe2(errno_pipe_fds, O_CLOEXEC) < 0)
+                return log_debug_errno(errno, "Failed to open pipe: %m");
+
+        /* If we operate unpriv, we have to attach the layers to a place in the fs, before we can pass them
+         * to overlayfs (see comments below), hence fork off a child with a private mount namespace, so that
+         * noone else sees that. */
+        r = pidref_safe_fork("(layerfd)",
+                      FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_REOPEN_LOG|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE,
+                      /* ret= */ NULL);
+        if (r < 0) {
+                errno_pipe_fds[1] = safe_close(errno_pipe_fds[1]);
+
+                int q = read_errno(errno_pipe_fds[0]);
+                if (q < 0 && q != -EIO)
+                        return q;
+
+                return r;
+        }
+        if (r == 0) {
+                /* child */
+
+                /* Kernel expects the stack in reverse order, hence go from back to front */
+                for (size_t i = mstack->n_mounts; i > 0; i--) {
+                        MStackMount *m = mstack->mounts + i - 1;
+
+                        if (!IN_SET(m->mount_type, MSTACK_RW, MSTACK_LAYER))
+                                continue;
+
+                        /* overlayfs refuses to work with layers on mounts not owned by our userns, hence create a
+                         * clone that is owned by our userns */
+                        _cleanup_close_ int cloned_fd = mount_fd_clone(ASSERT_FD(mount_get_fd(m)), /* recursive= */ false, /* replacement_fd= */ NULL);
+                        if (cloned_fd < 0)
+                                report_errno_and_exit(errno_pipe_fds[1], cloned_fd);
+
+                        /* When working with detached mounts overlayfs (which requires kernel 6.14) currently
+                         * insists on upperdir being the root inode of the mount. But that collides with the
+                         * requirement that upperdir/workdir are on the same mount and siblings. Bummer. To
+                         * work around this we'll temporarily attach the thing, which relaxes the rules
+                         * sufficiently. */
+                        if (move_mount(cloned_fd, "", -EBADF, temp_mount_dir, MOVE_MOUNT_F_EMPTY_PATH) < 0)
+                                report_errno_and_exit(errno_pipe_fds[1], -errno);
+
+                        /* Open the layer immediately after attaching */
+                        _cleanup_close_ int temp_fd = open(temp_mount_dir, O_PATH|O_CLOEXEC);
+                        if (temp_fd < 0)
+                                report_errno_and_exit(errno_pipe_fds[1], -errno);
+
+                        switch (m->mount_type) {
+
+                        case MSTACK_RW: {
+                                if (mount_is_ro(m, flags)) {
+                                        /* If invoked in read-only mode we'll not create the data dir, but use it if it exists */
+                                        _cleanup_close_ int data_fd = openat(temp_fd, "data", O_CLOEXEC|O_NOFOLLOW|O_DIRECTORY);
+                                        if (data_fd < 0) {
+                                                if (errno == ENOENT) /* If the 'data' dir doesn't exist, just skip
+                                                                      * over it, it apparently was never created, but
+                                                                      * that's fine for a read-only invocation */
+                                                        break;
+
+                                                log_debug_errno(errno, "Failed to open 'data' directory below 'rw' layer: %m");
+                                                report_errno_and_exit(errno_pipe_fds[1], -errno);
+                                        }
+
+                                        /* Downgrade to regular lowerdir if read-only is requested */
+                                        r = fsconfig_add_layer(sb_fd, "lowerdir+", data_fd);
+                                        if (r < 0) {
+                                                log_debug_errno(r, "Failed to set mount layer lowerdir+=%s/data: %m", m->what);
+                                                report_errno_and_exit(errno_pipe_fds[1], r);
+                                        }
+                                } else {
+                                        /* If invoked in writable mode, let's create the data dir if it is missing */
+                                        _cleanup_close_ int data_fd = open_mkdir_at(temp_fd, "data", O_CLOEXEC|O_NOFOLLOW, 0755);
+                                        if (data_fd < 0) {
+                                                log_debug_errno(data_fd, "Failed to open 'data' directory below 'rw' layer: %m");
+                                                report_errno_and_exit(errno_pipe_fds[1], data_fd);
+                                        }
+
+                                        r = fsconfig_add_layer(sb_fd, "upperdir", data_fd);
+                                        if (r < 0) {
+                                                log_debug_errno(r, "Failed to set mount layer upperdir=%s/data: %m", m->what);
+                                                report_errno_and_exit(errno_pipe_fds[1], r);
+                                        }
+
+                                        /* Similar, create the work directory */
+                                        _cleanup_close_ int work_fd = open_mkdir_at(temp_fd, "work", O_CLOEXEC|O_NOFOLLOW, 0755);
+                                        if (work_fd < 0) {
+                                                log_debug_errno(work_fd, "Failed to open 'work' directory below 'rw' layer: %m");
+                                                report_errno_and_exit(errno_pipe_fds[1], work_fd);
+                                        }
+
+                                        /* rm_rf_children() takes possession of the fd no matter what, let's dup it here */
+                                        int dup_fd = fcntl(work_fd, F_DUPFD_CLOEXEC, 3);
+                                        if (dup_fd < 0) {
+                                                log_debug_errno(errno, "Failed to duplicate work fd: %m");
+                                                report_errno_and_exit(errno_pipe_fds[1], -errno);
+                                        }
+
+                                        /* Empty the work directory, just in case it existed before. It's supposed to be empty. */
+                                        r = rm_rf_children(dup_fd, REMOVE_PHYSICAL, /* root_dev= */ NULL);
+                                        if (r < 0)
+                                                log_debug_errno(r, "Failed to empty 'work' directory below 'rw' layer, ignoring: %m");
+
+                                        r = fsconfig_add_layer(sb_fd, "workdir", work_fd);
+                                        if (r < 0) {
+                                                log_debug_errno(r, "Failed to set mount layer workdir=%s/work: %m", m->what);
+                                                report_errno_and_exit(errno_pipe_fds[1], r);
+                                        }
+
+                                        break;
+                                }
+                                break;
+                        }
+
+                        case MSTACK_LAYER:
+                                r = fsconfig_add_layer(sb_fd, "lowerdir+", temp_fd);
+                                if (r < 0) {
+                                        log_debug_errno(r, "Failed to set mount layer lowerdir+=%s: %m", m->what);
+                                        report_errno_and_exit(errno_pipe_fds[1], r);
+                                }
+
+                                break;
+
+                        default:
+                                break;
+                        }
+                }
+
+                if (!writable && fsconfig(sb_fd, FSCONFIG_SET_FLAG, "ro", /* value= */ NULL, /* aux= */ 0) < 0) {
+                        log_debug_errno(errno, "Failed to set read-only mount flag: %m");
+                        report_errno_and_exit(errno_pipe_fds[1], -errno);
+                }
+
+                if (fsconfig(sb_fd, FSCONFIG_SET_FLAG, "userxattr", /* value= */ NULL, /* aux= */ 0) < 0) {
+                        log_debug_errno(errno, "Failed to set userxattr mount flag: %m");
+                        report_errno_and_exit(errno_pipe_fds[1], -errno);
+                }
+
+                if (fsconfig(sb_fd, FSCONFIG_SET_STRING, "source", mstack->path, /* aux= */ 0) < 0) {
+                        log_debug_errno(errno, "Failed to set mount source: %m");
+                        report_errno_and_exit(errno_pipe_fds[1], -errno);
+                }
+
+                /* This is where the superblock is materialized. It must be called from the child's
+                 * namespace, where the mounts are attached as described above, otherwise overlayfs is
+                 * unhappy and will refuse the superblock to be created. */
+                if (fsconfig(sb_fd, FSCONFIG_CMD_CREATE, /* key= */ NULL, /* value= */ NULL, /* aux= */ 0) < 0) {
+                        log_debug_errno(errno, "Failed to realize overlayfs: %m");
+                        report_errno_and_exit(errno_pipe_fds[1], -errno);
+                }
+
+                report_errno_and_exit(errno_pipe_fds[1], 0);
+        }
+
+        _cleanup_close_ int overlayfs_mnt_fd = fsmount(sb_fd, FSMOUNT_CLOEXEC, 0);
+        if (overlayfs_mnt_fd < 0)
+                return log_debug_errno(errno, "Failed to create mount fd: %m");
+
+        if (mount_setattr(overlayfs_mnt_fd, "", AT_EMPTY_PATH,
+                          &(struct mount_attr) {
+                                  .attr_set = writable ? 0 : MOUNT_ATTR_RDONLY,
+                                  .attr_clr = writable ? MOUNT_ATTR_RDONLY : 0,
+                          }, sizeof(struct mount_attr)) < 0)
+                return log_debug_errno(errno, "Failed to mark root bind mount read-only: %m");
+
+        *ret_overlayfs_mnt_fd = TAKE_FD(overlayfs_mnt_fd);
+        return 1;
+}
+
+int mstack_make_mounts(
+                MStack *mstack,
+                const char *temp_mount_dir,
+                MStackFlags flags) {
+
+        int r;
+
+        assert(mstack);
+        assert(temp_mount_dir);
+
+        _cleanup_close_ int overlayfs_mnt_fd = -EBADF;
+        r = mstack_make_overlayfs(mstack, temp_mount_dir, flags, &overlayfs_mnt_fd);
+        if (r < 0)
+                return r;
+        if (r > 0)
+                log_debug("Acquired mstack overlayfs mount.");
+
+        assert(mstack->root_mount_fd < 0);
+        if (mstack->root_mount) {
+                assert(!mstack->has_tmpfs_root);
+
+                mstack->root_mount_fd = fcntl(mount_get_fd(mstack->root_mount), F_DUPFD_CLOEXEC, 3);
+                if (mstack->root_mount_fd < 0)
+                        return log_debug_errno(errno, "Failed to create root bind mount: %m");
+
+                log_debug("Acquired mstack root bind mount.");
+
+        } else if (mstack->has_tmpfs_root) {
+                _cleanup_close_ int sb_fd = fsopen("tmpfs", FSOPEN_CLOEXEC);
+                if (sb_fd < 0)
+                        return log_debug_errno(errno, "Failed to create tmpfs: %m");
+
+                if (fsconfig(sb_fd, FSCONFIG_SET_STRING, "source", mstack->path, 0) < 0)
+                        return log_debug_errno(errno, "Failed to set mount source: %m");
+
+                if (fsconfig(sb_fd, FSCONFIG_SET_STRING, "mode", "0755", 0) < 0)
+                        return log_debug_errno(errno, "Failed to set mount source: %m");
+
+                if (fsconfig(sb_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
+                        return log_debug_errno(errno, "Failed to realize tmpfs: %m");
+
+                mstack->root_mount_fd = fsmount(sb_fd, FSMOUNT_CLOEXEC, 0);
+                if (mstack->root_mount_fd < 0)
+                        return log_debug_errno(errno, "Failed to create mount fd: %m");
+
+                log_debug("Acquired root tmpfs mount.");
+        }
+
+        if (mstack->root_mount_fd >= 0 && overlayfs_mnt_fd >= 0) {
+                /* If we have an overlayfs and a root fs, then the overlayfs should be placed on /usr/. */
+                mstack->usr_mount_fd = open_tree(overlayfs_mnt_fd, "usr", OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW);
+                if (mstack->usr_mount_fd < 0)
+                        return log_debug_errno(errno, "Failed to create bind mount inode '/usr/': %m");
+
+                if (mount_setattr(mstack->usr_mount_fd, "", AT_EMPTY_PATH,
+                                  &(struct mount_attr) {
+                                          .attr_set = mount_is_ro(mstack->root_mount, flags) ? MOUNT_ATTR_RDONLY : 0,
+                                          .attr_clr = mount_is_ro(mstack->root_mount, flags) ? 0 : MOUNT_ATTR_RDONLY,
+                                          .propagation = MS_PRIVATE, /* disconnect us from bind mount source */
+                                  }, sizeof(struct mount_attr)) < 0)
+                        return log_debug_errno(errno, "Failed to mark usr bind mount read-only: %m");
+
+                log_debug("Acquired mstack overlayfs '/usr/' submount.");
+        }
+
+        /* If we acquired no other root fs, then the overlayfs is our root */
+        if (mstack->root_mount_fd < 0)
+                mstack->root_mount_fd = TAKE_FD(overlayfs_mnt_fd);
+
+        return 0;
+}
+
+int mstack_bind_mounts(
+                MStack *mstack,
+                const char *where,
+                int where_fd,
+                MStackFlags flags,
+                int *ret_root_fd) {
+
+        int r;
+
+        assert(mstack);
+
+        _cleanup_close_ int _where_fd = -EBADF;
+        if (where_fd == AT_FDCWD) {
+                _where_fd = open(".", O_CLOEXEC|O_PATH|O_DIRECTORY);
+                if (_where_fd < 0)
+                        return log_debug_errno(errno, "Failed to open current working directory: %m");
+                where_fd = _where_fd;
+        } else if (where_fd < 0) {
+                r = chase(where,
+                          /* root= */ NULL,
+                          (FLAGS_SET(flags, MSTACK_MKDIR) ? CHASE_MKDIR_0755 : 0)|CHASE_MUST_BE_DIRECTORY,
+                          /* ret_path= */ NULL,
+                          &_where_fd);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to open '%s': %m", where);
+
+                where_fd = _where_fd;
+        }
+
+        assert(mstack->root_mount_fd >= 0);
+        if (move_mount(mstack->root_mount_fd, "", where_fd, "", MOVE_MOUNT_F_EMPTY_PATH|MOVE_MOUNT_T_EMPTY_PATH) < 0)
+                return log_debug_errno(errno, "Failed to attach mstack root mount to '%s': %m", where);
+
+        log_debug("Attached mstack root mount to '%s'.", where);
+
+        _cleanup_close_ int root_fd = open(where, O_CLOEXEC|O_PATH|O_DIRECTORY|O_NOFOLLOW);
+        if (root_fd < 0)
+                return log_debug_errno(errno, "Failed to mount root mount '%s': %m", where);
+
+        if (mstack->usr_mount_fd >= 0) {
+                _cleanup_close_ int subdir_fd = -EBADF;
+                r = chaseat(root_fd, "usr", CHASE_AT_RESOLVE_IN_ROOT|CHASE_PROHIBIT_SYMLINKS|CHASE_MKDIR_0755|CHASE_MUST_BE_DIRECTORY, /* ret_path= */ NULL, &subdir_fd);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to open mount point inode '%s': %m", where);
+
+                if (move_mount(mstack->usr_mount_fd, "", subdir_fd, "", MOVE_MOUNT_F_EMPTY_PATH|MOVE_MOUNT_T_EMPTY_PATH) < 0)
+                        return log_debug_errno(errno, "Failed to attach bind mount to '/usr/' subdir: %m");
+
+                log_debug("Attached mstack '/usr/' mount to '%s/usr/'.", where);
+        }
+
+        FOREACH_ARRAY(m, mstack->mounts, mstack->n_mounts) {
+
+                if (!IN_SET(m->mount_type, MSTACK_BIND, MSTACK_ROBIND) ||
+                    m == mstack->root_mount)
+                        continue;
+
+                assert(m->mount_fd >= 0);
+
+                _cleanup_close_ int subdir_fd = -EBADF;
+                r = chaseat(root_fd, m->where, CHASE_AT_RESOLVE_IN_ROOT|CHASE_PROHIBIT_SYMLINKS|CHASE_MKDIR_0755|CHASE_MUST_BE_DIRECTORY, /* ret_path= */ NULL, &subdir_fd);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to open mount point inode '%s': %m", m->where);
+
+                if (move_mount(m->mount_fd, "", subdir_fd, "", MOVE_MOUNT_F_EMPTY_PATH|MOVE_MOUNT_T_EMPTY_PATH) < 0)
+                        return log_debug_errno(errno, "Failed to attach bind mount to '%s' subdir: %m", m->where);
+
+                log_debug("Attached mstack '%s/' mount to '%s/%s/'.", m->where, where, m->where);
+        }
+
+        /* If we have a tmpfs root, the above might have created mount point inodes. Hence we left the tmpfs
+         * writable for that. Let's fix that now. Also, let's enable propagation for the future. (Reminder:
+         * we disconnect propagation from the host, but we *want* propagation by default for everything
+         * created further down the tree. Hence we'll set MS_SHARED here right-away.) */
+        if (mount_setattr(root_fd, "", AT_EMPTY_PATH|AT_RECURSIVE,
+                          &(struct mount_attr) {
+                                  .attr_set = FLAGS_SET(flags, MSTACK_RDONLY) ? MOUNT_ATTR_RDONLY : 0,
+                                  .attr_clr = FLAGS_SET(flags, MSTACK_RDONLY) ? 0 : MOUNT_ATTR_RDONLY,
+                                  .propagation = MS_SHARED,
+                          }, sizeof(struct mount_attr)) < 0)
+                return log_debug_errno(errno, "Failed to mark root bind mount read-only: %m");
+
+        if (ret_root_fd)
+                *ret_root_fd = TAKE_FD(root_fd);
+
+        return 0;
+}
+
+int mstack_apply(
+                const char *dir,
+                int dir_fd,
+                const char *where,
+                const char *temp_mount_dir,
+                int userns_fd,
+                const ImagePolicy *image_policy,
+                const ImageFilter *image_filter,
+                MStackFlags flags,
+                int *ret_root_fd) {
+        int r;
+
+        assert(where);
+
+        _cleanup_(mstack_done) MStack mstack = MSTACK_INIT;
+        r = mstack_load_now(&mstack, dir, dir_fd, flags);
+        if (r < 0)
+                return r;
+
+        r = mstack_open_images(&mstack, userns_fd, image_policy, image_filter, flags);
+        if (r < 0)
+                return r;
+
+        _cleanup_(rmdir_and_freep) char *t = NULL;
+        if (!temp_mount_dir) {
+                r = mkdtemp_malloc("/tmp/mstack-temporary-XXXXXX", &t);
+                if (r < 0)
+                        return r;
+
+                temp_mount_dir = t;
+        }
+
+        r = mstack_make_mounts(&mstack, temp_mount_dir, flags);
+        if (r < 0)
+                return r;
+
+        return mstack_bind_mounts(&mstack, where, /* where_fd= */ -EBADF, flags, ret_root_fd);
+}
+
+int mstack_load(const char *dir,
+                int dir_fd,
+                MStack **ret) {
+
+        int r;
+
+        /* Well-known errors:
+         *
+         *     -ENOTUNIQ → Multiple conflicting layers for the same path defined
+         *     -EBADMSG  → Bad file suffix, inode type for layer, or unrecognized entry
+         */
+
+        MStack *mstack = new(MStack, 1);
+        if (!mstack)
+                return -ENOMEM;
+
+        *mstack = MSTACK_INIT;
+
+        r = mstack_load_now(mstack, dir, dir_fd, /* flags= */ 0);
+        if (r < 0)
+                return r;
+
+        if (ret)
+                *ret = TAKE_PTR(mstack);
+
+        return 0;
+}
+
+int mstack_is_read_only(MStack *mstack) {
+        assert(mstack);
+
+        /* Checks if the mstack consists of only read-only layers and bind mounts */
+
+        if (mstack->has_tmpfs_root)
+                return false;
+
+        FOREACH_ARRAY(m, mstack->mounts, mstack->n_mounts)
+                if (IN_SET(m->mount_type, MSTACK_ROOT, MSTACK_RW, MSTACK_BIND))
+                        return false;
+
+        return true;
+}
+
+int mstack_is_foreign_uid_owned(MStack *mstack) {
+        int r;
+
+        assert(mstack);
+
+        /* Checks if any of the layers are owned by the host's foreign UID range */
+
+        FOREACH_ARRAY(m, mstack->mounts, mstack->n_mounts) {
+
+                if (!IN_SET(m->image_type, IMAGE_DIRECTORY, IMAGE_SUBVOLUME))
+                        continue;
+
+                assert(m->what_fd >= 0);
+
+                struct stat st;
+                if (fstat(m->what_fd, &st) < 0)
+                        return -errno;
+
+                r = stat_verify_directory(&st);
+                if (r < 0)
+                        return r;
+
+                if (uid_is_foreign(st.st_uid))
+                        return true;
+        }
+
+        return false;
+}
+
+static const char *const mstack_mount_type_table[] = {
+        [MSTACK_ROOT]   = "root",
+        [MSTACK_LAYER]  = "layer",
+        [MSTACK_RW]     = "rw",
+        [MSTACK_BIND]   = "bind",
+        [MSTACK_ROBIND] = "robind",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_TO_STRING(mstack_mount_type, MStackMountType);
diff --git a/src/shared/mstack.h b/src/shared/mstack.h

new file mode 100644 (file)

index 0000000..e526f17
--- /dev/null
+++ b/src/shared/mstack.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "discover-image.h"
+#include "shared-forward.h"
+
+typedef enum MStackFlags {
+        MSTACK_MKDIR  = 1 << 0, /* when mounting, create top-level inode to mount on top */
+        MSTACK_RDONLY = 1 << 1,
+} MStackFlags;
+
+typedef enum MStackMountType {
+        MSTACK_ROOT,     /* optional "root" entry used as root, with the layer@/rw layers only used for /usr/ */
+        MSTACK_LAYER,    /* "layer@…" entries that are the lower (read-only) layers of an overlayfs stack */
+        MSTACK_RW,       /* "rw" entry that is the upper (writable) layer of an overlayfs stack (contains two subdirs: 'data' + 'work') */
+        MSTACK_BIND,     /* "bind@…" entries that are (writable) bind mounted on top of the overlayfs */
+        MSTACK_ROBIND,   /* "robind@…" similar, but read-only */
+        _MSTACK_MOUNT_TYPE_MAX,
+        _MSTACK_MOUNT_TYPE_INVALID = -EINVAL,
+} MStackMountType;
+
+typedef struct MStackMount {
+        MStackMountType mount_type;
+        char *what;
+        int what_fd;
+        int mount_fd;
+        char *sort_key;
+        char *where;
+        ImageType image_type;
+        DissectedImage *dissected_image;
+} MStackMount;
+
+typedef struct MStack {
+        char *path;
+        MStackMount *mounts;
+        size_t n_mounts;
+        bool has_tmpfs_root;      /* If true, we need a throw-away tmpfs as root */
+        bool has_overlayfs;       /* Indicates whether we need overlayfs (i.e. if there are more than a single layer */
+        MStackMount *root_mount;  /* If there's a MOUNT_BIND/MOUNT_ROBIND/MOUNT_ROOT mount, this points to it */
+        int root_mount_fd;
+        int usr_mount_fd;
+} MStack;
+
+#define MSTACK_INIT                             \
+        (MStack) {                              \
+                .root_mount_fd = -EBADF,        \
+                .usr_mount_fd = -EBADF,         \
+        }
+
+MStack *mstack_free(MStack *mstack);
+DEFINE_TRIVIAL_CLEANUP_FUNC(MStack*, mstack_free);
+
+int mstack_load(const char *dir, int dir_fd, MStack **ret);
+int mstack_open_images(MStack *mstack, int userns_fd, const ImagePolicy *image_policy, const ImageFilter *image_filter, MStackFlags flags);
+int mstack_make_mounts(MStack *mstack, const char *temp_mount_dir, MStackFlags flags);
+int mstack_bind_mounts(MStack *mstack, const char *where, int where_fd, MStackFlags flags, int *ret_root_fd);
+
+/* The four calls above in one */
+int mstack_apply(const char *dir, int dir_fd, const char *where, const char *temp_mount_dir, int userns_fd, const ImagePolicy *image_policy, const ImageFilter *image_filter, MStackFlags flags, int *ret_root_fd);
+
+int mstack_is_read_only(MStack *mstack);
+int mstack_is_foreign_uid_owned(MStack *mstack);
+
+DECLARE_STRING_TABLE_LOOKUP_TO_STRING(mstack_mount_type, MStackMountType);
diff --git a/src/shared/shared-forward.h b/src/shared/shared-forward.h

index f35287746b073a57e6c102aea8ad6b8b27721675..82bdf86330faa876e5d307ed1c8a1ff7c675eff4 100644 (file)
--- a/src/shared/shared-forward.h
+++ b/src/shared/shared-forward.h
@@ -53,6 +53,7 @@ typedef struct Condition Condition;
  typedef struct ConfigSection ConfigSection;
  typedef struct ConfigTableItem ConfigTableItem;
  typedef struct CPUSet CPUSet;
+typedef struct DissectedImage DissectedImage;
  typedef struct DnsAnswer DnsAnswer;
  typedef struct DnsPacket DnsPacket;
  typedef struct DnsQuestion DnsQuestion;
@@ -64,6 +65,7 @@ typedef struct FDSet FDSet;
  typedef struct Fido2HmacSalt Fido2HmacSalt;
  typedef struct GroupRecord GroupRecord;
  typedef struct Image Image;
+typedef struct ImageFilter ImageFilter;
  typedef struct ImagePolicy ImagePolicy;
  typedef struct InstallChange InstallChange;
  typedef struct InstallInfo InstallInfo;
diff --git a/src/test/meson.build b/src/test/meson.build

index 3ca614db36283f3fa355e572471c8bf1f511f4ea..adbcd3c0d4dcf7612ade4dced101b2283ba7e9de 100644 (file)
--- a/src/test/meson.build
+++ b/src/test/meson.build
@@ -149,6 +149,7 @@ simple_tests += files(
          'test-mkdir.c',
          'test-modhex.c',
          'test-mountpoint-util.c',
+        'test-mstack.c',
          'test-namespace-util.c',
          'test-net-naming-scheme.c',
          'test-notify-recv.c',
diff --git a/src/test/test-mstack.c b/src/test/test-mstack.c

new file mode 100644 (file)

index 0000000..15400af
--- /dev/null
+++ b/src/test/test-mstack.c
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/mount.h>
+#include <sys/stat.h>
+
+#include "capability-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "mountpoint-util.h"
+#include "mstack.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "rm-rf.h"
+#include "tests.h"
+#include "tmpfile-util.h"
+#include "virt.h"
+
+static bool overlayfs_set_fd_lowerdir_plus_supported(void) {
+        int r;
+
+        _cleanup_close_ int sb_fd = fsopen("overlay", FSOPEN_CLOEXEC);
+        if (sb_fd < 0 && (ERRNO_IS_NOT_SUPPORTED(errno) || errno == ENODEV))
+                return false;
+        ASSERT_OK_ERRNO(sb_fd);
+
+        _cleanup_close_ int layer_fd = open("/", O_DIRECTORY|O_CLOEXEC);
+        ASSERT_OK_ERRNO(layer_fd);
+
+        r = RET_NERRNO(fsconfig(sb_fd, FSCONFIG_SET_FD, "lowerdir+", /* value= */ NULL, layer_fd));
+        if (r < 0 && (ERRNO_IS_NEG_NOT_SUPPORTED(r) || r == -EINVAL))
+                return false;
+
+        ASSERT_OK_ERRNO(r);
+        return true;
+}
+
+TEST(mstack) {
+        _cleanup_(rm_rf_physical_and_freep) char *t = NULL;
+        _cleanup_close_ int tfd = -EBADF;
+        int r;
+
+        tfd = mkdtemp_open("/tmp/mstack-what-XXXXXX", O_PATH, &t);
+        ASSERT_OK(tfd);
+
+        ASSERT_OK_ERRNO(mkdirat(tfd, "rw", 0755));
+        ASSERT_OK_ERRNO(mkdirat(tfd, "rw/data", 0755));
+        ASSERT_OK_ERRNO(mkdirat(tfd, "rw/data/check1", 0755));
+        ASSERT_OK_ERRNO(mkdirat(tfd, "layer@0", 0755));
+        ASSERT_OK_ERRNO(mkdirat(tfd, "layer@0/check2", 0755));
+        ASSERT_OK_ERRNO(mkdirat(tfd, "layer@0/zzz", 0755));
+        ASSERT_OK_ERRNO(mkdirat(tfd, "layer@1", 0755));
+        ASSERT_OK_ERRNO(mkdirat(tfd, "layer@1/check3", 0755));
+        ASSERT_OK_ERRNO(mkdirat(tfd, "layer@0/yyy", 0755));
+        ASSERT_OK_ERRNO(mkdirat(tfd, "bind@zzz", 0755));
+        ASSERT_OK_ERRNO(mkdirat(tfd, "bind@zzz/check4", 0755));
+        ASSERT_OK_ERRNO(mkdirat(tfd, "robind@yyy", 0755));
+        ASSERT_OK_ERRNO(mkdirat(tfd, "robind@yyy/check5", 0755));
+
+        _cleanup_(mstack_freep) MStack *mstack = NULL;
+        ASSERT_OK(mstack_load(t, tfd, &mstack));
+
+        ASSERT_OK_ZERO(mstack_is_read_only(mstack));
+        ASSERT_OK_ZERO(mstack_is_foreign_uid_owned(mstack));
+
+        if (!have_effective_cap(CAP_SYS_ADMIN))
+                return (void) log_tests_skipped("not attaching mstack, lacking privs");
+        if (!mount_new_api_supported())
+                return (void) log_tests_skipped("kernel does not support new mount API, skipping mstack attachment test.");
+        if (!overlayfs_set_fd_lowerdir_plus_supported())
+                return (void) log_tests_skipped("overlayfs does not support FSCONFIG_SET_FD with lowerdir+, skipping mstack attachment test.");
+        if (running_in_chroot() > 0) /* we cannot disable mount prop if we are in a chroot without the root inode being a proper mount point */
+                return (void) log_tests_skipped("running in chroot(), skipping mstack attachment test.");
+
+        mstack = mstack_free(mstack);
+
+        /* For with a new mountns */
+        r = pidref_safe_fork("(mstack-test", FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE, /* ret= */ NULL);
+        ASSERT_OK(r);
+
+        if (r == 0) {
+                MStackFlags flags = 0;
+
+                /* Close the original temporary fd, it still points to an inode of the original mountns,
+                 * which we cannot use to generate mounts from */
+                tfd = safe_close(tfd);
+
+                {
+                        ASSERT_OK(mstack_load(t, -EBADF, &mstack));
+
+                        ASSERT_OK(mstack_open_images(
+                                                  mstack,
+                                                  /* userns_fd= */ -EBADF,
+                                                  /* image_policy= */ NULL,
+                                                  /* image_filter= */ NULL,
+                                                  flags));
+
+                        _cleanup_(rmdir_and_freep) char *m = NULL;
+                        ASSERT_OK(mkdtemp_malloc("/tmp/mstack-temporary-XXXXXX", &m));
+
+                        ASSERT_OK(mstack_make_mounts(mstack, m, flags));
+
+                        _cleanup_(rmdir_and_freep) char *w = NULL;
+                        ASSERT_OK(mkdtemp_malloc("/tmp/mstack-where-XXXXXX", &w));
+
+                        _cleanup_close_ int rfd = -EBADF;
+                        ASSERT_OK(mstack_bind_mounts(mstack, w, /* where_fd= */ -EBADF, flags, &rfd));
+
+                        _cleanup_close_ int ofd = open(w, O_PATH|O_CLOEXEC);
+                        ASSERT_OK_ERRNO(ofd);
+
+                        ASSERT_OK_ERRNO(faccessat(ofd, "check1", F_OK, AT_SYMLINK_NOFOLLOW));
+                        ASSERT_OK_ERRNO(faccessat(ofd, "check2/", F_OK, AT_SYMLINK_NOFOLLOW));
+                        ASSERT_OK_ERRNO(faccessat(ofd, "check3/", F_OK, AT_SYMLINK_NOFOLLOW));
+                        ASSERT_OK_ERRNO(faccessat(ofd, "zzz/check4/", F_OK, AT_SYMLINK_NOFOLLOW));
+                        ASSERT_OK_ERRNO(faccessat(ofd, "yyy/check5/", F_OK, AT_SYMLINK_NOFOLLOW));
+
+                        _cleanup_free_ char *j = ASSERT_PTR(path_join(w, "zzz"));
+                        ASSERT_OK_ERRNO(umount2(j, MNT_DETACH));
+                        _cleanup_free_ char *jj = ASSERT_PTR(path_join(w, "yyy"));
+                        ASSERT_OK_ERRNO(umount2(jj, MNT_DETACH));
+                        ASSERT_OK_ERRNO(umount2(w, MNT_DETACH));
+                }
+
+                mstack = mstack_free(mstack);
+
+                _exit(EXIT_SUCCESS);
+        }
+}
+
+DEFINE_TEST_MAIN(LOG_INFO);
author	Lennart Poettering <lennart@amutable.com>
	Sun, 9 Nov 2025 20:16:44 +0000 (21:16 +0100)
committer	Lennart Poettering <lennart@amutable.com>
	Thu, 19 Feb 2026 14:05:15 +0000 (15:05 +0100)
README		patch \| blob \| blame \| history
src/shared/meson.build		patch \| blob \| blame \| history
src/shared/mstack.c	[new file with mode: 0644]	patch \| blob
src/shared/mstack.h	[new file with mode: 0644]	patch \| blob
src/shared/shared-forward.h		patch \| blob \| blame \| history
src/test/meson.build		patch \| blob \| blame \| history
src/test/test-mstack.c	[new file with mode: 0644]	patch \| blob