]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
mount-util: use mount beneath to replace previous namespace mount
authorLuca Boccassi <bluca@debian.org>
Fri, 29 Sep 2023 00:50:15 +0000 (01:50 +0100)
committerLuca Boccassi <luca.boccassi@gmail.com>
Mon, 16 Oct 2023 13:33:47 +0000 (14:33 +0100)
Instead of mounting over, do an atomic swap using mount beneath, if
available. This way assets can be mounted again and again (e.g.:
updates) without leaking mounts.

man/org.freedesktop.systemd1.xml
man/systemctl.xml
src/basic/missing_syscall.h
src/shared/dissect-image.c
src/shared/dissect-image.h
src/shared/mount-util.c
src/shared/mount-util.h
test/units/testsuite-23.runtime-bind-paths.sh
test/units/testsuite-50.sh

index 7247f3d2fc6700ab61787092dbce297a2bd1a2c0..d8319318a9c0a4e09c2cd52eea59ddb099347cab 100644 (file)
@@ -1310,11 +1310,16 @@ node /org/freedesktop/systemd1 {
       <function>TryRestartUnit()</function> or <function>ReloadOrTryRestartUnit()</function> for the marked
       units.</para>
 
-      <para><function>BindMountUnit()</function> can be used to bind mount new files or directories into
-      a running service mount namespace.</para>
+      <para><function>BindMountUnit()</function> can be used to bind mount new files or directories into a
+      running service mount namespace. If supported by the kernel, any prior mount on the selected target
+      will be replaced by the new mount. If not supported, any prior mount will be over-mounted, but remain
+      pinned and inaccessible.
+      </para>
 
       <para><function>MountImageUnit()</function> can be used to mount new images into a running service
-      mount namespace.</para>
+      mount namespace. If supported by the kernel, any prior mount on the selected target will be replaced
+      by the new mount. If not supported, any prior mount will be over-mounted, but remain pinned and
+      inaccessible.</para>
 
       <para><function>KillUnit()</function> may be used to kill (i.e. send a signal to) all processes of a
       unit. It takes the unit <varname>name</varname>, an enum <varname>who</varname> and a UNIX
index d07c8eae28b929d8eda21522e0153e9c39227ef6..a673c18c0ee0bd55674e5d3f7e4fbc84d45b126a 100644 (file)
@@ -663,6 +663,10 @@ Jan 12 10:46:45 example.com bluetoothd[8900]: gatt-time-server: Input/output err
           <option>ExecReload=</option>, <option>ExecStartPre=</option>, etc.) run in distinct namespaces.
           </para>
 
+          <para>If supported by the kernel, any prior mount on the selected target will be replaced by the
+          new mount. If not supported, any prior mount will be over-mounted, but remain pinned and
+          inaccessible.</para>
+
           <xi:include href="version-info.xml" xpointer="v248"/></listitem>
         </varlistentry>
 
@@ -693,6 +697,10 @@ Jan 12 10:46:45 example.com bluetoothd[8900]: gatt-time-server: Input/output err
           <option>ExecReload=</option>, <option>ExecStartPre=</option>, etc.) run in distinct namespaces.
           </para>
 
+          <para>If supported by the kernel, any prior mount on the selected target will be replaced by the
+          new mount. If not supported, any prior mount will be over-mounted, but remain pinned and
+          inaccessible.</para>
+
           <para>Example:
           <programlisting>systemctl mount-image foo.service /tmp/img.raw /var/lib/image root:ro,nosuid</programlisting>
           <programlisting>systemctl mount-image --mkdir bar.service /tmp/img.raw /var/lib/baz/img</programlisting>
index ddee457f4449d710f531368c8de42e75358e6a22..83ec137fa59288b2d19e678a54a7d91141fa9162 100644 (file)
@@ -539,6 +539,10 @@ static inline int missing_open_tree(
 
 /* ======================================================================= */
 
+#ifndef MOVE_MOUNT_BENEATH
+#define MOVE_MOUNT_BENEATH 0x00000200
+#endif
+
 #if !HAVE_MOVE_MOUNT
 
 #ifndef MOVE_MOUNT_F_EMPTY_PATH
index 91d2ca3025ab9047c8f3547b576bd5692e5a6982..aa81a4b9ba21b61d15ff777b55569284d50d62a1 100644 (file)
@@ -2007,8 +2007,12 @@ static int mount_partition(
                 if (m->fsmount_fd >= 0) {
                         /* Case #1: Attach existing fsmount fd to the file system */
 
-                        if (move_mount(m->fsmount_fd, "", -EBADF, p, MOVE_MOUNT_F_EMPTY_PATH) < 0)
-                                return -errno;
+                        r = mount_exchange_graceful(
+                                        m->fsmount_fd,
+                                        p,
+                                        FLAGS_SET(flags, DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE));
+                        if (r < 0)
+                                return log_debug_errno(r, "Failed to mount image on '%s': %m", p);
 
                 } else {
                         assert(node);
index f25686a7fd4bd9413cc5fd68fdbf89fcd3d6ba68..979fd384fe665d2b0fe30673365d5de5bc2e39b6 100644 (file)
@@ -56,36 +56,37 @@ struct DissectedPartition {
         })
 
 typedef enum DissectImageFlags {
-        DISSECT_IMAGE_DEVICE_READ_ONLY         = 1 << 0,  /* Make device read-only */
-        DISSECT_IMAGE_DISCARD_ON_LOOP          = 1 << 1,  /* Turn on "discard" if on a loop device and file system supports it */
-        DISSECT_IMAGE_DISCARD                  = 1 << 2,  /* Turn on "discard" if file system supports it, on all block devices */
-        DISSECT_IMAGE_DISCARD_ON_CRYPTO        = 1 << 3,  /* Turn on "discard" also on crypto devices */
-        DISSECT_IMAGE_DISCARD_ANY              = DISSECT_IMAGE_DISCARD_ON_LOOP |
-                                                 DISSECT_IMAGE_DISCARD |
-                                                 DISSECT_IMAGE_DISCARD_ON_CRYPTO,
-        DISSECT_IMAGE_GPT_ONLY                 = 1 << 4,  /* Only recognize images with GPT partition tables */
-        DISSECT_IMAGE_GENERIC_ROOT             = 1 << 5,  /* If no partition table or only single generic partition, assume it's the root fs */
-        DISSECT_IMAGE_MOUNT_ROOT_ONLY          = 1 << 6,  /* Mount only the root and /usr partitions */
-        DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY      = 1 << 7,  /* Mount only the non-root and non-/usr partitions */
-        DISSECT_IMAGE_VALIDATE_OS              = 1 << 8,  /* Refuse mounting images that aren't identifiable as OS images */
-        DISSECT_IMAGE_VALIDATE_OS_EXT          = 1 << 9,  /* Refuse mounting images that aren't identifiable as OS extension images */
-        DISSECT_IMAGE_RELAX_VAR_CHECK          = 1 << 10, /* Don't insist that the UUID of /var is hashed from /etc/machine-id */
-        DISSECT_IMAGE_FSCK                     = 1 << 11, /* File system check the partition before mounting (no effect when combined with DISSECT_IMAGE_READ_ONLY) */
-        DISSECT_IMAGE_NO_PARTITION_TABLE       = 1 << 12, /* Only recognize single file system images */
-        DISSECT_IMAGE_VERITY_SHARE             = 1 << 13, /* When activating a verity device, reuse existing one if already open */
-        DISSECT_IMAGE_MKDIR                    = 1 << 14, /* Make top-level directory to mount right before mounting, if missing */
-        DISSECT_IMAGE_USR_NO_ROOT              = 1 << 15, /* If no root fs is in the image, but /usr is, then allow this (so that we can mount the rootfs as tmpfs or so */
-        DISSECT_IMAGE_REQUIRE_ROOT             = 1 << 16, /* Don't accept disks without root partition (or at least /usr partition if DISSECT_IMAGE_USR_NO_ROOT is set) */
-        DISSECT_IMAGE_MOUNT_READ_ONLY          = 1 << 17, /* Make mounts read-only */
-        DISSECT_IMAGE_READ_ONLY                = DISSECT_IMAGE_DEVICE_READ_ONLY |
-                                                 DISSECT_IMAGE_MOUNT_READ_ONLY,
-        DISSECT_IMAGE_GROWFS                   = 1 << 18, /* Grow file systems in partitions marked for that to the size of the partitions after mount */
-        DISSECT_IMAGE_MOUNT_IDMAPPED           = 1 << 19, /* Mount mounts with kernel 5.12-style userns ID mapping, if file system type doesn't support uid=/gid= */
-        DISSECT_IMAGE_ADD_PARTITION_DEVICES    = 1 << 20, /* Create partition devices via BLKPG_ADD_PARTITION */
-        DISSECT_IMAGE_PIN_PARTITION_DEVICES    = 1 << 21, /* Open dissected partitions and decrypted partitions and pin them by fd */
-        DISSECT_IMAGE_RELAX_EXTENSION_CHECK    = 1 << 22, /* Don't insist that the extension-release file name matches the image name */
-        DISSECT_IMAGE_DISKSEQ_DEVNODE          = 1 << 23, /* Prefer /dev/disk/by-diskseq/… device nodes */
-        DISSECT_IMAGE_ALLOW_EMPTY              = 1 << 24, /* Allow that no usable partitions is present */
+        DISSECT_IMAGE_DEVICE_READ_ONLY          = 1 << 0,  /* Make device read-only */
+        DISSECT_IMAGE_DISCARD_ON_LOOP           = 1 << 1,  /* Turn on "discard" if on a loop device and file system supports it */
+        DISSECT_IMAGE_DISCARD                   = 1 << 2,  /* Turn on "discard" if file system supports it, on all block devices */
+        DISSECT_IMAGE_DISCARD_ON_CRYPTO         = 1 << 3,  /* Turn on "discard" also on crypto devices */
+        DISSECT_IMAGE_DISCARD_ANY               = DISSECT_IMAGE_DISCARD_ON_LOOP |
+                                                  DISSECT_IMAGE_DISCARD |
+                                                  DISSECT_IMAGE_DISCARD_ON_CRYPTO,
+        DISSECT_IMAGE_GPT_ONLY                  = 1 << 4,  /* Only recognize images with GPT partition tables */
+        DISSECT_IMAGE_GENERIC_ROOT              = 1 << 5,  /* If no partition table or only single generic partition, assume it's the root fs */
+        DISSECT_IMAGE_MOUNT_ROOT_ONLY           = 1 << 6,  /* Mount only the root and /usr partitions */
+        DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY       = 1 << 7,  /* Mount only the non-root and non-/usr partitions */
+        DISSECT_IMAGE_VALIDATE_OS               = 1 << 8,  /* Refuse mounting images that aren't identifiable as OS images */
+        DISSECT_IMAGE_VALIDATE_OS_EXT           = 1 << 9,  /* Refuse mounting images that aren't identifiable as OS extension images */
+        DISSECT_IMAGE_RELAX_VAR_CHECK           = 1 << 10, /* Don't insist that the UUID of /var is hashed from /etc/machine-id */
+        DISSECT_IMAGE_FSCK                      = 1 << 11, /* File system check the partition before mounting (no effect when combined with DISSECT_IMAGE_READ_ONLY) */
+        DISSECT_IMAGE_NO_PARTITION_TABLE        = 1 << 12, /* Only recognize single file system images */
+        DISSECT_IMAGE_VERITY_SHARE              = 1 << 13, /* When activating a verity device, reuse existing one if already open */
+        DISSECT_IMAGE_MKDIR                     = 1 << 14, /* Make top-level directory to mount right before mounting, if missing */
+        DISSECT_IMAGE_USR_NO_ROOT               = 1 << 15, /* If no root fs is in the image, but /usr is, then allow this (so that we can mount the rootfs as tmpfs or so */
+        DISSECT_IMAGE_REQUIRE_ROOT              = 1 << 16, /* Don't accept disks without root partition (or at least /usr partition if DISSECT_IMAGE_USR_NO_ROOT is set) */
+        DISSECT_IMAGE_MOUNT_READ_ONLY           = 1 << 17, /* Make mounts read-only */
+        DISSECT_IMAGE_READ_ONLY                 = DISSECT_IMAGE_DEVICE_READ_ONLY |
+                                                  DISSECT_IMAGE_MOUNT_READ_ONLY,
+        DISSECT_IMAGE_GROWFS                    = 1 << 18, /* Grow file systems in partitions marked for that to the size of the partitions after mount */
+        DISSECT_IMAGE_MOUNT_IDMAPPED            = 1 << 19, /* Mount mounts with kernel 5.12-style userns ID mapping, if file system type doesn't support uid=/gid= */
+        DISSECT_IMAGE_ADD_PARTITION_DEVICES     = 1 << 20, /* Create partition devices via BLKPG_ADD_PARTITION */
+        DISSECT_IMAGE_PIN_PARTITION_DEVICES     = 1 << 21, /* Open dissected partitions and decrypted partitions and pin them by fd */
+        DISSECT_IMAGE_RELAX_EXTENSION_CHECK     = 1 << 22, /* Don't insist that the extension-release file name matches the image name */
+        DISSECT_IMAGE_DISKSEQ_DEVNODE           = 1 << 23, /* Prefer /dev/disk/by-diskseq/… device nodes */
+        DISSECT_IMAGE_ALLOW_EMPTY               = 1 << 24, /* Allow that no usable partitions is present */
+        DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE = 1 << 25, /* Try to mount the image beneath the specified mountpoint, rather than on top of it, and then umount the top */
 } DissectImageFlags;
 
 struct DissectedImage {
index 011508e632a30214f505a907f7c467105c165939..b6d2b6b61592e1438fb696a5b298d1fb0bafb469 100644 (file)
@@ -730,6 +730,45 @@ int umount_verbose(
         return 0;
 }
 
+int mount_exchange_graceful(int fsmount_fd, const char *dest, bool mount_beneath) {
+        int r;
+
+        assert(fsmount_fd >= 0);
+        assert(dest);
+
+        /* First, try to mount beneath an existing mount point, and if that works, umount the old mount,
+         * which is now at the top. This will ensure we can atomically replace a mount. Note that this works
+         * also in the case where there are submounts down the tree. Mount propagation is allowed but
+         * restricted to layouts that don't end up propagating the new mount on top of the mount stack. If
+         * this is not supported (minimum kernel v6.5), or if there is no mount on the mountpoint, we get
+         * -EINVAL and then we fall back to normal mounting. Returns 0 or a negative errno-style error. */
+
+        r = RET_NERRNO(move_mount(
+                        fsmount_fd,
+                        /* from_path= */ "",
+                        /* to_fd= */ -EBADF,
+                        dest,
+                        MOVE_MOUNT_F_EMPTY_PATH | (mount_beneath ? MOVE_MOUNT_BENEATH : 0)));
+        if (mount_beneath) {
+                if (r == -EINVAL) { /* Fall back to a plain overmount if mounting beneath is not possible */
+                        log_debug_errno(r,
+                                        "Failed to mount beneath '%s', falling back to overmount: %m",
+                                        dest);
+                        return RET_NERRNO(move_mount(
+                                        fsmount_fd,
+                                        /* from_path= */ "",
+                                        /* to_fd= */ -EBADF,
+                                        dest,
+                                        MOVE_MOUNT_F_EMPTY_PATH));
+                }
+
+                if (r >= 0) /* Mounting beneath worked: now detach the old mount, which sits on top */
+                        return umount_verbose(LOG_DEBUG, dest, UMOUNT_NOFOLLOW|MNT_DETACH);
+        }
+
+        return r;
+}
+
 int mount_option_mangle(
                 const char *options,
                 unsigned long mount_flags,
@@ -1155,7 +1194,7 @@ static int mount_in_namespace(
                         (void) mkdir_parents(dest, 0755);
 
                 if (img) {
-                        DissectImageFlags f = 0;
+                        DissectImageFlags f = DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE;
 
                         if (make_file_or_directory)
                                 f |= DISSECT_IMAGE_MKDIR;
@@ -1174,11 +1213,7 @@ static int mount_in_namespace(
                         if (make_file_or_directory)
                                 (void) make_mount_point_inode_from_stat(&st, dest, 0700);
 
-                        r = RET_NERRNO(move_mount(new_mount_fd,
-                                                  "",
-                                                  -EBADF,
-                                                  dest,
-                                                  MOVE_MOUNT_F_EMPTY_PATH));
+                        r = mount_exchange_graceful(new_mount_fd, dest, /* mount_beneath= */ true);
                 }
                 if (r < 0) {
                         (void) write(errno_pipe_fd[1], &r, sizeof(r));
index a56530c42ea6784757c57593d01e77c8f9d5ea0d..7c0189480e314ff854efc08270e675381109de27 100644 (file)
@@ -68,6 +68,8 @@ int umount_verbose(
                 const char *where,
                 int flags);
 
+int mount_exchange_graceful(int fsmount_fd, const char *dest, bool mount_beneath);
+
 int mount_option_mangle(
                 const char *options,
                 unsigned long mount_flags,
index 8dc4d9123c7eef29aa38271d4f388ca6e15e980b..65c2dbf41dbde988b34f2af4569ac610b8eda4ec 100755 (executable)
@@ -23,8 +23,12 @@ systemctl start testsuite-23-namespaced.service
 # Ensure that inaccessible paths aren't bypassed by the runtime setup,
 (! systemctl bind --mkdir testsuite-23-namespaced.service /run/testsuite-23-marker-fixed /run/inaccessible/testfile-marker-fixed)
 
+echo "MARKER_WRONG" >/run/testsuite-23-marker-wrong
 echo "MARKER_RUNTIME" >/run/testsuite-23-marker-runtime
 
+# Mount twice to exercise mount-beneath (on kernel 6.5+, on older kernels it will just overmount)
+systemctl bind --mkdir testsuite-23-namespaced.service /run/testsuite-23-marker-wrong /tmp/testfile-marker-runtime
+test "$(systemctl show -P SubState testsuite-23-namespaced.service)" = "running"
 systemctl bind --mkdir testsuite-23-namespaced.service /run/testsuite-23-marker-runtime /tmp/testfile-marker-runtime
 
 timeout 10 bash -xec 'while [[ "$(systemctl show -P SubState testsuite-23-namespaced.service)" == running ]]; do sleep .5; done'
index 3a719d20597fbb09f6434c1dadf265a6bf14a709..4142e1ebbc8cea32f5fcddafdabda13b88e1acd4 100755 (executable)
@@ -363,6 +363,11 @@ ExecStart=/bin/sh -c ' \\
 EOF
 systemctl start testservice-50d.service
 
+# Mount twice to exercise mount-beneath (on kernel 6.5+, on older kernels it will just overmount)
+mkdir -p /tmp/wrong/foo
+mksquashfs /tmp/wrong/foo /tmp/wrong.raw
+systemctl mount-image --mkdir testservice-50d.service /tmp/wrong.raw /tmp/img
+test "$(systemctl show -P SubState testservice-50d.service)" = "running"
 systemctl mount-image --mkdir testservice-50d.service "${image}.raw" /tmp/img root:nosuid
 
 while systemctl show -P SubState testservice-50d.service | grep -q running