From: Luca Boccassi Date: Fri, 29 Sep 2023 00:50:15 +0000 (+0100) Subject: mount-util: use mount beneath to replace previous namespace mount X-Git-Tag: v255-rc1~232 X-Git-Url: http://git.ipfire.org/?p=thirdparty%2Fsystemd.git;a=commitdiff_plain;h=7c83d42ef8c875018918615599a4fecc3e4fbe6d mount-util: use mount beneath to replace previous namespace mount Instead of mounting over, do an atomic swap using mount beneath, if available. This way assets can be mounted again and again (e.g.: updates) without leaking mounts. --- diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index 7247f3d2fc6..d8319318a9c 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -1310,11 +1310,16 @@ node /org/freedesktop/systemd1 { TryRestartUnit() or ReloadOrTryRestartUnit() for the marked units. - BindMountUnit() can be used to bind mount new files or directories into - a running service mount namespace. + BindMountUnit() can be used to bind mount new files or directories into a + running service mount namespace. If supported by the kernel, any prior mount on the selected target + will be replaced by the new mount. If not supported, any prior mount will be over-mounted, but remain + pinned and inaccessible. + MountImageUnit() can be used to mount new images into a running service - mount namespace. + mount namespace. If supported by the kernel, any prior mount on the selected target will be replaced + by the new mount. If not supported, any prior mount will be over-mounted, but remain pinned and + inaccessible. KillUnit() may be used to kill (i.e. send a signal to) all processes of a unit. It takes the unit name, an enum who and a UNIX diff --git a/man/systemctl.xml b/man/systemctl.xml index d07c8eae28b..a673c18c0ee 100644 --- a/man/systemctl.xml +++ b/man/systemctl.xml @@ -663,6 +663,10 @@ Jan 12 10:46:45 example.com bluetoothd[8900]: gatt-time-server: Input/output err , , etc.) run in distinct namespaces. 
+ If supported by the kernel, any prior mount on the selected target will be replaced by the + new mount. If not supported, any prior mount will be over-mounted, but remain pinned and + inaccessible. + @@ -693,6 +697,10 @@ Jan 12 10:46:45 example.com bluetoothd[8900]: gatt-time-server: Input/output err , , etc.) run in distinct namespaces. + If supported by the kernel, any prior mount on the selected target will be replaced by the + new mount. If not supported, any prior mount will be over-mounted, but remain pinned and + inaccessible. + Example: systemctl mount-image foo.service /tmp/img.raw /var/lib/image root:ro,nosuid systemctl mount-image --mkdir bar.service /tmp/img.raw /var/lib/baz/img diff --git a/src/basic/missing_syscall.h b/src/basic/missing_syscall.h index ddee457f444..83ec137fa59 100644 --- a/src/basic/missing_syscall.h +++ b/src/basic/missing_syscall.h @@ -539,6 +539,10 @@ static inline int missing_open_tree( /* ======================================================================= */ +#ifndef MOVE_MOUNT_BENEATH +#define MOVE_MOUNT_BENEATH 0x00000200 +#endif + #if !HAVE_MOVE_MOUNT #ifndef MOVE_MOUNT_F_EMPTY_PATH diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c index 91d2ca3025a..aa81a4b9ba2 100644 --- a/src/shared/dissect-image.c +++ b/src/shared/dissect-image.c @@ -2007,8 +2007,12 @@ static int mount_partition( if (m->fsmount_fd >= 0) { /* Case #1: Attach existing fsmount fd to the file system */ - if (move_mount(m->fsmount_fd, "", -EBADF, p, MOVE_MOUNT_F_EMPTY_PATH) < 0) - return -errno; + r = mount_exchange_graceful( + m->fsmount_fd, + p, + FLAGS_SET(flags, DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE)); + if (r < 0) + return log_debug_errno(r, "Failed to mount image on '%s': %m", p); } else { assert(node); diff --git a/src/shared/dissect-image.h b/src/shared/dissect-image.h index f25686a7fd4..979fd384fe6 100644 --- a/src/shared/dissect-image.h +++ b/src/shared/dissect-image.h @@ -56,36 +56,37 @@ struct DissectedPartition { }) 
typedef enum DissectImageFlags { - DISSECT_IMAGE_DEVICE_READ_ONLY = 1 << 0, /* Make device read-only */ - DISSECT_IMAGE_DISCARD_ON_LOOP = 1 << 1, /* Turn on "discard" if on a loop device and file system supports it */ - DISSECT_IMAGE_DISCARD = 1 << 2, /* Turn on "discard" if file system supports it, on all block devices */ - DISSECT_IMAGE_DISCARD_ON_CRYPTO = 1 << 3, /* Turn on "discard" also on crypto devices */ - DISSECT_IMAGE_DISCARD_ANY = DISSECT_IMAGE_DISCARD_ON_LOOP | - DISSECT_IMAGE_DISCARD | - DISSECT_IMAGE_DISCARD_ON_CRYPTO, - DISSECT_IMAGE_GPT_ONLY = 1 << 4, /* Only recognize images with GPT partition tables */ - DISSECT_IMAGE_GENERIC_ROOT = 1 << 5, /* If no partition table or only single generic partition, assume it's the root fs */ - DISSECT_IMAGE_MOUNT_ROOT_ONLY = 1 << 6, /* Mount only the root and /usr partitions */ - DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY = 1 << 7, /* Mount only the non-root and non-/usr partitions */ - DISSECT_IMAGE_VALIDATE_OS = 1 << 8, /* Refuse mounting images that aren't identifiable as OS images */ - DISSECT_IMAGE_VALIDATE_OS_EXT = 1 << 9, /* Refuse mounting images that aren't identifiable as OS extension images */ - DISSECT_IMAGE_RELAX_VAR_CHECK = 1 << 10, /* Don't insist that the UUID of /var is hashed from /etc/machine-id */ - DISSECT_IMAGE_FSCK = 1 << 11, /* File system check the partition before mounting (no effect when combined with DISSECT_IMAGE_READ_ONLY) */ - DISSECT_IMAGE_NO_PARTITION_TABLE = 1 << 12, /* Only recognize single file system images */ - DISSECT_IMAGE_VERITY_SHARE = 1 << 13, /* When activating a verity device, reuse existing one if already open */ - DISSECT_IMAGE_MKDIR = 1 << 14, /* Make top-level directory to mount right before mounting, if missing */ - DISSECT_IMAGE_USR_NO_ROOT = 1 << 15, /* If no root fs is in the image, but /usr is, then allow this (so that we can mount the rootfs as tmpfs or so */ - DISSECT_IMAGE_REQUIRE_ROOT = 1 << 16, /* Don't accept disks without root partition (or at least /usr 
partition if DISSECT_IMAGE_USR_NO_ROOT is set) */ - DISSECT_IMAGE_MOUNT_READ_ONLY = 1 << 17, /* Make mounts read-only */ - DISSECT_IMAGE_READ_ONLY = DISSECT_IMAGE_DEVICE_READ_ONLY | - DISSECT_IMAGE_MOUNT_READ_ONLY, - DISSECT_IMAGE_GROWFS = 1 << 18, /* Grow file systems in partitions marked for that to the size of the partitions after mount */ - DISSECT_IMAGE_MOUNT_IDMAPPED = 1 << 19, /* Mount mounts with kernel 5.12-style userns ID mapping, if file system type doesn't support uid=/gid= */ - DISSECT_IMAGE_ADD_PARTITION_DEVICES = 1 << 20, /* Create partition devices via BLKPG_ADD_PARTITION */ - DISSECT_IMAGE_PIN_PARTITION_DEVICES = 1 << 21, /* Open dissected partitions and decrypted partitions and pin them by fd */ - DISSECT_IMAGE_RELAX_EXTENSION_CHECK = 1 << 22, /* Don't insist that the extension-release file name matches the image name */ - DISSECT_IMAGE_DISKSEQ_DEVNODE = 1 << 23, /* Prefer /dev/disk/by-diskseq/… device nodes */ - DISSECT_IMAGE_ALLOW_EMPTY = 1 << 24, /* Allow that no usable partitions is present */ + DISSECT_IMAGE_DEVICE_READ_ONLY = 1 << 0, /* Make device read-only */ + DISSECT_IMAGE_DISCARD_ON_LOOP = 1 << 1, /* Turn on "discard" if on a loop device and file system supports it */ + DISSECT_IMAGE_DISCARD = 1 << 2, /* Turn on "discard" if file system supports it, on all block devices */ + DISSECT_IMAGE_DISCARD_ON_CRYPTO = 1 << 3, /* Turn on "discard" also on crypto devices */ + DISSECT_IMAGE_DISCARD_ANY = DISSECT_IMAGE_DISCARD_ON_LOOP | + DISSECT_IMAGE_DISCARD | + DISSECT_IMAGE_DISCARD_ON_CRYPTO, + DISSECT_IMAGE_GPT_ONLY = 1 << 4, /* Only recognize images with GPT partition tables */ + DISSECT_IMAGE_GENERIC_ROOT = 1 << 5, /* If no partition table or only single generic partition, assume it's the root fs */ + DISSECT_IMAGE_MOUNT_ROOT_ONLY = 1 << 6, /* Mount only the root and /usr partitions */ + DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY = 1 << 7, /* Mount only the non-root and non-/usr partitions */ + DISSECT_IMAGE_VALIDATE_OS = 1 << 8, /* Refuse mounting 
images that aren't identifiable as OS images */ + DISSECT_IMAGE_VALIDATE_OS_EXT = 1 << 9, /* Refuse mounting images that aren't identifiable as OS extension images */ + DISSECT_IMAGE_RELAX_VAR_CHECK = 1 << 10, /* Don't insist that the UUID of /var is hashed from /etc/machine-id */ + DISSECT_IMAGE_FSCK = 1 << 11, /* File system check the partition before mounting (no effect when combined with DISSECT_IMAGE_READ_ONLY) */ + DISSECT_IMAGE_NO_PARTITION_TABLE = 1 << 12, /* Only recognize single file system images */ + DISSECT_IMAGE_VERITY_SHARE = 1 << 13, /* When activating a verity device, reuse existing one if already open */ + DISSECT_IMAGE_MKDIR = 1 << 14, /* Make top-level directory to mount right before mounting, if missing */ + DISSECT_IMAGE_USR_NO_ROOT = 1 << 15, /* If no root fs is in the image, but /usr is, then allow this (so that we can mount the rootfs as tmpfs or so */ + DISSECT_IMAGE_REQUIRE_ROOT = 1 << 16, /* Don't accept disks without root partition (or at least /usr partition if DISSECT_IMAGE_USR_NO_ROOT is set) */ + DISSECT_IMAGE_MOUNT_READ_ONLY = 1 << 17, /* Make mounts read-only */ + DISSECT_IMAGE_READ_ONLY = DISSECT_IMAGE_DEVICE_READ_ONLY | + DISSECT_IMAGE_MOUNT_READ_ONLY, + DISSECT_IMAGE_GROWFS = 1 << 18, /* Grow file systems in partitions marked for that to the size of the partitions after mount */ + DISSECT_IMAGE_MOUNT_IDMAPPED = 1 << 19, /* Mount mounts with kernel 5.12-style userns ID mapping, if file system type doesn't support uid=/gid= */ + DISSECT_IMAGE_ADD_PARTITION_DEVICES = 1 << 20, /* Create partition devices via BLKPG_ADD_PARTITION */ + DISSECT_IMAGE_PIN_PARTITION_DEVICES = 1 << 21, /* Open dissected partitions and decrypted partitions and pin them by fd */ + DISSECT_IMAGE_RELAX_EXTENSION_CHECK = 1 << 22, /* Don't insist that the extension-release file name matches the image name */ + DISSECT_IMAGE_DISKSEQ_DEVNODE = 1 << 23, /* Prefer /dev/disk/by-diskseq/… device nodes */ + DISSECT_IMAGE_ALLOW_EMPTY = 1 << 24, /* Allow that no usable 
partitions is present */ + DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE = 1 << 25, /* Try to mount the image beneath the specified mountpoint, rather than on top of it, and then umount the top */ } DissectImageFlags; struct DissectedImage { diff --git a/src/shared/mount-util.c b/src/shared/mount-util.c index 011508e632a..b6d2b6b6159 100644 --- a/src/shared/mount-util.c +++ b/src/shared/mount-util.c @@ -730,6 +730,45 @@ int umount_verbose( return 0; } +int mount_exchange_graceful(int fsmount_fd, const char *dest, bool mount_beneath) { + int r; + + assert(fsmount_fd >= 0); + assert(dest); + + /* First, try to mount beneath an existing mount point, and if that works, umount the old mount, + * which is now at the top. This will ensure we can atomically replace a mount. Note that this works + * also in the case where there are submounts down the tree. Mount propagation is allowed but + * restricted to layouts that don't end up propagating the new mount on top of the mount stack. If + * this is not supported (minimum kernel v6.5), or if there is no mount on the mountpoint, we get + * -EINVAL and then we fall back to normal mounting. */ + + r = RET_NERRNO(move_mount( + fsmount_fd, + /* from_path= */ "", + /* to_fd= */ -EBADF, + dest, + MOVE_MOUNT_F_EMPTY_PATH | (mount_beneath ?
MOVE_MOUNT_BENEATH : 0))); + if (mount_beneath) { + if (r == -EINVAL) { /* Fallback if mount_beneath is not supported */ + log_debug_errno(r, + "Failed to mount beneath '%s', falling back to overmount", + dest); + return RET_NERRNO(move_mount( + fsmount_fd, + /* from_path= */ "", + /* to_fd= */ -EBADF, + dest, + MOVE_MOUNT_F_EMPTY_PATH)); + } + + if (r >= 0) /* If it is, now remove the old mount */ + return umount_verbose(LOG_DEBUG, dest, UMOUNT_NOFOLLOW|MNT_DETACH); + } + + return r; +} + int mount_option_mangle( const char *options, unsigned long mount_flags, @@ -1155,7 +1194,7 @@ static int mount_in_namespace( (void) mkdir_parents(dest, 0755); if (img) { - DissectImageFlags f = 0; + DissectImageFlags f = DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE; if (make_file_or_directory) f |= DISSECT_IMAGE_MKDIR; @@ -1174,11 +1213,7 @@ static int mount_in_namespace( if (make_file_or_directory) (void) make_mount_point_inode_from_stat(&st, dest, 0700); - r = RET_NERRNO(move_mount(new_mount_fd, - "", - -EBADF, - dest, - MOVE_MOUNT_F_EMPTY_PATH)); + r = mount_exchange_graceful(new_mount_fd, dest, /* mount_beneath= */ true); } if (r < 0) { (void) write(errno_pipe_fd[1], &r, sizeof(r)); diff --git a/src/shared/mount-util.h b/src/shared/mount-util.h index a56530c42ea..7c0189480e3 100644 --- a/src/shared/mount-util.h +++ b/src/shared/mount-util.h @@ -68,6 +68,8 @@ int umount_verbose( const char *where, int flags); +int mount_exchange_graceful(int fsmount_fd, const char *dest, bool mount_beneath); + int mount_option_mangle( const char *options, unsigned long mount_flags, diff --git a/test/units/testsuite-23.runtime-bind-paths.sh b/test/units/testsuite-23.runtime-bind-paths.sh index 8dc4d9123c7..65c2dbf41db 100755 --- a/test/units/testsuite-23.runtime-bind-paths.sh +++ b/test/units/testsuite-23.runtime-bind-paths.sh @@ -23,8 +23,12 @@ systemctl start testsuite-23-namespaced.service # Ensure that inaccessible paths aren't bypassed by the runtime setup, (! 
systemctl bind --mkdir testsuite-23-namespaced.service /run/testsuite-23-marker-fixed /run/inaccessible/testfile-marker-fixed) +echo "MARKER_WRONG" >/run/testsuite-23-marker-wrong echo "MARKER_RUNTIME" >/run/testsuite-23-marker-runtime +# Mount twice to exercise mount-beneath (on kernel 6.5+, on older kernels it will just overmount) +systemctl bind --mkdir testsuite-23-namespaced.service /run/testsuite-23-marker-wrong /tmp/testfile-marker-runtime +test "$(systemctl show -P SubState testsuite-23-namespaced.service)" = "running" systemctl bind --mkdir testsuite-23-namespaced.service /run/testsuite-23-marker-runtime /tmp/testfile-marker-runtime timeout 10 bash -xec 'while [[ "$(systemctl show -P SubState testsuite-23-namespaced.service)" == running ]]; do sleep .5; done' diff --git a/test/units/testsuite-50.sh b/test/units/testsuite-50.sh index 3a719d20597..4142e1ebbc8 100755 --- a/test/units/testsuite-50.sh +++ b/test/units/testsuite-50.sh @@ -363,6 +363,11 @@ ExecStart=/bin/sh -c ' \\ EOF systemctl start testservice-50d.service +# Mount twice to exercise mount-beneath (on kernel 6.5+, on older kernels it will just overmount) +mkdir -p /tmp/wrong/foo +mksquashfs /tmp/wrong/foo /tmp/wrong.raw +systemctl mount-image --mkdir testservice-50d.service /tmp/wrong.raw /tmp/img +test "$(systemctl show -P SubState testservice-50d.service)" = "running" systemctl mount-image --mkdir testservice-50d.service "${image}.raw" /tmp/img root:nosuid while systemctl show -P SubState testservice-50d.service | grep -q running