]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
tree-wide: take BSD lock on loopback devices we dissect/mount/operate on
authorLennart Poettering <lennart@poettering.net>
Thu, 7 Apr 2022 14:09:45 +0000 (16:09 +0200)
committerYu Watanabe <watanabe.yu+github@gmail.com>
Sun, 10 Apr 2022 13:52:29 +0000 (22:52 +0900)
So here's something we should always keep in mind:

systemd-udevd actually does *two* things with BSD file locks on block
devices:

1. While it probes a device it takes a LOCK_SH lock. Thus everyone else
   taking a LOCK_EX lock will temporarily block udev from probing
   devices, which is good when making changes to it.

2. Whenever a device is closed after write (detected via inotify), udevd
   will issue BLKRRPART (requesting the kernel to reread the partition
   table). It does this while holding a LOCK_EX lock on the block
   device. Thus anyone else taking LOCK_SH or LOCK_EX will temporarily
   block udevd from issuing that ioctl. And that's quite relevant, since
   the kernel will temporarily flush out all partitions while re-reading
   the partition table and then create them anew. Thus it is smart to
   take LOCK_SH when dissecting a block device to ensure that no
   BLKRRPART is issued in the background, until we mounted the devices.

src/core/namespace.c
src/dissect/dissect.c
src/gpt-auto-generator/gpt-auto-generator.c
src/nspawn/nspawn.c
src/portable/portable.c
src/shared/discover-image.c
src/shared/dissect-image.c
src/sysext/sysext.c
src/test/test-loop-block.c

index e6aa7b6473eaa602abe1c609a800f94e1061b50a..5fa7a59bc5959e99648f25483253312f7c92136c 100644 (file)
@@ -2055,6 +2055,12 @@ int setup_namespace(
                 if (r < 0)
                         return log_debug_errno(r, "Failed to create loop device for root image: %m");
 
+                /* Make sure udevd won't issue BLKRRPART (which might flush out the loaded partition table)
+                 * while we are still trying to mount things */
+                r = loop_device_flock(loop_device, LOCK_SH);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to lock loopback device with LOCK_SH: %m");
+
                 r = dissect_image(
                                 loop_device->fd,
                                 &verity,
@@ -2403,6 +2409,14 @@ int setup_namespace(
                         goto finish;
                 }
 
+                /* Now release the block device lock, so that udevd is free to call BLKRRPART on the device
+                 * if it likes. */
+                r = loop_device_flock(loop_device, LOCK_UN);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to release lock on loopback block device: %m");
+                        goto finish;
+                }
+
                 if (decrypted_image) {
                         r = decrypted_image_relinquish(decrypted_image);
                         if (r < 0) {
index 1d78dc50858336413865342620eeab10a44e4ffb..51e3c4b83769ea59801390a4961f3c7654ac2e6a 100644 (file)
@@ -639,6 +639,10 @@ static int action_mount(DissectedImage *m, LoopDevice *d) {
         if (r < 0)
                 return r;
 
+        r = loop_device_flock(d, LOCK_UN);
+        if (r < 0)
+                return log_error_errno(r, "Failed to unlock loopback block device: %m");
+
         if (di) {
                 r = decrypted_image_relinquish(di);
                 if (r < 0)
@@ -687,6 +691,10 @@ static int action_copy(DissectedImage *m, LoopDevice *d) {
 
         mounted_dir = TAKE_PTR(created_dir);
 
+        r = loop_device_flock(d, LOCK_UN);
+        if (r < 0)
+                return log_error_errno(r, "Failed to unlock loopback block device: %m");
+
         if (di) {
                 r = decrypted_image_relinquish(di);
                 if (r < 0)
@@ -845,6 +853,12 @@ static int run(int argc, char *argv[]) {
         if (r < 0)
                 return log_error_errno(r, "Failed to set up loopback device for %s: %m", arg_image);
 
+        /* Make sure udevd doesn't issue BLKRRPART underneath us thus making devices disappear in the middle,
+         * that we assume already are there. */
+        r = loop_device_flock(d, LOCK_SH);
+        if (r < 0)
+                return log_error_errno(r, "Failed to lock loopback device: %m");
+
         r = dissect_image_and_warn(
                         d->fd,
                         arg_image,
index 08bb9ac7249bbac11d8ee996e6b95743990da310..4731687b7f8f4da172325f52fad83bd13ed15a6b 100644 (file)
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */
 
 #include <stdlib.h>
+#include <sys/file.h>
 #include <unistd.h>
 
 #include "sd-device.h"
@@ -696,6 +697,12 @@ static int enumerate_partitions(dev_t devnum) {
         if (r <= 0)
                 return r;
 
+        /* Let's take a LOCK_SH lock on the block device, in case udevd is already running. If we don't take
+         * the lock, udevd might end up issuing BLKRRPART in the middle, and we don't want that, since that
+         * might remove all partitions while we are operating on them. */
+        if (flock(fd, LOCK_SH) < 0)
+                return log_error_errno(errno, "Failed to lock root block device: %m");
+
         r = dissect_image(
                         fd,
                         NULL, NULL,
index a5b826530361fcaa38c691c3c42c120177ffd7f6..807c6669807d299f3c41e1bd122ba4f716dad67b 100644 (file)
@@ -5737,6 +5737,13 @@ static int run(int argc, char *argv[]) {
                         goto finish;
                 }
 
+                /* Take a LOCK_SH lock on the device, so that udevd doesn't issue BLKRRPART in our back */
+                r = loop_device_flock(loop, LOCK_SH);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to take lock on loopback block device: %m");
+                        goto finish;
+                }
+
                 r = dissect_image_and_warn(
                                 loop->fd,
                                 arg_image,
index 0c10b161f01a000cf599d327c66cd8409515adf3..7bba7b47e42a6d36a216d8c1c50705167e0f3d06 100644 (file)
@@ -359,6 +359,10 @@ static int portable_extract_by_path(
                 /* We now have a loopback block device, let's fork off a child in its own mount namespace, mount it
                  * there, and extract the metadata we need. The metadata is sent from the child back to us. */
 
+                r = loop_device_flock(d, LOCK_SH);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to acquire lock on loopback block device: %m");
+
                 BLOCK_SIGNALS(SIGCHLD);
 
                 r = mkdtemp_malloc("/tmp/inspect-XXXXXX", &tmpdir);
index 3f1644ea859949fb7e9a7f8ab6ea35233ccb0262..ea0afb81a5042c5c6ca8ab5d60bd0aa8c4167c18 100644 (file)
@@ -1196,6 +1196,12 @@ int image_read_metadata(Image *i) {
                 if (r < 0)
                         return r;
 
+                /* Make sure udevd doesn't issue BLKRRPART in the background which might make our partitions
+                 * disappear temporarily. */
+                r = loop_device_flock(d, LOCK_SH);
+                if (r < 0)
+                        return r;
+
                 r = dissect_image(
                                 d->fd,
                                 NULL, NULL,
index a84df05e3fddf16f9cf9043039f8847a0be7fac6..25fed4cf508ea0ca8d6dd8fabf574cf788cf86cc 100644 (file)
@@ -2980,6 +2980,11 @@ int mount_image_privately_interactively(
         if (r < 0)
                 return log_error_errno(r, "Failed to set up loopback device for %s: %m", image);
 
+        /* Make sure udevd doesn't issue BLKRRPART behind our backs */
+        r = loop_device_flock(d, LOCK_SH);
+        if (r < 0)
+                return r;
+
         r = dissect_image_and_warn(d->fd, image, &verity, NULL, d->diskseq, d->uevent_seqnum_not_before, d->timestamp_not_before, flags, &dissected_image);
         if (r < 0)
                 return r;
@@ -3006,6 +3011,10 @@ int mount_image_privately_interactively(
         if (r < 0)
                 return r;
 
+        r = loop_device_flock(d, LOCK_UN);
+        if (r < 0)
+                return r;
+
         if (decrypted_image) {
                 r = decrypted_image_relinquish(decrypted_image);
                 if (r < 0)
@@ -3086,6 +3095,10 @@ int verity_dissect_and_mount(
         if (r < 0)
                 return log_debug_errno(r, "Failed to create loop device for image: %m");
 
+        r = loop_device_flock(loop_device, LOCK_SH);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to lock loop device: %m");
+
         r = dissect_image(
                         loop_device->fd,
                         &verity,
@@ -3133,6 +3146,10 @@ int verity_dissect_and_mount(
         if (r < 0)
                 return log_debug_errno(r, "Failed to mount image: %m");
 
+        r = loop_device_flock(loop_device, LOCK_UN);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to unlock loopback device: %m");
+
         /* If we got os-release values from the caller, then we need to match them with the image's
          * extension-release.d/ content. Return -EINVAL if there's any mismatch.
          * First, check the distro ID. If that matches, then check the new SYSEXT_LEVEL value if
index 6d4df0afd27f75495716f638794317b2f3558ac6..ccc0bd2687e43043bb40df7de6feabc2c27d8c88 100644 (file)
@@ -533,6 +533,10 @@ static int merge_subprocess(Hashmap *images, const char *workspace) {
                         if (r < 0)
                                 return log_error_errno(r, "Failed to set up loopback device for %s: %m", img->path);
 
+                        r = loop_device_flock(d, LOCK_SH);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to lock loopback device: %m");
+
                         r = dissect_image_and_warn(
                                         d->fd,
                                         img->path,
index 714d51cfc2a8142615d5fe35cb4de5ca4bc99861..d1793222f0ee07f8e8b72d06f135a687a488ddab 100644 (file)
@@ -55,6 +55,9 @@ static void* thread_func(void *ptr) {
 
                 log_notice("Acquired loop device %s, will mount on %s", loop->node, mounted);
 
+                /* Let's make sure udev doesn't call BLKRRPART in the background, while we try to mount the device. */
+                assert_se(loop_device_flock(loop, LOCK_SH) >= 0);
+
                 r = dissect_image(loop->fd, NULL, NULL, loop->diskseq, loop->uevent_seqnum_not_before, loop->timestamp_not_before, DISSECT_IMAGE_READ_ONLY, &dissected);
                 if (r < 0)
                         log_error_errno(r, "Failed dissect loopback device %s: %m", loop->node);
@@ -85,6 +88,10 @@ static void* thread_func(void *ptr) {
                 log_notice_errno(r, "Mounted %s → %s: %m", loop->node, mounted);
                 assert_se(r >= 0);
 
+                /* Now the block device is mounted, we don't need no manual lock anymore, the devices are now
+                 * pinned by the mounts. */
+                assert_se(loop_device_flock(loop, LOCK_UN) >= 0);
+
                 log_notice("Unmounting %s", mounted);
                 mounted = umount_and_rmdir_and_free(mounted);
 
@@ -215,6 +222,11 @@ static int run(int argc, char *argv[]) {
         pthread_t threads[arg_n_threads];
         sd_id128_t id;
 
+        /* Take an explicit lock while we format the file systems, in accordance with
+         * https://systemd.io/BLOCK_DEVICE_LOCKING/. We don't want udev to interfere and probe while we write
+         * or even issue BLKRRPART or similar while we are working on this. */
+        assert_se(loop_device_flock(loop, LOCK_EX) >= 0);
+
         assert_se(dissect_image(loop->fd, NULL, NULL, loop->diskseq, loop->uevent_seqnum_not_before, loop->timestamp_not_before, 0, &dissected) >= 0);
 
         assert_se(dissected->partitions[PARTITION_ESP].found);
@@ -243,9 +255,21 @@ static int run(int argc, char *argv[]) {
 
         assert_se(mkdtemp_malloc(NULL, &mounted) >= 0);
 
+        /* We are particularly correct here, and now downgrade LOCK → LOCK_SH. That's because we are done
+         * with formatting the file systems, so we don't need the exclusive lock anymore. From now on a
+         * shared one is fine. This way udev can now probe the device if it wants, but still won't call
+         * BLKRRPART on it, and that's good, because that would destroy our partition table while we are at
+         * it. */
+        assert_se(loop_device_flock(loop, LOCK_SH) >= 0);
+
         /* This first (writable) mount will initialize the mount point dirs, so that the subsequent read-only ones can work */
         assert_se(dissected_image_mount(dissected, mounted, UID_INVALID, UID_INVALID, 0) >= 0);
 
+        /* Now we mounted everything, the partitions are pinned. Now it's fine to release the lock
+         * fully. This means udev could now issue BLKRRPART again, but that's OK given this will fail because
+         * we now mounted the device. */
+        assert_se(loop_device_flock(loop, LOCK_UN) >= 0);
+
         assert_se(umount_recursive(mounted, 0) >= 0);
         loop = loop_device_unref(loop);