]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
loop-util: create loop device for block devices with sector size mismatch
authorDaan De Meyer <daan@amutable.com>
Mon, 30 Mar 2026 17:23:48 +0000 (19:23 +0200)
committerDaan De Meyer <daan.j.demeyer@gmail.com>
Thu, 2 Apr 2026 16:08:25 +0000 (18:08 +0200)
Previously, loop_device_make_internal() always used the block device
directly (via loop_device_open_from_fd()) for whole-device access,
regardless of sector size. This is incorrect when the GPT partition
table was written with a different sector size than the device reports,
as happens with CD-ROM/ISO boot via El Torito: the device has
2048-byte blocks but the GPT uses 512-byte sectors.

Restructure the sector size handling in loop_device_make_internal():

- Move GPT sector size probing (UINT32_MAX case) before the
  block-vs-regular-file split so both paths share the same logic and
  O_DIRECT handling. Check f_flags instead of loop_flags for O_DIRECT
  detection, since we're probing the original fd before any reopening.

- For block devices, get the device sector size and compare it against
  the resolved sector_size. Only use the block device directly when
  sector sizes match. When they differ (probed GPT mismatch or explicit
  sector size request), fall through to create a real loop device with
  the correct sector size.

- Default sector_size=0 to the device sector size for block devices
  (instead of always 512), so "no preference" correctly matches the
  device's sector size.

Co-developed-by: Claude Opus 4.6 <noreply@anthropic.com>
src/shared/loop-util.c
src/test/test-loop-util.c

index 4a759f4a7e24e522f3d6afa835a1bf4bd26827c4..0ab5d5fd8cf8e66ca8ad1262636d4ebbcbc1efb9 100644 (file)
@@ -413,6 +413,33 @@ static int fd_set_max_discard(int fd, uint64_t max_discard) {
         return write_string_filef(sysfs_path, WRITE_STRING_FILE_DISABLE_BUFFER, "%" PRIu64, max_discard);
 }
 
+static int probe_sector_size_harder(int fd, uint32_t *ret) {
+        _cleanup_close_ int non_direct_io_fd = -EBADF;
+        int probe_fd, f_flags;
+
+        assert(fd >= 0);
+        assert(ret);
+
+        /* Wraps probe_sector_size() but handles O_DIRECT: if the fd is opened with O_DIRECT there are
+         * strict alignment requirements for reads, so we temporarily reopen it without O_DIRECT for the
+         * probing logic. */
+
+        f_flags = fcntl(fd, F_GETFL);
+        if (f_flags < 0)
+                return -errno;
+
+        if (FLAGS_SET(f_flags, O_DIRECT)) {
+                non_direct_io_fd = fd_reopen(fd, O_RDONLY|O_CLOEXEC|O_NONBLOCK);
+                if (non_direct_io_fd < 0)
+                        return non_direct_io_fd;
+
+                probe_fd = non_direct_io_fd;
+        } else
+                probe_fd = fd;
+
+        return probe_sector_size(probe_fd, ret);
+}
+
 static int loop_device_make_internal(
                 const char *path,
                 int fd,
@@ -435,6 +462,11 @@ static int loop_device_make_internal(
         assert(open_flags < 0 || IN_SET(open_flags, O_RDWR, O_RDONLY));
         assert(ret);
 
+        /* sector_size interpretation:
+         *   0          → use device sector size for block devices, 512 for regular files
+         *   UINT32_MAX → probe GPT header to find the right sector size, fall back to 0 behavior
+         *   other      → use the specified sector size explicitly */
+
         f_flags = fcntl(fd, F_GETFL);
         if (f_flags < 0)
                 return -errno;
@@ -449,19 +481,45 @@ static int loop_device_make_internal(
                         return log_debug_errno(SYNTHETIC_ERRNO(EBADFD), "Access mode of image file is write only (?)");
         }
 
+        if (sector_size == UINT32_MAX) {
+                /* If sector size is specified as UINT32_MAX, we'll try to probe the right sector size
+                 * by looking for the GPT partition header at various offsets. This of course only works
+                 * if the image already has a disk label. */
+
+                r = probe_sector_size_harder(fd, &sector_size);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        sector_size = 0; /* If we can't probe anything, use default sector size. */
+        }
+
         if (fstat(fd, &st) < 0)
                 return -errno;
 
         if (S_ISBLK(st.st_mode)) {
-                if (offset == 0 && IN_SET(size, 0, UINT64_MAX))
+                uint32_t device_ssz;
+                r = blockdev_get_sector_size(fd, &device_ssz);
+                if (r < 0)
+                        return r;
+
+                if (sector_size == 0)
+                        sector_size = device_ssz;
+
+                if (offset == 0 && IN_SET(size, 0, UINT64_MAX) && sector_size == device_ssz)
                         /* If this is already a block device and we are supposed to cover the whole of it
-                         * then store an fd to the original open device node — and do not actually create an
-                         * unnecessary loopback device for it. */
+                         * then store an fd to the original open device node — and do not actually create
+                         * an unnecessary loopback device for it. If an explicit sector size was requested
+                         * that differs from the device sector size, or if the probed GPT sector size
+                         * differs (e.g. CD-ROMs with 2048-byte blocks but a 512-byte sector GPT), create
+                         * a real loop device to change the sector size. */
                         return loop_device_open_from_fd(fd, open_flags, lock_op, ret);
         } else {
                 r = stat_verify_regular(&st);
                 if (r < 0)
                         return r;
+
+                if (sector_size == 0)
+                        sector_size = 512;
         }
 
         if (path) {
@@ -500,47 +558,6 @@ static int loop_device_make_internal(
         if (control < 0)
                 return -errno;
 
-        if (sector_size == 0)
-                /* If no sector size is specified, default to the classic default */
-                sector_size = 512;
-        else if (sector_size == UINT32_MAX) {
-
-                if (S_ISBLK(st.st_mode))
-                        /* If the sector size is specified as UINT32_MAX we'll propagate the sector size of
-                         * the underlying block device. */
-                        r = blockdev_get_sector_size(fd, &sector_size);
-                else {
-                        _cleanup_close_ int non_direct_io_fd = -EBADF;
-                        int probe_fd;
-
-                        assert(S_ISREG(st.st_mode));
-
-                        /* If sector size is specified as UINT32_MAX, we'll try to probe the right sector
-                         * size of the image in question by looking for the GPT partition header at various
-                         * offsets. This of course only works if the image already has a disk label.
-                         *
-                         * So here we actually want to read the file contents ourselves. This is quite likely
-                         * not going to work if we managed to enable O_DIRECT, because in such a case there
-                         * are some pretty strict alignment requirements to offset, size and target, but
-                         * there's no way to query what alignment specifically is actually required. Hence,
-                         * let's avoid the mess, and temporarily open an fd without O_DIRECT for the probing
-                         * logic. */
-
-                        if (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO)) {
-                                non_direct_io_fd = fd_reopen(fd, O_RDONLY|O_CLOEXEC|O_NONBLOCK);
-                                if (non_direct_io_fd < 0)
-                                        return non_direct_io_fd;
-
-                                probe_fd = non_direct_io_fd;
-                        } else
-                                probe_fd = fd;
-
-                        r = probe_sector_size(probe_fd, &sector_size);
-                }
-                if (r < 0)
-                        return r;
-        }
-
         /* Strip LO_FLAGS_PARTSCAN from LOOP_CONFIGURE and enable it afterwards via
          * LOOP_SET_STATUS64 to work around a kernel race: LOOP_CONFIGURE sends a uevent with
          * GD_NEED_PART_SCAN set before calling loop_reread_partitions(). If udev opens the device in
index 98cd9cbaf890bb5905c6f57290b59c4a9a749fbf..f90bf0e1998fbdcee2c152efbd6f505df1edb198 100644 (file)
@@ -371,4 +371,171 @@ TEST(loop_block) {
 #endif
 }
 
+static int make_test_image(int *ret_fd) {
+        _cleanup_free_ char *p = NULL, *cmd = NULL;
+        _cleanup_pclose_ FILE *sfdisk = NULL;
+
+        ASSERT_OK(tempfn_random_child("/var/tmp", "sfdisk", &p));
+        int fd = ASSERT_OK_ERRNO(open(p, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC|O_NOFOLLOW, 0666));
+        ASSERT_OK_ERRNO(ftruncate(fd, 256*1024*1024));
+
+        cmd = ASSERT_NOT_NULL(strjoin("sfdisk ", p));
+        sfdisk = ASSERT_NOT_NULL(popen(cmd, "we"));
+
+        fputs("label: gpt\n"
+              "size=32M, type=C12A7328-F81F-11D2-BA4B-00A0C93EC93B\n", sfdisk);
+
+        ASSERT_EQ(pclose(sfdisk), 0);
+        sfdisk = NULL;
+
+        (void) unlink(p);
+
+        *ret_fd = fd;
+        return 0;
+}
+
+TEST(sector_size_regular_file) {
+        _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
+        _cleanup_close_ int fd = -EBADF;
+
+        if (have_effective_cap(CAP_SYS_ADMIN) <= 0) {
+                log_tests_skipped("not running privileged");
+                return;
+        }
+
+        if (detect_container() != 0 || running_in_chroot() != 0) {
+                log_tests_skipped("Test not supported in a container/chroot");
+                return;
+        }
+
+        ASSERT_OK(make_test_image(&fd));
+
+        /* sector_size=0 on regular file: should default to 512 */
+        ASSERT_OK(loop_device_make(fd, O_RDWR, 0, UINT64_MAX, 0, 0, LOCK_EX, &loop));
+        ASSERT_EQ(loop->sector_size, 512u);
+        loop = loop_device_unref(loop);
+
+        /* sector_size=UINT32_MAX on regular file with GPT: should probe and find 512 */
+        ASSERT_OK(loop_device_make(fd, O_RDWR, 0, UINT64_MAX, UINT32_MAX, 0, LOCK_EX, &loop));
+        ASSERT_EQ(loop->sector_size, 512u);
+        loop = loop_device_unref(loop);
+
+        /* Explicit sector_size=512 on regular file */
+        ASSERT_OK(loop_device_make(fd, O_RDWR, 0, UINT64_MAX, 512, 0, LOCK_EX, &loop));
+        ASSERT_EQ(loop->sector_size, 512u);
+        loop = loop_device_unref(loop);
+
+        /* Explicit sector_size=4096 on regular file */
+        ASSERT_OK(loop_device_make(fd, O_RDWR, 0, UINT64_MAX, 4096, 0, LOCK_EX, &loop));
+        ASSERT_EQ(loop->sector_size, 4096u);
+        loop = loop_device_unref(loop);
+}
+
+TEST(sector_size_block_device) {
+        _cleanup_(loop_device_unrefp) LoopDevice *block_loop = NULL, *loop = NULL;
+        _cleanup_close_ int fd = -EBADF;
+
+        if (have_effective_cap(CAP_SYS_ADMIN) <= 0) {
+                log_tests_skipped("not running privileged");
+                return;
+        }
+
+        if (detect_container() != 0 || running_in_chroot() != 0) {
+                log_tests_skipped("Test not supported in a container/chroot, requires udev/uevent notifications");
+                return;
+        }
+
+        ASSERT_OK(make_test_image(&fd));
+
+        /* Create a loop device to use as our block device */
+        ASSERT_OK(loop_device_make(fd, O_RDWR, 0, UINT64_MAX, 0, LO_FLAGS_PARTSCAN, LOCK_EX, &block_loop));
+        ASSERT_FALSE(LOOP_DEVICE_IS_FOREIGN(block_loop));
+        ASSERT_OK(loop_device_flock(block_loop, LOCK_SH));
+
+        uint32_t device_ssz = block_loop->sector_size;
+
+        /* sector_size=0 on block device: should use device directly */
+        ASSERT_OK(loop_device_make(block_loop->fd, O_RDWR, 0, UINT64_MAX, 0, 0, LOCK_SH, &loop));
+        ASSERT_FALSE(loop->created);
+        ASSERT_EQ(loop->sector_size, device_ssz);
+        loop = loop_device_unref(loop);
+
+        /* sector_size=UINT32_MAX on block device: should probe, match device, use directly */
+        ASSERT_OK(loop_device_make(block_loop->fd, O_RDWR, 0, UINT64_MAX, UINT32_MAX, 0, LOCK_SH, &loop));
+        ASSERT_FALSE(loop->created);
+        ASSERT_EQ(loop->sector_size, device_ssz);
+        loop = loop_device_unref(loop);
+
+        /* sector_size=UINT32_MAX with LO_FLAGS_PARTSCAN: should probe, match, use directly */
+        ASSERT_OK(loop_device_make(block_loop->fd, O_RDWR, 0, UINT64_MAX, UINT32_MAX, LO_FLAGS_PARTSCAN, LOCK_SH, &loop));
+        ASSERT_FALSE(loop->created);
+        ASSERT_EQ(loop->sector_size, device_ssz);
+        loop = loop_device_unref(loop);
+
+        /* Explicit sector_size matching device: should use device directly */
+        ASSERT_OK(loop_device_make(block_loop->fd, O_RDWR, 0, UINT64_MAX, device_ssz, 0, LOCK_SH, &loop));
+        ASSERT_FALSE(loop->created);
+        ASSERT_EQ(loop->sector_size, device_ssz);
+        loop = loop_device_unref(loop);
+
+        /* Explicit sector_size=4096 (differs from device 512): should create a real loop device */
+        if (device_ssz != 4096) {
+                ASSERT_OK(loop_device_make(block_loop->fd, O_RDWR, 0, UINT64_MAX, 4096, 0, LOCK_SH, &loop));
+                ASSERT_TRUE(loop->created);
+                ASSERT_EQ(loop->sector_size, 4096u);
+                loop = loop_device_unref(loop);
+        }
+}
+
+TEST(sector_size_mismatch) {
+        _cleanup_(loop_device_unrefp) LoopDevice *block_loop = NULL, *loop = NULL;
+        _cleanup_close_ int fd = -EBADF;
+
+        if (have_effective_cap(CAP_SYS_ADMIN) <= 0) {
+                log_tests_skipped("not running privileged");
+                return;
+        }
+
+        if (detect_container() != 0 || running_in_chroot() != 0) {
+                log_tests_skipped("Test not supported in a container/chroot");
+                return;
+        }
+
+        /* Create an image with a GPT written at 512-byte sectors, then create a loop device with
+         * 4096-byte sectors on top. This simulates the CD-ROM scenario where the device has large
+         * blocks but the GPT uses 512-byte sectors. */
+        ASSERT_OK(make_test_image(&fd));
+
+        /* Create a loop device with 4096-byte sector size — GPT was written at 512 */
+        ASSERT_OK(loop_device_make(fd, O_RDWR, 0, UINT64_MAX, 4096, 0, LOCK_EX, &block_loop));
+        ASSERT_TRUE(block_loop->created);
+        ASSERT_EQ(block_loop->sector_size, 4096u);
+        ASSERT_OK(loop_device_flock(block_loop, LOCK_SH));
+
+        /* sector_size=0: no preference, should use block device directly despite GPT mismatch */
+        ASSERT_OK(loop_device_make(block_loop->fd, O_RDWR, 0, UINT64_MAX, 0, 0, LOCK_SH, &loop));
+        ASSERT_FALSE(loop->created);
+        ASSERT_EQ(loop->sector_size, 4096u);
+        loop = loop_device_unref(loop);
+
+        /* sector_size=UINT32_MAX: should probe GPT at 512, detect mismatch with device 4096,
+         * and create a new loop device with 512-byte sectors */
+        ASSERT_OK(loop_device_make(block_loop->fd, O_RDWR, 0, UINT64_MAX, UINT32_MAX, 0, LOCK_SH, &loop));
+        ASSERT_TRUE(loop->created);
+        ASSERT_EQ(loop->sector_size, 512u);
+        loop = loop_device_unref(loop);
+
+        /* Explicit sector_size=512: differs from device 4096, should create a new loop device */
+        ASSERT_OK(loop_device_make(block_loop->fd, O_RDWR, 0, UINT64_MAX, 512, 0, LOCK_SH, &loop));
+        ASSERT_TRUE(loop->created);
+        ASSERT_EQ(loop->sector_size, 512u);
+        loop = loop_device_unref(loop);
+
+        /* Explicit sector_size=4096: matches device, should use directly */
+        ASSERT_OK(loop_device_make(block_loop->fd, O_RDWR, 0, UINT64_MAX, 4096, 0, LOCK_SH, &loop));
+        ASSERT_FALSE(loop->created);
+        ASSERT_EQ(loop->sector_size, 4096u);
+        loop = loop_device_unref(loop);
+}
+
 DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro);