]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
homework: Ensure we don't stack block devices
authorscarlet-storm <12461256+scarlet-storm@users.noreply.github.com>
Sat, 28 Dec 2024 07:55:25 +0000 (13:25 +0530)
committerscarlet-storm <12461256+scarlet-storm@users.noreply.github.com>
Tue, 17 Feb 2026 12:53:35 +0000 (18:23 +0530)
Ensure we don't create a loop device on top of a physical block device.
This leads to huge performance degradation of discard operations if the
physical device does not support discard_on_zeroes.

- loop device historical semantics dictates that when the device is
  discarded, it needs to return zero data on read. This can be
  implemented easily on a filesystem. since fallocate zero-range
  would return immediately & the holes are handled at the filesystem
  level to return zero data on read.
- For a raw block device, the feature (discard_zeroes_data) depends on
  the capabilities of the physical device that is exposed to the
  block layer by the driver. This means that to guarantee that the loop
  device stacked on a block device returns zero on discarded data,
  it needs to convert discarded range into write_zero op on the block device.
  https://github.com/torvalds/linux/blob/63676eefb7a026d04b51dcb7aaf54f358517a2ec/drivers/block/loop.c#L773

For example on one of my local nvme I can see the following:
cat /sys/class/block/nvme1n1/queue/write_zeroes_max_bytes
131072
cat /sys/class/block/nvme0n1/queue/discard_max_hw_bytes
2199023255040

This means maximum size of a write_zero operation can be 128KiB &
maximum size of discard operation can be 2TiB on the block device.
So discarding for example 1TB of data, which would be a single block
device operation, gets split into 8.3 million block device operations
when issued on top of stacked loop device.

src/home/homework-luks.c

index 5e6d782fdbbe58319bf2577899595b221693625f..e7dc26794cdab08046b730839879f880694cdef3 100644 (file)
@@ -1433,6 +1433,11 @@ int home_setup_luks(
                                 return r;
                 }
 
+                /* Before we make the loop device, make sure offset is zero & we are using the full partition
+                 * If our offset is not zero, loop_device_make will create a loop device on top of the block device */
+                if (S_ISBLK(st.st_mode))
+                        assert(offset == 0 && size == UINT64_MAX);
+
                 r = loop_device_make(
                                 setup->image_fd,
                                 O_RDWR,
@@ -2191,6 +2196,7 @@ int home_create_luks(
         _cleanup_close_ int mount_fd = -EBADF;
         const char *fstype, *ip;
         struct statfs sfs;
+        struct stat st;
         int r;
         _cleanup_strv_free_ char **extra_mkfs_options = NULL;
 
@@ -2263,7 +2269,6 @@ int home_create_luks(
         if (path_startswith(ip, "/dev/")) {
                 _cleanup_free_ char *sysfs = NULL;
                 uint64_t block_device_size;
-                struct stat st;
 
                 /* Let's place the home directory on a real device, i.e. a USB stick or such */
 
@@ -2376,21 +2381,48 @@ int home_create_luks(
 
         log_info("Writing of partition table completed.");
 
-        r = loop_device_make(
-                        setup->image_fd,
-                        O_RDWR,
-                        partition_offset,
-                        partition_size,
-                        image_sector_size,
-                        0,
-                        LOCK_EX,
-                        &setup->loop);
-        if (r == -ENOENT) /* this means /dev/loop-control doesn't exist, i.e. we are in a container
-                           * or similar and loopback bock devices are not available, return a
-                           * recognizable error in this case. */
-                return log_error_errno(SYNTHETIC_ERRNO(ENOLINK), "Loopback block device support is not available on this system.");
-        if (r < 0)
-                return log_error_errno(r, "Failed to set up loopback device for %s: %m", setup->temporary_image_path);
+        if (fstat(setup->image_fd, &st) < 0)
+                return log_error_errno(errno, "Failed to fstat home image: %m");
+
+        /* Ensure we don't create a loop device over block device as it leads to huge overhead for discard operations
+         * if the device does not support discard_zeroes_data */
+        if (S_ISBLK(st.st_mode)) {
+                _cleanup_free_ char *partition_path = NULL;
+                assert(!sd_id128_is_null(partition_uuid));
+                if (asprintf(&partition_path, "/dev/disk/by-partuuid/" SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(partition_uuid)) < 0)
+                        return log_oom();
+
+                /* Release the lock, so that udev can find the partition */
+                setup->image_fd = safe_close(setup->image_fd);
+                (void) wait_for_devlink(partition_path);
+                setup->image_fd = open_image_file(h, ip, &st);
+                if (setup->image_fd < 0)
+                        return setup->image_fd;
+
+                r = loop_device_open_from_path(
+                                partition_path,
+                                O_RDWR,
+                                LOCK_EX,
+                                &setup->loop);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to open newly written partition device: %s", partition_path);
+        } else {
+                r = loop_device_make(
+                                setup->image_fd,
+                                O_RDWR,
+                                partition_offset,
+                                partition_size,
+                                image_sector_size,
+                                0,
+                                LOCK_EX,
+                                &setup->loop);
+                if (r == -ENOENT) /* this means /dev/loop-control doesn't exist, i.e. we are in a container
+                                   * or similar and loopback bock devices are not available, return a
+                                   * recognizable error in this case. */
+                        return log_error_errno(SYNTHETIC_ERRNO(ENOLINK), "Loopback block device support is not available on this system.");
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set up loopback device for %s: %m", setup->temporary_image_path);
+        }
 
         log_info("Setting up loopback device %s completed.", setup->loop->node ?: ip);