From 6a389701b22af2a5d492df328cd883a3889b2c1f Mon Sep 17 00:00:00 2001 From: scarlet-storm <12461256+scarlet-storm@users.noreply.github.com> Date: Sat, 28 Dec 2024 13:25:25 +0530 Subject: [PATCH] homework: Ensure we don't stack block devices Ensure we don't create a loop device on top of a physical block device. Doing so leads to a huge performance degradation of discard operations if the physical device does not support discard_zeroes_data. - Historical loop device semantics dictate that once a range of the device has been discarded, reading it must return zero data. This is easy to implement on top of a filesystem, since fallocate zero-range returns immediately and the holes are handled at the filesystem level to return zero data on read. - For a raw block device, the feature (discard_zeroes_data) depends on the capabilities of the physical device that the driver exposes to the block layer. This means that, to guarantee that a loop device stacked on a block device returns zeroes for discarded data, the loop driver has to convert each discarded range into a write_zeroes operation on the underlying block device. https://github.com/torvalds/linux/blob/63676eefb7a026d04b51dcb7aaf54f358517a2ec/drivers/block/loop.c#L773 For example, on one of my local NVMe drives I can see the following: cat /sys/class/block/nvme1n1/queue/write_zeroes_max_bytes 131072 cat /sys/class/block/nvme0n1/queue/discard_max_hw_bytes 2199023255040 This means the maximum size of a single write_zeroes operation is 128KiB, while the maximum size of a single discard operation is about 2TiB on the block device. So discarding, for example, 1TiB of data, which would be a single operation on the block device itself, gets split into roughly 8.4 million (1TiB / 128KiB = 2^23) block device operations when issued through a stacked loop device. 
--- src/home/homework-luks.c | 64 ++++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 16 deletions(-) diff --git a/src/home/homework-luks.c b/src/home/homework-luks.c index 5e6d782fdbb..e7dc26794cd 100644 --- a/src/home/homework-luks.c +++ b/src/home/homework-luks.c @@ -1433,6 +1433,11 @@ int home_setup_luks( return r; } + /* Before we make the loop device, make sure offset is zero & we are using the full partition + * If our offset is not zero, loop_device_make will create a loop device on top of the block device */ + if (S_ISBLK(st.st_mode)) + assert(offset == 0 && size == UINT64_MAX); + r = loop_device_make( setup->image_fd, O_RDWR, @@ -2191,6 +2196,7 @@ int home_create_luks( _cleanup_close_ int mount_fd = -EBADF; const char *fstype, *ip; struct statfs sfs; + struct stat st; int r; _cleanup_strv_free_ char **extra_mkfs_options = NULL; @@ -2263,7 +2269,6 @@ int home_create_luks( if (path_startswith(ip, "/dev/")) { _cleanup_free_ char *sysfs = NULL; uint64_t block_device_size; - struct stat st; /* Let's place the home directory on a real device, i.e. a USB stick or such */ @@ -2376,21 +2381,48 @@ int home_create_luks( log_info("Writing of partition table completed."); - r = loop_device_make( - setup->image_fd, - O_RDWR, - partition_offset, - partition_size, - image_sector_size, - 0, - LOCK_EX, - &setup->loop); - if (r == -ENOENT) /* this means /dev/loop-control doesn't exist, i.e. we are in a container - * or similar and loopback bock devices are not available, return a - * recognizable error in this case. 
*/ - return log_error_errno(SYNTHETIC_ERRNO(ENOLINK), "Loopback block device support is not available on this system."); - if (r < 0) - return log_error_errno(r, "Failed to set up loopback device for %s: %m", setup->temporary_image_path); + if (fstat(setup->image_fd, &st) < 0) + return log_error_errno(errno, "Failed to fstat home image: %m"); + + /* Ensure we don't create a loop device over block device as it leads to huge overhead for discard operations + * if the device does not support discard_zeroes_data */ + if (S_ISBLK(st.st_mode)) { + _cleanup_free_ char *partition_path = NULL; + assert(!sd_id128_is_null(partition_uuid)); + if (asprintf(&partition_path, "/dev/disk/by-partuuid/" SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(partition_uuid)) < 0) + return log_oom(); + + /* Release the lock, so that udev can find the partition */ + setup->image_fd = safe_close(setup->image_fd); + (void) wait_for_devlink(partition_path); + setup->image_fd = open_image_file(h, ip, &st); + if (setup->image_fd < 0) + return setup->image_fd; + + r = loop_device_open_from_path( + partition_path, + O_RDWR, + LOCK_EX, + &setup->loop); + if (r < 0) + return log_error_errno(r, "Failed to open newly written partition device: %s", partition_path); + } else { + r = loop_device_make( + setup->image_fd, + O_RDWR, + partition_offset, + partition_size, + image_sector_size, + 0, + LOCK_EX, + &setup->loop); + if (r == -ENOENT) /* this means /dev/loop-control doesn't exist, i.e. we are in a container + * or similar and loopback bock devices are not available, return a + * recognizable error in this case. */ + return log_error_errno(SYNTHETIC_ERRNO(ENOLINK), "Loopback block device support is not available on this system."); + if (r < 0) + return log_error_errno(r, "Failed to set up loopback device for %s: %m", setup->temporary_image_path); + } log_info("Setting up loopback device %s completed.", setup->loop->node ?: ip); -- 2.47.3