From e582484789a6d889d11b97d9c2afa74c3c985130 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 22 Oct 2025 22:47:53 +0200 Subject: [PATCH] tree-wide: open block device locks in writable mode MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit udev's block device locking protocol has one pitfall not even the example in the documentation got right so far (even though this is explained in all detail above): udev's rescanning is only triggered when an fd that is opened for writing is closed. This means that if a separate locking fd is opened on a block device – one that is maintained independently of the fd actually used for writing – it must be opened for writing too, so that closing the lock definitely triggers a rescan. This matters in cases where the lock fd is kept for longer than the fd used for writing to disk. (Because otherwise udev might get the IN_CLOSE_WRITE event, but when it tries to rescan will find the device locked, and never retry because no IN_CLOSE_WRITE is triggred anymore.) Let's fix that across the codebase, at 4 places: 1. in makefs (a lock fd is kept, and mkfs then invoked as child, which uses a different fd, and the lock fd is closed only once the child died) 2. in udevadm lock (embarassing!): which is intended to be used to wrap tools that modify disk contents, very similar to the makefs case. The lock is also kept until after the tool exited. 3. In storagetm: the kernel nvme-tcp layer writes to the device directly, we just keep a lock fd. 4. the example in BLOCK_DEVICE_LOCKING.md --- docs/BLOCK_DEVICE_LOCKING.md | 5 +++-- src/growfs/makefs.c | 2 +- src/shared/blockdev-util.c | 14 +++++++++----- src/shared/blockdev-util.h | 2 +- src/storagetm/storagetm.c | 2 +- src/udev/udevadm-lock.c | 3 ++- 6 files changed, 17 insertions(+), 11 deletions(-) diff --git a/docs/BLOCK_DEVICE_LOCKING.md b/docs/BLOCK_DEVICE_LOCKING.md index a6e3374bc79..a3edd80829b 100644 --- a/docs/BLOCK_DEVICE_LOCKING.md +++ b/docs/BLOCK_DEVICE_LOCKING.md @@ -223,11 +223,12 @@ int main(int argc, char **argv) { return EXIT_FAILURE; } - // try to take an exclusive and nonblocking BSD lock + // try to take an exclusive and nonblocking BSD lock (use O_WRONLY mode to ensure udev + // rescans the device once the lock is closed) __attribute__((cleanup(closep))) int fd = lock_whole_disk_from_devname( argv[1], - O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, + O_WRONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, LOCK_EX|LOCK_NB); if (fd < 0) diff --git a/src/growfs/makefs.c b/src/growfs/makefs.c index 897b9a76007..22bea9e24ae 100644 --- a/src/growfs/makefs.c +++ b/src/growfs/makefs.c @@ -41,7 +41,7 @@ static int run(int argc, char *argv[]) { if (S_ISBLK(st.st_mode)) { /* Lock the device so that udev doesn't interfere with our work */ - lock_fd = lock_whole_block_device(st.st_rdev, LOCK_EX); + lock_fd = lock_whole_block_device(st.st_rdev, O_WRONLY, LOCK_EX); if (lock_fd < 0) return log_error_errno(lock_fd, "Failed to lock whole block device of \"%s\": %m", device); } else diff --git a/src/shared/blockdev-util.c b/src/shared/blockdev-util.c index d86238b84e6..bac1471cee2 100644 --- a/src/shared/blockdev-util.c +++ b/src/shared/blockdev-util.c @@ -340,20 +340,24 @@ int get_block_device_harder(const char *path, dev_t *ret) { return get_block_device_harder_fd(fd, ret); } -int lock_whole_block_device(dev_t devt, int operation) { +int lock_whole_block_device(dev_t devt, int open_flags, int operation) { _cleanup_close_ int lock_fd = -EBADF; dev_t whole_devt; int r; - /* Let's get a BSD file lock on the whole block device, as per: https://systemd.io/BLOCK_DEVICE_LOCKING */ + /* Let's get a BSD file lock on the whole block device, as per: https://systemd.io/BLOCK_DEVICE_LOCKING + * + * NB: it matters whether open_flags indicates open for write: only then will the eventual closing of + * the fd trigger udev's partitioning rescanning of the device (as it watches for IN_CLOSE_WRITE), + * hence make sure to pass the right value there. */ r = block_get_whole_disk(devt, &whole_devt); if (r < 0) return r; - lock_fd = r = device_open_from_devnum(S_IFBLK, whole_devt, O_RDONLY|O_CLOEXEC|O_NONBLOCK, NULL); - if (r < 0) - return r; + lock_fd = device_open_from_devnum(S_IFBLK, whole_devt, open_flags|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, NULL); + if (lock_fd < 0) + return lock_fd; if (flock(lock_fd, operation) < 0) return -errno; diff --git a/src/shared/blockdev-util.h b/src/shared/blockdev-util.h index 3a481dd3964..eddb521e4a1 100644 --- a/src/shared/blockdev-util.h +++ b/src/shared/blockdev-util.h @@ -35,7 +35,7 @@ int get_block_device(const char *path, dev_t *dev); int get_block_device_harder_fd(int fd, dev_t *dev); int get_block_device_harder(const char *path, dev_t *dev); -int lock_whole_block_device(dev_t devt, int operation); +int lock_whole_block_device(dev_t devt, int open_flags, int operation); int blockdev_partscan_enabled(sd_device *d); int blockdev_partscan_enabled_fd(int fd); diff --git a/src/storagetm/storagetm.c b/src/storagetm/storagetm.c index 972b892ca17..54d516ef128 100644 --- a/src/storagetm/storagetm.c +++ b/src/storagetm/storagetm.c @@ -390,7 +390,7 @@ static int nvme_subsystem_add(const char *node, int consumed_fd, sd_device *devi return log_oom(); if (fd < 0) { - fd = RET_NERRNO(open(node, O_RDONLY|O_CLOEXEC|O_NONBLOCK)); + fd = RET_NERRNO(open(node, O_RDWR|O_CLOEXEC|O_NONBLOCK)); if (fd < 0) return log_error_errno(fd, "Failed to open '%s': %m", node); } diff --git a/src/udev/udevadm-lock.c b/src/udev/udevadm-lock.c index 00f4deafd68..17b9e2d3e9b 100644 --- a/src/udev/udevadm-lock.c +++ b/src/udev/udevadm-lock.c @@ -184,7 +184,8 @@ static int lock_device( struct stat st; int r; - fd = open(path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + /* We open in O_WRONLY mode here, to trigger a rescan in udev once we are done */ + fd = open(path, O_WRONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); if (fd < 0) return log_error_errno(errno, "Failed to open '%s': %m", path); -- 2.47.3