]>
Commit | Line | Data |
---|---|---|
c3e270f4 FB |
1 | --- |
2 | title: Locking Block Device Access | |
4cdca0af | 3 | category: Interfaces |
b41a3f66 | 4 | layout: default |
0aff7b75 | 5 | SPDX-License-Identifier: LGPL-2.1-or-later |
c3e270f4 FB |
6 | --- |
7 | ||
ecb1a44c LP |
8 | # Locking Block Device Access |
9 | ||
10 | *TL;DR: Use BSD file locks | |
e2285c57 | 11 | [(`flock(2)`)](https://man7.org/linux/man-pages/man2/flock.2.html) on block |
ecb1a44c LP |
12 | device nodes to synchronize access for partitioning and file system formatting |
13 | tools.* | |
14 | ||
15 | `systemd-udevd` probes all block devices showing up for file system superblock | |
16 | and partition table information (utilizing `libblkid`). If another program | |
17 | concurrently modifies a superblock or partition table this probing might be | |
18 | affected, which is bad in itself, but also might in turn result in undesired | |
19 | effects in programs subscribing to `udev` events. | |
20 | ||
21 | Applications manipulating a block device can temporarily stop `systemd-udevd` | |
22 | from processing rules on it — and thus bar it from probing the device — by | |
23 | taking a BSD file lock on the block device node. Specifically, whenever | |
24 | `systemd-udevd` starts processing a block device it takes a `LOCK_SH|LOCK_NB` | |
e2285c57 | 25 | lock using [`flock(2)`](https://man7.org/linux/man-pages/man2/flock.2.html) on |
ecb1a44c LP |
26 | the main block device (i.e. never on any partition block device, but on the |
27 | device the partition belongs to). If this lock cannot be taken (i.e. `flock()` | |
21547367 | 28 | returns `EAGAIN`), it refrains from processing the device. If it manages to take |
ecb1a44c LP |
29 | the lock it is kept for the entire time the device is processed. |
30 | ||
31 | Note that `systemd-udevd` also watches all block device nodes it manages for | |
5fa661a4 LP |
32 | `inotify()` `IN_CLOSE_WRITE` events: whenever such an event is seen, this is |
33 | used as trigger to re-run the rule-set for the device. | |
ecb1a44c LP |
34 | |
35 | These two concepts allow tools such as disk partitioners or file system | |
36 | formatting tools to safely and easily take exclusive ownership of a block | |
37 | device while operating: before starting work on the block device, they should | |
38 | take a `LOCK_EX` lock on it. This has two effects: first of all, in case | |
39 | `systemd-udevd` is still processing the device the tool will wait for it to | |
5fa661a4 LP |
40 | finish. Second, after the lock is taken, it can be sure that `systemd-udevd` |
41 | will refrain from processing the block device, and thus all other client | |
42 | applications subscribed to it won't get device notifications from potentially | |
43 | half-written data either. After the operation is complete the | |
ecb1a44c LP |
44 | partitioner/formatter can simply close the device node. This has two effects: |
45 | it implicitly releases the lock, so that `systemd-udevd` can process events on | |
5fa661a4 LP |
46 | the device node again. Secondly, it results in an `IN_CLOSE_WRITE` event, which |
47 | causes `systemd-udevd` to immediately re-process the device — seeing all | |
48 | changes the tool made — and notify subscribed clients about it. | |
49 | ||
50 | Ideally, `systemd-udevd` would explicitly watch block devices for `LOCK_EX` | |
51 | locks being released. Such monitoring is not supported on Linux however, which | |
52 | is why it watches for `IN_CLOSE_WRITE` instead, i.e. for `close()` calls to | |
53 | writable file descriptors referring to the block device. In almost all cases, | |
54 | the difference between these two events does not matter much, as any locks | |
55 | taken are implicitly released by `close()`. However, it should be noted that if | |
56 | an application unlocks a device after completing its work without closing it, | |
57 | i.e. while keeping the file descriptor open for a further, longer time, then | |
58 | `systemd-udevd` will not notice this and not retrigger and thus reprobe the | |
59 | device. | |
ecb1a44c LP |
60 | |
61 | Besides synchronizing block device access between `systemd-udevd` and such | |
62 | tools this scheme may also be used to synchronize access between those tools | |
63 | themselves. However, do note that `flock()` locks are advisory only. This means | |
64 | if one tool honours this scheme and another tool does not, they will of course | |
65 | not be synchronized properly, and might interfere with each other's work. | |
66 | ||
67 | Note that the file locks follow the usual access semantics of BSD locks: since | |
68 | `systemd-udevd` never writes to such block devices it only takes a `LOCK_SH` | |
69 | *shared* lock. A program intending to make changes to the block device should | |
70 | take a `LOCK_EX` *exclusive* lock instead. For further details, see the | |
71 | `flock(2)` man page. | |
72 | ||
73 | And please keep in mind: BSD file locks (`flock()`) and POSIX file locks | |
74 | (`lockf()`, `F_SETLK`, …) are different concepts, and in their effect | |
75 | orthogonal. The scheme discussed above uses the former and not the latter, | |
edc8e7b8 | 76 | because these types of locks more closely match the required semantics. |
ecb1a44c | 77 | |
ae61c53c LP |
78 | If multiple devices are to be locked at the same time (for example in order to |
79 | format a RAID file system), the devices should be locked in the order of the | |
80 | device nodes' major numbers (primary ordering key, ascending) and minor | |
81 | numbers (secondary ordering key, ditto), in order to avoid ABBA locking issues | |
82 | between subsystems. | |
83 | ||
84 | Note that the locks should only be taken while the device is repartitioned, | |
85 | file systems formatted or `dd`'ed in, and similar cases that | |
86 | apply/remove/change superblocks/partition information. It should not be held | |
87 | during normal operation, i.e. while file systems on it are mounted for | |
88 | application use. | |
89 | ||
90 | The [`udevadm | |
91 | lock`](https://www.freedesktop.org/software/systemd/man/udevadm.html) command | |
92 | is provided to lock block devices following this scheme from the command line, | |
93 | for use in scripts and similar. (Note though that it's typically preferable | |
94 | to use native support for block device locking in tools where that's | |
95 | available.) | |
96 | ||
ecb1a44c LP |
97 | Summarizing: it is recommended to take `LOCK_EX` BSD file locks when |
98 | manipulating block devices in all tools that change file system block devices | |
99 | (`mkfs`, `fsck`, …) or partition tables (`fdisk`, `parted`, …), right after | |
100 | opening the node. | |
55371658 J |
101 | |
102 | # Example of Locking The Whole Disk | |
103 | ||
104 | The following example leverages `libsystemd` infrastructure to find the whole disk a block device belongs to and take a BSD lock on it. | |
105 | ||
106 | ## Compile and Execute | |
107 | **Note that this example requires `libsystemd` version 251 or newer.** | |
108 | ||
109 | Place the code in a source file, e.g. `take_BSD_lock.c` and run the following commands: | |
110 | ``` | |
111 | $ gcc -o take_BSD_lock take_BSD_lock.c -lsystemd | |
112 | ||
113 | $ ./take_BSD_lock /dev/sda1 | |
114 | Successfully took a BSD lock: /dev/sda | |
115 | ||
116 | $ flock -x /dev/sda ./take_BSD_lock /dev/sda1 | |
117 | Failed to take a BSD lock on /dev/sda: Resource temporarily unavailable | |
118 | ``` | |
119 | ||
120 | ## Code | |
b1c4466b | 121 | ```c |
55371658 J |
122 | /* SPDX-License-Identifier: MIT-0 */ |
123 | ||
124 | #include <stdio.h> | |
125 | #include <stdlib.h> | |
126 | #include <string.h> | |
127 | #include <sys/file.h> | |
128 | #include <systemd/sd-device.h> | |
129 | #include <unistd.h> | |
130 | ||
/* Cleanup helper for __attribute__((cleanup(...))): closes the file descriptor
 * that fd points to, unless it is negative (i.e. does not refer to an open file). */
static inline void closep(int *fd) {
        if (*fd < 0)
                return;

        close(*fd);
}
135 | ||
136 | /** | |
137 | * lock_whole_disk_from_devname | |
138 | * @devname: devname of a block device, e.g., /dev/sda or /dev/sda1 | |
139 | * @open_flags: the flags to open the device, e.g., O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY | |
140 | * @flock_operation: the operation to call flock, e.g., LOCK_EX|LOCK_NB | |
141 | * | |
142 | * given the devname of a block device, take a BSD lock of the whole disk | |
143 | * | |
144 | * Returns: negative errno value on error, or non-negative fd if the lock was taken successfully. | |
145 | **/ | |
146 | int lock_whole_disk_from_devname(const char *devname, int open_flags, int flock_operation) { | |
147 | __attribute__((cleanup(sd_device_unrefp))) sd_device *dev = NULL; | |
148 | sd_device *whole_dev; | |
149 | const char *whole_disk_devname, *subsystem, *devtype; | |
150 | int r; | |
151 | ||
152 | // create a sd_device instance from devname | |
153 | r = sd_device_new_from_devname(&dev, devname); | |
154 | if (r < 0) { | |
155 | errno = -r; | |
156 | fprintf(stderr, "Failed to create sd_device: %m\n"); | |
157 | return r; | |
158 | } | |
159 | ||
160 | // if the subsystem of dev is block, but its devtype is not disk, find its parent | |
161 | r = sd_device_get_subsystem(dev, &subsystem); | |
162 | if (r < 0) { | |
163 | errno = -r; | |
164 | fprintf(stderr, "Failed to get the subsystem: %m\n"); | |
165 | return r; | |
166 | } | |
167 | if (strcmp(subsystem, "block") != 0) { | |
168 | fprintf(stderr, "%s is not a block device, refusing.\n", devname); | |
169 | return -EINVAL; | |
170 | } | |
171 | ||
172 | r = sd_device_get_devtype(dev, &devtype); | |
173 | if (r < 0) { | |
174 | errno = -r; | |
175 | fprintf(stderr, "Failed to get the devtype: %m\n"); | |
176 | return r; | |
177 | } | |
178 | if (strcmp(devtype, "disk") == 0) | |
179 | whole_dev = dev; | |
180 | else { | |
181 | r = sd_device_get_parent_with_subsystem_devtype(dev, "block", "disk", &whole_dev); | |
182 | if (r < 0) { | |
183 | errno = -r; | |
184 | fprintf(stderr, "Failed to get the parent device: %m\n"); | |
185 | return r; | |
186 | } | |
187 | } | |
188 | ||
189 | // open the whole disk device node | |
190 | __attribute__((cleanup(closep))) int fd = sd_device_open(whole_dev, open_flags); | |
191 | if (fd < 0) { | |
192 | errno = -fd; | |
193 | fprintf(stderr, "Failed to open the device: %m\n"); | |
194 | return fd; | |
195 | } | |
196 | ||
197 | // get the whole disk devname | |
198 | r = sd_device_get_devname(whole_dev, &whole_disk_devname); | |
199 | if (r < 0) { | |
200 | errno = -r; | |
201 | fprintf(stderr, "Failed to get the whole disk name: %m\n"); | |
202 | return r; | |
203 | } | |
204 | ||
205 | // take a BSD lock of the whole disk device node | |
206 | if (flock(fd, flock_operation) < 0) { | |
207 | r = -errno; | |
208 | fprintf(stderr, "Failed to take a BSD lock on %s: %m\n", whole_disk_devname); | |
209 | return r; | |
210 | } | |
211 | ||
212 | printf("Successfully took a BSD lock: %s\n", whole_disk_devname); | |
213 | ||
214 | // take the fd to avoid automatic cleanup | |
215 | int ret_fd = fd; | |
254d1313 | 216 | fd = -EBADF; |
55371658 J |
217 | return ret_fd; |
218 | } | |
219 | ||
220 | int main(int argc, char **argv) { | |
221 | if (argc != 2) { | |
222 | fprintf(stderr, "Invalid number of parameters.\n"); | |
223 | return EXIT_FAILURE; | |
224 | } | |
225 | ||
226 | // try to take an exclusive and nonblocking BSD lock | |
227 | __attribute__((cleanup(closep))) int fd = | |
228 | lock_whole_disk_from_devname( | |
229 | argv[1], | |
230 | O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, | |
231 | LOCK_EX|LOCK_NB); | |
232 | ||
233 | if (fd < 0) | |
234 | return EXIT_FAILURE; | |
235 | ||
236 | /** | |
237 | * The device is now locked until the return below. | |
238 | * Now you can safely manipulate the block device. | |
239 | **/ | |
240 | ||
241 | return EXIT_SUCCESS; | |
242 | } | |
243 | ``` |