]>
Commit | Line | Data |
---|---|---|
db9ecf05 | 1 | /* SPDX-License-Identifier: LGPL-2.1-or-later */ |
8c1be37e | 2 | |
10c1b188 LP |
3 | #if HAVE_VALGRIND_MEMCHECK_H |
4 | #include <valgrind/memcheck.h> | |
5 | #endif | |
6 | ||
dccca82b | 7 | #include <errno.h> |
8c1be37e | 8 | #include <fcntl.h> |
f1443709 LP |
9 | #include <linux/blkpg.h> |
10 | #include <linux/fs.h> | |
8c1be37e | 11 | #include <linux/loop.h> |
441ec804 | 12 | #include <sys/file.h> |
8c1be37e | 13 | #include <sys/ioctl.h> |
f2d9213f | 14 | #include <unistd.h> |
8c1be37e | 15 | |
021bf175 LP |
16 | #include "sd-device.h" |
17 | ||
8c1be37e | 18 | #include "alloc-util.h" |
86c1c1f3 | 19 | #include "blockdev-util.h" |
021bf175 | 20 | #include "device-util.h" |
b0a94268 | 21 | #include "errno-util.h" |
8c1be37e | 22 | #include "fd-util.h" |
f1443709 | 23 | #include "fileio.h" |
8c1be37e | 24 | #include "loop-util.h" |
86c1c1f3 | 25 | #include "missing_loop.h" |
f1443709 | 26 | #include "parse-util.h" |
b202ec20 | 27 | #include "random-util.h" |
3cc44114 | 28 | #include "stat-util.h" |
f1443709 | 29 | #include "stdio-util.h" |
f2d9213f | 30 | #include "string-util.h" |
021bf175 | 31 | #include "tmpfile-util.h" |
8c1be37e | 32 | |
/* Cleanup helper for loop device file descriptors: issues LOOP_CLR_FD on the fd (ignoring any
 * failure) and then closes it. A negative fd is treated as "nothing to do". */
static void cleanup_clear_loop_close(int *fd) {
        if (*fd >= 0) {
                (void) ioctl(*fd, LOOP_CLR_FD);
                (void) safe_close(*fd);
        }
}
40 | ||
021bf175 LP |
/* Queries the loop device behind 'fd' via LOOP_GET_STATUS64.
 *
 * Returns true if the query succeeds (device is bound), false if it fails with ENXIO (device is
 * not bound), and a negative errno-style value for any other failure. */
static int loop_is_bound(int fd) {
        struct loop_info64 status;

        assert(fd >= 0);

        if (ioctl(fd, LOOP_GET_STATUS64, &status) >= 0)
                return true; /* bound! */

        if (errno != ENXIO)
                return -errno;

        return false; /* not bound! */
}
55 | ||
31c75fcc LP |
56 | static int get_current_uevent_seqnum(uint64_t *ret) { |
57 | _cleanup_free_ char *p = NULL; | |
58 | int r; | |
59 | ||
60 | r = read_full_virtual_file("/sys/kernel/uevent_seqnum", &p, NULL); | |
61 | if (r < 0) | |
62 | return log_debug_errno(r, "Failed to read current uevent sequence number: %m"); | |
63 | ||
64 | truncate_nl(p); | |
65 | ||
66 | r = safe_atou64(p, ret); | |
67 | if (r < 0) | |
68 | return log_debug_errno(r, "Failed to parse current uevent sequence number: %s", p); | |
69 | ||
70 | return 0; | |
71 | } | |
72 | ||
021bf175 LP |
73 | static int device_has_block_children(sd_device *d) { |
74 | _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; | |
75 | const char *main_sn, *main_ss; | |
76 | sd_device *q; | |
77 | int r; | |
78 | ||
79 | assert(d); | |
80 | ||
81 | /* Checks if the specified device currently has block device children (i.e. partition block | |
82 | * devices). */ | |
83 | ||
84 | r = sd_device_get_sysname(d, &main_sn); | |
85 | if (r < 0) | |
86 | return r; | |
87 | ||
88 | r = sd_device_get_subsystem(d, &main_ss); | |
89 | if (r < 0) | |
90 | return r; | |
91 | ||
92 | if (!streq(main_ss, "block")) | |
93 | return -EINVAL; | |
94 | ||
95 | r = sd_device_enumerator_new(&e); | |
96 | if (r < 0) | |
97 | return r; | |
98 | ||
99 | r = sd_device_enumerator_allow_uninitialized(e); | |
100 | if (r < 0) | |
101 | return r; | |
102 | ||
103 | r = sd_device_enumerator_add_match_parent(e, d); | |
104 | if (r < 0) | |
105 | return r; | |
106 | ||
107 | FOREACH_DEVICE(e, q) { | |
108 | const char *ss, *sn; | |
109 | ||
110 | r = sd_device_get_subsystem(q, &ss); | |
111 | if (r < 0) | |
112 | continue; | |
113 | ||
114 | if (!streq(ss, "block")) | |
115 | continue; | |
116 | ||
117 | r = sd_device_get_sysname(q, &sn); | |
118 | if (r < 0) | |
119 | continue; | |
120 | ||
121 | if (streq(sn, main_sn)) | |
122 | continue; | |
123 | ||
124 | return 1; /* we have block device children */ | |
125 | } | |
126 | ||
127 | return 0; | |
128 | } | |
129 | ||
95c50092 LP |
130 | static int loop_configure( |
131 | int fd, | |
021bf175 | 132 | int nr, |
95c50092 | 133 | const struct loop_config *c, |
31c75fcc | 134 | bool *try_loop_configure, |
8ede1e86 LP |
135 | uint64_t *ret_seqnum_not_before, |
136 | usec_t *ret_timestamp_not_before) { | |
95c50092 | 137 | |
021bf175 LP |
138 | _cleanup_(sd_device_unrefp) sd_device *d = NULL; |
139 | _cleanup_free_ char *sysname = NULL; | |
738f29cb | 140 | _cleanup_close_ int lock_fd = -1; |
31c75fcc | 141 | uint64_t seqnum; |
8ede1e86 | 142 | usec_t timestamp; |
86c1c1f3 LP |
143 | int r; |
144 | ||
145 | assert(fd >= 0); | |
021bf175 | 146 | assert(nr >= 0); |
86c1c1f3 | 147 | assert(c); |
95c50092 LP |
148 | assert(try_loop_configure); |
149 | ||
021bf175 LP |
150 | if (asprintf(&sysname, "loop%i", nr) < 0) |
151 | return -ENOMEM; | |
152 | ||
153 | r = sd_device_new_from_subsystem_sysname(&d, "block", sysname); | |
154 | if (r < 0) | |
155 | return r; | |
156 | ||
157 | /* Let's lock the device before we do anything. We take the BSD lock on a second, separately opened | |
158 | * fd for the device. udev after all watches for close() events (specifically IN_CLOSE_WRITE) on | |
159 | * block devices to reprobe them, hence by having a separate fd we will later close() we can ensure | |
160 | * we trigger udev after everything is done. If we'd lock our own fd instead and keep it open for a | |
161 | * long time udev would possibly never run on it again, even though the fd is unlocked, simply | |
162 | * because we never close() it. It also has the nice benefit we can use the _cleanup_close_ logic to | |
163 | * automatically release the lock, after we are done. */ | |
164 | lock_fd = fd_reopen(fd, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); | |
165 | if (lock_fd < 0) | |
166 | return lock_fd; | |
167 | if (flock(lock_fd, LOCK_EX) < 0) | |
168 | return -errno; | |
169 | ||
170 | /* Let's see if the device is really detached, i.e. currently has no associated partition block | |
171 | * devices. On various kernels (such as 5.8) it is possible to have a loopback block device that | |
172 | * superficially is detached but still has partition block devices associated for it. They only go | |
173 | * away when the device is reattached. (Yes, LOOP_CLR_FD doesn't work then, because officially | |
174 | * nothing is attached and LOOP_CTL_REMOVE doesn't either, since it doesn't care about partition | |
175 | * block devices. */ | |
176 | r = device_has_block_children(d); | |
177 | if (r < 0) | |
178 | return r; | |
179 | if (r > 0) { | |
180 | r = loop_is_bound(fd); | |
181 | if (r < 0) | |
182 | return r; | |
183 | if (r > 0) | |
184 | return -EBUSY; | |
185 | ||
186 | return -EUCLEAN; /* Bound but children? Tell caller to reattach something so that the | |
187 | * partition block devices are gone too. */ | |
188 | } | |
189 | ||
95c50092 | 190 | if (*try_loop_configure) { |
31c75fcc LP |
191 | /* Acquire uevent seqnum immediately before attaching the loopback device. This allows |
192 | * callers to ignore all uevents with a seqnum before this one, if they need to associate | |
193 | * uevent with this attachment. Doing so isn't race-free though, as uevents that happen in | |
194 | * the window between this reading of the seqnum, and the LOOP_CONFIGURE call might still be | |
195 | * mistaken as originating from our attachment, even though might be caused by an earlier | |
196 | * use. But doing this at least shortens the race window a bit. */ | |
197 | r = get_current_uevent_seqnum(&seqnum); | |
198 | if (r < 0) | |
199 | return r; | |
8ede1e86 | 200 | timestamp = now(CLOCK_MONOTONIC); |
31c75fcc | 201 | |
95c50092 LP |
202 | if (ioctl(fd, LOOP_CONFIGURE, c) < 0) { |
203 | /* Do fallback only if LOOP_CONFIGURE is not supported, propagate all other | |
204 | * errors. Note that the kernel is weird: non-existing ioctls currently return EINVAL | |
205 | * rather than ENOTTY on loopback block devices. They should fix that in the kernel, | |
206 | * but in the meantime we accept both here. */ | |
207 | if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL) | |
208 | return -errno; | |
86c1c1f3 | 209 | |
95c50092 LP |
210 | *try_loop_configure = false; |
211 | } else { | |
212 | bool good = true; | |
213 | ||
214 | if (c->info.lo_sizelimit != 0) { | |
215 | /* Kernel 5.8 vanilla doesn't properly propagate the size limit into the | |
216 | * block device. If it's used, let's immediately check if it had the desired | |
217 | * effect hence. And if not use classic LOOP_SET_STATUS64. */ | |
218 | uint64_t z; | |
219 | ||
220 | if (ioctl(fd, BLKGETSIZE64, &z) < 0) { | |
221 | r = -errno; | |
222 | goto fail; | |
223 | } | |
224 | ||
225 | if (z != c->info.lo_sizelimit) { | |
226 | log_debug("LOOP_CONFIGURE is broken, doesn't honour .lo_sizelimit. Falling back to LOOP_SET_STATUS64."); | |
227 | good = false; | |
228 | } | |
bb2551bd | 229 | } |
86c1c1f3 | 230 | |
95c50092 LP |
231 | if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_PARTSCAN)) { |
232 | /* Kernel 5.8 vanilla doesn't properly propagate the partition scanning flag | |
233 | * into the block device. Let's hence verify if things work correctly here | |
234 | * before returning. */ | |
235 | ||
236 | r = blockdev_partscan_enabled(fd); | |
237 | if (r < 0) | |
238 | goto fail; | |
239 | if (r == 0) { | |
240 | log_debug("LOOP_CONFIGURE is broken, doesn't honour LO_FLAGS_PARTSCAN. Falling back to LOOP_SET_STATUS64."); | |
241 | good = false; | |
242 | } | |
bb2551bd | 243 | } |
86c1c1f3 | 244 | |
95c50092 LP |
245 | if (!good) { |
246 | /* LOOP_CONFIGURE doesn't work. Remember that. */ | |
247 | *try_loop_configure = false; | |
248 | ||
249 | /* We return EBUSY here instead of retrying immediately with LOOP_SET_FD, | |
250 | * because LOOP_CLR_FD is async: if the operation cannot be executed right | |
251 | * away it just sets the autoclear flag on the device. This means there's a | |
252 | * good chance we cannot actually reuse the loopback device right-away. Hence | |
253 | * let's assume it's busy, avoid the trouble and let the calling loop call us | |
254 | * again with a new, likely unused device. */ | |
255 | r = -EBUSY; | |
bb2551bd | 256 | goto fail; |
bb2551bd | 257 | } |
bb2551bd | 258 | |
31c75fcc LP |
259 | if (ret_seqnum_not_before) |
260 | *ret_seqnum_not_before = seqnum; | |
8ede1e86 LP |
261 | if (ret_timestamp_not_before) |
262 | *ret_timestamp_not_before = timestamp; | |
31c75fcc | 263 | |
bb2551bd | 264 | return 0; |
95c50092 | 265 | } |
86c1c1f3 LP |
266 | } |
267 | ||
31c75fcc LP |
268 | /* Let's read the seqnum again, to shorten the window. */ |
269 | r = get_current_uevent_seqnum(&seqnum); | |
270 | if (r < 0) | |
271 | return r; | |
8ede1e86 | 272 | timestamp = now(CLOCK_MONOTONIC); |
31c75fcc | 273 | |
738f29cb LP |
274 | /* Since kernel commit 5db470e229e22b7eda6e23b5566e532c96fb5bc3 (kernel v5.0) the LOOP_SET_STATUS64 |
275 | * ioctl can return EAGAIN in case we change the lo_offset field, if someone else is accessing the | |
276 | * block device while we try to reconfigure it. This is a pretty common case, since udev might | |
277 | * instantly start probing the device as soon as we attach an fd to it. Hence handle it in two ways: | |
273d76f4 | 278 | * first, let's take the BSD lock to ensure that udev will not step in between the point in |
738f29cb LP |
279 | * time where we attach the fd and where we reconfigure the device. Secondly, let's wait 50ms on |
280 | * EAGAIN and retry. The former should be an efficient mechanism to avoid we have to wait 50ms | |
281 | * needlessly if we are just racing against udev. The latter is protection against all other cases, | |
021bf175 | 282 | * i.e. peers that do not take the BSD lock. */ |
738f29cb | 283 | |
86c1c1f3 LP |
284 | if (ioctl(fd, LOOP_SET_FD, c->fd) < 0) |
285 | return -errno; | |
286 | ||
738f29cb LP |
287 | for (unsigned n_attempts = 0;;) { |
288 | if (ioctl(fd, LOOP_SET_STATUS64, &c->info) >= 0) | |
289 | break; | |
290 | if (errno != EAGAIN || ++n_attempts >= 64) { | |
291 | r = log_debug_errno(errno, "Failed to configure loopback device: %m"); | |
292 | goto fail; | |
293 | } | |
294 | ||
b202ec20 LP |
295 | /* Sleep some random time, but at least 10ms, at most 250ms. Increase the delay the more |
296 | * failed attempts we see */ | |
297 | (void) usleep(UINT64_C(10) * USEC_PER_MSEC + | |
b0dbffd8 | 298 | random_u64_range(UINT64_C(240) * USEC_PER_MSEC * n_attempts/64)); |
e8af3bfd | 299 | } |
86c1c1f3 | 300 | |
31c75fcc LP |
301 | if (ret_seqnum_not_before) |
302 | *ret_seqnum_not_before = seqnum; | |
8ede1e86 LP |
303 | if (ret_timestamp_not_before) |
304 | *ret_timestamp_not_before = timestamp; | |
31c75fcc | 305 | |
86c1c1f3 LP |
306 | return 0; |
307 | ||
308 | fail: | |
309 | (void) ioctl(fd, LOOP_CLR_FD); | |
310 | return r; | |
e8af3bfd ZJS |
311 | } |
312 | ||
021bf175 LP |
313 | static int attach_empty_file(int loop, int nr) { |
314 | _cleanup_close_ int fd = -1; | |
315 | ||
316 | /* So here's the thing: on various kernels (5.8 at least) loop block devices might enter a state | |
317 | * where they are detached but nonetheless have partitions, when used heavily. Accessing these | |
318 | * partitions results in immediatey IO errors. There's no pretty way to get rid of them | |
319 | * again. Neither LOOP_CLR_FD nor LOOP_CTL_REMOVE suffice (see above). What does work is to | |
320 | * reassociate them with a new fd however. This is what we do here hence: we associate the devices | |
377a9545 | 321 | * with an empty file (i.e. an image that definitely has no partitions). We then immediately clear it |
021bf175 LP |
322 | * again. This suffices to make the partitions go away. Ugly but appears to work. */ |
323 | ||
324 | log_debug("Found unattached loopback block device /dev/loop%i with partitions. Attaching empty file to remove them.", nr); | |
325 | ||
326 | fd = open_tmpfile_unlinkable(NULL, O_RDONLY); | |
327 | if (fd < 0) | |
328 | return fd; | |
329 | ||
330 | if (flock(loop, LOCK_EX) < 0) | |
331 | return -errno; | |
332 | ||
333 | if (ioctl(loop, LOOP_SET_FD, fd) < 0) | |
334 | return -errno; | |
335 | ||
336 | if (ioctl(loop, LOOP_SET_STATUS64, &(struct loop_info64) { | |
337 | .lo_flags = LO_FLAGS_READ_ONLY| | |
338 | LO_FLAGS_AUTOCLEAR| | |
339 | LO_FLAGS_PARTSCAN, /* enable partscan, so that the partitions really go away */ | |
340 | }) < 0) | |
341 | return -errno; | |
342 | ||
343 | if (ioctl(loop, LOOP_CLR_FD) < 0) | |
344 | return -errno; | |
345 | ||
346 | /* The caller is expected to immediately close the loopback device after this, so that the BSD lock | |
347 | * is released, and udev sees the changes. */ | |
348 | return 0; | |
349 | } | |
350 | ||
1b49e3e3 | 351 | int loop_device_make( |
ed9eeb7b LP |
352 | int fd, |
353 | int open_flags, | |
354 | uint64_t offset, | |
355 | uint64_t size, | |
356 | uint32_t loop_flags, | |
357 | LoopDevice **ret) { | |
8c1be37e | 358 | |
8c1be37e | 359 | _cleanup_free_ char *loopdev = NULL; |
95c50092 | 360 | bool try_loop_configure = true; |
86c1c1f3 | 361 | struct loop_config config; |
50d04699 | 362 | LoopDevice *d = NULL; |
31c75fcc | 363 | uint64_t seqnum = UINT64_MAX; |
8ede1e86 | 364 | usec_t timestamp = USEC_INFINITY; |
8c1be37e | 365 | struct stat st; |
b26c39ad | 366 | int nr = -1, r; |
8c1be37e LP |
367 | |
368 | assert(fd >= 0); | |
369 | assert(ret); | |
370 | assert(IN_SET(open_flags, O_RDWR, O_RDONLY)); | |
371 | ||
372 | if (fstat(fd, &st) < 0) | |
373 | return -errno; | |
374 | ||
375 | if (S_ISBLK(st.st_mode)) { | |
86c1c1f3 | 376 | if (ioctl(fd, LOOP_GET_STATUS64, &config.info) >= 0) { |
b26c39ad | 377 | /* Oh! This is a loopback device? That's interesting! */ |
10c1b188 LP |
378 | |
379 | #if HAVE_VALGRIND_MEMCHECK_H | |
380 | /* Valgrind currently doesn't know LOOP_GET_STATUS64. Remove this once it does */ | |
86c1c1f3 | 381 | VALGRIND_MAKE_MEM_DEFINED(&config.info, sizeof(config.info)); |
10c1b188 | 382 | #endif |
86c1c1f3 | 383 | nr = config.info.lo_number; |
b26c39ad LP |
384 | |
385 | if (asprintf(&loopdev, "/dev/loop%i", nr) < 0) | |
386 | return -ENOMEM; | |
387 | } | |
388 | ||
ed9eeb7b | 389 | if (offset == 0 && IN_SET(size, 0, UINT64_MAX)) { |
ba5450f4 | 390 | _cleanup_close_ int copy = -1; |
8c1be37e | 391 | |
ed9eeb7b | 392 | /* If this is already a block device, store a copy of the fd as it is */ |
8c1be37e | 393 | |
ed9eeb7b LP |
394 | copy = fcntl(fd, F_DUPFD_CLOEXEC, 3); |
395 | if (copy < 0) | |
396 | return -errno; | |
8c1be37e | 397 | |
ed9eeb7b LP |
398 | d = new(LoopDevice, 1); |
399 | if (!d) | |
400 | return -ENOMEM; | |
ed9eeb7b | 401 | *d = (LoopDevice) { |
ba5450f4 | 402 | .fd = TAKE_FD(copy), |
b26c39ad LP |
403 | .nr = nr, |
404 | .node = TAKE_PTR(loopdev), | |
ed9eeb7b | 405 | .relinquished = true, /* It's not allocated by us, don't destroy it when this object is freed */ |
f3859d5f | 406 | .devno = st.st_rdev, |
31c75fcc | 407 | .uevent_seqnum_not_before = UINT64_MAX, |
8ede1e86 | 408 | .timestamp_not_before = USEC_INFINITY, |
ed9eeb7b LP |
409 | }; |
410 | ||
411 | *ret = d; | |
412 | return d->fd; | |
413 | } | |
414 | } else { | |
415 | r = stat_verify_regular(&st); | |
416 | if (r < 0) | |
417 | return r; | |
8c1be37e LP |
418 | } |
419 | ||
e8af3bfd ZJS |
420 | _cleanup_close_ int control = -1; |
421 | _cleanup_(cleanup_clear_loop_close) int loop_with_fd = -1; | |
422 | ||
8c1be37e LP |
423 | control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK); |
424 | if (control < 0) | |
425 | return -errno; | |
426 | ||
86c1c1f3 LP |
427 | config = (struct loop_config) { |
428 | .fd = fd, | |
429 | .info = { | |
430 | /* Use the specified flags, but configure the read-only flag from the open flags, and force autoclear */ | |
0950526a | 431 | .lo_flags = (loop_flags & ~LO_FLAGS_READ_ONLY) | ((open_flags & O_ACCMODE) == O_RDONLY ? LO_FLAGS_READ_ONLY : 0) | LO_FLAGS_AUTOCLEAR, |
86c1c1f3 LP |
432 | .lo_offset = offset, |
433 | .lo_sizelimit = size == UINT64_MAX ? 0 : size, | |
434 | }, | |
435 | }; | |
436 | ||
0f6519d4 LP |
437 | /* Loop around LOOP_CTL_GET_FREE, since at the moment we attempt to open the returned device it might |
438 | * be gone already, taken by somebody else racing against us. */ | |
e8af3bfd ZJS |
439 | for (unsigned n_attempts = 0;;) { |
440 | _cleanup_close_ int loop = -1; | |
441 | ||
0f6519d4 LP |
442 | nr = ioctl(control, LOOP_CTL_GET_FREE); |
443 | if (nr < 0) | |
444 | return -errno; | |
8c1be37e | 445 | |
0f6519d4 LP |
446 | if (asprintf(&loopdev, "/dev/loop%i", nr) < 0) |
447 | return -ENOMEM; | |
8c1be37e | 448 | |
0f6519d4 | 449 | loop = open(loopdev, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|open_flags); |
01813148 ZJS |
450 | if (loop < 0) { |
451 | /* Somebody might've gotten the same number from the kernel, used the device, | |
452 | * and called LOOP_CTL_REMOVE on it. Let's retry with a new number. */ | |
77ad674b | 453 | if (!IN_SET(errno, ENOENT, ENXIO)) |
01813148 ZJS |
454 | return -errno; |
455 | } else { | |
8ede1e86 | 456 | r = loop_configure(loop, nr, &config, &try_loop_configure, &seqnum, ×tamp); |
86c1c1f3 | 457 | if (r >= 0) { |
01813148 ZJS |
458 | loop_with_fd = TAKE_FD(loop); |
459 | break; | |
460 | } | |
021bf175 LP |
461 | if (r == -EUCLEAN) { |
462 | /* Make left-over partition disappear hack (see above) */ | |
463 | r = attach_empty_file(loop, nr); | |
464 | if (r < 0 && r != -EBUSY) | |
465 | return r; | |
466 | } else if (r != -EBUSY) | |
86c1c1f3 | 467 | return r; |
e8af3bfd | 468 | } |
01813148 | 469 | |
e8af3bfd ZJS |
470 | if (++n_attempts >= 64) /* Give up eventually */ |
471 | return -EBUSY; | |
0f6519d4 LP |
472 | |
473 | loopdev = mfree(loopdev); | |
b202ec20 LP |
474 | |
475 | /* Wait some random time, to make collision less likely. Let's pick a random time in the | |
476 | * range 0ms…250ms, linearly scaled by the number of failed attempts. */ | |
b0dbffd8 LP |
477 | (void) usleep(random_u64_range(UINT64_C(10) * USEC_PER_MSEC + |
478 | UINT64_C(240) * USEC_PER_MSEC * n_attempts/64)); | |
0f6519d4 | 479 | } |
8c1be37e | 480 | |
f3859d5f LP |
481 | if (fstat(loop_with_fd, &st) < 0) |
482 | return -errno; | |
483 | assert(S_ISBLK(st.st_mode)); | |
484 | ||
8c1be37e | 485 | d = new(LoopDevice, 1); |
e8af3bfd ZJS |
486 | if (!d) |
487 | return -ENOMEM; | |
8c1be37e | 488 | *d = (LoopDevice) { |
e8af3bfd | 489 | .fd = TAKE_FD(loop_with_fd), |
1cc6c93a | 490 | .node = TAKE_PTR(loopdev), |
8c1be37e | 491 | .nr = nr, |
f3859d5f | 492 | .devno = st.st_rdev, |
31c75fcc | 493 | .uevent_seqnum_not_before = seqnum, |
8ede1e86 | 494 | .timestamp_not_before = timestamp, |
8c1be37e LP |
495 | }; |
496 | ||
8c1be37e | 497 | *ret = d; |
38bd449f | 498 | return d->fd; |
8c1be37e LP |
499 | } |
500 | ||
79e8393a LP |
501 | int loop_device_make_by_path( |
502 | const char *path, | |
503 | int open_flags, | |
504 | uint32_t loop_flags, | |
505 | LoopDevice **ret) { | |
506 | ||
8c1be37e | 507 | _cleanup_close_ int fd = -1; |
b0a94268 | 508 | int r; |
8c1be37e LP |
509 | |
510 | assert(path); | |
511 | assert(ret); | |
b0a94268 | 512 | assert(open_flags < 0 || IN_SET(open_flags, O_RDWR, O_RDONLY)); |
8c1be37e | 513 | |
b0a94268 LP |
514 | /* Passing < 0 as open_flags here means we'll try to open the device writable if we can, retrying |
515 | * read-only if we cannot. */ | |
516 | ||
517 | fd = open(path, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|(open_flags >= 0 ? open_flags : O_RDWR)); | |
518 | if (fd < 0) { | |
519 | r = -errno; | |
520 | ||
521 | /* Retry read-only? */ | |
522 | if (open_flags >= 0 || !(ERRNO_IS_PRIVILEGE(r) || r == -EROFS)) | |
523 | return r; | |
524 | ||
525 | fd = open(path, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|O_RDONLY); | |
526 | if (fd < 0) | |
527 | return r; /* Propagate original error */ | |
528 | ||
529 | open_flags = O_RDONLY; | |
530 | } else if (open_flags < 0) | |
531 | open_flags = O_RDWR; | |
8c1be37e | 532 | |
1b49e3e3 | 533 | return loop_device_make(fd, open_flags, 0, 0, loop_flags, ret); |
8c1be37e LP |
534 | } |
535 | ||
536 | LoopDevice* loop_device_unref(LoopDevice *d) { | |
537 | if (!d) | |
538 | return NULL; | |
539 | ||
540 | if (d->fd >= 0) { | |
cae1e8fb LP |
541 | /* Implicitly sync the device, since otherwise in-flight blocks might not get written */ |
542 | if (fsync(d->fd) < 0) | |
543 | log_debug_errno(errno, "Failed to sync loop block device, ignoring: %m"); | |
544 | ||
a2ea3b2f | 545 | if (d->nr >= 0 && !d->relinquished) { |
8c1be37e LP |
546 | if (ioctl(d->fd, LOOP_CLR_FD) < 0) |
547 | log_debug_errno(errno, "Failed to clear loop device: %m"); | |
548 | ||
549 | } | |
550 | ||
551 | safe_close(d->fd); | |
552 | } | |
553 | ||
a2ea3b2f | 554 | if (d->nr >= 0 && !d->relinquished) { |
8c1be37e LP |
555 | _cleanup_close_ int control = -1; |
556 | ||
557 | control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK); | |
558 | if (control < 0) | |
f2d9213f ZJS |
559 | log_warning_errno(errno, |
560 | "Failed to open loop control device, cannot remove loop device %s: %m", | |
561 | strna(d->node)); | |
562 | else | |
563 | for (unsigned n_attempts = 0;;) { | |
564 | if (ioctl(control, LOOP_CTL_REMOVE, d->nr) >= 0) | |
565 | break; | |
566 | if (errno != EBUSY || ++n_attempts >= 64) { | |
567 | log_warning_errno(errno, "Failed to remove device %s: %m", strna(d->node)); | |
568 | break; | |
569 | } | |
cae1e8fb | 570 | (void) usleep(50 * USEC_PER_MSEC); |
f2d9213f | 571 | } |
8c1be37e LP |
572 | } |
573 | ||
574 | free(d->node); | |
5fecf46d | 575 | return mfree(d); |
8c1be37e | 576 | } |
a2ea3b2f LP |
577 | |
578 | void loop_device_relinquish(LoopDevice *d) { | |
579 | assert(d); | |
580 | ||
581 | /* Don't attempt to clean up the loop device anymore from this point on. Leave the clean-ing up to the kernel | |
582 | * itself, using the loop device "auto-clear" logic we already turned on when creating the device. */ | |
583 | ||
584 | d->relinquished = true; | |
585 | } | |
9dabc4fd LP |
586 | |
587 | int loop_device_open(const char *loop_path, int open_flags, LoopDevice **ret) { | |
588 | _cleanup_close_ int loop_fd = -1; | |
589 | _cleanup_free_ char *p = NULL; | |
b26c39ad | 590 | struct loop_info64 info; |
9dabc4fd LP |
591 | struct stat st; |
592 | LoopDevice *d; | |
b26c39ad | 593 | int nr; |
9dabc4fd LP |
594 | |
595 | assert(loop_path); | |
596 | assert(ret); | |
597 | ||
598 | loop_fd = open(loop_path, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|open_flags); | |
599 | if (loop_fd < 0) | |
600 | return -errno; | |
601 | ||
602 | if (fstat(loop_fd, &st) < 0) | |
603 | return -errno; | |
9dabc4fd LP |
604 | if (!S_ISBLK(st.st_mode)) |
605 | return -ENOTBLK; | |
606 | ||
10c1b188 LP |
607 | if (ioctl(loop_fd, LOOP_GET_STATUS64, &info) >= 0) { |
608 | #if HAVE_VALGRIND_MEMCHECK_H | |
609 | /* Valgrind currently doesn't know LOOP_GET_STATUS64. Remove this once it does */ | |
610 | VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info)); | |
611 | #endif | |
b26c39ad | 612 | nr = info.lo_number; |
10c1b188 | 613 | } else |
b26c39ad LP |
614 | nr = -1; |
615 | ||
9dabc4fd LP |
616 | p = strdup(loop_path); |
617 | if (!p) | |
618 | return -ENOMEM; | |
619 | ||
620 | d = new(LoopDevice, 1); | |
621 | if (!d) | |
622 | return -ENOMEM; | |
623 | ||
624 | *d = (LoopDevice) { | |
625 | .fd = TAKE_FD(loop_fd), | |
b26c39ad | 626 | .nr = nr, |
9dabc4fd LP |
627 | .node = TAKE_PTR(p), |
628 | .relinquished = true, /* It's not ours, don't try to destroy it when this object is freed */ | |
79e8393a | 629 | .devno = st.st_dev, |
31c75fcc | 630 | .uevent_seqnum_not_before = UINT64_MAX, |
8ede1e86 | 631 | .timestamp_not_before = USEC_INFINITY, |
9dabc4fd LP |
632 | }; |
633 | ||
634 | *ret = d; | |
635 | return d->fd; | |
636 | } | |
637 | ||
f1443709 LP |
638 | static int resize_partition(int partition_fd, uint64_t offset, uint64_t size) { |
639 | char sysfs[STRLEN("/sys/dev/block/:/partition") + 2*DECIMAL_STR_MAX(dev_t) + 1]; | |
640 | _cleanup_free_ char *whole = NULL, *buffer = NULL; | |
641 | uint64_t current_offset, current_size, partno; | |
642 | _cleanup_close_ int whole_fd = -1; | |
643 | struct stat st; | |
644 | dev_t devno; | |
645 | int r; | |
646 | ||
647 | assert(partition_fd >= 0); | |
648 | ||
649 | /* Resizes the partition the loopback device refer to (assuming it refers to one instead of an actual | |
650 | * loopback device), and changes the offset, if needed. This is a fancy wrapper around | |
651 | * BLKPG_RESIZE_PARTITION. */ | |
652 | ||
653 | if (fstat(partition_fd, &st) < 0) | |
654 | return -errno; | |
655 | ||
656 | assert(S_ISBLK(st.st_mode)); | |
657 | ||
658 | xsprintf(sysfs, "/sys/dev/block/%u:%u/partition", major(st.st_rdev), minor(st.st_rdev)); | |
659 | r = read_one_line_file(sysfs, &buffer); | |
660 | if (r == -ENOENT) /* not a partition, cannot resize */ | |
661 | return -ENOTTY; | |
662 | if (r < 0) | |
663 | return r; | |
664 | r = safe_atou64(buffer, &partno); | |
665 | if (r < 0) | |
666 | return r; | |
667 | ||
668 | xsprintf(sysfs, "/sys/dev/block/%u:%u/start", major(st.st_rdev), minor(st.st_rdev)); | |
669 | ||
670 | buffer = mfree(buffer); | |
671 | r = read_one_line_file(sysfs, &buffer); | |
672 | if (r < 0) | |
673 | return r; | |
674 | r = safe_atou64(buffer, ¤t_offset); | |
675 | if (r < 0) | |
676 | return r; | |
677 | if (current_offset > UINT64_MAX/512U) | |
678 | return -EINVAL; | |
679 | current_offset *= 512U; | |
680 | ||
681 | if (ioctl(partition_fd, BLKGETSIZE64, ¤t_size) < 0) | |
682 | return -EINVAL; | |
683 | ||
684 | if (size == UINT64_MAX && offset == UINT64_MAX) | |
685 | return 0; | |
686 | if (current_size == size && current_offset == offset) | |
687 | return 0; | |
688 | ||
689 | xsprintf(sysfs, "/sys/dev/block/%u:%u/../dev", major(st.st_rdev), minor(st.st_rdev)); | |
690 | ||
691 | buffer = mfree(buffer); | |
692 | r = read_one_line_file(sysfs, &buffer); | |
693 | if (r < 0) | |
694 | return r; | |
695 | r = parse_dev(buffer, &devno); | |
696 | if (r < 0) | |
697 | return r; | |
698 | ||
699 | r = device_path_make_major_minor(S_IFBLK, devno, &whole); | |
700 | if (r < 0) | |
701 | return r; | |
702 | ||
703 | whole_fd = open(whole, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); | |
704 | if (whole_fd < 0) | |
705 | return -errno; | |
706 | ||
707 | struct blkpg_partition bp = { | |
708 | .pno = partno, | |
709 | .start = offset == UINT64_MAX ? current_offset : offset, | |
710 | .length = size == UINT64_MAX ? current_size : size, | |
711 | }; | |
712 | ||
713 | struct blkpg_ioctl_arg ba = { | |
714 | .op = BLKPG_RESIZE_PARTITION, | |
715 | .data = &bp, | |
716 | .datalen = sizeof(bp), | |
717 | }; | |
718 | ||
719 | if (ioctl(whole_fd, BLKPG, &ba) < 0) | |
720 | return -errno; | |
721 | ||
722 | return 0; | |
723 | } | |
724 | ||
c37878fc LP |
725 | int loop_device_refresh_size(LoopDevice *d, uint64_t offset, uint64_t size) { |
726 | struct loop_info64 info; | |
9dabc4fd LP |
727 | assert(d); |
728 | ||
f1443709 LP |
729 | /* Changes the offset/start of the loop device relative to the beginning of the underlying file or |
730 | * block device. If this loop device actually refers to a partition and not a loopback device, we'll | |
731 | * try to adjust the partition offsets instead. | |
732 | * | |
733 | * If either offset or size is UINT64_MAX we won't change that parameter. */ | |
734 | ||
9dabc4fd LP |
735 | if (d->fd < 0) |
736 | return -EBADF; | |
737 | ||
f1443709 LP |
738 | if (d->nr < 0) /* not a loopback device */ |
739 | return resize_partition(d->fd, offset, size); | |
740 | ||
c37878fc LP |
741 | if (ioctl(d->fd, LOOP_GET_STATUS64, &info) < 0) |
742 | return -errno; | |
743 | ||
10c1b188 LP |
744 | #if HAVE_VALGRIND_MEMCHECK_H |
745 | /* Valgrind currently doesn't know LOOP_GET_STATUS64. Remove this once it does */ | |
746 | VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info)); | |
747 | #endif | |
748 | ||
c37878fc LP |
749 | if (size == UINT64_MAX && offset == UINT64_MAX) |
750 | return 0; | |
751 | if (info.lo_sizelimit == size && info.lo_offset == offset) | |
752 | return 0; | |
753 | ||
754 | if (size != UINT64_MAX) | |
755 | info.lo_sizelimit = size; | |
756 | if (offset != UINT64_MAX) | |
757 | info.lo_offset = offset; | |
758 | ||
759 | if (ioctl(d->fd, LOOP_SET_STATUS64, &info) < 0) | |
9dabc4fd LP |
760 | return -errno; |
761 | ||
762 | return 0; | |
763 | } | |
441ec804 LP |
764 | |
765 | int loop_device_flock(LoopDevice *d, int operation) { | |
766 | assert(d); | |
767 | ||
768 | if (d->fd < 0) | |
769 | return -EBADF; | |
770 | ||
771 | if (flock(d->fd, operation) < 0) | |
772 | return -errno; | |
773 | ||
774 | return 0; | |
775 | } | |
8dbc208c LP |
776 | |
777 | int loop_device_sync(LoopDevice *d) { | |
778 | assert(d); | |
779 | ||
780 | /* We also do this implicitly in loop_device_unref(). Doing this explicitly here has the benefit that | |
781 | * we can check the return value though. */ | |
782 | ||
783 | if (d->fd < 0) | |
784 | return -EBADF; | |
785 | ||
786 | if (fsync(d->fd) < 0) | |
787 | return -errno; | |
788 | ||
789 | return 0; | |
790 | } |