/* SPDX-License-Identifier: LGPL-2.1-or-later */

#if HAVE_VALGRIND_MEMCHECK_H
#include <valgrind/memcheck.h>
#endif

#include <errno.h>
#include <fcntl.h>
#include <linux/blkpg.h>
#include <linux/fs.h>
#include <linux/loop.h>
#include <sys/file.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include "sd-device.h"

#include "alloc-util.h"
#include "blockdev-util.h"
#include "device-util.h"
#include "devnum-util.h"
#include "env-util.h"
#include "errno-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "loop-util.h"
#include "missing_loop.h"
#include "parse-util.h"
#include "random-util.h"
#include "stat-util.h"
#include "stdio-util.h"
#include "string-util.h"
#include "tmpfile-util.h"

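/* Used via _cleanup_(cleanup_clear_loop_close): detaches whatever is currently attached to the loopback
 * device and closes the fd, so that error paths below don't leak a half-configured device. */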
static void cleanup_clear_loop_close(int *fd) {
        if (*fd < 0)
                return;

        (void) ioctl(*fd, LOOP_CLR_FD);
        (void) safe_close(*fd);
}

static int loop_is_bound(int fd) {
        struct loop_info64 info;

        assert(fd >= 0);

        if (ioctl(fd, LOOP_GET_STATUS64, &info) < 0) {
                if (errno == ENXIO)
                        return false; /* not bound! */

                return -errno;
        }

        return true; /* bound! */
}

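/* Reads the current uevent sequence number from /sys/kernel/uevent_seqnum, so that callers can ignore
 * uevents that were generated before a given point in time. */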
static int get_current_uevent_seqnum(uint64_t *ret) {
        _cleanup_free_ char *p = NULL;
        int r;

        r = read_full_virtual_file("/sys/kernel/uevent_seqnum", &p, NULL);
        if (r < 0)
                return log_debug_errno(r, "Failed to read current uevent sequence number: %m");

        r = safe_atou64(strstrip(p), ret);
        if (r < 0)
                return log_debug_errno(r, "Failed to parse current uevent sequence number: %s", p);

        return 0;
}

static int device_has_block_children(sd_device *d) {
        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
        const char *main_ss, *main_dt;
        int r;

        assert(d);

        /* Checks if the specified device currently has block device children (i.e. partition block
         * devices). */

        r = sd_device_get_subsystem(d, &main_ss);
        if (r < 0)
                return r;

        if (!streq(main_ss, "block"))
                return -EINVAL;

        r = sd_device_get_devtype(d, &main_dt);
        if (r < 0)
                return r;

        if (!streq(main_dt, "disk")) /* Refuse invocation on partition block device, insist on "whole" device */
                return -EINVAL;

        r = sd_device_enumerator_new(&e);
        if (r < 0)
                return r;

        r = sd_device_enumerator_allow_uninitialized(e);
        if (r < 0)
                return r;

        r = sd_device_enumerator_add_match_parent(e, d);
        if (r < 0)
                return r;

        r = sd_device_enumerator_add_match_subsystem(e, "block", /* match = */ true);
        if (r < 0)
                return r;

        r = sd_device_enumerator_add_match_property(e, "DEVTYPE", "partition");
        if (r < 0)
                return r;

        return !!sd_device_enumerator_get_device_first(e);
}

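/* Reopens the given block device fd as a second, independent fd and takes a BSD file lock of the requested
 * type on it. Returns the new fd (with the lock held) or a negative errno. */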
static int open_lock_fd(int primary_fd, int operation) {
        _cleanup_close_ int lock_fd = -1;

        assert(primary_fd >= 0);

        lock_fd = fd_reopen(primary_fd, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
        if (lock_fd < 0)
                return lock_fd;
        if (flock(lock_fd, operation) < 0)
                return -errno;

        return TAKE_FD(lock_fd);
}

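/* Binds the backing fd described by 'c' to the loopback device /dev/loop<nr>: takes the BSD lock on a
 * separate fd, then tries the modern LOOP_CONFIGURE ioctl and, where that is unavailable or broken, falls
 * back to LOOP_SET_FD + LOOP_SET_STATUS64. On success, optionally returns the uevent seqnum/timestamp taken
 * just before attaching, plus the (still locked) lock fd. */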
static int loop_configure(
                int fd,
                int nr,
                const struct loop_config *c,
                bool *try_loop_configure,
                uint64_t *ret_seqnum_not_before,
                usec_t *ret_timestamp_not_before,
                int *ret_lock_fd) {

        _cleanup_(sd_device_unrefp) sd_device *d = NULL;
        _cleanup_free_ char *sysname = NULL;
        _cleanup_close_ int lock_fd = -1;
        struct loop_info64 info_copy;
        uint64_t seqnum;
        usec_t timestamp;
        int r;

        assert(fd >= 0);
        assert(nr >= 0);
        assert(c);
        assert(try_loop_configure);

        if (asprintf(&sysname, "loop%i", nr) < 0)
                return -ENOMEM;

        r = sd_device_new_from_subsystem_sysname(&d, "block", sysname);
        if (r < 0)
                return r;

        /* Let's lock the device before we do anything. We take the BSD lock on a second, separately opened
         * fd for the device. udev after all watches for close() events (specifically IN_CLOSE_WRITE) on
         * block devices to reprobe them, hence by having a separate fd we will later close() we can ensure
         * we trigger udev after everything is done. If we'd lock our own fd instead and keep it open for a
         * long time udev would possibly never run on it again, even though the fd is unlocked, simply
         * because we never close() it. It also has the nice benefit we can use the _cleanup_close_ logic to
         * automatically release the lock, after we are done. */
        lock_fd = open_lock_fd(fd, LOCK_EX);
        if (lock_fd < 0)
                return lock_fd;

        /* Let's see if the device is really detached, i.e. currently has no associated partition block
         * devices. On various kernels (such as 5.8) it is possible to have a loopback block device that
         * superficially is detached but still has partition block devices associated for it. They only go
         * away when the device is reattached. (Yes, LOOP_CLR_FD doesn't work then, because officially
         * nothing is attached and LOOP_CTL_REMOVE doesn't either, since it doesn't care about partition
         * block devices.) */
        r = device_has_block_children(d);
        if (r < 0)
                return r;
        if (r > 0) {
                r = loop_is_bound(fd);
                if (r < 0)
                        return r;
                if (r > 0)
                        return -EBUSY;

                return -EUCLEAN; /* Unbound but has children? Tell caller to reattach something so that the
                                  * partition block devices are gone too. */
        }

        if (*try_loop_configure) {
                /* Acquire uevent seqnum immediately before attaching the loopback device. This allows
                 * callers to ignore all uevents with a seqnum before this one, if they need to associate
                 * uevent with this attachment. Doing so isn't race-free though, as uevents that happen in
                 * the window between this reading of the seqnum, and the LOOP_CONFIGURE call might still be
                 * mistaken as originating from our attachment, even though they might be caused by an
                 * earlier use. But doing this at least shortens the race window a bit. */
                r = get_current_uevent_seqnum(&seqnum);
                if (r < 0)
                        return r;
                timestamp = now(CLOCK_MONOTONIC);

                if (ioctl(fd, LOOP_CONFIGURE, c) < 0) {
                        /* Do fallback only if LOOP_CONFIGURE is not supported, propagate all other
                         * errors. Note that the kernel is weird: non-existing ioctls currently return EINVAL
                         * rather than ENOTTY on loopback block devices. They should fix that in the kernel,
                         * but in the meantime we accept both here. */
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL)
                                return -errno;

                        *try_loop_configure = false;
                } else {
                        bool good = true;

                        if (c->info.lo_sizelimit != 0) {
                                /* Kernel 5.8 vanilla doesn't properly propagate the size limit into the
                                 * block device. If it's used, let's immediately check if it had the desired
                                 * effect. And if not, use classic LOOP_SET_STATUS64. */
                                uint64_t z;

                                if (ioctl(fd, BLKGETSIZE64, &z) < 0) {
                                        r = -errno;
                                        goto fail;
                                }

                                if (z != c->info.lo_sizelimit) {
                                        log_debug("LOOP_CONFIGURE is broken, doesn't honour .lo_sizelimit. Falling back to LOOP_SET_STATUS64.");
                                        good = false;
                                }
                        }

                        if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_PARTSCAN)) {
                                /* Kernel 5.8 vanilla doesn't properly propagate the partition scanning flag
                                 * into the block device. Let's hence verify if things work correctly here
                                 * before returning. */

                                r = blockdev_partscan_enabled(fd);
                                if (r < 0)
                                        goto fail;
                                if (r == 0) {
                                        log_debug("LOOP_CONFIGURE is broken, doesn't honour LO_FLAGS_PARTSCAN. Falling back to LOOP_SET_STATUS64.");
                                        good = false;
                                }
                        }

                        if (!good) {
                                /* LOOP_CONFIGURE doesn't work. Remember that. */
                                *try_loop_configure = false;

                                /* We return EBUSY here instead of retrying immediately with LOOP_SET_FD,
                                 * because LOOP_CLR_FD is async: if the operation cannot be executed right
                                 * away it just sets the autoclear flag on the device. This means there's a
                                 * good chance we cannot actually reuse the loopback device right away. Hence
                                 * let's assume it's busy, avoid the trouble and let the calling loop call us
                                 * again with a new, likely unused device. */
                                r = -EBUSY;
                                goto fail;
                        }

                        goto success;
                }
        }

        /* Let's read the seqnum again, to shorten the window. */
        r = get_current_uevent_seqnum(&seqnum);
        if (r < 0)
                return r;
        timestamp = now(CLOCK_MONOTONIC);

        /* Since kernel commit 5db470e229e22b7eda6e23b5566e532c96fb5bc3 (kernel v5.0) the LOOP_SET_STATUS64
         * ioctl can return EAGAIN in case we change the lo_offset field, if someone else is accessing the
         * block device while we try to reconfigure it. This is a pretty common case, since udev might
         * instantly start probing the device as soon as we attach an fd to it. Hence handle it in two ways:
         * first, let's take the BSD lock to ensure that udev will not step in between the point in
         * time where we attach the fd and where we reconfigure the device. Secondly, let's wait 50ms on
         * EAGAIN and retry. The former should be an efficient mechanism to avoid having to wait 50ms
         * needlessly if we are just racing against udev. The latter is protection against all other cases,
         * i.e. peers that do not take the BSD lock. */

        if (ioctl(fd, LOOP_SET_FD, c->fd) < 0)
                return -errno;

        /* Only some of the flags LOOP_CONFIGURE can set are also settable via LOOP_SET_STATUS64, hence mask
         * them out. */
        info_copy = c->info;
        info_copy.lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS;

        for (unsigned n_attempts = 0;;) {
                if (ioctl(fd, LOOP_SET_STATUS64, &info_copy) >= 0)
                        break;
                if (errno != EAGAIN || ++n_attempts >= 64) {
                        r = log_debug_errno(errno, "Failed to configure loopback block device: %m");
                        goto fail;
                }

                /* Sleep some random time, but at least 10ms, at most 250ms. Increase the delay the more
                 * failed attempts we see. */
                (void) usleep(UINT64_C(10) * USEC_PER_MSEC +
                              random_u64_range(UINT64_C(240) * USEC_PER_MSEC * n_attempts/64));
        }

        /* Work around a kernel bug, where changing offset/size of the loopback device doesn't correctly
         * invalidate the buffer cache. For details see:
         *
         * https://android.googlesource.com/platform/system/apex/+/bef74542fbbb4cd629793f4efee8e0053b360570
         *
         * This was fixed in kernel 5.0, see:
         *
         * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5db470e229e22b7eda6e23b5566e532c96fb5bc3
         *
         * We'll run the work-around here in the legacy LOOP_SET_STATUS64 codepath. In the LOOP_CONFIGURE
         * codepath above it should not be necessary. */
        if (c->info.lo_offset != 0 || c->info.lo_sizelimit != 0)
                if (ioctl(fd, BLKFLSBUF, 0) < 0)
                        log_debug_errno(errno, "Failed to issue BLKFLSBUF ioctl, ignoring: %m");

        /* LO_FLAGS_DIRECT_IO is a flag we need to configure via explicit ioctls. */
        if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_DIRECT_IO)) {
                unsigned long b = 1;

                if (ioctl(fd, LOOP_SET_DIRECT_IO, b) < 0)
                        log_debug_errno(errno, "Failed to enable direct IO mode on loopback device /dev/loop%i, ignoring: %m", nr);
        }

success:
        if (ret_seqnum_not_before)
                *ret_seqnum_not_before = seqnum;
        if (ret_timestamp_not_before)
                *ret_timestamp_not_before = timestamp;
        if (ret_lock_fd)
                *ret_lock_fd = TAKE_FD(lock_fd);

        return 0;

fail:
        (void) ioctl(fd, LOOP_CLR_FD);
        return r;
}

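/* Workaround helper: binds an empty temporary file to the given loopback device and immediately clears it
 * again, purely so that stale partition child devices disappear (see the comment below for details). */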
static int attach_empty_file(int loop, int nr) {
        _cleanup_close_ int fd = -1;

        /* So here's the thing: on various kernels (5.8 at least) loop block devices might enter a state
         * where they are detached but nonetheless have partitions, when used heavily. Accessing these
         * partitions results in immediate IO errors. There's no pretty way to get rid of them
         * again. Neither LOOP_CLR_FD nor LOOP_CTL_REMOVE suffice (see above). What does work is to
         * reassociate them with a new fd however. This is what we do here hence: we associate the devices
         * with an empty file (i.e. an image that definitely has no partitions). We then immediately clear it
         * again. This suffices to make the partitions go away. Ugly but appears to work. */

        log_debug("Found unattached loopback block device /dev/loop%i with partitions. Attaching empty file to remove them.", nr);

        fd = open_tmpfile_unlinkable(NULL, O_RDONLY);
        if (fd < 0)
                return fd;

        if (flock(loop, LOCK_EX) < 0)
                return -errno;

        if (ioctl(loop, LOOP_SET_FD, fd) < 0)
                return -errno;

        if (ioctl(loop, LOOP_SET_STATUS64, &(struct loop_info64) {
                                .lo_flags = LO_FLAGS_READ_ONLY|
                                            LO_FLAGS_AUTOCLEAR|
                                            LO_FLAGS_PARTSCAN, /* enable partscan, so that the partitions really go away */
                        }) < 0)
                return -errno;

        if (ioctl(loop, LOOP_CLR_FD) < 0)
                return -errno;

        /* The caller is expected to immediately close the loopback device after this, so that the BSD lock
         * is released, and udev sees the changes. */
        return 0;
}

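/* Backend shared by loop_device_make() and loop_device_make_by_path(): sets up a LoopDevice for the given
 * fd. If the fd already refers to a whole block device and offset/size select all of it, that device is
 * used directly and no loopback device is allocated; otherwise a free /dev/loopN is picked, configured and
 * locked according to lock_op. */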
static int loop_device_make_internal(
                int fd,
                int open_flags,
                uint64_t offset,
                uint64_t size,
                uint32_t loop_flags,
                int lock_op,
                LoopDevice **ret) {

        _cleanup_close_ int direct_io_fd = -1, lock_fd = -1;
        _cleanup_free_ char *loopdev = NULL;
        bool try_loop_configure = true;
        struct loop_config config;
        LoopDevice *d = NULL;
        uint64_t seqnum = UINT64_MAX;
        usec_t timestamp = USEC_INFINITY;
        int nr = -1, r, f_flags;
        struct stat st;

        assert(fd >= 0);
        assert(ret);
        assert(IN_SET(open_flags, O_RDWR, O_RDONLY));

        if (fstat(fd, &st) < 0)
                return -errno;

        if (S_ISBLK(st.st_mode)) {
                if (lock_op != LOCK_UN) {
                        lock_fd = open_lock_fd(fd, lock_op);
                        if (lock_fd < 0)
                                return lock_fd;
                }

                if (ioctl(fd, LOOP_GET_STATUS64, &config.info) >= 0) {
                        /* Oh! This is a loopback device? That's interesting! */

#if HAVE_VALGRIND_MEMCHECK_H
                        /* Valgrind currently doesn't know LOOP_GET_STATUS64. Remove this once it does */
                        VALGRIND_MAKE_MEM_DEFINED(&config.info, sizeof(config.info));
#endif
                        nr = config.info.lo_number;

                        if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
                                return -ENOMEM;
                }

                if (offset == 0 && IN_SET(size, 0, UINT64_MAX)) {
                        _cleanup_close_ int copy = -1;
                        uint64_t diskseq = 0;

                        /* If this is already a block device and we are supposed to cover the whole of it
                         * then store an fd to the original open device node — and do not actually create an
                         * unnecessary loopback device for it. Note that we reopen the inode here, instead of
                         * keeping just a dup() clone of it around, since we want to ensure that the O_DIRECT
                         * flag of the handle we keep is off, we have our own file index, and have the right
                         * read/write mode in effect. */

                        copy = fd_reopen(fd, open_flags|O_NONBLOCK|O_CLOEXEC|O_NOCTTY);
                        if (copy < 0)
                                return copy;

                        r = fd_get_diskseq(copy, &diskseq);
                        if (r < 0 && r != -EOPNOTSUPP)
                                return r;

                        d = new(LoopDevice, 1);
                        if (!d)
                                return -ENOMEM;
                        *d = (LoopDevice) {
                                .fd = TAKE_FD(copy),
                                .lock_fd = TAKE_FD(lock_fd),
                                .nr = nr,
                                .node = TAKE_PTR(loopdev),
                                .relinquished = true, /* It's not allocated by us, don't destroy it when this object is freed */
                                .devno = st.st_rdev,
                                .diskseq = diskseq,
                                .uevent_seqnum_not_before = UINT64_MAX,
                                .timestamp_not_before = USEC_INFINITY,
                        };

                        *ret = d;
                        return d->fd;
                }
        } else {
                r = stat_verify_regular(&st);
                if (r < 0)
                        return r;
        }

        f_flags = fcntl(fd, F_GETFL);
        if (f_flags < 0)
                return -errno;

        if (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) != FLAGS_SET(f_flags, O_DIRECT)) {
                /* If LO_FLAGS_DIRECT_IO is requested, then make sure we have the fd open with O_DIRECT, as
                 * that's required. Conversely, if it's off require that O_DIRECT is off too (that's because
                 * new kernels will implicitly enable LO_FLAGS_DIRECT_IO if O_DIRECT is set).
                 *
                 * Our intention here is that LO_FLAGS_DIRECT_IO is the primary knob, and O_DIRECT derived
                 * from that automatically. */

                direct_io_fd = fd_reopen(fd, (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) ? O_DIRECT : 0)|O_CLOEXEC|O_NONBLOCK|open_flags);
                if (direct_io_fd < 0) {
                        if (!FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO))
                                return log_debug_errno(errno, "Failed to reopen file descriptor without O_DIRECT: %m");

                        /* Some file systems might not support O_DIRECT, let's gracefully continue without it then. */
                        log_debug_errno(errno, "Failed to enable O_DIRECT for backing file descriptor for loopback device. Continuing without.");
                        loop_flags &= ~LO_FLAGS_DIRECT_IO;
                } else
                        fd = direct_io_fd; /* From now on, operate on our new O_DIRECT fd */
        }

        _cleanup_close_ int control = -1;
        _cleanup_(cleanup_clear_loop_close) int loop_with_fd = -1;

        control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
        if (control < 0)
                return -errno;

        config = (struct loop_config) {
                .fd = fd,
                .info = {
                        /* Use the specified flags, but configure the read-only flag from the open flags, and force autoclear */
                        .lo_flags = (loop_flags & ~LO_FLAGS_READ_ONLY) | ((open_flags & O_ACCMODE) == O_RDONLY ? LO_FLAGS_READ_ONLY : 0) | LO_FLAGS_AUTOCLEAR,
                        .lo_offset = offset,
                        .lo_sizelimit = size == UINT64_MAX ? 0 : size,
                },
        };

        /* Loop around LOOP_CTL_GET_FREE, since at the moment we attempt to open the returned device it might
         * be gone already, taken by somebody else racing against us. */
        for (unsigned n_attempts = 0;;) {
                _cleanup_close_ int loop = -1;

                /* Let's take a lock on the control device first. On a busy system, where many programs
                 * attempt to allocate a loopback device at the same time, we might otherwise keep looping
                 * around relatively heavy operations: asking for a free loopback device, then opening it,
                 * validating it, attaching something to it. Let's serialize this whole operation, to make
                 * unnecessary busywork less likely. Note that this is just something we do to optimize our
                 * own code (and whoever else decides to use LOCK_EX locks for this), taking this lock is not
                 * necessary, it just means it's less likely we have to iterate through this loop again and
                 * again if our own code races against our own code. */
                if (flock(control, LOCK_EX) < 0)
                        return -errno;

                nr = ioctl(control, LOOP_CTL_GET_FREE);
                if (nr < 0)
                        return -errno;

                if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
                        return -ENOMEM;

                loop = open(loopdev, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|open_flags);
                if (loop < 0) {
                        /* Somebody might've gotten the same number from the kernel, used the device,
                         * and called LOOP_CTL_REMOVE on it. Let's retry with a new number. */
                        if (!ERRNO_IS_DEVICE_ABSENT(errno))
                                return -errno;
                } else {
                        r = loop_configure(loop, nr, &config, &try_loop_configure, &seqnum, &timestamp, &lock_fd);
                        if (r >= 0) {
                                loop_with_fd = TAKE_FD(loop);
                                break;
                        }
                        if (r == -EUCLEAN) {
                                /* Make left-over partition disappear hack (see above) */
                                r = attach_empty_file(loop, nr);
                                if (r < 0 && r != -EBUSY)
                                        return r;
                        } else if (r != -EBUSY)
                                return r;
                }

                /* OK, this didn't work, let's try again a bit later, but first release the lock on the
                 * control device */
                if (flock(control, LOCK_UN) < 0)
                        return -errno;

                if (++n_attempts >= 64) /* Give up eventually */
                        return -EBUSY;

                /* Now close the loop device explicitly. This will release any lock acquired by
                 * attach_empty_file() or similar, while we sleep below. */
                loop = safe_close(loop);
                loopdev = mfree(loopdev);

                /* Wait some random time, to make collision less likely. Let's pick a random time in the
                 * range 0ms…250ms, linearly scaled by the number of failed attempts. */
                (void) usleep(random_u64_range(UINT64_C(10) * USEC_PER_MSEC +
                                               UINT64_C(240) * USEC_PER_MSEC * n_attempts/64));
        }

        if (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO)) {
                struct loop_info64 info;

                if (ioctl(loop_with_fd, LOOP_GET_STATUS64, &info) < 0)
                        return -errno;

#if HAVE_VALGRIND_MEMCHECK_H
                VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info));
#endif

                /* On older kernels (<= 5.3) it was necessary to set the block size of the loopback block
                 * device to the logical block size of the underlying file system. Since there was no nice
                 * way to query the value, we are not bothering to do this however. On newer kernels the
                 * block size is propagated automatically and does not require intervention from us. We'll
                 * check here if enabling direct IO worked, to make this easily debuggable however.
                 *
                 * (Should anyone really care and actually wants direct IO on old kernels: it might be worth
                 * enabling direct IO with iteratively larger block sizes until it eventually works.) */
                if (!FLAGS_SET(info.lo_flags, LO_FLAGS_DIRECT_IO))
                        log_debug("Could not enable direct IO mode, proceeding in buffered IO mode.");
        }

        if (fstat(loop_with_fd, &st) < 0)
                return -errno;
        assert(S_ISBLK(st.st_mode));

        uint64_t diskseq = 0;
        r = fd_get_diskseq(loop_with_fd, &diskseq);
        if (r < 0 && r != -EOPNOTSUPP)
                return r;

        switch (lock_op & ~LOCK_NB) {
        case LOCK_EX: /* Already in effect */
                break;
        case LOCK_SH: /* Downgrade */
                if (flock(lock_fd, lock_op) < 0)
                        return -errno;
                break;
        case LOCK_UN: /* Release */
                lock_fd = safe_close(lock_fd);
                break;
        default:
                assert_not_reached();
        }

        d = new(LoopDevice, 1);
        if (!d)
                return -ENOMEM;
        *d = (LoopDevice) {
                .fd = TAKE_FD(loop_with_fd),
                .lock_fd = TAKE_FD(lock_fd),
                .node = TAKE_PTR(loopdev),
                .nr = nr,
                .devno = st.st_rdev,
                .diskseq = diskseq,
                .uevent_seqnum_not_before = seqnum,
                .timestamp_not_before = timestamp,
        };

        log_debug("Successfully acquired %s, devno=%u:%u, nr=%i, diskseq=%" PRIu64,
                  d->node,
                  major(d->devno), minor(d->devno),
                  d->nr,
                  d->diskseq);

        *ret = d;
        return d->fd;
}

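/* Applies the $SYSTEMD_LOOP_DIRECT_IO override: direct IO is enabled by default and only turned off when
 * the environment variable is explicitly set to a false value. */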
static uint32_t loop_flags_mangle(uint32_t loop_flags) {
        int r;

        r = getenv_bool("SYSTEMD_LOOP_DIRECT_IO");
        if (r < 0 && r != -ENXIO)
                log_debug_errno(r, "Failed to parse $SYSTEMD_LOOP_DIRECT_IO, ignoring: %m");

        return UPDATE_FLAG(loop_flags, LO_FLAGS_DIRECT_IO, r != 0); /* Turn on LO_FLAGS_DIRECT_IO by default, unless explicitly configured to off. */
}

int loop_device_make(
                int fd,
                int open_flags,
                uint64_t offset,
                uint64_t size,
                uint32_t loop_flags,
                int lock_op,
                LoopDevice **ret) {

        assert(fd >= 0);
        assert(ret);

        return loop_device_make_internal(
                        fd,
                        open_flags,
                        offset,
                        size,
                        loop_flags_mangle(loop_flags),
                        lock_op,
                        ret);
}

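/* Like loop_device_make(), but opens the backing file or block device by path. Passing a negative
 * open_flags means: try O_RDWR first and fall back to O_RDONLY on permission errors or EROFS.
 *
 * A minimal usage sketch (hypothetical caller, not part of this file; loop_device_unrefp is the cleanup
 * helper declared in loop-util.h, and the image path is made up for illustration):
 *
 *         _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
 *         int r;
 *
 *         r = loop_device_make_by_path("/path/to/image.raw", O_RDWR, LO_FLAGS_PARTSCAN, LOCK_EX, &loop);
 *         if (r < 0)
 *                 return r;
 *         ... work on loop->fd or loop->node while holding the BSD lock ...
 *         (void) loop_device_flock(loop, LOCK_UN);
 */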
int loop_device_make_by_path(
                const char *path,
                int open_flags,
                uint32_t loop_flags,
                int lock_op,
                LoopDevice **ret) {

        int r, basic_flags, direct_flags, rdwr_flags;
        _cleanup_close_ int fd = -1;
        bool direct = false;

        assert(path);
        assert(ret);
        assert(open_flags < 0 || IN_SET(open_flags, O_RDWR, O_RDONLY));

        /* Passing < 0 as open_flags here means we'll try to open the device writable if we can, retrying
         * read-only if we cannot. */

        loop_flags = loop_flags_mangle(loop_flags);

        /* Let's open with O_DIRECT if we can. But not all file systems support that, hence fall back to
         * non-O_DIRECT mode automatically, if it fails. */

        basic_flags = O_CLOEXEC|O_NONBLOCK|O_NOCTTY;
        direct_flags = FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) ? O_DIRECT : 0;
        rdwr_flags = open_flags >= 0 ? open_flags : O_RDWR;

        fd = open(path, basic_flags|direct_flags|rdwr_flags);
        if (fd < 0 && direct_flags != 0) /* If we had O_DIRECT on, and things failed with that, let's immediately try again without */
                fd = open(path, basic_flags|rdwr_flags);
        else
                direct = direct_flags != 0;
        if (fd < 0) {
                r = -errno;

                /* Retry read-only? */
                if (open_flags >= 0 || !(ERRNO_IS_PRIVILEGE(r) || r == -EROFS))
                        return r;

                fd = open(path, basic_flags|direct_flags|O_RDONLY);
                if (fd < 0 && direct_flags != 0) /* as above */
                        fd = open(path, basic_flags|O_RDONLY);
                else
                        direct = direct_flags != 0;
                if (fd < 0)
                        return r; /* Propagate original error */

                open_flags = O_RDONLY;
        } else if (open_flags < 0)
                open_flags = O_RDWR;

        log_debug("Opened '%s' in %s access mode%s, with O_DIRECT %s%s.",
                  path,
                  open_flags == O_RDWR ? "O_RDWR" : "O_RDONLY",
                  open_flags != rdwr_flags ? " (O_RDWR was requested but not allowed)" : "",
                  direct ? "enabled" : "disabled",
                  direct != (direct_flags != 0) ? " (O_DIRECT was requested but not supported)" : "");

        return loop_device_make_internal(fd, open_flags, 0, 0, loop_flags, lock_op, ret);
}

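/* Releases the LoopDevice object: syncs the device and, unless it was relinquished, locks it, removes all
 * partition children, detaches the backing file and finally asks the kernel to remove the device number
 * again. Always returns NULL, so it is usable directly in cleanup paths. */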
LoopDevice* loop_device_unref(LoopDevice *d) {
        int r;

        if (!d)
                return NULL;

        d->lock_fd = safe_close(d->lock_fd);

        if (d->fd >= 0) {
                /* Implicitly sync the device, since otherwise in-flight blocks might not get written */
                if (fsync(d->fd) < 0)
                        log_debug_errno(errno, "Failed to sync loop block device, ignoring: %m");

                if (d->nr >= 0 && !d->relinquished) {
                        /* We are supposed to clear the loopback device. Let's do this synchronously: lock
                         * the device, manually remove all partitions and then clear it. This should ensure
                         * udev doesn't concurrently access the devices, and we can be reasonably sure that
                         * once we are done here the device is cleared and all its partition children
                         * removed. Note that we lock our primary device fd here (and not a separate locking
                         * fd, as we do during allocation, since we want to keep the lock all the way through
                         * the LOOP_CLR_FD, but that call would fail if we had more than one fd open.) */

                        if (flock(d->fd, LOCK_EX) < 0)
                                log_debug_errno(errno, "Failed to lock loop block device, ignoring: %m");

                        r = block_device_remove_all_partitions(d->fd);
                        if (r < 0)
                                log_debug_errno(r, "Failed to remove partitions of loopback block device, ignoring: %m");

                        if (ioctl(d->fd, LOOP_CLR_FD) < 0)
                                log_debug_errno(errno, "Failed to clear loop device, ignoring: %m");
                }

                safe_close(d->fd);
        }

        if (d->nr >= 0 && !d->relinquished) {
                _cleanup_close_ int control = -1;

                control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
                if (control < 0)
                        log_warning_errno(errno,
                                          "Failed to open loop control device, cannot remove loop device %s: %m",
                                          strna(d->node));
                else
                        for (unsigned n_attempts = 0;;) {
                                if (ioctl(control, LOOP_CTL_REMOVE, d->nr) >= 0)
                                        break;
                                if (errno != EBUSY || ++n_attempts >= 64) {
                                        log_warning_errno(errno, "Failed to remove device %s: %m", strna(d->node));
                                        break;
                                }
                                (void) usleep(50 * USEC_PER_MSEC);
                        }
        }

        free(d->node);
        return mfree(d);
}

void loop_device_relinquish(LoopDevice *d) {
        assert(d);

        /* Don't attempt to clean up the loop device anymore from this point on. Leave the cleaning up to
         * the kernel itself, using the loop device "auto-clear" logic we already turned on when creating
         * the device. */

        d->relinquished = true;
}

void loop_device_unrelinquish(LoopDevice *d) {
        assert(d);
        d->relinquished = false;
}

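/* Opens an already existing loop (or other block) device node by path, without configuring anything on it.
 * The returned LoopDevice is marked as relinquished, i.e. it will not be cleared or removed when the object
 * is freed; lock_op selects which BSD lock, if any, to take on a separate fd. */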
int loop_device_open(
                const char *loop_path,
                int open_flags,
                int lock_op,
                LoopDevice **ret) {

        _cleanup_close_ int loop_fd = -1, lock_fd = -1;
        _cleanup_free_ char *p = NULL;
        struct loop_info64 info;
        struct stat st;
        LoopDevice *d;
        int nr;

        assert(loop_path);
        assert(IN_SET(open_flags, O_RDWR, O_RDONLY));
        assert(ret);

        loop_fd = open(loop_path, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|open_flags);
        if (loop_fd < 0)
                return -errno;

        if (fstat(loop_fd, &st) < 0)
                return -errno;
        if (!S_ISBLK(st.st_mode))
                return -ENOTBLK;

        if (ioctl(loop_fd, LOOP_GET_STATUS64, &info) >= 0) {
#if HAVE_VALGRIND_MEMCHECK_H
                /* Valgrind currently doesn't know LOOP_GET_STATUS64. Remove this once it does */
                VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info));
#endif
                nr = info.lo_number;
        } else
                nr = -1;

        if ((lock_op & ~LOCK_NB) != LOCK_UN) {
                lock_fd = open_lock_fd(loop_fd, lock_op);
                if (lock_fd < 0)
                        return lock_fd;
        }

        p = strdup(loop_path);
        if (!p)
                return -ENOMEM;

        d = new(LoopDevice, 1);
        if (!d)
                return -ENOMEM;

        *d = (LoopDevice) {
                .fd = TAKE_FD(loop_fd),
                .lock_fd = TAKE_FD(lock_fd),
                .nr = nr,
                .node = TAKE_PTR(p),
                .relinquished = true, /* It's not ours, don't try to destroy it when this object is freed */
                .devno = st.st_rdev,
                .uevent_seqnum_not_before = UINT64_MAX,
                .timestamp_not_before = USEC_INFINITY,
        };

        *ret = d;
        return d->fd;
}

static int resize_partition(int partition_fd, uint64_t offset, uint64_t size) {
        char sysfs[STRLEN("/sys/dev/block/:/partition") + 2*DECIMAL_STR_MAX(dev_t) + 1];
        _cleanup_free_ char *buffer = NULL;
        uint64_t current_offset, current_size, partno;
        _cleanup_close_ int whole_fd = -1;
        struct stat st;
        dev_t devno;
        int r;

        assert(partition_fd >= 0);

        /* Resizes the partition the loopback device refers to (assuming it refers to one instead of an
         * actual loopback device), and changes the offset, if needed. This is a fancy wrapper around
         * BLKPG_RESIZE_PARTITION. */

        if (fstat(partition_fd, &st) < 0)
                return -errno;

        assert(S_ISBLK(st.st_mode));

        xsprintf(sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/partition", DEVNUM_FORMAT_VAL(st.st_rdev));
        r = read_one_line_file(sysfs, &buffer);
        if (r == -ENOENT) /* not a partition, cannot resize */
                return -ENOTTY;
        if (r < 0)
                return r;
        r = safe_atou64(buffer, &partno);
        if (r < 0)
                return r;

        xsprintf(sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/start", DEVNUM_FORMAT_VAL(st.st_rdev));

        buffer = mfree(buffer);
        r = read_one_line_file(sysfs, &buffer);
        if (r < 0)
                return r;
        r = safe_atou64(buffer, &current_offset);
        if (r < 0)
                return r;
        if (current_offset > UINT64_MAX/512U)
                return -EINVAL;
        current_offset *= 512U;

        if (ioctl(partition_fd, BLKGETSIZE64, &current_size) < 0)
                return -EINVAL;

        if (size == UINT64_MAX && offset == UINT64_MAX)
                return 0;
        if (current_size == size && current_offset == offset)
                return 0;

        xsprintf(sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/../dev", DEVNUM_FORMAT_VAL(st.st_rdev));

        buffer = mfree(buffer);
        r = read_one_line_file(sysfs, &buffer);
        if (r < 0)
                return r;
        r = parse_devnum(buffer, &devno);
        if (r < 0)
                return r;

        whole_fd = r = device_open_from_devnum(S_IFBLK, devno, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, NULL);
        if (r < 0)
                return r;

        return block_device_resize_partition(
                        whole_fd,
                        partno,
                        offset == UINT64_MAX ? current_offset : offset,
                        size == UINT64_MAX ? current_size : size);
}

int loop_device_refresh_size(LoopDevice *d, uint64_t offset, uint64_t size) {
        struct loop_info64 info;

        assert(d);
        assert(d->fd >= 0);

        /* Changes the offset/start of the loop device relative to the beginning of the underlying file or
         * block device. If this loop device actually refers to a partition and not a loopback device, we'll
         * try to adjust the partition offsets instead.
         *
         * If either offset or size is UINT64_MAX we won't change that parameter. */

        if (d->nr < 0) /* not a loopback device */
                return resize_partition(d->fd, offset, size);

        if (ioctl(d->fd, LOOP_GET_STATUS64, &info) < 0)
                return -errno;

#if HAVE_VALGRIND_MEMCHECK_H
        /* Valgrind currently doesn't know LOOP_GET_STATUS64. Remove this once it does */
        VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info));
#endif

        if (size == UINT64_MAX && offset == UINT64_MAX)
                return 0;
        if (info.lo_sizelimit == size && info.lo_offset == offset)
                return 0;

        if (size != UINT64_MAX)
                info.lo_sizelimit = size;
        if (offset != UINT64_MAX)
                info.lo_offset = offset;

        return RET_NERRNO(ioctl(d->fd, LOOP_SET_STATUS64, &info));
}

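/* Takes, changes or releases the BSD lock on the device's dedicated lock fd: LOCK_UN closes the lock fd,
 * LOCK_SH/LOCK_EX (optionally combined with LOCK_NB) open one if needed and switch the lock mode. */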
int loop_device_flock(LoopDevice *d, int operation) {
        assert(IN_SET(operation & ~LOCK_NB, LOCK_UN, LOCK_SH, LOCK_EX));
        assert(d);

        /* When unlocking just close the lock fd */
        if ((operation & ~LOCK_NB) == LOCK_UN) {
                d->lock_fd = safe_close(d->lock_fd);
                return 0;
        }

        /* If we had no lock fd so far, create one and lock it right-away */
        if (d->lock_fd < 0) {
                assert(d->fd >= 0);

                d->lock_fd = open_lock_fd(d->fd, operation);
                if (d->lock_fd < 0)
                        return d->lock_fd;

                return 0;
        }

        /* Otherwise change the current lock mode on the existing fd */
        return RET_NERRNO(flock(d->lock_fd, operation));
}

int loop_device_sync(LoopDevice *d) {
        assert(d);
        assert(d->fd >= 0);

        /* We also do this implicitly in loop_device_unref(). Doing this explicitly here has the benefit that
         * we can check the return value though. */

        return RET_NERRNO(fsync(d->fd));
}