/* SPDX-License-Identifier: LGPL-2.1-or-later */

#if HAVE_VALGRIND_MEMCHECK_H
#include <valgrind/memcheck.h>
#endif

#include <errno.h>
#include <fcntl.h>
#include <linux/blkpg.h>
#include <linux/fs.h>
#include <linux/loop.h>
#include <sys/file.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include "sd-device.h"

#include "alloc-util.h"
#include "blockdev-util.h"
#include "device-util.h"
#include "devnum-util.h"
#include "env-util.h"
#include "errno-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "loop-util.h"
#include "missing_loop.h"
#include "parse-util.h"
#include "random-util.h"
#include "stat-util.h"
#include "stdio-util.h"
#include "string-util.h"
#include "tmpfile-util.h"

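/* Helper for _cleanup_(): detaches whatever is attached to the loop device behind *fd via LOOP_CLR_FD and
 * then closes the fd. Used below to undo a partially set up loopback device on error paths. */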
static void cleanup_clear_loop_close(int *fd) {
        if (*fd < 0)
                return;

        (void) ioctl(*fd, LOOP_CLR_FD);
        (void) safe_close(*fd);
}

static int loop_is_bound(int fd) {
        struct loop_info64 info;

        assert(fd >= 0);

        if (ioctl(fd, LOOP_GET_STATUS64, &info) < 0) {
                if (errno == ENXIO)
                        return false; /* not bound! */

                return -errno;
        }

        return true; /* bound! */
}

static int get_current_uevent_seqnum(uint64_t *ret) {
        _cleanup_free_ char *p = NULL;
        int r;

        r = read_full_virtual_file("/sys/kernel/uevent_seqnum", &p, NULL);
        if (r < 0)
                return log_debug_errno(r, "Failed to read current uevent sequence number: %m");

        r = safe_atou64(strstrip(p), ret);
        if (r < 0)
                return log_debug_errno(r, "Failed to parse current uevent sequence number: %s", p);

        return 0;
}

static int device_has_block_children(sd_device *d) {
        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
        const char *main_ss, *main_dt;
        int r;

        assert(d);

        /* Checks if the specified device currently has block device children (i.e. partition block
         * devices). */

        r = sd_device_get_subsystem(d, &main_ss);
        if (r < 0)
                return r;

        if (!streq(main_ss, "block"))
                return -EINVAL;

        r = sd_device_get_devtype(d, &main_dt);
        if (r < 0)
                return r;

        if (!streq(main_dt, "disk")) /* Refuse invocation on partition block device, insist on "whole" device */
                return -EINVAL;

        r = sd_device_enumerator_new(&e);
        if (r < 0)
                return r;

        r = sd_device_enumerator_allow_uninitialized(e);
        if (r < 0)
                return r;

        r = sd_device_enumerator_add_match_parent(e, d);
        if (r < 0)
                return r;

        r = sd_device_enumerator_add_match_subsystem(e, "block", /* match = */ true);
        if (r < 0)
                return r;

        r = sd_device_enumerator_add_match_property(e, "DEVTYPE", "partition");
        if (r < 0)
                return r;

        return !!sd_device_enumerator_get_device_first(e);
}

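/* Takes a BSD file lock on the given block device: reopens the device node to get a separate file
 * description (so that the lock can later be released simply by closing the returned fd, without
 * affecting the caller's fd), then flock()s it with the requested operation. Returns the new fd. */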
static int open_lock_fd(int primary_fd, int operation) {
        _cleanup_close_ int lock_fd = -1;

        assert(primary_fd >= 0);

        lock_fd = fd_reopen(primary_fd, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
        if (lock_fd < 0)
                return lock_fd;
        if (flock(lock_fd, operation) < 0)
                return -errno;

        return TAKE_FD(lock_fd);
}

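/* Attaches the backing fd from *c to the already allocated device /dev/loop<nr>: preferably in one atomic
 * step via LOOP_CONFIGURE, otherwise via the legacy LOOP_SET_FD + LOOP_SET_STATUS64 sequence. On success
 * returns the uevent seqnum and timestamp taken right before the attachment (so that callers can filter
 * out older uevents) as well as the BSD lock fd taken on the device. */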
static int loop_configure(
                int fd,
                int nr,
                const struct loop_config *c,
                bool *try_loop_configure,
                uint64_t *ret_seqnum_not_before,
                usec_t *ret_timestamp_not_before,
                int *ret_lock_fd) {

        _cleanup_(sd_device_unrefp) sd_device *d = NULL;
        _cleanup_free_ char *sysname = NULL;
        _cleanup_close_ int lock_fd = -1;
        struct loop_info64 info_copy;
        uint64_t seqnum;
        usec_t timestamp;
        int r;

        assert(fd >= 0);
        assert(nr >= 0);
        assert(c);
        assert(try_loop_configure);

        if (asprintf(&sysname, "loop%i", nr) < 0)
                return -ENOMEM;

        r = sd_device_new_from_subsystem_sysname(&d, "block", sysname);
        if (r < 0)
                return r;

        /* Let's lock the device before we do anything. We take the BSD lock on a second, separately opened
         * fd for the device. udev after all watches for close() events (specifically IN_CLOSE_WRITE) on
         * block devices to reprobe them, hence by having a separate fd we will later close() we can ensure
         * we trigger udev after everything is done. If we'd lock our own fd instead and keep it open for a
         * long time udev would possibly never run on it again, even though the fd is unlocked, simply
         * because we never close() it. It also has the nice benefit we can use the _cleanup_close_ logic to
         * automatically release the lock, after we are done. */
        lock_fd = open_lock_fd(fd, LOCK_EX);
        if (lock_fd < 0)
                return lock_fd;

        /* Let's see if the device is really detached, i.e. currently has no associated partition block
         * devices. On various kernels (such as 5.8) it is possible to have a loopback block device that
         * superficially is detached but still has partition block devices associated for it. Let's then
         * manually remove the partitions via BLKPG, and tell the caller we did that via EUCLEAN, so they try
         * again. */
        r = device_has_block_children(d);
        if (r < 0)
                return r;
        if (r > 0) {
                r = loop_is_bound(fd);
                if (r < 0)
                        return r;
                if (r > 0)
                        return -EBUSY;

                /* Unbound but has children? Remove all partitions, and report this to the caller, to try
                 * again, and count this as an attempt. */

                r = block_device_remove_all_partitions(fd);
                if (r < 0)
                        return r;

                return -EUCLEAN;
        }

        if (*try_loop_configure) {
                /* Acquire uevent seqnum immediately before attaching the loopback device. This allows
                 * callers to ignore all uevents with a seqnum before this one, if they need to associate
                 * uevents with this attachment. Doing so isn't race-free though, as uevents that happen in
                 * the window between this reading of the seqnum, and the LOOP_CONFIGURE call might still be
                 * mistaken as originating from our attachment, even though they might be caused by an
                 * earlier use. But doing this at least shortens the race window a bit. */
                r = get_current_uevent_seqnum(&seqnum);
                if (r < 0)
                        return r;
                timestamp = now(CLOCK_MONOTONIC);

                if (ioctl(fd, LOOP_CONFIGURE, c) < 0) {
                        /* Do fallback only if LOOP_CONFIGURE is not supported, propagate all other
                         * errors. Note that the kernel is weird: non-existing ioctls currently return EINVAL
                         * rather than ENOTTY on loopback block devices. They should fix that in the kernel,
                         * but in the meantime we accept both here. */
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL)
                                return -errno;

                        *try_loop_configure = false;
                } else {
                        bool good = true;

                        if (c->info.lo_sizelimit != 0) {
                                /* Kernel 5.8 vanilla doesn't properly propagate the size limit into the
                                 * block device. If it's used, let's immediately check whether it had the
                                 * desired effect, and if not fall back to the classic LOOP_SET_STATUS64. */
                                uint64_t z;

                                if (ioctl(fd, BLKGETSIZE64, &z) < 0) {
                                        r = -errno;
                                        goto fail;
                                }

                                if (z != c->info.lo_sizelimit) {
                                        log_debug("LOOP_CONFIGURE is broken, doesn't honour .lo_sizelimit. Falling back to LOOP_SET_STATUS64.");
                                        good = false;
                                }
                        }

                        if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_PARTSCAN)) {
                                /* Kernel 5.8 vanilla doesn't properly propagate the partition scanning flag
                                 * into the block device. Let's hence verify if things work correctly here
                                 * before returning. */

                                r = blockdev_partscan_enabled(fd);
                                if (r < 0)
                                        goto fail;
                                if (r == 0) {
                                        log_debug("LOOP_CONFIGURE is broken, doesn't honour LO_FLAGS_PARTSCAN. Falling back to LOOP_SET_STATUS64.");
                                        good = false;
                                }
                        }

                        if (!good) {
                                /* LOOP_CONFIGURE doesn't work. Remember that. */
                                *try_loop_configure = false;

                                /* We return EBUSY here instead of retrying immediately with LOOP_SET_FD,
                                 * because LOOP_CLR_FD is async: if the operation cannot be executed right
                                 * away it just sets the autoclear flag on the device. This means there's a
                                 * good chance we cannot actually reuse the loopback device right-away. Hence
                                 * let's assume it's busy, avoid the trouble and let the calling loop call us
                                 * again with a new, likely unused device. */
                                r = -EBUSY;
                                goto fail;
                        }

                        goto success;
                }
        }

        /* Let's read the seqnum again, to shorten the window. */
        r = get_current_uevent_seqnum(&seqnum);
        if (r < 0)
                return r;
        timestamp = now(CLOCK_MONOTONIC);

        /* Since kernel commit 5db470e229e22b7eda6e23b5566e532c96fb5bc3 (kernel v5.0) the LOOP_SET_STATUS64
         * ioctl can return EAGAIN in case we change the lo_offset field, if someone else is accessing the
         * block device while we try to reconfigure it. This is a pretty common case, since udev might
         * instantly start probing the device as soon as we attach an fd to it. Hence handle it in two ways:
         * first, let's take the BSD lock to ensure that udev will not step in between the point in
         * time where we attach the fd and where we reconfigure the device. Secondly, let's wait 50ms on
         * EAGAIN and retry. The former should be an efficient mechanism to avoid having to wait 50ms
         * needlessly if we are just racing against udev. The latter is protection against all other cases,
         * i.e. peers that do not take the BSD lock. */

        if (ioctl(fd, LOOP_SET_FD, c->fd) < 0)
                return -errno;

        /* Only some of the flags LOOP_CONFIGURE can set are also settable via LOOP_SET_STATUS64, hence mask
         * them out. */
        info_copy = c->info;
        info_copy.lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS;

        for (unsigned n_attempts = 0;;) {
                if (ioctl(fd, LOOP_SET_STATUS64, &info_copy) >= 0)
                        break;
                if (errno != EAGAIN || ++n_attempts >= 64) {
                        r = log_debug_errno(errno, "Failed to configure loopback block device: %m");
                        goto fail;
                }

                /* Sleep some random time, but at least 10ms, at most 250ms. Increase the delay the more
                 * failed attempts we see. */
                (void) usleep(UINT64_C(10) * USEC_PER_MSEC +
                              random_u64_range(UINT64_C(240) * USEC_PER_MSEC * n_attempts/64));
        }

        /* Work around a kernel bug, where changing offset/size of the loopback device doesn't correctly
         * invalidate the buffer cache. For details see:
         *
         * https://android.googlesource.com/platform/system/apex/+/bef74542fbbb4cd629793f4efee8e0053b360570
         *
         * This was fixed in kernel 5.0, see:
         *
         * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5db470e229e22b7eda6e23b5566e532c96fb5bc3
         *
         * We'll run the work-around here in the legacy LOOP_SET_STATUS64 codepath. In the LOOP_CONFIGURE
         * codepath above it should not be necessary. */
        if (c->info.lo_offset != 0 || c->info.lo_sizelimit != 0)
                if (ioctl(fd, BLKFLSBUF, 0) < 0)
                        log_debug_errno(errno, "Failed to issue BLKFLSBUF ioctl, ignoring: %m");

        /* LO_FLAGS_DIRECT_IO is a flag we need to configure via an explicit ioctl. */
        if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_DIRECT_IO)) {
                unsigned long b = 1;

                if (ioctl(fd, LOOP_SET_DIRECT_IO, b) < 0)
                        log_debug_errno(errno, "Failed to enable direct IO mode on loopback device /dev/loop%i, ignoring: %m", nr);
        }

success:
        if (ret_seqnum_not_before)
                *ret_seqnum_not_before = seqnum;
        if (ret_timestamp_not_before)
                *ret_timestamp_not_before = timestamp;
        if (ret_lock_fd)
                *ret_lock_fd = TAKE_FD(lock_fd);

        return 0;

fail:
        /* Close the lock fd explicitly before clearing the loopback block device, since an additional open
         * fd would prevent the clearing from succeeding. */
        lock_fd = safe_close(lock_fd);
        (void) ioctl(fd, LOOP_CLR_FD);
        return r;
}

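/* Sets up a loopback block device for the given fd. If the fd already refers to a whole block device and
 * no offset/size was requested, that device is reused directly instead of allocating a new loop device.
 * Otherwise a free loop device is picked via LOOP_CTL_GET_FREE and configured, retrying with a fresh
 * device number if somebody else raced us for it. */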
static int loop_device_make_internal(
                int fd,
                int open_flags,
                uint64_t offset,
                uint64_t size,
                uint32_t loop_flags,
                int lock_op,
                LoopDevice **ret) {

        _cleanup_close_ int direct_io_fd = -1, lock_fd = -1;
        _cleanup_free_ char *loopdev = NULL;
        bool try_loop_configure = true;
        struct loop_config config;
        LoopDevice *d = NULL;
        uint64_t seqnum = UINT64_MAX;
        usec_t timestamp = USEC_INFINITY;
        int nr = -1, r, f_flags;
        struct stat st;

        assert(fd >= 0);
        assert(ret);
        assert(IN_SET(open_flags, O_RDWR, O_RDONLY));

        if (fstat(fd, &st) < 0)
                return -errno;

        if (S_ISBLK(st.st_mode)) {
                if (lock_op != LOCK_UN) {
                        lock_fd = open_lock_fd(fd, lock_op);
                        if (lock_fd < 0)
                                return lock_fd;
                }

                if (ioctl(fd, LOOP_GET_STATUS64, &config.info) >= 0) {
                        /* Oh! This is a loopback device? That's interesting! */

#if HAVE_VALGRIND_MEMCHECK_H
                        /* Valgrind currently doesn't know LOOP_GET_STATUS64. Remove this once it does */
                        VALGRIND_MAKE_MEM_DEFINED(&config.info, sizeof(config.info));
#endif
                        nr = config.info.lo_number;

                        if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
                                return -ENOMEM;
                }

                if (offset == 0 && IN_SET(size, 0, UINT64_MAX)) {
                        _cleanup_close_ int copy = -1;
                        uint64_t diskseq = 0;

                        /* If this is already a block device and we are supposed to cover the whole of it
                         * then store an fd to the original open device node — and do not actually create an
                         * unnecessary loopback device for it. Note that we reopen the inode here, instead of
                         * keeping just a dup() clone of it around, since we want to ensure that the O_DIRECT
                         * flag of the handle we keep is off, we have our own file index, and have the right
                         * read/write mode in effect. */

                        copy = fd_reopen(fd, open_flags|O_NONBLOCK|O_CLOEXEC|O_NOCTTY);
                        if (copy < 0)
                                return copy;

                        r = fd_get_diskseq(copy, &diskseq);
                        if (r < 0 && r != -EOPNOTSUPP)
                                return r;

                        d = new(LoopDevice, 1);
                        if (!d)
                                return -ENOMEM;
                        *d = (LoopDevice) {
                                .fd = TAKE_FD(copy),
                                .lock_fd = TAKE_FD(lock_fd),
                                .nr = nr,
                                .node = TAKE_PTR(loopdev),
                                .relinquished = true, /* It's not allocated by us, don't destroy it when this object is freed */
                                .devno = st.st_rdev,
                                .diskseq = diskseq,
                                .uevent_seqnum_not_before = UINT64_MAX,
                                .timestamp_not_before = USEC_INFINITY,
                        };

                        *ret = d;
                        return d->fd;
                }
        } else {
                r = stat_verify_regular(&st);
                if (r < 0)
                        return r;
        }

        f_flags = fcntl(fd, F_GETFL);
        if (f_flags < 0)
                return -errno;

        if (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) != FLAGS_SET(f_flags, O_DIRECT)) {
                /* If LO_FLAGS_DIRECT_IO is requested, then make sure we have the fd open with O_DIRECT, as
                 * that's required. Conversely, if it's off require that O_DIRECT is off too (that's because
                 * new kernels will implicitly enable LO_FLAGS_DIRECT_IO if O_DIRECT is set).
                 *
                 * Our intention here is that LO_FLAGS_DIRECT_IO is the primary knob, and O_DIRECT derived
                 * from that automatically. */

                direct_io_fd = fd_reopen(fd, (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) ? O_DIRECT : 0)|O_CLOEXEC|O_NONBLOCK|open_flags);
                if (direct_io_fd < 0) {
                        if (!FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO))
                                return log_debug_errno(errno, "Failed to reopen file descriptor without O_DIRECT: %m");

                        /* Some file systems might not support O_DIRECT, let's gracefully continue without it then. */
                        log_debug_errno(errno, "Failed to enable O_DIRECT for backing file descriptor for loopback device. Continuing without.");
                        loop_flags &= ~LO_FLAGS_DIRECT_IO;
                } else
                        fd = direct_io_fd; /* From now on, operate on our new O_DIRECT fd */
        }

        _cleanup_close_ int control = -1;
        _cleanup_(cleanup_clear_loop_close) int loop_with_fd = -1;

        control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
        if (control < 0)
                return -errno;

        config = (struct loop_config) {
                .fd = fd,
                .info = {
                        /* Use the specified flags, but configure the read-only flag from the open flags, and force autoclear */
                        .lo_flags = (loop_flags & ~LO_FLAGS_READ_ONLY) | ((open_flags & O_ACCMODE) == O_RDONLY ? LO_FLAGS_READ_ONLY : 0) | LO_FLAGS_AUTOCLEAR,
                        .lo_offset = offset,
                        .lo_sizelimit = size == UINT64_MAX ? 0 : size,
                },
        };

        /* Loop around LOOP_CTL_GET_FREE, since at the moment we attempt to open the returned device it might
         * be gone already, taken by somebody else racing against us. */
        for (unsigned n_attempts = 0;;) {
                _cleanup_close_ int loop = -1;

                /* Let's take a lock on the control device first. On a busy system, where many programs
                 * attempt to allocate a loopback device at the same time, we might otherwise keep looping
                 * around relatively heavy operations: asking for a free loopback device, then opening it,
                 * validating it, attaching something to it. Let's serialize this whole operation, to make
                 * unnecessary busywork less likely. Note that this is just something we do to optimize our
                 * own code (and whoever else decides to use LOCK_EX locks for this), taking this lock is not
                 * necessary, it just means it's less likely we have to iterate through this loop again and
                 * again if our own code races against our own code. */
                if (flock(control, LOCK_EX) < 0)
                        return -errno;

                nr = ioctl(control, LOOP_CTL_GET_FREE);
                if (nr < 0)
                        return -errno;

                if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
                        return -ENOMEM;

                loop = open(loopdev, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|open_flags);
                if (loop < 0) {
                        /* Somebody might've gotten the same number from the kernel, used the device,
                         * and called LOOP_CTL_REMOVE on it. Let's retry with a new number. */
                        if (!ERRNO_IS_DEVICE_ABSENT(errno))
                                return -errno;
                } else {
                        r = loop_configure(loop, nr, &config, &try_loop_configure, &seqnum, &timestamp, &lock_fd);
                        if (r >= 0) {
                                loop_with_fd = TAKE_FD(loop);
                                break;
                        }
                        if (!IN_SET(r, -EBUSY, -EUCLEAN)) /* Busy, or some left-over partition devices that
                                                           * were cleaned up. */
                                return r;
                }

                /* OK, this didn't work, let's try again a bit later, but first release the lock on the
                 * control device */
                if (flock(control, LOCK_UN) < 0)
                        return -errno;

                if (++n_attempts >= 64) /* Give up eventually */
                        return -EBUSY;

                /* Now close the loop device explicitly. This will release any lock acquired by
                 * attach_empty_file() or similar, while we sleep below. */
                loop = safe_close(loop);
                loopdev = mfree(loopdev);

                /* Wait some random time, to make collision less likely. Let's pick a random time in the
                 * range 0ms…250ms, linearly scaled by the number of failed attempts. */
                (void) usleep(random_u64_range(UINT64_C(10) * USEC_PER_MSEC +
                                               UINT64_C(240) * USEC_PER_MSEC * n_attempts/64));
        }

        if (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO)) {
                struct loop_info64 info;

                if (ioctl(loop_with_fd, LOOP_GET_STATUS64, &info) < 0)
                        return -errno;

#if HAVE_VALGRIND_MEMCHECK_H
                VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info));
#endif

                /* On older kernels (<= 5.3) it was necessary to set the block size of the loopback block
                 * device to the logical block size of the underlying file system. Since there was no nice
                 * way to query the value, we are not bothering to do this however. On newer kernels the
                 * block size is propagated automatically and does not require intervention from us. We'll
                 * check here if enabling direct IO worked, to make this easily debuggable however.
                 *
                 * (Should anyone really care and actually wants direct IO on old kernels: it might be worth
                 * enabling direct IO with iteratively larger block sizes until it eventually works.) */
                if (!FLAGS_SET(info.lo_flags, LO_FLAGS_DIRECT_IO))
                        log_debug("Could not enable direct IO mode, proceeding in buffered IO mode.");
        }

        if (fstat(loop_with_fd, &st) < 0)
                return -errno;
        assert(S_ISBLK(st.st_mode));

        uint64_t diskseq = 0;
        r = fd_get_diskseq(loop_with_fd, &diskseq);
        if (r < 0 && r != -EOPNOTSUPP)
                return r;

        switch (lock_op & ~LOCK_NB) {
        case LOCK_EX: /* Already in effect */
                break;
        case LOCK_SH: /* Downgrade */
                if (flock(lock_fd, lock_op) < 0)
                        return -errno;
                break;
        case LOCK_UN: /* Release */
                lock_fd = safe_close(lock_fd);
                break;
        default:
                assert_not_reached();
        }

        d = new(LoopDevice, 1);
        if (!d)
                return -ENOMEM;
        *d = (LoopDevice) {
                .fd = TAKE_FD(loop_with_fd),
                .lock_fd = TAKE_FD(lock_fd),
                .node = TAKE_PTR(loopdev),
                .nr = nr,
                .devno = st.st_rdev,
                .diskseq = diskseq,
                .uevent_seqnum_not_before = seqnum,
                .timestamp_not_before = timestamp,
        };

        log_debug("Successfully acquired %s, devno=%u:%u, nr=%i, diskseq=%" PRIu64,
                  d->node,
                  major(d->devno), minor(d->devno),
                  d->nr,
                  d->diskseq);

        *ret = d;
        return d->fd;
}

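/* Applies the $SYSTEMD_LOOP_DIRECT_IO override: LO_FLAGS_DIRECT_IO is turned on by default, and only
 * turned off if the environment variable is explicitly set to false. */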
static uint32_t loop_flags_mangle(uint32_t loop_flags) {
        int r;

        r = getenv_bool("SYSTEMD_LOOP_DIRECT_IO");
        if (r < 0 && r != -ENXIO)
                log_debug_errno(r, "Failed to parse $SYSTEMD_LOOP_DIRECT_IO, ignoring: %m");

        return UPDATE_FLAG(loop_flags, LO_FLAGS_DIRECT_IO, r != 0); /* Turn on LO_FLAGS_DIRECT_IO by default, unless explicitly configured to off. */
}

int loop_device_make(
                int fd,
                int open_flags,
                uint64_t offset,
                uint64_t size,
                uint32_t loop_flags,
                int lock_op,
                LoopDevice **ret) {

        assert(fd >= 0);
        assert(ret);

        return loop_device_make_internal(
                        fd,
                        open_flags,
                        offset,
                        size,
                        loop_flags_mangle(loop_flags),
                        lock_op,
                        ret);
}

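/* Like loop_device_make(), but opens the backing file or device by path first. O_DIRECT is attempted if
 * requested, with a fallback to buffered I/O; if open_flags is passed as < 0, the file is opened
 * read-write if possible and read-only otherwise. */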
int loop_device_make_by_path(
                const char *path,
                int open_flags,
                uint32_t loop_flags,
                int lock_op,
                LoopDevice **ret) {

        int r, basic_flags, direct_flags, rdwr_flags;
        _cleanup_close_ int fd = -1;
        bool direct = false;

        assert(path);
        assert(ret);
        assert(open_flags < 0 || IN_SET(open_flags, O_RDWR, O_RDONLY));

        /* Passing < 0 as open_flags here means we'll try to open the device writable if we can, retrying
         * read-only if we cannot. */

        loop_flags = loop_flags_mangle(loop_flags);

        /* Let's open with O_DIRECT if we can. But not all file systems support that, hence fall back to
         * non-O_DIRECT mode automatically, if it fails. */

        basic_flags = O_CLOEXEC|O_NONBLOCK|O_NOCTTY;
        direct_flags = FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) ? O_DIRECT : 0;
        rdwr_flags = open_flags >= 0 ? open_flags : O_RDWR;

        fd = open(path, basic_flags|direct_flags|rdwr_flags);
        if (fd < 0 && direct_flags != 0) /* If we had O_DIRECT on, and things failed with that, let's immediately try again without */
                fd = open(path, basic_flags|rdwr_flags);
        else
                direct = direct_flags != 0;
        if (fd < 0) {
                r = -errno;

                /* Retry read-only? */
                if (open_flags >= 0 || !(ERRNO_IS_PRIVILEGE(r) || r == -EROFS))
                        return r;

                fd = open(path, basic_flags|direct_flags|O_RDONLY);
                if (fd < 0 && direct_flags != 0) /* as above */
                        fd = open(path, basic_flags|O_RDONLY);
                else
                        direct = direct_flags != 0;
                if (fd < 0)
                        return r; /* Propagate original error */

                open_flags = O_RDONLY;
        } else if (open_flags < 0)
                open_flags = O_RDWR;

        log_debug("Opened '%s' in %s access mode%s, with O_DIRECT %s%s.",
                  path,
                  open_flags == O_RDWR ? "O_RDWR" : "O_RDONLY",
                  open_flags != rdwr_flags ? " (O_RDWR was requested but not allowed)" : "",
                  direct ? "enabled" : "disabled",
                  direct != (direct_flags != 0) ? " (O_DIRECT was requested but not supported)" : "");

        return loop_device_make_internal(fd, open_flags, 0, 0, loop_flags, lock_op, ret);
}

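/* Releases the LoopDevice object. The device is synced; unless it was relinquished (or not allocated by
 * us) it is also torn down: locked, its partitions removed, the backing file detached via LOOP_CLR_FD, and
 * the device freed again via LOOP_CTL_REMOVE (retried for a while if it is still busy). */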
LoopDevice* loop_device_unref(LoopDevice *d) {
        int r;

        if (!d)
                return NULL;

        d->lock_fd = safe_close(d->lock_fd);

        if (d->fd >= 0) {
                /* Implicitly sync the device, since otherwise in-flight blocks might not get written */
                if (fsync(d->fd) < 0)
                        log_debug_errno(errno, "Failed to sync loop block device, ignoring: %m");

                if (d->nr >= 0 && !d->relinquished) {
                        /* We are supposed to clear the loopback device. Let's do this synchronously: lock
                         * the device, manually remove all partitions and then clear it. This should ensure
                         * udev doesn't concurrently access the devices, and we can be reasonably sure that
                         * once we are done here the device is cleared and all its partition children
                         * removed. Note that we lock our primary device fd here (and not a separate locking
                         * fd, as we do during allocation, since we want to keep the lock all the way through
                         * the LOOP_CLR_FD, but that call would fail if we had more than one fd open.) */

                        if (flock(d->fd, LOCK_EX) < 0)
                                log_debug_errno(errno, "Failed to lock loop block device, ignoring: %m");

                        r = block_device_remove_all_partitions(d->fd);
                        if (r < 0)
                                log_debug_errno(r, "Failed to remove partitions of loopback block device, ignoring: %m");

                        if (ioctl(d->fd, LOOP_CLR_FD) < 0)
                                log_debug_errno(errno, "Failed to clear loop device, ignoring: %m");
                }

                safe_close(d->fd);
        }

        if (d->nr >= 0 && !d->relinquished) {
                _cleanup_close_ int control = -1;

                control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
                if (control < 0)
                        log_warning_errno(errno,
                                          "Failed to open loop control device, cannot remove loop device %s: %m",
                                          strna(d->node));
                else
                        for (unsigned n_attempts = 0;;) {
                                if (ioctl(control, LOOP_CTL_REMOVE, d->nr) >= 0)
                                        break;
                                if (errno != EBUSY || ++n_attempts >= 64) {
                                        log_warning_errno(errno, "Failed to remove device %s: %m", strna(d->node));
                                        break;
                                }
                                (void) usleep(50 * USEC_PER_MSEC);
                        }
        }

        free(d->node);
        return mfree(d);
}

void loop_device_relinquish(LoopDevice *d) {
        assert(d);

        /* Don't attempt to clean up the loop device anymore from this point on. Leave the cleaning up to
         * the kernel itself, using the loop device "auto-clear" logic we already turned on when creating
         * the device. */

        d->relinquished = true;
}

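/* Undoes loop_device_relinquish(): the device will be cleaned up again when the object is released. */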
void loop_device_unrelinquish(LoopDevice *d) {
        assert(d);
        d->relinquished = false;
}

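/* Wraps an already existing loop (or other block) device node in a LoopDevice object, without allocating
 * or configuring anything. The object is marked as relinquished, so releasing it will not detach the
 * device. Optionally takes a BSD lock on it, as requested via lock_op. */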
int loop_device_open(
                const char *loop_path,
                int open_flags,
                int lock_op,
                LoopDevice **ret) {

        _cleanup_close_ int loop_fd = -1, lock_fd = -1;
        _cleanup_free_ char *p = NULL;
        struct loop_info64 info;
        struct stat st;
        LoopDevice *d;
        int nr;

        assert(loop_path);
        assert(IN_SET(open_flags, O_RDWR, O_RDONLY));
        assert(ret);

        loop_fd = open(loop_path, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|open_flags);
        if (loop_fd < 0)
                return -errno;

        if (fstat(loop_fd, &st) < 0)
                return -errno;
        if (!S_ISBLK(st.st_mode))
                return -ENOTBLK;

        if (ioctl(loop_fd, LOOP_GET_STATUS64, &info) >= 0) {
#if HAVE_VALGRIND_MEMCHECK_H
                /* Valgrind currently doesn't know LOOP_GET_STATUS64. Remove this once it does */
                VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info));
#endif
                nr = info.lo_number;
        } else
                nr = -1;

        if ((lock_op & ~LOCK_NB) != LOCK_UN) {
                lock_fd = open_lock_fd(loop_fd, lock_op);
                if (lock_fd < 0)
                        return lock_fd;
        }

        p = strdup(loop_path);
        if (!p)
                return -ENOMEM;

        d = new(LoopDevice, 1);
        if (!d)
                return -ENOMEM;

        *d = (LoopDevice) {
                .fd = TAKE_FD(loop_fd),
                .lock_fd = TAKE_FD(lock_fd),
                .nr = nr,
                .node = TAKE_PTR(p),
                .relinquished = true, /* It's not ours, don't try to destroy it when this object is freed */
                .devno = st.st_rdev,
                .uevent_seqnum_not_before = UINT64_MAX,
                .timestamp_not_before = USEC_INFINITY,
        };

        *ret = d;
        return d->fd;
}

static int resize_partition(int partition_fd, uint64_t offset, uint64_t size) {
        char sysfs[STRLEN("/sys/dev/block/:/partition") + 2*DECIMAL_STR_MAX(dev_t) + 1];
        _cleanup_free_ char *buffer = NULL;
        uint64_t current_offset, current_size, partno;
        _cleanup_close_ int whole_fd = -1;
        struct stat st;
        dev_t devno;
        int r;

        assert(partition_fd >= 0);

        /* Resizes the partition the loopback device refers to (assuming it refers to one instead of an
         * actual loopback device), and changes the offset, if needed. This is a fancy wrapper around
         * BLKPG_RESIZE_PARTITION. */

        if (fstat(partition_fd, &st) < 0)
                return -errno;

        assert(S_ISBLK(st.st_mode));

        xsprintf(sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/partition", DEVNUM_FORMAT_VAL(st.st_rdev));
        r = read_one_line_file(sysfs, &buffer);
        if (r == -ENOENT) /* not a partition, cannot resize */
                return -ENOTTY;
        if (r < 0)
                return r;
        r = safe_atou64(buffer, &partno);
        if (r < 0)
                return r;

        xsprintf(sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/start", DEVNUM_FORMAT_VAL(st.st_rdev));

        buffer = mfree(buffer);
        r = read_one_line_file(sysfs, &buffer);
        if (r < 0)
                return r;
        r = safe_atou64(buffer, &current_offset);
        if (r < 0)
                return r;
        if (current_offset > UINT64_MAX/512U)
                return -EINVAL;
        current_offset *= 512U;

        if (ioctl(partition_fd, BLKGETSIZE64, &current_size) < 0)
                return -EINVAL;

        if (size == UINT64_MAX && offset == UINT64_MAX)
                return 0;
        if (current_size == size && current_offset == offset)
                return 0;

        xsprintf(sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/../dev", DEVNUM_FORMAT_VAL(st.st_rdev));

        buffer = mfree(buffer);
        r = read_one_line_file(sysfs, &buffer);
        if (r < 0)
                return r;
        r = parse_devnum(buffer, &devno);
        if (r < 0)
                return r;

        whole_fd = r = device_open_from_devnum(S_IFBLK, devno, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, NULL);
        if (r < 0)
                return r;

        return block_device_resize_partition(
                        whole_fd,
                        partno,
                        offset == UINT64_MAX ? current_offset : offset,
                        size == UINT64_MAX ? current_size : size);
}

int loop_device_refresh_size(LoopDevice *d, uint64_t offset, uint64_t size) {
        struct loop_info64 info;

        assert(d);
        assert(d->fd >= 0);

        /* Changes the offset/start of the loop device relative to the beginning of the underlying file or
         * block device. If this loop device actually refers to a partition and not a loopback device, we'll
         * try to adjust the partition offsets instead.
         *
         * If either offset or size is UINT64_MAX we won't change that parameter. */

        if (d->nr < 0) /* not a loopback device */
                return resize_partition(d->fd, offset, size);

        if (ioctl(d->fd, LOOP_GET_STATUS64, &info) < 0)
                return -errno;

#if HAVE_VALGRIND_MEMCHECK_H
        /* Valgrind currently doesn't know LOOP_GET_STATUS64. Remove this once it does */
        VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info));
#endif

        if (size == UINT64_MAX && offset == UINT64_MAX)
                return 0;
        if (info.lo_sizelimit == size && info.lo_offset == offset)
                return 0;

        if (size != UINT64_MAX)
                info.lo_sizelimit = size;
        if (offset != UINT64_MAX)
                info.lo_offset = offset;

        return RET_NERRNO(ioctl(d->fd, LOOP_SET_STATUS64, &info));
}

int loop_device_flock(LoopDevice *d, int operation) {
        assert(IN_SET(operation & ~LOCK_NB, LOCK_UN, LOCK_SH, LOCK_EX));
        assert(d);

        /* When unlocking just close the lock fd */
        if ((operation & ~LOCK_NB) == LOCK_UN) {
                d->lock_fd = safe_close(d->lock_fd);
                return 0;
        }

        /* If we had no lock fd so far, create one and lock it right-away */
        if (d->lock_fd < 0) {
                assert(d->fd >= 0);

                d->lock_fd = open_lock_fd(d->fd, operation);
                if (d->lock_fd < 0)
                        return d->lock_fd;

                return 0;
        }

        /* Otherwise change the current lock mode on the existing fd */
        return RET_NERRNO(flock(d->lock_fd, operation));
}

int loop_device_sync(LoopDevice *d) {
        assert(d);
        assert(d->fd >= 0);

        /* We also do this implicitly in loop_device_unref(). Doing this explicitly here has the benefit that
         * we can check the return value though. */

        return RET_NERRNO(fsync(d->fd));
}