]>
Commit | Line | Data |
---|---|---|
db9ecf05 | 1 | /* SPDX-License-Identifier: LGPL-2.1-or-later */ |
8c1be37e | 2 | |
10c1b188 LP |
3 | #if HAVE_VALGRIND_MEMCHECK_H |
4 | #include <valgrind/memcheck.h> | |
5 | #endif | |
6 | ||
dccca82b | 7 | #include <errno.h> |
8c1be37e | 8 | #include <fcntl.h> |
f1443709 LP |
9 | #include <linux/blkpg.h> |
10 | #include <linux/fs.h> | |
8c1be37e | 11 | #include <linux/loop.h> |
441ec804 | 12 | #include <sys/file.h> |
8c1be37e | 13 | #include <sys/ioctl.h> |
f2d9213f | 14 | #include <unistd.h> |
8c1be37e | 15 | |
021bf175 LP |
16 | #include "sd-device.h" |
17 | ||
8c1be37e | 18 | #include "alloc-util.h" |
86c1c1f3 | 19 | #include "blockdev-util.h" |
fcd8a19d | 20 | #include "data-fd-util.h" |
021bf175 | 21 | #include "device-util.h" |
7176f06c | 22 | #include "devnum-util.h" |
22ee78a8 | 23 | #include "dissect-image.h" |
e8c7c4d9 | 24 | #include "env-util.h" |
b0a94268 | 25 | #include "errno-util.h" |
8c1be37e | 26 | #include "fd-util.h" |
972c8db5 | 27 | #include "fs-util.h" |
f1443709 | 28 | #include "fileio.h" |
8c1be37e | 29 | #include "loop-util.h" |
86c1c1f3 | 30 | #include "missing_loop.h" |
f1443709 | 31 | #include "parse-util.h" |
e77cab82 | 32 | #include "path-util.h" |
b202ec20 | 33 | #include "random-util.h" |
3cc44114 | 34 | #include "stat-util.h" |
f1443709 | 35 | #include "stdio-util.h" |
f2d9213f | 36 | #include "string-util.h" |
021bf175 | 37 | #include "tmpfile-util.h" |
8c1be37e | 38 | |
/* Cleanup handler for a loopback block device fd: issues LOOP_CLR_FD on it and then closes it. A
 * negative *fd is treated as "nothing to clean up". Failures of either step are deliberately ignored. */
static void cleanup_clear_loop_close(int *fd) {
        int loop_fd = *fd;

        if (loop_fd >= 0) {
                (void) ioctl(loop_fd, LOOP_CLR_FD);
                (void) safe_close(loop_fd);
        }
}
46 | ||
/* Probes the loopback block device referenced by fd with LOOP_GET_STATUS64.
 *
 * Returns > 0 if the ioctl succeeds (device is bound), 0 if it fails with ENXIO (device is not bound),
 * and a negative errno-style value for any other failure. */
static int loop_is_bound(int fd) {
        struct loop_info64 info;

        if (ioctl(ASSERT_FD(fd), LOOP_GET_STATUS64, &info) >= 0)
                return true; /* bound! */

        /* ENXIO is the "not bound" answer, everything else is a real error */
        return errno == ENXIO ? false : -errno;
}
59 | ||
31c75fcc LP |
60 | static int get_current_uevent_seqnum(uint64_t *ret) { |
61 | _cleanup_free_ char *p = NULL; | |
62 | int r; | |
63 | ||
64 | r = read_full_virtual_file("/sys/kernel/uevent_seqnum", &p, NULL); | |
65 | if (r < 0) | |
66 | return log_debug_errno(r, "Failed to read current uevent sequence number: %m"); | |
67 | ||
a145f8c0 | 68 | r = safe_atou64(strstrip(p), ret); |
31c75fcc LP |
69 | if (r < 0) |
70 | return log_debug_errno(r, "Failed to parse current uevent sequence number: %s", p); | |
71 | ||
72 | return 0; | |
73 | } | |
74 | ||
7f52206a | 75 | static int open_lock_fd(int primary_fd, int operation) { |
254d1313 | 76 | _cleanup_close_ int lock_fd = -EBADF; |
7f52206a | 77 | |
10719a6f | 78 | assert(IN_SET(operation & ~LOCK_NB, LOCK_SH, LOCK_EX)); |
7f52206a | 79 | |
8e398254 | 80 | lock_fd = fd_reopen(ASSERT_FD(primary_fd), O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); |
7f52206a LP |
81 | if (lock_fd < 0) |
82 | return lock_fd; | |
10719a6f | 83 | |
7f52206a LP |
84 | if (flock(lock_fd, operation) < 0) |
85 | return -errno; | |
86 | ||
10719a6f | 87 | return TAKE_FD(lock_fd); |
7f52206a LP |
88 | } |
89 | ||
/* If the configuration in c requests LO_FLAGS_DIRECT_IO, queries the loopback device behind fd via
 * LOOP_GET_STATUS64 and checks that the flag is actually in effect.
 *
 * Returns 0 if direct IO was not requested or is in effect, -ENOANO if it was requested but is not in
 * effect, and -errno if the status ioctl fails. */
static int loop_configure_verify_direct_io(int fd, const struct loop_config *c) {
        struct loop_info64 info;

        assert(fd >= 0);
        assert(c);

        /* Nothing to verify unless direct IO was asked for */
        if (!FLAGS_SET(c->info.lo_flags, LO_FLAGS_DIRECT_IO))
                return 0;

        if (ioctl(fd, LOOP_GET_STATUS64, &info) < 0)
                return log_debug_errno(errno, "Failed to issue LOOP_GET_STATUS64: %m");

#if HAVE_VALGRIND_MEMCHECK_H
        VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info));
#endif

        /* On older kernels (<= 5.3) it was necessary to set the block size of the loopback block
         * device to the logical block size of the underlying file system. Since there was no nice
         * way to query the value, we are not bothering to do this however. On newer kernels the
         * block size is propagated automatically and does not require intervention from us. We'll
         * check here if enabling direct IO worked, to make this easily debuggable however.
         *
         * (Should anyone really care and actually wants direct IO on old kernels: it might be worth
         * enabling direct IO with iteratively larger block sizes until it eventually works.)
         *
         * On older kernels (e.g.: 5.10) when this is attempted on a file stored on a dm-crypt
         * backed partition the kernel will start returning I/O errors when accessing the mounted
         * loop device, so return a recognizable error that causes the operation to be started
         * from scratch without the LO_FLAGS_DIRECT_IO flag. */
        if (FLAGS_SET(info.lo_flags, LO_FLAGS_DIRECT_IO))
                return 0;

        return log_debug_errno(
                        SYNTHETIC_ERRNO(ENOANO),
                        "Could not enable direct IO mode, retrying in buffered IO mode.");
}
125 | ||
/* Checks, after a successful LOOP_CONFIGURE, that the kernel actually applied what was requested in c:
 * block size, size limit, partition scanning and direct IO.
 *
 * Returns > 0 if everything requested is in effect, 0 if at least one of block size, size limit or
 * partition scanning was not honoured (i.e. LOOP_CONFIGURE is considered broken), and a negative
 * errno-style value if one of the queries fails or the direct IO verification reports an error. */
static int loop_configure_verify(int fd, const struct loop_config *c) {
        bool honoured = true;
        int r;

        assert(fd >= 0);
        assert(c);

        if (c->block_size != 0) {
                uint32_t actual_ssz;

                r = blockdev_get_sector_size(fd, &actual_ssz);
                if (r < 0)
                        return r;

                if (actual_ssz != c->block_size) {
                        log_debug("LOOP_CONFIGURE didn't honour requested block size %" PRIu32 ", got %" PRIu32 " instead. Ignoring.", c->block_size, actual_ssz);
                        honoured = false;
                }
        }

        if (c->info.lo_sizelimit != 0) {
                /* Kernel 5.8 vanilla doesn't properly propagate the size limit into the
                 * block device. If it's used, let's immediately check if it had the desired
                 * effect hence. And if not use classic LOOP_SET_STATUS64. */
                uint64_t actual_size;

                r = blockdev_get_device_size(fd, &actual_size);
                if (r < 0)
                        return r;

                if (actual_size != c->info.lo_sizelimit) {
                        log_debug("LOOP_CONFIGURE is broken, doesn't honour .info.lo_sizelimit. Falling back to LOOP_SET_STATUS64.");
                        honoured = false;
                }
        }

        if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_PARTSCAN)) {
                /* Kernel 5.8 vanilla doesn't properly propagate the partition scanning flag
                 * into the block device. Let's hence verify if things work correctly here
                 * before returning. */

                r = blockdev_partscan_enabled(fd);
                if (r < 0)
                        return r;
                if (r == 0) {
                        log_debug("LOOP_CONFIGURE is broken, doesn't honour LO_FLAGS_PARTSCAN. Falling back to LOOP_SET_STATUS64.");
                        honoured = false;
                }
        }

        /* Direct IO problems are reported as an error rather than as "broken" */
        r = loop_configure_verify_direct_io(fd, c);
        if (r < 0)
                return r;

        return honoured;
}
182 | ||
183 | static int loop_configure_fallback(int fd, const struct loop_config *c) { | |
184 | struct loop_info64 info_copy; | |
1163ddb3 | 185 | int r; |
54ba7daf YW |
186 | |
187 | assert(fd >= 0); | |
188 | assert(c); | |
189 | ||
190 | /* Only some of the flags LOOP_CONFIGURE can set are also settable via LOOP_SET_STATUS64, hence mask | |
191 | * them out. */ | |
192 | info_copy = c->info; | |
193 | info_copy.lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS; | |
194 | ||
195 | /* Since kernel commit 5db470e229e22b7eda6e23b5566e532c96fb5bc3 (kernel v5.0) the LOOP_SET_STATUS64 | |
fd83c98e | 196 | * ioctl can return EAGAIN in case we change the info.lo_offset field, if someone else is accessing the |
54ba7daf YW |
197 | * block device while we try to reconfigure it. This is a pretty common case, since udev might |
198 | * instantly start probing the device as soon as we attach an fd to it. Hence handle it in two ways: | |
199 | * first, let's take the BSD lock to ensure that udev will not step in between the point in | |
200 | * time where we attach the fd and where we reconfigure the device. Secondly, let's wait 50ms on | |
201 | * EAGAIN and retry. The former should be an efficient mechanism to avoid we have to wait 50ms | |
202 | * needlessly if we are just racing against udev. The latter is protection against all other cases, | |
203 | * i.e. peers that do not take the BSD lock. */ | |
204 | ||
205 | for (unsigned n_attempts = 0;;) { | |
206 | if (ioctl(fd, LOOP_SET_STATUS64, &info_copy) >= 0) | |
207 | break; | |
208 | ||
209 | if (errno != EAGAIN || ++n_attempts >= 64) | |
210 | return log_debug_errno(errno, "Failed to configure loopback block device: %m"); | |
211 | ||
212 | /* Sleep some random time, but at least 10ms, at most 250ms. Increase the delay the more | |
213 | * failed attempts we see */ | |
4251512e | 214 | (void) usleep_safe(UINT64_C(10) * USEC_PER_MSEC + |
54ba7daf YW |
215 | random_u64_range(UINT64_C(240) * USEC_PER_MSEC * n_attempts/64)); |
216 | } | |
217 | ||
218 | /* Work around a kernel bug, where changing offset/size of the loopback device doesn't correctly | |
219 | * invalidate the buffer cache. For details see: | |
220 | * | |
221 | * https://android.googlesource.com/platform/system/apex/+/bef74542fbbb4cd629793f4efee8e0053b360570 | |
222 | * | |
223 | * This was fixed in kernel 5.0, see: | |
224 | * | |
225 | * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5db470e229e22b7eda6e23b5566e532c96fb5bc3 | |
226 | * | |
227 | * We'll run the work-around here in the legacy LOOP_SET_STATUS64 codepath. In the LOOP_CONFIGURE | |
228 | * codepath above it should not be necessary. */ | |
229 | if (c->info.lo_offset != 0 || c->info.lo_sizelimit != 0) | |
230 | if (ioctl(fd, BLKFLSBUF, 0) < 0) | |
231 | log_debug_errno(errno, "Failed to issue BLKFLSBUF ioctl, ignoring: %m"); | |
232 | ||
1163ddb3 LP |
233 | /* If a block size is requested then try to configure it. If that doesn't work, ignore errors, but |
234 | * afterwards, let's validate what is in effect, and if it doesn't match what we want, fail */ | |
235 | if (c->block_size != 0) { | |
236 | uint32_t ssz; | |
237 | ||
238 | if (ioctl(fd, LOOP_SET_BLOCK_SIZE, (unsigned long) c->block_size) < 0) | |
239 | log_debug_errno(errno, "Failed to set sector size, ignoring: %m"); | |
240 | ||
241 | r = blockdev_get_sector_size(fd, &ssz); | |
242 | if (r < 0) | |
243 | return log_debug_errno(r, "Failed to read sector size: %m"); | |
244 | if (ssz != c->block_size) | |
245 | return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Sector size of loopback device doesn't match what we requested, refusing."); | |
246 | } | |
247 | ||
54ba7daf YW |
248 | /* LO_FLAGS_DIRECT_IO is a flags we need to configure via explicit ioctls. */ |
249 | if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_DIRECT_IO)) | |
250 | if (ioctl(fd, LOOP_SET_DIRECT_IO, 1UL) < 0) | |
251 | log_debug_errno(errno, "Failed to enable direct IO mode, ignoring: %m"); | |
252 | ||
253 | return loop_configure_verify_direct_io(fd, c); | |
254 | } | |
255 | ||
95c50092 | 256 | static int loop_configure( |
021bf175 | 257 | int nr, |
da4fd288 YW |
258 | int open_flags, |
259 | int lock_op, | |
95c50092 | 260 | const struct loop_config *c, |
da4fd288 | 261 | LoopDevice **ret) { |
95c50092 | 262 | |
bb273a51 YW |
263 | static bool loop_configure_broken = false; |
264 | ||
da4fd288 | 265 | _cleanup_(sd_device_unrefp) sd_device *dev = NULL; |
254d1313 ZJS |
266 | _cleanup_(cleanup_clear_loop_close) int loop_with_fd = -EBADF; /* This must be declared before lock_fd. */ |
267 | _cleanup_close_ int fd = -EBADF, lock_fd = -EBADF; | |
da4fd288 YW |
268 | _cleanup_free_ char *node = NULL; |
269 | uint64_t diskseq = 0, seqnum = UINT64_MAX; | |
270 | usec_t timestamp = USEC_INFINITY; | |
271 | dev_t devno; | |
86c1c1f3 LP |
272 | int r; |
273 | ||
021bf175 | 274 | assert(nr >= 0); |
86c1c1f3 | 275 | assert(c); |
da4fd288 YW |
276 | assert(ret); |
277 | ||
278 | if (asprintf(&node, "/dev/loop%i", nr) < 0) | |
432f1fa8 | 279 | return log_oom_debug(); |
da4fd288 YW |
280 | |
281 | r = sd_device_new_from_devname(&dev, node); | |
282 | if (r < 0) | |
432f1fa8 | 283 | return log_debug_errno(r, "Failed to create sd_device object for \"%s\": %m", node); |
da4fd288 YW |
284 | |
285 | r = sd_device_get_devnum(dev, &devno); | |
286 | if (r < 0) | |
432f1fa8 | 287 | return log_device_debug_errno(dev, r, "Failed to get devnum: %m"); |
da4fd288 YW |
288 | |
289 | fd = sd_device_open(dev, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|open_flags); | |
290 | if (fd < 0) | |
432f1fa8 | 291 | return log_device_debug_errno(dev, fd, "Failed to open device: %m"); |
95c50092 | 292 | |
021bf175 LP |
293 | /* Let's lock the device before we do anything. We take the BSD lock on a second, separately opened |
294 | * fd for the device. udev after all watches for close() events (specifically IN_CLOSE_WRITE) on | |
295 | * block devices to reprobe them, hence by having a separate fd we will later close() we can ensure | |
296 | * we trigger udev after everything is done. If we'd lock our own fd instead and keep it open for a | |
297 | * long time udev would possibly never run on it again, even though the fd is unlocked, simply | |
298 | * because we never close() it. It also has the nice benefit we can use the _cleanup_close_ logic to | |
299 | * automatically release the lock, after we are done. */ | |
7f52206a | 300 | lock_fd = open_lock_fd(fd, LOCK_EX); |
021bf175 | 301 | if (lock_fd < 0) |
432f1fa8 YW |
302 | return log_device_debug_errno(dev, lock_fd, "Failed to acquire lock: %m"); |
303 | ||
304 | log_device_debug(dev, "Acquired exclusive lock."); | |
021bf175 | 305 | |
53274fdb YW |
306 | /* Let's see if backing file is really unattached. Someone may already attach a backing file without |
307 | * taking BSD lock. */ | |
308 | r = loop_is_bound(fd); | |
309 | if (r < 0) | |
432f1fa8 | 310 | return log_device_debug_errno(dev, r, "Failed to check if the loopback block device is bound: %m"); |
53274fdb | 311 | if (r > 0) |
432f1fa8 YW |
312 | return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EBUSY), |
313 | "The loopback block device is already bound, ignoring."); | |
53274fdb | 314 | |
021bf175 LP |
315 | /* Let's see if the device is really detached, i.e. currently has no associated partition block |
316 | * devices. On various kernels (such as 5.8) it is possible to have a loopback block device that | |
247738b4 LP |
317 | * superficially is detached but still has partition block devices associated for it. Let's then |
318 | * manually remove the partitions via BLKPG, and tell the caller we did that via EUCLEAN, so they try | |
319 | * again. */ | |
833106b8 | 320 | r = block_device_remove_all_partitions(dev, fd); |
021bf175 | 321 | if (r < 0) |
432f1fa8 | 322 | return log_device_debug_errno(dev, r, "Failed to remove partitions on the loopback block device: %m"); |
833106b8 YW |
323 | if (r > 0) |
324 | /* Removed all partitions. Let's report this to the caller, to try again, and count this as | |
53274fdb | 325 | * an attempt. */ |
432f1fa8 YW |
326 | return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EUCLEAN), |
327 | "Removed partitions on the loopback block device."); | |
021bf175 | 328 | |
bb273a51 | 329 | if (!loop_configure_broken) { |
31c75fcc LP |
330 | /* Acquire uevent seqnum immediately before attaching the loopback device. This allows |
331 | * callers to ignore all uevents with a seqnum before this one, if they need to associate | |
332 | * uevent with this attachment. Doing so isn't race-free though, as uevents that happen in | |
333 | * the window between this reading of the seqnum, and the LOOP_CONFIGURE call might still be | |
334 | * mistaken as originating from our attachment, even though might be caused by an earlier | |
335 | * use. But doing this at least shortens the race window a bit. */ | |
336 | r = get_current_uevent_seqnum(&seqnum); | |
337 | if (r < 0) | |
432f1fa8 | 338 | return log_device_debug_errno(dev, r, "Failed to get the current uevent seqnum: %m"); |
54ba7daf | 339 | |
8ede1e86 | 340 | timestamp = now(CLOCK_MONOTONIC); |
31c75fcc | 341 | |
95c50092 LP |
342 | if (ioctl(fd, LOOP_CONFIGURE, c) < 0) { |
343 | /* Do fallback only if LOOP_CONFIGURE is not supported, propagate all other | |
344 | * errors. Note that the kernel is weird: non-existing ioctls currently return EINVAL | |
345 | * rather than ENOTTY on loopback block devices. They should fix that in the kernel, | |
346 | * but in the meantime we accept both here. */ | |
347 | if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL) | |
432f1fa8 | 348 | return log_device_debug_errno(dev, errno, "ioctl(LOOP_CONFIGURE) failed: %m"); |
86c1c1f3 | 349 | |
bb273a51 | 350 | loop_configure_broken = true; |
95c50092 | 351 | } else { |
da4fd288 YW |
352 | loop_with_fd = TAKE_FD(fd); |
353 | ||
354 | r = loop_configure_verify(loop_with_fd, c); | |
54ba7daf | 355 | if (r < 0) |
432f1fa8 | 356 | return log_device_debug_errno(dev, r, "Failed to verify if loopback block device is correctly configured: %m"); |
54ba7daf | 357 | if (r == 0) { |
95c50092 | 358 | /* LOOP_CONFIGURE doesn't work. Remember that. */ |
bb273a51 | 359 | loop_configure_broken = true; |
95c50092 LP |
360 | |
361 | /* We return EBUSY here instead of retrying immediately with LOOP_SET_FD, | |
362 | * because LOOP_CLR_FD is async: if the operation cannot be executed right | |
363 | * away it just sets the autoclear flag on the device. This means there's a | |
364 | * good chance we cannot actually reuse the loopback device right-away. Hence | |
365 | * let's assume it's busy, avoid the trouble and let the calling loop call us | |
366 | * again with a new, likely unused device. */ | |
da4fd288 | 367 | return -EBUSY; |
bb2551bd | 368 | } |
95c50092 | 369 | } |
86c1c1f3 LP |
370 | } |
371 | ||
bb273a51 YW |
372 | if (loop_configure_broken) { |
373 | /* Let's read the seqnum again, to shorten the window. */ | |
374 | r = get_current_uevent_seqnum(&seqnum); | |
375 | if (r < 0) | |
432f1fa8 | 376 | return log_device_debug_errno(dev, r, "Failed to get the current uevent seqnum: %m"); |
31c75fcc | 377 | |
bb273a51 | 378 | timestamp = now(CLOCK_MONOTONIC); |
738f29cb | 379 | |
bb273a51 | 380 | if (ioctl(fd, LOOP_SET_FD, c->fd) < 0) |
432f1fa8 | 381 | return log_device_debug_errno(dev, errno, "ioctl(LOOP_SET_FD) failed: %m"); |
86c1c1f3 | 382 | |
da4fd288 YW |
383 | loop_with_fd = TAKE_FD(fd); |
384 | ||
385 | r = loop_configure_fallback(loop_with_fd, c); | |
bb273a51 | 386 | if (r < 0) |
da4fd288 | 387 | return r; |
bb273a51 | 388 | } |
e8c7c4d9 | 389 | |
da4fd288 YW |
390 | r = fd_get_diskseq(loop_with_fd, &diskseq); |
391 | if (r < 0 && r != -EOPNOTSUPP) | |
432f1fa8 | 392 | return log_device_debug_errno(dev, r, "Failed to get diskseq: %m"); |
31c75fcc | 393 | |
da4fd288 YW |
394 | switch (lock_op & ~LOCK_NB) { |
395 | case LOCK_EX: /* Already in effect */ | |
396 | break; | |
397 | case LOCK_SH: /* Downgrade */ | |
398 | if (flock(lock_fd, lock_op) < 0) | |
432f1fa8 | 399 | return log_device_debug_errno(dev, errno, "Failed to downgrade lock level: %m"); |
da4fd288 YW |
400 | break; |
401 | case LOCK_UN: /* Release */ | |
402 | lock_fd = safe_close(lock_fd); | |
403 | break; | |
404 | default: | |
405 | assert_not_reached(); | |
406 | } | |
407 | ||
c961a8c6 LP |
408 | uint64_t device_size; |
409 | r = blockdev_get_device_size(loop_with_fd, &device_size); | |
410 | if (r < 0) | |
411 | return log_device_debug_errno(dev, r, "Failed to get loopback device size: %m"); | |
412 | ||
da4fd288 YW |
413 | LoopDevice *d = new(LoopDevice, 1); |
414 | if (!d) | |
432f1fa8 | 415 | return log_oom_debug(); |
da4fd288 YW |
416 | |
417 | *d = (LoopDevice) { | |
36d5eb0b | 418 | .n_ref = 1, |
da4fd288 YW |
419 | .fd = TAKE_FD(loop_with_fd), |
420 | .lock_fd = TAKE_FD(lock_fd), | |
421 | .node = TAKE_PTR(node), | |
422 | .nr = nr, | |
423 | .devno = devno, | |
424 | .dev = TAKE_PTR(dev), | |
425 | .diskseq = diskseq, | |
426 | .uevent_seqnum_not_before = seqnum, | |
427 | .timestamp_not_before = timestamp, | |
22ee78a8 | 428 | .sector_size = c->block_size, |
c961a8c6 | 429 | .device_size = device_size, |
6bc20134 | 430 | .created = true, |
da4fd288 | 431 | }; |
86c1c1f3 | 432 | |
da4fd288 YW |
433 | *ret = TAKE_PTR(d); |
434 | return 0; | |
e8af3bfd ZJS |
435 | } |
436 | ||
e8c7c4d9 | 437 | static int loop_device_make_internal( |
e77cab82 | 438 | const char *path, |
ed9eeb7b LP |
439 | int fd, |
440 | int open_flags, | |
441 | uint64_t offset, | |
442 | uint64_t size, | |
22ee78a8 | 443 | uint32_t sector_size, |
ed9eeb7b | 444 | uint32_t loop_flags, |
7f52206a | 445 | int lock_op, |
ed9eeb7b | 446 | LoopDevice **ret) { |
8c1be37e | 447 | |
da4fd288 | 448 | _cleanup_(loop_device_unrefp) LoopDevice *d = NULL; |
f5bb0a31 | 449 | _cleanup_close_ int reopened_fd = -EBADF, control = -EBADF; |
da4fd288 | 450 | _cleanup_free_ char *backing_file = NULL; |
86c1c1f3 | 451 | struct loop_config config; |
da4fd288 | 452 | int r, f_flags; |
8c1be37e | 453 | struct stat st; |
8c1be37e | 454 | |
8c1be37e LP |
455 | assert(ret); |
456 | assert(IN_SET(open_flags, O_RDWR, O_RDONLY)); | |
457 | ||
8e398254 | 458 | if (fstat(ASSERT_FD(fd), &st) < 0) |
8c1be37e LP |
459 | return -errno; |
460 | ||
461 | if (S_ISBLK(st.st_mode)) { | |
1996ad28 | 462 | if (offset == 0 && IN_SET(size, 0, UINT64_MAX)) |
d7654742 LP |
463 | /* If this is already a block device and we are supposed to cover the whole of it |
464 | * then store an fd to the original open device node — and do not actually create an | |
1996ad28 | 465 | * unnecessary loopback device for it. */ |
de3b7f16 | 466 | return loop_device_open_from_fd(fd, open_flags, lock_op, ret); |
ed9eeb7b LP |
467 | } else { |
468 | r = stat_verify_regular(&st); | |
469 | if (r < 0) | |
470 | return r; | |
8c1be37e LP |
471 | } |
472 | ||
e77cab82 YW |
473 | if (path) { |
474 | r = path_make_absolute_cwd(path, &backing_file); | |
475 | if (r < 0) | |
476 | return r; | |
477 | ||
478 | path_simplify(backing_file); | |
479 | } else { | |
480 | r = fd_get_path(fd, &backing_file); | |
481 | if (r < 0) | |
482 | return r; | |
483 | } | |
484 | ||
e8c7c4d9 LP |
485 | f_flags = fcntl(fd, F_GETFL); |
486 | if (f_flags < 0) | |
487 | return -errno; | |
488 | ||
489 | if (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) != FLAGS_SET(f_flags, O_DIRECT)) { | |
490 | /* If LO_FLAGS_DIRECT_IO is requested, then make sure we have the fd open with O_DIRECT, as | |
491 | * that's required. Conversely, if it's off require that O_DIRECT is off too (that's because | |
492 | * new kernels will implicitly enable LO_FLAGS_DIRECT_IO if O_DIRECT is set). | |
493 | * | |
494 | * Our intention here is that LO_FLAGS_DIRECT_IO is the primary knob, and O_DIRECT derived | |
495 | * from that automatically. */ | |
496 | ||
f5bb0a31 LB |
497 | reopened_fd = fd_reopen(fd, (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) ? O_DIRECT : 0)|O_CLOEXEC|O_NONBLOCK|open_flags); |
498 | if (reopened_fd < 0) { | |
e8c7c4d9 | 499 | if (!FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO)) |
d579c42e | 500 | return log_debug_errno(reopened_fd, "Failed to reopen file descriptor without O_DIRECT: %m"); |
e8c7c4d9 LP |
501 | |
502 | /* Some file systems might not support O_DIRECT, let's gracefully continue without it then. */ | |
d579c42e | 503 | log_debug_errno(reopened_fd, "Failed to enable O_DIRECT for backing file descriptor for loopback device. Continuing without."); |
e8c7c4d9 LP |
504 | loop_flags &= ~LO_FLAGS_DIRECT_IO; |
505 | } else | |
f5bb0a31 | 506 | fd = reopened_fd; /* From now on, operate on our new O_DIRECT fd */ |
e8c7c4d9 LP |
507 | } |
508 | ||
8c1be37e LP |
509 | control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK); |
510 | if (control < 0) | |
511 | return -errno; | |
512 | ||
22ee78a8 LP |
513 | if (sector_size == 0) |
514 | /* If no sector size is specified, default to the classic default */ | |
515 | sector_size = 512; | |
516 | else if (sector_size == UINT32_MAX) { | |
517 | ||
518 | if (S_ISBLK(st.st_mode)) | |
519 | /* If the sector size is specified as UINT32_MAX we'll propagate the sector size of | |
520 | * the underlying block device. */ | |
521 | r = blockdev_get_sector_size(fd, §or_size); | |
522 | else { | |
92651a7a | 523 | _cleanup_close_ int non_direct_io_fd = -EBADF; |
22ee78a8 LP |
524 | int probe_fd; |
525 | ||
526 | assert(S_ISREG(st.st_mode)); | |
527 | ||
528 | /* If sector size is specified as UINT32_MAX, we'll try to probe the right sector | |
529 | * size of the image in question by looking for the GPT partition header at various | |
530 | * offsets. This of course only works if the image already has a disk label. | |
531 | * | |
532 | * So here we actually want to read the file contents ourselves. This is quite likely | |
533 | * not going to work if we managed to enable O_DIRECT, because in such a case there | |
534 | * are some pretty strict alignment requirements to offset, size and target, but | |
535 | * there's no way to query what alignment specifically is actually required. Hence, | |
536 | * let's avoid the mess, and temporarily open an fd without O_DIRECT for the probing | |
537 | * logic. */ | |
538 | ||
539 | if (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO)) { | |
540 | non_direct_io_fd = fd_reopen(fd, O_RDONLY|O_CLOEXEC|O_NONBLOCK); | |
541 | if (non_direct_io_fd < 0) | |
542 | return non_direct_io_fd; | |
543 | ||
544 | probe_fd = non_direct_io_fd; | |
545 | } else | |
546 | probe_fd = fd; | |
547 | ||
548 | r = probe_sector_size(probe_fd, §or_size); | |
549 | } | |
550 | if (r < 0) | |
551 | return r; | |
552 | } | |
553 | ||
86c1c1f3 LP |
554 | config = (struct loop_config) { |
555 | .fd = fd, | |
22ee78a8 | 556 | .block_size = sector_size, |
86c1c1f3 LP |
557 | .info = { |
558 | /* Use the specified flags, but configure the read-only flag from the open flags, and force autoclear */ | |
0950526a | 559 | .lo_flags = (loop_flags & ~LO_FLAGS_READ_ONLY) | ((open_flags & O_ACCMODE) == O_RDONLY ? LO_FLAGS_READ_ONLY : 0) | LO_FLAGS_AUTOCLEAR, |
86c1c1f3 LP |
560 | .lo_offset = offset, |
561 | .lo_sizelimit = size == UINT64_MAX ? 0 : size, | |
562 | }, | |
563 | }; | |
564 | ||
0f6519d4 LP |
565 | /* Loop around LOOP_CTL_GET_FREE, since at the moment we attempt to open the returned device it might |
566 | * be gone already, taken by somebody else racing against us. */ | |
e8af3bfd | 567 | for (unsigned n_attempts = 0;;) { |
432f1fa8 | 568 | usec_t usec; |
da4fd288 | 569 | int nr; |
e8af3bfd | 570 | |
cc530466 LP |
571 | /* Let's take a lock on the control device first. On a busy system, where many programs |
572 | * attempt to allocate a loopback device at the same time, we might otherwise keep looping | |
573 | * around relatively heavy operations: asking for a free loopback device, then opening it, | |
574 | * validating it, attaching something to it. Let's serialize this whole operation, to make | |
575 | * unnecessary busywork less likely. Note that this is just something we do to optimize our | |
576 | * own code (and whoever else decides to use LOCK_EX locks for this), taking this lock is not | |
577 | * necessary, it just means it's less likely we have to iterate through this loop again and | |
4c1d50e6 LP |
578 | * again if our own code races against our own code. |
579 | * | |
580 | * Note: our lock protocol is to take the /dev/loop-control lock first, and the block device | |
581 | * lock second, if both are taken, and always in this order, to avoid ABBA locking issues. */ | |
cc530466 LP |
582 | if (flock(control, LOCK_EX) < 0) |
583 | return -errno; | |
584 | ||
0f6519d4 LP |
585 | nr = ioctl(control, LOOP_CTL_GET_FREE); |
586 | if (nr < 0) | |
587 | return -errno; | |
8c1be37e | 588 | |
da4fd288 YW |
589 | r = loop_configure(nr, open_flags, lock_op, &config, &d); |
590 | if (r >= 0) | |
591 | break; | |
cc5bae6c | 592 | |
da4fd288 YW |
593 | /* -ENODEV or friends: Somebody might've gotten the same number from the kernel, used the |
594 | * device, and called LOOP_CTL_REMOVE on it. Let's retry with a new number. | |
595 | * -EBUSY: a file descriptor is already bound to the loopback block device. | |
f5bb0a31 LB |
596 | * -EUCLEAN: some left-over partition devices that were cleaned up. |
597 | * -ENOANO: we tried to use LO_FLAGS_DIRECT_IO but the kernel rejected it. */ | |
598 | if (!ERRNO_IS_DEVICE_ABSENT(r) && !IN_SET(r, -EBUSY, -EUCLEAN, -ENOANO)) | |
2421dd72 | 599 | return r; |
01813148 | 600 | |
cc530466 LP |
601 | /* OK, this didn't work, let's try again a bit later, but first release the lock on the |
602 | * control device */ | |
603 | if (flock(control, LOCK_UN) < 0) | |
604 | return -errno; | |
605 | ||
e8af3bfd ZJS |
606 | if (++n_attempts >= 64) /* Give up eventually */ |
607 | return -EBUSY; | |
0f6519d4 | 608 | |
f5bb0a31 LB |
609 | /* If we failed to enable direct IO mode, let's retry without it. We restart the process as |
610 | * on some combination of kernel version and storage filesystem, the kernel is very unhappy | |
611 | * about a failed DIRECT_IO enablement and throws I/O errors. */ | |
612 | if (r == -ENOANO && FLAGS_SET(config.info.lo_flags, LO_FLAGS_DIRECT_IO)) { | |
613 | config.info.lo_flags &= ~LO_FLAGS_DIRECT_IO; | |
614 | open_flags &= ~O_DIRECT; | |
615 | ||
616 | int non_direct_io_fd = fd_reopen(config.fd, O_CLOEXEC|O_NONBLOCK|open_flags); | |
617 | if (non_direct_io_fd < 0) | |
618 | return log_debug_errno( | |
619 | non_direct_io_fd, | |
620 | "Failed to reopen file descriptor without O_DIRECT: %m"); | |
621 | ||
622 | safe_close(reopened_fd); | |
623 | fd = config.fd = /* For cleanups */ reopened_fd = non_direct_io_fd; | |
624 | } | |
625 | ||
b202ec20 LP |
626 | /* Wait some random time, to make collision less likely. Let's pick a random time in the |
627 | * range 0ms…250ms, linearly scaled by the number of failed attempts. */ | |
432f1fa8 YW |
628 | usec = random_u64_range(UINT64_C(10) * USEC_PER_MSEC + |
629 | UINT64_C(240) * USEC_PER_MSEC * n_attempts/64); | |
630 | log_debug("Trying again after %s.", FORMAT_TIMESPAN(usec, USEC_PER_MSEC)); | |
4251512e | 631 | (void) usleep_safe(usec); |
0f6519d4 | 632 | } |
8c1be37e | 633 | |
da4fd288 | 634 | d->backing_file = TAKE_PTR(backing_file); |
4d2a9e3e LP |
635 | d->backing_inode = st.st_ino; |
636 | d->backing_devno = st.st_dev; | |
8c1be37e | 637 | |
3b195f63 LP |
638 | log_debug("Successfully acquired %s, devno=%u:%u, nr=%i, diskseq=%" PRIu64, |
639 | d->node, | |
640 | major(d->devno), minor(d->devno), | |
641 | d->nr, | |
642 | d->diskseq); | |
643 | ||
da4fd288 YW |
644 | *ret = TAKE_PTR(d); |
645 | return 0; | |
8c1be37e LP |
646 | } |
647 | ||
/* Applies the $SYSTEMD_LOOP_DIRECT_IO environment variable to loop_flags: LO_FLAGS_DIRECT_IO is cleared
 * only if the variable parses as false, and set in every other case (true, unset, or unparsable — the
 * latter is logged at debug level and otherwise ignored). Returns the adjusted flags. */
static uint32_t loop_flags_mangle(uint32_t loop_flags) {
        bool want_direct_io;
        int r;

        r = getenv_bool("SYSTEMD_LOOP_DIRECT_IO");
        if (r < 0 && r != -ENXIO)
                log_debug_errno(r, "Failed to parse $SYSTEMD_LOOP_DIRECT_IO, ignoring: %m");

        /* Turn on LO_FLAGS_DIRECT_IO by default, unless explicitly configured to off. */
        want_direct_io = r != 0;

        return UPDATE_FLAG(loop_flags, LO_FLAGS_DIRECT_IO, want_direct_io);
}
657 | ||
658 | int loop_device_make( | |
659 | int fd, | |
660 | int open_flags, | |
661 | uint64_t offset, | |
662 | uint64_t size, | |
22ee78a8 | 663 | uint32_t sector_size, |
e8c7c4d9 | 664 | uint32_t loop_flags, |
7f52206a | 665 | int lock_op, |
e8c7c4d9 LP |
666 | LoopDevice **ret) { |
667 | ||
668 | assert(fd >= 0); | |
669 | assert(ret); | |
e8c7c4d9 LP |
670 | |
671 | return loop_device_make_internal( | |
e77cab82 | 672 | NULL, |
e8c7c4d9 LP |
673 | fd, |
674 | open_flags, | |
675 | offset, | |
676 | size, | |
22ee78a8 | 677 | sector_size, |
bfd08445 | 678 | loop_flags_mangle(loop_flags), |
7f52206a | 679 | lock_op, |
e8c7c4d9 LP |
680 | ret); |
681 | } | |
682 | ||
972c8db5 DDM |
683 | int loop_device_make_by_path_at( |
684 | int dir_fd, | |
79e8393a LP |
685 | const char *path, |
686 | int open_flags, | |
22ee78a8 | 687 | uint32_t sector_size, |
79e8393a | 688 | uint32_t loop_flags, |
7f52206a | 689 | int lock_op, |
79e8393a LP |
690 | LoopDevice **ret) { |
691 | ||
e8c7c4d9 | 692 | int r, basic_flags, direct_flags, rdwr_flags; |
254d1313 | 693 | _cleanup_close_ int fd = -EBADF; |
aa4d3aa3 | 694 | bool direct = false; |
8c1be37e | 695 | |
972c8db5 | 696 | assert(dir_fd >= 0 || dir_fd == AT_FDCWD); |
8c1be37e LP |
697 | assert(path); |
698 | assert(ret); | |
b0a94268 | 699 | assert(open_flags < 0 || IN_SET(open_flags, O_RDWR, O_RDONLY)); |
8c1be37e | 700 | |
b0a94268 LP |
701 | /* Passing < 0 as open_flags here means we'll try to open the device writable if we can, retrying |
702 | * read-only if we cannot. */ | |
703 | ||
e8c7c4d9 LP |
704 | loop_flags = loop_flags_mangle(loop_flags); |
705 | ||
706 | /* Let's open with O_DIRECT if we can. But not all file systems support that, hence fall back to | |
707 | * non-O_DIRECT mode automatically, if it fails. */ | |
708 | ||
709 | basic_flags = O_CLOEXEC|O_NONBLOCK|O_NOCTTY; | |
710 | direct_flags = FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) ? O_DIRECT : 0; | |
711 | rdwr_flags = open_flags >= 0 ? open_flags : O_RDWR; | |
712 | ||
420d2e31 | 713 | fd = xopenat(dir_fd, path, basic_flags|direct_flags|rdwr_flags, /* xopen_flags = */ 0, /* mode = */ 0); |
e8c7c4d9 | 714 | if (fd < 0 && direct_flags != 0) /* If we had O_DIRECT on, and things failed with that, let's immediately try again without */ |
420d2e31 | 715 | fd = xopenat(dir_fd, path, basic_flags|rdwr_flags, /* xopen_flags = */ 0, /* mode = */ 0); |
aa4d3aa3 LP |
716 | else |
717 | direct = direct_flags != 0; | |
b0a94268 LP |
718 | if (fd < 0) { |
719 | r = -errno; | |
720 | ||
721 | /* Retry read-only? */ | |
722 | if (open_flags >= 0 || !(ERRNO_IS_PRIVILEGE(r) || r == -EROFS)) | |
723 | return r; | |
724 | ||
420d2e31 | 725 | fd = xopenat(dir_fd, path, basic_flags|direct_flags|O_RDONLY, /* xopen_flags = */ 0, /* mode = */ 0); |
e8c7c4d9 | 726 | if (fd < 0 && direct_flags != 0) /* as above */ |
420d2e31 | 727 | fd = xopenat(dir_fd, path, basic_flags|O_RDONLY, /* xopen_flags = */ 0, /* mode = */ 0); |
aa4d3aa3 LP |
728 | else |
729 | direct = direct_flags != 0; | |
b0a94268 LP |
730 | if (fd < 0) |
731 | return r; /* Propagate original error */ | |
732 | ||
733 | open_flags = O_RDONLY; | |
734 | } else if (open_flags < 0) | |
735 | open_flags = O_RDWR; | |
8c1be37e | 736 | |
aa4d3aa3 LP |
737 | log_debug("Opened '%s' in %s access mode%s, with O_DIRECT %s%s.", |
738 | path, | |
739 | open_flags == O_RDWR ? "O_RDWR" : "O_RDONLY", | |
740 | open_flags != rdwr_flags ? " (O_RDWR was requested but not allowed)" : "", | |
741 | direct ? "enabled" : "disabled", | |
742 | direct != (direct_flags != 0) ? " (O_DIRECT was requested but not supported)" : ""); | |
743 | ||
972c8db5 DDM |
744 | return loop_device_make_internal( |
745 | dir_fd == AT_FDCWD ? path : NULL, | |
746 | fd, | |
747 | open_flags, | |
748 | /* offset = */ 0, | |
749 | /* size = */ 0, | |
750 | sector_size, | |
751 | loop_flags, | |
752 | lock_op, | |
753 | ret); | |
8c1be37e LP |
754 | } |
755 | ||
fcd8a19d LP |
756 | int loop_device_make_by_path_memory( |
757 | const char *path, | |
758 | int open_flags, | |
22ee78a8 | 759 | uint32_t sector_size, |
fcd8a19d LP |
760 | uint32_t loop_flags, |
761 | int lock_op, | |
762 | LoopDevice **ret) { | |
763 | ||
764 | _cleanup_close_ int fd = -EBADF, mfd = -EBADF; | |
765 | _cleanup_free_ char *fn = NULL; | |
766 | struct stat st; | |
767 | int r; | |
768 | ||
769 | assert(path); | |
770 | assert(IN_SET(open_flags, O_RDWR, O_RDONLY)); | |
771 | assert(ret); | |
772 | ||
773 | loop_flags &= ~LO_FLAGS_DIRECT_IO; /* memfds don't support O_DIRECT, hence LO_FLAGS_DIRECT_IO can't be used either */ | |
774 | ||
775 | fd = open(path, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|O_RDONLY); | |
776 | if (fd < 0) | |
777 | return -errno; | |
778 | ||
779 | if (fstat(fd, &st) < 0) | |
780 | return -errno; | |
781 | ||
782 | if (!S_ISREG(st.st_mode) && !S_ISBLK(st.st_mode)) | |
783 | return -EBADF; | |
784 | ||
785 | r = path_extract_filename(path, &fn); | |
786 | if (r < 0) | |
787 | return r; | |
788 | ||
789 | mfd = memfd_clone_fd(fd, fn, open_flags|O_CLOEXEC); | |
790 | if (mfd < 0) | |
791 | return mfd; | |
792 | ||
793 | fd = safe_close(fd); /* Let's close the original early */ | |
794 | ||
22ee78a8 | 795 | return loop_device_make_internal(NULL, mfd, open_flags, 0, 0, sector_size, loop_flags, lock_op, ret); |
fcd8a19d LP |
796 | } |
797 | ||
36d5eb0b | 798 | static LoopDevice* loop_device_free(LoopDevice *d) { |
5bb1d7fb | 799 | _cleanup_close_ int control = -EBADF; |
3a6ed1e1 LP |
800 | int r; |
801 | ||
8c1be37e LP |
802 | if (!d) |
803 | return NULL; | |
804 | ||
4c1d50e6 LP |
805 | /* Release any lock we might have on the device first. We want to open+lock the /dev/loop-control |
806 | * device below, but our lock protocol says that if both control and block device locks are taken, | |
807 | * the control lock needs to be taken first, the block device lock second — in order to avoid ABBA | |
808 | * locking issues. Moreover, we want to issue LOOP_CLR_FD on the block device further down, and that | |
809 | * would fail if we had another fd open to the device. */ | |
7f52206a LP |
810 | d->lock_fd = safe_close(d->lock_fd); |
811 | ||
4c1d50e6 LP |
812 | /* Let's open the control device early, and lock it, so that we can release our block device and |
813 | * delete it in a synchronized fashion, and allocators won't needlessly see the block device as free | |
814 | * while we are about to delete it. */ | |
7cb349f0 | 815 | if (!LOOP_DEVICE_IS_FOREIGN(d) && !d->relinquished) { |
4c1d50e6 LP |
816 | control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK); |
817 | if (control < 0) | |
818 | log_debug_errno(errno, "Failed to open loop control device, cannot remove loop device '%s', ignoring: %m", strna(d->node)); | |
819 | else if (flock(control, LOCK_EX) < 0) | |
820 | log_debug_errno(errno, "Failed to lock loop control device, ignoring: %m"); | |
821 | } | |
822 | ||
823 | /* Then let's release the loopback block device */ | |
8c1be37e | 824 | if (d->fd >= 0) { |
cae1e8fb LP |
825 | /* Implicitly sync the device, since otherwise in-flight blocks might not get written */ |
826 | if (fsync(d->fd) < 0) | |
827 | log_debug_errno(errno, "Failed to sync loop block device, ignoring: %m"); | |
828 | ||
7cb349f0 | 829 | if (!LOOP_DEVICE_IS_FOREIGN(d) && !d->relinquished) { |
3a6ed1e1 LP |
830 | /* We are supposed to clear the loopback device. Let's do this synchronously: lock |
831 | * the device, manually remove all partitions and then clear it. This should ensure | |
832 | * udev doesn't concurrently access the devices, and we can be reasonably sure that | |
833 | * once we are done here the device is cleared and all its partition children | |
834 | * removed. Note that we lock our primary device fd here (and not a separate locking | |
835 | * fd, as we do during allocation, since we want to keep the lock all the way through | |
836 | * the LOOP_CLR_FD, but that call would fail if we had more than one fd open.) */ | |
8c1be37e | 837 | |
3a6ed1e1 LP |
838 | if (flock(d->fd, LOCK_EX) < 0) |
839 | log_debug_errno(errno, "Failed to lock loop block device, ignoring: %m"); | |
840 | ||
46c3a288 | 841 | r = block_device_remove_all_partitions(d->dev, d->fd); |
3a6ed1e1 LP |
842 | if (r < 0) |
843 | log_debug_errno(r, "Failed to remove partitions of loopback block device, ignoring: %m"); | |
844 | ||
845 | if (ioctl(d->fd, LOOP_CLR_FD) < 0) | |
846 | log_debug_errno(errno, "Failed to clear loop device, ignoring: %m"); | |
8c1be37e LP |
847 | } |
848 | ||
849 | safe_close(d->fd); | |
850 | } | |
851 | ||
4c1d50e6 | 852 | /* Now that the block device is released, let's also try to remove it */ |
afbe20b7 | 853 | if (control >= 0) { |
6483bcef ZJS |
854 | useconds_t delay = 5 * USEC_PER_MSEC; /* A total delay of 5090 ms between 39 attempts, |
855 | * (4*5 + 5*10 + 5*20 + … + 3*640) = 5090. */ | |
afbe20b7 ZJS |
856 | |
857 | for (unsigned attempt = 1;; attempt++) { | |
4c1d50e6 LP |
858 | if (ioctl(control, LOOP_CTL_REMOVE, d->nr) >= 0) |
859 | break; | |
afbe20b7 | 860 | if (errno != EBUSY || attempt > 38) { |
4c1d50e6 LP |
861 | log_debug_errno(errno, "Failed to remove device %s: %m", strna(d->node)); |
862 | break; | |
f2d9213f | 863 | } |
afbe20b7 ZJS |
864 | if (attempt % 5 == 0) { |
865 | log_debug("Device is still busy after %u attempts…", attempt); | |
866 | delay *= 2; | |
867 | } | |
868 | ||
4251512e | 869 | (void) usleep_safe(delay); |
4c1d50e6 | 870 | } |
afbe20b7 | 871 | } |
8c1be37e LP |
872 | |
873 | free(d->node); | |
cc5bae6c | 874 | sd_device_unref(d->dev); |
e77cab82 | 875 | free(d->backing_file); |
5fecf46d | 876 | return mfree(d); |
8c1be37e | 877 | } |
a2ea3b2f | 878 | |
36d5eb0b YW |
/* Instantiates DEFINE_TRIVIAL_REF_UNREF_FUNC() for the LoopDevice type with the "loop_device" name
 * prefix and loop_device_free() as the function argument. NOTE(review): presumably this expands to
 * the loop_device_ref()/loop_device_unref() pair, with loop_device_free() invoked once the n_ref
 * counter drops to zero — confirm against the macro definition. */
879 | DEFINE_TRIVIAL_REF_UNREF_FUNC(LoopDevice, loop_device, loop_device_free); |
880 | ||
a2ea3b2f LP |
881 | void loop_device_relinquish(LoopDevice *d) { |
882 | assert(d); | |
883 | ||
884 | /* Don't attempt to clean up the loop device anymore from this point on. Leave the clean-ing up to the kernel | |
885 | * itself, using the loop device "auto-clear" logic we already turned on when creating the device. */ | |
886 | ||
887 | d->relinquished = true; | |
888 | } | |
9dabc4fd | 889 | |
24d59aee DDM |
890 | void loop_device_unrelinquish(LoopDevice *d) { |
891 | assert(d); | |
892 | d->relinquished = false; | |
893 | } | |
894 | ||
4f0ad43e YW |
895 | int loop_device_open( |
896 | sd_device *dev, | |
7f52206a LP |
897 | int open_flags, |
898 | int lock_op, | |
899 | LoopDevice **ret) { | |
900 | ||
254d1313 | 901 | _cleanup_close_ int fd = -EBADF, lock_fd = -EBADF; |
4f0ad43e | 902 | _cleanup_free_ char *node = NULL, *backing_file = NULL; |
4d2a9e3e | 903 | dev_t devnum, backing_devno = 0; |
b26c39ad | 904 | struct loop_info64 info; |
4d2a9e3e | 905 | ino_t backing_inode = 0; |
ffcb3324 | 906 | uint64_t diskseq = 0; |
9dabc4fd | 907 | LoopDevice *d; |
4f0ad43e | 908 | const char *s; |
a8d8a619 | 909 | int r, nr = -1; |
9dabc4fd | 910 | |
4f0ad43e | 911 | assert(dev); |
e8c7c4d9 | 912 | assert(IN_SET(open_flags, O_RDWR, O_RDONLY)); |
9dabc4fd LP |
913 | assert(ret); |
914 | ||
4f0ad43e YW |
915 | /* Even if fd is provided through the argument in loop_device_open_from_fd(), we reopen the inode |
916 | * here, instead of keeping just a dup() clone of it around, since we want to ensure that the | |
917 | * O_DIRECT flag of the handle we keep is off, we have our own file index, and have the right | |
918 | * read/write mode in effect. */ | |
919 | fd = sd_device_open(dev, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|open_flags); | |
920 | if (fd < 0) | |
921 | return fd; | |
cc5bae6c | 922 | |
4f0ad43e YW |
923 | if ((lock_op & ~LOCK_NB) != LOCK_UN) { |
924 | lock_fd = open_lock_fd(fd, lock_op); | |
925 | if (lock_fd < 0) | |
926 | return lock_fd; | |
a8d8a619 YW |
927 | } |
928 | ||
4f0ad43e | 929 | if (ioctl(fd, LOOP_GET_STATUS64, &info) >= 0) { |
10c1b188 LP |
930 | #if HAVE_VALGRIND_MEMCHECK_H |
931 | /* Valgrind currently doesn't know LOOP_GET_STATUS64. Remove this once it does */ | |
932 | VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info)); | |
933 | #endif | |
b26c39ad | 934 | nr = info.lo_number; |
e77cab82 YW |
935 | |
936 | if (sd_device_get_sysattr_value(dev, "loop/backing_file", &s) >= 0) { | |
937 | backing_file = strdup(s); | |
938 | if (!backing_file) | |
939 | return -ENOMEM; | |
940 | } | |
4d2a9e3e LP |
941 | |
942 | backing_devno = info.lo_device; | |
943 | backing_inode = info.lo_inode; | |
a8d8a619 | 944 | } |
b26c39ad | 945 | |
4f0ad43e | 946 | r = fd_get_diskseq(fd, &diskseq); |
ffcb3324 YW |
947 | if (r < 0 && r != -EOPNOTSUPP) |
948 | return r; | |
949 | ||
22ee78a8 LP |
950 | uint32_t sector_size; |
951 | r = blockdev_get_sector_size(fd, §or_size); | |
952 | if (r < 0) | |
953 | return r; | |
954 | ||
c961a8c6 LP |
955 | uint64_t device_size; |
956 | r = blockdev_get_device_size(fd, &device_size); | |
957 | if (r < 0) | |
958 | return r; | |
959 | ||
4f0ad43e YW |
960 | r = sd_device_get_devnum(dev, &devnum); |
961 | if (r < 0) | |
962 | return r; | |
7f52206a | 963 | |
4f0ad43e | 964 | r = sd_device_get_devname(dev, &s); |
cc5bae6c YW |
965 | if (r < 0) |
966 | return r; | |
967 | ||
4f0ad43e YW |
968 | node = strdup(s); |
969 | if (!node) | |
cc5bae6c | 970 | return -ENOMEM; |
9dabc4fd LP |
971 | |
972 | d = new(LoopDevice, 1); | |
973 | if (!d) | |
974 | return -ENOMEM; | |
975 | ||
976 | *d = (LoopDevice) { | |
36d5eb0b | 977 | .n_ref = 1, |
a8d8a619 | 978 | .fd = TAKE_FD(fd), |
7f52206a | 979 | .lock_fd = TAKE_FD(lock_fd), |
b26c39ad | 980 | .nr = nr, |
4f0ad43e YW |
981 | .node = TAKE_PTR(node), |
982 | .dev = sd_device_ref(dev), | |
e77cab82 | 983 | .backing_file = TAKE_PTR(backing_file), |
4d2a9e3e LP |
984 | .backing_inode = backing_inode, |
985 | .backing_devno = backing_devno, | |
9dabc4fd | 986 | .relinquished = true, /* It's not ours, don't try to destroy it when this object is freed */ |
4f0ad43e | 987 | .devno = devnum, |
ffcb3324 | 988 | .diskseq = diskseq, |
31c75fcc | 989 | .uevent_seqnum_not_before = UINT64_MAX, |
8ede1e86 | 990 | .timestamp_not_before = USEC_INFINITY, |
22ee78a8 | 991 | .sector_size = sector_size, |
c961a8c6 | 992 | .device_size = device_size, |
6bc20134 | 993 | .created = false, |
9dabc4fd LP |
994 | }; |
995 | ||
996 | *ret = d; | |
4f0ad43e YW |
997 | return 0; |
998 | } | |
999 | ||
1000 | int loop_device_open_from_fd( | |
1001 | int fd, | |
1002 | int open_flags, | |
1003 | int lock_op, | |
1004 | LoopDevice **ret) { | |
1005 | ||
1006 | _cleanup_(sd_device_unrefp) sd_device *dev = NULL; | |
1007 | int r; | |
1008 | ||
8e398254 | 1009 | r = block_device_new_from_fd(ASSERT_FD(fd), 0, &dev); |
4f0ad43e YW |
1010 | if (r < 0) |
1011 | return r; | |
1012 | ||
1013 | return loop_device_open(dev, open_flags, lock_op, ret); | |
1014 | } | |
1015 | ||
1016 | int loop_device_open_from_path( | |
1017 | const char *path, | |
1018 | int open_flags, | |
1019 | int lock_op, | |
1020 | LoopDevice **ret) { | |
1021 | ||
1022 | _cleanup_(sd_device_unrefp) sd_device *dev = NULL; | |
1023 | int r; | |
1024 | ||
1025 | assert(path); | |
1026 | ||
1027 | r = block_device_new_from_path(path, 0, &dev); | |
1028 | if (r < 0) | |
1029 | return r; | |
1030 | ||
1031 | return loop_device_open(dev, open_flags, lock_op, ret); | |
9dabc4fd LP |
1032 | } |
1033 | ||
f1443709 LP |
1034 | static int resize_partition(int partition_fd, uint64_t offset, uint64_t size) { |
1035 | char sysfs[STRLEN("/sys/dev/block/:/partition") + 2*DECIMAL_STR_MAX(dev_t) + 1]; | |
ca822829 | 1036 | _cleanup_free_ char *buffer = NULL; |
f1443709 | 1037 | uint64_t current_offset, current_size, partno; |
254d1313 | 1038 | _cleanup_close_ int whole_fd = -EBADF; |
f1443709 LP |
1039 | struct stat st; |
1040 | dev_t devno; | |
1041 | int r; | |
1042 | ||
f1443709 LP |
1043 | /* Resizes the partition the loopback device refer to (assuming it refers to one instead of an actual |
1044 | * loopback device), and changes the offset, if needed. This is a fancy wrapper around | |
1045 | * BLKPG_RESIZE_PARTITION. */ | |
1046 | ||
8e398254 | 1047 | if (fstat(ASSERT_FD(partition_fd), &st) < 0) |
f1443709 LP |
1048 | return -errno; |
1049 | ||
1050 | assert(S_ISBLK(st.st_mode)); | |
1051 | ||
ed13feff | 1052 | xsprintf(sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/partition", DEVNUM_FORMAT_VAL(st.st_rdev)); |
f1443709 LP |
1053 | r = read_one_line_file(sysfs, &buffer); |
1054 | if (r == -ENOENT) /* not a partition, cannot resize */ | |
1055 | return -ENOTTY; | |
1056 | if (r < 0) | |
1057 | return r; | |
1058 | r = safe_atou64(buffer, &partno); | |
1059 | if (r < 0) | |
1060 | return r; | |
1061 | ||
ed13feff | 1062 | xsprintf(sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/start", DEVNUM_FORMAT_VAL(st.st_rdev)); |
f1443709 LP |
1063 | |
1064 | buffer = mfree(buffer); | |
1065 | r = read_one_line_file(sysfs, &buffer); | |
1066 | if (r < 0) | |
1067 | return r; | |
1068 | r = safe_atou64(buffer, ¤t_offset); | |
1069 | if (r < 0) | |
1070 | return r; | |
1071 | if (current_offset > UINT64_MAX/512U) | |
1072 | return -EINVAL; | |
1073 | current_offset *= 512U; | |
1074 | ||
c961a8c6 LP |
1075 | r = blockdev_get_device_size(partition_fd, ¤t_size); |
1076 | if (r < 0) | |
1077 | return r; | |
f1443709 LP |
1078 | |
1079 | if (size == UINT64_MAX && offset == UINT64_MAX) | |
1080 | return 0; | |
1081 | if (current_size == size && current_offset == offset) | |
1082 | return 0; | |
1083 | ||
ed13feff | 1084 | xsprintf(sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/../dev", DEVNUM_FORMAT_VAL(st.st_rdev)); |
f1443709 LP |
1085 | |
1086 | buffer = mfree(buffer); | |
1087 | r = read_one_line_file(sysfs, &buffer); | |
1088 | if (r < 0) | |
1089 | return r; | |
7176f06c | 1090 | r = parse_devnum(buffer, &devno); |
f1443709 LP |
1091 | if (r < 0) |
1092 | return r; | |
1093 | ||
ca822829 | 1094 | whole_fd = r = device_open_from_devnum(S_IFBLK, devno, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, NULL); |
f1443709 LP |
1095 | if (r < 0) |
1096 | return r; | |
1097 | ||
91e1ce1a LP |
1098 | return block_device_resize_partition( |
1099 | whole_fd, | |
1100 | partno, | |
1101 | offset == UINT64_MAX ? current_offset : offset, | |
1102 | size == UINT64_MAX ? current_size : size); | |
f1443709 LP |
1103 | } |
1104 | ||
c37878fc LP |
1105 | int loop_device_refresh_size(LoopDevice *d, uint64_t offset, uint64_t size) { |
1106 | struct loop_info64 info; | |
ff27ef4b | 1107 | |
9dabc4fd | 1108 | assert(d); |
ff27ef4b | 1109 | assert(d->fd >= 0); |
9dabc4fd | 1110 | |
f1443709 LP |
1111 | /* Changes the offset/start of the loop device relative to the beginning of the underlying file or |
1112 | * block device. If this loop device actually refers to a partition and not a loopback device, we'll | |
1113 | * try to adjust the partition offsets instead. | |
1114 | * | |
1115 | * If either offset or size is UINT64_MAX we won't change that parameter. */ | |
1116 | ||
f1443709 LP |
1117 | if (d->nr < 0) /* not a loopback device */ |
1118 | return resize_partition(d->fd, offset, size); | |
1119 | ||
c37878fc LP |
1120 | if (ioctl(d->fd, LOOP_GET_STATUS64, &info) < 0) |
1121 | return -errno; | |
1122 | ||
10c1b188 LP |
1123 | #if HAVE_VALGRIND_MEMCHECK_H |
1124 | /* Valgrind currently doesn't know LOOP_GET_STATUS64. Remove this once it does */ | |
1125 | VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info)); | |
1126 | #endif | |
1127 | ||
c37878fc LP |
1128 | if (size == UINT64_MAX && offset == UINT64_MAX) |
1129 | return 0; | |
1130 | if (info.lo_sizelimit == size && info.lo_offset == offset) | |
1131 | return 0; | |
1132 | ||
1133 | if (size != UINT64_MAX) | |
1134 | info.lo_sizelimit = size; | |
1135 | if (offset != UINT64_MAX) | |
1136 | info.lo_offset = offset; | |
1137 | ||
7c248223 | 1138 | return RET_NERRNO(ioctl(d->fd, LOOP_SET_STATUS64, &info)); |
9dabc4fd | 1139 | } |
441ec804 LP |
1140 | |
1141 | int loop_device_flock(LoopDevice *d, int operation) { | |
7f52206a | 1142 | assert(IN_SET(operation & ~LOCK_NB, LOCK_UN, LOCK_SH, LOCK_EX)); |
441ec804 LP |
1143 | assert(d); |
1144 | ||
7f52206a LP |
1145 | /* When unlocking just close the lock fd */ |
1146 | if ((operation & ~LOCK_NB) == LOCK_UN) { | |
1147 | d->lock_fd = safe_close(d->lock_fd); | |
1148 | return 0; | |
1149 | } | |
1150 | ||
1151 | /* If we had no lock fd so far, create one and lock it right-away */ | |
1152 | if (d->lock_fd < 0) { | |
8e398254 | 1153 | d->lock_fd = open_lock_fd(ASSERT_FD(d->fd), operation); |
7f52206a LP |
1154 | if (d->lock_fd < 0) |
1155 | return d->lock_fd; | |
1156 | ||
1157 | return 0; | |
1158 | } | |
441ec804 | 1159 | |
7f52206a LP |
1160 | /* Otherwise change the current lock mode on the existing fd */ |
1161 | return RET_NERRNO(flock(d->lock_fd, operation)); | |
441ec804 | 1162 | } |
8dbc208c LP |
1163 | |
1164 | int loop_device_sync(LoopDevice *d) { | |
1165 | assert(d); | |
1166 | ||
1167 | /* We also do this implicitly in loop_device_unref(). Doing this explicitly here has the benefit that | |
1168 | * we can check the return value though. */ | |
1169 | ||
8e398254 | 1170 | return RET_NERRNO(fsync(ASSERT_FD(d->fd))); |
8dbc208c | 1171 | } |
d2430d50 LP |
1172 | |
1173 | int loop_device_set_autoclear(LoopDevice *d, bool autoclear) { | |
1174 | struct loop_info64 info; | |
1175 | ||
1176 | assert(d); | |
1177 | ||
8e398254 | 1178 | if (ioctl(ASSERT_FD(d->fd), LOOP_GET_STATUS64, &info) < 0) |
d2430d50 LP |
1179 | return -errno; |
1180 | ||
1181 | if (autoclear == FLAGS_SET(info.lo_flags, LO_FLAGS_AUTOCLEAR)) | |
1182 | return 0; | |
1183 | ||
1184 | SET_FLAG(info.lo_flags, LO_FLAGS_AUTOCLEAR, autoclear); | |
1185 | ||
1186 | if (ioctl(d->fd, LOOP_SET_STATUS64, &info) < 0) | |
1187 | return -errno; | |
1188 | ||
1189 | return 1; | |
1190 | } | |
999ac3e2 LP |
1191 | |
1192 | int loop_device_set_filename(LoopDevice *d, const char *name) { | |
1193 | struct loop_info64 info; | |
1194 | ||
1195 | assert(d); | |
1196 | ||
1197 | /* Sets the .lo_file_name of the loopback device. This is supposed to contain the path to the file | |
1198 | * backing the block device, but is actually just a free-form string you can pass to the kernel. Most | |
1199 | * tools that actually care for the backing file path use the sysfs attribute file loop/backing_file | |
7a05926f YW |
1200 | * which is a kernel generated string, subject to file system namespaces and such. |
1201 | * | |
1202 | * .lo_file_name is useful since userspace can select it freely when creating a loopback block | |
1203 | * device, and we can use it for /dev/disk/by-loop-ref/ symlinks, and similar, so that apps can | |
1204 | * recognize their own loopback files. */ | |
999ac3e2 LP |
1205 | |
1206 | if (name && strlen(name) >= sizeof(info.lo_file_name)) | |
1207 | return -ENOBUFS; | |
1208 | ||
8e398254 | 1209 | if (ioctl(ASSERT_FD(d->fd), LOOP_GET_STATUS64, &info) < 0) |
999ac3e2 LP |
1210 | return -errno; |
1211 | ||
1212 | if (strneq((char*) info.lo_file_name, strempty(name), sizeof(info.lo_file_name))) | |
1213 | return 0; | |
1214 | ||
1215 | if (name) { | |
1216 | strncpy((char*) info.lo_file_name, name, sizeof(info.lo_file_name)-1); | |
1217 | info.lo_file_name[sizeof(info.lo_file_name)-1] = 0; | |
1218 | } else | |
1219 | memzero(info.lo_file_name, sizeof(info.lo_file_name)); | |
1220 | ||
1221 | if (ioctl(d->fd, LOOP_SET_STATUS64, &info) < 0) | |
1222 | return -errno; | |
1223 | ||
1224 | return 1; | |
1225 | } |