]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/discover-image.c
Merge pull request #30284 from YHNdnzj/fstab-wantedby-defaultdeps
[thirdparty/systemd.git] / src / shared / discover-image.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/fs.h>
6 #include <linux/loop.h>
7 #include <linux/magic.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <sys/file.h>
11 #include <sys/ioctl.h>
12 #include <sys/stat.h>
13 #include <unistd.h>
14
15 #include "alloc-util.h"
16 #include "blockdev-util.h"
17 #include "btrfs-util.h"
18 #include "chase.h"
19 #include "chattr-util.h"
20 #include "copy.h"
21 #include "dirent-util.h"
22 #include "discover-image.h"
23 #include "dissect-image.h"
24 #include "env-file.h"
25 #include "env-util.h"
26 #include "extension-util.h"
27 #include "fd-util.h"
28 #include "fs-util.h"
29 #include "hashmap.h"
30 #include "hostname-setup.h"
31 #include "id128-util.h"
32 #include "initrd-util.h"
33 #include "lock-util.h"
34 #include "log.h"
35 #include "loop-util.h"
36 #include "macro.h"
37 #include "mkdir.h"
38 #include "nulstr-util.h"
39 #include "os-util.h"
40 #include "path-util.h"
41 #include "rm-rf.h"
42 #include "stat-util.h"
43 #include "string-table.h"
44 #include "string-util.h"
45 #include "strv.h"
46 #include "time-util.h"
47 #include "utf8.h"
48 #include "xattr-util.h"
49
50 static const char* const image_search_path[_IMAGE_CLASS_MAX] = {
51 [IMAGE_MACHINE] = "/etc/machines\0" /* only place symlinks here */
52 "/run/machines\0" /* and here too */
53 "/var/lib/machines\0" /* the main place for images */
54 "/var/lib/container\0" /* legacy */
55 "/usr/local/lib/machines\0"
56 "/usr/lib/machines\0",
57
58 [IMAGE_PORTABLE] = "/etc/portables\0" /* only place symlinks here */
59 "/run/portables\0" /* and here too */
60 "/var/lib/portables\0" /* the main place for images */
61 "/usr/local/lib/portables\0"
62 "/usr/lib/portables\0",
63
64 /* Note that we don't allow storing extensions under /usr/, unlike with other image types. That's
65 * because extension images are supposed to extend /usr/, so you get into recursive races, especially
66 * with directory-based extensions, as the kernel's OverlayFS explicitly checks for this and errors
67 * out with -ELOOP if it finds that a lowerdir= is a child of another lowerdir=. */
68 [IMAGE_SYSEXT] = "/etc/extensions\0" /* only place symlinks here */
69 "/run/extensions\0" /* and here too */
70 "/var/lib/extensions\0", /* the main place for images */
71
72 [IMAGE_CONFEXT] = "/run/confexts\0" /* only place symlinks here */
73 "/var/lib/confexts\0" /* the main place for images */
74 "/usr/local/lib/confexts\0"
75 "/usr/lib/confexts\0",
76 };
77
78 /* Inside the initrd, use a slightly different set of search path (i.e. include .extra/sysext in extension
79 * search dir) */
80 static const char* const image_search_path_initrd[_IMAGE_CLASS_MAX] = {
81 /* (entries that aren't listed here will get the same search path as for the non initrd-case) */
82
83 [IMAGE_SYSEXT] = "/etc/extensions\0" /* only place symlinks here */
84 "/run/extensions\0" /* and here too */
85 "/var/lib/extensions\0" /* the main place for images */
86 "/.extra/sysext\0" /* put sysext picked up by systemd-stub last, since not trusted */
87 };
88
89 static const char* image_class_suffix_table[_IMAGE_CLASS_MAX] = {
90 [IMAGE_SYSEXT] = ".sysext",
91 [IMAGE_CONFEXT] = ".confext",
92 };
93
94 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(image_class_suffix, ImageClass);
95
96 static Image *image_free(Image *i) {
97 assert(i);
98
99 free(i->name);
100 free(i->path);
101
102 free(i->hostname);
103 strv_free(i->machine_info);
104 strv_free(i->os_release);
105 strv_free(i->sysext_release);
106 strv_free(i->confext_release);
107
108 return mfree(i);
109 }
110
111 DEFINE_TRIVIAL_REF_UNREF_FUNC(Image, image, image_free);
112 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(image_hash_ops, char, string_hash_func, string_compare_func,
113 Image, image_unref);
114
115 static char **image_settings_path(Image *image) {
116 _cleanup_strv_free_ char **l = NULL;
117 _cleanup_free_ char *fn = NULL;
118 size_t i = 0;
119 int r;
120
121 assert(image);
122
123 l = new0(char*, 4);
124 if (!l)
125 return NULL;
126
127 fn = strjoin(image->name, ".nspawn");
128 if (!fn)
129 return NULL;
130
131 FOREACH_STRING(s, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
132 l[i] = path_join(s, fn);
133 if (!l[i])
134 return NULL;
135
136 i++;
137 }
138
139 r = file_in_same_dir(image->path, fn, l + i);
140 if (r == -ENOMEM)
141 return NULL;
142 if (r < 0)
143 log_debug_errno(r, "Failed to generate .nspawn settings path from image path, ignoring: %m");
144
145 strv_uniq(l);
146
147 return TAKE_PTR(l);
148 }
149
150 static int image_roothash_path(Image *image, char **ret) {
151 _cleanup_free_ char *fn = NULL;
152
153 assert(image);
154
155 fn = strjoin(image->name, ".roothash");
156 if (!fn)
157 return -ENOMEM;
158
159 return file_in_same_dir(image->path, fn, ret);
160 }
161
162 static int image_new(
163 ImageType t,
164 ImageClass c,
165 const char *pretty,
166 const char *path,
167 const char *filename,
168 bool read_only,
169 usec_t crtime,
170 usec_t mtime,
171 Image **ret) {
172
173 _cleanup_(image_unrefp) Image *i = NULL;
174
175 assert(t >= 0);
176 assert(t < _IMAGE_TYPE_MAX);
177 assert(pretty);
178 assert(filename);
179 assert(ret);
180
181 i = new(Image, 1);
182 if (!i)
183 return -ENOMEM;
184
185 *i = (Image) {
186 .n_ref = 1,
187 .type = t,
188 .class = c,
189 .read_only = read_only,
190 .crtime = crtime,
191 .mtime = mtime,
192 .usage = UINT64_MAX,
193 .usage_exclusive = UINT64_MAX,
194 .limit = UINT64_MAX,
195 .limit_exclusive = UINT64_MAX,
196 };
197
198 i->name = strdup(pretty);
199 if (!i->name)
200 return -ENOMEM;
201
202 i->path = path_join(path, filename);
203 if (!i->path)
204 return -ENOMEM;
205
206 path_simplify(i->path);
207
208 *ret = TAKE_PTR(i);
209
210 return 0;
211 }
212
213 static int extract_pretty(
214 const char *path,
215 const char *class_suffix,
216 const char *format_suffix,
217 char **ret) {
218
219 _cleanup_free_ char *name = NULL;
220 int r;
221
222 assert(path);
223 assert(ret);
224
225 r = path_extract_filename(path, &name);
226 if (r < 0)
227 return r;
228
229 if (format_suffix) {
230 char *e = endswith(name, format_suffix);
231 if (!e) /* Format suffix is required */
232 return -EINVAL;
233
234 *e = 0;
235 }
236
237 if (class_suffix) {
238 char *e = endswith(name, class_suffix);
239 if (e) /* Class suffix is optional */
240 *e = 0;
241 }
242
243 if (!image_name_is_valid(name))
244 return -EINVAL;
245
246 *ret = TAKE_PTR(name);
247 return 0;
248 }
249
250 static int image_make(
251 ImageClass c,
252 const char *pretty,
253 int dfd,
254 const char *path,
255 const char *filename,
256 const struct stat *st,
257 Image **ret) {
258
259 _cleanup_free_ char *pretty_buffer = NULL, *parent = NULL;
260 struct stat stbuf;
261 bool read_only;
262 int r;
263
264 assert(dfd >= 0 || dfd == AT_FDCWD);
265 assert(path || dfd == AT_FDCWD);
266 assert(filename);
267
268 /* We explicitly *do* follow symlinks here, since we want to allow symlinking trees, raw files and block
269 * devices into /var/lib/machines/, and treat them normally.
270 *
271 * This function returns -ENOENT if we can't find the image after all, and -EMEDIUMTYPE if it's not a file we
272 * recognize. */
273
274 if (!st) {
275 if (fstatat(dfd, filename, &stbuf, 0) < 0)
276 return -errno;
277
278 st = &stbuf;
279 }
280
281 if (!path) {
282 if (dfd == AT_FDCWD)
283 (void) safe_getcwd(&parent);
284 else
285 (void) fd_get_path(dfd, &parent);
286 }
287
288 read_only =
289 (path && path_startswith(path, "/usr")) ||
290 (faccessat(dfd, filename, W_OK, AT_EACCESS) < 0 && errno == EROFS);
291
292 if (S_ISDIR(st->st_mode)) {
293 _cleanup_close_ int fd = -EBADF;
294 unsigned file_attr = 0;
295 usec_t crtime = 0;
296
297 if (!ret)
298 return 0;
299
300 if (!pretty) {
301 r = extract_pretty(filename, image_class_suffix_to_string(c), NULL, &pretty_buffer);
302 if (r < 0)
303 return r;
304
305 pretty = pretty_buffer;
306 }
307
308 fd = openat(dfd, filename, O_CLOEXEC|O_NOCTTY|O_DIRECTORY);
309 if (fd < 0)
310 return -errno;
311
312 if (btrfs_might_be_subvol(st)) {
313
314 r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC);
315 if (r < 0)
316 return r;
317 if (r > 0) {
318 BtrfsSubvolInfo info;
319
320 /* It's a btrfs subvolume */
321
322 r = btrfs_subvol_get_info_fd(fd, 0, &info);
323 if (r < 0)
324 return r;
325
326 r = image_new(IMAGE_SUBVOLUME,
327 c,
328 pretty,
329 path,
330 filename,
331 info.read_only || read_only,
332 info.otime,
333 0,
334 ret);
335 if (r < 0)
336 return r;
337
338 if (btrfs_quota_scan_ongoing(fd) == 0) {
339 BtrfsQuotaInfo quota;
340
341 r = btrfs_subvol_get_subtree_quota_fd(fd, 0, &quota);
342 if (r >= 0) {
343 (*ret)->usage = quota.referenced;
344 (*ret)->usage_exclusive = quota.exclusive;
345
346 (*ret)->limit = quota.referenced_max;
347 (*ret)->limit_exclusive = quota.exclusive_max;
348 }
349 }
350
351 return 0;
352 }
353 }
354
355 /* Get directory creation time (not available everywhere, but that's OK */
356 (void) fd_getcrtime(fd, &crtime);
357
358 /* If the IMMUTABLE bit is set, we consider the directory read-only. Since the ioctl is not
359 * supported everywhere we ignore failures. */
360 (void) read_attr_fd(fd, &file_attr);
361
362 /* It's just a normal directory. */
363 r = image_new(IMAGE_DIRECTORY,
364 c,
365 pretty,
366 path,
367 filename,
368 read_only || (file_attr & FS_IMMUTABLE_FL),
369 crtime,
370 0, /* we don't use mtime of stat() here, since it's not the time of last change of the tree, but only of the top-level dir */
371 ret);
372 if (r < 0)
373 return r;
374
375 return 0;
376
377 } else if (S_ISREG(st->st_mode) && endswith(filename, ".raw")) {
378 usec_t crtime = 0;
379
380 /* It's a RAW disk image */
381
382 if (!ret)
383 return 0;
384
385 (void) fd_getcrtime_at(dfd, filename, AT_SYMLINK_FOLLOW, &crtime);
386
387 if (!pretty) {
388 r = extract_pretty(filename, image_class_suffix_to_string(c), ".raw", &pretty_buffer);
389 if (r < 0)
390 return r;
391
392 pretty = pretty_buffer;
393 }
394
395 r = image_new(IMAGE_RAW,
396 c,
397 pretty,
398 path,
399 filename,
400 !(st->st_mode & 0222) || read_only,
401 crtime,
402 timespec_load(&st->st_mtim),
403 ret);
404 if (r < 0)
405 return r;
406
407 (*ret)->usage = (*ret)->usage_exclusive = st->st_blocks * 512;
408 (*ret)->limit = (*ret)->limit_exclusive = st->st_size;
409
410 return 0;
411
412 } else if (S_ISBLK(st->st_mode)) {
413 _cleanup_close_ int block_fd = -EBADF;
414 uint64_t size = UINT64_MAX;
415
416 /* A block device */
417
418 if (!ret)
419 return 0;
420
421 if (!pretty) {
422 r = extract_pretty(filename, NULL, NULL, &pretty_buffer);
423 if (r < 0)
424 return r;
425
426 pretty = pretty_buffer;
427 }
428
429 block_fd = openat(dfd, filename, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY);
430 if (block_fd < 0)
431 log_debug_errno(errno, "Failed to open block device %s/%s, ignoring: %m", path ?: strnull(parent), filename);
432 else {
433 /* Refresh stat data after opening the node */
434 if (fstat(block_fd, &stbuf) < 0)
435 return -errno;
436 st = &stbuf;
437
438 if (!S_ISBLK(st->st_mode)) /* Verify that what we opened is actually what we think it is */
439 return -ENOTTY;
440
441 if (!read_only) {
442 int state = 0;
443
444 if (ioctl(block_fd, BLKROGET, &state) < 0)
445 log_debug_errno(errno, "Failed to issue BLKROGET on device %s/%s, ignoring: %m", path ?: strnull(parent), filename);
446 else if (state)
447 read_only = true;
448 }
449
450 r = blockdev_get_device_size(block_fd, &size);
451 if (r < 0)
452 log_debug_errno(r, "Failed to issue BLKGETSIZE64 on device %s/%s, ignoring: %m", path ?: strnull(parent), filename);
453
454 block_fd = safe_close(block_fd);
455 }
456
457 r = image_new(IMAGE_BLOCK,
458 c,
459 pretty,
460 path,
461 filename,
462 !(st->st_mode & 0222) || read_only,
463 0,
464 0,
465 ret);
466 if (r < 0)
467 return r;
468
469 if (!IN_SET(size, 0, UINT64_MAX))
470 (*ret)->usage = (*ret)->usage_exclusive = (*ret)->limit = (*ret)->limit_exclusive = size;
471
472 return 0;
473 }
474
475 return -EMEDIUMTYPE;
476 }
477
478 static const char *pick_image_search_path(ImageClass class) {
479 if (class < 0 || class >= _IMAGE_CLASS_MAX)
480 return NULL;
481
482 /* Use the initrd search path if there is one, otherwise use the common one */
483 return in_initrd() && image_search_path_initrd[class] ? image_search_path_initrd[class] : image_search_path[class];
484 }
485
486 int image_find(ImageClass class,
487 const char *name,
488 const char *root,
489 Image **ret) {
490
491 int r;
492
493 assert(class >= 0);
494 assert(class < _IMAGE_CLASS_MAX);
495 assert(name);
496
497 /* There are no images with invalid names */
498 if (!image_name_is_valid(name))
499 return -ENOENT;
500
501 NULSTR_FOREACH(path, pick_image_search_path(class)) {
502 _cleanup_free_ char *resolved = NULL;
503 _cleanup_closedir_ DIR *d = NULL;
504 struct stat st;
505 int flags;
506
507 r = chase_and_opendir(path, root, CHASE_PREFIX_ROOT, &resolved, &d);
508 if (r == -ENOENT)
509 continue;
510 if (r < 0)
511 return r;
512
513 /* As mentioned above, we follow symlinks on this fstatat(), because we want to permit people
514 * to symlink block devices into the search path. (For now, we disable that when operating
515 * relative to some root directory.) */
516 flags = root ? AT_SYMLINK_NOFOLLOW : 0;
517 if (fstatat(dirfd(d), name, &st, flags) < 0) {
518 _cleanup_free_ char *raw = NULL;
519
520 if (errno != ENOENT)
521 return -errno;
522
523 raw = strjoin(name, ".raw");
524 if (!raw)
525 return -ENOMEM;
526
527 if (fstatat(dirfd(d), raw, &st, flags) < 0) {
528 if (errno == ENOENT)
529 continue;
530
531 return -errno;
532 }
533
534 if (!S_ISREG(st.st_mode))
535 continue;
536
537 r = image_make(class, name, dirfd(d), resolved, raw, &st, ret);
538
539 } else {
540 if (!S_ISDIR(st.st_mode) && !S_ISBLK(st.st_mode))
541 continue;
542
543 r = image_make(class, name, dirfd(d), resolved, name, &st, ret);
544 }
545 if (IN_SET(r, -ENOENT, -EMEDIUMTYPE))
546 continue;
547 if (r < 0)
548 return r;
549
550 if (ret)
551 (*ret)->discoverable = true;
552
553 return 1;
554 }
555
556 if (class == IMAGE_MACHINE && streq(name, ".host")) {
557 r = image_make(class, ".host", AT_FDCWD, NULL, empty_to_root(root), NULL, ret);
558 if (r < 0)
559 return r;
560
561 if (ret)
562 (*ret)->discoverable = true;
563
564 return r;
565 }
566
567 return -ENOENT;
568 };
569
570 int image_from_path(const char *path, Image **ret) {
571
572 /* Note that we don't set the 'discoverable' field of the returned object, because we don't check here whether
573 * the image is in the image search path. And if it is we don't know if the path we used is actually not
574 * overridden by another, different image earlier in the search path */
575
576 if (path_equal(path, "/"))
577 return image_make(IMAGE_MACHINE, ".host", AT_FDCWD, NULL, "/", NULL, ret);
578
579 return image_make(_IMAGE_CLASS_INVALID, NULL, AT_FDCWD, NULL, path, NULL, ret);
580 }
581
582 int image_find_harder(ImageClass class, const char *name_or_path, const char *root, Image **ret) {
583 if (image_name_is_valid(name_or_path))
584 return image_find(class, name_or_path, root, ret);
585
586 return image_from_path(name_or_path, ret);
587 }
588
589 int image_discover(
590 ImageClass class,
591 const char *root,
592 Hashmap *h) {
593
594 int r;
595
596 assert(class >= 0);
597 assert(class < _IMAGE_CLASS_MAX);
598 assert(h);
599
600 NULSTR_FOREACH(path, pick_image_search_path(class)) {
601 _cleanup_free_ char *resolved = NULL;
602 _cleanup_closedir_ DIR *d = NULL;
603
604 r = chase_and_opendir(path, root, CHASE_PREFIX_ROOT, &resolved, &d);
605 if (r == -ENOENT)
606 continue;
607 if (r < 0)
608 return r;
609
610 FOREACH_DIRENT_ALL(de, d, return -errno) {
611 _cleanup_(image_unrefp) Image *image = NULL;
612 _cleanup_free_ char *pretty = NULL;
613 struct stat st;
614 int flags;
615
616 if (dot_or_dot_dot(de->d_name))
617 continue;
618
619 /* As mentioned above, we follow symlinks on this fstatat(), because we want to
620 * permit people to symlink block devices into the search path. */
621 flags = root ? AT_SYMLINK_NOFOLLOW : 0;
622 if (fstatat(dirfd(d), de->d_name, &st, flags) < 0) {
623 if (errno == ENOENT)
624 continue;
625
626 return -errno;
627 }
628
629 if (S_ISREG(st.st_mode))
630 r = extract_pretty(de->d_name, image_class_suffix_to_string(class), ".raw", &pretty);
631 else if (S_ISDIR(st.st_mode))
632 r = extract_pretty(de->d_name, image_class_suffix_to_string(class), NULL, &pretty);
633 else if (S_ISBLK(st.st_mode))
634 r = extract_pretty(de->d_name, NULL, NULL, &pretty);
635 else {
636 log_debug("Skipping directory entry '%s', which is neither regular file, directory nor block device.", de->d_name);
637 continue;
638 }
639 if (r < 0) {
640 log_debug_errno(r, "Skipping directory entry '%s', which doesn't look like an image.", de->d_name);
641 continue;
642 }
643
644 if (hashmap_contains(h, pretty))
645 continue;
646
647 r = image_make(class, pretty, dirfd(d), resolved, de->d_name, &st, &image);
648 if (IN_SET(r, -ENOENT, -EMEDIUMTYPE))
649 continue;
650 if (r < 0)
651 return r;
652
653 image->discoverable = true;
654
655 r = hashmap_put(h, image->name, image);
656 if (r < 0)
657 return r;
658
659 TAKE_PTR(image);
660 }
661 }
662
663 if (class == IMAGE_MACHINE && !hashmap_contains(h, ".host")) {
664 _cleanup_(image_unrefp) Image *image = NULL;
665
666 r = image_make(IMAGE_MACHINE, ".host", AT_FDCWD, NULL, empty_to_root("/"), NULL, &image);
667 if (r < 0)
668 return r;
669
670 image->discoverable = true;
671
672 r = hashmap_put(h, image->name, image);
673 if (r < 0)
674 return r;
675
676 image = NULL;
677 }
678
679 return 0;
680 }
681
682 int image_remove(Image *i) {
683 _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
684 _cleanup_strv_free_ char **settings = NULL;
685 _cleanup_free_ char *roothash = NULL;
686 int r;
687
688 assert(i);
689
690 if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i))
691 return -EROFS;
692
693 settings = image_settings_path(i);
694 if (!settings)
695 return -ENOMEM;
696
697 r = image_roothash_path(i, &roothash);
698 if (r < 0)
699 return r;
700
701 /* Make sure we don't interfere with a running nspawn */
702 r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
703 if (r < 0)
704 return r;
705
706 switch (i->type) {
707
708 case IMAGE_SUBVOLUME:
709
710 /* Let's unlink first, maybe it is a symlink? If that works we are happy. Otherwise, let's get out the
711 * big guns */
712 if (unlink(i->path) < 0) {
713 r = btrfs_subvol_remove(i->path, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
714 if (r < 0)
715 return r;
716 }
717
718 break;
719
720 case IMAGE_DIRECTORY:
721 /* Allow deletion of read-only directories */
722 (void) chattr_path(i->path, 0, FS_IMMUTABLE_FL, NULL);
723 r = rm_rf(i->path, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
724 if (r < 0)
725 return r;
726
727 break;
728
729 case IMAGE_BLOCK:
730
731 /* If this is inside of /dev, then it's a real block device, hence let's not touch the device node
732 * itself (but let's remove the stuff stored alongside it). If it's anywhere else, let's try to unlink
733 * the thing (it's most likely a symlink after all). */
734
735 if (path_startswith(i->path, "/dev"))
736 break;
737
738 _fallthrough_;
739 case IMAGE_RAW:
740 if (unlink(i->path) < 0)
741 return -errno;
742 break;
743
744 default:
745 return -EOPNOTSUPP;
746 }
747
748 STRV_FOREACH(j, settings)
749 if (unlink(*j) < 0 && errno != ENOENT)
750 log_debug_errno(errno, "Failed to unlink %s, ignoring: %m", *j);
751
752 if (unlink(roothash) < 0 && errno != ENOENT)
753 log_debug_errno(errno, "Failed to unlink %s, ignoring: %m", roothash);
754
755 return 0;
756 }
757
758 static int rename_auxiliary_file(const char *path, const char *new_name, const char *suffix) {
759 _cleanup_free_ char *fn = NULL, *rs = NULL;
760 int r;
761
762 fn = strjoin(new_name, suffix);
763 if (!fn)
764 return -ENOMEM;
765
766 r = file_in_same_dir(path, fn, &rs);
767 if (r < 0)
768 return r;
769
770 return rename_noreplace(AT_FDCWD, path, AT_FDCWD, rs);
771 }
772
773 int image_rename(Image *i, const char *new_name) {
774 _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT, name_lock = LOCK_FILE_INIT;
775 _cleanup_free_ char *new_path = NULL, *nn = NULL, *roothash = NULL;
776 _cleanup_strv_free_ char **settings = NULL;
777 unsigned file_attr = 0;
778 int r;
779
780 assert(i);
781
782 if (!image_name_is_valid(new_name))
783 return -EINVAL;
784
785 if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i))
786 return -EROFS;
787
788 settings = image_settings_path(i);
789 if (!settings)
790 return -ENOMEM;
791
792 r = image_roothash_path(i, &roothash);
793 if (r < 0)
794 return r;
795
796 /* Make sure we don't interfere with a running nspawn */
797 r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
798 if (r < 0)
799 return r;
800
801 /* Make sure nobody takes the new name, between the time we
802 * checked it is currently unused in all search paths, and the
803 * time we take possession of it */
804 r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
805 if (r < 0)
806 return r;
807
808 r = image_find(IMAGE_MACHINE, new_name, NULL, NULL);
809 if (r >= 0)
810 return -EEXIST;
811 if (r != -ENOENT)
812 return r;
813
814 switch (i->type) {
815
816 case IMAGE_DIRECTORY:
817 /* Turn of the immutable bit while we rename the image, so that we can rename it */
818 (void) read_attr_path(i->path, &file_attr);
819
820 if (file_attr & FS_IMMUTABLE_FL)
821 (void) chattr_path(i->path, 0, FS_IMMUTABLE_FL, NULL);
822
823 _fallthrough_;
824 case IMAGE_SUBVOLUME:
825 r = file_in_same_dir(i->path, new_name, &new_path);
826 break;
827
828 case IMAGE_BLOCK:
829
830 /* Refuse renaming raw block devices in /dev, the names are picked by udev after all. */
831 if (path_startswith(i->path, "/dev"))
832 return -EROFS;
833
834 r = file_in_same_dir(i->path, new_name, &new_path);
835 break;
836
837 case IMAGE_RAW: {
838 const char *fn;
839
840 fn = strjoina(new_name, ".raw");
841
842 r = file_in_same_dir(i->path, fn, &new_path);
843 break;
844 }
845
846 default:
847 return -EOPNOTSUPP;
848 }
849 if (r < 0)
850 return r;
851
852 nn = strdup(new_name);
853 if (!nn)
854 return -ENOMEM;
855
856 r = rename_noreplace(AT_FDCWD, i->path, AT_FDCWD, new_path);
857 if (r < 0)
858 return r;
859
860 /* Restore the immutable bit, if it was set before */
861 if (file_attr & FS_IMMUTABLE_FL)
862 (void) chattr_path(new_path, FS_IMMUTABLE_FL, FS_IMMUTABLE_FL, NULL);
863
864 free_and_replace(i->path, new_path);
865 free_and_replace(i->name, nn);
866
867 STRV_FOREACH(j, settings) {
868 r = rename_auxiliary_file(*j, new_name, ".nspawn");
869 if (r < 0 && r != -ENOENT)
870 log_debug_errno(r, "Failed to rename settings file %s, ignoring: %m", *j);
871 }
872
873 r = rename_auxiliary_file(roothash, new_name, ".roothash");
874 if (r < 0 && r != -ENOENT)
875 log_debug_errno(r, "Failed to rename roothash file %s, ignoring: %m", roothash);
876
877 return 0;
878 }
879
880 static int clone_auxiliary_file(const char *path, const char *new_name, const char *suffix) {
881 _cleanup_free_ char *fn = NULL, *rs = NULL;
882 int r;
883
884 fn = strjoin(new_name, suffix);
885 if (!fn)
886 return -ENOMEM;
887
888 r = file_in_same_dir(path, fn, &rs);
889 if (r < 0)
890 return r;
891
892 return copy_file_atomic(path, rs, 0664, COPY_REFLINK);
893 }
894
895 int image_clone(Image *i, const char *new_name, bool read_only) {
896 _cleanup_(release_lock_file) LockFile name_lock = LOCK_FILE_INIT;
897 _cleanup_strv_free_ char **settings = NULL;
898 _cleanup_free_ char *roothash = NULL;
899 const char *new_path;
900 int r;
901
902 assert(i);
903
904 if (!image_name_is_valid(new_name))
905 return -EINVAL;
906
907 settings = image_settings_path(i);
908 if (!settings)
909 return -ENOMEM;
910
911 r = image_roothash_path(i, &roothash);
912 if (r < 0)
913 return r;
914
915 /* Make sure nobody takes the new name, between the time we
916 * checked it is currently unused in all search paths, and the
917 * time we take possession of it */
918 r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
919 if (r < 0)
920 return r;
921
922 r = image_find(IMAGE_MACHINE, new_name, NULL, NULL);
923 if (r >= 0)
924 return -EEXIST;
925 if (r != -ENOENT)
926 return r;
927
928 switch (i->type) {
929
930 case IMAGE_SUBVOLUME:
931 case IMAGE_DIRECTORY:
932 /* If we can we'll always try to create a new btrfs subvolume here, even if the source is a plain
933 * directory. */
934
935 new_path = strjoina("/var/lib/machines/", new_name);
936
937 r = btrfs_subvol_snapshot_at(AT_FDCWD, i->path, AT_FDCWD, new_path,
938 (read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
939 BTRFS_SNAPSHOT_FALLBACK_COPY |
940 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
941 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
942 BTRFS_SNAPSHOT_RECURSIVE |
943 BTRFS_SNAPSHOT_QUOTA);
944 if (r >= 0)
945 /* Enable "subtree" quotas for the copy, if we didn't copy any quota from the source. */
946 (void) btrfs_subvol_auto_qgroup(new_path, 0, true);
947
948 break;
949
950 case IMAGE_RAW:
951 new_path = strjoina("/var/lib/machines/", new_name, ".raw");
952
953 r = copy_file_atomic_full(i->path, new_path, read_only ? 0444 : 0644, FS_NOCOW_FL, FS_NOCOW_FL,
954 COPY_REFLINK|COPY_CRTIME, NULL, NULL);
955 break;
956
957 case IMAGE_BLOCK:
958 default:
959 return -EOPNOTSUPP;
960 }
961
962 if (r < 0)
963 return r;
964
965 STRV_FOREACH(j, settings) {
966 r = clone_auxiliary_file(*j, new_name, ".nspawn");
967 if (r < 0 && r != -ENOENT)
968 log_debug_errno(r, "Failed to clone settings %s, ignoring: %m", *j);
969 }
970
971 r = clone_auxiliary_file(roothash, new_name, ".roothash");
972 if (r < 0 && r != -ENOENT)
973 log_debug_errno(r, "Failed to clone root hash file %s, ignoring: %m", roothash);
974
975 return 0;
976 }
977
978 int image_read_only(Image *i, bool b) {
979 _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
980 int r;
981
982 assert(i);
983
984 if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i))
985 return -EROFS;
986
987 /* Make sure we don't interfere with a running nspawn */
988 r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
989 if (r < 0)
990 return r;
991
992 switch (i->type) {
993
994 case IMAGE_SUBVOLUME:
995
996 /* Note that we set the flag only on the top-level
997 * subvolume of the image. */
998
999 r = btrfs_subvol_set_read_only(i->path, b);
1000 if (r < 0)
1001 return r;
1002
1003 break;
1004
1005 case IMAGE_DIRECTORY:
1006 /* For simple directory trees we cannot use the access
1007 mode of the top-level directory, since it has an
1008 effect on the container itself. However, we can
1009 use the "immutable" flag, to at least make the
1010 top-level directory read-only. It's not as good as
1011 a read-only subvolume, but at least something, and
1012 we can read the value back. */
1013
1014 r = chattr_path(i->path, b ? FS_IMMUTABLE_FL : 0, FS_IMMUTABLE_FL, NULL);
1015 if (r < 0)
1016 return r;
1017
1018 break;
1019
1020 case IMAGE_RAW: {
1021 struct stat st;
1022
1023 if (stat(i->path, &st) < 0)
1024 return -errno;
1025
1026 if (chmod(i->path, (st.st_mode & 0444) | (b ? 0000 : 0200)) < 0)
1027 return -errno;
1028
1029 /* If the images is now read-only, it's a good time to
1030 * defrag it, given that no write patterns will
1031 * fragment it again. */
1032 if (b)
1033 (void) btrfs_defrag(i->path);
1034 break;
1035 }
1036
1037 case IMAGE_BLOCK: {
1038 _cleanup_close_ int fd = -EBADF;
1039 struct stat st;
1040 int state = b;
1041
1042 fd = open(i->path, O_CLOEXEC|O_RDONLY|O_NONBLOCK|O_NOCTTY);
1043 if (fd < 0)
1044 return -errno;
1045
1046 if (fstat(fd, &st) < 0)
1047 return -errno;
1048 if (!S_ISBLK(st.st_mode))
1049 return -ENOTTY;
1050
1051 if (ioctl(fd, BLKROSET, &state) < 0)
1052 return -errno;
1053
1054 break;
1055 }
1056
1057 default:
1058 return -EOPNOTSUPP;
1059 }
1060
1061 return 0;
1062 }
1063
1064 int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local) {
1065 _cleanup_free_ char *p = NULL;
1066 LockFile t = LOCK_FILE_INIT;
1067 struct stat st;
1068 bool exclusive;
1069 int r;
1070
1071 assert(path);
1072 assert(global);
1073 assert(local);
1074
1075 /* Locks an image path. This actually creates two locks: one "local" one, next to the image path
1076 * itself, which might be shared via NFS. And another "global" one, in /run, that uses the
1077 * device/inode number. This has the benefit that we can even lock a tree that is a mount point,
1078 * correctly. */
1079
1080 if (!path_is_absolute(path))
1081 return -EINVAL;
1082
1083 switch (operation & (LOCK_SH|LOCK_EX)) {
1084 case LOCK_SH:
1085 exclusive = false;
1086 break;
1087 case LOCK_EX:
1088 exclusive = true;
1089 break;
1090 default:
1091 return -EINVAL;
1092 }
1093
1094 if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) {
1095 *local = *global = (LockFile) LOCK_FILE_INIT;
1096 return 0;
1097 }
1098
1099 /* Prohibit taking exclusive locks on the host image. We can't allow this, since we ourselves are
1100 * running off it after all, and we don't want any images to manipulate the host image. We make an
1101 * exception for shared locks however: we allow those (and make them NOPs since there's no point in
1102 * taking them if there can't be exclusive locks). Strictly speaking these are questionable as well,
1103 * since it means changes made to the host might propagate to the container as they happen (and a
1104 * shared lock kinda suggests that no changes happen at all while it is in place), but it's too
1105 * useful not to allow read-only containers off the host root, hence let's support this, and trust
1106 * the user to do the right thing with this. */
1107 if (path_equal(path, "/")) {
1108 if (exclusive)
1109 return -EBUSY;
1110
1111 *local = *global = (LockFile) LOCK_FILE_INIT;
1112 return 0;
1113 }
1114
1115 if (stat(path, &st) >= 0) {
1116 if (S_ISBLK(st.st_mode))
1117 r = asprintf(&p, "/run/systemd/nspawn/locks/block-%u:%u", major(st.st_rdev), minor(st.st_rdev));
1118 else if (S_ISDIR(st.st_mode) || S_ISREG(st.st_mode))
1119 r = asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino);
1120 else
1121 return -ENOTTY;
1122 if (r < 0)
1123 return -ENOMEM;
1124 }
1125
1126 /* For block devices we don't need the "local" lock, as the major/minor lock above should be
1127 * sufficient, since block devices are host local anyway. */
1128 if (!path_startswith(path, "/dev/")) {
1129 r = make_lock_file_for(path, operation, &t);
1130 if (r < 0) {
1131 if (!exclusive && r == -EROFS)
1132 log_debug_errno(r, "Failed to create shared lock for '%s', ignoring: %m", path);
1133 else
1134 return r;
1135 }
1136 }
1137
1138 if (p) {
1139 (void) mkdir_p("/run/systemd/nspawn/locks", 0700);
1140
1141 r = make_lock_file(p, operation, global);
1142 if (r < 0) {
1143 release_lock_file(&t);
1144 return r;
1145 }
1146 } else
1147 *global = (LockFile) LOCK_FILE_INIT;
1148
1149 *local = t;
1150 return 0;
1151 }
1152
1153 int image_set_limit(Image *i, uint64_t referenced_max) {
1154 assert(i);
1155
1156 if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i))
1157 return -EROFS;
1158
1159 if (i->type != IMAGE_SUBVOLUME)
1160 return -EOPNOTSUPP;
1161
1162 /* We set the quota both for the subvolume as well as for the
1163 * subtree. The latter is mostly for historical reasons, since
1164 * we didn't use to have a concept of subtree quota, and hence
1165 * only modified the subvolume quota. */
1166
1167 (void) btrfs_qgroup_set_limit(i->path, 0, referenced_max);
1168 (void) btrfs_subvol_auto_qgroup(i->path, 0, true);
1169 return btrfs_subvol_set_subtree_quota_limit(i->path, 0, referenced_max);
1170 }
1171
1172 int image_read_metadata(Image *i, const ImagePolicy *image_policy) {
1173 _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
1174 int r;
1175
1176 assert(i);
1177
1178 r = image_path_lock(i->path, LOCK_SH|LOCK_NB, &global_lock, &local_lock);
1179 if (r < 0)
1180 return r;
1181
1182 switch (i->type) {
1183
1184 case IMAGE_SUBVOLUME:
1185 case IMAGE_DIRECTORY: {
1186 _cleanup_strv_free_ char **machine_info = NULL, **os_release = NULL, **sysext_release = NULL, **confext_release = NULL;
1187 _cleanup_free_ char *hostname = NULL, *path = NULL;
1188 sd_id128_t machine_id = SD_ID128_NULL;
1189
1190 if (i->class == IMAGE_SYSEXT) {
1191 r = extension_has_forbidden_content(i->path);
1192 if (r < 0)
1193 return r;
1194 if (r > 0)
1195 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
1196 "Conflicting content found in image %s, refusing.",
1197 i->name);
1198 }
1199
1200 r = chase("/etc/hostname", i->path, CHASE_PREFIX_ROOT|CHASE_TRAIL_SLASH, &path, NULL);
1201 if (r < 0 && r != -ENOENT)
1202 log_debug_errno(r, "Failed to chase /etc/hostname in image %s: %m", i->name);
1203 else if (r >= 0) {
1204 r = read_etc_hostname(path, &hostname);
1205 if (r < 0)
1206 log_debug_errno(errno, "Failed to read /etc/hostname of image %s: %m", i->name);
1207 }
1208
1209 path = mfree(path);
1210
1211 r = id128_get_machine(i->path, &machine_id);
1212 if (r < 0)
1213 log_debug_errno(r, "Failed to read machine ID in image %s, ignoring: %m", i->name);
1214
1215 r = chase("/etc/machine-info", i->path, CHASE_PREFIX_ROOT|CHASE_TRAIL_SLASH, &path, NULL);
1216 if (r < 0 && r != -ENOENT)
1217 log_debug_errno(r, "Failed to chase /etc/machine-info in image %s: %m", i->name);
1218 else if (r >= 0) {
1219 r = load_env_file_pairs(NULL, path, &machine_info);
1220 if (r < 0)
1221 log_debug_errno(r, "Failed to parse machine-info data of %s: %m", i->name);
1222 }
1223
1224 r = load_os_release_pairs(i->path, &os_release);
1225 if (r < 0)
1226 log_debug_errno(r, "Failed to read os-release in image, ignoring: %m");
1227
1228 r = load_extension_release_pairs(i->path, IMAGE_SYSEXT, i->name, /* relax_extension_release_check= */ false, &sysext_release);
1229 if (r < 0)
1230 log_debug_errno(r, "Failed to read sysext-release in image, ignoring: %m");
1231
1232 r = load_extension_release_pairs(i->path, IMAGE_CONFEXT, i->name, /* relax_extension_release_check= */ false, &confext_release);
1233 if (r < 0)
1234 log_debug_errno(r, "Failed to read confext-release in image, ignoring: %m");
1235
1236 free_and_replace(i->hostname, hostname);
1237 i->machine_id = machine_id;
1238 strv_free_and_replace(i->machine_info, machine_info);
1239 strv_free_and_replace(i->os_release, os_release);
1240 strv_free_and_replace(i->sysext_release, sysext_release);
1241 strv_free_and_replace(i->confext_release, confext_release);
1242 break;
1243 }
1244
1245 case IMAGE_RAW:
1246 case IMAGE_BLOCK: {
1247 _cleanup_(loop_device_unrefp) LoopDevice *d = NULL;
1248 _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL;
1249
1250 r = loop_device_make_by_path(i->path, O_RDONLY, /* sector_size= */ UINT32_MAX, LO_FLAGS_PARTSCAN, LOCK_SH, &d);
1251 if (r < 0)
1252 return r;
1253
1254 r = dissect_loop_device(
1255 d,
1256 /* verity= */ NULL,
1257 /* mount_options= */ NULL,
1258 image_policy,
1259 DISSECT_IMAGE_GENERIC_ROOT |
1260 DISSECT_IMAGE_REQUIRE_ROOT |
1261 DISSECT_IMAGE_RELAX_VAR_CHECK |
1262 DISSECT_IMAGE_READ_ONLY |
1263 DISSECT_IMAGE_USR_NO_ROOT |
1264 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
1265 DISSECT_IMAGE_PIN_PARTITION_DEVICES,
1266 &m);
1267 if (r < 0)
1268 return r;
1269
1270 r = dissected_image_acquire_metadata(m,
1271 DISSECT_IMAGE_VALIDATE_OS |
1272 DISSECT_IMAGE_VALIDATE_OS_EXT);
1273 if (r < 0)
1274 return r;
1275
1276 free_and_replace(i->hostname, m->hostname);
1277 i->machine_id = m->machine_id;
1278 strv_free_and_replace(i->machine_info, m->machine_info);
1279 strv_free_and_replace(i->os_release, m->os_release);
1280 strv_free_and_replace(i->sysext_release, m->sysext_release);
1281 strv_free_and_replace(i->confext_release, m->confext_release);
1282
1283 break;
1284 }
1285
1286 default:
1287 return -EOPNOTSUPP;
1288 }
1289
1290 i->metadata_valid = true;
1291
1292 return 0;
1293 }
1294
1295 int image_name_lock(const char *name, int operation, LockFile *ret) {
1296 const char *p;
1297
1298 assert(name);
1299 assert(ret);
1300
1301 /* Locks an image name, regardless of the precise path used. */
1302
1303 if (streq(name, ".host"))
1304 return -EBUSY;
1305
1306 if (!image_name_is_valid(name))
1307 return -EINVAL;
1308
1309 if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) {
1310 *ret = (LockFile) LOCK_FILE_INIT;
1311 return 0;
1312 }
1313
1314 (void) mkdir_p("/run/systemd/nspawn/locks", 0700);
1315
1316 p = strjoina("/run/systemd/nspawn/locks/name-", name);
1317 return make_lock_file(p, operation, ret);
1318 }
1319
1320 bool image_in_search_path(
1321 ImageClass class,
1322 const char *root,
1323 const char *image) {
1324
1325 assert(image);
1326
1327 NULSTR_FOREACH(path, pick_image_search_path(class)) {
1328 const char *p, *q;
1329 size_t k;
1330
1331 if (!empty_or_root(root)) {
1332 q = path_startswith(path, root);
1333 if (!q)
1334 continue;
1335 } else
1336 q = path;
1337
1338 p = path_startswith(q, path);
1339 if (!p)
1340 continue;
1341
1342 /* Make sure there's a filename following */
1343 k = strcspn(p, "/");
1344 if (k == 0)
1345 continue;
1346
1347 p += k;
1348
1349 /* Accept trailing slashes */
1350 if (p[strspn(p, "/")] == 0)
1351 return true;
1352
1353 }
1354
1355 return false;
1356 }
1357
1358 int image_to_json(const struct Image *img, JsonVariant **ret) {
1359 assert(img);
1360
1361 return json_build(ret,
1362 JSON_BUILD_OBJECT(
1363 JSON_BUILD_PAIR_STRING("Type", image_type_to_string(img->type)),
1364 JSON_BUILD_PAIR_STRING("Class", image_class_to_string(img->class)),
1365 JSON_BUILD_PAIR_STRING("Name", img->name),
1366 JSON_BUILD_PAIR_CONDITION(img->path, "Path", JSON_BUILD_STRING(img->path)),
1367 JSON_BUILD_PAIR_BOOLEAN("ReadOnly", img->read_only),
1368 JSON_BUILD_PAIR_CONDITION(img->crtime != 0, "CreationTimestamp", JSON_BUILD_UNSIGNED(img->crtime)),
1369 JSON_BUILD_PAIR_CONDITION(img->mtime != 0, "ModificationTimestamp", JSON_BUILD_UNSIGNED(img->mtime)),
1370 JSON_BUILD_PAIR_CONDITION(img->usage != UINT64_MAX, "Usage", JSON_BUILD_UNSIGNED(img->usage)),
1371 JSON_BUILD_PAIR_CONDITION(img->usage_exclusive != UINT64_MAX, "UsageExclusive", JSON_BUILD_UNSIGNED(img->usage_exclusive)),
1372 JSON_BUILD_PAIR_CONDITION(img->limit != UINT64_MAX, "Limit", JSON_BUILD_UNSIGNED(img->limit)),
1373 JSON_BUILD_PAIR_CONDITION(img->limit_exclusive != UINT64_MAX, "LimitExclusive", JSON_BUILD_UNSIGNED(img->limit_exclusive))));
1374 }
1375
1376 static const char* const image_type_table[_IMAGE_TYPE_MAX] = {
1377 [IMAGE_DIRECTORY] = "directory",
1378 [IMAGE_SUBVOLUME] = "subvolume",
1379 [IMAGE_RAW] = "raw",
1380 [IMAGE_BLOCK] = "block",
1381 };
1382
1383 DEFINE_STRING_TABLE_LOOKUP(image_type, ImageType);