]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/discover-image.c
man/systemd.mount: tmpfs automatically gains After=swap.target dep
[thirdparty/systemd.git] / src / shared / discover-image.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/fs.h>
6 #include <linux/loop.h>
7 #include <linux/magic.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <sys/file.h>
11 #include <sys/ioctl.h>
12 #include <sys/stat.h>
13 #include <unistd.h>
14
15 #include "alloc-util.h"
16 #include "btrfs-util.h"
17 #include "chase.h"
18 #include "chattr-util.h"
19 #include "copy.h"
20 #include "dirent-util.h"
21 #include "discover-image.h"
22 #include "dissect-image.h"
23 #include "env-file.h"
24 #include "env-util.h"
25 #include "extension-util.h"
26 #include "fd-util.h"
27 #include "fs-util.h"
28 #include "hashmap.h"
29 #include "hostname-setup.h"
30 #include "id128-util.h"
31 #include "initrd-util.h"
32 #include "lock-util.h"
33 #include "log.h"
34 #include "loop-util.h"
35 #include "macro.h"
36 #include "mkdir.h"
37 #include "nulstr-util.h"
38 #include "os-util.h"
39 #include "path-util.h"
40 #include "rm-rf.h"
41 #include "stat-util.h"
42 #include "string-table.h"
43 #include "string-util.h"
44 #include "strv.h"
45 #include "time-util.h"
46 #include "utf8.h"
47 #include "xattr-util.h"
48
49 static const char* const image_search_path[_IMAGE_CLASS_MAX] = {
50 [IMAGE_MACHINE] = "/etc/machines\0" /* only place symlinks here */
51 "/run/machines\0" /* and here too */
52 "/var/lib/machines\0" /* the main place for images */
53 "/var/lib/container\0" /* legacy */
54 "/usr/local/lib/machines\0"
55 "/usr/lib/machines\0",
56
57 [IMAGE_PORTABLE] = "/etc/portables\0" /* only place symlinks here */
58 "/run/portables\0" /* and here too */
59 "/var/lib/portables\0" /* the main place for images */
60 "/usr/local/lib/portables\0"
61 "/usr/lib/portables\0",
62
63 /* Note that we don't allow storing extensions under /usr/, unlike with other image types. That's
64 * because extension images are supposed to extend /usr/, so you get into recursive races, especially
65 * with directory-based extensions, as the kernel's OverlayFS explicitly checks for this and errors
66 * out with -ELOOP if it finds that a lowerdir= is a child of another lowerdir=. */
67 [IMAGE_SYSEXT] = "/etc/extensions\0" /* only place symlinks here */
68 "/run/extensions\0" /* and here too */
69 "/var/lib/extensions\0", /* the main place for images */
70
71 [IMAGE_CONFEXT] = "/run/confexts\0" /* only place symlinks here */
72 "/var/lib/confexts\0" /* the main place for images */
73 "/usr/local/lib/confexts\0"
74 "/usr/lib/confexts\0",
75 };
76
77 /* Inside the initrd, use a slightly different set of search path (i.e. include .extra/sysext in extension
78 * search dir) */
79 static const char* const image_search_path_initrd[_IMAGE_CLASS_MAX] = {
80 /* (entries that aren't listed here will get the same search path as for the non initrd-case) */
81
82 [IMAGE_SYSEXT] = "/etc/extensions\0" /* only place symlinks here */
83 "/run/extensions\0" /* and here too */
84 "/var/lib/extensions\0" /* the main place for images */
85 "/.extra/sysext\0" /* put sysext picked up by systemd-stub last, since not trusted */
86 };
87
88 static const char* image_class_suffix_table[_IMAGE_CLASS_MAX] = {
89 [IMAGE_SYSEXT] = ".sysext",
90 [IMAGE_CONFEXT] = ".confext",
91 };
92
93 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(image_class_suffix, ImageClass);
94
95 static Image *image_free(Image *i) {
96 assert(i);
97
98 free(i->name);
99 free(i->path);
100
101 free(i->hostname);
102 strv_free(i->machine_info);
103 strv_free(i->os_release);
104 strv_free(i->sysext_release);
105 strv_free(i->confext_release);
106
107 return mfree(i);
108 }
109
110 DEFINE_TRIVIAL_REF_UNREF_FUNC(Image, image, image_free);
111 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(image_hash_ops, char, string_hash_func, string_compare_func,
112 Image, image_unref);
113
114 static char **image_settings_path(Image *image) {
115 _cleanup_strv_free_ char **l = NULL;
116 _cleanup_free_ char *fn = NULL;
117 size_t i = 0;
118 int r;
119
120 assert(image);
121
122 l = new0(char*, 4);
123 if (!l)
124 return NULL;
125
126 fn = strjoin(image->name, ".nspawn");
127 if (!fn)
128 return NULL;
129
130 FOREACH_STRING(s, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
131 l[i] = path_join(s, fn);
132 if (!l[i])
133 return NULL;
134
135 i++;
136 }
137
138 r = file_in_same_dir(image->path, fn, l + i);
139 if (r == -ENOMEM)
140 return NULL;
141 if (r < 0)
142 log_debug_errno(r, "Failed to generate .nspawn settings path from image path, ignoring: %m");
143
144 strv_uniq(l);
145
146 return TAKE_PTR(l);
147 }
148
149 static int image_roothash_path(Image *image, char **ret) {
150 _cleanup_free_ char *fn = NULL;
151
152 assert(image);
153
154 fn = strjoin(image->name, ".roothash");
155 if (!fn)
156 return -ENOMEM;
157
158 return file_in_same_dir(image->path, fn, ret);
159 }
160
161 static int image_new(
162 ImageType t,
163 ImageClass c,
164 const char *pretty,
165 const char *path,
166 const char *filename,
167 bool read_only,
168 usec_t crtime,
169 usec_t mtime,
170 Image **ret) {
171
172 _cleanup_(image_unrefp) Image *i = NULL;
173
174 assert(t >= 0);
175 assert(t < _IMAGE_TYPE_MAX);
176 assert(pretty);
177 assert(filename);
178 assert(ret);
179
180 i = new(Image, 1);
181 if (!i)
182 return -ENOMEM;
183
184 *i = (Image) {
185 .n_ref = 1,
186 .type = t,
187 .class = c,
188 .read_only = read_only,
189 .crtime = crtime,
190 .mtime = mtime,
191 .usage = UINT64_MAX,
192 .usage_exclusive = UINT64_MAX,
193 .limit = UINT64_MAX,
194 .limit_exclusive = UINT64_MAX,
195 };
196
197 i->name = strdup(pretty);
198 if (!i->name)
199 return -ENOMEM;
200
201 i->path = path_join(path, filename);
202 if (!i->path)
203 return -ENOMEM;
204
205 path_simplify(i->path);
206
207 *ret = TAKE_PTR(i);
208
209 return 0;
210 }
211
212 static int extract_pretty(
213 const char *path,
214 const char *class_suffix,
215 const char *format_suffix,
216 char **ret) {
217
218 _cleanup_free_ char *name = NULL;
219 int r;
220
221 assert(path);
222 assert(ret);
223
224 r = path_extract_filename(path, &name);
225 if (r < 0)
226 return r;
227
228 if (format_suffix) {
229 char *e = endswith(name, format_suffix);
230 if (!e) /* Format suffix is required */
231 return -EINVAL;
232
233 *e = 0;
234 }
235
236 if (class_suffix) {
237 char *e = endswith(name, class_suffix);
238 if (e) /* Class suffix is optional */
239 *e = 0;
240 }
241
242 if (!image_name_is_valid(name))
243 return -EINVAL;
244
245 *ret = TAKE_PTR(name);
246 return 0;
247 }
248
249 static int image_make(
250 ImageClass c,
251 const char *pretty,
252 int dfd,
253 const char *path,
254 const char *filename,
255 const struct stat *st,
256 Image **ret) {
257
258 _cleanup_free_ char *pretty_buffer = NULL, *parent = NULL;
259 struct stat stbuf;
260 bool read_only;
261 int r;
262
263 assert(dfd >= 0 || dfd == AT_FDCWD);
264 assert(path || dfd == AT_FDCWD);
265 assert(filename);
266
267 /* We explicitly *do* follow symlinks here, since we want to allow symlinking trees, raw files and block
268 * devices into /var/lib/machines/, and treat them normally.
269 *
270 * This function returns -ENOENT if we can't find the image after all, and -EMEDIUMTYPE if it's not a file we
271 * recognize. */
272
273 if (!st) {
274 if (fstatat(dfd, filename, &stbuf, 0) < 0)
275 return -errno;
276
277 st = &stbuf;
278 }
279
280 if (!path) {
281 if (dfd == AT_FDCWD)
282 (void) safe_getcwd(&parent);
283 else
284 (void) fd_get_path(dfd, &parent);
285 }
286
287 read_only =
288 (path && path_startswith(path, "/usr")) ||
289 (faccessat(dfd, filename, W_OK, AT_EACCESS) < 0 && errno == EROFS);
290
291 if (S_ISDIR(st->st_mode)) {
292 _cleanup_close_ int fd = -EBADF;
293 unsigned file_attr = 0;
294 usec_t crtime = 0;
295
296 if (!ret)
297 return 0;
298
299 if (!pretty) {
300 r = extract_pretty(filename, image_class_suffix_to_string(c), NULL, &pretty_buffer);
301 if (r < 0)
302 return r;
303
304 pretty = pretty_buffer;
305 }
306
307 fd = openat(dfd, filename, O_CLOEXEC|O_NOCTTY|O_DIRECTORY);
308 if (fd < 0)
309 return -errno;
310
311 if (btrfs_might_be_subvol(st)) {
312
313 r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC);
314 if (r < 0)
315 return r;
316 if (r > 0) {
317 BtrfsSubvolInfo info;
318
319 /* It's a btrfs subvolume */
320
321 r = btrfs_subvol_get_info_fd(fd, 0, &info);
322 if (r < 0)
323 return r;
324
325 r = image_new(IMAGE_SUBVOLUME,
326 c,
327 pretty,
328 path,
329 filename,
330 info.read_only || read_only,
331 info.otime,
332 0,
333 ret);
334 if (r < 0)
335 return r;
336
337 if (btrfs_quota_scan_ongoing(fd) == 0) {
338 BtrfsQuotaInfo quota;
339
340 r = btrfs_subvol_get_subtree_quota_fd(fd, 0, &quota);
341 if (r >= 0) {
342 (*ret)->usage = quota.referenced;
343 (*ret)->usage_exclusive = quota.exclusive;
344
345 (*ret)->limit = quota.referenced_max;
346 (*ret)->limit_exclusive = quota.exclusive_max;
347 }
348 }
349
350 return 0;
351 }
352 }
353
354 /* Get directory creation time (not available everywhere, but that's OK */
355 (void) fd_getcrtime(fd, &crtime);
356
357 /* If the IMMUTABLE bit is set, we consider the directory read-only. Since the ioctl is not
358 * supported everywhere we ignore failures. */
359 (void) read_attr_fd(fd, &file_attr);
360
361 /* It's just a normal directory. */
362 r = image_new(IMAGE_DIRECTORY,
363 c,
364 pretty,
365 path,
366 filename,
367 read_only || (file_attr & FS_IMMUTABLE_FL),
368 crtime,
369 0, /* we don't use mtime of stat() here, since it's not the time of last change of the tree, but only of the top-level dir */
370 ret);
371 if (r < 0)
372 return r;
373
374 return 0;
375
376 } else if (S_ISREG(st->st_mode) && endswith(filename, ".raw")) {
377 usec_t crtime = 0;
378
379 /* It's a RAW disk image */
380
381 if (!ret)
382 return 0;
383
384 (void) fd_getcrtime_at(dfd, filename, AT_SYMLINK_FOLLOW, &crtime);
385
386 if (!pretty) {
387 r = extract_pretty(filename, image_class_suffix_to_string(c), ".raw", &pretty_buffer);
388 if (r < 0)
389 return r;
390
391 pretty = pretty_buffer;
392 }
393
394 r = image_new(IMAGE_RAW,
395 c,
396 pretty,
397 path,
398 filename,
399 !(st->st_mode & 0222) || read_only,
400 crtime,
401 timespec_load(&st->st_mtim),
402 ret);
403 if (r < 0)
404 return r;
405
406 (*ret)->usage = (*ret)->usage_exclusive = st->st_blocks * 512;
407 (*ret)->limit = (*ret)->limit_exclusive = st->st_size;
408
409 return 0;
410
411 } else if (S_ISBLK(st->st_mode)) {
412 _cleanup_close_ int block_fd = -EBADF;
413 uint64_t size = UINT64_MAX;
414
415 /* A block device */
416
417 if (!ret)
418 return 0;
419
420 if (!pretty) {
421 r = extract_pretty(filename, NULL, NULL, &pretty_buffer);
422 if (r < 0)
423 return r;
424
425 pretty = pretty_buffer;
426 }
427
428 block_fd = openat(dfd, filename, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY);
429 if (block_fd < 0)
430 log_debug_errno(errno, "Failed to open block device %s/%s, ignoring: %m", path ?: strnull(parent), filename);
431 else {
432 /* Refresh stat data after opening the node */
433 if (fstat(block_fd, &stbuf) < 0)
434 return -errno;
435 st = &stbuf;
436
437 if (!S_ISBLK(st->st_mode)) /* Verify that what we opened is actually what we think it is */
438 return -ENOTTY;
439
440 if (!read_only) {
441 int state = 0;
442
443 if (ioctl(block_fd, BLKROGET, &state) < 0)
444 log_debug_errno(errno, "Failed to issue BLKROGET on device %s/%s, ignoring: %m", path ?: strnull(parent), filename);
445 else if (state)
446 read_only = true;
447 }
448
449 if (ioctl(block_fd, BLKGETSIZE64, &size) < 0)
450 log_debug_errno(errno, "Failed to issue BLKGETSIZE64 on device %s/%s, ignoring: %m", path ?: strnull(parent), filename);
451
452 block_fd = safe_close(block_fd);
453 }
454
455 r = image_new(IMAGE_BLOCK,
456 c,
457 pretty,
458 path,
459 filename,
460 !(st->st_mode & 0222) || read_only,
461 0,
462 0,
463 ret);
464 if (r < 0)
465 return r;
466
467 if (!IN_SET(size, 0, UINT64_MAX))
468 (*ret)->usage = (*ret)->usage_exclusive = (*ret)->limit = (*ret)->limit_exclusive = size;
469
470 return 0;
471 }
472
473 return -EMEDIUMTYPE;
474 }
475
476 static const char *pick_image_search_path(ImageClass class) {
477 if (class < 0 || class >= _IMAGE_CLASS_MAX)
478 return NULL;
479
480 /* Use the initrd search path if there is one, otherwise use the common one */
481 return in_initrd() && image_search_path_initrd[class] ? image_search_path_initrd[class] : image_search_path[class];
482 }
483
484 int image_find(ImageClass class,
485 const char *name,
486 const char *root,
487 Image **ret) {
488
489 int r;
490
491 assert(class >= 0);
492 assert(class < _IMAGE_CLASS_MAX);
493 assert(name);
494
495 /* There are no images with invalid names */
496 if (!image_name_is_valid(name))
497 return -ENOENT;
498
499 NULSTR_FOREACH(path, pick_image_search_path(class)) {
500 _cleanup_free_ char *resolved = NULL;
501 _cleanup_closedir_ DIR *d = NULL;
502 struct stat st;
503 int flags;
504
505 r = chase_and_opendir(path, root, CHASE_PREFIX_ROOT, &resolved, &d);
506 if (r == -ENOENT)
507 continue;
508 if (r < 0)
509 return r;
510
511 /* As mentioned above, we follow symlinks on this fstatat(), because we want to permit people
512 * to symlink block devices into the search path. (For now, we disable that when operating
513 * relative to some root directory.) */
514 flags = root ? AT_SYMLINK_NOFOLLOW : 0;
515 if (fstatat(dirfd(d), name, &st, flags) < 0) {
516 _cleanup_free_ char *raw = NULL;
517
518 if (errno != ENOENT)
519 return -errno;
520
521 raw = strjoin(name, ".raw");
522 if (!raw)
523 return -ENOMEM;
524
525 if (fstatat(dirfd(d), raw, &st, flags) < 0) {
526 if (errno == ENOENT)
527 continue;
528
529 return -errno;
530 }
531
532 if (!S_ISREG(st.st_mode))
533 continue;
534
535 r = image_make(class, name, dirfd(d), resolved, raw, &st, ret);
536
537 } else {
538 if (!S_ISDIR(st.st_mode) && !S_ISBLK(st.st_mode))
539 continue;
540
541 r = image_make(class, name, dirfd(d), resolved, name, &st, ret);
542 }
543 if (IN_SET(r, -ENOENT, -EMEDIUMTYPE))
544 continue;
545 if (r < 0)
546 return r;
547
548 if (ret)
549 (*ret)->discoverable = true;
550
551 return 1;
552 }
553
554 if (class == IMAGE_MACHINE && streq(name, ".host")) {
555 r = image_make(class, ".host", AT_FDCWD, NULL, empty_to_root(root), NULL, ret);
556 if (r < 0)
557 return r;
558
559 if (ret)
560 (*ret)->discoverable = true;
561
562 return r;
563 }
564
565 return -ENOENT;
566 };
567
568 int image_from_path(const char *path, Image **ret) {
569
570 /* Note that we don't set the 'discoverable' field of the returned object, because we don't check here whether
571 * the image is in the image search path. And if it is we don't know if the path we used is actually not
572 * overridden by another, different image earlier in the search path */
573
574 if (path_equal(path, "/"))
575 return image_make(IMAGE_MACHINE, ".host", AT_FDCWD, NULL, "/", NULL, ret);
576
577 return image_make(_IMAGE_CLASS_INVALID, NULL, AT_FDCWD, NULL, path, NULL, ret);
578 }
579
580 int image_find_harder(ImageClass class, const char *name_or_path, const char *root, Image **ret) {
581 if (image_name_is_valid(name_or_path))
582 return image_find(class, name_or_path, root, ret);
583
584 return image_from_path(name_or_path, ret);
585 }
586
587 int image_discover(
588 ImageClass class,
589 const char *root,
590 Hashmap *h) {
591
592 int r;
593
594 assert(class >= 0);
595 assert(class < _IMAGE_CLASS_MAX);
596 assert(h);
597
598 NULSTR_FOREACH(path, pick_image_search_path(class)) {
599 _cleanup_free_ char *resolved = NULL;
600 _cleanup_closedir_ DIR *d = NULL;
601
602 r = chase_and_opendir(path, root, CHASE_PREFIX_ROOT, &resolved, &d);
603 if (r == -ENOENT)
604 continue;
605 if (r < 0)
606 return r;
607
608 FOREACH_DIRENT_ALL(de, d, return -errno) {
609 _cleanup_(image_unrefp) Image *image = NULL;
610 _cleanup_free_ char *pretty = NULL;
611 struct stat st;
612 int flags;
613
614 if (dot_or_dot_dot(de->d_name))
615 continue;
616
617 /* As mentioned above, we follow symlinks on this fstatat(), because we want to
618 * permit people to symlink block devices into the search path. */
619 flags = root ? AT_SYMLINK_NOFOLLOW : 0;
620 if (fstatat(dirfd(d), de->d_name, &st, flags) < 0) {
621 if (errno == ENOENT)
622 continue;
623
624 return -errno;
625 }
626
627 if (S_ISREG(st.st_mode))
628 r = extract_pretty(de->d_name, image_class_suffix_to_string(class), ".raw", &pretty);
629 else if (S_ISDIR(st.st_mode))
630 r = extract_pretty(de->d_name, image_class_suffix_to_string(class), NULL, &pretty);
631 else if (S_ISBLK(st.st_mode))
632 r = extract_pretty(de->d_name, NULL, NULL, &pretty);
633 else {
634 log_debug("Skipping directory entry '%s', which is neither regular file, directory nor block device.", de->d_name);
635 continue;
636 }
637 if (r < 0) {
638 log_debug_errno(r, "Skipping directory entry '%s', which doesn't look like an image.", de->d_name);
639 continue;
640 }
641
642 if (hashmap_contains(h, pretty))
643 continue;
644
645 r = image_make(class, pretty, dirfd(d), resolved, de->d_name, &st, &image);
646 if (IN_SET(r, -ENOENT, -EMEDIUMTYPE))
647 continue;
648 if (r < 0)
649 return r;
650
651 image->discoverable = true;
652
653 r = hashmap_put(h, image->name, image);
654 if (r < 0)
655 return r;
656
657 TAKE_PTR(image);
658 }
659 }
660
661 if (class == IMAGE_MACHINE && !hashmap_contains(h, ".host")) {
662 _cleanup_(image_unrefp) Image *image = NULL;
663
664 r = image_make(IMAGE_MACHINE, ".host", AT_FDCWD, NULL, empty_to_root("/"), NULL, &image);
665 if (r < 0)
666 return r;
667
668 image->discoverable = true;
669
670 r = hashmap_put(h, image->name, image);
671 if (r < 0)
672 return r;
673
674 image = NULL;
675 }
676
677 return 0;
678 }
679
680 int image_remove(Image *i) {
681 _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
682 _cleanup_strv_free_ char **settings = NULL;
683 _cleanup_free_ char *roothash = NULL;
684 int r;
685
686 assert(i);
687
688 if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i))
689 return -EROFS;
690
691 settings = image_settings_path(i);
692 if (!settings)
693 return -ENOMEM;
694
695 r = image_roothash_path(i, &roothash);
696 if (r < 0)
697 return r;
698
699 /* Make sure we don't interfere with a running nspawn */
700 r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
701 if (r < 0)
702 return r;
703
704 switch (i->type) {
705
706 case IMAGE_SUBVOLUME:
707
708 /* Let's unlink first, maybe it is a symlink? If that works we are happy. Otherwise, let's get out the
709 * big guns */
710 if (unlink(i->path) < 0) {
711 r = btrfs_subvol_remove(i->path, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
712 if (r < 0)
713 return r;
714 }
715
716 break;
717
718 case IMAGE_DIRECTORY:
719 /* Allow deletion of read-only directories */
720 (void) chattr_path(i->path, 0, FS_IMMUTABLE_FL, NULL);
721 r = rm_rf(i->path, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
722 if (r < 0)
723 return r;
724
725 break;
726
727 case IMAGE_BLOCK:
728
729 /* If this is inside of /dev, then it's a real block device, hence let's not touch the device node
730 * itself (but let's remove the stuff stored alongside it). If it's anywhere else, let's try to unlink
731 * the thing (it's most likely a symlink after all). */
732
733 if (path_startswith(i->path, "/dev"))
734 break;
735
736 _fallthrough_;
737 case IMAGE_RAW:
738 if (unlink(i->path) < 0)
739 return -errno;
740 break;
741
742 default:
743 return -EOPNOTSUPP;
744 }
745
746 STRV_FOREACH(j, settings)
747 if (unlink(*j) < 0 && errno != ENOENT)
748 log_debug_errno(errno, "Failed to unlink %s, ignoring: %m", *j);
749
750 if (unlink(roothash) < 0 && errno != ENOENT)
751 log_debug_errno(errno, "Failed to unlink %s, ignoring: %m", roothash);
752
753 return 0;
754 }
755
756 static int rename_auxiliary_file(const char *path, const char *new_name, const char *suffix) {
757 _cleanup_free_ char *fn = NULL, *rs = NULL;
758 int r;
759
760 fn = strjoin(new_name, suffix);
761 if (!fn)
762 return -ENOMEM;
763
764 r = file_in_same_dir(path, fn, &rs);
765 if (r < 0)
766 return r;
767
768 return rename_noreplace(AT_FDCWD, path, AT_FDCWD, rs);
769 }
770
771 int image_rename(Image *i, const char *new_name) {
772 _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT, name_lock = LOCK_FILE_INIT;
773 _cleanup_free_ char *new_path = NULL, *nn = NULL, *roothash = NULL;
774 _cleanup_strv_free_ char **settings = NULL;
775 unsigned file_attr = 0;
776 int r;
777
778 assert(i);
779
780 if (!image_name_is_valid(new_name))
781 return -EINVAL;
782
783 if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i))
784 return -EROFS;
785
786 settings = image_settings_path(i);
787 if (!settings)
788 return -ENOMEM;
789
790 r = image_roothash_path(i, &roothash);
791 if (r < 0)
792 return r;
793
794 /* Make sure we don't interfere with a running nspawn */
795 r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
796 if (r < 0)
797 return r;
798
799 /* Make sure nobody takes the new name, between the time we
800 * checked it is currently unused in all search paths, and the
801 * time we take possession of it */
802 r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
803 if (r < 0)
804 return r;
805
806 r = image_find(IMAGE_MACHINE, new_name, NULL, NULL);
807 if (r >= 0)
808 return -EEXIST;
809 if (r != -ENOENT)
810 return r;
811
812 switch (i->type) {
813
814 case IMAGE_DIRECTORY:
815 /* Turn of the immutable bit while we rename the image, so that we can rename it */
816 (void) read_attr_path(i->path, &file_attr);
817
818 if (file_attr & FS_IMMUTABLE_FL)
819 (void) chattr_path(i->path, 0, FS_IMMUTABLE_FL, NULL);
820
821 _fallthrough_;
822 case IMAGE_SUBVOLUME:
823 r = file_in_same_dir(i->path, new_name, &new_path);
824 break;
825
826 case IMAGE_BLOCK:
827
828 /* Refuse renaming raw block devices in /dev, the names are picked by udev after all. */
829 if (path_startswith(i->path, "/dev"))
830 return -EROFS;
831
832 r = file_in_same_dir(i->path, new_name, &new_path);
833 break;
834
835 case IMAGE_RAW: {
836 const char *fn;
837
838 fn = strjoina(new_name, ".raw");
839
840 r = file_in_same_dir(i->path, fn, &new_path);
841 break;
842 }
843
844 default:
845 return -EOPNOTSUPP;
846 }
847 if (r < 0)
848 return r;
849
850 nn = strdup(new_name);
851 if (!nn)
852 return -ENOMEM;
853
854 r = rename_noreplace(AT_FDCWD, i->path, AT_FDCWD, new_path);
855 if (r < 0)
856 return r;
857
858 /* Restore the immutable bit, if it was set before */
859 if (file_attr & FS_IMMUTABLE_FL)
860 (void) chattr_path(new_path, FS_IMMUTABLE_FL, FS_IMMUTABLE_FL, NULL);
861
862 free_and_replace(i->path, new_path);
863 free_and_replace(i->name, nn);
864
865 STRV_FOREACH(j, settings) {
866 r = rename_auxiliary_file(*j, new_name, ".nspawn");
867 if (r < 0 && r != -ENOENT)
868 log_debug_errno(r, "Failed to rename settings file %s, ignoring: %m", *j);
869 }
870
871 r = rename_auxiliary_file(roothash, new_name, ".roothash");
872 if (r < 0 && r != -ENOENT)
873 log_debug_errno(r, "Failed to rename roothash file %s, ignoring: %m", roothash);
874
875 return 0;
876 }
877
878 static int clone_auxiliary_file(const char *path, const char *new_name, const char *suffix) {
879 _cleanup_free_ char *fn = NULL, *rs = NULL;
880 int r;
881
882 fn = strjoin(new_name, suffix);
883 if (!fn)
884 return -ENOMEM;
885
886 r = file_in_same_dir(path, fn, &rs);
887 if (r < 0)
888 return r;
889
890 return copy_file_atomic(path, rs, 0664, COPY_REFLINK);
891 }
892
893 int image_clone(Image *i, const char *new_name, bool read_only) {
894 _cleanup_(release_lock_file) LockFile name_lock = LOCK_FILE_INIT;
895 _cleanup_strv_free_ char **settings = NULL;
896 _cleanup_free_ char *roothash = NULL;
897 const char *new_path;
898 int r;
899
900 assert(i);
901
902 if (!image_name_is_valid(new_name))
903 return -EINVAL;
904
905 settings = image_settings_path(i);
906 if (!settings)
907 return -ENOMEM;
908
909 r = image_roothash_path(i, &roothash);
910 if (r < 0)
911 return r;
912
913 /* Make sure nobody takes the new name, between the time we
914 * checked it is currently unused in all search paths, and the
915 * time we take possession of it */
916 r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
917 if (r < 0)
918 return r;
919
920 r = image_find(IMAGE_MACHINE, new_name, NULL, NULL);
921 if (r >= 0)
922 return -EEXIST;
923 if (r != -ENOENT)
924 return r;
925
926 switch (i->type) {
927
928 case IMAGE_SUBVOLUME:
929 case IMAGE_DIRECTORY:
930 /* If we can we'll always try to create a new btrfs subvolume here, even if the source is a plain
931 * directory. */
932
933 new_path = strjoina("/var/lib/machines/", new_name);
934
935 r = btrfs_subvol_snapshot_at(AT_FDCWD, i->path, AT_FDCWD, new_path,
936 (read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
937 BTRFS_SNAPSHOT_FALLBACK_COPY |
938 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
939 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
940 BTRFS_SNAPSHOT_RECURSIVE |
941 BTRFS_SNAPSHOT_QUOTA);
942 if (r >= 0)
943 /* Enable "subtree" quotas for the copy, if we didn't copy any quota from the source. */
944 (void) btrfs_subvol_auto_qgroup(new_path, 0, true);
945
946 break;
947
948 case IMAGE_RAW:
949 new_path = strjoina("/var/lib/machines/", new_name, ".raw");
950
951 r = copy_file_atomic_full(i->path, new_path, read_only ? 0444 : 0644, FS_NOCOW_FL, FS_NOCOW_FL,
952 COPY_REFLINK|COPY_CRTIME, NULL, NULL);
953 break;
954
955 case IMAGE_BLOCK:
956 default:
957 return -EOPNOTSUPP;
958 }
959
960 if (r < 0)
961 return r;
962
963 STRV_FOREACH(j, settings) {
964 r = clone_auxiliary_file(*j, new_name, ".nspawn");
965 if (r < 0 && r != -ENOENT)
966 log_debug_errno(r, "Failed to clone settings %s, ignoring: %m", *j);
967 }
968
969 r = clone_auxiliary_file(roothash, new_name, ".roothash");
970 if (r < 0 && r != -ENOENT)
971 log_debug_errno(r, "Failed to clone root hash file %s, ignoring: %m", roothash);
972
973 return 0;
974 }
975
976 int image_read_only(Image *i, bool b) {
977 _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
978 int r;
979
980 assert(i);
981
982 if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i))
983 return -EROFS;
984
985 /* Make sure we don't interfere with a running nspawn */
986 r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
987 if (r < 0)
988 return r;
989
990 switch (i->type) {
991
992 case IMAGE_SUBVOLUME:
993
994 /* Note that we set the flag only on the top-level
995 * subvolume of the image. */
996
997 r = btrfs_subvol_set_read_only(i->path, b);
998 if (r < 0)
999 return r;
1000
1001 break;
1002
1003 case IMAGE_DIRECTORY:
1004 /* For simple directory trees we cannot use the access
1005 mode of the top-level directory, since it has an
1006 effect on the container itself. However, we can
1007 use the "immutable" flag, to at least make the
1008 top-level directory read-only. It's not as good as
1009 a read-only subvolume, but at least something, and
1010 we can read the value back. */
1011
1012 r = chattr_path(i->path, b ? FS_IMMUTABLE_FL : 0, FS_IMMUTABLE_FL, NULL);
1013 if (r < 0)
1014 return r;
1015
1016 break;
1017
1018 case IMAGE_RAW: {
1019 struct stat st;
1020
1021 if (stat(i->path, &st) < 0)
1022 return -errno;
1023
1024 if (chmod(i->path, (st.st_mode & 0444) | (b ? 0000 : 0200)) < 0)
1025 return -errno;
1026
1027 /* If the images is now read-only, it's a good time to
1028 * defrag it, given that no write patterns will
1029 * fragment it again. */
1030 if (b)
1031 (void) btrfs_defrag(i->path);
1032 break;
1033 }
1034
1035 case IMAGE_BLOCK: {
1036 _cleanup_close_ int fd = -EBADF;
1037 struct stat st;
1038 int state = b;
1039
1040 fd = open(i->path, O_CLOEXEC|O_RDONLY|O_NONBLOCK|O_NOCTTY);
1041 if (fd < 0)
1042 return -errno;
1043
1044 if (fstat(fd, &st) < 0)
1045 return -errno;
1046 if (!S_ISBLK(st.st_mode))
1047 return -ENOTTY;
1048
1049 if (ioctl(fd, BLKROSET, &state) < 0)
1050 return -errno;
1051
1052 break;
1053 }
1054
1055 default:
1056 return -EOPNOTSUPP;
1057 }
1058
1059 return 0;
1060 }
1061
1062 int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local) {
1063 _cleanup_free_ char *p = NULL;
1064 LockFile t = LOCK_FILE_INIT;
1065 struct stat st;
1066 bool exclusive;
1067 int r;
1068
1069 assert(path);
1070 assert(global);
1071 assert(local);
1072
1073 /* Locks an image path. This actually creates two locks: one "local" one, next to the image path
1074 * itself, which might be shared via NFS. And another "global" one, in /run, that uses the
1075 * device/inode number. This has the benefit that we can even lock a tree that is a mount point,
1076 * correctly. */
1077
1078 if (!path_is_absolute(path))
1079 return -EINVAL;
1080
1081 switch (operation & (LOCK_SH|LOCK_EX)) {
1082 case LOCK_SH:
1083 exclusive = false;
1084 break;
1085 case LOCK_EX:
1086 exclusive = true;
1087 break;
1088 default:
1089 return -EINVAL;
1090 }
1091
1092 if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) {
1093 *local = *global = (LockFile) LOCK_FILE_INIT;
1094 return 0;
1095 }
1096
1097 /* Prohibit taking exclusive locks on the host image. We can't allow this, since we ourselves are
1098 * running off it after all, and we don't want any images to manipulate the host image. We make an
1099 * exception for shared locks however: we allow those (and make them NOPs since there's no point in
1100 * taking them if there can't be exclusive locks). Strictly speaking these are questionable as well,
1101 * since it means changes made to the host might propagate to the container as they happen (and a
1102 * shared lock kinda suggests that no changes happen at all while it is in place), but it's too
1103 * useful not to allow read-only containers off the host root, hence let's support this, and trust
1104 * the user to do the right thing with this. */
1105 if (path_equal(path, "/")) {
1106 if (exclusive)
1107 return -EBUSY;
1108
1109 *local = *global = (LockFile) LOCK_FILE_INIT;
1110 return 0;
1111 }
1112
1113 if (stat(path, &st) >= 0) {
1114 if (S_ISBLK(st.st_mode))
1115 r = asprintf(&p, "/run/systemd/nspawn/locks/block-%u:%u", major(st.st_rdev), minor(st.st_rdev));
1116 else if (S_ISDIR(st.st_mode) || S_ISREG(st.st_mode))
1117 r = asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino);
1118 else
1119 return -ENOTTY;
1120 if (r < 0)
1121 return -ENOMEM;
1122 }
1123
1124 /* For block devices we don't need the "local" lock, as the major/minor lock above should be
1125 * sufficient, since block devices are host local anyway. */
1126 if (!path_startswith(path, "/dev/")) {
1127 r = make_lock_file_for(path, operation, &t);
1128 if (r < 0) {
1129 if (!exclusive && r == -EROFS)
1130 log_debug_errno(r, "Failed to create shared lock for '%s', ignoring: %m", path);
1131 else
1132 return r;
1133 }
1134 }
1135
1136 if (p) {
1137 (void) mkdir_p("/run/systemd/nspawn/locks", 0700);
1138
1139 r = make_lock_file(p, operation, global);
1140 if (r < 0) {
1141 release_lock_file(&t);
1142 return r;
1143 }
1144 } else
1145 *global = (LockFile) LOCK_FILE_INIT;
1146
1147 *local = t;
1148 return 0;
1149 }
1150
1151 int image_set_limit(Image *i, uint64_t referenced_max) {
1152 assert(i);
1153
1154 if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i))
1155 return -EROFS;
1156
1157 if (i->type != IMAGE_SUBVOLUME)
1158 return -EOPNOTSUPP;
1159
1160 /* We set the quota both for the subvolume as well as for the
1161 * subtree. The latter is mostly for historical reasons, since
1162 * we didn't use to have a concept of subtree quota, and hence
1163 * only modified the subvolume quota. */
1164
1165 (void) btrfs_qgroup_set_limit(i->path, 0, referenced_max);
1166 (void) btrfs_subvol_auto_qgroup(i->path, 0, true);
1167 return btrfs_subvol_set_subtree_quota_limit(i->path, 0, referenced_max);
1168 }
1169
1170 int image_read_metadata(Image *i, const ImagePolicy *image_policy) {
1171 _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
1172 int r;
1173
1174 assert(i);
1175
1176 r = image_path_lock(i->path, LOCK_SH|LOCK_NB, &global_lock, &local_lock);
1177 if (r < 0)
1178 return r;
1179
1180 switch (i->type) {
1181
1182 case IMAGE_SUBVOLUME:
1183 case IMAGE_DIRECTORY: {
1184 _cleanup_strv_free_ char **machine_info = NULL, **os_release = NULL, **sysext_release = NULL, **confext_release = NULL;
1185 _cleanup_free_ char *hostname = NULL, *path = NULL;
1186 sd_id128_t machine_id = SD_ID128_NULL;
1187
1188 if (i->class == IMAGE_SYSEXT) {
1189 r = extension_has_forbidden_content(i->path);
1190 if (r < 0)
1191 return r;
1192 if (r > 0)
1193 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
1194 "Conflicting content found in image %s, refusing.",
1195 i->name);
1196 }
1197
1198 r = chase("/etc/hostname", i->path, CHASE_PREFIX_ROOT|CHASE_TRAIL_SLASH, &path, NULL);
1199 if (r < 0 && r != -ENOENT)
1200 log_debug_errno(r, "Failed to chase /etc/hostname in image %s: %m", i->name);
1201 else if (r >= 0) {
1202 r = read_etc_hostname(path, &hostname);
1203 if (r < 0)
1204 log_debug_errno(errno, "Failed to read /etc/hostname of image %s: %m", i->name);
1205 }
1206
1207 path = mfree(path);
1208
1209 r = id128_get_machine(i->path, &machine_id);
1210 if (r < 0)
1211 log_debug_errno(r, "Failed to read machine ID in image %s, ignoring: %m", i->name);
1212
1213 r = chase("/etc/machine-info", i->path, CHASE_PREFIX_ROOT|CHASE_TRAIL_SLASH, &path, NULL);
1214 if (r < 0 && r != -ENOENT)
1215 log_debug_errno(r, "Failed to chase /etc/machine-info in image %s: %m", i->name);
1216 else if (r >= 0) {
1217 r = load_env_file_pairs(NULL, path, &machine_info);
1218 if (r < 0)
1219 log_debug_errno(r, "Failed to parse machine-info data of %s: %m", i->name);
1220 }
1221
1222 r = load_os_release_pairs(i->path, &os_release);
1223 if (r < 0)
1224 log_debug_errno(r, "Failed to read os-release in image, ignoring: %m");
1225
1226 r = load_extension_release_pairs(i->path, IMAGE_SYSEXT, i->name, /* relax_extension_release_check= */ false, &sysext_release);
1227 if (r < 0)
1228 log_debug_errno(r, "Failed to read sysext-release in image, ignoring: %m");
1229
1230 r = load_extension_release_pairs(i->path, IMAGE_CONFEXT, i->name, /* relax_extension_release_check= */ false, &confext_release);
1231 if (r < 0)
1232 log_debug_errno(r, "Failed to read confext-release in image, ignoring: %m");
1233
1234 free_and_replace(i->hostname, hostname);
1235 i->machine_id = machine_id;
1236 strv_free_and_replace(i->machine_info, machine_info);
1237 strv_free_and_replace(i->os_release, os_release);
1238 strv_free_and_replace(i->sysext_release, sysext_release);
1239 strv_free_and_replace(i->confext_release, confext_release);
1240 break;
1241 }
1242
1243 case IMAGE_RAW:
1244 case IMAGE_BLOCK: {
1245 _cleanup_(loop_device_unrefp) LoopDevice *d = NULL;
1246 _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL;
1247
1248 r = loop_device_make_by_path(i->path, O_RDONLY, /* sector_size= */ UINT32_MAX, LO_FLAGS_PARTSCAN, LOCK_SH, &d);
1249 if (r < 0)
1250 return r;
1251
1252 r = dissect_loop_device(
1253 d,
1254 /* verity= */ NULL,
1255 /* mount_options= */ NULL,
1256 image_policy,
1257 DISSECT_IMAGE_GENERIC_ROOT |
1258 DISSECT_IMAGE_REQUIRE_ROOT |
1259 DISSECT_IMAGE_RELAX_VAR_CHECK |
1260 DISSECT_IMAGE_READ_ONLY |
1261 DISSECT_IMAGE_USR_NO_ROOT |
1262 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
1263 DISSECT_IMAGE_PIN_PARTITION_DEVICES,
1264 &m);
1265 if (r < 0)
1266 return r;
1267
1268 r = dissected_image_acquire_metadata(m,
1269 DISSECT_IMAGE_VALIDATE_OS |
1270 DISSECT_IMAGE_VALIDATE_OS_EXT);
1271 if (r < 0)
1272 return r;
1273
1274 free_and_replace(i->hostname, m->hostname);
1275 i->machine_id = m->machine_id;
1276 strv_free_and_replace(i->machine_info, m->machine_info);
1277 strv_free_and_replace(i->os_release, m->os_release);
1278 strv_free_and_replace(i->sysext_release, m->sysext_release);
1279 strv_free_and_replace(i->confext_release, m->confext_release);
1280
1281 break;
1282 }
1283
1284 default:
1285 return -EOPNOTSUPP;
1286 }
1287
1288 i->metadata_valid = true;
1289
1290 return 0;
1291 }
1292
1293 int image_name_lock(const char *name, int operation, LockFile *ret) {
1294 const char *p;
1295
1296 assert(name);
1297 assert(ret);
1298
1299 /* Locks an image name, regardless of the precise path used. */
1300
1301 if (streq(name, ".host"))
1302 return -EBUSY;
1303
1304 if (!image_name_is_valid(name))
1305 return -EINVAL;
1306
1307 if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) {
1308 *ret = (LockFile) LOCK_FILE_INIT;
1309 return 0;
1310 }
1311
1312 (void) mkdir_p("/run/systemd/nspawn/locks", 0700);
1313
1314 p = strjoina("/run/systemd/nspawn/locks/name-", name);
1315 return make_lock_file(p, operation, ret);
1316 }
1317
1318 bool image_in_search_path(
1319 ImageClass class,
1320 const char *root,
1321 const char *image) {
1322
1323 assert(image);
1324
1325 NULSTR_FOREACH(path, pick_image_search_path(class)) {
1326 const char *p, *q;
1327 size_t k;
1328
1329 if (!empty_or_root(root)) {
1330 q = path_startswith(path, root);
1331 if (!q)
1332 continue;
1333 } else
1334 q = path;
1335
1336 p = path_startswith(q, path);
1337 if (!p)
1338 continue;
1339
1340 /* Make sure there's a filename following */
1341 k = strcspn(p, "/");
1342 if (k == 0)
1343 continue;
1344
1345 p += k;
1346
1347 /* Accept trailing slashes */
1348 if (p[strspn(p, "/")] == 0)
1349 return true;
1350
1351 }
1352
1353 return false;
1354 }
1355
1356 int image_to_json(const struct Image *img, JsonVariant **ret) {
1357 assert(img);
1358
1359 return json_build(ret,
1360 JSON_BUILD_OBJECT(
1361 JSON_BUILD_PAIR_STRING("Type", image_type_to_string(img->type)),
1362 JSON_BUILD_PAIR_STRING("Class", image_class_to_string(img->class)),
1363 JSON_BUILD_PAIR_STRING("Name", img->name),
1364 JSON_BUILD_PAIR_CONDITION(img->path, "Path", JSON_BUILD_STRING(img->path)),
1365 JSON_BUILD_PAIR_BOOLEAN("ReadOnly", img->read_only),
1366 JSON_BUILD_PAIR_CONDITION(img->crtime != 0, "CreationTimestamp", JSON_BUILD_UNSIGNED(img->crtime)),
1367 JSON_BUILD_PAIR_CONDITION(img->mtime != 0, "ModificationTimestamp", JSON_BUILD_UNSIGNED(img->mtime)),
1368 JSON_BUILD_PAIR_CONDITION(img->usage != UINT64_MAX, "Usage", JSON_BUILD_UNSIGNED(img->usage)),
1369 JSON_BUILD_PAIR_CONDITION(img->usage_exclusive != UINT64_MAX, "UsageExclusive", JSON_BUILD_UNSIGNED(img->usage_exclusive)),
1370 JSON_BUILD_PAIR_CONDITION(img->limit != UINT64_MAX, "Limit", JSON_BUILD_UNSIGNED(img->limit)),
1371 JSON_BUILD_PAIR_CONDITION(img->limit_exclusive != UINT64_MAX, "LimitExclusive", JSON_BUILD_UNSIGNED(img->limit_exclusive))));
1372 }
1373
1374 static const char* const image_type_table[_IMAGE_TYPE_MAX] = {
1375 [IMAGE_DIRECTORY] = "directory",
1376 [IMAGE_SUBVOLUME] = "subvolume",
1377 [IMAGE_RAW] = "raw",
1378 [IMAGE_BLOCK] = "block",
1379 };
1380
1381 DEFINE_STRING_TABLE_LOOKUP(image_type, ImageType);