]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/discover-image.c
Fix compilation error
[thirdparty/systemd.git] / src / shared / discover-image.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/fs.h>
6 #include <linux/loop.h>
7 #include <linux/magic.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <sys/file.h>
11 #include <sys/ioctl.h>
12 #include <sys/stat.h>
13 #include <unistd.h>
14
15 #include "alloc-util.h"
16 #include "btrfs-util.h"
17 #include "chase.h"
18 #include "chattr-util.h"
19 #include "copy.h"
20 #include "dirent-util.h"
21 #include "discover-image.h"
22 #include "dissect-image.h"
23 #include "env-file.h"
24 #include "env-util.h"
25 #include "extension-util.h"
26 #include "fd-util.h"
27 #include "fs-util.h"
28 #include "hashmap.h"
29 #include "hostname-setup.h"
30 #include "id128-util.h"
31 #include "initrd-util.h"
32 #include "lock-util.h"
33 #include "log.h"
34 #include "loop-util.h"
35 #include "macro.h"
36 #include "mkdir.h"
37 #include "nulstr-util.h"
38 #include "os-util.h"
39 #include "path-util.h"
40 #include "rm-rf.h"
41 #include "stat-util.h"
42 #include "string-table.h"
43 #include "string-util.h"
44 #include "strv.h"
45 #include "time-util.h"
46 #include "utf8.h"
47 #include "xattr-util.h"
48
49 static const char* const image_search_path[_IMAGE_CLASS_MAX] = {
50 [IMAGE_MACHINE] = "/etc/machines\0" /* only place symlinks here */
51 "/run/machines\0" /* and here too */
52 "/var/lib/machines\0" /* the main place for images */
53 "/var/lib/container\0" /* legacy */
54 "/usr/local/lib/machines\0"
55 "/usr/lib/machines\0",
56
57 [IMAGE_PORTABLE] = "/etc/portables\0" /* only place symlinks here */
58 "/run/portables\0" /* and here too */
59 "/var/lib/portables\0" /* the main place for images */
60 "/usr/local/lib/portables\0"
61 "/usr/lib/portables\0",
62
63 /* Note that we don't allow storing extensions under /usr/, unlike with other image types. That's
64 * because extension images are supposed to extend /usr/, so you get into recursive races, especially
65 * with directory-based extensions, as the kernel's OverlayFS explicitly checks for this and errors
66 * out with -ELOOP if it finds that a lowerdir= is a child of another lowerdir=. */
67 [IMAGE_SYSEXT] = "/etc/extensions\0" /* only place symlinks here */
68 "/run/extensions\0" /* and here too */
69 "/var/lib/extensions\0", /* the main place for images */
70
71 [IMAGE_CONFEXT] = "/run/confexts\0" /* only place symlinks here */
72 "/var/lib/confexts\0" /* the main place for images */
73 "/usr/local/lib/confexts\0"
74 "/usr/lib/confexts\0",
75 };
76
77 /* Inside the initrd, use a slightly different set of search path (i.e. include .extra/sysext in extension
78 * search dir) */
79 static const char* const image_search_path_initrd[_IMAGE_CLASS_MAX] = {
80 /* (entries that aren't listed here will get the same search path as for the non initrd-case) */
81
82 [IMAGE_SYSEXT] = "/etc/extensions\0" /* only place symlinks here */
83 "/run/extensions\0" /* and here too */
84 "/var/lib/extensions\0" /* the main place for images */
85 "/usr/local/lib/extensions\0"
86 "/usr/lib/extensions\0"
87 "/.extra/sysext\0" /* put sysext picked up by systemd-stub last, since not trusted */
88 };
89
90 static Image *image_free(Image *i) {
91 assert(i);
92
93 free(i->name);
94 free(i->path);
95
96 free(i->hostname);
97 strv_free(i->machine_info);
98 strv_free(i->os_release);
99 strv_free(i->extension_release);
100
101 return mfree(i);
102 }
103
104 DEFINE_TRIVIAL_REF_UNREF_FUNC(Image, image, image_free);
105 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(image_hash_ops, char, string_hash_func, string_compare_func,
106 Image, image_unref);
107
108 static char **image_settings_path(Image *image) {
109 _cleanup_strv_free_ char **l = NULL;
110 _cleanup_free_ char *fn = NULL;
111 size_t i = 0;
112 int r;
113
114 assert(image);
115
116 l = new0(char*, 4);
117 if (!l)
118 return NULL;
119
120 fn = strjoin(image->name, ".nspawn");
121 if (!fn)
122 return NULL;
123
124 FOREACH_STRING(s, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
125 l[i] = path_join(s, fn);
126 if (!l[i])
127 return NULL;
128
129 i++;
130 }
131
132 r = file_in_same_dir(image->path, fn, l + i);
133 if (r == -ENOMEM)
134 return NULL;
135 if (r < 0)
136 log_debug_errno(r, "Failed to generate .nspawn settings path from image path, ignoring: %m");
137
138 strv_uniq(l);
139
140 return TAKE_PTR(l);
141 }
142
143 static int image_roothash_path(Image *image, char **ret) {
144 _cleanup_free_ char *fn = NULL;
145
146 assert(image);
147
148 fn = strjoin(image->name, ".roothash");
149 if (!fn)
150 return -ENOMEM;
151
152 return file_in_same_dir(image->path, fn, ret);
153 }
154
155 static int image_new(
156 ImageType t,
157 ImageClass c,
158 const char *pretty,
159 const char *path,
160 const char *filename,
161 bool read_only,
162 usec_t crtime,
163 usec_t mtime,
164 Image **ret) {
165
166 _cleanup_(image_unrefp) Image *i = NULL;
167
168 assert(t >= 0);
169 assert(t < _IMAGE_TYPE_MAX);
170 assert(pretty);
171 assert(filename);
172 assert(ret);
173
174 i = new(Image, 1);
175 if (!i)
176 return -ENOMEM;
177
178 *i = (Image) {
179 .n_ref = 1,
180 .type = t,
181 .class = c,
182 .read_only = read_only,
183 .crtime = crtime,
184 .mtime = mtime,
185 .usage = UINT64_MAX,
186 .usage_exclusive = UINT64_MAX,
187 .limit = UINT64_MAX,
188 .limit_exclusive = UINT64_MAX,
189 };
190
191 i->name = strdup(pretty);
192 if (!i->name)
193 return -ENOMEM;
194
195 i->path = path_join(path, filename);
196 if (!i->path)
197 return -ENOMEM;
198
199 path_simplify(i->path);
200
201 *ret = TAKE_PTR(i);
202
203 return 0;
204 }
205
206 static int extract_pretty(const char *path, const char *suffix, char **ret) {
207 _cleanup_free_ char *name = NULL;
208 const char *p;
209
210 assert(path);
211 assert(ret);
212
213 p = last_path_component(path);
214
215 name = strdupcspn(p, "/");
216 if (!name)
217 return -ENOMEM;
218
219 if (suffix) {
220 char *e;
221
222 e = endswith(name, suffix);
223 if (!e)
224 return -EINVAL;
225
226 *e = 0;
227 }
228
229 if (!image_name_is_valid(name))
230 return -EINVAL;
231
232 *ret = TAKE_PTR(name);
233 return 0;
234 }
235
236 static int image_make(
237 ImageClass c,
238 const char *pretty,
239 int dfd,
240 const char *path,
241 const char *filename,
242 const struct stat *st,
243 Image **ret) {
244
245 _cleanup_free_ char *pretty_buffer = NULL, *parent = NULL;
246 struct stat stbuf;
247 bool read_only;
248 int r;
249
250 assert(dfd >= 0 || dfd == AT_FDCWD);
251 assert(path || dfd == AT_FDCWD);
252 assert(filename);
253
254 /* We explicitly *do* follow symlinks here, since we want to allow symlinking trees, raw files and block
255 * devices into /var/lib/machines/, and treat them normally.
256 *
257 * This function returns -ENOENT if we can't find the image after all, and -EMEDIUMTYPE if it's not a file we
258 * recognize. */
259
260 if (!st) {
261 if (fstatat(dfd, filename, &stbuf, 0) < 0)
262 return -errno;
263
264 st = &stbuf;
265 }
266
267 if (!path) {
268 if (dfd == AT_FDCWD)
269 (void) safe_getcwd(&parent);
270 else
271 (void) fd_get_path(dfd, &parent);
272 }
273
274 read_only =
275 (path && path_startswith(path, "/usr")) ||
276 (faccessat(dfd, filename, W_OK, AT_EACCESS) < 0 && errno == EROFS);
277
278 if (S_ISDIR(st->st_mode)) {
279 _cleanup_close_ int fd = -EBADF;
280 unsigned file_attr = 0;
281 usec_t crtime = 0;
282
283 if (!ret)
284 return 0;
285
286 if (!pretty) {
287 r = extract_pretty(filename, NULL, &pretty_buffer);
288 if (r < 0)
289 return r;
290
291 pretty = pretty_buffer;
292 }
293
294 fd = openat(dfd, filename, O_CLOEXEC|O_NOCTTY|O_DIRECTORY);
295 if (fd < 0)
296 return -errno;
297
298 if (btrfs_might_be_subvol(st)) {
299
300 r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC);
301 if (r < 0)
302 return r;
303 if (r) {
304 BtrfsSubvolInfo info;
305
306 /* It's a btrfs subvolume */
307
308 r = btrfs_subvol_get_info_fd(fd, 0, &info);
309 if (r < 0)
310 return r;
311
312 r = image_new(IMAGE_SUBVOLUME,
313 c,
314 pretty,
315 path,
316 filename,
317 info.read_only || read_only,
318 info.otime,
319 0,
320 ret);
321 if (r < 0)
322 return r;
323
324 if (btrfs_quota_scan_ongoing(fd) == 0) {
325 BtrfsQuotaInfo quota;
326
327 r = btrfs_subvol_get_subtree_quota_fd(fd, 0, &quota);
328 if (r >= 0) {
329 (*ret)->usage = quota.referenced;
330 (*ret)->usage_exclusive = quota.exclusive;
331
332 (*ret)->limit = quota.referenced_max;
333 (*ret)->limit_exclusive = quota.exclusive_max;
334 }
335 }
336
337 return 0;
338 }
339 }
340
341 /* Get directory creation time (not available everywhere, but that's OK */
342 (void) fd_getcrtime(fd, &crtime);
343
344 /* If the IMMUTABLE bit is set, we consider the directory read-only. Since the ioctl is not
345 * supported everywhere we ignore failures. */
346 (void) read_attr_fd(fd, &file_attr);
347
348 /* It's just a normal directory. */
349 r = image_new(IMAGE_DIRECTORY,
350 c,
351 pretty,
352 path,
353 filename,
354 read_only || (file_attr & FS_IMMUTABLE_FL),
355 crtime,
356 0, /* we don't use mtime of stat() here, since it's not the time of last change of the tree, but only of the top-level dir */
357 ret);
358 if (r < 0)
359 return r;
360
361 return 0;
362
363 } else if (S_ISREG(st->st_mode) && endswith(filename, ".raw")) {
364 usec_t crtime = 0;
365
366 /* It's a RAW disk image */
367
368 if (!ret)
369 return 0;
370
371 (void) fd_getcrtime_at(dfd, filename, AT_SYMLINK_FOLLOW, &crtime);
372
373 if (!pretty) {
374 r = extract_pretty(filename, ".raw", &pretty_buffer);
375 if (r < 0)
376 return r;
377
378 pretty = pretty_buffer;
379 }
380
381 r = image_new(IMAGE_RAW,
382 c,
383 pretty,
384 path,
385 filename,
386 !(st->st_mode & 0222) || read_only,
387 crtime,
388 timespec_load(&st->st_mtim),
389 ret);
390 if (r < 0)
391 return r;
392
393 (*ret)->usage = (*ret)->usage_exclusive = st->st_blocks * 512;
394 (*ret)->limit = (*ret)->limit_exclusive = st->st_size;
395
396 return 0;
397
398 } else if (S_ISBLK(st->st_mode)) {
399 _cleanup_close_ int block_fd = -EBADF;
400 uint64_t size = UINT64_MAX;
401
402 /* A block device */
403
404 if (!ret)
405 return 0;
406
407 if (!pretty) {
408 r = extract_pretty(filename, NULL, &pretty_buffer);
409 if (r < 0)
410 return r;
411
412 pretty = pretty_buffer;
413 }
414
415 block_fd = openat(dfd, filename, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY);
416 if (block_fd < 0)
417 log_debug_errno(errno, "Failed to open block device %s/%s, ignoring: %m", path ?: strnull(parent), filename);
418 else {
419 /* Refresh stat data after opening the node */
420 if (fstat(block_fd, &stbuf) < 0)
421 return -errno;
422 st = &stbuf;
423
424 if (!S_ISBLK(st->st_mode)) /* Verify that what we opened is actually what we think it is */
425 return -ENOTTY;
426
427 if (!read_only) {
428 int state = 0;
429
430 if (ioctl(block_fd, BLKROGET, &state) < 0)
431 log_debug_errno(errno, "Failed to issue BLKROGET on device %s/%s, ignoring: %m", path ?: strnull(parent), filename);
432 else if (state)
433 read_only = true;
434 }
435
436 if (ioctl(block_fd, BLKGETSIZE64, &size) < 0)
437 log_debug_errno(errno, "Failed to issue BLKGETSIZE64 on device %s/%s, ignoring: %m", path ?: strnull(parent), filename);
438
439 block_fd = safe_close(block_fd);
440 }
441
442 r = image_new(IMAGE_BLOCK,
443 c,
444 pretty,
445 path,
446 filename,
447 !(st->st_mode & 0222) || read_only,
448 0,
449 0,
450 ret);
451 if (r < 0)
452 return r;
453
454 if (!IN_SET(size, 0, UINT64_MAX))
455 (*ret)->usage = (*ret)->usage_exclusive = (*ret)->limit = (*ret)->limit_exclusive = size;
456
457 return 0;
458 }
459
460 return -EMEDIUMTYPE;
461 }
462
463 static const char *pick_image_search_path(ImageClass class) {
464 if (class < 0 || class >= _IMAGE_CLASS_MAX)
465 return NULL;
466
467 /* Use the initrd search path if there is one, otherwise use the common one */
468 return in_initrd() && image_search_path_initrd[class] ? image_search_path_initrd[class] : image_search_path[class];
469 }
470
471 int image_find(ImageClass class,
472 const char *name,
473 const char *root,
474 Image **ret) {
475
476 int r;
477
478 assert(class >= 0);
479 assert(class < _IMAGE_CLASS_MAX);
480 assert(name);
481
482 /* There are no images with invalid names */
483 if (!image_name_is_valid(name))
484 return -ENOENT;
485
486 NULSTR_FOREACH(path, pick_image_search_path(class)) {
487 _cleanup_free_ char *resolved = NULL;
488 _cleanup_closedir_ DIR *d = NULL;
489 struct stat st;
490 int flags;
491
492 r = chase_and_opendir(path, root, CHASE_PREFIX_ROOT, &resolved, &d);
493 if (r == -ENOENT)
494 continue;
495 if (r < 0)
496 return r;
497
498 /* As mentioned above, we follow symlinks on this fstatat(), because we want to permit people
499 * to symlink block devices into the search path. (For now, we disable that when operating
500 * relative to some root directory.) */
501 flags = root ? AT_SYMLINK_NOFOLLOW : 0;
502 if (fstatat(dirfd(d), name, &st, flags) < 0) {
503 _cleanup_free_ char *raw = NULL;
504
505 if (errno != ENOENT)
506 return -errno;
507
508 raw = strjoin(name, ".raw");
509 if (!raw)
510 return -ENOMEM;
511
512 if (fstatat(dirfd(d), raw, &st, flags) < 0) {
513 if (errno == ENOENT)
514 continue;
515
516 return -errno;
517 }
518
519 if (!S_ISREG(st.st_mode))
520 continue;
521
522 r = image_make(class, name, dirfd(d), resolved, raw, &st, ret);
523
524 } else {
525 if (!S_ISDIR(st.st_mode) && !S_ISBLK(st.st_mode))
526 continue;
527
528 r = image_make(class, name, dirfd(d), resolved, name, &st, ret);
529 }
530 if (IN_SET(r, -ENOENT, -EMEDIUMTYPE))
531 continue;
532 if (r < 0)
533 return r;
534
535 if (ret)
536 (*ret)->discoverable = true;
537
538 return 1;
539 }
540
541 if (class == IMAGE_MACHINE && streq(name, ".host")) {
542 r = image_make(class, ".host", AT_FDCWD, NULL, empty_to_root(root), NULL, ret);
543 if (r < 0)
544 return r;
545
546 if (ret)
547 (*ret)->discoverable = true;
548
549 return r;
550 }
551
552 return -ENOENT;
553 };
554
555 int image_from_path(const char *path, Image **ret) {
556
557 /* Note that we don't set the 'discoverable' field of the returned object, because we don't check here whether
558 * the image is in the image search path. And if it is we don't know if the path we used is actually not
559 * overridden by another, different image earlier in the search path */
560
561 if (path_equal(path, "/"))
562 return image_make(IMAGE_MACHINE, ".host", AT_FDCWD, NULL, "/", NULL, ret);
563
564 return image_make(_IMAGE_CLASS_INVALID, NULL, AT_FDCWD, NULL, path, NULL, ret);
565 }
566
567 int image_find_harder(ImageClass class, const char *name_or_path, const char *root, Image **ret) {
568 if (image_name_is_valid(name_or_path))
569 return image_find(class, name_or_path, root, ret);
570
571 return image_from_path(name_or_path, ret);
572 }
573
574 int image_discover(
575 ImageClass class,
576 const char *root,
577 Hashmap *h) {
578
579 int r;
580
581 assert(class >= 0);
582 assert(class < _IMAGE_CLASS_MAX);
583 assert(h);
584
585 NULSTR_FOREACH(path, pick_image_search_path(class)) {
586 _cleanup_free_ char *resolved = NULL;
587 _cleanup_closedir_ DIR *d = NULL;
588
589 r = chase_and_opendir(path, root, CHASE_PREFIX_ROOT, &resolved, &d);
590 if (r == -ENOENT)
591 continue;
592 if (r < 0)
593 return r;
594
595 FOREACH_DIRENT_ALL(de, d, return -errno) {
596 _cleanup_(image_unrefp) Image *image = NULL;
597 _cleanup_free_ char *truncated = NULL;
598 const char *pretty;
599 struct stat st;
600 int flags;
601
602 if (dot_or_dot_dot(de->d_name))
603 continue;
604
605 /* As mentioned above, we follow symlinks on this fstatat(), because we want to
606 * permit people to symlink block devices into the search path. */
607 flags = root ? AT_SYMLINK_NOFOLLOW : 0;
608 if (fstatat(dirfd(d), de->d_name, &st, flags) < 0) {
609 if (errno == ENOENT)
610 continue;
611
612 return -errno;
613 }
614
615 if (S_ISREG(st.st_mode)) {
616 const char *e;
617
618 e = endswith(de->d_name, ".raw");
619 if (!e)
620 continue;
621
622 truncated = strndup(de->d_name, e - de->d_name);
623 if (!truncated)
624 return -ENOMEM;
625
626 pretty = truncated;
627 } else if (S_ISDIR(st.st_mode) || S_ISBLK(st.st_mode))
628 pretty = de->d_name;
629 else
630 continue;
631
632 if (!image_name_is_valid(pretty))
633 continue;
634
635 if (hashmap_contains(h, pretty))
636 continue;
637
638 r = image_make(class, pretty, dirfd(d), resolved, de->d_name, &st, &image);
639 if (IN_SET(r, -ENOENT, -EMEDIUMTYPE))
640 continue;
641 if (r < 0)
642 return r;
643
644 image->discoverable = true;
645
646 r = hashmap_put(h, image->name, image);
647 if (r < 0)
648 return r;
649
650 image = NULL;
651 }
652 }
653
654 if (class == IMAGE_MACHINE && !hashmap_contains(h, ".host")) {
655 _cleanup_(image_unrefp) Image *image = NULL;
656
657 r = image_make(IMAGE_MACHINE, ".host", AT_FDCWD, NULL, empty_to_root("/"), NULL, &image);
658 if (r < 0)
659 return r;
660
661 image->discoverable = true;
662
663 r = hashmap_put(h, image->name, image);
664 if (r < 0)
665 return r;
666
667 image = NULL;
668 }
669
670 return 0;
671 }
672
673 int image_remove(Image *i) {
674 _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
675 _cleanup_strv_free_ char **settings = NULL;
676 _cleanup_free_ char *roothash = NULL;
677 int r;
678
679 assert(i);
680
681 if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i))
682 return -EROFS;
683
684 settings = image_settings_path(i);
685 if (!settings)
686 return -ENOMEM;
687
688 r = image_roothash_path(i, &roothash);
689 if (r < 0)
690 return r;
691
692 /* Make sure we don't interfere with a running nspawn */
693 r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
694 if (r < 0)
695 return r;
696
697 switch (i->type) {
698
699 case IMAGE_SUBVOLUME:
700
701 /* Let's unlink first, maybe it is a symlink? If that works we are happy. Otherwise, let's get out the
702 * big guns */
703 if (unlink(i->path) < 0) {
704 r = btrfs_subvol_remove(i->path, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
705 if (r < 0)
706 return r;
707 }
708
709 break;
710
711 case IMAGE_DIRECTORY:
712 /* Allow deletion of read-only directories */
713 (void) chattr_path(i->path, 0, FS_IMMUTABLE_FL, NULL);
714 r = rm_rf(i->path, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
715 if (r < 0)
716 return r;
717
718 break;
719
720 case IMAGE_BLOCK:
721
722 /* If this is inside of /dev, then it's a real block device, hence let's not touch the device node
723 * itself (but let's remove the stuff stored alongside it). If it's anywhere else, let's try to unlink
724 * the thing (it's most likely a symlink after all). */
725
726 if (path_startswith(i->path, "/dev"))
727 break;
728
729 _fallthrough_;
730 case IMAGE_RAW:
731 if (unlink(i->path) < 0)
732 return -errno;
733 break;
734
735 default:
736 return -EOPNOTSUPP;
737 }
738
739 STRV_FOREACH(j, settings)
740 if (unlink(*j) < 0 && errno != ENOENT)
741 log_debug_errno(errno, "Failed to unlink %s, ignoring: %m", *j);
742
743 if (unlink(roothash) < 0 && errno != ENOENT)
744 log_debug_errno(errno, "Failed to unlink %s, ignoring: %m", roothash);
745
746 return 0;
747 }
748
749 static int rename_auxiliary_file(const char *path, const char *new_name, const char *suffix) {
750 _cleanup_free_ char *fn = NULL, *rs = NULL;
751 int r;
752
753 fn = strjoin(new_name, suffix);
754 if (!fn)
755 return -ENOMEM;
756
757 r = file_in_same_dir(path, fn, &rs);
758 if (r < 0)
759 return r;
760
761 return rename_noreplace(AT_FDCWD, path, AT_FDCWD, rs);
762 }
763
764 int image_rename(Image *i, const char *new_name) {
765 _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT, name_lock = LOCK_FILE_INIT;
766 _cleanup_free_ char *new_path = NULL, *nn = NULL, *roothash = NULL;
767 _cleanup_strv_free_ char **settings = NULL;
768 unsigned file_attr = 0;
769 int r;
770
771 assert(i);
772
773 if (!image_name_is_valid(new_name))
774 return -EINVAL;
775
776 if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i))
777 return -EROFS;
778
779 settings = image_settings_path(i);
780 if (!settings)
781 return -ENOMEM;
782
783 r = image_roothash_path(i, &roothash);
784 if (r < 0)
785 return r;
786
787 /* Make sure we don't interfere with a running nspawn */
788 r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
789 if (r < 0)
790 return r;
791
792 /* Make sure nobody takes the new name, between the time we
793 * checked it is currently unused in all search paths, and the
794 * time we take possession of it */
795 r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
796 if (r < 0)
797 return r;
798
799 r = image_find(IMAGE_MACHINE, new_name, NULL, NULL);
800 if (r >= 0)
801 return -EEXIST;
802 if (r != -ENOENT)
803 return r;
804
805 switch (i->type) {
806
807 case IMAGE_DIRECTORY:
808 /* Turn of the immutable bit while we rename the image, so that we can rename it */
809 (void) read_attr_path(i->path, &file_attr);
810
811 if (file_attr & FS_IMMUTABLE_FL)
812 (void) chattr_path(i->path, 0, FS_IMMUTABLE_FL, NULL);
813
814 _fallthrough_;
815 case IMAGE_SUBVOLUME:
816 r = file_in_same_dir(i->path, new_name, &new_path);
817 break;
818
819 case IMAGE_BLOCK:
820
821 /* Refuse renaming raw block devices in /dev, the names are picked by udev after all. */
822 if (path_startswith(i->path, "/dev"))
823 return -EROFS;
824
825 r = file_in_same_dir(i->path, new_name, &new_path);
826 break;
827
828 case IMAGE_RAW: {
829 const char *fn;
830
831 fn = strjoina(new_name, ".raw");
832
833 r = file_in_same_dir(i->path, fn, &new_path);
834 break;
835 }
836
837 default:
838 return -EOPNOTSUPP;
839 }
840 if (r < 0)
841 return r;
842
843 nn = strdup(new_name);
844 if (!nn)
845 return -ENOMEM;
846
847 r = rename_noreplace(AT_FDCWD, i->path, AT_FDCWD, new_path);
848 if (r < 0)
849 return r;
850
851 /* Restore the immutable bit, if it was set before */
852 if (file_attr & FS_IMMUTABLE_FL)
853 (void) chattr_path(new_path, FS_IMMUTABLE_FL, FS_IMMUTABLE_FL, NULL);
854
855 free_and_replace(i->path, new_path);
856 free_and_replace(i->name, nn);
857
858 STRV_FOREACH(j, settings) {
859 r = rename_auxiliary_file(*j, new_name, ".nspawn");
860 if (r < 0 && r != -ENOENT)
861 log_debug_errno(r, "Failed to rename settings file %s, ignoring: %m", *j);
862 }
863
864 r = rename_auxiliary_file(roothash, new_name, ".roothash");
865 if (r < 0 && r != -ENOENT)
866 log_debug_errno(r, "Failed to rename roothash file %s, ignoring: %m", roothash);
867
868 return 0;
869 }
870
871 static int clone_auxiliary_file(const char *path, const char *new_name, const char *suffix) {
872 _cleanup_free_ char *fn = NULL, *rs = NULL;
873 int r;
874
875 fn = strjoin(new_name, suffix);
876 if (!fn)
877 return -ENOMEM;
878
879 r = file_in_same_dir(path, fn, &rs);
880 if (r < 0)
881 return r;
882
883 return copy_file_atomic(path, rs, 0664, COPY_REFLINK);
884 }
885
886 int image_clone(Image *i, const char *new_name, bool read_only) {
887 _cleanup_(release_lock_file) LockFile name_lock = LOCK_FILE_INIT;
888 _cleanup_strv_free_ char **settings = NULL;
889 _cleanup_free_ char *roothash = NULL;
890 const char *new_path;
891 int r;
892
893 assert(i);
894
895 if (!image_name_is_valid(new_name))
896 return -EINVAL;
897
898 settings = image_settings_path(i);
899 if (!settings)
900 return -ENOMEM;
901
902 r = image_roothash_path(i, &roothash);
903 if (r < 0)
904 return r;
905
906 /* Make sure nobody takes the new name, between the time we
907 * checked it is currently unused in all search paths, and the
908 * time we take possession of it */
909 r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
910 if (r < 0)
911 return r;
912
913 r = image_find(IMAGE_MACHINE, new_name, NULL, NULL);
914 if (r >= 0)
915 return -EEXIST;
916 if (r != -ENOENT)
917 return r;
918
919 switch (i->type) {
920
921 case IMAGE_SUBVOLUME:
922 case IMAGE_DIRECTORY:
923 /* If we can we'll always try to create a new btrfs subvolume here, even if the source is a plain
924 * directory. */
925
926 new_path = strjoina("/var/lib/machines/", new_name);
927
928 r = btrfs_subvol_snapshot(i->path, new_path,
929 (read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
930 BTRFS_SNAPSHOT_FALLBACK_COPY |
931 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
932 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
933 BTRFS_SNAPSHOT_RECURSIVE |
934 BTRFS_SNAPSHOT_QUOTA);
935 if (r >= 0)
936 /* Enable "subtree" quotas for the copy, if we didn't copy any quota from the source. */
937 (void) btrfs_subvol_auto_qgroup(new_path, 0, true);
938
939 break;
940
941 case IMAGE_RAW:
942 new_path = strjoina("/var/lib/machines/", new_name, ".raw");
943
944 r = copy_file_atomic_full(i->path, new_path, read_only ? 0444 : 0644, FS_NOCOW_FL, FS_NOCOW_FL,
945 COPY_REFLINK|COPY_CRTIME, NULL, NULL);
946 break;
947
948 case IMAGE_BLOCK:
949 default:
950 return -EOPNOTSUPP;
951 }
952
953 if (r < 0)
954 return r;
955
956 STRV_FOREACH(j, settings) {
957 r = clone_auxiliary_file(*j, new_name, ".nspawn");
958 if (r < 0 && r != -ENOENT)
959 log_debug_errno(r, "Failed to clone settings %s, ignoring: %m", *j);
960 }
961
962 r = clone_auxiliary_file(roothash, new_name, ".roothash");
963 if (r < 0 && r != -ENOENT)
964 log_debug_errno(r, "Failed to clone root hash file %s, ignoring: %m", roothash);
965
966 return 0;
967 }
968
969 int image_read_only(Image *i, bool b) {
970 _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
971 int r;
972
973 assert(i);
974
975 if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i))
976 return -EROFS;
977
978 /* Make sure we don't interfere with a running nspawn */
979 r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
980 if (r < 0)
981 return r;
982
983 switch (i->type) {
984
985 case IMAGE_SUBVOLUME:
986
987 /* Note that we set the flag only on the top-level
988 * subvolume of the image. */
989
990 r = btrfs_subvol_set_read_only(i->path, b);
991 if (r < 0)
992 return r;
993
994 break;
995
996 case IMAGE_DIRECTORY:
997 /* For simple directory trees we cannot use the access
998 mode of the top-level directory, since it has an
999 effect on the container itself. However, we can
1000 use the "immutable" flag, to at least make the
1001 top-level directory read-only. It's not as good as
1002 a read-only subvolume, but at least something, and
1003 we can read the value back. */
1004
1005 r = chattr_path(i->path, b ? FS_IMMUTABLE_FL : 0, FS_IMMUTABLE_FL, NULL);
1006 if (r < 0)
1007 return r;
1008
1009 break;
1010
1011 case IMAGE_RAW: {
1012 struct stat st;
1013
1014 if (stat(i->path, &st) < 0)
1015 return -errno;
1016
1017 if (chmod(i->path, (st.st_mode & 0444) | (b ? 0000 : 0200)) < 0)
1018 return -errno;
1019
1020 /* If the images is now read-only, it's a good time to
1021 * defrag it, given that no write patterns will
1022 * fragment it again. */
1023 if (b)
1024 (void) btrfs_defrag(i->path);
1025 break;
1026 }
1027
1028 case IMAGE_BLOCK: {
1029 _cleanup_close_ int fd = -EBADF;
1030 struct stat st;
1031 int state = b;
1032
1033 fd = open(i->path, O_CLOEXEC|O_RDONLY|O_NONBLOCK|O_NOCTTY);
1034 if (fd < 0)
1035 return -errno;
1036
1037 if (fstat(fd, &st) < 0)
1038 return -errno;
1039 if (!S_ISBLK(st.st_mode))
1040 return -ENOTTY;
1041
1042 if (ioctl(fd, BLKROSET, &state) < 0)
1043 return -errno;
1044
1045 break;
1046 }
1047
1048 default:
1049 return -EOPNOTSUPP;
1050 }
1051
1052 return 0;
1053 }
1054
1055 int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local) {
1056 _cleanup_free_ char *p = NULL;
1057 LockFile t = LOCK_FILE_INIT;
1058 struct stat st;
1059 bool exclusive;
1060 int r;
1061
1062 assert(path);
1063 assert(global);
1064 assert(local);
1065
1066 /* Locks an image path. This actually creates two locks: one "local" one, next to the image path
1067 * itself, which might be shared via NFS. And another "global" one, in /run, that uses the
1068 * device/inode number. This has the benefit that we can even lock a tree that is a mount point,
1069 * correctly. */
1070
1071 if (!path_is_absolute(path))
1072 return -EINVAL;
1073
1074 switch (operation & (LOCK_SH|LOCK_EX)) {
1075 case LOCK_SH:
1076 exclusive = false;
1077 break;
1078 case LOCK_EX:
1079 exclusive = true;
1080 break;
1081 default:
1082 return -EINVAL;
1083 }
1084
1085 if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) {
1086 *local = *global = (LockFile) LOCK_FILE_INIT;
1087 return 0;
1088 }
1089
1090 /* Prohibit taking exclusive locks on the host image. We can't allow this, since we ourselves are
1091 * running off it after all, and we don't want any images to manipulate the host image. We make an
1092 * exception for shared locks however: we allow those (and make them NOPs since there's no point in
1093 * taking them if there can't be exclusive locks). Strictly speaking these are questionable as well,
1094 * since it means changes made to the host might propagate to the container as they happen (and a
1095 * shared lock kinda suggests that no changes happen at all while it is in place), but it's too
1096 * useful not to allow read-only containers off the host root, hence let's support this, and trust
1097 * the user to do the right thing with this. */
1098 if (path_equal(path, "/")) {
1099 if (exclusive)
1100 return -EBUSY;
1101
1102 *local = *global = (LockFile) LOCK_FILE_INIT;
1103 return 0;
1104 }
1105
1106 if (stat(path, &st) >= 0) {
1107 if (S_ISBLK(st.st_mode))
1108 r = asprintf(&p, "/run/systemd/nspawn/locks/block-%u:%u", major(st.st_rdev), minor(st.st_rdev));
1109 else if (S_ISDIR(st.st_mode) || S_ISREG(st.st_mode))
1110 r = asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino);
1111 else
1112 return -ENOTTY;
1113 if (r < 0)
1114 return -ENOMEM;
1115 }
1116
1117 /* For block devices we don't need the "local" lock, as the major/minor lock above should be
1118 * sufficient, since block devices are host local anyway. */
1119 if (!path_startswith(path, "/dev/")) {
1120 r = make_lock_file_for(path, operation, &t);
1121 if (r < 0) {
1122 if (!exclusive && r == -EROFS)
1123 log_debug_errno(r, "Failed to create shared lock for '%s', ignoring: %m", path);
1124 else
1125 return r;
1126 }
1127 }
1128
1129 if (p) {
1130 (void) mkdir_p("/run/systemd/nspawn/locks", 0700);
1131
1132 r = make_lock_file(p, operation, global);
1133 if (r < 0) {
1134 release_lock_file(&t);
1135 return r;
1136 }
1137 } else
1138 *global = (LockFile) LOCK_FILE_INIT;
1139
1140 *local = t;
1141 return 0;
1142 }
1143
1144 int image_set_limit(Image *i, uint64_t referenced_max) {
1145 assert(i);
1146
1147 if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i))
1148 return -EROFS;
1149
1150 if (i->type != IMAGE_SUBVOLUME)
1151 return -EOPNOTSUPP;
1152
1153 /* We set the quota both for the subvolume as well as for the
1154 * subtree. The latter is mostly for historical reasons, since
1155 * we didn't use to have a concept of subtree quota, and hence
1156 * only modified the subvolume quota. */
1157
1158 (void) btrfs_qgroup_set_limit(i->path, 0, referenced_max);
1159 (void) btrfs_subvol_auto_qgroup(i->path, 0, true);
1160 return btrfs_subvol_set_subtree_quota_limit(i->path, 0, referenced_max);
1161 }
1162
1163 int image_read_metadata(Image *i, const ImagePolicy *image_policy) {
1164 _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
1165 int r;
1166
1167 assert(i);
1168
1169 r = image_path_lock(i->path, LOCK_SH|LOCK_NB, &global_lock, &local_lock);
1170 if (r < 0)
1171 return r;
1172
1173 switch (i->type) {
1174
1175 case IMAGE_SUBVOLUME:
1176 case IMAGE_DIRECTORY: {
1177 _cleanup_strv_free_ char **machine_info = NULL, **os_release = NULL, **extension_release = NULL;
1178 sd_id128_t machine_id = SD_ID128_NULL;
1179 _cleanup_free_ char *hostname = NULL;
1180 _cleanup_free_ char *path = NULL;
1181
1182 if (i->class == IMAGE_SYSEXT) {
1183 r = extension_has_forbidden_content(i->path);
1184 if (r < 0)
1185 return r;
1186 if (r > 0)
1187 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
1188 "Conflicting content found in image %s, refusing.",
1189 i->name);
1190 }
1191
1192 r = chase("/etc/hostname", i->path, CHASE_PREFIX_ROOT|CHASE_TRAIL_SLASH, &path, NULL);
1193 if (r < 0 && r != -ENOENT)
1194 log_debug_errno(r, "Failed to chase /etc/hostname in image %s: %m", i->name);
1195 else if (r >= 0) {
1196 r = read_etc_hostname(path, &hostname);
1197 if (r < 0)
1198 log_debug_errno(errno, "Failed to read /etc/hostname of image %s: %m", i->name);
1199 }
1200
1201 path = mfree(path);
1202
1203 r = id128_get_machine(i->path, &machine_id);
1204 if (r < 0)
1205 log_debug_errno(r, "Failed to read machine ID in image %s, ignoring: %m", i->name);
1206
1207 r = chase("/etc/machine-info", i->path, CHASE_PREFIX_ROOT|CHASE_TRAIL_SLASH, &path, NULL);
1208 if (r < 0 && r != -ENOENT)
1209 log_debug_errno(r, "Failed to chase /etc/machine-info in image %s: %m", i->name);
1210 else if (r >= 0) {
1211 r = load_env_file_pairs(NULL, path, &machine_info);
1212 if (r < 0)
1213 log_debug_errno(r, "Failed to parse machine-info data of %s: %m", i->name);
1214 }
1215
1216 r = load_os_release_pairs(i->path, &os_release);
1217 if (r < 0)
1218 log_debug_errno(r, "Failed to read os-release in image, ignoring: %m");
1219
1220 r = load_extension_release_pairs(i->path, i->class, i->name, /* relax_extension_release_check= */ false, &extension_release);
1221 if (r < 0)
1222 log_debug_errno(r, "Failed to read extension-release in image, ignoring: %m");
1223
1224 free_and_replace(i->hostname, hostname);
1225 i->machine_id = machine_id;
1226 strv_free_and_replace(i->machine_info, machine_info);
1227 strv_free_and_replace(i->os_release, os_release);
1228 strv_free_and_replace(i->extension_release, extension_release);
1229
1230 break;
1231 }
1232
1233 case IMAGE_RAW:
1234 case IMAGE_BLOCK: {
1235 _cleanup_(loop_device_unrefp) LoopDevice *d = NULL;
1236 _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL;
1237
1238 r = loop_device_make_by_path(i->path, O_RDONLY, /* sector_size= */ UINT32_MAX, LO_FLAGS_PARTSCAN, LOCK_SH, &d);
1239 if (r < 0)
1240 return r;
1241
1242 r = dissect_loop_device(
1243 d,
1244 /* verity= */ NULL,
1245 /* mount_options= */ NULL,
1246 image_policy,
1247 DISSECT_IMAGE_GENERIC_ROOT |
1248 DISSECT_IMAGE_REQUIRE_ROOT |
1249 DISSECT_IMAGE_RELAX_VAR_CHECK |
1250 DISSECT_IMAGE_READ_ONLY |
1251 DISSECT_IMAGE_USR_NO_ROOT |
1252 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
1253 DISSECT_IMAGE_PIN_PARTITION_DEVICES,
1254 &m);
1255 if (r < 0)
1256 return r;
1257
1258 r = dissected_image_acquire_metadata(m,
1259 DISSECT_IMAGE_VALIDATE_OS |
1260 DISSECT_IMAGE_VALIDATE_OS_EXT);
1261 if (r < 0)
1262 return r;
1263
1264 free_and_replace(i->hostname, m->hostname);
1265 i->machine_id = m->machine_id;
1266 strv_free_and_replace(i->machine_info, m->machine_info);
1267 strv_free_and_replace(i->os_release, m->os_release);
1268 strv_free_and_replace(i->extension_release, m->extension_release);
1269
1270 break;
1271 }
1272
1273 default:
1274 return -EOPNOTSUPP;
1275 }
1276
1277 i->metadata_valid = true;
1278
1279 return 0;
1280 }
1281
1282 int image_name_lock(const char *name, int operation, LockFile *ret) {
1283 const char *p;
1284
1285 assert(name);
1286 assert(ret);
1287
1288 /* Locks an image name, regardless of the precise path used. */
1289
1290 if (streq(name, ".host"))
1291 return -EBUSY;
1292
1293 if (!image_name_is_valid(name))
1294 return -EINVAL;
1295
1296 if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) {
1297 *ret = (LockFile) LOCK_FILE_INIT;
1298 return 0;
1299 }
1300
1301 (void) mkdir_p("/run/systemd/nspawn/locks", 0700);
1302
1303 p = strjoina("/run/systemd/nspawn/locks/name-", name);
1304 return make_lock_file(p, operation, ret);
1305 }
1306
1307 bool image_in_search_path(
1308 ImageClass class,
1309 const char *root,
1310 const char *image) {
1311
1312 assert(image);
1313
1314 NULSTR_FOREACH(path, pick_image_search_path(class)) {
1315 const char *p, *q;
1316 size_t k;
1317
1318 if (!empty_or_root(root)) {
1319 q = path_startswith(path, root);
1320 if (!q)
1321 continue;
1322 } else
1323 q = path;
1324
1325 p = path_startswith(q, path);
1326 if (!p)
1327 continue;
1328
1329 /* Make sure there's a filename following */
1330 k = strcspn(p, "/");
1331 if (k == 0)
1332 continue;
1333
1334 p += k;
1335
1336 /* Accept trailing slashes */
1337 if (p[strspn(p, "/")] == 0)
1338 return true;
1339
1340 }
1341
1342 return false;
1343 }
1344
1345 static const char* const image_type_table[_IMAGE_TYPE_MAX] = {
1346 [IMAGE_DIRECTORY] = "directory",
1347 [IMAGE_SUBVOLUME] = "subvolume",
1348 [IMAGE_RAW] = "raw",
1349 [IMAGE_BLOCK] = "block",
1350 };
1351
1352 DEFINE_STRING_TABLE_LOOKUP(image_type, ImageType);