2 This file is part of systemd.
4 Copyright 2013 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
31 #include "alloc-util.h"
32 #include "btrfs-util.h"
33 #include "chattr-util.h"
35 #include "dirent-util.h"
40 #include "lockfile-util.h"
42 #include "machine-image.h"
45 #include "path-util.h"
47 #include "string-table.h"
48 #include "string-util.h"
50 #include "time-util.h"
53 #include "xattr-util.h"
55 static const char image_search_path
[] =
57 "/var/lib/container\0" /* legacy */
58 "/usr/local/lib/machines\0"
59 "/usr/lib/machines\0";
61 Image
*image_unref(Image
*i
) {
70 static char **image_settings_path(Image
*image
) {
71 _cleanup_strv_free_
char **l
= NULL
;
82 fn
= strjoina(image
->name
, ".nspawn");
84 FOREACH_STRING(s
, "/etc/systemd/nspawn/", "/run/systemd/nspawn/") {
85 l
[i
] = strappend(s
, fn
);
92 l
[i
] = file_in_same_dir(image
->path
, fn
);
102 static char *image_roothash_path(Image
*image
) {
107 fn
= strjoina(image
->name
, ".roothash");
109 return file_in_same_dir(image
->path
, fn
);
112 static int image_new(
116 const char *filename
,
122 _cleanup_(image_unrefp
) Image
*i
= NULL
;
125 assert(t
< _IMAGE_TYPE_MAX
);
135 i
->read_only
= read_only
;
138 i
->usage
= i
->usage_exclusive
= (uint64_t) -1;
139 i
->limit
= i
->limit_exclusive
= (uint64_t) -1;
141 i
->name
= strdup(pretty
);
146 i
->path
= strjoin(path
, "/", filename
);
148 i
->path
= strdup(filename
);
153 path_kill_slashes(i
->path
);
161 static int image_make(
165 const char *filename
,
174 /* We explicitly *do* follow symlinks here, since we want to
175 * allow symlinking trees into /var/lib/machines/, and treat
178 if (fstatat(dfd
, filename
, &st
, 0) < 0)
182 (path
&& path_startswith(path
, "/usr")) ||
183 (faccessat(dfd
, filename
, W_OK
, AT_EACCESS
) < 0 && errno
== EROFS
);
185 if (S_ISDIR(st
.st_mode
)) {
186 _cleanup_close_
int fd
= -1;
187 unsigned file_attr
= 0;
195 fd
= openat(dfd
, filename
, O_CLOEXEC
|O_NOCTTY
|O_DIRECTORY
);
199 /* btrfs subvolumes have inode 256 */
200 if (st
.st_ino
== 256) {
202 r
= btrfs_is_filesystem(fd
);
206 BtrfsSubvolInfo info
;
208 /* It's a btrfs subvolume */
210 r
= btrfs_subvol_get_info_fd(fd
, 0, &info
);
214 r
= image_new(IMAGE_SUBVOLUME
,
218 info
.read_only
|| read_only
,
225 if (btrfs_quota_scan_ongoing(fd
) == 0) {
226 BtrfsQuotaInfo quota
;
228 r
= btrfs_subvol_get_subtree_quota_fd(fd
, 0, "a
);
230 (*ret
)->usage
= quota
.referenced
;
231 (*ret
)->usage_exclusive
= quota
.exclusive
;
233 (*ret
)->limit
= quota
.referenced_max
;
234 (*ret
)->limit_exclusive
= quota
.exclusive_max
;
242 /* If the IMMUTABLE bit is set, we consider the
243 * directory read-only. Since the ioctl is not
244 * supported everywhere we ignore failures. */
245 (void) read_attr_fd(fd
, &file_attr
);
247 /* It's just a normal directory. */
248 r
= image_new(IMAGE_DIRECTORY
,
252 read_only
|| (file_attr
& FS_IMMUTABLE_FL
),
261 } else if (S_ISREG(st
.st_mode
) && endswith(filename
, ".raw")) {
264 /* It's a RAW disk image */
269 fd_getcrtime_at(dfd
, filename
, &crtime
, 0);
272 pretty
= strndupa(filename
, strlen(filename
) - 4);
274 r
= image_new(IMAGE_RAW
,
278 !(st
.st_mode
& 0222) || read_only
,
280 timespec_load(&st
.st_mtim
),
285 (*ret
)->usage
= (*ret
)->usage_exclusive
= st
.st_blocks
* 512;
286 (*ret
)->limit
= (*ret
)->limit_exclusive
= st
.st_size
;
294 int image_find(const char *name
, Image
**ret
) {
300 /* There are no images with invalid names */
301 if (!image_name_is_valid(name
))
304 NULSTR_FOREACH(path
, image_search_path
) {
305 _cleanup_closedir_
DIR *d
= NULL
;
315 r
= image_make(NULL
, dirfd(d
), path
, name
, ret
);
316 if (IN_SET(r
, 0, -ENOENT
)) {
317 _cleanup_free_
char *raw
= NULL
;
319 raw
= strappend(name
, ".raw");
323 r
= image_make(NULL
, dirfd(d
), path
, raw
, ret
);
324 if (IN_SET(r
, 0, -ENOENT
))
333 if (streq(name
, ".host"))
334 return image_make(".host", AT_FDCWD
, NULL
, "/", ret
);
339 int image_discover(Hashmap
*h
) {
345 NULSTR_FOREACH(path
, image_search_path
) {
346 _cleanup_closedir_
DIR *d
= NULL
;
357 FOREACH_DIRENT_ALL(de
, d
, return -errno
) {
358 _cleanup_(image_unrefp
) Image
*image
= NULL
;
360 if (!image_name_is_valid(de
->d_name
))
363 if (hashmap_contains(h
, de
->d_name
))
366 r
= image_make(NULL
, dirfd(d
), path
, de
->d_name
, &image
);
367 if (IN_SET(r
, 0, -ENOENT
))
372 r
= hashmap_put(h
, image
->name
, image
);
380 if (!hashmap_contains(h
, ".host")) {
381 _cleanup_(image_unrefp
) Image
*image
= NULL
;
383 r
= image_make(".host", AT_FDCWD
, NULL
, "/", &image
);
387 r
= hashmap_put(h
, image
->name
, image
);
398 void image_hashmap_free(Hashmap
*map
) {
401 while ((i
= hashmap_steal_first(map
)))
407 int image_remove(Image
*i
) {
408 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
;
409 _cleanup_strv_free_
char **settings
= NULL
;
410 _cleanup_free_
char *roothash
= NULL
;
416 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
419 settings
= image_settings_path(i
);
423 roothash
= image_roothash_path(i
);
427 /* Make sure we don't interfere with a running nspawn */
428 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
434 case IMAGE_SUBVOLUME
:
435 r
= btrfs_subvol_remove(i
->path
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
440 case IMAGE_DIRECTORY
:
441 /* Allow deletion of read-only directories */
442 (void) chattr_path(i
->path
, 0, FS_IMMUTABLE_FL
);
443 r
= rm_rf(i
->path
, REMOVE_ROOT
|REMOVE_PHYSICAL
|REMOVE_SUBVOLUME
);
450 if (unlink(i
->path
) < 0)
458 STRV_FOREACH(j
, settings
) {
459 if (unlink(*j
) < 0 && errno
!= ENOENT
)
460 log_debug_errno(errno
, "Failed to unlink %s, ignoring: %m", *j
);
463 if (unlink(roothash
) < 0 && errno
!= ENOENT
)
464 log_debug_errno(errno
, "Failed to unlink %s, ignoring: %m", roothash
);
469 static int rename_auxiliary_file(const char *path
, const char *new_name
, const char *suffix
) {
470 _cleanup_free_
char *rs
= NULL
;
473 fn
= strjoina(new_name
, suffix
);
475 rs
= file_in_same_dir(path
, fn
);
479 return rename_noreplace(AT_FDCWD
, path
, AT_FDCWD
, rs
);
482 int image_rename(Image
*i
, const char *new_name
) {
483 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
, name_lock
= LOCK_FILE_INIT
;
484 _cleanup_free_
char *new_path
= NULL
, *nn
= NULL
, *roothash
= NULL
;
485 _cleanup_strv_free_
char **settings
= NULL
;
486 unsigned file_attr
= 0;
492 if (!image_name_is_valid(new_name
))
495 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
498 settings
= image_settings_path(i
);
502 roothash
= image_roothash_path(i
);
506 /* Make sure we don't interfere with a running nspawn */
507 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
511 /* Make sure nobody takes the new name, between the time we
512 * checked it is currently unused in all search paths, and the
513 * time we take possession of it */
514 r
= image_name_lock(new_name
, LOCK_EX
|LOCK_NB
, &name_lock
);
518 r
= image_find(new_name
, NULL
);
526 case IMAGE_DIRECTORY
:
527 /* Turn off the immutable bit while we rename the image, so that we can rename it */
528 (void) read_attr_path(i
->path
, &file_attr
);
530 if (file_attr
& FS_IMMUTABLE_FL
)
531 (void) chattr_path(i
->path
, 0, FS_IMMUTABLE_FL
);
535 case IMAGE_SUBVOLUME
:
536 new_path
= file_in_same_dir(i
->path
, new_name
);
542 fn
= strjoina(new_name
, ".raw");
543 new_path
= file_in_same_dir(i
->path
, fn
);
554 nn
= strdup(new_name
);
558 r
= rename_noreplace(AT_FDCWD
, i
->path
, AT_FDCWD
, new_path
);
562 /* Restore the immutable bit, if it was set before */
563 if (file_attr
& FS_IMMUTABLE_FL
)
564 (void) chattr_path(new_path
, FS_IMMUTABLE_FL
, FS_IMMUTABLE_FL
);
574 STRV_FOREACH(j
, settings
) {
575 r
= rename_auxiliary_file(*j
, new_name
, ".nspawn");
576 if (r
< 0 && r
!= -ENOENT
)
577 log_debug_errno(r
, "Failed to rename settings file %s, ignoring: %m", *j
);
580 r
= rename_auxiliary_file(roothash
, new_name
, ".roothash");
581 if (r
< 0 && r
!= -ENOENT
)
582 log_debug_errno(r
, "Failed to rename roothash file %s, ignoring: %m", roothash
);
587 static int clone_auxiliary_file(const char *path
, const char *new_name
, const char *suffix
) {
588 _cleanup_free_
char *rs
= NULL
;
591 fn
= strjoina(new_name
, suffix
);
593 rs
= file_in_same_dir(path
, fn
);
597 return copy_file_atomic(path
, rs
, 0664, 0, COPY_REFLINK
);
600 int image_clone(Image
*i
, const char *new_name
, bool read_only
) {
601 _cleanup_release_lock_file_ LockFile name_lock
= LOCK_FILE_INIT
;
602 _cleanup_strv_free_
char **settings
= NULL
;
603 _cleanup_free_
char *roothash
= NULL
;
604 const char *new_path
;
610 if (!image_name_is_valid(new_name
))
613 settings
= image_settings_path(i
);
617 roothash
= image_roothash_path(i
);
621 /* Make sure nobody takes the new name, between the time we
622 * checked it is currently unused in all search paths, and the
623 * time we take possession of it */
624 r
= image_name_lock(new_name
, LOCK_EX
|LOCK_NB
, &name_lock
);
628 r
= image_find(new_name
, NULL
);
636 case IMAGE_SUBVOLUME
:
637 case IMAGE_DIRECTORY
:
638 /* If we can we'll always try to create a new btrfs subvolume here, even if the source is a plain
641 new_path
= strjoina("/var/lib/machines/", new_name
);
643 r
= btrfs_subvol_snapshot(i
->path
, new_path
,
644 (read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) |
645 BTRFS_SNAPSHOT_FALLBACK_COPY
|
646 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
647 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE
|
648 BTRFS_SNAPSHOT_RECURSIVE
|
649 BTRFS_SNAPSHOT_QUOTA
);
651 /* Enable "subtree" quotas for the copy, if we didn't copy any quota from the source. */
652 (void) btrfs_subvol_auto_qgroup(new_path
, 0, true);
657 new_path
= strjoina("/var/lib/machines/", new_name
, ".raw");
659 r
= copy_file_atomic(i
->path
, new_path
, read_only
? 0444 : 0644, FS_NOCOW_FL
, COPY_REFLINK
);
669 STRV_FOREACH(j
, settings
) {
670 r
= clone_auxiliary_file(*j
, new_name
, ".nspawn");
671 if (r
< 0 && r
!= -ENOENT
)
672 log_debug_errno(r
, "Failed to clone settings %s, ignoring: %m", *j
);
675 r
= clone_auxiliary_file(roothash
, new_name
, ".roothash");
676 if (r
< 0 && r
!= -ENOENT
)
677 log_debug_errno(r
, "Failed to clone root hash file %s, ignoring: %m", roothash
);
682 int image_read_only(Image
*i
, bool b
) {
683 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
;
687 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
690 /* Make sure we don't interfere with a running nspawn */
691 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
697 case IMAGE_SUBVOLUME
:
699 /* Note that we set the flag only on the top-level
700 * subvolume of the image. */
702 r
= btrfs_subvol_set_read_only(i
->path
, b
);
708 case IMAGE_DIRECTORY
:
709 /* For simple directory trees we cannot use the access
710 mode of the top-level directory, since it has an
711 effect on the container itself. However, we can
712 use the "immutable" flag, to at least make the
713 top-level directory read-only. It's not as good as
714 a read-only subvolume, but at least something, and
715 we can read the value back. */
717 r
= chattr_path(i
->path
, b
? FS_IMMUTABLE_FL
: 0, FS_IMMUTABLE_FL
);
726 if (stat(i
->path
, &st
) < 0)
729 if (chmod(i
->path
, (st
.st_mode
& 0444) | (b
? 0000 : 0200)) < 0)
732 /* If the image is now read-only, it's a good time to
733 * defrag it, given that no write patterns will
734 * fragment it again. */
736 (void) btrfs_defrag(i
->path
);
747 int image_path_lock(const char *path
, int operation
, LockFile
*global
, LockFile
*local
) {
748 _cleanup_free_
char *p
= NULL
;
749 LockFile t
= LOCK_FILE_INIT
;
757 /* Locks an image path. This actually creates two locks: one
758 * "local" one, next to the image path itself, which might be
759 * shared via NFS. And another "global" one, in /run, that
760 * uses the device/inode number. This has the benefit that we
761 * can even lock a tree that is a mount point, correctly. */
763 if (!path_is_absolute(path
))
766 if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) {
767 *local
= *global
= (LockFile
) LOCK_FILE_INIT
;
771 if (path_equal(path
, "/"))
774 if (stat(path
, &st
) >= 0) {
775 if (asprintf(&p
, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st
.st_dev
, (unsigned long) st
.st_ino
) < 0)
779 r
= make_lock_file_for(path
, operation
, &t
);
784 mkdir_p("/run/systemd/nspawn/locks", 0700);
786 r
= make_lock_file(p
, operation
, global
);
788 release_lock_file(&t
);
792 *global
= (LockFile
) LOCK_FILE_INIT
;
798 int image_set_limit(Image
*i
, uint64_t referenced_max
) {
801 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
804 if (i
->type
!= IMAGE_SUBVOLUME
)
807 /* We set the quota both for the subvolume as well as for the
808 * subtree. The latter is mostly for historical reasons, since
809 * we didn't use to have a concept of subtree quota, and hence
810 * only modified the subvolume quota. */
812 (void) btrfs_qgroup_set_limit(i
->path
, 0, referenced_max
);
813 (void) btrfs_subvol_auto_qgroup(i
->path
, 0, true);
814 return btrfs_subvol_set_subtree_quota_limit(i
->path
, 0, referenced_max
);
817 int image_name_lock(const char *name
, int operation
, LockFile
*ret
) {
823 /* Locks an image name, regardless of the precise path used. */
825 if (!image_name_is_valid(name
))
828 if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) {
829 *ret
= (LockFile
) LOCK_FILE_INIT
;
833 if (streq(name
, ".host"))
836 mkdir_p("/run/systemd/nspawn/locks", 0700);
837 p
= strjoina("/run/systemd/nspawn/locks/name-", name
);
839 return make_lock_file(p
, operation
, ret
);
842 bool image_name_is_valid(const char *s
) {
843 if (!filename_is_valid(s
))
846 if (string_has_cc(s
, NULL
))
849 if (!utf8_is_valid(s
))
852 /* Temporary files for atomically creating new files */
853 if (startswith(s
, ".#"))
859 static const char* const image_type_table
[_IMAGE_TYPE_MAX
] = {
860 [IMAGE_DIRECTORY
] = "directory",
861 [IMAGE_SUBVOLUME
] = "subvolume",
865 DEFINE_STRING_TABLE_LOOKUP(image_type
, ImageType
);