2 This file is part of systemd.
4 Copyright 2013 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
31 #include "alloc-util.h"
32 #include "btrfs-util.h"
33 #include "chattr-util.h"
35 #include "dirent-util.h"
40 #include "lockfile-util.h"
42 #include "machine-image.h"
45 #include "path-util.h"
47 #include "string-table.h"
48 #include "string-util.h"
50 #include "time-util.h"
53 #include "xattr-util.h"
55 static const char image_search_path
[] =
57 "/var/lib/container\0" /* legacy */
58 "/usr/local/lib/machines\0"
59 "/usr/lib/machines\0";
61 Image
*image_unref(Image
*i
) {
70 static char **image_settings_path(Image
*image
) {
71 _cleanup_strv_free_
char **l
= NULL
;
82 fn
= strjoina(image
->name
, ".nspawn");
84 FOREACH_STRING(s
, "/etc/systemd/nspawn/", "/run/systemd/nspawn/") {
85 l
[i
] = strappend(s
, fn
);
92 l
[i
] = file_in_same_dir(image
->path
, fn
);
102 static char *image_roothash_path(Image
*image
) {
107 fn
= strjoina(image
->name
, ".roothash");
109 return file_in_same_dir(image
->path
, fn
);
112 static int image_new(
116 const char *filename
,
122 _cleanup_(image_unrefp
) Image
*i
= NULL
;
125 assert(t
< _IMAGE_TYPE_MAX
);
135 i
->read_only
= read_only
;
138 i
->usage
= i
->usage_exclusive
= (uint64_t) -1;
139 i
->limit
= i
->limit_exclusive
= (uint64_t) -1;
141 i
->name
= strdup(pretty
);
146 i
->path
= strjoin(path
, "/", filename
);
148 i
->path
= strdup(filename
);
153 path_kill_slashes(i
->path
);
161 static int image_make(
165 const char *filename
,
174 /* We explicitly *do* follow symlinks here, since we want to allow symlinking trees, raw files and block
175 * devices into /var/lib/machines/, and treat them normally. */
177 if (fstatat(dfd
, filename
, &st
, 0) < 0)
181 (path
&& path_startswith(path
, "/usr")) ||
182 (faccessat(dfd
, filename
, W_OK
, AT_EACCESS
) < 0 && errno
== EROFS
);
184 if (S_ISDIR(st
.st_mode
)) {
185 _cleanup_close_
int fd
= -1;
186 unsigned file_attr
= 0;
194 fd
= openat(dfd
, filename
, O_CLOEXEC
|O_NOCTTY
|O_DIRECTORY
);
198 /* btrfs subvolumes have inode 256 */
199 if (st
.st_ino
== 256) {
201 r
= btrfs_is_filesystem(fd
);
205 BtrfsSubvolInfo info
;
207 /* It's a btrfs subvolume */
209 r
= btrfs_subvol_get_info_fd(fd
, 0, &info
);
213 r
= image_new(IMAGE_SUBVOLUME
,
217 info
.read_only
|| read_only
,
224 if (btrfs_quota_scan_ongoing(fd
) == 0) {
225 BtrfsQuotaInfo quota
;
227 r
= btrfs_subvol_get_subtree_quota_fd(fd
, 0, "a
);
229 (*ret
)->usage
= quota
.referenced
;
230 (*ret
)->usage_exclusive
= quota
.exclusive
;
232 (*ret
)->limit
= quota
.referenced_max
;
233 (*ret
)->limit_exclusive
= quota
.exclusive_max
;
241 /* If the IMMUTABLE bit is set, we consider the
242 * directory read-only. Since the ioctl is not
243 * supported everywhere we ignore failures. */
244 (void) read_attr_fd(fd
, &file_attr
);
246 /* It's just a normal directory. */
247 r
= image_new(IMAGE_DIRECTORY
,
251 read_only
|| (file_attr
& FS_IMMUTABLE_FL
),
260 } else if (S_ISREG(st
.st_mode
) && endswith(filename
, ".raw")) {
263 /* It's a RAW disk image */
268 fd_getcrtime_at(dfd
, filename
, &crtime
, 0);
271 pretty
= strndupa(filename
, strlen(filename
) - 4);
273 r
= image_new(IMAGE_RAW
,
277 !(st
.st_mode
& 0222) || read_only
,
279 timespec_load(&st
.st_mtim
),
284 (*ret
)->usage
= (*ret
)->usage_exclusive
= st
.st_blocks
* 512;
285 (*ret
)->limit
= (*ret
)->limit_exclusive
= st
.st_size
;
289 } else if (S_ISBLK(st
.st_mode
)) {
290 _cleanup_close_
int block_fd
= -1;
291 uint64_t size
= UINT64_MAX
;
301 block_fd
= openat(dfd
, filename
, O_RDONLY
|O_NONBLOCK
|O_CLOEXEC
|O_NOCTTY
);
303 log_debug_errno(errno
, "Failed to open block device %s/%s, ignoring: %m", path
, filename
);
305 if (fstat(block_fd
, &st
) < 0)
307 if (!S_ISBLK(st
.st_mode
)) /* Verify that what we opened is actually what we think it is */
313 if (ioctl(block_fd
, BLKROGET
, &state
) < 0)
314 log_debug_errno(errno
, "Failed to issue BLKROGET on device %s/%s, ignoring: %m", path
, filename
);
319 if (ioctl(block_fd
, BLKGETSIZE64
, &size
) < 0)
320 log_debug_errno(errno
, "Failed to issue BLKFLSBUF on device %s/%s, ignoring: %m", path
, filename
);
322 block_fd
= safe_close(block_fd
);
325 r
= image_new(IMAGE_BLOCK
,
329 !(st
.st_mode
& 0222) || read_only
,
336 if (size
!= 0 && size
!= UINT64_MAX
)
337 (*ret
)->usage
= (*ret
)->usage_exclusive
= (*ret
)->limit
= (*ret
)->limit_exclusive
= size
;
345 int image_find(const char *name
, Image
**ret
) {
351 /* There are no images with invalid names */
352 if (!image_name_is_valid(name
))
355 NULSTR_FOREACH(path
, image_search_path
) {
356 _cleanup_closedir_
DIR *d
= NULL
;
366 r
= image_make(NULL
, dirfd(d
), path
, name
, ret
);
367 if (IN_SET(r
, 0, -ENOENT
)) {
368 _cleanup_free_
char *raw
= NULL
;
370 raw
= strappend(name
, ".raw");
374 r
= image_make(NULL
, dirfd(d
), path
, raw
, ret
);
375 if (IN_SET(r
, 0, -ENOENT
))
384 if (streq(name
, ".host"))
385 return image_make(".host", AT_FDCWD
, NULL
, "/", ret
);
390 int image_discover(Hashmap
*h
) {
396 NULSTR_FOREACH(path
, image_search_path
) {
397 _cleanup_closedir_
DIR *d
= NULL
;
408 FOREACH_DIRENT_ALL(de
, d
, return -errno
) {
409 _cleanup_(image_unrefp
) Image
*image
= NULL
;
411 if (!image_name_is_valid(de
->d_name
))
414 if (hashmap_contains(h
, de
->d_name
))
417 r
= image_make(NULL
, dirfd(d
), path
, de
->d_name
, &image
);
418 if (IN_SET(r
, 0, -ENOENT
))
423 r
= hashmap_put(h
, image
->name
, image
);
431 if (!hashmap_contains(h
, ".host")) {
432 _cleanup_(image_unrefp
) Image
*image
= NULL
;
434 r
= image_make(".host", AT_FDCWD
, NULL
, "/", &image
);
438 r
= hashmap_put(h
, image
->name
, image
);
449 void image_hashmap_free(Hashmap
*map
) {
452 while ((i
= hashmap_steal_first(map
)))
458 int image_remove(Image
*i
) {
459 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
;
460 _cleanup_strv_free_
char **settings
= NULL
;
461 _cleanup_free_
char *roothash
= NULL
;
467 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
470 settings
= image_settings_path(i
);
474 roothash
= image_roothash_path(i
);
478 /* Make sure we don't interfere with a running nspawn */
479 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
485 case IMAGE_SUBVOLUME
:
487 /* Let's unlink first, maybe it is a symlink? If that works we are happy. Otherwise, fall back to removing the btrfs subvolume. */
489 if (unlink(i
->path
) < 0) {
490 r
= btrfs_subvol_remove(i
->path
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
497 case IMAGE_DIRECTORY
:
498 /* Allow deletion of read-only directories */
499 (void) chattr_path(i
->path
, 0, FS_IMMUTABLE_FL
);
500 r
= rm_rf(i
->path
, REMOVE_ROOT
|REMOVE_PHYSICAL
|REMOVE_SUBVOLUME
);
508 /* If this is inside of /dev, then it's a real block device, hence let's not touch the device node
509 * itself (but let's remove the stuff stored alongside it). If it's anywhere else, let's try to unlink
510 * the thing (it's most likely a symlink after all). */
512 if (path_startswith(i
->path
, "/dev"))
518 if (unlink(i
->path
) < 0)
526 STRV_FOREACH(j
, settings
) {
527 if (unlink(*j
) < 0 && errno
!= ENOENT
)
528 log_debug_errno(errno
, "Failed to unlink %s, ignoring: %m", *j
);
531 if (unlink(roothash
) < 0 && errno
!= ENOENT
)
532 log_debug_errno(errno
, "Failed to unlink %s, ignoring: %m", roothash
);
537 static int rename_auxiliary_file(const char *path
, const char *new_name
, const char *suffix
) {
538 _cleanup_free_
char *rs
= NULL
;
541 fn
= strjoina(new_name
, suffix
);
543 rs
= file_in_same_dir(path
, fn
);
547 return rename_noreplace(AT_FDCWD
, path
, AT_FDCWD
, rs
);
550 int image_rename(Image
*i
, const char *new_name
) {
551 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
, name_lock
= LOCK_FILE_INIT
;
552 _cleanup_free_
char *new_path
= NULL
, *nn
= NULL
, *roothash
= NULL
;
553 _cleanup_strv_free_
char **settings
= NULL
;
554 unsigned file_attr
= 0;
560 if (!image_name_is_valid(new_name
))
563 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
566 settings
= image_settings_path(i
);
570 roothash
= image_roothash_path(i
);
574 /* Make sure we don't interfere with a running nspawn */
575 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
579 /* Make sure nobody takes the new name, between the time we
580 * checked it is currently unused in all search paths, and the
581 * time we take possession of it */
582 r
= image_name_lock(new_name
, LOCK_EX
|LOCK_NB
, &name_lock
);
586 r
= image_find(new_name
, NULL
);
594 case IMAGE_DIRECTORY
:
595 /* Turn off the immutable bit while we rename the image, so that we can rename it */
596 (void) read_attr_path(i
->path
, &file_attr
);
598 if (file_attr
& FS_IMMUTABLE_FL
)
599 (void) chattr_path(i
->path
, 0, FS_IMMUTABLE_FL
);
603 case IMAGE_SUBVOLUME
:
604 new_path
= file_in_same_dir(i
->path
, new_name
);
609 /* Refuse renaming raw block devices in /dev, the names are picked by udev after all. */
610 if (path_startswith(i
->path
, "/dev"))
613 new_path
= file_in_same_dir(i
->path
, new_name
);
619 fn
= strjoina(new_name
, ".raw");
620 new_path
= file_in_same_dir(i
->path
, fn
);
631 nn
= strdup(new_name
);
635 r
= rename_noreplace(AT_FDCWD
, i
->path
, AT_FDCWD
, new_path
);
639 /* Restore the immutable bit, if it was set before */
640 if (file_attr
& FS_IMMUTABLE_FL
)
641 (void) chattr_path(new_path
, FS_IMMUTABLE_FL
, FS_IMMUTABLE_FL
);
651 STRV_FOREACH(j
, settings
) {
652 r
= rename_auxiliary_file(*j
, new_name
, ".nspawn");
653 if (r
< 0 && r
!= -ENOENT
)
654 log_debug_errno(r
, "Failed to rename settings file %s, ignoring: %m", *j
);
657 r
= rename_auxiliary_file(roothash
, new_name
, ".roothash");
658 if (r
< 0 && r
!= -ENOENT
)
659 log_debug_errno(r
, "Failed to rename roothash file %s, ignoring: %m", roothash
);
664 static int clone_auxiliary_file(const char *path
, const char *new_name
, const char *suffix
) {
665 _cleanup_free_
char *rs
= NULL
;
668 fn
= strjoina(new_name
, suffix
);
670 rs
= file_in_same_dir(path
, fn
);
674 return copy_file_atomic(path
, rs
, 0664, 0, COPY_REFLINK
);
677 int image_clone(Image
*i
, const char *new_name
, bool read_only
) {
678 _cleanup_release_lock_file_ LockFile name_lock
= LOCK_FILE_INIT
;
679 _cleanup_strv_free_
char **settings
= NULL
;
680 _cleanup_free_
char *roothash
= NULL
;
681 const char *new_path
;
687 if (!image_name_is_valid(new_name
))
690 settings
= image_settings_path(i
);
694 roothash
= image_roothash_path(i
);
698 /* Make sure nobody takes the new name, between the time we
699 * checked it is currently unused in all search paths, and the
700 * time we take possession of it */
701 r
= image_name_lock(new_name
, LOCK_EX
|LOCK_NB
, &name_lock
);
705 r
= image_find(new_name
, NULL
);
713 case IMAGE_SUBVOLUME
:
714 case IMAGE_DIRECTORY
:
715 /* If we can we'll always try to create a new btrfs subvolume here, even if the source is a plain directory. */
718 new_path
= strjoina("/var/lib/machines/", new_name
);
720 r
= btrfs_subvol_snapshot(i
->path
, new_path
,
721 (read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) |
722 BTRFS_SNAPSHOT_FALLBACK_COPY
|
723 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
724 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE
|
725 BTRFS_SNAPSHOT_RECURSIVE
|
726 BTRFS_SNAPSHOT_QUOTA
);
728 /* Enable "subtree" quotas for the copy, if we didn't copy any quota from the source. */
729 (void) btrfs_subvol_auto_qgroup(new_path
, 0, true);
734 new_path
= strjoina("/var/lib/machines/", new_name
, ".raw");
736 r
= copy_file_atomic(i
->path
, new_path
, read_only
? 0444 : 0644, FS_NOCOW_FL
, COPY_REFLINK
);
747 STRV_FOREACH(j
, settings
) {
748 r
= clone_auxiliary_file(*j
, new_name
, ".nspawn");
749 if (r
< 0 && r
!= -ENOENT
)
750 log_debug_errno(r
, "Failed to clone settings %s, ignoring: %m", *j
);
753 r
= clone_auxiliary_file(roothash
, new_name
, ".roothash");
754 if (r
< 0 && r
!= -ENOENT
)
755 log_debug_errno(r
, "Failed to clone root hash file %s, ignoring: %m", roothash
);
760 int image_read_only(Image
*i
, bool b
) {
761 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
;
765 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
768 /* Make sure we don't interfere with a running nspawn */
769 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
775 case IMAGE_SUBVOLUME
:
777 /* Note that we set the flag only on the top-level
778 * subvolume of the image. */
780 r
= btrfs_subvol_set_read_only(i
->path
, b
);
786 case IMAGE_DIRECTORY
:
787 /* For simple directory trees we cannot use the access
788 mode of the top-level directory, since it has an
789 effect on the container itself. However, we can
790 use the "immutable" flag, to at least make the
791 top-level directory read-only. It's not as good as
792 a read-only subvolume, but at least something, and
793 we can read the value back. */
795 r
= chattr_path(i
->path
, b
? FS_IMMUTABLE_FL
: 0, FS_IMMUTABLE_FL
);
804 if (stat(i
->path
, &st
) < 0)
807 if (chmod(i
->path
, (st
.st_mode
& 0444) | (b
? 0000 : 0200)) < 0)
810 /* If the image is now read-only, it's a good time to
811 * defrag it, given that no write patterns will
812 * fragment it again. */
814 (void) btrfs_defrag(i
->path
);
819 _cleanup_close_
int fd
= -1;
823 fd
= open(i
->path
, O_CLOEXEC
|O_RDONLY
|O_NONBLOCK
|O_NOCTTY
);
827 if (fstat(fd
, &st
) < 0)
829 if (!S_ISBLK(st
.st_mode
))
832 if (ioctl(fd
, BLKROSET
, &state
) < 0)
845 int image_path_lock(const char *path
, int operation
, LockFile
*global
, LockFile
*local
) {
846 _cleanup_free_
char *p
= NULL
;
847 LockFile t
= LOCK_FILE_INIT
;
855 /* Locks an image path. This actually creates two locks: one
856 * "local" one, next to the image path itself, which might be
857 * shared via NFS. And another "global" one, in /run, that
858 * uses the device/inode number. This has the benefit that we
859 * can even lock a tree that is a mount point, correctly. */
861 if (!path_is_absolute(path
))
864 if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) {
865 *local
= *global
= (LockFile
) LOCK_FILE_INIT
;
869 if (path_equal(path
, "/"))
872 if (stat(path
, &st
) >= 0) {
873 if (S_ISBLK(st
.st_mode
))
874 r
= asprintf(&p
, "/run/systemd/nspawn/locks/block-%u:%u", major(st
.st_rdev
), minor(st
.st_rdev
));
875 else if (S_ISDIR(st
.st_mode
) || S_ISREG(st
.st_mode
))
876 r
= asprintf(&p
, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st
.st_dev
, (unsigned long) st
.st_ino
);
884 /* For block devices we don't need the "local" lock, as the major/minor lock above should be sufficient, since
885 * block devices are device local anyway. */
886 if (!path_startswith(path
, "/dev")) {
887 r
= make_lock_file_for(path
, operation
, &t
);
893 mkdir_p("/run/systemd/nspawn/locks", 0700);
895 r
= make_lock_file(p
, operation
, global
);
897 release_lock_file(&t
);
901 *global
= (LockFile
) LOCK_FILE_INIT
;
907 int image_set_limit(Image
*i
, uint64_t referenced_max
) {
910 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
913 if (i
->type
!= IMAGE_SUBVOLUME
)
916 /* We set the quota both for the subvolume as well as for the
917 * subtree. The latter is mostly for historical reasons, since
918 * we didn't use to have a concept of subtree quota, and hence
919 * only modified the subvolume quota. */
921 (void) btrfs_qgroup_set_limit(i
->path
, 0, referenced_max
);
922 (void) btrfs_subvol_auto_qgroup(i
->path
, 0, true);
923 return btrfs_subvol_set_subtree_quota_limit(i
->path
, 0, referenced_max
);
926 int image_name_lock(const char *name
, int operation
, LockFile
*ret
) {
932 /* Locks an image name, regardless of the precise path used. */
934 if (!image_name_is_valid(name
))
937 if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) {
938 *ret
= (LockFile
) LOCK_FILE_INIT
;
942 if (streq(name
, ".host"))
945 mkdir_p("/run/systemd/nspawn/locks", 0700);
946 p
= strjoina("/run/systemd/nspawn/locks/name-", name
);
948 return make_lock_file(p
, operation
, ret
);
951 bool image_name_is_valid(const char *s
) {
952 if (!filename_is_valid(s
))
955 if (string_has_cc(s
, NULL
))
958 if (!utf8_is_valid(s
))
961 /* Temporary files for atomically creating new files */
962 if (startswith(s
, ".#"))
968 static const char* const image_type_table
[_IMAGE_TYPE_MAX
] = {
969 [IMAGE_DIRECTORY
] = "directory",
970 [IMAGE_SUBVOLUME
] = "subvolume",
972 [IMAGE_BLOCK
] = "block",
975 DEFINE_STRING_TABLE_LOOKUP(image_type
, ImageType
);