1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2013 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
32 #include "alloc-util.h"
33 #include "btrfs-util.h"
34 #include "chattr-util.h"
36 #include "dirent-util.h"
41 #include "lockfile-util.h"
43 #include "machine-image.h"
46 #include "path-util.h"
48 #include "string-table.h"
49 #include "string-util.h"
51 #include "time-util.h"
54 #include "xattr-util.h"
56 static const char image_search_path
[] =
58 "/var/lib/container\0" /* legacy */
59 "/usr/local/lib/machines\0"
60 "/usr/lib/machines\0";
62 Image
*image_unref(Image
*i
) {
71 static char **image_settings_path(Image
*image
) {
72 _cleanup_strv_free_
char **l
= NULL
;
83 fn
= strjoina(image
->name
, ".nspawn");
85 FOREACH_STRING(s
, "/etc/systemd/nspawn/", "/run/systemd/nspawn/") {
86 l
[i
] = strappend(s
, fn
);
93 l
[i
] = file_in_same_dir(image
->path
, fn
);
103 static char *image_roothash_path(Image
*image
) {
108 fn
= strjoina(image
->name
, ".roothash");
110 return file_in_same_dir(image
->path
, fn
);
113 static int image_new(
117 const char *filename
,
123 _cleanup_(image_unrefp
) Image
*i
= NULL
;
126 assert(t
< _IMAGE_TYPE_MAX
);
136 i
->read_only
= read_only
;
139 i
->usage
= i
->usage_exclusive
= (uint64_t) -1;
140 i
->limit
= i
->limit_exclusive
= (uint64_t) -1;
142 i
->name
= strdup(pretty
);
147 i
->path
= strjoin(path
, "/", filename
);
149 i
->path
= strdup(filename
);
154 path_kill_slashes(i
->path
);
162 static int image_make(
166 const char *filename
,
175 /* We explicitly *do* follow symlinks here, since we want to allow symlinking trees, raw files and block
176 * devices into /var/lib/machines/, and treat them normally. */
178 if (fstatat(dfd
, filename
, &st
, 0) < 0)
182 (path
&& path_startswith(path
, "/usr")) ||
183 (faccessat(dfd
, filename
, W_OK
, AT_EACCESS
) < 0 && errno
== EROFS
);
185 if (S_ISDIR(st
.st_mode
)) {
186 _cleanup_close_
int fd
= -1;
187 unsigned file_attr
= 0;
195 fd
= openat(dfd
, filename
, O_CLOEXEC
|O_NOCTTY
|O_DIRECTORY
);
199 /* btrfs subvolumes have inode 256 */
200 if (st
.st_ino
== 256) {
202 r
= btrfs_is_filesystem(fd
);
206 BtrfsSubvolInfo info
;
208 /* It's a btrfs subvolume */
210 r
= btrfs_subvol_get_info_fd(fd
, 0, &info
);
214 r
= image_new(IMAGE_SUBVOLUME
,
218 info
.read_only
|| read_only
,
225 if (btrfs_quota_scan_ongoing(fd
) == 0) {
226 BtrfsQuotaInfo quota
;
228 r
= btrfs_subvol_get_subtree_quota_fd(fd
, 0, "a
);
230 (*ret
)->usage
= quota
.referenced
;
231 (*ret
)->usage_exclusive
= quota
.exclusive
;
233 (*ret
)->limit
= quota
.referenced_max
;
234 (*ret
)->limit_exclusive
= quota
.exclusive_max
;
242 /* If the IMMUTABLE bit is set, we consider the
243 * directory read-only. Since the ioctl is not
244 * supported everywhere we ignore failures. */
245 (void) read_attr_fd(fd
, &file_attr
);
247 /* It's just a normal directory. */
248 r
= image_new(IMAGE_DIRECTORY
,
252 read_only
|| (file_attr
& FS_IMMUTABLE_FL
),
261 } else if (S_ISREG(st
.st_mode
) && endswith(filename
, ".raw")) {
264 /* It's a RAW disk image */
269 fd_getcrtime_at(dfd
, filename
, &crtime
, 0);
272 pretty
= strndupa(filename
, strlen(filename
) - 4);
274 r
= image_new(IMAGE_RAW
,
278 !(st
.st_mode
& 0222) || read_only
,
280 timespec_load(&st
.st_mtim
),
285 (*ret
)->usage
= (*ret
)->usage_exclusive
= st
.st_blocks
* 512;
286 (*ret
)->limit
= (*ret
)->limit_exclusive
= st
.st_size
;
290 } else if (S_ISBLK(st
.st_mode
)) {
291 _cleanup_close_
int block_fd
= -1;
292 uint64_t size
= UINT64_MAX
;
302 block_fd
= openat(dfd
, filename
, O_RDONLY
|O_NONBLOCK
|O_CLOEXEC
|O_NOCTTY
);
304 log_debug_errno(errno
, "Failed to open block device %s/%s, ignoring: %m", path
, filename
);
306 if (fstat(block_fd
, &st
) < 0)
308 if (!S_ISBLK(st
.st_mode
)) /* Verify that what we opened is actually what we think it is */
314 if (ioctl(block_fd
, BLKROGET
, &state
) < 0)
315 log_debug_errno(errno
, "Failed to issue BLKROGET on device %s/%s, ignoring: %m", path
, filename
);
320 if (ioctl(block_fd
, BLKGETSIZE64
, &size
) < 0)
321 log_debug_errno(errno
, "Failed to issue BLKFLSBUF on device %s/%s, ignoring: %m", path
, filename
);
323 block_fd
= safe_close(block_fd
);
326 r
= image_new(IMAGE_BLOCK
,
330 !(st
.st_mode
& 0222) || read_only
,
337 if (size
!= 0 && size
!= UINT64_MAX
)
338 (*ret
)->usage
= (*ret
)->usage_exclusive
= (*ret
)->limit
= (*ret
)->limit_exclusive
= size
;
346 int image_find(const char *name
, Image
**ret
) {
352 /* There are no images with invalid names */
353 if (!image_name_is_valid(name
))
356 NULSTR_FOREACH(path
, image_search_path
) {
357 _cleanup_closedir_
DIR *d
= NULL
;
367 r
= image_make(NULL
, dirfd(d
), path
, name
, ret
);
368 if (IN_SET(r
, 0, -ENOENT
)) {
369 _cleanup_free_
char *raw
= NULL
;
371 raw
= strappend(name
, ".raw");
375 r
= image_make(NULL
, dirfd(d
), path
, raw
, ret
);
376 if (IN_SET(r
, 0, -ENOENT
))
385 if (streq(name
, ".host"))
386 return image_make(".host", AT_FDCWD
, NULL
, "/", ret
);
391 int image_discover(Hashmap
*h
) {
397 NULSTR_FOREACH(path
, image_search_path
) {
398 _cleanup_closedir_
DIR *d
= NULL
;
409 FOREACH_DIRENT_ALL(de
, d
, return -errno
) {
410 _cleanup_(image_unrefp
) Image
*image
= NULL
;
412 if (!image_name_is_valid(de
->d_name
))
415 if (hashmap_contains(h
, de
->d_name
))
418 r
= image_make(NULL
, dirfd(d
), path
, de
->d_name
, &image
);
419 if (IN_SET(r
, 0, -ENOENT
))
424 r
= hashmap_put(h
, image
->name
, image
);
432 if (!hashmap_contains(h
, ".host")) {
433 _cleanup_(image_unrefp
) Image
*image
= NULL
;
435 r
= image_make(".host", AT_FDCWD
, NULL
, "/", &image
);
439 r
= hashmap_put(h
, image
->name
, image
);
450 void image_hashmap_free(Hashmap
*map
) {
453 while ((i
= hashmap_steal_first(map
)))
459 int image_remove(Image
*i
) {
460 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
;
461 _cleanup_strv_free_
char **settings
= NULL
;
462 _cleanup_free_
char *roothash
= NULL
;
468 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
471 settings
= image_settings_path(i
);
475 roothash
= image_roothash_path(i
);
479 /* Make sure we don't interfere with a running nspawn */
480 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
486 case IMAGE_SUBVOLUME
:
488 /* Let's unlink first, maybe it is a symlink? If that works we are happy. Otherwise, let's get out the
490 if (unlink(i
->path
) < 0) {
491 r
= btrfs_subvol_remove(i
->path
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
498 case IMAGE_DIRECTORY
:
499 /* Allow deletion of read-only directories */
500 (void) chattr_path(i
->path
, 0, FS_IMMUTABLE_FL
);
501 r
= rm_rf(i
->path
, REMOVE_ROOT
|REMOVE_PHYSICAL
|REMOVE_SUBVOLUME
);
509 /* If this is inside of /dev, then it's a real block device, hence let's not touch the device node
510 * itself (but let's remove the stuff stored alongside it). If it's anywhere else, let's try to unlink
511 * the thing (it's most likely a symlink after all). */
513 if (path_startswith(i
->path
, "/dev"))
518 if (unlink(i
->path
) < 0)
526 STRV_FOREACH(j
, settings
) {
527 if (unlink(*j
) < 0 && errno
!= ENOENT
)
528 log_debug_errno(errno
, "Failed to unlink %s, ignoring: %m", *j
);
531 if (unlink(roothash
) < 0 && errno
!= ENOENT
)
532 log_debug_errno(errno
, "Failed to unlink %s, ignoring: %m", roothash
);
537 static int rename_auxiliary_file(const char *path
, const char *new_name
, const char *suffix
) {
538 _cleanup_free_
char *rs
= NULL
;
541 fn
= strjoina(new_name
, suffix
);
543 rs
= file_in_same_dir(path
, fn
);
547 return rename_noreplace(AT_FDCWD
, path
, AT_FDCWD
, rs
);
550 int image_rename(Image
*i
, const char *new_name
) {
551 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
, name_lock
= LOCK_FILE_INIT
;
552 _cleanup_free_
char *new_path
= NULL
, *nn
= NULL
, *roothash
= NULL
;
553 _cleanup_strv_free_
char **settings
= NULL
;
554 unsigned file_attr
= 0;
560 if (!image_name_is_valid(new_name
))
563 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
566 settings
= image_settings_path(i
);
570 roothash
= image_roothash_path(i
);
574 /* Make sure we don't interfere with a running nspawn */
575 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
579 /* Make sure nobody takes the new name, between the time we
580 * checked it is currently unused in all search paths, and the
581 * time we take possession of it */
582 r
= image_name_lock(new_name
, LOCK_EX
|LOCK_NB
, &name_lock
);
586 r
= image_find(new_name
, NULL
);
594 case IMAGE_DIRECTORY
:
595 /* Turn off the immutable bit while we rename the image, so that we can rename it */
596 (void) read_attr_path(i
->path
, &file_attr
);
598 if (file_attr
& FS_IMMUTABLE_FL
)
599 (void) chattr_path(i
->path
, 0, FS_IMMUTABLE_FL
);
602 case IMAGE_SUBVOLUME
:
603 new_path
= file_in_same_dir(i
->path
, new_name
);
608 /* Refuse renaming raw block devices in /dev, the names are picked by udev after all. */
609 if (path_startswith(i
->path
, "/dev"))
612 new_path
= file_in_same_dir(i
->path
, new_name
);
618 fn
= strjoina(new_name
, ".raw");
619 new_path
= file_in_same_dir(i
->path
, fn
);
630 nn
= strdup(new_name
);
634 r
= rename_noreplace(AT_FDCWD
, i
->path
, AT_FDCWD
, new_path
);
638 /* Restore the immutable bit, if it was set before */
639 if (file_attr
& FS_IMMUTABLE_FL
)
640 (void) chattr_path(new_path
, FS_IMMUTABLE_FL
, FS_IMMUTABLE_FL
);
650 STRV_FOREACH(j
, settings
) {
651 r
= rename_auxiliary_file(*j
, new_name
, ".nspawn");
652 if (r
< 0 && r
!= -ENOENT
)
653 log_debug_errno(r
, "Failed to rename settings file %s, ignoring: %m", *j
);
656 r
= rename_auxiliary_file(roothash
, new_name
, ".roothash");
657 if (r
< 0 && r
!= -ENOENT
)
658 log_debug_errno(r
, "Failed to rename roothash file %s, ignoring: %m", roothash
);
663 static int clone_auxiliary_file(const char *path
, const char *new_name
, const char *suffix
) {
664 _cleanup_free_
char *rs
= NULL
;
667 fn
= strjoina(new_name
, suffix
);
669 rs
= file_in_same_dir(path
, fn
);
673 return copy_file_atomic(path
, rs
, 0664, 0, COPY_REFLINK
);
676 int image_clone(Image
*i
, const char *new_name
, bool read_only
) {
677 _cleanup_release_lock_file_ LockFile name_lock
= LOCK_FILE_INIT
;
678 _cleanup_strv_free_
char **settings
= NULL
;
679 _cleanup_free_
char *roothash
= NULL
;
680 const char *new_path
;
686 if (!image_name_is_valid(new_name
))
689 settings
= image_settings_path(i
);
693 roothash
= image_roothash_path(i
);
697 /* Make sure nobody takes the new name, between the time we
698 * checked it is currently unused in all search paths, and the
699 * time we take possession of it */
700 r
= image_name_lock(new_name
, LOCK_EX
|LOCK_NB
, &name_lock
);
704 r
= image_find(new_name
, NULL
);
712 case IMAGE_SUBVOLUME
:
713 case IMAGE_DIRECTORY
:
714 /* If we can we'll always try to create a new btrfs subvolume here, even if the source is a plain
717 new_path
= strjoina("/var/lib/machines/", new_name
);
719 r
= btrfs_subvol_snapshot(i
->path
, new_path
,
720 (read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) |
721 BTRFS_SNAPSHOT_FALLBACK_COPY
|
722 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
723 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE
|
724 BTRFS_SNAPSHOT_RECURSIVE
|
725 BTRFS_SNAPSHOT_QUOTA
);
727 /* Enable "subtree" quotas for the copy, if we didn't copy any quota from the source. */
728 (void) btrfs_subvol_auto_qgroup(new_path
, 0, true);
733 new_path
= strjoina("/var/lib/machines/", new_name
, ".raw");
735 r
= copy_file_atomic(i
->path
, new_path
, read_only
? 0444 : 0644, FS_NOCOW_FL
, COPY_REFLINK
);
746 STRV_FOREACH(j
, settings
) {
747 r
= clone_auxiliary_file(*j
, new_name
, ".nspawn");
748 if (r
< 0 && r
!= -ENOENT
)
749 log_debug_errno(r
, "Failed to clone settings %s, ignoring: %m", *j
);
752 r
= clone_auxiliary_file(roothash
, new_name
, ".roothash");
753 if (r
< 0 && r
!= -ENOENT
)
754 log_debug_errno(r
, "Failed to clone root hash file %s, ignoring: %m", roothash
);
759 int image_read_only(Image
*i
, bool b
) {
760 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
;
764 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
767 /* Make sure we don't interfere with a running nspawn */
768 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
774 case IMAGE_SUBVOLUME
:
776 /* Note that we set the flag only on the top-level
777 * subvolume of the image. */
779 r
= btrfs_subvol_set_read_only(i
->path
, b
);
785 case IMAGE_DIRECTORY
:
786 /* For simple directory trees we cannot use the access
787 mode of the top-level directory, since it has an
788 effect on the container itself. However, we can
789 use the "immutable" flag, to at least make the
790 top-level directory read-only. It's not as good as
791 a read-only subvolume, but at least something, and
792 we can read the value back. */
794 r
= chattr_path(i
->path
, b
? FS_IMMUTABLE_FL
: 0, FS_IMMUTABLE_FL
);
803 if (stat(i
->path
, &st
) < 0)
806 if (chmod(i
->path
, (st
.st_mode
& 0444) | (b
? 0000 : 0200)) < 0)
809 /* If the image is now read-only, it's a good time to
810 * defrag it, given that no write patterns will
811 * fragment it again. */
813 (void) btrfs_defrag(i
->path
);
818 _cleanup_close_
int fd
= -1;
822 fd
= open(i
->path
, O_CLOEXEC
|O_RDONLY
|O_NONBLOCK
|O_NOCTTY
);
826 if (fstat(fd
, &st
) < 0)
828 if (!S_ISBLK(st
.st_mode
))
831 if (ioctl(fd
, BLKROSET
, &state
) < 0)
844 int image_path_lock(const char *path
, int operation
, LockFile
*global
, LockFile
*local
) {
845 _cleanup_free_
char *p
= NULL
;
846 LockFile t
= LOCK_FILE_INIT
;
854 /* Locks an image path. This actually creates two locks: one
855 * "local" one, next to the image path itself, which might be
856 * shared via NFS. And another "global" one, in /run, that
857 * uses the device/inode number. This has the benefit that we
858 * can even lock a tree that is a mount point, correctly. */
860 if (!path_is_absolute(path
))
863 if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) {
864 *local
= *global
= (LockFile
) LOCK_FILE_INIT
;
868 if (path_equal(path
, "/"))
871 if (stat(path
, &st
) >= 0) {
872 if (S_ISBLK(st
.st_mode
))
873 r
= asprintf(&p
, "/run/systemd/nspawn/locks/block-%u:%u", major(st
.st_rdev
), minor(st
.st_rdev
));
874 else if (S_ISDIR(st
.st_mode
) || S_ISREG(st
.st_mode
))
875 r
= asprintf(&p
, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st
.st_dev
, (unsigned long) st
.st_ino
);
883 /* For block devices we don't need the "local" lock, as the major/minor lock above should be sufficient, since
884 * block devices are device local anyway. */
885 if (!path_startswith(path
, "/dev")) {
886 r
= make_lock_file_for(path
, operation
, &t
);
892 mkdir_p("/run/systemd/nspawn/locks", 0700);
894 r
= make_lock_file(p
, operation
, global
);
896 release_lock_file(&t
);
900 *global
= (LockFile
) LOCK_FILE_INIT
;
906 int image_set_limit(Image
*i
, uint64_t referenced_max
) {
909 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
912 if (i
->type
!= IMAGE_SUBVOLUME
)
915 /* We set the quota both for the subvolume as well as for the
916 * subtree. The latter is mostly for historical reasons, since
917 * we didn't use to have a concept of subtree quota, and hence
918 * only modified the subvolume quota. */
920 (void) btrfs_qgroup_set_limit(i
->path
, 0, referenced_max
);
921 (void) btrfs_subvol_auto_qgroup(i
->path
, 0, true);
922 return btrfs_subvol_set_subtree_quota_limit(i
->path
, 0, referenced_max
);
925 int image_name_lock(const char *name
, int operation
, LockFile
*ret
) {
931 /* Locks an image name, regardless of the precise path used. */
933 if (!image_name_is_valid(name
))
936 if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) {
937 *ret
= (LockFile
) LOCK_FILE_INIT
;
941 if (streq(name
, ".host"))
944 mkdir_p("/run/systemd/nspawn/locks", 0700);
945 p
= strjoina("/run/systemd/nspawn/locks/name-", name
);
947 return make_lock_file(p
, operation
, ret
);
950 bool image_name_is_valid(const char *s
) {
951 if (!filename_is_valid(s
))
954 if (string_has_cc(s
, NULL
))
957 if (!utf8_is_valid(s
))
960 /* Temporary files for atomically creating new files */
961 if (startswith(s
, ".#"))
967 static const char* const image_type_table
[_IMAGE_TYPE_MAX
] = {
968 [IMAGE_DIRECTORY
] = "directory",
969 [IMAGE_SUBVOLUME
] = "subvolume",
971 [IMAGE_BLOCK
] = "block",
974 DEFINE_STRING_TABLE_LOOKUP(image_type
, ImageType
);