2 This file is part of systemd.
4 Copyright 2013 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
30 #include "alloc-util.h"
31 #include "btrfs-util.h"
32 #include "chattr-util.h"
34 #include "dirent-util.h"
38 #include "lockfile-util.h"
41 #include "machine-image.h"
43 #include "path-util.h"
45 #include "string-table.h"
46 #include "string-util.h"
48 #include "time-util.h"
51 #include "xattr-util.h"
53 static const char image_search_path
[] =
55 "/var/lib/container\0" /* legacy */
56 "/usr/local/lib/machines\0"
57 "/usr/lib/machines\0";
59 Image
*image_unref(Image
*i
) {
68 static char **image_settings_path(Image
*image
) {
69 _cleanup_strv_free_
char **l
= NULL
;
80 fn
= strjoina(image
->name
, ".nspawn");
82 FOREACH_STRING(s
, "/etc/systemd/nspawn/", "/run/systemd/nspawn/") {
83 l
[i
] = strappend(s
, fn
);
90 l
[i
] = file_in_same_dir(image
->path
, fn
);
100 static int image_new(
104 const char *filename
,
110 _cleanup_(image_unrefp
) Image
*i
= NULL
;
113 assert(t
< _IMAGE_TYPE_MAX
);
123 i
->read_only
= read_only
;
126 i
->usage
= i
->usage_exclusive
= (uint64_t) -1;
127 i
->limit
= i
->limit_exclusive
= (uint64_t) -1;
129 i
->name
= strdup(pretty
);
134 i
->path
= strjoin(path
, "/", filename
, NULL
);
136 i
->path
= strdup(filename
);
141 path_kill_slashes(i
->path
);
149 static int image_make(
153 const char *filename
,
162 /* We explicitly *do* follow symlinks here, since we want to
163 * allow symlinking trees into /var/lib/machines/, and treat
166 if (fstatat(dfd
, filename
, &st
, 0) < 0)
170 (path
&& path_startswith(path
, "/usr")) ||
171 (faccessat(dfd
, filename
, W_OK
, AT_EACCESS
) < 0 && errno
== EROFS
);
173 if (S_ISDIR(st
.st_mode
)) {
174 _cleanup_close_
int fd
= -1;
175 unsigned file_attr
= 0;
183 fd
= openat(dfd
, filename
, O_CLOEXEC
|O_NOCTTY
|O_DIRECTORY
);
187 /* btrfs subvolumes have inode 256 */
188 if (st
.st_ino
== 256) {
190 r
= btrfs_is_filesystem(fd
);
194 BtrfsSubvolInfo info
;
196 /* It's a btrfs subvolume */
198 r
= btrfs_subvol_get_info_fd(fd
, 0, &info
);
202 r
= image_new(IMAGE_SUBVOLUME
,
206 info
.read_only
|| read_only
,
213 if (btrfs_quota_scan_ongoing(fd
) == 0) {
214 BtrfsQuotaInfo quota
;
216 r
= btrfs_subvol_get_subtree_quota_fd(fd
, 0, "a
);
218 (*ret
)->usage
= quota
.referenced
;
219 (*ret
)->usage_exclusive
= quota
.exclusive
;
221 (*ret
)->limit
= quota
.referenced_max
;
222 (*ret
)->limit_exclusive
= quota
.exclusive_max
;
230 /* If the IMMUTABLE bit is set, we consider the
231 * directory read-only. Since the ioctl is not
232 * supported everywhere we ignore failures. */
233 (void) read_attr_fd(fd
, &file_attr
);
235 /* It's just a normal directory. */
236 r
= image_new(IMAGE_DIRECTORY
,
240 read_only
|| (file_attr
& FS_IMMUTABLE_FL
),
249 } else if (S_ISREG(st
.st_mode
) && endswith(filename
, ".raw")) {
252 /* It's a RAW disk image */
257 fd_getcrtime_at(dfd
, filename
, &crtime
, 0);
260 pretty
= strndupa(filename
, strlen(filename
) - 4);
262 r
= image_new(IMAGE_RAW
,
266 !(st
.st_mode
& 0222) || read_only
,
268 timespec_load(&st
.st_mtim
),
273 (*ret
)->usage
= (*ret
)->usage_exclusive
= st
.st_blocks
* 512;
274 (*ret
)->limit
= (*ret
)->limit_exclusive
= st
.st_size
;
282 int image_find(const char *name
, Image
**ret
) {
288 /* There are no images with invalid names */
289 if (!image_name_is_valid(name
))
292 NULSTR_FOREACH(path
, image_search_path
) {
293 _cleanup_closedir_
DIR *d
= NULL
;
303 r
= image_make(NULL
, dirfd(d
), path
, name
, ret
);
304 if (r
== 0 || r
== -ENOENT
) {
305 _cleanup_free_
char *raw
= NULL
;
307 raw
= strappend(name
, ".raw");
311 r
= image_make(NULL
, dirfd(d
), path
, raw
, ret
);
312 if (r
== 0 || r
== -ENOENT
)
321 if (streq(name
, ".host"))
322 return image_make(".host", AT_FDCWD
, NULL
, "/", ret
);
327 int image_discover(Hashmap
*h
) {
333 NULSTR_FOREACH(path
, image_search_path
) {
334 _cleanup_closedir_
DIR *d
= NULL
;
345 FOREACH_DIRENT_ALL(de
, d
, return -errno
) {
346 _cleanup_(image_unrefp
) Image
*image
= NULL
;
348 if (!image_name_is_valid(de
->d_name
))
351 if (hashmap_contains(h
, de
->d_name
))
354 r
= image_make(NULL
, dirfd(d
), path
, de
->d_name
, &image
);
355 if (r
== 0 || r
== -ENOENT
)
360 r
= hashmap_put(h
, image
->name
, image
);
368 if (!hashmap_contains(h
, ".host")) {
369 _cleanup_(image_unrefp
) Image
*image
= NULL
;
371 r
= image_make(".host", AT_FDCWD
, NULL
, "/", &image
);
375 r
= hashmap_put(h
, image
->name
, image
);
386 void image_hashmap_free(Hashmap
*map
) {
389 while ((i
= hashmap_steal_first(map
)))
395 int image_remove(Image
*i
) {
396 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
;
397 _cleanup_strv_free_
char **settings
= NULL
;
403 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
406 settings
= image_settings_path(i
);
410 /* Make sure we don't interfere with a running nspawn */
411 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
417 case IMAGE_SUBVOLUME
:
418 r
= btrfs_subvol_remove(i
->path
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
423 case IMAGE_DIRECTORY
:
424 /* Allow deletion of read-only directories */
425 (void) chattr_path(i
->path
, 0, FS_IMMUTABLE_FL
);
426 r
= rm_rf(i
->path
, REMOVE_ROOT
|REMOVE_PHYSICAL
|REMOVE_SUBVOLUME
);
433 if (unlink(i
->path
) < 0)
441 STRV_FOREACH(j
, settings
) {
442 if (unlink(*j
) < 0 && errno
!= ENOENT
)
443 log_debug_errno(errno
, "Failed to unlink %s, ignoring: %m", *j
);
449 static int rename_settings_file(const char *path
, const char *new_name
) {
450 _cleanup_free_
char *rs
= NULL
;
453 fn
= strjoina(new_name
, ".nspawn");
455 rs
= file_in_same_dir(path
, fn
);
459 return rename_noreplace(AT_FDCWD
, path
, AT_FDCWD
, rs
);
462 int image_rename(Image
*i
, const char *new_name
) {
463 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
, name_lock
= LOCK_FILE_INIT
;
464 _cleanup_free_
char *new_path
= NULL
, *nn
= NULL
;
465 _cleanup_strv_free_
char **settings
= NULL
;
466 unsigned file_attr
= 0;
472 if (!image_name_is_valid(new_name
))
475 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
478 settings
= image_settings_path(i
);
482 /* Make sure we don't interfere with a running nspawn */
483 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
487 /* Make sure nobody takes the new name, between the time we
488 * checked it is currently unused in all search paths, and the
489 * time we take possession of it */
490 r
= image_name_lock(new_name
, LOCK_EX
|LOCK_NB
, &name_lock
);
494 r
= image_find(new_name
, NULL
);
502 case IMAGE_DIRECTORY
:
503 /* Turn off the immutable bit while we rename the image, so that we can rename it */
504 (void) read_attr_path(i
->path
, &file_attr
);
506 if (file_attr
& FS_IMMUTABLE_FL
)
507 (void) chattr_path(i
->path
, 0, FS_IMMUTABLE_FL
);
511 case IMAGE_SUBVOLUME
:
512 new_path
= file_in_same_dir(i
->path
, new_name
);
518 fn
= strjoina(new_name
, ".raw");
519 new_path
= file_in_same_dir(i
->path
, fn
);
530 nn
= strdup(new_name
);
534 r
= rename_noreplace(AT_FDCWD
, i
->path
, AT_FDCWD
, new_path
);
538 /* Restore the immutable bit, if it was set before */
539 if (file_attr
& FS_IMMUTABLE_FL
)
540 (void) chattr_path(new_path
, FS_IMMUTABLE_FL
, FS_IMMUTABLE_FL
);
550 STRV_FOREACH(j
, settings
) {
551 r
= rename_settings_file(*j
, new_name
);
552 if (r
< 0 && r
!= -ENOENT
)
553 log_debug_errno(r
, "Failed to rename settings file %s, ignoring: %m", *j
);
559 static int clone_settings_file(const char *path
, const char *new_name
) {
560 _cleanup_free_
char *rs
= NULL
;
563 fn
= strjoina(new_name
, ".nspawn");
565 rs
= file_in_same_dir(path
, fn
);
569 return copy_file_atomic(path
, rs
, 0664, false, 0);
572 int image_clone(Image
*i
, const char *new_name
, bool read_only
) {
573 _cleanup_release_lock_file_ LockFile name_lock
= LOCK_FILE_INIT
;
574 _cleanup_strv_free_
char **settings
= NULL
;
575 const char *new_path
;
581 if (!image_name_is_valid(new_name
))
584 settings
= image_settings_path(i
);
588 /* Make sure nobody takes the new name, between the time we
589 * checked it is currently unused in all search paths, and the
590 * time we take possession of it */
591 r
= image_name_lock(new_name
, LOCK_EX
|LOCK_NB
, &name_lock
);
595 r
= image_find(new_name
, NULL
);
603 case IMAGE_SUBVOLUME
:
604 case IMAGE_DIRECTORY
:
605 /* If we can we'll always try to create a new btrfs subvolume here, even if the source is a plain
608 new_path
= strjoina("/var/lib/machines/", new_name
);
610 r
= btrfs_subvol_snapshot(i
->path
, new_path
, (read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
611 if (r
== -EOPNOTSUPP
) {
612 /* No btrfs snapshots supported, create a normal directory then. */
614 r
= copy_directory(i
->path
, new_path
, false);
616 (void) chattr_path(new_path
, read_only
? FS_IMMUTABLE_FL
: 0, FS_IMMUTABLE_FL
);
618 /* Enable "subtree" quotas for the copy, if we didn't copy any quota from the source. */
619 (void) btrfs_subvol_auto_qgroup(new_path
, 0, true);
624 new_path
= strjoina("/var/lib/machines/", new_name
, ".raw");
626 r
= copy_file_atomic(i
->path
, new_path
, read_only
? 0444 : 0644, false, FS_NOCOW_FL
);
636 STRV_FOREACH(j
, settings
) {
637 r
= clone_settings_file(*j
, new_name
);
638 if (r
< 0 && r
!= -ENOENT
)
639 log_debug_errno(r
, "Failed to clone settings %s, ignoring: %m", *j
);
645 int image_read_only(Image
*i
, bool b
) {
646 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
;
650 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
653 /* Make sure we don't interfere with a running nspawn */
654 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
660 case IMAGE_SUBVOLUME
:
662 /* Note that we set the flag only on the top-level
663 * subvolume of the image. */
665 r
= btrfs_subvol_set_read_only(i
->path
, b
);
671 case IMAGE_DIRECTORY
:
672 /* For simple directory trees we cannot use the access
673 mode of the top-level directory, since it has an
674 effect on the container itself. However, we can
675 use the "immutable" flag, to at least make the
676 top-level directory read-only. It's not as good as
677 a read-only subvolume, but at least something, and
678 we can read the value back.*/
680 r
= chattr_path(i
->path
, b
? FS_IMMUTABLE_FL
: 0, FS_IMMUTABLE_FL
);
689 if (stat(i
->path
, &st
) < 0)
692 if (chmod(i
->path
, (st
.st_mode
& 0444) | (b
? 0000 : 0200)) < 0)
695 /* If the image is now read-only, it's a good time to
696 * defrag it, given that no write patterns will
697 * fragment it again. */
699 (void) btrfs_defrag(i
->path
);
710 int image_path_lock(const char *path
, int operation
, LockFile
*global
, LockFile
*local
) {
711 _cleanup_free_
char *p
= NULL
;
712 LockFile t
= LOCK_FILE_INIT
;
720 /* Locks an image path. This actually creates two locks: one
721 * "local" one, next to the image path itself, which might be
722 * shared via NFS. And another "global" one, in /run, that
723 * uses the device/inode number. This has the benefit that we
724 * can even lock a tree that is a mount point, correctly. */
726 if (path_equal(path
, "/"))
729 if (!path_is_absolute(path
))
732 if (stat(path
, &st
) >= 0) {
733 if (asprintf(&p
, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st
.st_dev
, (unsigned long) st
.st_ino
) < 0)
737 r
= make_lock_file_for(path
, operation
, &t
);
742 mkdir_p("/run/systemd/nspawn/locks", 0700);
744 r
= make_lock_file(p
, operation
, global
);
746 release_lock_file(&t
);
755 int image_set_limit(Image
*i
, uint64_t referenced_max
) {
758 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
761 if (i
->type
!= IMAGE_SUBVOLUME
)
764 /* We set the quota both for the subvolume as well as for the
765 * subtree. The latter is mostly for historical reasons, since
766 * we didn't use to have a concept of subtree quota, and hence
767 * only modified the subvolume quota. */
769 (void) btrfs_qgroup_set_limit(i
->path
, 0, referenced_max
);
770 (void) btrfs_subvol_auto_qgroup(i
->path
, 0, true);
771 return btrfs_subvol_set_subtree_quota_limit(i
->path
, 0, referenced_max
);
774 int image_name_lock(const char *name
, int operation
, LockFile
*ret
) {
780 /* Locks an image name, regardless of the precise path used. */
782 if (!image_name_is_valid(name
))
785 if (streq(name
, ".host"))
788 mkdir_p("/run/systemd/nspawn/locks", 0700);
789 p
= strjoina("/run/systemd/nspawn/locks/name-", name
);
791 return make_lock_file(p
, operation
, ret
);
794 bool image_name_is_valid(const char *s
) {
795 if (!filename_is_valid(s
))
798 if (string_has_cc(s
, NULL
))
801 if (!utf8_is_valid(s
))
804 /* Temporary files for atomically creating new files */
805 if (startswith(s
, ".#"))
811 static const char* const image_type_table
[_IMAGE_TYPE_MAX
] = {
812 [IMAGE_DIRECTORY
] = "directory",
813 [IMAGE_SUBVOLUME
] = "subvolume",
817 DEFINE_STRING_TABLE_LOOKUP(image_type
, ImageType
);