2 This file is part of systemd.
4 Copyright 2013 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
31 #include "alloc-util.h"
32 #include "btrfs-util.h"
33 #include "chattr-util.h"
35 #include "dirent-util.h"
40 #include "lockfile-util.h"
42 #include "machine-image.h"
45 #include "path-util.h"
47 #include "string-table.h"
48 #include "string-util.h"
50 #include "time-util.h"
53 #include "xattr-util.h"
55 static const char image_search_path
[] =
57 "/var/lib/container\0" /* legacy */
58 "/usr/local/lib/machines\0"
59 "/usr/lib/machines\0";
61 Image
*image_unref(Image
*i
) {
70 static char **image_settings_path(Image
*image
) {
71 _cleanup_strv_free_
char **l
= NULL
;
82 fn
= strjoina(image
->name
, ".nspawn");
84 FOREACH_STRING(s
, "/etc/systemd/nspawn/", "/run/systemd/nspawn/") {
85 l
[i
] = strappend(s
, fn
);
92 l
[i
] = file_in_same_dir(image
->path
, fn
);
102 static int image_new(
106 const char *filename
,
112 _cleanup_(image_unrefp
) Image
*i
= NULL
;
115 assert(t
< _IMAGE_TYPE_MAX
);
125 i
->read_only
= read_only
;
128 i
->usage
= i
->usage_exclusive
= (uint64_t) -1;
129 i
->limit
= i
->limit_exclusive
= (uint64_t) -1;
131 i
->name
= strdup(pretty
);
136 i
->path
= strjoin(path
, "/", filename
);
138 i
->path
= strdup(filename
);
143 path_kill_slashes(i
->path
);
151 static int image_make(
155 const char *filename
,
164 /* We explicitly *do* follow symlinks here, since we want to
165 * allow symlinking trees into /var/lib/machines/, and treat
168 if (fstatat(dfd
, filename
, &st
, 0) < 0)
172 (path
&& path_startswith(path
, "/usr")) ||
173 (faccessat(dfd
, filename
, W_OK
, AT_EACCESS
) < 0 && errno
== EROFS
);
175 if (S_ISDIR(st
.st_mode
)) {
176 _cleanup_close_
int fd
= -1;
177 unsigned file_attr
= 0;
185 fd
= openat(dfd
, filename
, O_CLOEXEC
|O_NOCTTY
|O_DIRECTORY
);
189 /* btrfs subvolumes have inode 256 */
190 if (st
.st_ino
== 256) {
192 r
= btrfs_is_filesystem(fd
);
196 BtrfsSubvolInfo info
;
198 /* It's a btrfs subvolume */
200 r
= btrfs_subvol_get_info_fd(fd
, 0, &info
);
204 r
= image_new(IMAGE_SUBVOLUME
,
208 info
.read_only
|| read_only
,
215 if (btrfs_quota_scan_ongoing(fd
) == 0) {
216 BtrfsQuotaInfo quota
;
218 r
= btrfs_subvol_get_subtree_quota_fd(fd
, 0, "a
);
220 (*ret
)->usage
= quota
.referenced
;
221 (*ret
)->usage_exclusive
= quota
.exclusive
;
223 (*ret
)->limit
= quota
.referenced_max
;
224 (*ret
)->limit_exclusive
= quota
.exclusive_max
;
232 /* If the IMMUTABLE bit is set, we consider the
233 * directory read-only. Since the ioctl is not
234 * supported everywhere we ignore failures. */
235 (void) read_attr_fd(fd
, &file_attr
);
237 /* It's just a normal directory. */
238 r
= image_new(IMAGE_DIRECTORY
,
242 read_only
|| (file_attr
& FS_IMMUTABLE_FL
),
251 } else if (S_ISREG(st
.st_mode
) && endswith(filename
, ".raw")) {
254 /* It's a RAW disk image */
259 fd_getcrtime_at(dfd
, filename
, &crtime
, 0);
262 pretty
= strndupa(filename
, strlen(filename
) - 4);
264 r
= image_new(IMAGE_RAW
,
268 !(st
.st_mode
& 0222) || read_only
,
270 timespec_load(&st
.st_mtim
),
275 (*ret
)->usage
= (*ret
)->usage_exclusive
= st
.st_blocks
* 512;
276 (*ret
)->limit
= (*ret
)->limit_exclusive
= st
.st_size
;
284 int image_find(const char *name
, Image
**ret
) {
290 /* There are no images with invalid names */
291 if (!image_name_is_valid(name
))
294 NULSTR_FOREACH(path
, image_search_path
) {
295 _cleanup_closedir_
DIR *d
= NULL
;
305 r
= image_make(NULL
, dirfd(d
), path
, name
, ret
);
306 if (r
== 0 || r
== -ENOENT
) {
307 _cleanup_free_
char *raw
= NULL
;
309 raw
= strappend(name
, ".raw");
313 r
= image_make(NULL
, dirfd(d
), path
, raw
, ret
);
314 if (r
== 0 || r
== -ENOENT
)
323 if (streq(name
, ".host"))
324 return image_make(".host", AT_FDCWD
, NULL
, "/", ret
);
329 int image_discover(Hashmap
*h
) {
335 NULSTR_FOREACH(path
, image_search_path
) {
336 _cleanup_closedir_
DIR *d
= NULL
;
347 FOREACH_DIRENT_ALL(de
, d
, return -errno
) {
348 _cleanup_(image_unrefp
) Image
*image
= NULL
;
350 if (!image_name_is_valid(de
->d_name
))
353 if (hashmap_contains(h
, de
->d_name
))
356 r
= image_make(NULL
, dirfd(d
), path
, de
->d_name
, &image
);
357 if (r
== 0 || r
== -ENOENT
)
362 r
= hashmap_put(h
, image
->name
, image
);
370 if (!hashmap_contains(h
, ".host")) {
371 _cleanup_(image_unrefp
) Image
*image
= NULL
;
373 r
= image_make(".host", AT_FDCWD
, NULL
, "/", &image
);
377 r
= hashmap_put(h
, image
->name
, image
);
388 void image_hashmap_free(Hashmap
*map
) {
391 while ((i
= hashmap_steal_first(map
)))
397 int image_remove(Image
*i
) {
398 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
;
399 _cleanup_strv_free_
char **settings
= NULL
;
405 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
408 settings
= image_settings_path(i
);
412 /* Make sure we don't interfere with a running nspawn */
413 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
419 case IMAGE_SUBVOLUME
:
420 r
= btrfs_subvol_remove(i
->path
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
425 case IMAGE_DIRECTORY
:
426 /* Allow deletion of read-only directories */
427 (void) chattr_path(i
->path
, 0, FS_IMMUTABLE_FL
);
428 r
= rm_rf(i
->path
, REMOVE_ROOT
|REMOVE_PHYSICAL
|REMOVE_SUBVOLUME
);
435 if (unlink(i
->path
) < 0)
443 STRV_FOREACH(j
, settings
) {
444 if (unlink(*j
) < 0 && errno
!= ENOENT
)
445 log_debug_errno(errno
, "Failed to unlink %s, ignoring: %m", *j
);
451 static int rename_settings_file(const char *path
, const char *new_name
) {
452 _cleanup_free_
char *rs
= NULL
;
455 fn
= strjoina(new_name
, ".nspawn");
457 rs
= file_in_same_dir(path
, fn
);
461 return rename_noreplace(AT_FDCWD
, path
, AT_FDCWD
, rs
);
464 int image_rename(Image
*i
, const char *new_name
) {
465 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
, name_lock
= LOCK_FILE_INIT
;
466 _cleanup_free_
char *new_path
= NULL
, *nn
= NULL
;
467 _cleanup_strv_free_
char **settings
= NULL
;
468 unsigned file_attr
= 0;
474 if (!image_name_is_valid(new_name
))
477 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
480 settings
= image_settings_path(i
);
484 /* Make sure we don't interfere with a running nspawn */
485 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
489 /* Make sure nobody takes the new name, between the time we
490 * checked it is currently unused in all search paths, and the
491 * time we take possession of it */
492 r
= image_name_lock(new_name
, LOCK_EX
|LOCK_NB
, &name_lock
);
496 r
= image_find(new_name
, NULL
);
504 case IMAGE_DIRECTORY
:
505 /* Turn of the immutable bit while we rename the image, so that we can rename it */
506 (void) read_attr_path(i
->path
, &file_attr
);
508 if (file_attr
& FS_IMMUTABLE_FL
)
509 (void) chattr_path(i
->path
, 0, FS_IMMUTABLE_FL
);
513 case IMAGE_SUBVOLUME
:
514 new_path
= file_in_same_dir(i
->path
, new_name
);
520 fn
= strjoina(new_name
, ".raw");
521 new_path
= file_in_same_dir(i
->path
, fn
);
532 nn
= strdup(new_name
);
536 r
= rename_noreplace(AT_FDCWD
, i
->path
, AT_FDCWD
, new_path
);
540 /* Restore the immutable bit, if it was set before */
541 if (file_attr
& FS_IMMUTABLE_FL
)
542 (void) chattr_path(new_path
, FS_IMMUTABLE_FL
, FS_IMMUTABLE_FL
);
552 STRV_FOREACH(j
, settings
) {
553 r
= rename_settings_file(*j
, new_name
);
554 if (r
< 0 && r
!= -ENOENT
)
555 log_debug_errno(r
, "Failed to rename settings file %s, ignoring: %m", *j
);
561 static int clone_settings_file(const char *path
, const char *new_name
) {
562 _cleanup_free_
char *rs
= NULL
;
565 fn
= strjoina(new_name
, ".nspawn");
567 rs
= file_in_same_dir(path
, fn
);
571 return copy_file_atomic(path
, rs
, 0664, false, 0);
574 int image_clone(Image
*i
, const char *new_name
, bool read_only
) {
575 _cleanup_release_lock_file_ LockFile name_lock
= LOCK_FILE_INIT
;
576 _cleanup_strv_free_
char **settings
= NULL
;
577 const char *new_path
;
583 if (!image_name_is_valid(new_name
))
586 settings
= image_settings_path(i
);
590 /* Make sure nobody takes the new name, between the time we
591 * checked it is currently unused in all search paths, and the
592 * time we take possession of it */
593 r
= image_name_lock(new_name
, LOCK_EX
|LOCK_NB
, &name_lock
);
597 r
= image_find(new_name
, NULL
);
605 case IMAGE_SUBVOLUME
:
606 case IMAGE_DIRECTORY
:
607 /* If we can we'll always try to create a new btrfs subvolume here, even if the source is a plain
610 new_path
= strjoina("/var/lib/machines/", new_name
);
612 r
= btrfs_subvol_snapshot(i
->path
, new_path
,
613 (read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) |
614 BTRFS_SNAPSHOT_FALLBACK_COPY
|
615 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
616 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE
|
617 BTRFS_SNAPSHOT_RECURSIVE
|
618 BTRFS_SNAPSHOT_QUOTA
);
620 /* Enable "subtree" quotas for the copy, if we didn't copy any quota from the source. */
621 (void) btrfs_subvol_auto_qgroup(new_path
, 0, true);
626 new_path
= strjoina("/var/lib/machines/", new_name
, ".raw");
628 r
= copy_file_atomic(i
->path
, new_path
, read_only
? 0444 : 0644, false, FS_NOCOW_FL
);
638 STRV_FOREACH(j
, settings
) {
639 r
= clone_settings_file(*j
, new_name
);
640 if (r
< 0 && r
!= -ENOENT
)
641 log_debug_errno(r
, "Failed to clone settings %s, ignoring: %m", *j
);
647 int image_read_only(Image
*i
, bool b
) {
648 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
;
652 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
655 /* Make sure we don't interfere with a running nspawn */
656 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
662 case IMAGE_SUBVOLUME
:
664 /* Note that we set the flag only on the top-level
665 * subvolume of the image. */
667 r
= btrfs_subvol_set_read_only(i
->path
, b
);
673 case IMAGE_DIRECTORY
:
674 /* For simple directory trees we cannot use the access
675 mode of the top-level directory, since it has an
676 effect on the container itself. However, we can
677 use the "immutable" flag, to at least make the
678 top-level directory read-only. It's not as good as
679 a read-only subvolume, but at least something, and
680 we can read the value back.*/
682 r
= chattr_path(i
->path
, b
? FS_IMMUTABLE_FL
: 0, FS_IMMUTABLE_FL
);
691 if (stat(i
->path
, &st
) < 0)
694 if (chmod(i
->path
, (st
.st_mode
& 0444) | (b
? 0000 : 0200)) < 0)
697 /* If the image is now read-only, it's a good time to
698 * defrag it, given that no write patterns will
699 * fragment it again. */
701 (void) btrfs_defrag(i
->path
);
712 int image_path_lock(const char *path
, int operation
, LockFile
*global
, LockFile
*local
) {
713 _cleanup_free_
char *p
= NULL
;
714 LockFile t
= LOCK_FILE_INIT
;
722 /* Locks an image path. This actually creates two locks: one
723 * "local" one, next to the image path itself, which might be
724 * shared via NFS. And another "global" one, in /run, that
725 * uses the device/inode number. This has the benefit that we
726 * can even lock a tree that is a mount point, correctly. */
728 if (!path_is_absolute(path
))
731 if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) {
732 *local
= *global
= (LockFile
) LOCK_FILE_INIT
;
736 if (path_equal(path
, "/"))
739 if (stat(path
, &st
) >= 0) {
740 if (asprintf(&p
, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st
.st_dev
, (unsigned long) st
.st_ino
) < 0)
744 r
= make_lock_file_for(path
, operation
, &t
);
749 mkdir_p("/run/systemd/nspawn/locks", 0700);
751 r
= make_lock_file(p
, operation
, global
);
753 release_lock_file(&t
);
757 *global
= (LockFile
) LOCK_FILE_INIT
;
763 int image_set_limit(Image
*i
, uint64_t referenced_max
) {
766 if (IMAGE_IS_VENDOR(i
) || IMAGE_IS_HOST(i
))
769 if (i
->type
!= IMAGE_SUBVOLUME
)
772 /* We set the quota both for the subvolume as well as for the
773 * subtree. The latter is mostly for historical reasons, since
774 * we didn't use to have a concept of subtree quota, and hence
775 * only modified the subvolume quota. */
777 (void) btrfs_qgroup_set_limit(i
->path
, 0, referenced_max
);
778 (void) btrfs_subvol_auto_qgroup(i
->path
, 0, true);
779 return btrfs_subvol_set_subtree_quota_limit(i
->path
, 0, referenced_max
);
782 int image_name_lock(const char *name
, int operation
, LockFile
*ret
) {
788 /* Locks an image name, regardless of the precise path used. */
790 if (!image_name_is_valid(name
))
793 if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) {
794 *ret
= (LockFile
) LOCK_FILE_INIT
;
798 if (streq(name
, ".host"))
801 mkdir_p("/run/systemd/nspawn/locks", 0700);
802 p
= strjoina("/run/systemd/nspawn/locks/name-", name
);
804 return make_lock_file(p
, operation
, ret
);
807 bool image_name_is_valid(const char *s
) {
808 if (!filename_is_valid(s
))
811 if (string_has_cc(s
, NULL
))
814 if (!utf8_is_valid(s
))
817 /* Temporary files for atomically creating new files */
818 if (startswith(s
, ".#"))
824 static const char* const image_type_table
[_IMAGE_TYPE_MAX
] = {
825 [IMAGE_DIRECTORY
] = "directory",
826 [IMAGE_SUBVOLUME
] = "subvolume",
830 DEFINE_STRING_TABLE_LOOKUP(image_type
, ImageType
);