1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2013 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include <sys/statfs.h>
26 #include "btrfs-util.h"
27 #include "chattr-util.h"
29 #include "dirent-util.h"
32 #include "machine-image.h"
34 #include "path-util.h"
36 #include "string-table.h"
37 #include "string-util.h"
40 #include "xattr-util.h"
42 static const char image_search_path
[] =
44 "/var/lib/container\0" /* legacy */
45 "/usr/local/lib/machines\0"
46 "/usr/lib/machines\0";
48 Image
*image_unref(Image
*i
) {
58 static char **image_settings_path(Image
*image
) {
59 _cleanup_strv_free_
char **l
= NULL
;
70 fn
= strjoina(image
->name
, ".nspawn");
72 FOREACH_STRING(s
, "/etc/systemd/nspawn/", "/run/systemd/nspawn/") {
73 l
[i
] = strappend(s
, fn
);
80 l
[i
] = file_in_same_dir(image
->path
, fn
);
100 _cleanup_(image_unrefp
) Image
*i
= NULL
;
103 assert(t
< _IMAGE_TYPE_MAX
);
113 i
->read_only
= read_only
;
116 i
->usage
= i
->usage_exclusive
= (uint64_t) -1;
117 i
->limit
= i
->limit_exclusive
= (uint64_t) -1;
119 i
->name
= strdup(pretty
);
124 i
->path
= strjoin(path
, "/", filename
, NULL
);
126 i
->path
= strdup(filename
);
131 path_kill_slashes(i
->path
);
139 static int image_make(
143 const char *filename
,
152 /* We explicitly *do* follow symlinks here, since we want to
153 * allow symlinking trees into /var/lib/machines/, and treat
156 if (fstatat(dfd
, filename
, &st
, 0) < 0)
160 (path
&& path_startswith(path
, "/usr")) ||
161 (faccessat(dfd
, filename
, W_OK
, AT_EACCESS
) < 0 && errno
== EROFS
);
163 if (S_ISDIR(st
.st_mode
)) {
164 _cleanup_close_
int fd
= -1;
165 unsigned file_attr
= 0;
173 fd
= openat(dfd
, filename
, O_CLOEXEC
|O_NOCTTY
|O_DIRECTORY
);
177 /* btrfs subvolumes have inode 256 */
178 if (st
.st_ino
== 256) {
180 r
= btrfs_is_filesystem(fd
);
184 BtrfsSubvolInfo info
;
186 /* It's a btrfs subvolume */
188 r
= btrfs_subvol_get_info_fd(fd
, 0, &info
);
192 r
= image_new(IMAGE_SUBVOLUME
,
196 info
.read_only
|| read_only
,
203 if (btrfs_quota_scan_ongoing(fd
) == 0) {
204 BtrfsQuotaInfo quota
;
206 r
= btrfs_subvol_get_subtree_quota_fd(fd
, 0, "a
);
208 (*ret
)->usage
= quota
.referenced
;
209 (*ret
)->usage_exclusive
= quota
.exclusive
;
211 (*ret
)->limit
= quota
.referenced_max
;
212 (*ret
)->limit_exclusive
= quota
.exclusive_max
;
220 /* If the IMMUTABLE bit is set, we consider the
221 * directory read-only. Since the ioctl is not
222 * supported everywhere we ignore failures. */
223 (void) read_attr_fd(fd
, &file_attr
);
225 /* It's just a normal directory. */
226 r
= image_new(IMAGE_DIRECTORY
,
230 read_only
|| (file_attr
& FS_IMMUTABLE_FL
),
239 } else if (S_ISREG(st
.st_mode
) && endswith(filename
, ".raw")) {
242 /* It's a RAW disk image */
247 fd_getcrtime_at(dfd
, filename
, &crtime
, 0);
250 pretty
= strndupa(filename
, strlen(filename
) - 4);
252 r
= image_new(IMAGE_RAW
,
256 !(st
.st_mode
& 0222) || read_only
,
258 timespec_load(&st
.st_mtim
),
263 (*ret
)->usage
= (*ret
)->usage_exclusive
= st
.st_blocks
* 512;
264 (*ret
)->limit
= (*ret
)->limit_exclusive
= st
.st_size
;
272 int image_find(const char *name
, Image
**ret
) {
278 /* There are no images with invalid names */
279 if (!image_name_is_valid(name
))
282 NULSTR_FOREACH(path
, image_search_path
) {
283 _cleanup_closedir_
DIR *d
= NULL
;
293 r
= image_make(NULL
, dirfd(d
), path
, name
, ret
);
294 if (r
== 0 || r
== -ENOENT
) {
295 _cleanup_free_
char *raw
= NULL
;
297 raw
= strappend(name
, ".raw");
301 r
= image_make(NULL
, dirfd(d
), path
, raw
, ret
);
302 if (r
== 0 || r
== -ENOENT
)
311 if (streq(name
, ".host"))
312 return image_make(".host", AT_FDCWD
, NULL
, "/", ret
);
317 int image_discover(Hashmap
*h
) {
323 NULSTR_FOREACH(path
, image_search_path
) {
324 _cleanup_closedir_
DIR *d
= NULL
;
335 FOREACH_DIRENT_ALL(de
, d
, return -errno
) {
336 _cleanup_(image_unrefp
) Image
*image
= NULL
;
338 if (!image_name_is_valid(de
->d_name
))
341 if (hashmap_contains(h
, de
->d_name
))
344 r
= image_make(NULL
, dirfd(d
), path
, de
->d_name
, &image
);
345 if (r
== 0 || r
== -ENOENT
)
350 r
= hashmap_put(h
, image
->name
, image
);
358 if (!hashmap_contains(h
, ".host")) {
359 _cleanup_(image_unrefp
) Image
*image
= NULL
;
361 r
= image_make(".host", AT_FDCWD
, NULL
, "/", &image
);
365 r
= hashmap_put(h
, image
->name
, image
);
376 void image_hashmap_free(Hashmap
*map
) {
379 while ((i
= hashmap_steal_first(map
)))
385 int image_remove(Image
*i
) {
386 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
;
387 _cleanup_strv_free_
char **settings
= NULL
;
393 if (path_equal(i
->path
, "/") ||
394 path_startswith(i
->path
, "/usr"))
397 settings
= image_settings_path(i
);
401 /* Make sure we don't interfere with a running nspawn */
402 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
408 case IMAGE_SUBVOLUME
:
409 r
= btrfs_subvol_remove(i
->path
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
414 case IMAGE_DIRECTORY
:
415 /* Allow deletion of read-only directories */
416 (void) chattr_path(i
->path
, false, FS_IMMUTABLE_FL
);
417 r
= rm_rf(i
->path
, REMOVE_ROOT
|REMOVE_PHYSICAL
|REMOVE_SUBVOLUME
);
424 if (unlink(i
->path
) < 0)
432 STRV_FOREACH(j
, settings
) {
433 if (unlink(*j
) < 0 && errno
!= ENOENT
)
434 log_debug_errno(errno
, "Failed to unlink %s, ignoring: %m", *j
);
440 static int rename_settings_file(const char *path
, const char *new_name
) {
441 _cleanup_free_
char *rs
= NULL
;
444 fn
= strjoina(new_name
, ".nspawn");
446 rs
= file_in_same_dir(path
, fn
);
450 return rename_noreplace(AT_FDCWD
, path
, AT_FDCWD
, rs
);
453 int image_rename(Image
*i
, const char *new_name
) {
454 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
, name_lock
= LOCK_FILE_INIT
;
455 _cleanup_free_
char *new_path
= NULL
, *nn
= NULL
;
456 _cleanup_strv_free_
char **settings
= NULL
;
457 unsigned file_attr
= 0;
463 if (!image_name_is_valid(new_name
))
466 if (path_equal(i
->path
, "/") ||
467 path_startswith(i
->path
, "/usr"))
470 settings
= image_settings_path(i
);
474 /* Make sure we don't interfere with a running nspawn */
475 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
479 /* Make sure nobody takes the new name, between the time we
480 * checked it is currently unused in all search paths, and the
481 * time we take possession of it */
482 r
= image_name_lock(new_name
, LOCK_EX
|LOCK_NB
, &name_lock
);
486 r
= image_find(new_name
, NULL
);
494 case IMAGE_DIRECTORY
:
495 /* Turn off the immutable bit while we rename the image, so that we can rename it */
496 (void) read_attr_path(i
->path
, &file_attr
);
498 if (file_attr
& FS_IMMUTABLE_FL
)
499 (void) chattr_path(i
->path
, false, FS_IMMUTABLE_FL
);
503 case IMAGE_SUBVOLUME
:
504 new_path
= file_in_same_dir(i
->path
, new_name
);
510 fn
= strjoina(new_name
, ".raw");
511 new_path
= file_in_same_dir(i
->path
, fn
);
522 nn
= strdup(new_name
);
526 r
= rename_noreplace(AT_FDCWD
, i
->path
, AT_FDCWD
, new_path
);
530 /* Restore the immutable bit, if it was set before */
531 if (file_attr
& FS_IMMUTABLE_FL
)
532 (void) chattr_path(new_path
, true, FS_IMMUTABLE_FL
);
542 STRV_FOREACH(j
, settings
) {
543 r
= rename_settings_file(*j
, new_name
);
544 if (r
< 0 && r
!= -ENOENT
)
545 log_debug_errno(r
, "Failed to rename settings file %s, ignoring: %m", *j
);
551 static int clone_settings_file(const char *path
, const char *new_name
) {
552 _cleanup_free_
char *rs
= NULL
;
555 fn
= strjoina(new_name
, ".nspawn");
557 rs
= file_in_same_dir(path
, fn
);
561 return copy_file_atomic(path
, rs
, 0664, false, 0);
564 int image_clone(Image
*i
, const char *new_name
, bool read_only
) {
565 _cleanup_release_lock_file_ LockFile name_lock
= LOCK_FILE_INIT
;
566 _cleanup_strv_free_
char **settings
= NULL
;
567 const char *new_path
;
573 if (!image_name_is_valid(new_name
))
576 settings
= image_settings_path(i
);
580 /* Make sure nobody takes the new name, between the time we
581 * checked it is currently unused in all search paths, and the
582 * time we take possession of it */
583 r
= image_name_lock(new_name
, LOCK_EX
|LOCK_NB
, &name_lock
);
587 r
= image_find(new_name
, NULL
);
595 case IMAGE_SUBVOLUME
:
596 case IMAGE_DIRECTORY
:
597 new_path
= strjoina("/var/lib/machines/", new_name
);
599 r
= btrfs_subvol_snapshot(i
->path
, new_path
, (read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
601 /* Enable "subtree" quotas for the copy, if we didn't
602 * copy any quota from the source. */
603 (void) btrfs_subvol_auto_qgroup(i
->path
, 0, true);
608 new_path
= strjoina("/var/lib/machines/", new_name
, ".raw");
610 r
= copy_file_atomic(i
->path
, new_path
, read_only
? 0444 : 0644, false, FS_NOCOW_FL
);
620 STRV_FOREACH(j
, settings
) {
621 r
= clone_settings_file(*j
, new_name
);
622 if (r
< 0 && r
!= -ENOENT
)
623 log_debug_errno(r
, "Failed to clone settings %s, ignoring: %m", *j
);
629 int image_read_only(Image
*i
, bool b
) {
630 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
;
634 if (path_equal(i
->path
, "/") ||
635 path_startswith(i
->path
, "/usr"))
638 /* Make sure we don't interfere with a running nspawn */
639 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
645 case IMAGE_SUBVOLUME
:
647 /* Note that we set the flag only on the top-level
648 * subvolume of the image. */
650 r
= btrfs_subvol_set_read_only(i
->path
, b
);
656 case IMAGE_DIRECTORY
:
657 /* For simple directory trees we cannot use the access
658 mode of the top-level directory, since it has an
659 effect on the container itself. However, we can
660 use the "immutable" flag, to at least make the
661 top-level directory read-only. It's not as good as
662 a read-only subvolume, but at least something, and
663 we can read the value back.*/
665 r
= chattr_path(i
->path
, b
, FS_IMMUTABLE_FL
);
674 if (stat(i
->path
, &st
) < 0)
677 if (chmod(i
->path
, (st
.st_mode
& 0444) | (b
? 0000 : 0200)) < 0)
680 /* If the image is now read-only, it's a good time to
681 * defrag it, given that no write patterns will
682 * fragment it again. */
684 (void) btrfs_defrag(i
->path
);
695 int image_path_lock(const char *path
, int operation
, LockFile
*global
, LockFile
*local
) {
696 _cleanup_free_
char *p
= NULL
;
697 LockFile t
= LOCK_FILE_INIT
;
705 /* Locks an image path. This actually creates two locks: one
706 * "local" one, next to the image path itself, which might be
707 * shared via NFS. And another "global" one, in /run, that
708 * uses the device/inode number. This has the benefit that we
709 * can even lock a tree that is a mount point, correctly. */
711 if (path_equal(path
, "/"))
714 if (!path_is_absolute(path
))
717 if (stat(path
, &st
) >= 0) {
718 if (asprintf(&p
, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st
.st_dev
, (unsigned long) st
.st_ino
) < 0)
722 r
= make_lock_file_for(path
, operation
, &t
);
727 mkdir_p("/run/systemd/nspawn/locks", 0700);
729 r
= make_lock_file(p
, operation
, global
);
731 release_lock_file(&t
);
740 int image_set_limit(Image
*i
, uint64_t referenced_max
) {
743 if (path_equal(i
->path
, "/") ||
744 path_startswith(i
->path
, "/usr"))
747 if (i
->type
!= IMAGE_SUBVOLUME
)
750 /* We set the quota both for the subvolume as well as for the
751 * subtree. The latter is mostly for historical reasons, since
752 * we didn't use to have a concept of subtree quota, and hence
753 * only modified the subvolume quota. */
755 (void) btrfs_qgroup_set_limit(i
->path
, 0, referenced_max
);
756 (void) btrfs_subvol_auto_qgroup(i
->path
, 0, true);
757 return btrfs_subvol_set_subtree_quota_limit(i
->path
, 0, referenced_max
);
760 int image_name_lock(const char *name
, int operation
, LockFile
*ret
) {
766 /* Locks an image name, regardless of the precise path used. */
768 if (!image_name_is_valid(name
))
771 if (streq(name
, ".host"))
774 mkdir_p("/run/systemd/nspawn/locks", 0700);
775 p
= strjoina("/run/systemd/nspawn/locks/name-", name
);
777 return make_lock_file(p
, operation
, ret
);
780 bool image_name_is_valid(const char *s
) {
781 if (!filename_is_valid(s
))
784 if (string_has_cc(s
, NULL
))
787 if (!utf8_is_valid(s
))
790 /* Temporary files for atomically creating new files */
791 if (startswith(s
, ".#"))
797 static const char* const image_type_table
[_IMAGE_TYPE_MAX
] = {
798 [IMAGE_DIRECTORY
] = "directory",
799 [IMAGE_SUBVOLUME
] = "subvolume",
803 DEFINE_STRING_TABLE_LOOKUP(image_type
, ImageType
);