1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2013 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include <sys/statfs.h>
26 #include "btrfs-util.h"
28 #include "dirent-util.h"
30 #include "machine-image.h"
32 #include "path-util.h"
34 #include "string-util.h"
38 static const char image_search_path
[] =
40 "/var/lib/container\0" /* legacy */
41 "/usr/local/lib/machines\0"
42 "/usr/lib/machines\0";
44 Image
*image_unref(Image
*i
) {
54 static char **image_settings_path(Image
*image
) {
55 _cleanup_strv_free_
char **l
= NULL
;
66 fn
= strjoina(image
->name
, ".nspawn");
68 FOREACH_STRING(s
, "/etc/systemd/nspawn/", "/run/systemd/nspawn/") {
69 l
[i
] = strappend(s
, fn
);
76 l
[i
] = file_in_same_dir(image
->path
, fn
);
96 _cleanup_(image_unrefp
) Image
*i
= NULL
;
99 assert(t
< _IMAGE_TYPE_MAX
);
109 i
->read_only
= read_only
;
112 i
->usage
= i
->usage_exclusive
= (uint64_t) -1;
113 i
->limit
= i
->limit_exclusive
= (uint64_t) -1;
115 i
->name
= strdup(pretty
);
120 i
->path
= strjoin(path
, "/", filename
, NULL
);
122 i
->path
= strdup(filename
);
127 path_kill_slashes(i
->path
);
135 static int image_make(
139 const char *filename
,
148 /* We explicitly *do* follow symlinks here, since we want to
149 * allow symlinking trees into /var/lib/machines/, and treat
152 if (fstatat(dfd
, filename
, &st
, 0) < 0)
156 (path
&& path_startswith(path
, "/usr")) ||
157 (faccessat(dfd
, filename
, W_OK
, AT_EACCESS
) < 0 && errno
== EROFS
);
159 if (S_ISDIR(st
.st_mode
)) {
160 _cleanup_close_
int fd
= -1;
161 unsigned file_attr
= 0;
169 fd
= openat(dfd
, filename
, O_CLOEXEC
|O_NOCTTY
|O_DIRECTORY
);
173 /* btrfs subvolumes have inode 256 */
174 if (st
.st_ino
== 256) {
176 r
= btrfs_is_filesystem(fd
);
180 BtrfsSubvolInfo info
;
182 /* It's a btrfs subvolume */
184 r
= btrfs_subvol_get_info_fd(fd
, 0, &info
);
188 r
= image_new(IMAGE_SUBVOLUME
,
192 info
.read_only
|| read_only
,
199 if (btrfs_quota_scan_ongoing(fd
) == 0) {
200 BtrfsQuotaInfo quota
;
202 r
= btrfs_subvol_get_subtree_quota_fd(fd
, 0, "a
);
204 (*ret
)->usage
= quota
.referenced
;
205 (*ret
)->usage_exclusive
= quota
.exclusive
;
207 (*ret
)->limit
= quota
.referenced_max
;
208 (*ret
)->limit_exclusive
= quota
.exclusive_max
;
216 /* If the IMMUTABLE bit is set, we consider the
217 * directory read-only. Since the ioctl is not
218 * supported everywhere we ignore failures. */
219 (void) read_attr_fd(fd
, &file_attr
);
221 /* It's just a normal directory. */
222 r
= image_new(IMAGE_DIRECTORY
,
226 read_only
|| (file_attr
& FS_IMMUTABLE_FL
),
235 } else if (S_ISREG(st
.st_mode
) && endswith(filename
, ".raw")) {
238 /* It's a RAW disk image */
243 fd_getcrtime_at(dfd
, filename
, &crtime
, 0);
246 pretty
= strndupa(filename
, strlen(filename
) - 4);
248 r
= image_new(IMAGE_RAW
,
252 !(st
.st_mode
& 0222) || read_only
,
254 timespec_load(&st
.st_mtim
),
259 (*ret
)->usage
= (*ret
)->usage_exclusive
= st
.st_blocks
* 512;
260 (*ret
)->limit
= (*ret
)->limit_exclusive
= st
.st_size
;
268 int image_find(const char *name
, Image
**ret
) {
274 /* There are no images with invalid names */
275 if (!image_name_is_valid(name
))
278 NULSTR_FOREACH(path
, image_search_path
) {
279 _cleanup_closedir_
DIR *d
= NULL
;
289 r
= image_make(NULL
, dirfd(d
), path
, name
, ret
);
290 if (r
== 0 || r
== -ENOENT
) {
291 _cleanup_free_
char *raw
= NULL
;
293 raw
= strappend(name
, ".raw");
297 r
= image_make(NULL
, dirfd(d
), path
, raw
, ret
);
298 if (r
== 0 || r
== -ENOENT
)
307 if (streq(name
, ".host"))
308 return image_make(".host", AT_FDCWD
, NULL
, "/", ret
);
313 int image_discover(Hashmap
*h
) {
319 NULSTR_FOREACH(path
, image_search_path
) {
320 _cleanup_closedir_
DIR *d
= NULL
;
331 FOREACH_DIRENT_ALL(de
, d
, return -errno
) {
332 _cleanup_(image_unrefp
) Image
*image
= NULL
;
334 if (!image_name_is_valid(de
->d_name
))
337 if (hashmap_contains(h
, de
->d_name
))
340 r
= image_make(NULL
, dirfd(d
), path
, de
->d_name
, &image
);
341 if (r
== 0 || r
== -ENOENT
)
346 r
= hashmap_put(h
, image
->name
, image
);
354 if (!hashmap_contains(h
, ".host")) {
355 _cleanup_(image_unrefp
) Image
*image
= NULL
;
357 r
= image_make(".host", AT_FDCWD
, NULL
, "/", &image
);
361 r
= hashmap_put(h
, image
->name
, image
);
372 void image_hashmap_free(Hashmap
*map
) {
375 while ((i
= hashmap_steal_first(map
)))
381 int image_remove(Image
*i
) {
382 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
;
383 _cleanup_strv_free_
char **settings
= NULL
;
389 if (path_equal(i
->path
, "/") ||
390 path_startswith(i
->path
, "/usr"))
393 settings
= image_settings_path(i
);
397 /* Make sure we don't interfere with a running nspawn */
398 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
404 case IMAGE_SUBVOLUME
:
405 r
= btrfs_subvol_remove(i
->path
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
410 case IMAGE_DIRECTORY
:
411 /* Allow deletion of read-only directories */
412 (void) chattr_path(i
->path
, false, FS_IMMUTABLE_FL
);
413 r
= rm_rf(i
->path
, REMOVE_ROOT
|REMOVE_PHYSICAL
|REMOVE_SUBVOLUME
);
420 if (unlink(i
->path
) < 0)
428 STRV_FOREACH(j
, settings
) {
429 if (unlink(*j
) < 0 && errno
!= ENOENT
)
430 log_debug_errno(errno
, "Failed to unlink %s, ignoring: %m", *j
);
436 static int rename_settings_file(const char *path
, const char *new_name
) {
437 _cleanup_free_
char *rs
= NULL
;
440 fn
= strjoina(new_name
, ".nspawn");
442 rs
= file_in_same_dir(path
, fn
);
446 return rename_noreplace(AT_FDCWD
, path
, AT_FDCWD
, rs
);
449 int image_rename(Image
*i
, const char *new_name
) {
450 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
, name_lock
= LOCK_FILE_INIT
;
451 _cleanup_free_
char *new_path
= NULL
, *nn
= NULL
;
452 _cleanup_strv_free_
char **settings
= NULL
;
453 unsigned file_attr
= 0;
459 if (!image_name_is_valid(new_name
))
462 if (path_equal(i
->path
, "/") ||
463 path_startswith(i
->path
, "/usr"))
466 settings
= image_settings_path(i
);
470 /* Make sure we don't interfere with a running nspawn */
471 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
475 /* Make sure nobody takes the new name, between the time we
476 * checked it is currently unused in all search paths, and the
477 * time we take possession of it */
478 r
= image_name_lock(new_name
, LOCK_EX
|LOCK_NB
, &name_lock
);
482 r
= image_find(new_name
, NULL
);
490 case IMAGE_DIRECTORY
:
491 /* Turn off the immutable bit while we rename the image, so that we can rename it */
492 (void) read_attr_path(i
->path
, &file_attr
);
494 if (file_attr
& FS_IMMUTABLE_FL
)
495 (void) chattr_path(i
->path
, false, FS_IMMUTABLE_FL
);
499 case IMAGE_SUBVOLUME
:
500 new_path
= file_in_same_dir(i
->path
, new_name
);
506 fn
= strjoina(new_name
, ".raw");
507 new_path
= file_in_same_dir(i
->path
, fn
);
518 nn
= strdup(new_name
);
522 r
= rename_noreplace(AT_FDCWD
, i
->path
, AT_FDCWD
, new_path
);
526 /* Restore the immutable bit, if it was set before */
527 if (file_attr
& FS_IMMUTABLE_FL
)
528 (void) chattr_path(new_path
, true, FS_IMMUTABLE_FL
);
538 STRV_FOREACH(j
, settings
) {
539 r
= rename_settings_file(*j
, new_name
);
540 if (r
< 0 && r
!= -ENOENT
)
541 log_debug_errno(r
, "Failed to rename settings file %s, ignoring: %m", *j
);
547 static int clone_settings_file(const char *path
, const char *new_name
) {
548 _cleanup_free_
char *rs
= NULL
;
551 fn
= strjoina(new_name
, ".nspawn");
553 rs
= file_in_same_dir(path
, fn
);
557 return copy_file_atomic(path
, rs
, 0664, false, 0);
560 int image_clone(Image
*i
, const char *new_name
, bool read_only
) {
561 _cleanup_release_lock_file_ LockFile name_lock
= LOCK_FILE_INIT
;
562 _cleanup_strv_free_
char **settings
= NULL
;
563 const char *new_path
;
569 if (!image_name_is_valid(new_name
))
572 settings
= image_settings_path(i
);
576 /* Make sure nobody takes the new name, between the time we
577 * checked it is currently unused in all search paths, and the
578 * time we take possession of it */
579 r
= image_name_lock(new_name
, LOCK_EX
|LOCK_NB
, &name_lock
);
583 r
= image_find(new_name
, NULL
);
591 case IMAGE_SUBVOLUME
:
592 case IMAGE_DIRECTORY
:
593 new_path
= strjoina("/var/lib/machines/", new_name
);
595 r
= btrfs_subvol_snapshot(i
->path
, new_path
, (read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
597 /* Enable "subtree" quotas for the copy, if we didn't
598 * copy any quota from the source. */
599 (void) btrfs_subvol_auto_qgroup(i
->path
, 0, true);
604 new_path
= strjoina("/var/lib/machines/", new_name
, ".raw");
606 r
= copy_file_atomic(i
->path
, new_path
, read_only
? 0444 : 0644, false, FS_NOCOW_FL
);
616 STRV_FOREACH(j
, settings
) {
617 r
= clone_settings_file(*j
, new_name
);
618 if (r
< 0 && r
!= -ENOENT
)
619 log_debug_errno(r
, "Failed to clone settings %s, ignoring: %m", *j
);
625 int image_read_only(Image
*i
, bool b
) {
626 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
;
630 if (path_equal(i
->path
, "/") ||
631 path_startswith(i
->path
, "/usr"))
634 /* Make sure we don't interfere with a running nspawn */
635 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
641 case IMAGE_SUBVOLUME
:
643 /* Note that we set the flag only on the top-level
644 * subvolume of the image. */
646 r
= btrfs_subvol_set_read_only(i
->path
, b
);
652 case IMAGE_DIRECTORY
:
653 /* For simple directory trees we cannot use the access
654 mode of the top-level directory, since it has an
655 effect on the container itself. However, we can
656 use the "immutable" flag, to at least make the
657 top-level directory read-only. It's not as good as
658 a read-only subvolume, but at least something, and
659 we can read the value back.*/
661 r
= chattr_path(i
->path
, b
, FS_IMMUTABLE_FL
);
670 if (stat(i
->path
, &st
) < 0)
673 if (chmod(i
->path
, (st
.st_mode
& 0444) | (b
? 0000 : 0200)) < 0)
676 /* If the image is now read-only, it's a good time to
677 * defrag it, given that no write patterns will
678 * fragment it again. */
680 (void) btrfs_defrag(i
->path
);
691 int image_path_lock(const char *path
, int operation
, LockFile
*global
, LockFile
*local
) {
692 _cleanup_free_
char *p
= NULL
;
693 LockFile t
= LOCK_FILE_INIT
;
701 /* Locks an image path. This actually creates two locks: one
702 * "local" one, next to the image path itself, which might be
703 * shared via NFS. And another "global" one, in /run, that
704 * uses the device/inode number. This has the benefit that we
705 * can even lock a tree that is a mount point, correctly. */
707 if (path_equal(path
, "/"))
710 if (!path_is_absolute(path
))
713 if (stat(path
, &st
) >= 0) {
714 if (asprintf(&p
, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st
.st_dev
, (unsigned long) st
.st_ino
) < 0)
718 r
= make_lock_file_for(path
, operation
, &t
);
723 mkdir_p("/run/systemd/nspawn/locks", 0700);
725 r
= make_lock_file(p
, operation
, global
);
727 release_lock_file(&t
);
736 int image_set_limit(Image
*i
, uint64_t referenced_max
) {
739 if (path_equal(i
->path
, "/") ||
740 path_startswith(i
->path
, "/usr"))
743 if (i
->type
!= IMAGE_SUBVOLUME
)
746 /* We set the quota both for the subvolume as well as for the
747 * subtree. The latter is mostly for historical reasons, since
748 * we didn't use to have a concept of subtree quota, and hence
749 * only modified the subvolume quota. */
751 (void) btrfs_qgroup_set_limit(i
->path
, 0, referenced_max
);
752 (void) btrfs_subvol_auto_qgroup(i
->path
, 0, true);
753 return btrfs_subvol_set_subtree_quota_limit(i
->path
, 0, referenced_max
);
756 int image_name_lock(const char *name
, int operation
, LockFile
*ret
) {
762 /* Locks an image name, regardless of the precise path used. */
764 if (!image_name_is_valid(name
))
767 if (streq(name
, ".host"))
770 mkdir_p("/run/systemd/nspawn/locks", 0700);
771 p
= strjoina("/run/systemd/nspawn/locks/name-", name
);
773 return make_lock_file(p
, operation
, ret
);
776 bool image_name_is_valid(const char *s
) {
777 if (!filename_is_valid(s
))
780 if (string_has_cc(s
, NULL
))
783 if (!utf8_is_valid(s
))
786 /* Temporary files for atomically creating new files */
787 if (startswith(s
, ".#"))
793 static const char* const image_type_table
[_IMAGE_TYPE_MAX
] = {
794 [IMAGE_DIRECTORY
] = "directory",
795 [IMAGE_SUBVOLUME
] = "subvolume",
799 DEFINE_STRING_TABLE_LOOKUP(image_type
, ImageType
);