1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2013 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
24 #include <sys/statfs.h>
26 #include "alloc-util.h"
27 #include "btrfs-util.h"
28 #include "chattr-util.h"
30 #include "dirent-util.h"
33 #include "machine-image.h"
35 #include "path-util.h"
37 #include "string-table.h"
38 #include "string-util.h"
41 #include "xattr-util.h"
/* NUL-separated list of the directories that are searched for images.
 * image_find() and image_discover() walk it with NULSTR_FOREACH(). */
43 static const char image_search_path
[] =
45 "/var/lib/container\0" /* legacy */
46 "/usr/local/lib/machines\0"
47 "/usr/lib/machines\0";
/* Takes an Image pointer and returns an Image pointer.
 * NOTE(review): only the signature is visible in this listing. The
 * matching image_unrefp is used as a _cleanup_() handler further down, so
 * this presumably releases the image. Confirm against the full body. */
49 Image
*image_unref(Image
*i
) {
/* Builds the list of candidate paths of the settings file that belongs to
 * an image. The file name is the image name with the .nspawn suffix. One
 * entry each is created below /etc/systemd/nspawn/ and
 * /run/systemd/nspawn/, and one more next to the image path itself via
 * file_in_same_dir(). Callers in this file iterate over the result with
 * STRV_FOREACH(). */
59 static char **image_settings_path(Image
*image
) {
60 _cleanup_strv_free_
char **l
= NULL
;
71 fn
= strjoina(image
->name
, ".nspawn");
73 FOREACH_STRING(s
, "/etc/systemd/nspawn/", "/run/systemd/nspawn/") {
74 l
[i
] = strappend(s
, fn
);
81 l
[i
] = file_in_same_dir(image
->path
, fn
);
/* NOTE(review): the signature line of this function is missing from the
 * listing; presumably this is image_new(), which image_make() calls.
 * Visible steps: the type argument is asserted to be below
 * _IMAGE_TYPE_MAX, the read-only flag is stored, and usage,
 * usage_exclusive, limit and limit_exclusive all start out as
 * (uint64_t) -1. The name is a copy of pretty. The path is either path, a
 * slash and filename joined together, or a copy of filename alone (the
 * selecting condition is not visible). path_kill_slashes() is then applied
 * to the stored path. */
101 _cleanup_(image_unrefp
) Image
*i
= NULL
;
104 assert(t
< _IMAGE_TYPE_MAX
);
114 i
->read_only
= read_only
;
117 i
->usage
= i
->usage_exclusive
= (uint64_t) -1;
118 i
->limit
= i
->limit_exclusive
= (uint64_t) -1;
120 i
->name
= strdup(pretty
);
125 i
->path
= strjoin(path
, "/", filename
, NULL
);
127 i
->path
= strdup(filename
);
132 path_kill_slashes(i
->path
);
/* Inspects one file system object and, when it qualifies as an image,
 * creates the Image object for it through image_new().
 *
 * Callers in this file pass, in order: an optional pretty name (NULL or
 * .host), a directory file descriptor, the directory path (may be NULL),
 * the file name, and the location to store the result in. Only part of
 * the parameter list is visible in this listing.
 *
 * filename is examined relative to dfd with fstatat() and flags 0, so
 * symlinks are followed. An expression that is true when path starts with
 * /usr, or when faccessat() for W_OK fails with EROFS, is evaluated (its
 * assignment target is not visible; presumably read_only).
 *
 * Directories are opened with openat(). A directory with inode number 256
 * is checked with btrfs_is_filesystem(); subvolumes are reported as
 * IMAGE_SUBVOLUME, read-only when either the subvolume info or the
 * expression above says so, and usage and limit figures are filled in from
 * the subtree quota when btrfs_quota_scan_ongoing() returns 0. Other
 * directories are reported as IMAGE_DIRECTORY and additionally count as
 * read-only when FS_IMMUTABLE_FL is set.
 *
 * Regular files whose name ends in .raw are reported as IMAGE_RAW. The
 * pretty name is the file name without its last four characters, the image
 * is read-only when no write bit is set in the mode, usage is
 * st_blocks * 512 and limit is st_size. */
140 static int image_make(
144 const char *filename
,
153 /* We explicitly *do* follow symlinks here, since we want to
154 * allow symlinking trees into /var/lib/machines/, and treat
157 if (fstatat(dfd
, filename
, &st
, 0) < 0)
161 (path
&& path_startswith(path
, "/usr")) ||
162 (faccessat(dfd
, filename
, W_OK
, AT_EACCESS
) < 0 && errno
== EROFS
);
164 if (S_ISDIR(st
.st_mode
)) {
165 _cleanup_close_
int fd
= -1;
166 unsigned file_attr
= 0;
174 fd
= openat(dfd
, filename
, O_CLOEXEC
|O_NOCTTY
|O_DIRECTORY
);
178 /* btrfs subvolumes have inode 256 */
179 if (st
.st_ino
== 256) {
181 r
= btrfs_is_filesystem(fd
);
185 BtrfsSubvolInfo info
;
187 /* It's a btrfs subvolume */
189 r
= btrfs_subvol_get_info_fd(fd
, 0, &info
);
193 r
= image_new(IMAGE_SUBVOLUME
,
197 info
.read_only
|| read_only
,
204 if (btrfs_quota_scan_ongoing(fd
) == 0) {
205 BtrfsQuotaInfo quota
;
/* NOTE(review): the third argument of the call below looks garbled by the
 * extraction; presumably it is the address of the local quota variable
 * declared just above. Confirm against the original source. */
207 r
= btrfs_subvol_get_subtree_quota_fd(fd
, 0, "a
);
209 (*ret
)->usage
= quota
.referenced
;
210 (*ret
)->usage_exclusive
= quota
.exclusive
;
212 (*ret
)->limit
= quota
.referenced_max
;
213 (*ret
)->limit_exclusive
= quota
.exclusive_max
;
221 /* If the IMMUTABLE bit is set, we consider the
222 * directory read-only. Since the ioctl is not
223 * supported everywhere we ignore failures. */
224 (void) read_attr_fd(fd
, &file_attr
);
226 /* It's just a normal directory. */
227 r
= image_new(IMAGE_DIRECTORY
,
231 read_only
|| (file_attr
& FS_IMMUTABLE_FL
),
240 } else if (S_ISREG(st
.st_mode
) && endswith(filename
, ".raw")) {
243 /* It's a RAW disk image */
248 fd_getcrtime_at(dfd
, filename
, &crtime
, 0);
251 pretty
= strndupa(filename
, strlen(filename
) - 4);
253 r
= image_new(IMAGE_RAW
,
257 !(st
.st_mode
& 0222) || read_only
,
259 timespec_load(&st
.st_mtim
),
264 (*ret
)->usage
= (*ret
)->usage_exclusive
= st
.st_blocks
* 512;
265 (*ret
)->limit
= (*ret
)->limit_exclusive
= st
.st_size
;
/* Looks up an image by name.
 *
 * name is first checked with image_name_is_valid() (the value returned for
 * an invalid name is not visible in this listing). For every directory in
 * image_search_path, image_make() is tried with the plain name; when that
 * returns 0 or -ENOENT it is tried again with the .raw suffix appended.
 * The name .host is handled by calling image_make() on the root directory
 * with the pretty name .host.
 *
 * ret is handed through to image_make(); image_rename() and image_clone()
 * call this function with ret set to NULL. */
273 int image_find(const char *name
, Image
**ret
) {
279 /* There are no images with invalid names */
280 if (!image_name_is_valid(name
))
283 NULSTR_FOREACH(path
, image_search_path
) {
284 _cleanup_closedir_
DIR *d
= NULL
;
294 r
= image_make(NULL
, dirfd(d
), path
, name
, ret
);
295 if (r
== 0 || r
== -ENOENT
) {
296 _cleanup_free_
char *raw
= NULL
;
298 raw
= strappend(name
, ".raw");
302 r
= image_make(NULL
, dirfd(d
), path
, raw
, ret
);
303 if (r
== 0 || r
== -ENOENT
)
312 if (streq(name
, ".host"))
313 return image_make(".host", AT_FDCWD
, NULL
, "/", ret
);
/* Fills the hashmap h with the images found in image_search_path.
 *
 * Every directory entry is checked with image_name_is_valid() and against
 * the names already present in h before image_make() is called on it (the
 * statements executed when those checks hit are not visible in this
 * listing). Results of 0 or -ENOENT from image_make() are treated
 * specially; otherwise the image is stored in h under its name. A read
 * error while iterating a directory returns -errno.
 *
 * When h has no entry named .host afterwards, one is created by calling
 * image_make() on the root directory. */
318 int image_discover(Hashmap
*h
) {
324 NULSTR_FOREACH(path
, image_search_path
) {
325 _cleanup_closedir_
DIR *d
= NULL
;
336 FOREACH_DIRENT_ALL(de
, d
, return -errno
) {
337 _cleanup_(image_unrefp
) Image
*image
= NULL
;
339 if (!image_name_is_valid(de
->d_name
))
342 if (hashmap_contains(h
, de
->d_name
))
345 r
= image_make(NULL
, dirfd(d
), path
, de
->d_name
, &image
);
346 if (r
== 0 || r
== -ENOENT
)
351 r
= hashmap_put(h
, image
->name
, image
);
359 if (!hashmap_contains(h
, ".host")) {
360 _cleanup_(image_unrefp
) Image
*image
= NULL
;
362 r
= image_make(".host", AT_FDCWD
, NULL
, "/", &image
);
366 r
= hashmap_put(h
, image
->name
, image
);
/* Drains map by taking its first entry with hashmap_steal_first() until
 * none is left.
 * NOTE(review): the loop body and the statements after the loop are not
 * visible in this listing; presumably each image is released and the
 * hashmap itself is freed. Confirm against the full body. */
377 void image_hashmap_free(Hashmap
*map
) {
380 while ((i
= hashmap_steal_first(map
)))
/* Removes an image from disk, together with its settings files.
 *
 * The image path is first compared against / and the /usr prefix (the
 * action taken on a match is not visible in this listing). The settings
 * paths are collected up front, then the image path is locked exclusively
 * and without blocking. Subvolumes are removed with btrfs_subvol_remove().
 * Directories get their immutable flag cleared on a best-effort basis and
 * are then removed with rm_rf(). The remaining visible branch calls
 * unlink() on the path.
 *
 * Failing to unlink a settings file for a reason other than ENOENT is
 * logged at debug level and otherwise ignored. */
386 int image_remove(Image
*i
) {
387 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
;
388 _cleanup_strv_free_
char **settings
= NULL
;
394 if (path_equal(i
->path
, "/") ||
395 path_startswith(i
->path
, "/usr"))
398 settings
= image_settings_path(i
);
402 /* Make sure we don't interfere with a running nspawn */
403 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
409 case IMAGE_SUBVOLUME
:
410 r
= btrfs_subvol_remove(i
->path
, BTRFS_REMOVE_RECURSIVE
|BTRFS_REMOVE_QUOTA
);
415 case IMAGE_DIRECTORY
:
416 /* Allow deletion of read-only directories */
417 (void) chattr_path(i
->path
, false, FS_IMMUTABLE_FL
);
418 r
= rm_rf(i
->path
, REMOVE_ROOT
|REMOVE_PHYSICAL
|REMOVE_SUBVOLUME
);
425 if (unlink(i
->path
) < 0)
433 STRV_FOREACH(j
, settings
) {
434 if (unlink(*j
) < 0 && errno
!= ENOENT
)
435 log_debug_errno(errno
, "Failed to unlink %s, ignoring: %m", *j
);
/* Renames the settings file at path so that it is called new_name with the
 * .nspawn suffix, staying in the same directory. The result is whatever
 * rename_noreplace() returns. */
441 static int rename_settings_file(const char *path
, const char *new_name
) {
442 _cleanup_free_
char *rs
= NULL
;
445 fn
= strjoina(new_name
, ".nspawn");
447 rs
= file_in_same_dir(path
, fn
);
451 return rename_noreplace(AT_FDCWD
, path
, AT_FDCWD
, rs
);
/* Renames an image to new_name, keeping it in the same directory.
 *
 * new_name is checked with image_name_is_valid() and the image path is
 * compared against / and the /usr prefix (the actions taken are not
 * visible in this listing). Both the image path and the new name are
 * locked exclusively and without blocking, and image_find() is consulted
 * for the new name.
 *
 * For directories the immutable flag is read and, if set, cleared before
 * the rename and set again on the new path afterwards. In one visible
 * branch the new path carries the .raw suffix. The rename itself is done
 * with rename_noreplace().
 *
 * Settings files are renamed on a best-effort basis: errors other than
 * -ENOENT are logged at debug level and ignored. */
454 int image_rename(Image
*i
, const char *new_name
) {
455 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
, name_lock
= LOCK_FILE_INIT
;
456 _cleanup_free_
char *new_path
= NULL
, *nn
= NULL
;
457 _cleanup_strv_free_
char **settings
= NULL
;
458 unsigned file_attr
= 0;
464 if (!image_name_is_valid(new_name
))
467 if (path_equal(i
->path
, "/") ||
468 path_startswith(i
->path
, "/usr"))
471 settings
= image_settings_path(i
);
475 /* Make sure we don't interfere with a running nspawn */
476 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
480 /* Make sure nobody takes the new name, between the time we
481 * checked it is currently unused in all search paths, and the
482 * time we take possession of it */
483 r
= image_name_lock(new_name
, LOCK_EX
|LOCK_NB
, &name_lock
);
487 r
= image_find(new_name
, NULL
);
495 case IMAGE_DIRECTORY
:
496 /* Turn off the immutable bit while we rename the image, so that we can rename it */
497 (void) read_attr_path(i
->path
, &file_attr
);
499 if (file_attr
& FS_IMMUTABLE_FL
)
500 (void) chattr_path(i
->path
, false, FS_IMMUTABLE_FL
);
504 case IMAGE_SUBVOLUME
:
505 new_path
= file_in_same_dir(i
->path
, new_name
);
511 fn
= strjoina(new_name
, ".raw");
512 new_path
= file_in_same_dir(i
->path
, fn
);
523 nn
= strdup(new_name
);
527 r
= rename_noreplace(AT_FDCWD
, i
->path
, AT_FDCWD
, new_path
);
531 /* Restore the immutable bit, if it was set before */
532 if (file_attr
& FS_IMMUTABLE_FL
)
533 (void) chattr_path(new_path
, true, FS_IMMUTABLE_FL
);
543 STRV_FOREACH(j
, settings
) {
544 r
= rename_settings_file(*j
, new_name
);
545 if (r
< 0 && r
!= -ENOENT
)
546 log_debug_errno(r
, "Failed to rename settings file %s, ignoring: %m", *j
);
/* Copies the settings file at path to a sibling file called new_name with
 * the .nspawn suffix. The copy is made with copy_file_atomic() using mode
 * 0664, and the result of that call is returned. */
552 static int clone_settings_file(const char *path
, const char *new_name
) {
553 _cleanup_free_
char *rs
= NULL
;
556 fn
= strjoina(new_name
, ".nspawn");
558 rs
= file_in_same_dir(path
, fn
);
562 return copy_file_atomic(path
, rs
, 0664, false, 0);
/* Creates a copy of an image below /var/lib/machines/ under new_name.
 *
 * new_name is checked with image_name_is_valid() (the action taken is not
 * visible in this listing), locked exclusively and without blocking, and
 * looked up with image_find().
 *
 * Subvolumes and directories are cloned with btrfs_subvol_snapshot(),
 * which is passed BTRFS_SNAPSHOT_FALLBACK_COPY, BTRFS_SNAPSHOT_RECURSIVE
 * and BTRFS_SNAPSHOT_QUOTA, plus BTRFS_SNAPSHOT_READ_ONLY when read_only
 * is set. The other visible branch copies the file to new_name with the
 * .raw suffix using copy_file_atomic(), with mode 0444 or 0644 depending
 * on read_only and FS_NOCOW_FL as the last argument.
 *
 * Settings files are cloned on a best-effort basis: errors other than
 * -ENOENT are logged at debug level and ignored. */
565 int image_clone(Image
*i
, const char *new_name
, bool read_only
) {
566 _cleanup_release_lock_file_ LockFile name_lock
= LOCK_FILE_INIT
;
567 _cleanup_strv_free_
char **settings
= NULL
;
568 const char *new_path
;
574 if (!image_name_is_valid(new_name
))
577 settings
= image_settings_path(i
);
581 /* Make sure nobody takes the new name, between the time we
582 * checked it is currently unused in all search paths, and the
583 * time we take possession of it */
584 r
= image_name_lock(new_name
, LOCK_EX
|LOCK_NB
, &name_lock
);
588 r
= image_find(new_name
, NULL
);
596 case IMAGE_SUBVOLUME
:
597 case IMAGE_DIRECTORY
:
598 new_path
= strjoina("/var/lib/machines/", new_name
);
600 r
= btrfs_subvol_snapshot(i
->path
, new_path
, (read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
| BTRFS_SNAPSHOT_QUOTA
);
602 /* Enable "subtree" quotas for the copy, if we didn't
603 * copy any quota from the source. */
/* NOTE(review): the comment above talks about the copy, but the call below
 * is given i->path, which is the source of the snapshot, rather than
 * new_path. Confirm whether new_path was intended here. */
604 (void) btrfs_subvol_auto_qgroup(i
->path
, 0, true);
609 new_path
= strjoina("/var/lib/machines/", new_name
, ".raw");
611 r
= copy_file_atomic(i
->path
, new_path
, read_only
? 0444 : 0644, false, FS_NOCOW_FL
);
621 STRV_FOREACH(j
, settings
) {
622 r
= clone_settings_file(*j
, new_name
);
623 if (r
< 0 && r
!= -ENOENT
)
624 log_debug_errno(r
, "Failed to clone settings %s, ignoring: %m", *j
);
/* Switches an image between read-only (b is true) and writable (b is
 * false).
 *
 * The image path is compared against / and the /usr prefix first (the
 * action taken on a match is not visible in this listing), then locked
 * exclusively and without blocking. Subvolumes use
 * btrfs_subvol_set_read_only(). Directories toggle FS_IMMUTABLE_FL through
 * chattr_path(). The remaining visible branch calls stat() on the path and
 * then chmod() with only the read bits of the old mode kept, plus the
 * owner write bit when b is false; btrfs_defrag() is called afterwards
 * with its result ignored. */
630 int image_read_only(Image
*i
, bool b
) {
631 _cleanup_release_lock_file_ LockFile global_lock
= LOCK_FILE_INIT
, local_lock
= LOCK_FILE_INIT
;
635 if (path_equal(i
->path
, "/") ||
636 path_startswith(i
->path
, "/usr"))
639 /* Make sure we don't interfere with a running nspawn */
640 r
= image_path_lock(i
->path
, LOCK_EX
|LOCK_NB
, &global_lock
, &local_lock
);
646 case IMAGE_SUBVOLUME
:
648 /* Note that we set the flag only on the top-level
649 * subvolume of the image. */
651 r
= btrfs_subvol_set_read_only(i
->path
, b
);
657 case IMAGE_DIRECTORY
:
658 /* For simple directory trees we cannot use the access
659 mode of the top-level directory, since it has an
660 effect on the container itself. However, we can
661 use the "immutable" flag, to at least make the
662 top-level directory read-only. It's not as good as
663 a read-only subvolume, but at least something, and
664 we can read the value back.*/
666 r
= chattr_path(i
->path
, b
, FS_IMMUTABLE_FL
);
675 if (stat(i
->path
, &st
) < 0)
678 if (chmod(i
->path
, (st
.st_mode
& 0444) | (b
? 0000 : 0200)) < 0)
681 /* If the image is now read-only, it's a good time to
682 * defrag it, given that no write patterns will
683 * fragment it again. */
685 (void) btrfs_defrag(i
->path
);
/* Takes the two locks that protect an image path.
 *
 * The local lock is created with make_lock_file_for() on the path itself
 * and held in a temporary LockFile. The global lock is a file below
 * /run/systemd/nspawn/locks/ whose name is built from the device and inode
 * numbers reported by stat(); it is stored in global. The lock directory
 * is created with mode 0700 and the result of mkdir_p() is not checked.
 *
 * path is compared against / and tested for being absolute first (the
 * values returned in those cases are not visible in this listing).
 * release_lock_file() is called on the temporary local lock after the
 * make_lock_file() call, presumably in its failure branch. */
696 int image_path_lock(const char *path
, int operation
, LockFile
*global
, LockFile
*local
) {
697 _cleanup_free_
char *p
= NULL
;
698 LockFile t
= LOCK_FILE_INIT
;
706 /* Locks an image path. This actually creates two locks: one
707 * "local" one, next to the image path itself, which might be
708 * shared via NFS. And another "global" one, in /run, that
709 * uses the device/inode number. This has the benefit that we
710 * can even lock a tree that is a mount point, correctly. */
712 if (path_equal(path
, "/"))
715 if (!path_is_absolute(path
))
718 if (stat(path
, &st
) >= 0) {
719 if (asprintf(&p
, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st
.st_dev
, (unsigned long) st
.st_ino
) < 0)
723 r
= make_lock_file_for(path
, operation
, &t
);
728 mkdir_p("/run/systemd/nspawn/locks", 0700);
730 r
= make_lock_file(p
, operation
, global
);
732 release_lock_file(&t
);
/* Applies referenced_max as the btrfs quota limit of an image.
 *
 * The image path is compared against / and the /usr prefix, and the image
 * type against IMAGE_SUBVOLUME (the values returned when those checks hit
 * are not visible in this listing). Setting the qgroup limit and the
 * automatic qgroup setup are attempted with their results ignored. The
 * value returned is that of btrfs_subvol_set_subtree_quota_limit(). */
741 int image_set_limit(Image
*i
, uint64_t referenced_max
) {
744 if (path_equal(i
->path
, "/") ||
745 path_startswith(i
->path
, "/usr"))
748 if (i
->type
!= IMAGE_SUBVOLUME
)
751 /* We set the quota both for the subvolume as well as for the
752 * subtree. The latter is mostly for historical reasons, since
753 * we didn't use to have a concept of subtree quota, and hence
754 * only modified the subvolume quota. */
756 (void) btrfs_qgroup_set_limit(i
->path
, 0, referenced_max
);
757 (void) btrfs_subvol_auto_qgroup(i
->path
, 0, true);
758 return btrfs_subvol_set_subtree_quota_limit(i
->path
, 0, referenced_max
);
/* Takes a lock on an image name, independently of where the image lives.
 *
 * name is checked with image_name_is_valid() and compared with .host (the
 * actions taken are not visible in this listing). The directory
 * /run/systemd/nspawn/locks is created with mode 0700 without checking the
 * result, and the lock file inside it is called name- followed by the
 * image name. The value returned is that of make_lock_file(), which
 * receives operation and ret unchanged. */
761 int image_name_lock(const char *name
, int operation
, LockFile
*ret
) {
767 /* Locks an image name, regardless of the precise path used. */
769 if (!image_name_is_valid(name
))
772 if (streq(name
, ".host"))
775 mkdir_p("/run/systemd/nspawn/locks", 0700);
776 p
= strjoina("/run/systemd/nspawn/locks/name-", name
);
778 return make_lock_file(p
, operation
, ret
);
/* Tests whether s is acceptable as an image name.
 *
 * The visible checks are filename_is_valid(), string_has_cc(),
 * utf8_is_valid() and a test for the .# prefix that marks temporary files.
 * NOTE(review): the return statements are not visible in this listing;
 * presumably each failed check yields false and the function ends with
 * true. Confirm against the full body. */
781 bool image_name_is_valid(const char *s
) {
782 if (!filename_is_valid(s
))
785 if (string_has_cc(s
, NULL
))
788 if (!utf8_is_valid(s
))
791 /* Temporary files for atomically creating new files */
792 if (startswith(s
, ".#"))
/* Names of the image types, indexed by ImageType value and sized by
 * _IMAGE_TYPE_MAX. DEFINE_STRING_TABLE_LOOKUP() is invoked on it below
 * with image_type and ImageType.
 * NOTE(review): only the directory and subvolume entries are visible in
 * this listing. */
798 static const char* const image_type_table
[_IMAGE_TYPE_MAX
] = {
799 [IMAGE_DIRECTORY
] = "directory",
800 [IMAGE_SUBVOLUME
] = "subvolume",
804 DEFINE_STRING_TABLE_LOOKUP(image_type
, ImageType
);