]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/machine-image.c
Merge pull request #1880 from fsateler/sysctl-doc
[thirdparty/systemd.git] / src / shared / machine-image.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2013 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <fcntl.h>
23 #include <linux/fs.h>
24 #include <sys/statfs.h>
25
26 #include "alloc-util.h"
27 #include "btrfs-util.h"
28 #include "chattr-util.h"
29 #include "copy.h"
30 #include "dirent-util.h"
31 #include "fd-util.h"
32 #include "fs-util.h"
33 #include "machine-image.h"
34 #include "mkdir.h"
35 #include "path-util.h"
36 #include "rm-rf.h"
37 #include "string-table.h"
38 #include "string-util.h"
39 #include "strv.h"
40 #include "utf8.h"
41 #include "xattr-util.h"
42
/* Directories searched for machine images, in priority order. Stored as a
 * sequence of NUL-terminated strings that ends with an empty string, the
 * layout NULSTR_FOREACH() iterates over. */
static const char image_search_path[] =
        "/var/lib/machines\0"
        "/var/lib/container\0" /* legacy */
        "/usr/local/lib/machines\0"
        "/usr/lib/machines\0";
48
49 Image *image_unref(Image *i) {
50 if (!i)
51 return NULL;
52
53 free(i->name);
54 free(i->path);
55 free(i);
56 return NULL;
57 }
58
59 static char **image_settings_path(Image *image) {
60 _cleanup_strv_free_ char **l = NULL;
61 char **ret;
62 const char *fn, *s;
63 unsigned i = 0;
64
65 assert(image);
66
67 l = new0(char*, 4);
68 if (!l)
69 return NULL;
70
71 fn = strjoina(image->name, ".nspawn");
72
73 FOREACH_STRING(s, "/etc/systemd/nspawn/", "/run/systemd/nspawn/") {
74 l[i] = strappend(s, fn);
75 if (!l[i])
76 return NULL;
77
78 i++;
79 }
80
81 l[i] = file_in_same_dir(image->path, fn);
82 if (!l[i])
83 return NULL;
84
85 ret = l;
86 l = NULL;
87
88 return ret;
89 }
90
91 static int image_new(
92 ImageType t,
93 const char *pretty,
94 const char *path,
95 const char *filename,
96 bool read_only,
97 usec_t crtime,
98 usec_t mtime,
99 Image **ret) {
100
101 _cleanup_(image_unrefp) Image *i = NULL;
102
103 assert(t >= 0);
104 assert(t < _IMAGE_TYPE_MAX);
105 assert(pretty);
106 assert(filename);
107 assert(ret);
108
109 i = new0(Image, 1);
110 if (!i)
111 return -ENOMEM;
112
113 i->type = t;
114 i->read_only = read_only;
115 i->crtime = crtime;
116 i->mtime = mtime;
117 i->usage = i->usage_exclusive = (uint64_t) -1;
118 i->limit = i->limit_exclusive = (uint64_t) -1;
119
120 i->name = strdup(pretty);
121 if (!i->name)
122 return -ENOMEM;
123
124 if (path)
125 i->path = strjoin(path, "/", filename, NULL);
126 else
127 i->path = strdup(filename);
128
129 if (!i->path)
130 return -ENOMEM;
131
132 path_kill_slashes(i->path);
133
134 *ret = i;
135 i = NULL;
136
137 return 0;
138 }
139
140 static int image_make(
141 const char *pretty,
142 int dfd,
143 const char *path,
144 const char *filename,
145 Image **ret) {
146
147 struct stat st;
148 bool read_only;
149 int r;
150
151 assert(filename);
152
153 /* We explicitly *do* follow symlinks here, since we want to
154 * allow symlinking trees into /var/lib/machines/, and treat
155 * them normally. */
156
157 if (fstatat(dfd, filename, &st, 0) < 0)
158 return -errno;
159
160 read_only =
161 (path && path_startswith(path, "/usr")) ||
162 (faccessat(dfd, filename, W_OK, AT_EACCESS) < 0 && errno == EROFS);
163
164 if (S_ISDIR(st.st_mode)) {
165 _cleanup_close_ int fd = -1;
166 unsigned file_attr = 0;
167
168 if (!ret)
169 return 1;
170
171 if (!pretty)
172 pretty = filename;
173
174 fd = openat(dfd, filename, O_CLOEXEC|O_NOCTTY|O_DIRECTORY);
175 if (fd < 0)
176 return -errno;
177
178 /* btrfs subvolumes have inode 256 */
179 if (st.st_ino == 256) {
180
181 r = btrfs_is_filesystem(fd);
182 if (r < 0)
183 return r;
184 if (r) {
185 BtrfsSubvolInfo info;
186
187 /* It's a btrfs subvolume */
188
189 r = btrfs_subvol_get_info_fd(fd, 0, &info);
190 if (r < 0)
191 return r;
192
193 r = image_new(IMAGE_SUBVOLUME,
194 pretty,
195 path,
196 filename,
197 info.read_only || read_only,
198 info.otime,
199 0,
200 ret);
201 if (r < 0)
202 return r;
203
204 if (btrfs_quota_scan_ongoing(fd) == 0) {
205 BtrfsQuotaInfo quota;
206
207 r = btrfs_subvol_get_subtree_quota_fd(fd, 0, &quota);
208 if (r >= 0) {
209 (*ret)->usage = quota.referenced;
210 (*ret)->usage_exclusive = quota.exclusive;
211
212 (*ret)->limit = quota.referenced_max;
213 (*ret)->limit_exclusive = quota.exclusive_max;
214 }
215 }
216
217 return 1;
218 }
219 }
220
221 /* If the IMMUTABLE bit is set, we consider the
222 * directory read-only. Since the ioctl is not
223 * supported everywhere we ignore failures. */
224 (void) read_attr_fd(fd, &file_attr);
225
226 /* It's just a normal directory. */
227 r = image_new(IMAGE_DIRECTORY,
228 pretty,
229 path,
230 filename,
231 read_only || (file_attr & FS_IMMUTABLE_FL),
232 0,
233 0,
234 ret);
235 if (r < 0)
236 return r;
237
238 return 1;
239
240 } else if (S_ISREG(st.st_mode) && endswith(filename, ".raw")) {
241 usec_t crtime = 0;
242
243 /* It's a RAW disk image */
244
245 if (!ret)
246 return 1;
247
248 fd_getcrtime_at(dfd, filename, &crtime, 0);
249
250 if (!pretty)
251 pretty = strndupa(filename, strlen(filename) - 4);
252
253 r = image_new(IMAGE_RAW,
254 pretty,
255 path,
256 filename,
257 !(st.st_mode & 0222) || read_only,
258 crtime,
259 timespec_load(&st.st_mtim),
260 ret);
261 if (r < 0)
262 return r;
263
264 (*ret)->usage = (*ret)->usage_exclusive = st.st_blocks * 512;
265 (*ret)->limit = (*ret)->limit_exclusive = st.st_size;
266
267 return 1;
268 }
269
270 return 0;
271 }
272
273 int image_find(const char *name, Image **ret) {
274 const char *path;
275 int r;
276
277 assert(name);
278
279 /* There are no images with invalid names */
280 if (!image_name_is_valid(name))
281 return 0;
282
283 NULSTR_FOREACH(path, image_search_path) {
284 _cleanup_closedir_ DIR *d = NULL;
285
286 d = opendir(path);
287 if (!d) {
288 if (errno == ENOENT)
289 continue;
290
291 return -errno;
292 }
293
294 r = image_make(NULL, dirfd(d), path, name, ret);
295 if (r == 0 || r == -ENOENT) {
296 _cleanup_free_ char *raw = NULL;
297
298 raw = strappend(name, ".raw");
299 if (!raw)
300 return -ENOMEM;
301
302 r = image_make(NULL, dirfd(d), path, raw, ret);
303 if (r == 0 || r == -ENOENT)
304 continue;
305 }
306 if (r < 0)
307 return r;
308
309 return 1;
310 }
311
312 if (streq(name, ".host"))
313 return image_make(".host", AT_FDCWD, NULL, "/", ret);
314
315 return 0;
316 };
317
318 int image_discover(Hashmap *h) {
319 const char *path;
320 int r;
321
322 assert(h);
323
324 NULSTR_FOREACH(path, image_search_path) {
325 _cleanup_closedir_ DIR *d = NULL;
326 struct dirent *de;
327
328 d = opendir(path);
329 if (!d) {
330 if (errno == ENOENT)
331 continue;
332
333 return -errno;
334 }
335
336 FOREACH_DIRENT_ALL(de, d, return -errno) {
337 _cleanup_(image_unrefp) Image *image = NULL;
338
339 if (!image_name_is_valid(de->d_name))
340 continue;
341
342 if (hashmap_contains(h, de->d_name))
343 continue;
344
345 r = image_make(NULL, dirfd(d), path, de->d_name, &image);
346 if (r == 0 || r == -ENOENT)
347 continue;
348 if (r < 0)
349 return r;
350
351 r = hashmap_put(h, image->name, image);
352 if (r < 0)
353 return r;
354
355 image = NULL;
356 }
357 }
358
359 if (!hashmap_contains(h, ".host")) {
360 _cleanup_(image_unrefp) Image *image = NULL;
361
362 r = image_make(".host", AT_FDCWD, NULL, "/", &image);
363 if (r < 0)
364 return r;
365
366 r = hashmap_put(h, image->name, image);
367 if (r < 0)
368 return r;
369
370 image = NULL;
371
372 }
373
374 return 0;
375 }
376
377 void image_hashmap_free(Hashmap *map) {
378 Image *i;
379
380 while ((i = hashmap_steal_first(map)))
381 image_unref(i);
382
383 hashmap_free(map);
384 }
385
386 int image_remove(Image *i) {
387 _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
388 _cleanup_strv_free_ char **settings = NULL;
389 char **j;
390 int r;
391
392 assert(i);
393
394 if (path_equal(i->path, "/") ||
395 path_startswith(i->path, "/usr"))
396 return -EROFS;
397
398 settings = image_settings_path(i);
399 if (!settings)
400 return -ENOMEM;
401
402 /* Make sure we don't interfere with a running nspawn */
403 r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
404 if (r < 0)
405 return r;
406
407 switch (i->type) {
408
409 case IMAGE_SUBVOLUME:
410 r = btrfs_subvol_remove(i->path, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
411 if (r < 0)
412 return r;
413 break;
414
415 case IMAGE_DIRECTORY:
416 /* Allow deletion of read-only directories */
417 (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
418 r = rm_rf(i->path, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
419 if (r < 0)
420 return r;
421
422 break;
423
424 case IMAGE_RAW:
425 if (unlink(i->path) < 0)
426 return -errno;
427 break;
428
429 default:
430 return -EOPNOTSUPP;
431 }
432
433 STRV_FOREACH(j, settings) {
434 if (unlink(*j) < 0 && errno != ENOENT)
435 log_debug_errno(errno, "Failed to unlink %s, ignoring: %m", *j);
436 }
437
438 return 0;
439 }
440
441 static int rename_settings_file(const char *path, const char *new_name) {
442 _cleanup_free_ char *rs = NULL;
443 const char *fn;
444
445 fn = strjoina(new_name, ".nspawn");
446
447 rs = file_in_same_dir(path, fn);
448 if (!rs)
449 return -ENOMEM;
450
451 return rename_noreplace(AT_FDCWD, path, AT_FDCWD, rs);
452 }
453
454 int image_rename(Image *i, const char *new_name) {
455 _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT, name_lock = LOCK_FILE_INIT;
456 _cleanup_free_ char *new_path = NULL, *nn = NULL;
457 _cleanup_strv_free_ char **settings = NULL;
458 unsigned file_attr = 0;
459 char **j;
460 int r;
461
462 assert(i);
463
464 if (!image_name_is_valid(new_name))
465 return -EINVAL;
466
467 if (path_equal(i->path, "/") ||
468 path_startswith(i->path, "/usr"))
469 return -EROFS;
470
471 settings = image_settings_path(i);
472 if (!settings)
473 return -ENOMEM;
474
475 /* Make sure we don't interfere with a running nspawn */
476 r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
477 if (r < 0)
478 return r;
479
480 /* Make sure nobody takes the new name, between the time we
481 * checked it is currently unused in all search paths, and the
482 * time we take possesion of it */
483 r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
484 if (r < 0)
485 return r;
486
487 r = image_find(new_name, NULL);
488 if (r < 0)
489 return r;
490 if (r > 0)
491 return -EEXIST;
492
493 switch (i->type) {
494
495 case IMAGE_DIRECTORY:
496 /* Turn of the immutable bit while we rename the image, so that we can rename it */
497 (void) read_attr_path(i->path, &file_attr);
498
499 if (file_attr & FS_IMMUTABLE_FL)
500 (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
501
502 /* fall through */
503
504 case IMAGE_SUBVOLUME:
505 new_path = file_in_same_dir(i->path, new_name);
506 break;
507
508 case IMAGE_RAW: {
509 const char *fn;
510
511 fn = strjoina(new_name, ".raw");
512 new_path = file_in_same_dir(i->path, fn);
513 break;
514 }
515
516 default:
517 return -EOPNOTSUPP;
518 }
519
520 if (!new_path)
521 return -ENOMEM;
522
523 nn = strdup(new_name);
524 if (!nn)
525 return -ENOMEM;
526
527 r = rename_noreplace(AT_FDCWD, i->path, AT_FDCWD, new_path);
528 if (r < 0)
529 return r;
530
531 /* Restore the immutable bit, if it was set before */
532 if (file_attr & FS_IMMUTABLE_FL)
533 (void) chattr_path(new_path, true, FS_IMMUTABLE_FL);
534
535 free(i->path);
536 i->path = new_path;
537 new_path = NULL;
538
539 free(i->name);
540 i->name = nn;
541 nn = NULL;
542
543 STRV_FOREACH(j, settings) {
544 r = rename_settings_file(*j, new_name);
545 if (r < 0 && r != -ENOENT)
546 log_debug_errno(r, "Failed to rename settings file %s, ignoring: %m", *j);
547 }
548
549 return 0;
550 }
551
552 static int clone_settings_file(const char *path, const char *new_name) {
553 _cleanup_free_ char *rs = NULL;
554 const char *fn;
555
556 fn = strjoina(new_name, ".nspawn");
557
558 rs = file_in_same_dir(path, fn);
559 if (!rs)
560 return -ENOMEM;
561
562 return copy_file_atomic(path, rs, 0664, false, 0);
563 }
564
565 int image_clone(Image *i, const char *new_name, bool read_only) {
566 _cleanup_release_lock_file_ LockFile name_lock = LOCK_FILE_INIT;
567 _cleanup_strv_free_ char **settings = NULL;
568 const char *new_path;
569 char **j;
570 int r;
571
572 assert(i);
573
574 if (!image_name_is_valid(new_name))
575 return -EINVAL;
576
577 settings = image_settings_path(i);
578 if (!settings)
579 return -ENOMEM;
580
581 /* Make sure nobody takes the new name, between the time we
582 * checked it is currently unused in all search paths, and the
583 * time we take possesion of it */
584 r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
585 if (r < 0)
586 return r;
587
588 r = image_find(new_name, NULL);
589 if (r < 0)
590 return r;
591 if (r > 0)
592 return -EEXIST;
593
594 switch (i->type) {
595
596 case IMAGE_SUBVOLUME:
597 case IMAGE_DIRECTORY:
598 new_path = strjoina("/var/lib/machines/", new_name);
599
600 r = btrfs_subvol_snapshot(i->path, new_path, (read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
601
602 /* Enable "subtree" quotas for the copy, if we didn't
603 * copy any quota from the source. */
604 (void) btrfs_subvol_auto_qgroup(i->path, 0, true);
605
606 break;
607
608 case IMAGE_RAW:
609 new_path = strjoina("/var/lib/machines/", new_name, ".raw");
610
611 r = copy_file_atomic(i->path, new_path, read_only ? 0444 : 0644, false, FS_NOCOW_FL);
612 break;
613
614 default:
615 return -EOPNOTSUPP;
616 }
617
618 if (r < 0)
619 return r;
620
621 STRV_FOREACH(j, settings) {
622 r = clone_settings_file(*j, new_name);
623 if (r < 0 && r != -ENOENT)
624 log_debug_errno(r, "Failed to clone settings %s, ignoring: %m", *j);
625 }
626
627 return 0;
628 }
629
630 int image_read_only(Image *i, bool b) {
631 _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
632 int r;
633 assert(i);
634
635 if (path_equal(i->path, "/") ||
636 path_startswith(i->path, "/usr"))
637 return -EROFS;
638
639 /* Make sure we don't interfere with a running nspawn */
640 r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
641 if (r < 0)
642 return r;
643
644 switch (i->type) {
645
646 case IMAGE_SUBVOLUME:
647
648 /* Note that we set the flag only on the top-level
649 * subvolume of the image. */
650
651 r = btrfs_subvol_set_read_only(i->path, b);
652 if (r < 0)
653 return r;
654
655 break;
656
657 case IMAGE_DIRECTORY:
658 /* For simple directory trees we cannot use the access
659 mode of the top-level directory, since it has an
660 effect on the container itself. However, we can
661 use the "immutable" flag, to at least make the
662 top-level directory read-only. It's not as good as
663 a read-only subvolume, but at least something, and
664 we can read the value back.*/
665
666 r = chattr_path(i->path, b, FS_IMMUTABLE_FL);
667 if (r < 0)
668 return r;
669
670 break;
671
672 case IMAGE_RAW: {
673 struct stat st;
674
675 if (stat(i->path, &st) < 0)
676 return -errno;
677
678 if (chmod(i->path, (st.st_mode & 0444) | (b ? 0000 : 0200)) < 0)
679 return -errno;
680
681 /* If the images is now read-only, it's a good time to
682 * defrag it, given that no write patterns will
683 * fragment it again. */
684 if (b)
685 (void) btrfs_defrag(i->path);
686 break;
687 }
688
689 default:
690 return -EOPNOTSUPP;
691 }
692
693 return 0;
694 }
695
696 int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local) {
697 _cleanup_free_ char *p = NULL;
698 LockFile t = LOCK_FILE_INIT;
699 struct stat st;
700 int r;
701
702 assert(path);
703 assert(global);
704 assert(local);
705
706 /* Locks an image path. This actually creates two locks: one
707 * "local" one, next to the image path itself, which might be
708 * shared via NFS. And another "global" one, in /run, that
709 * uses the device/inode number. This has the benefit that we
710 * can even lock a tree that is a mount point, correctly. */
711
712 if (path_equal(path, "/"))
713 return -EBUSY;
714
715 if (!path_is_absolute(path))
716 return -EINVAL;
717
718 if (stat(path, &st) >= 0) {
719 if (asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino) < 0)
720 return -ENOMEM;
721 }
722
723 r = make_lock_file_for(path, operation, &t);
724 if (r < 0)
725 return r;
726
727 if (p) {
728 mkdir_p("/run/systemd/nspawn/locks", 0700);
729
730 r = make_lock_file(p, operation, global);
731 if (r < 0) {
732 release_lock_file(&t);
733 return r;
734 }
735 }
736
737 *local = t;
738 return 0;
739 }
740
741 int image_set_limit(Image *i, uint64_t referenced_max) {
742 assert(i);
743
744 if (path_equal(i->path, "/") ||
745 path_startswith(i->path, "/usr"))
746 return -EROFS;
747
748 if (i->type != IMAGE_SUBVOLUME)
749 return -EOPNOTSUPP;
750
751 /* We set the quota both for the subvolume as well as for the
752 * subtree. The latter is mostly for historical reasons, since
753 * we didn't use to have a concept of subtree quota, and hence
754 * only modified the subvolume quota. */
755
756 (void) btrfs_qgroup_set_limit(i->path, 0, referenced_max);
757 (void) btrfs_subvol_auto_qgroup(i->path, 0, true);
758 return btrfs_subvol_set_subtree_quota_limit(i->path, 0, referenced_max);
759 }
760
761 int image_name_lock(const char *name, int operation, LockFile *ret) {
762 const char *p;
763
764 assert(name);
765 assert(ret);
766
767 /* Locks an image name, regardless of the precise path used. */
768
769 if (!image_name_is_valid(name))
770 return -EINVAL;
771
772 if (streq(name, ".host"))
773 return -EBUSY;
774
775 mkdir_p("/run/systemd/nspawn/locks", 0700);
776 p = strjoina("/run/systemd/nspawn/locks/name-", name);
777
778 return make_lock_file(p, operation, ret);
779 }
780
/* Checks whether 's' is acceptable as an image name: it has to be a valid
 * file name, must not contain control characters, has to be valid UTF-8, and
 * must not start with ".#", the prefix of the temporary files used for
 * atomically creating new files. */
bool image_name_is_valid(const char *s) {
        return filename_is_valid(s) &&
               !string_has_cc(s, NULL) &&
               utf8_is_valid(s) &&
               !startswith(s, ".#");
}
797
798 static const char* const image_type_table[_IMAGE_TYPE_MAX] = {
799 [IMAGE_DIRECTORY] = "directory",
800 [IMAGE_SUBVOLUME] = "subvolume",
801 [IMAGE_RAW] = "raw",
802 };
803
804 DEFINE_STRING_TABLE_LOOKUP(image_type, ImageType);