]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/machine-image.c
util-lib: introduce dirent-util.[ch] for directory entry calls
[thirdparty/systemd.git] / src / shared / machine-image.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2013 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <fcntl.h>
23 #include <linux/fs.h>
24 #include <sys/statfs.h>
25
26 #include "btrfs-util.h"
27 #include "copy.h"
28 #include "dirent-util.h"
29 #include "fd-util.h"
30 #include "machine-image.h"
31 #include "mkdir.h"
32 #include "path-util.h"
33 #include "rm-rf.h"
34 #include "string-util.h"
35 #include "strv.h"
36 #include "utf8.h"
37
/* NUL-separated list of the directories that are searched for images.
 * image_find() walks the entries in this order and returns the first
 * match; image_discover() enumerates all of them. */
static const char image_search_path[] =
        "/var/lib/machines\0"
        "/var/lib/container\0" /* legacy */
        "/usr/local/lib/machines\0"
        "/usr/lib/machines\0";
43
44 Image *image_unref(Image *i) {
45 if (!i)
46 return NULL;
47
48 free(i->name);
49 free(i->path);
50 free(i);
51 return NULL;
52 }
53
54 static char **image_settings_path(Image *image) {
55 _cleanup_strv_free_ char **l = NULL;
56 char **ret;
57 const char *fn, *s;
58 unsigned i = 0;
59
60 assert(image);
61
62 l = new0(char*, 4);
63 if (!l)
64 return NULL;
65
66 fn = strjoina(image->name, ".nspawn");
67
68 FOREACH_STRING(s, "/etc/systemd/nspawn/", "/run/systemd/nspawn/") {
69 l[i] = strappend(s, fn);
70 if (!l[i])
71 return NULL;
72
73 i++;
74 }
75
76 l[i] = file_in_same_dir(image->path, fn);
77 if (!l[i])
78 return NULL;
79
80 ret = l;
81 l = NULL;
82
83 return ret;
84 }
85
86 static int image_new(
87 ImageType t,
88 const char *pretty,
89 const char *path,
90 const char *filename,
91 bool read_only,
92 usec_t crtime,
93 usec_t mtime,
94 Image **ret) {
95
96 _cleanup_(image_unrefp) Image *i = NULL;
97
98 assert(t >= 0);
99 assert(t < _IMAGE_TYPE_MAX);
100 assert(pretty);
101 assert(filename);
102 assert(ret);
103
104 i = new0(Image, 1);
105 if (!i)
106 return -ENOMEM;
107
108 i->type = t;
109 i->read_only = read_only;
110 i->crtime = crtime;
111 i->mtime = mtime;
112 i->usage = i->usage_exclusive = (uint64_t) -1;
113 i->limit = i->limit_exclusive = (uint64_t) -1;
114
115 i->name = strdup(pretty);
116 if (!i->name)
117 return -ENOMEM;
118
119 if (path)
120 i->path = strjoin(path, "/", filename, NULL);
121 else
122 i->path = strdup(filename);
123
124 if (!i->path)
125 return -ENOMEM;
126
127 path_kill_slashes(i->path);
128
129 *ret = i;
130 i = NULL;
131
132 return 0;
133 }
134
135 static int image_make(
136 const char *pretty,
137 int dfd,
138 const char *path,
139 const char *filename,
140 Image **ret) {
141
142 struct stat st;
143 bool read_only;
144 int r;
145
146 assert(filename);
147
148 /* We explicitly *do* follow symlinks here, since we want to
149 * allow symlinking trees into /var/lib/machines/, and treat
150 * them normally. */
151
152 if (fstatat(dfd, filename, &st, 0) < 0)
153 return -errno;
154
155 read_only =
156 (path && path_startswith(path, "/usr")) ||
157 (faccessat(dfd, filename, W_OK, AT_EACCESS) < 0 && errno == EROFS);
158
159 if (S_ISDIR(st.st_mode)) {
160 _cleanup_close_ int fd = -1;
161 unsigned file_attr = 0;
162
163 if (!ret)
164 return 1;
165
166 if (!pretty)
167 pretty = filename;
168
169 fd = openat(dfd, filename, O_CLOEXEC|O_NOCTTY|O_DIRECTORY);
170 if (fd < 0)
171 return -errno;
172
173 /* btrfs subvolumes have inode 256 */
174 if (st.st_ino == 256) {
175
176 r = btrfs_is_filesystem(fd);
177 if (r < 0)
178 return r;
179 if (r) {
180 BtrfsSubvolInfo info;
181
182 /* It's a btrfs subvolume */
183
184 r = btrfs_subvol_get_info_fd(fd, 0, &info);
185 if (r < 0)
186 return r;
187
188 r = image_new(IMAGE_SUBVOLUME,
189 pretty,
190 path,
191 filename,
192 info.read_only || read_only,
193 info.otime,
194 0,
195 ret);
196 if (r < 0)
197 return r;
198
199 if (btrfs_quota_scan_ongoing(fd) == 0) {
200 BtrfsQuotaInfo quota;
201
202 r = btrfs_subvol_get_subtree_quota_fd(fd, 0, &quota);
203 if (r >= 0) {
204 (*ret)->usage = quota.referenced;
205 (*ret)->usage_exclusive = quota.exclusive;
206
207 (*ret)->limit = quota.referenced_max;
208 (*ret)->limit_exclusive = quota.exclusive_max;
209 }
210 }
211
212 return 1;
213 }
214 }
215
216 /* If the IMMUTABLE bit is set, we consider the
217 * directory read-only. Since the ioctl is not
218 * supported everywhere we ignore failures. */
219 (void) read_attr_fd(fd, &file_attr);
220
221 /* It's just a normal directory. */
222 r = image_new(IMAGE_DIRECTORY,
223 pretty,
224 path,
225 filename,
226 read_only || (file_attr & FS_IMMUTABLE_FL),
227 0,
228 0,
229 ret);
230 if (r < 0)
231 return r;
232
233 return 1;
234
235 } else if (S_ISREG(st.st_mode) && endswith(filename, ".raw")) {
236 usec_t crtime = 0;
237
238 /* It's a RAW disk image */
239
240 if (!ret)
241 return 1;
242
243 fd_getcrtime_at(dfd, filename, &crtime, 0);
244
245 if (!pretty)
246 pretty = strndupa(filename, strlen(filename) - 4);
247
248 r = image_new(IMAGE_RAW,
249 pretty,
250 path,
251 filename,
252 !(st.st_mode & 0222) || read_only,
253 crtime,
254 timespec_load(&st.st_mtim),
255 ret);
256 if (r < 0)
257 return r;
258
259 (*ret)->usage = (*ret)->usage_exclusive = st.st_blocks * 512;
260 (*ret)->limit = (*ret)->limit_exclusive = st.st_size;
261
262 return 1;
263 }
264
265 return 0;
266 }
267
268 int image_find(const char *name, Image **ret) {
269 const char *path;
270 int r;
271
272 assert(name);
273
274 /* There are no images with invalid names */
275 if (!image_name_is_valid(name))
276 return 0;
277
278 NULSTR_FOREACH(path, image_search_path) {
279 _cleanup_closedir_ DIR *d = NULL;
280
281 d = opendir(path);
282 if (!d) {
283 if (errno == ENOENT)
284 continue;
285
286 return -errno;
287 }
288
289 r = image_make(NULL, dirfd(d), path, name, ret);
290 if (r == 0 || r == -ENOENT) {
291 _cleanup_free_ char *raw = NULL;
292
293 raw = strappend(name, ".raw");
294 if (!raw)
295 return -ENOMEM;
296
297 r = image_make(NULL, dirfd(d), path, raw, ret);
298 if (r == 0 || r == -ENOENT)
299 continue;
300 }
301 if (r < 0)
302 return r;
303
304 return 1;
305 }
306
307 if (streq(name, ".host"))
308 return image_make(".host", AT_FDCWD, NULL, "/", ret);
309
310 return 0;
311 };
312
313 int image_discover(Hashmap *h) {
314 const char *path;
315 int r;
316
317 assert(h);
318
319 NULSTR_FOREACH(path, image_search_path) {
320 _cleanup_closedir_ DIR *d = NULL;
321 struct dirent *de;
322
323 d = opendir(path);
324 if (!d) {
325 if (errno == ENOENT)
326 continue;
327
328 return -errno;
329 }
330
331 FOREACH_DIRENT_ALL(de, d, return -errno) {
332 _cleanup_(image_unrefp) Image *image = NULL;
333
334 if (!image_name_is_valid(de->d_name))
335 continue;
336
337 if (hashmap_contains(h, de->d_name))
338 continue;
339
340 r = image_make(NULL, dirfd(d), path, de->d_name, &image);
341 if (r == 0 || r == -ENOENT)
342 continue;
343 if (r < 0)
344 return r;
345
346 r = hashmap_put(h, image->name, image);
347 if (r < 0)
348 return r;
349
350 image = NULL;
351 }
352 }
353
354 if (!hashmap_contains(h, ".host")) {
355 _cleanup_(image_unrefp) Image *image = NULL;
356
357 r = image_make(".host", AT_FDCWD, NULL, "/", &image);
358 if (r < 0)
359 return r;
360
361 r = hashmap_put(h, image->name, image);
362 if (r < 0)
363 return r;
364
365 image = NULL;
366
367 }
368
369 return 0;
370 }
371
372 void image_hashmap_free(Hashmap *map) {
373 Image *i;
374
375 while ((i = hashmap_steal_first(map)))
376 image_unref(i);
377
378 hashmap_free(map);
379 }
380
381 int image_remove(Image *i) {
382 _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
383 _cleanup_strv_free_ char **settings = NULL;
384 char **j;
385 int r;
386
387 assert(i);
388
389 if (path_equal(i->path, "/") ||
390 path_startswith(i->path, "/usr"))
391 return -EROFS;
392
393 settings = image_settings_path(i);
394 if (!settings)
395 return -ENOMEM;
396
397 /* Make sure we don't interfere with a running nspawn */
398 r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
399 if (r < 0)
400 return r;
401
402 switch (i->type) {
403
404 case IMAGE_SUBVOLUME:
405 r = btrfs_subvol_remove(i->path, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
406 if (r < 0)
407 return r;
408 break;
409
410 case IMAGE_DIRECTORY:
411 /* Allow deletion of read-only directories */
412 (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
413 r = rm_rf(i->path, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
414 if (r < 0)
415 return r;
416
417 break;
418
419 case IMAGE_RAW:
420 if (unlink(i->path) < 0)
421 return -errno;
422 break;
423
424 default:
425 return -EOPNOTSUPP;
426 }
427
428 STRV_FOREACH(j, settings) {
429 if (unlink(*j) < 0 && errno != ENOENT)
430 log_debug_errno(errno, "Failed to unlink %s, ignoring: %m", *j);
431 }
432
433 return 0;
434 }
435
436 static int rename_settings_file(const char *path, const char *new_name) {
437 _cleanup_free_ char *rs = NULL;
438 const char *fn;
439
440 fn = strjoina(new_name, ".nspawn");
441
442 rs = file_in_same_dir(path, fn);
443 if (!rs)
444 return -ENOMEM;
445
446 return rename_noreplace(AT_FDCWD, path, AT_FDCWD, rs);
447 }
448
449 int image_rename(Image *i, const char *new_name) {
450 _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT, name_lock = LOCK_FILE_INIT;
451 _cleanup_free_ char *new_path = NULL, *nn = NULL;
452 _cleanup_strv_free_ char **settings = NULL;
453 unsigned file_attr = 0;
454 char **j;
455 int r;
456
457 assert(i);
458
459 if (!image_name_is_valid(new_name))
460 return -EINVAL;
461
462 if (path_equal(i->path, "/") ||
463 path_startswith(i->path, "/usr"))
464 return -EROFS;
465
466 settings = image_settings_path(i);
467 if (!settings)
468 return -ENOMEM;
469
470 /* Make sure we don't interfere with a running nspawn */
471 r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
472 if (r < 0)
473 return r;
474
475 /* Make sure nobody takes the new name, between the time we
476 * checked it is currently unused in all search paths, and the
477 * time we take possesion of it */
478 r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
479 if (r < 0)
480 return r;
481
482 r = image_find(new_name, NULL);
483 if (r < 0)
484 return r;
485 if (r > 0)
486 return -EEXIST;
487
488 switch (i->type) {
489
490 case IMAGE_DIRECTORY:
491 /* Turn of the immutable bit while we rename the image, so that we can rename it */
492 (void) read_attr_path(i->path, &file_attr);
493
494 if (file_attr & FS_IMMUTABLE_FL)
495 (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
496
497 /* fall through */
498
499 case IMAGE_SUBVOLUME:
500 new_path = file_in_same_dir(i->path, new_name);
501 break;
502
503 case IMAGE_RAW: {
504 const char *fn;
505
506 fn = strjoina(new_name, ".raw");
507 new_path = file_in_same_dir(i->path, fn);
508 break;
509 }
510
511 default:
512 return -EOPNOTSUPP;
513 }
514
515 if (!new_path)
516 return -ENOMEM;
517
518 nn = strdup(new_name);
519 if (!nn)
520 return -ENOMEM;
521
522 r = rename_noreplace(AT_FDCWD, i->path, AT_FDCWD, new_path);
523 if (r < 0)
524 return r;
525
526 /* Restore the immutable bit, if it was set before */
527 if (file_attr & FS_IMMUTABLE_FL)
528 (void) chattr_path(new_path, true, FS_IMMUTABLE_FL);
529
530 free(i->path);
531 i->path = new_path;
532 new_path = NULL;
533
534 free(i->name);
535 i->name = nn;
536 nn = NULL;
537
538 STRV_FOREACH(j, settings) {
539 r = rename_settings_file(*j, new_name);
540 if (r < 0 && r != -ENOENT)
541 log_debug_errno(r, "Failed to rename settings file %s, ignoring: %m", *j);
542 }
543
544 return 0;
545 }
546
547 static int clone_settings_file(const char *path, const char *new_name) {
548 _cleanup_free_ char *rs = NULL;
549 const char *fn;
550
551 fn = strjoina(new_name, ".nspawn");
552
553 rs = file_in_same_dir(path, fn);
554 if (!rs)
555 return -ENOMEM;
556
557 return copy_file_atomic(path, rs, 0664, false, 0);
558 }
559
560 int image_clone(Image *i, const char *new_name, bool read_only) {
561 _cleanup_release_lock_file_ LockFile name_lock = LOCK_FILE_INIT;
562 _cleanup_strv_free_ char **settings = NULL;
563 const char *new_path;
564 char **j;
565 int r;
566
567 assert(i);
568
569 if (!image_name_is_valid(new_name))
570 return -EINVAL;
571
572 settings = image_settings_path(i);
573 if (!settings)
574 return -ENOMEM;
575
576 /* Make sure nobody takes the new name, between the time we
577 * checked it is currently unused in all search paths, and the
578 * time we take possesion of it */
579 r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
580 if (r < 0)
581 return r;
582
583 r = image_find(new_name, NULL);
584 if (r < 0)
585 return r;
586 if (r > 0)
587 return -EEXIST;
588
589 switch (i->type) {
590
591 case IMAGE_SUBVOLUME:
592 case IMAGE_DIRECTORY:
593 new_path = strjoina("/var/lib/machines/", new_name);
594
595 r = btrfs_subvol_snapshot(i->path, new_path, (read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
596
597 /* Enable "subtree" quotas for the copy, if we didn't
598 * copy any quota from the source. */
599 (void) btrfs_subvol_auto_qgroup(i->path, 0, true);
600
601 break;
602
603 case IMAGE_RAW:
604 new_path = strjoina("/var/lib/machines/", new_name, ".raw");
605
606 r = copy_file_atomic(i->path, new_path, read_only ? 0444 : 0644, false, FS_NOCOW_FL);
607 break;
608
609 default:
610 return -EOPNOTSUPP;
611 }
612
613 if (r < 0)
614 return r;
615
616 STRV_FOREACH(j, settings) {
617 r = clone_settings_file(*j, new_name);
618 if (r < 0 && r != -ENOENT)
619 log_debug_errno(r, "Failed to clone settings %s, ignoring: %m", *j);
620 }
621
622 return 0;
623 }
624
625 int image_read_only(Image *i, bool b) {
626 _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
627 int r;
628 assert(i);
629
630 if (path_equal(i->path, "/") ||
631 path_startswith(i->path, "/usr"))
632 return -EROFS;
633
634 /* Make sure we don't interfere with a running nspawn */
635 r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
636 if (r < 0)
637 return r;
638
639 switch (i->type) {
640
641 case IMAGE_SUBVOLUME:
642
643 /* Note that we set the flag only on the top-level
644 * subvolume of the image. */
645
646 r = btrfs_subvol_set_read_only(i->path, b);
647 if (r < 0)
648 return r;
649
650 break;
651
652 case IMAGE_DIRECTORY:
653 /* For simple directory trees we cannot use the access
654 mode of the top-level directory, since it has an
655 effect on the container itself. However, we can
656 use the "immutable" flag, to at least make the
657 top-level directory read-only. It's not as good as
658 a read-only subvolume, but at least something, and
659 we can read the value back.*/
660
661 r = chattr_path(i->path, b, FS_IMMUTABLE_FL);
662 if (r < 0)
663 return r;
664
665 break;
666
667 case IMAGE_RAW: {
668 struct stat st;
669
670 if (stat(i->path, &st) < 0)
671 return -errno;
672
673 if (chmod(i->path, (st.st_mode & 0444) | (b ? 0000 : 0200)) < 0)
674 return -errno;
675
676 /* If the images is now read-only, it's a good time to
677 * defrag it, given that no write patterns will
678 * fragment it again. */
679 if (b)
680 (void) btrfs_defrag(i->path);
681 break;
682 }
683
684 default:
685 return -EOPNOTSUPP;
686 }
687
688 return 0;
689 }
690
691 int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local) {
692 _cleanup_free_ char *p = NULL;
693 LockFile t = LOCK_FILE_INIT;
694 struct stat st;
695 int r;
696
697 assert(path);
698 assert(global);
699 assert(local);
700
701 /* Locks an image path. This actually creates two locks: one
702 * "local" one, next to the image path itself, which might be
703 * shared via NFS. And another "global" one, in /run, that
704 * uses the device/inode number. This has the benefit that we
705 * can even lock a tree that is a mount point, correctly. */
706
707 if (path_equal(path, "/"))
708 return -EBUSY;
709
710 if (!path_is_absolute(path))
711 return -EINVAL;
712
713 if (stat(path, &st) >= 0) {
714 if (asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino) < 0)
715 return -ENOMEM;
716 }
717
718 r = make_lock_file_for(path, operation, &t);
719 if (r < 0)
720 return r;
721
722 if (p) {
723 mkdir_p("/run/systemd/nspawn/locks", 0700);
724
725 r = make_lock_file(p, operation, global);
726 if (r < 0) {
727 release_lock_file(&t);
728 return r;
729 }
730 }
731
732 *local = t;
733 return 0;
734 }
735
736 int image_set_limit(Image *i, uint64_t referenced_max) {
737 assert(i);
738
739 if (path_equal(i->path, "/") ||
740 path_startswith(i->path, "/usr"))
741 return -EROFS;
742
743 if (i->type != IMAGE_SUBVOLUME)
744 return -EOPNOTSUPP;
745
746 /* We set the quota both for the subvolume as well as for the
747 * subtree. The latter is mostly for historical reasons, since
748 * we didn't use to have a concept of subtree quota, and hence
749 * only modified the subvolume quota. */
750
751 (void) btrfs_qgroup_set_limit(i->path, 0, referenced_max);
752 (void) btrfs_subvol_auto_qgroup(i->path, 0, true);
753 return btrfs_subvol_set_subtree_quota_limit(i->path, 0, referenced_max);
754 }
755
756 int image_name_lock(const char *name, int operation, LockFile *ret) {
757 const char *p;
758
759 assert(name);
760 assert(ret);
761
762 /* Locks an image name, regardless of the precise path used. */
763
764 if (!image_name_is_valid(name))
765 return -EINVAL;
766
767 if (streq(name, ".host"))
768 return -EBUSY;
769
770 mkdir_p("/run/systemd/nspawn/locks", 0700);
771 p = strjoina("/run/systemd/nspawn/locks/name-", name);
772
773 return make_lock_file(p, operation, ret);
774 }
775
/* Checks whether "s" is acceptable as an image name: it must pass
 * filename_is_valid(), must not be flagged by string_has_cc(), must
 * pass utf8_is_valid(), and must not start with ".#" (that prefix is
 * used by temporary files for atomically creating new files). */
bool image_name_is_valid(const char *s) {
        return filename_is_valid(s) &&
                !string_has_cc(s, NULL) &&
                utf8_is_valid(s) &&
                !startswith(s, ".#");
}
792
/* Textual names of the image types, indexed by ImageType value. */
static const char* const image_type_table[_IMAGE_TYPE_MAX] = {
        [IMAGE_DIRECTORY] = "directory",
        [IMAGE_SUBVOLUME] = "subvolume",
        [IMAGE_RAW] = "raw",
};

/* Instantiates the string lookup helpers for ImageType on top of the
 * table above. */
DEFINE_STRING_TABLE_LOOKUP(image_type, ImageType);