]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: use automatic cleanup
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "macro.h"
50 #include "audit.h"
51 #include "missing.h"
52 #include "cgroup-util.h"
53 #include "strv.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
56 #include "sd-id128.h"
57 #include "dev-setup.h"
58
/* How the journal directory of the container and the one of the host
 * are linked to each other (see --link-journal= in parse_argv()). */
typedef enum LinkJournal {
        LINK_NO,        /* "no" */
        LINK_AUTO,      /* "auto", the default */
        LINK_HOST,      /* "host" */
        LINK_GUEST      /* "guest" */
} LinkJournal;

/* Settings configured on the command line by parse_argv(). */
static char *arg_directory = NULL;
static char *arg_user = NULL;
static char **arg_controllers = NULL;
static char *arg_uuid = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static bool arg_boot = false;
static LinkJournal arg_link_journal = LINK_AUTO;

/* Bit mask (one bit per CAP_xxx value) of the capabilities that are
 * retained; drop_capabilities() drops the complement of this mask.
 * --capability= ORs additional bits into it. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_TTY_CONFIG);
98
/* Prints the usage text to stdout. Always returns 0. */
static int help(void) {

        /* Note: -j is handled in parse_argv() by selecting LINK_GUEST,
         * hence it is documented as --link-journal=guest here. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " -D --directory=NAME Root directory for the container\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
               " --uuid=UUID Set a specific machine UUID for the container\n"
               " --private-network Disable network in container\n"
               " --read-only Mount the root directory read-only\n"
               " --capability=CAP In addition to the default, retain specified capability\n"
               " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
               " -j Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
118
119 static int parse_argv(int argc, char *argv[]) {
120
121 enum {
122 ARG_PRIVATE_NETWORK = 0x100,
123 ARG_UUID,
124 ARG_READ_ONLY,
125 ARG_CAPABILITY,
126 ARG_LINK_JOURNAL
127 };
128
129 static const struct option options[] = {
130 { "help", no_argument, NULL, 'h' },
131 { "directory", required_argument, NULL, 'D' },
132 { "user", required_argument, NULL, 'u' },
133 { "controllers", required_argument, NULL, 'C' },
134 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
135 { "boot", no_argument, NULL, 'b' },
136 { "uuid", required_argument, NULL, ARG_UUID },
137 { "read-only", no_argument, NULL, ARG_READ_ONLY },
138 { "capability", required_argument, NULL, ARG_CAPABILITY },
139 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
140 { NULL, 0, NULL, 0 }
141 };
142
143 int c;
144
145 assert(argc >= 0);
146 assert(argv);
147
148 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
149
150 switch (c) {
151
152 case 'h':
153 help();
154 return 0;
155
156 case 'D':
157 free(arg_directory);
158 arg_directory = canonicalize_file_name(optarg);
159 if (!arg_directory) {
160 log_error("Failed to canonicalize root directory.");
161 return -ENOMEM;
162 }
163
164 break;
165
166 case 'u':
167 free(arg_user);
168 if (!(arg_user = strdup(optarg))) {
169 log_error("Failed to duplicate user name.");
170 return -ENOMEM;
171 }
172
173 break;
174
175 case 'C':
176 strv_free(arg_controllers);
177 arg_controllers = strv_split(optarg, ",");
178 if (!arg_controllers) {
179 log_error("Failed to split controllers list.");
180 return -ENOMEM;
181 }
182 strv_uniq(arg_controllers);
183
184 break;
185
186 case ARG_PRIVATE_NETWORK:
187 arg_private_network = true;
188 break;
189
190 case 'b':
191 arg_boot = true;
192 break;
193
194 case ARG_UUID:
195 arg_uuid = optarg;
196 break;
197
198 case ARG_READ_ONLY:
199 arg_read_only = true;
200 break;
201
202 case ARG_CAPABILITY: {
203 char *state, *word;
204 size_t length;
205
206 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
207 cap_value_t cap;
208 char *t;
209
210 t = strndup(word, length);
211 if (!t)
212 return log_oom();
213
214 if (cap_from_name(t, &cap) < 0) {
215 log_error("Failed to parse capability %s.", t);
216 free(t);
217 return -EINVAL;
218 }
219
220 free(t);
221 arg_retain |= 1ULL << (uint64_t) cap;
222 }
223
224 break;
225 }
226
227 case 'j':
228 arg_link_journal = LINK_GUEST;
229 break;
230
231 case ARG_LINK_JOURNAL:
232 if (streq(optarg, "auto"))
233 arg_link_journal = LINK_AUTO;
234 else if (streq(optarg, "no"))
235 arg_link_journal = LINK_NO;
236 else if (streq(optarg, "guest"))
237 arg_link_journal = LINK_GUEST;
238 else if (streq(optarg, "host"))
239 arg_link_journal = LINK_HOST;
240 else {
241 log_error("Failed to parse link journal mode %s", optarg);
242 return -EINVAL;
243 }
244
245 break;
246
247 case '?':
248 return -EINVAL;
249
250 default:
251 log_error("Unknown option code %c", c);
252 return -EINVAL;
253 }
254 }
255
256 return 1;
257 }
258
259 static int mount_all(const char *dest) {
260
261 typedef struct MountPoint {
262 const char *what;
263 const char *where;
264 const char *type;
265 const char *options;
266 unsigned long flags;
267 bool fatal;
268 } MountPoint;
269
270 static const MountPoint mount_table[] = {
271 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
272 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
273 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
274 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
275 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
276 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
277 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
278 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
279 #ifdef HAVE_SELINUX
280 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
281 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
282 #endif
283 };
284
285 unsigned k;
286 int r = 0;
287 char _cleanup_free_ *where = NULL;
288
289 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
290 int t;
291
292 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
293 log_oom();
294
295 if (r == 0)
296 r = -ENOMEM;
297
298 break;
299 }
300
301 t = path_is_mount_point(where, true);
302 if (t < 0) {
303 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
304
305 if (r == 0)
306 r = t;
307
308 continue;
309 }
310
311 /* Skip this entry if it is not a remount. */
312 if (mount_table[k].what && t > 0)
313 continue;
314
315 mkdir_p_label(where, 0755);
316
317 if (mount(mount_table[k].what,
318 where,
319 mount_table[k].type,
320 mount_table[k].flags,
321 mount_table[k].options) < 0 &&
322 mount_table[k].fatal) {
323
324 log_error("mount(%s) failed: %m", where);
325
326 if (r == 0)
327 r = -errno;
328 }
329 }
330
331 return r;
332 }
333
/* Makes the host's timezone configuration visible in the container
 * by bind mounting /etc/localtime and /etc/timezone read-only over the
 * files of the same name below 'dest'. Mount failures are ignored;
 * only running out of memory is reported as error. */
static int setup_timezone(const char *dest) {
        static const char * const files[] = {
                "/etc/localtime",
                "/etc/timezone"
        };
        unsigned i;

        assert(dest);

        /* Fix the timezone, if possible */
        for (i = 0; i < ELEMENTSOF(files); i++) {
                char *target;

                target = strappend(dest, files[i]);
                if (!target)
                        return log_oom();

                /* Bind mount first, then make it r/o */
                if (mount(files[i], target, "bind", MS_BIND, NULL) >= 0)
                        mount(files[i], target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
360
361 static int setup_resolv_conf(const char *dest) {
362 char *where;
363
364 assert(dest);
365
366 if (arg_private_network)
367 return 0;
368
369 /* Fix resolv.conf, if possible */
370 where = strappend(dest, "/etc/resolv.conf");
371 if (!where)
372 return log_oom();
373
374 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
375 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
376
377 free(where);
378
379 return 0;
380 }
381
382 static int setup_boot_id(const char *dest) {
383 char *from = NULL, *to = NULL;
384 sd_id128_t rnd;
385 char as_uuid[37];
386 int r;
387
388 assert(dest);
389
390 /* Generate a new randomized boot ID, so that each boot-up of
391 * the container gets a new one */
392
393 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
394 if (!from) {
395 r = log_oom();
396 goto finish;
397 }
398
399 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
400 if (!to) {
401 r = log_oom();
402 goto finish;
403 }
404
405 r = sd_id128_randomize(&rnd);
406 if (r < 0) {
407 log_error("Failed to generate random boot id: %s", strerror(-r));
408 goto finish;
409 }
410
411 snprintf(as_uuid, sizeof(as_uuid),
412 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
413 SD_ID128_FORMAT_VAL(rnd));
414 char_array_0(as_uuid);
415
416 r = write_one_line_file(from, as_uuid);
417 if (r < 0) {
418 log_error("Failed to write boot id: %s", strerror(-r));
419 goto finish;
420 }
421
422 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
423 log_error("Failed to bind mount boot id: %m");
424 r = -errno;
425 } else
426 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
427
428 unlink(from);
429
430 finish:
431 free(from);
432 free(to);
433
434 return r;
435 }
436
437 static int copy_devnodes(const char *dest) {
438
439 static const char devnodes[] =
440 "null\0"
441 "zero\0"
442 "full\0"
443 "random\0"
444 "urandom\0"
445 "tty\0"
446 "ptmx\0";
447
448 const char *d;
449 int r = 0;
450 mode_t u;
451
452 assert(dest);
453
454 u = umask(0000);
455
456 NULSTR_FOREACH(d, devnodes) {
457 struct stat st;
458 char *from = NULL, *to = NULL;
459
460 asprintf(&from, "/dev/%s", d);
461 asprintf(&to, "%s/dev/%s", dest, d);
462
463 if (!from || !to) {
464 log_error("Failed to allocate devnode path");
465
466 free(from);
467 free(to);
468
469 from = to = NULL;
470
471 if (r == 0)
472 r = -ENOMEM;
473
474 break;
475 }
476
477 if (stat(from, &st) < 0) {
478
479 if (errno != ENOENT) {
480 log_error("Failed to stat %s: %m", from);
481 if (r == 0)
482 r = -errno;
483 }
484
485 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
486
487 log_error("%s is not a char or block device, cannot copy.", from);
488 if (r == 0)
489 r = -EIO;
490
491 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
492
493 log_error("mknod(%s) failed: %m", dest);
494 if (r == 0)
495 r = -errno;
496 }
497
498 free(from);
499 free(to);
500 }
501
502 umask(u);
503
504 return r;
505 }
506
/* Sets up <dest>/dev/console: the TTY 'console' is verified to be a
 * character device, its access mode and ownership are set to 0600 and
 * 0:0, a device node is created as <dest>/dev/console and finally
 * 'console' is bind mounted over that node.
 *
 * Returns 0 on success, a negative errno-style value on failure. The
 * umask is cleared while this runs and restored before returning. */
static int setup_dev_console(const char *dest, const char *console) {
        char *node = NULL;
        struct stat tty_st;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &tty_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto out;
        }

        if (!S_ISCHR(tty_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto out;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto out;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                r = log_oom();
                goto out;
        }

        /* We need to bind mount the right tty to /dev/console since
         * ptys can only exist on pts file systems. To have something
         * to bind mount things on we create a device node first, that
         * has the right major/minor (note that the major minor
         * doesn't actually matter here, since we mount it over
         * anyway). */

        if (mknod(node, (tty_st.st_mode & ~07777) | 0600, tty_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto out;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
        }

        /* On success r is still 0 from chmod_and_chown() above */

out:
        free(node);
        umask(saved_umask);

        return r;
}
565
566 static int setup_kmsg(const char *dest, int kmsg_socket) {
567 char *from = NULL, *to = NULL;
568 int r, fd, k;
569 mode_t u;
570 union {
571 struct cmsghdr cmsghdr;
572 uint8_t buf[CMSG_SPACE(sizeof(int))];
573 } control;
574 struct msghdr mh;
575 struct cmsghdr *cmsg;
576
577 assert(dest);
578 assert(kmsg_socket >= 0);
579
580 u = umask(0000);
581
582 /* We create the kmsg FIFO as /dev/kmsg, but immediately
583 * delete it after bind mounting it to /proc/kmsg. While FIFOs
584 * on the reading side behave very similar to /proc/kmsg,
585 * their writing side behaves differently from /dev/kmsg in
586 * that writing blocks when nothing is reading. In order to
587 * avoid any problems with containers deadlocking due to this
588 * we simply make /dev/kmsg unavailable to the container. */
589 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
590 r = log_oom();
591 goto finish;
592 }
593
594 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
595 r = log_oom();
596 goto finish;
597 }
598
599 if (mkfifo(from, 0600) < 0) {
600 log_error("mkfifo() for /dev/kmsg failed: %m");
601 r = -errno;
602 goto finish;
603 }
604
605 r = chmod_and_chown(from, 0600, 0, 0);
606 if (r < 0) {
607 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
608 goto finish;
609 }
610
611 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
612 log_error("Bind mount for /proc/kmsg failed: %m");
613 r = -errno;
614 goto finish;
615 }
616
617 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
618 if (fd < 0) {
619 log_error("Failed to open fifo: %m");
620 r = -errno;
621 goto finish;
622 }
623
624 zero(mh);
625 zero(control);
626
627 mh.msg_control = &control;
628 mh.msg_controllen = sizeof(control);
629
630 cmsg = CMSG_FIRSTHDR(&mh);
631 cmsg->cmsg_level = SOL_SOCKET;
632 cmsg->cmsg_type = SCM_RIGHTS;
633 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
634 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
635
636 mh.msg_controllen = cmsg->cmsg_len;
637
638 /* Store away the fd in the socket, so that it stays open as
639 * long as we run the child */
640 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
641 close_nointr_nofail(fd);
642
643 if (k < 0) {
644 log_error("Failed to send FIFO fd: %m");
645 r = -errno;
646 goto finish;
647 }
648
649 /* And now make the FIFO unavailable as /dev/kmsg... */
650 unlink(from);
651
652 finish:
653 free(from);
654 free(to);
655 umask(u);
656
657 return r;
658 }
659
660 static int setup_hostname(void) {
661 char *hn;
662 int r = 0;
663
664 hn = path_get_file_name(arg_directory);
665 if (hn) {
666 hn = strdup(hn);
667 if (!hn)
668 return -ENOMEM;
669
670 hostname_cleanup(hn);
671
672 if (!isempty(hn))
673 if (sethostname(hn, strlen(hn)) < 0)
674 r = -errno;
675
676 free(hn);
677 }
678
679 return r;
680 }
681
682 static int setup_journal(const char *directory) {
683 sd_id128_t machine_id;
684 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
685 int r;
686
687 if (arg_link_journal == LINK_NO)
688 return 0;
689
690 p = strappend(directory, "/etc/machine-id");
691 if (!p) {
692 r = log_oom();
693 goto finish;
694 }
695
696 r = read_one_line_file(p, &b);
697 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
698 r = 0;
699 goto finish;
700 } else if (r < 0) {
701 log_error("Failed to read machine ID: %s", strerror(-r));
702 return r;
703 }
704
705 l = strstrip(b);
706 if (isempty(l) && arg_link_journal == LINK_AUTO) {
707 r = 0;
708 goto finish;
709 }
710
711 /* Verify validaty */
712 r = sd_id128_from_string(l, &machine_id);
713 if (r < 0) {
714 log_error("Failed to parse machine ID: %s", strerror(-r));
715 goto finish;
716 }
717
718 free(p);
719 p = strappend("/var/log/journal/", l);
720 q = strjoin(directory, "/var/log/journal/", l, NULL);
721 if (!p || !q) {
722 r = log_oom();
723 goto finish;
724 }
725
726 if (path_is_mount_point(p, false) > 0 ||
727 path_is_mount_point(q, false) > 0) {
728 if (arg_link_journal != LINK_AUTO) {
729 log_error("Journal already a mount point, refusing.");
730 r = -EEXIST;
731 goto finish;
732 }
733
734 r = 0;
735 goto finish;
736 }
737
738 r = readlink_and_make_absolute(p, &d);
739 if (r >= 0) {
740 if ((arg_link_journal == LINK_GUEST ||
741 arg_link_journal == LINK_AUTO) &&
742 path_equal(d, q)) {
743
744 mkdir_p(q, 0755);
745
746 r = 0;
747 goto finish;
748 }
749
750 if (unlink(p) < 0) {
751 log_error("Failed to remove symlink %s: %m", p);
752 r = -errno;
753 goto finish;
754 }
755 } else if (r == -EINVAL) {
756
757 if (arg_link_journal == LINK_GUEST &&
758 rmdir(p) < 0) {
759
760 if (errno == ENOTDIR)
761 log_error("%s already exists and is neither symlink nor directory.", p);
762 else {
763 log_error("Failed to remove %s: %m", p);
764 r = -errno;
765 }
766
767 goto finish;
768 }
769 } else if (r != -ENOENT) {
770 log_error("readlink(%s) failed: %m", p);
771 goto finish;
772 }
773
774 if (arg_link_journal == LINK_GUEST) {
775
776 if (symlink(q, p) < 0) {
777 log_error("Failed to symlink %s to %s: %m", q, p);
778 r = -errno;
779 goto finish;
780 }
781
782 mkdir_p(q, 0755);
783
784 r = 0;
785 goto finish;
786 }
787
788 if (arg_link_journal == LINK_HOST) {
789 r = mkdir_p(p, 0755);
790 if (r < 0) {
791 log_error("Failed to create %s: %m", p);
792 goto finish;
793 }
794
795 } else if (access(p, F_OK) < 0) {
796 r = 0;
797 goto finish;
798 }
799
800 if (dir_is_empty(q) == 0) {
801 log_error("%s not empty.", q);
802 r = -ENOTEMPTY;
803 goto finish;
804 }
805
806 r = mkdir_p(q, 0755);
807 if (r < 0) {
808 log_error("Failed to create %s: %m", q);
809 goto finish;
810 }
811
812 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
813 log_error("Failed to bind mount journal from host into guest: %m");
814 r = -errno;
815 goto finish;
816 }
817
818 r = 0;
819
820 finish:
821 free(p);
822 free(q);
823 free(d);
824 free(b);
825 return r;
826
827 }
828
829 static int drop_capabilities(void) {
830 return capability_bounding_set_drop(~arg_retain, false);
831 }
832
/* Checks whether 'path' looks like the root directory of an OS
 * installation.
 *
 * Returns 1 if <path>/bin/sh exists, 0 if it does not, and -ENOMEM if
 * the path could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell;
        int found;

        /* We use /bin/sh as flag file if something is an OS */
        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
846
847 static int process_pty(int master, sigset_t *mask) {
848
849 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
850 size_t in_buffer_full = 0, out_buffer_full = 0;
851 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
852 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
853 int ep = -1, signal_fd = -1, r;
854
855 fd_nonblock(STDIN_FILENO, 1);
856 fd_nonblock(STDOUT_FILENO, 1);
857 fd_nonblock(master, 1);
858
859 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
860 if (signal_fd < 0) {
861 log_error("signalfd(): %m");
862 r = -errno;
863 goto finish;
864 }
865
866 ep = epoll_create1(EPOLL_CLOEXEC);
867 if (ep < 0) {
868 log_error("Failed to create epoll: %m");
869 r = -errno;
870 goto finish;
871 }
872
873 zero(stdin_ev);
874 stdin_ev.events = EPOLLIN|EPOLLET;
875 stdin_ev.data.fd = STDIN_FILENO;
876
877 zero(stdout_ev);
878 stdout_ev.events = EPOLLOUT|EPOLLET;
879 stdout_ev.data.fd = STDOUT_FILENO;
880
881 zero(master_ev);
882 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
883 master_ev.data.fd = master;
884
885 zero(signal_ev);
886 signal_ev.events = EPOLLIN;
887 signal_ev.data.fd = signal_fd;
888
889 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
890 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
891 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
892 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
893 log_error("Failed to regiser fds in epoll: %m");
894 r = -errno;
895 goto finish;
896 }
897
898 for (;;) {
899 struct epoll_event ev[16];
900 ssize_t k;
901 int i, nfds;
902
903 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
904 if (nfds < 0) {
905
906 if (errno == EINTR || errno == EAGAIN)
907 continue;
908
909 log_error("epoll_wait(): %m");
910 r = -errno;
911 goto finish;
912 }
913
914 assert(nfds >= 1);
915
916 for (i = 0; i < nfds; i++) {
917 if (ev[i].data.fd == STDIN_FILENO) {
918
919 if (ev[i].events & (EPOLLIN|EPOLLHUP))
920 stdin_readable = true;
921
922 } else if (ev[i].data.fd == STDOUT_FILENO) {
923
924 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
925 stdout_writable = true;
926
927 } else if (ev[i].data.fd == master) {
928
929 if (ev[i].events & (EPOLLIN|EPOLLHUP))
930 master_readable = true;
931
932 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
933 master_writable = true;
934
935 } else if (ev[i].data.fd == signal_fd) {
936 struct signalfd_siginfo sfsi;
937 ssize_t n;
938
939 n = read(signal_fd, &sfsi, sizeof(sfsi));
940 if (n != sizeof(sfsi)) {
941
942 if (n >= 0) {
943 log_error("Failed to read from signalfd: invalid block size");
944 r = -EIO;
945 goto finish;
946 }
947
948 if (errno != EINTR && errno != EAGAIN) {
949 log_error("Failed to read from signalfd: %m");
950 r = -errno;
951 goto finish;
952 }
953 } else {
954
955 if (sfsi.ssi_signo == SIGWINCH) {
956 struct winsize ws;
957
958 /* The window size changed, let's forward that. */
959 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
960 ioctl(master, TIOCSWINSZ, &ws);
961 } else {
962 r = 0;
963 goto finish;
964 }
965 }
966 }
967 }
968
969 while ((stdin_readable && in_buffer_full <= 0) ||
970 (master_writable && in_buffer_full > 0) ||
971 (master_readable && out_buffer_full <= 0) ||
972 (stdout_writable && out_buffer_full > 0)) {
973
974 if (stdin_readable && in_buffer_full < LINE_MAX) {
975
976 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
977 if (k < 0) {
978
979 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
980 stdin_readable = false;
981 else {
982 log_error("read(): %m");
983 r = -errno;
984 goto finish;
985 }
986 } else
987 in_buffer_full += (size_t) k;
988 }
989
990 if (master_writable && in_buffer_full > 0) {
991
992 k = write(master, in_buffer, in_buffer_full);
993 if (k < 0) {
994
995 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
996 master_writable = false;
997 else {
998 log_error("write(): %m");
999 r = -errno;
1000 goto finish;
1001 }
1002
1003 } else {
1004 assert(in_buffer_full >= (size_t) k);
1005 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1006 in_buffer_full -= k;
1007 }
1008 }
1009
1010 if (master_readable && out_buffer_full < LINE_MAX) {
1011
1012 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1013 if (k < 0) {
1014
1015 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1016 master_readable = false;
1017 else {
1018 log_error("read(): %m");
1019 r = -errno;
1020 goto finish;
1021 }
1022 } else
1023 out_buffer_full += (size_t) k;
1024 }
1025
1026 if (stdout_writable && out_buffer_full > 0) {
1027
1028 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1029 if (k < 0) {
1030
1031 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1032 stdout_writable = false;
1033 else {
1034 log_error("write(): %m");
1035 r = -errno;
1036 goto finish;
1037 }
1038
1039 } else {
1040 assert(out_buffer_full >= (size_t) k);
1041 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1042 out_buffer_full -= k;
1043 }
1044 }
1045 }
1046 }
1047
1048 finish:
1049 if (ep >= 0)
1050 close_nointr_nofail(ep);
1051
1052 if (signal_fd >= 0)
1053 close_nointr_nofail(signal_fd);
1054
1055 return r;
1056 }
1057
1058 int main(int argc, char *argv[]) {
1059 pid_t pid = 0;
1060 int r = EXIT_FAILURE, k;
1061 char *oldcg = NULL, *newcg = NULL;
1062 char **controller = NULL;
1063 int master = -1;
1064 const char *console = NULL;
1065 struct termios saved_attr, raw_attr;
1066 sigset_t mask;
1067 bool saved_attr_valid = false;
1068 struct winsize ws;
1069 int kmsg_socket_pair[2] = { -1, -1 };
1070
1071 log_parse_environment();
1072 log_open();
1073
1074 r = parse_argv(argc, argv);
1075 if (r <= 0)
1076 goto finish;
1077
1078 if (arg_directory) {
1079 char *p;
1080
1081 p = path_make_absolute_cwd(arg_directory);
1082 free(arg_directory);
1083 arg_directory = p;
1084 } else
1085 arg_directory = get_current_dir_name();
1086
1087 if (!arg_directory) {
1088 log_error("Failed to determine path");
1089 goto finish;
1090 }
1091
1092 path_kill_slashes(arg_directory);
1093
1094 if (geteuid() != 0) {
1095 log_error("Need to be root.");
1096 goto finish;
1097 }
1098
1099 if (sd_booted() <= 0) {
1100 log_error("Not running on a systemd system.");
1101 goto finish;
1102 }
1103
1104 if (path_equal(arg_directory, "/")) {
1105 log_error("Spawning container on root directory not supported.");
1106 goto finish;
1107 }
1108
1109 if (is_os_tree(arg_directory) <= 0) {
1110 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1111 goto finish;
1112 }
1113
1114 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1115 if (k < 0) {
1116 log_error("Failed to determine current cgroup: %s", strerror(-k));
1117 goto finish;
1118 }
1119
1120 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1121 log_error("Failed to allocate cgroup path.");
1122 goto finish;
1123 }
1124
1125 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1126 if (k < 0) {
1127 log_error("Failed to create cgroup: %s", strerror(-k));
1128 goto finish;
1129 }
1130
1131 STRV_FOREACH(controller, arg_controllers) {
1132 k = cg_create_and_attach(*controller, newcg, 0);
1133 if (k < 0)
1134 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1135 }
1136
1137 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1138 if (master < 0) {
1139 log_error("Failed to acquire pseudo tty: %m");
1140 goto finish;
1141 }
1142
1143 console = ptsname(master);
1144 if (!console) {
1145 log_error("Failed to determine tty name: %m");
1146 goto finish;
1147 }
1148
1149 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1150
1151 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1152 ioctl(master, TIOCSWINSZ, &ws);
1153
1154 if (unlockpt(master) < 0) {
1155 log_error("Failed to unlock tty: %m");
1156 goto finish;
1157 }
1158
1159 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1160 log_error("Failed to get terminal attributes: %m");
1161 goto finish;
1162 }
1163
1164 saved_attr_valid = true;
1165
1166 raw_attr = saved_attr;
1167 cfmakeraw(&raw_attr);
1168 raw_attr.c_lflag &= ~ECHO;
1169
1170 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1171 log_error("Failed to create kmsg socket pair");
1172 goto finish;
1173 }
1174
1175 assert_se(sigemptyset(&mask) == 0);
1176 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1177 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1178
1179 for (;;) {
1180 siginfo_t status;
1181
1182 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1183 log_error("Failed to set terminal attributes: %m");
1184 goto finish;
1185 }
1186
1187 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1188 if (pid < 0) {
1189 if (errno == EINVAL)
1190 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1191 else
1192 log_error("clone() failed: %m");
1193
1194 goto finish;
1195 }
1196
1197 if (pid == 0) {
1198 /* child */
1199
1200 const char *home = NULL;
1201 uid_t uid = (uid_t) -1;
1202 gid_t gid = (gid_t) -1;
1203 const char *envp[] = {
1204 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1205 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1206 NULL, /* TERM */
1207 NULL, /* HOME */
1208 NULL, /* USER */
1209 NULL, /* LOGNAME */
1210 NULL, /* container_uuid */
1211 NULL
1212 };
1213
1214 envp[2] = strv_find_prefix(environ, "TERM=");
1215
1216 close_nointr_nofail(master);
1217
1218 close_nointr(STDIN_FILENO);
1219 close_nointr(STDOUT_FILENO);
1220 close_nointr(STDERR_FILENO);
1221
1222 close_all_fds(&kmsg_socket_pair[1], 1);
1223
1224 reset_all_signal_handlers();
1225
1226 assert_se(sigemptyset(&mask) == 0);
1227 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1228
1229 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1230 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1231 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1232 goto child_fail;
1233
1234 if (setsid() < 0) {
1235 log_error("setsid() failed: %m");
1236 goto child_fail;
1237 }
1238
1239 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1240 log_error("PR_SET_PDEATHSIG failed: %m");
1241 goto child_fail;
1242 }
1243
1244 /* Mark everything as slave, so that we still
1245 * receive mounts from the real root, but don't
1246 * propagate mounts to the real root. */
1247 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1248 log_error("MS_SLAVE|MS_REC failed: %m");
1249 goto child_fail;
1250 }
1251
1252 /* Turn directory into bind mount */
1253 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1254 log_error("Failed to make bind mount.");
1255 goto child_fail;
1256 }
1257
1258 if (arg_read_only)
1259 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1260 log_error("Failed to make read-only.");
1261 goto child_fail;
1262 }
1263
1264 if (mount_all(arg_directory) < 0)
1265 goto child_fail;
1266
1267 if (copy_devnodes(arg_directory) < 0)
1268 goto child_fail;
1269
1270 dev_setup(arg_directory);
1271
1272 if (setup_dev_console(arg_directory, console) < 0)
1273 goto child_fail;
1274
1275 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1276 goto child_fail;
1277
1278 close_nointr_nofail(kmsg_socket_pair[1]);
1279
1280 if (setup_boot_id(arg_directory) < 0)
1281 goto child_fail;
1282
1283 if (setup_timezone(arg_directory) < 0)
1284 goto child_fail;
1285
1286 if (setup_resolv_conf(arg_directory) < 0)
1287 goto child_fail;
1288
1289 if (setup_journal(arg_directory) < 0)
1290 goto child_fail;
1291
1292 if (chdir(arg_directory) < 0) {
1293 log_error("chdir(%s) failed: %m", arg_directory);
1294 goto child_fail;
1295 }
1296
1297 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1298 log_error("mount(MS_MOVE) failed: %m");
1299 goto child_fail;
1300 }
1301
1302 if (chroot(".") < 0) {
1303 log_error("chroot() failed: %m");
1304 goto child_fail;
1305 }
1306
1307 if (chdir("/") < 0) {
1308 log_error("chdir() failed: %m");
1309 goto child_fail;
1310 }
1311
1312 umask(0022);
1313
1314 loopback_setup();
1315
1316 if (drop_capabilities() < 0) {
1317 log_error("drop_capabilities() failed: %m");
1318 goto child_fail;
1319 }
1320
1321 if (arg_user) {
1322
1323 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1324 log_error("get_user_creds() failed: %m");
1325 goto child_fail;
1326 }
1327
1328 if (mkdir_parents_label(home, 0775) < 0) {
1329 log_error("mkdir_parents_label() failed: %m");
1330 goto child_fail;
1331 }
1332
1333 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1334 log_error("mkdir_safe_label() failed: %m");
1335 goto child_fail;
1336 }
1337
1338 if (initgroups((const char*)arg_user, gid) < 0) {
1339 log_error("initgroups() failed: %m");
1340 goto child_fail;
1341 }
1342
1343 if (setresgid(gid, gid, gid) < 0) {
1344 log_error("setregid() failed: %m");
1345 goto child_fail;
1346 }
1347
1348 if (setresuid(uid, uid, uid) < 0) {
1349 log_error("setreuid() failed: %m");
1350 goto child_fail;
1351 }
1352 }
1353
1354 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1355 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1356 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1357 log_oom();
1358 goto child_fail;
1359 }
1360
1361 if (arg_uuid) {
1362 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1363 log_oom();
1364 goto child_fail;
1365 }
1366 }
1367
1368 setup_hostname();
1369
1370 if (arg_boot) {
1371 char **a;
1372 size_t l;
1373
1374 /* Automatically search for the init system */
1375
1376 l = 1 + argc - optind;
1377 a = newa(char*, l + 1);
1378 memcpy(a + 1, argv + optind, l * sizeof(char*));
1379
1380 a[0] = (char*) "/usr/lib/systemd/systemd";
1381 execve(a[0], a, (char**) envp);
1382
1383 a[0] = (char*) "/lib/systemd/systemd";
1384 execve(a[0], a, (char**) envp);
1385
1386 a[0] = (char*) "/sbin/init";
1387 execve(a[0], a, (char**) envp);
1388 } else if (argc > optind)
1389 execvpe(argv[optind], argv + optind, (char**) envp);
1390 else {
1391 chdir(home ? home : "/root");
1392 execle("/bin/bash", "-bash", NULL, (char**) envp);
1393 }
1394
1395 log_error("execv() failed: %m");
1396
1397 child_fail:
1398 _exit(EXIT_FAILURE);
1399 }
1400
1401 if (process_pty(master, &mask) < 0)
1402 goto finish;
1403
1404
1405 if (saved_attr_valid)
1406 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1407
1408 r = wait_for_terminate(pid, &status);
1409 if (r < 0) {
1410 r = EXIT_FAILURE;
1411 break;
1412 }
1413
1414 if (status.si_code == CLD_EXITED) {
1415 if (status.si_status != 0) {
1416 log_error("Container failed with error code %i.", status.si_status);
1417 r = status.si_status;
1418 break;
1419 }
1420
1421 log_debug("Container exited successfully.");
1422 break;
1423 } else if (status.si_code == CLD_KILLED &&
1424 status.si_status == SIGINT) {
1425 log_info("Container has been shut down.");
1426 r = 0;
1427 break;
1428 } else if (status.si_code == CLD_KILLED &&
1429 status.si_status == SIGHUP) {
1430 log_info("Container is being rebooted.");
1431 continue;
1432 } else if (status.si_code == CLD_KILLED ||
1433 status.si_code == CLD_DUMPED) {
1434
1435 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1436 r = EXIT_FAILURE;
1437 break;
1438 } else {
1439 log_error("Container failed due to unknown reason.");
1440 r = EXIT_FAILURE;
1441 break;
1442 }
1443 }
1444
1445 finish:
1446 if (saved_attr_valid)
1447 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1448
1449 if (master >= 0)
1450 close_nointr_nofail(master);
1451
1452 close_pipe(kmsg_socket_pair);
1453
1454 if (oldcg)
1455 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1456
1457 if (newcg)
1458 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1459
1460 free(arg_directory);
1461 strv_free(arg_controllers);
1462 free(oldcg);
1463 free(newcg);
1464
1465 return r;
1466 }