]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: introduce new --link-journal= switch to link container journals into host
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55 #include "sd-id128.h"
56
/* How the journal directory of the container is connected to the one of
 * the host; selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,         /* --link-journal=no */
        LINK_AUTO  = 1,         /* --link-journal=auto, the default */
        LINK_HOST  = 2,         /* --link-journal=host */
        LINK_GUEST = 3          /* --link-journal=guest, also set by -j */
} LinkJournal;

/* Command line settings, filled in by parse_argv() */
static char *arg_directory = NULL;                  /* -D / --directory= */
static char *arg_user = NULL;                       /* -u / --user= */
static char **arg_controllers = NULL;               /* -C / --controllers= */
static char *arg_uuid = NULL;                       /* --uuid= */
static bool arg_private_network = false;            /* --private-network */
static bool arg_read_only = false;                  /* --read-only */
static bool arg_boot = false;                       /* -b / --boot */
static LinkJournal arg_link_journal = LINK_AUTO;    /* --link-journal= */

/* Bit mask (one bit per CAP_xxx value) of the capabilities that are
 * retained; --capability= adds further bits. drop_capabilities() drops
 * everything that is not listed here from the bounding set. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE);
95
96 static int help(void) {
97
98 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
99 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
100 " -h --help Show this help\n"
101 " -D --directory=NAME Root directory for the container\n"
102 " -b --boot Boot up full system (i.e. invoke init)\n"
103 " -u --user=USER Run the command under specified user or uid\n"
104 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
105 " --uuid=UUID Set a specific machine UUID for the container\n"
106 " --private-network Disable network in container\n"
107 " --read-only Mount the root directory read-only\n"
108 " --capability=CAP In addition to the default, retain specified capability\n"
109 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
110 " -j Equivalent to --link-journal=host\n",
111 program_invocation_short_name);
112
113 return 0;
114 }
115
116 static int parse_argv(int argc, char *argv[]) {
117
118 enum {
119 ARG_PRIVATE_NETWORK = 0x100,
120 ARG_UUID,
121 ARG_READ_ONLY,
122 ARG_CAPABILITY,
123 ARG_LINK_JOURNAL
124 };
125
126 static const struct option options[] = {
127 { "help", no_argument, NULL, 'h' },
128 { "directory", required_argument, NULL, 'D' },
129 { "user", required_argument, NULL, 'u' },
130 { "controllers", required_argument, NULL, 'C' },
131 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
132 { "boot", no_argument, NULL, 'b' },
133 { "uuid", required_argument, NULL, ARG_UUID },
134 { "read-only", no_argument, NULL, ARG_READ_ONLY },
135 { "capability", required_argument, NULL, ARG_CAPABILITY },
136 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
137 { NULL, 0, NULL, 0 }
138 };
139
140 int c;
141
142 assert(argc >= 0);
143 assert(argv);
144
145 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
146
147 switch (c) {
148
149 case 'h':
150 help();
151 return 0;
152
153 case 'D':
154 free(arg_directory);
155 arg_directory = canonicalize_file_name(optarg);
156 if (!arg_directory) {
157 log_error("Failed to canonicalize root directory.");
158 return -ENOMEM;
159 }
160
161 break;
162
163 case 'u':
164 free(arg_user);
165 if (!(arg_user = strdup(optarg))) {
166 log_error("Failed to duplicate user name.");
167 return -ENOMEM;
168 }
169
170 break;
171
172 case 'C':
173 strv_free(arg_controllers);
174 arg_controllers = strv_split(optarg, ",");
175 if (!arg_controllers) {
176 log_error("Failed to split controllers list.");
177 return -ENOMEM;
178 }
179 strv_uniq(arg_controllers);
180
181 break;
182
183 case ARG_PRIVATE_NETWORK:
184 arg_private_network = true;
185 break;
186
187 case 'b':
188 arg_boot = true;
189 break;
190
191 case ARG_UUID:
192 arg_uuid = optarg;
193 break;
194
195 case ARG_READ_ONLY:
196 arg_read_only = true;
197 break;
198
199 case ARG_CAPABILITY: {
200 char *state, *word;
201 size_t length;
202
203 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
204 cap_value_t cap;
205 char *t;
206
207 t = strndup(word, length);
208 if (!t) {
209 log_error("Out of memory.");
210 return -ENOMEM;
211 }
212
213 if (cap_from_name(t, &cap) < 0) {
214 log_error("Failed to parse capability %s.", t);
215 free(t);
216 return -EINVAL;
217 }
218
219 free(t);
220 arg_retain |= 1ULL << (uint64_t) cap;
221 }
222
223 break;
224 }
225
226 case 'j':
227 arg_link_journal = LINK_GUEST;
228 break;
229
230 case ARG_LINK_JOURNAL:
231 if (streq(optarg, "auto"))
232 arg_link_journal = LINK_AUTO;
233 else if (streq(optarg, "no"))
234 arg_link_journal = LINK_NO;
235 else if (streq(optarg, "guest"))
236 arg_link_journal = LINK_GUEST;
237 else if (streq(optarg, "host"))
238 arg_link_journal = LINK_HOST;
239 else {
240 log_error("Failed to parse link journal mode %s", optarg);
241 return -EINVAL;
242 }
243
244 break;
245
246 case '?':
247 return -EINVAL;
248
249 default:
250 log_error("Unknown option code %c", c);
251 return -EINVAL;
252 }
253 }
254
255 return 1;
256 }
257
/* Establishes the mounts listed in the table below underneath "dest".
 *
 * All entries are tried even if an earlier one failed (only running out
 * of memory stops the loop). Returns 0 on success, otherwise the first
 * error that was recorded as negative errno-style value. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;             /* if false, a failing mount() is ignored */
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND,                      true  }, /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  }, /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                      true  }, /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  }, /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false }, /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
#endif
        };

        unsigned i;
        int result = 0;

        for (i = 0; i < ELEMENTSOF(mount_table); i++) {
                const MountPoint *m = mount_table + i;
                char *target;
                int mp;

                if (asprintf(&target, "%s/%s", dest, m->where) < 0) {
                        log_error("Out of memory");

                        if (result == 0)
                                result = -ENOMEM;

                        break;
                }

                mp = path_is_mount_point(target, false);
                if (mp < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-mp));
                        free(target);

                        if (result == 0)
                                result = mp;

                        continue;
                }

                /* Make sure there is something to mount onto */
                mkdir_p_label(target, 0755);

                if (mount(m->what, target, m->type, m->flags, m->options) < 0) {

                        /* Failures of optional entries are silently ignored */
                        if (m->fatal) {
                                log_error("mount(%s) failed: %m", target);

                                if (result == 0)
                                        result = -errno;
                        }
                }

                free(target);
        }

        return result;
}
331
/* Bind mounts /etc/localtime and /etc/timezone of the host read-only onto
 * the files of the same name below "dest". This is best effort: failing
 * mounts are ignored. Returns 0, or -ENOMEM if a path could not be
 * allocated. */
static int setup_timezone(const char *dest) {
        static const char * const host_files[] = {
                "/etc/localtime",
                "/etc/timezone"
        };
        size_t i;

        assert(dest);

        /* Fix the timezone, if possible */
        for (i = 0; i < sizeof(host_files) / sizeof(host_files[0]); i++) {
                char *target;

                if (asprintf(&target, "%s%s", dest, host_files[i]) < 0) {
                        log_error("Out of memory");
                        return -ENOMEM;
                }

                /* First the plain bind mount, and only if that worked the
                 * remount that turns it read-only */
                if (mount(host_files[i], target, "bind", MS_BIND, NULL) >= 0)
                        mount(host_files[i], target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
360
361 static int setup_resolv_conf(const char *dest) {
362 char *where;
363
364 assert(dest);
365
366 if (arg_private_network)
367 return 0;
368
369 /* Fix resolv.conf, if possible */
370 if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
371 log_error("Out of memory");
372 return -ENOMEM;
373 }
374
375 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
376 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
377
378 free(where);
379
380 return 0;
381 }
382
383 static int copy_devnodes(const char *dest) {
384
385 static const char devnodes[] =
386 "null\0"
387 "zero\0"
388 "full\0"
389 "random\0"
390 "urandom\0"
391 "tty\0"
392 "ptmx\0"
393 "rtc0\0";
394
395 const char *d;
396 int r = 0;
397 mode_t u;
398
399 assert(dest);
400
401 u = umask(0000);
402
403 NULSTR_FOREACH(d, devnodes) {
404 struct stat st;
405 char *from = NULL, *to = NULL;
406
407 asprintf(&from, "/dev/%s", d);
408 asprintf(&to, "%s/dev/%s", dest, d);
409
410 if (!from || !to) {
411 log_error("Failed to allocate devnode path");
412
413 free(from);
414 free(to);
415
416 from = to = NULL;
417
418 if (r == 0)
419 r = -ENOMEM;
420
421 break;
422 }
423
424 if (stat(from, &st) < 0) {
425
426 if (errno != ENOENT) {
427 log_error("Failed to stat %s: %m", from);
428 if (r == 0)
429 r = -errno;
430 }
431
432 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
433
434 log_error("%s is not a char or block device, cannot copy.", from);
435 if (r == 0)
436 r = -EIO;
437
438 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
439
440 log_error("mknod(%s) failed: %m", dest);
441 if (r == 0)
442 r = -errno;
443 }
444
445 free(from);
446 free(to);
447 }
448
449 umask(u);
450
451 return r;
452 }
453
/* Makes the TTY "console" available as "dest"/dev/console.
 *
 * The TTY is set to mode 0600, owner 0:0, a device node is created at the
 * target path, and then the TTY is bind mounted on top of it. Returns a
 * negative errno-style value on failure, otherwise the (non-negative)
 * result of chmod_and_chown(). */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat st;
        char *node = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        /* We need to bind mount the right tty to /dev/console since
         * ptys can only exist on pts file systems. To have something
         * to bind mount things on we create a device node first, that
         * has the right major/minor (note that the major minor
         * doesn't actually matter here, since we mount it over
         * anyway). */

        if (mknod(node, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
513
514 static int setup_kmsg(const char *dest, int kmsg_socket) {
515 char *from = NULL, *to = NULL;
516 int r, fd, k;
517 mode_t u;
518 union {
519 struct cmsghdr cmsghdr;
520 uint8_t buf[CMSG_SPACE(sizeof(int))];
521 } control;
522 struct msghdr mh;
523 struct cmsghdr *cmsg;
524
525 assert(dest);
526 assert(kmsg_socket >= 0);
527
528 u = umask(0000);
529
530 /* We create the kmsg FIFO as /dev/kmsg, but immediately
531 * delete it after bind mounting it to /proc/kmsg. While FIFOs
532 * on the reading side behave very similar to /proc/kmsg,
533 * their writing side behaves differently from /dev/kmsg in
534 * that writing blocks when nothing is reading. In order to
535 * avoid any problems with containers deadlocking due to this
536 * we simply make /dev/kmsg unavailable to the container. */
537 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
538 log_error("Out of memory");
539 r = -ENOMEM;
540 goto finish;
541 }
542
543 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
544 log_error("Out of memory");
545 r = -ENOMEM;
546 goto finish;
547 }
548
549 if (mkfifo(from, 0600) < 0) {
550 log_error("mkfifo() for /dev/kmsg failed: %m");
551 r = -errno;
552 goto finish;
553 }
554
555 r = chmod_and_chown(from, 0600, 0, 0);
556 if (r < 0) {
557 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
558 goto finish;
559 }
560
561 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
562 log_error("Bind mount for /proc/kmsg failed: %m");
563 r = -errno;
564 goto finish;
565 }
566
567 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
568 if (fd < 0) {
569 log_error("Failed to open fifo: %m");
570 r = -errno;
571 goto finish;
572 }
573
574 zero(mh);
575 zero(control);
576
577 mh.msg_control = &control;
578 mh.msg_controllen = sizeof(control);
579
580 cmsg = CMSG_FIRSTHDR(&mh);
581 cmsg->cmsg_level = SOL_SOCKET;
582 cmsg->cmsg_type = SCM_RIGHTS;
583 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
584 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
585
586 mh.msg_controllen = cmsg->cmsg_len;
587
588 /* Store away the fd in the socket, so that it stays open as
589 * long as we run the child */
590 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
591 close_nointr_nofail(fd);
592
593 if (k < 0) {
594 log_error("Failed to send FIFO fd: %m");
595 r = -errno;
596 goto finish;
597 }
598
599 /* And now make the FIFO unavailable as /dev/kmsg... */
600 unlink(from);
601
602 finish:
603 free(from);
604 free(to);
605 umask(u);
606
607 return r;
608 }
609
610 static int setup_hostname(void) {
611 char *hn;
612 int r = 0;
613
614 hn = path_get_file_name(arg_directory);
615 if (hn) {
616 hn = strdup(hn);
617 if (!hn)
618 return -ENOMEM;
619
620 hostname_cleanup(hn);
621
622 if (!isempty(hn))
623 if (sethostname(hn, strlen(hn)) < 0)
624 r = -errno;
625
626 free(hn);
627 }
628
629 return r;
630 }
631
632 static int setup_journal(const char *directory) {
633 sd_id128_t machine_id;
634 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
635 int r;
636
637 if (arg_link_journal == LINK_NO)
638 return 0;
639
640 p = strappend(directory, "/etc/machine-id");
641 if (!p) {
642 log_error("Out of memory");
643 r = -ENOMEM;
644 goto finish;
645 }
646
647 r = read_one_line_file(p, &b);
648 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
649 r = 0;
650 goto finish;
651 } else if (r < 0) {
652 log_error("Failed to read machine ID: %s", strerror(-r));
653 return r;
654 }
655
656 l = strstrip(b);
657 if (isempty(l) && arg_link_journal == LINK_AUTO) {
658 r = 0;
659 goto finish;
660 }
661
662 /* Verify validaty */
663 r = sd_id128_from_string(l, &machine_id);
664 if (r < 0) {
665 log_error("Failed to parse machine ID: %s", strerror(-r));
666 goto finish;
667 }
668
669 free(p);
670 p = strappend("/var/log/journal/", l);
671 q = strjoin(directory, "/var/log/journal/", l, NULL);
672 if (!p || !q) {
673 log_error("Out of memory");
674 r = -ENOMEM;
675 goto finish;
676 }
677
678 if (path_is_mount_point(p, false) > 0 ||
679 path_is_mount_point(q, false) > 0) {
680 if (arg_link_journal != LINK_AUTO) {
681 log_error("Journal already a mount point, refusing.");
682 r = -EEXIST;
683 goto finish;
684 }
685
686 r = 0;
687 goto finish;
688 }
689
690 r = readlink_and_make_absolute(p, &d);
691 if (r >= 0) {
692 if ((arg_link_journal == LINK_GUEST ||
693 arg_link_journal == LINK_AUTO) &&
694 path_equal(d, q)) {
695
696 mkdir_p(q, 0755);
697
698 r = 0;
699 goto finish;
700 }
701
702 if (unlink(p) < 0) {
703 log_error("Failed to remove symlink %s: %m", p);
704 r = -errno;
705 goto finish;
706 }
707 } else if (r == -EINVAL) {
708
709 if (arg_link_journal == LINK_GUEST &&
710 rmdir(p) < 0) {
711
712 if (errno == ENOTDIR)
713 log_error("%s already exists and is neither symlink nor directory.", p);
714 else {
715 log_error("Failed to remove %s: %m", p);
716 r = -errno;
717 }
718
719 goto finish;
720 }
721 } else if (r != -ENOENT) {
722 log_error("readlink(%s) failed: %m", p);
723 goto finish;
724 }
725
726 if (arg_link_journal == LINK_GUEST) {
727
728 if (symlink(q, p) < 0) {
729 log_error("Failed to symlink %s to %s: %m", q, p);
730 r = -errno;
731 goto finish;
732 }
733
734 mkdir_p(q, 0755);
735
736 r = 0;
737 goto finish;
738 }
739
740 if (arg_link_journal == LINK_HOST) {
741 r = mkdir_p(p, 0755);
742 if (r < 0) {
743 log_error("Failed to create %s: %m", p);
744 goto finish;
745 }
746
747 } else if (access(p, F_OK) < 0) {
748 r = 0;
749 goto finish;
750 }
751
752 if (dir_is_empty(q) == 0) {
753 log_error("%s not empty.", q);
754 r = -ENOTEMPTY;
755 goto finish;
756 }
757
758 r = mkdir_p(q, 0755);
759 if (r < 0) {
760 log_error("Failed to create %s: %m", q);
761 goto finish;
762 }
763
764 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
765 log_error("Failed to bind mount journal from host into guest: %m");
766 r = -errno;
767 goto finish;
768 }
769
770 r = 0;
771
772 finish:
773 free(p);
774 free(q);
775 free(d);
776 free(b);
777 return r;
778
779 }
780
781 static int drop_capabilities(void) {
782 return capability_bounding_set_drop(~arg_retain, false);
783 }
784
/* Checks whether "path" looks like the root directory of an OS tree.
 * Returns 1 if it does, 0 if it does not, and -ENOMEM if the path to
 * probe could not be allocated. */
static int is_os_tree(const char *path) {
        /* We use /bin/sh as flag file if something is an OS */
        static const char flag_file[] = "/bin/sh";
        size_t prefix_len;
        char *probe;
        int found;

        prefix_len = strlen(path);

        /* sizeof() includes the room for the trailing NUL byte */
        probe = malloc(prefix_len + sizeof(flag_file));
        if (!probe)
                return -ENOMEM;

        memcpy(probe, path, prefix_len);
        memcpy(probe + prefix_len, flag_file, sizeof(flag_file));

        found = access(probe, F_OK) >= 0;
        free(probe);

        return found ? 1 : 0;
}
798
799 static int process_pty(int master, sigset_t *mask) {
800
801 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
802 size_t in_buffer_full = 0, out_buffer_full = 0;
803 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
804 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
805 int ep = -1, signal_fd = -1, r;
806
807 fd_nonblock(STDIN_FILENO, 1);
808 fd_nonblock(STDOUT_FILENO, 1);
809 fd_nonblock(master, 1);
810
811 if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
812 log_error("signalfd(): %m");
813 r = -errno;
814 goto finish;
815 }
816
817 if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
818 log_error("Failed to create epoll: %m");
819 r = -errno;
820 goto finish;
821 }
822
823 zero(stdin_ev);
824 stdin_ev.events = EPOLLIN|EPOLLET;
825 stdin_ev.data.fd = STDIN_FILENO;
826
827 zero(stdout_ev);
828 stdout_ev.events = EPOLLOUT|EPOLLET;
829 stdout_ev.data.fd = STDOUT_FILENO;
830
831 zero(master_ev);
832 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
833 master_ev.data.fd = master;
834
835 zero(signal_ev);
836 signal_ev.events = EPOLLIN;
837 signal_ev.data.fd = signal_fd;
838
839 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
840 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
841 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
842 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
843 log_error("Failed to regiser fds in epoll: %m");
844 r = -errno;
845 goto finish;
846 }
847
848 for (;;) {
849 struct epoll_event ev[16];
850 ssize_t k;
851 int i, nfds;
852
853 if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {
854
855 if (errno == EINTR || errno == EAGAIN)
856 continue;
857
858 log_error("epoll_wait(): %m");
859 r = -errno;
860 goto finish;
861 }
862
863 assert(nfds >= 1);
864
865 for (i = 0; i < nfds; i++) {
866 if (ev[i].data.fd == STDIN_FILENO) {
867
868 if (ev[i].events & (EPOLLIN|EPOLLHUP))
869 stdin_readable = true;
870
871 } else if (ev[i].data.fd == STDOUT_FILENO) {
872
873 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
874 stdout_writable = true;
875
876 } else if (ev[i].data.fd == master) {
877
878 if (ev[i].events & (EPOLLIN|EPOLLHUP))
879 master_readable = true;
880
881 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
882 master_writable = true;
883
884 } else if (ev[i].data.fd == signal_fd) {
885 struct signalfd_siginfo sfsi;
886 ssize_t n;
887
888 if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {
889
890 if (n >= 0) {
891 log_error("Failed to read from signalfd: invalid block size");
892 r = -EIO;
893 goto finish;
894 }
895
896 if (errno != EINTR && errno != EAGAIN) {
897 log_error("Failed to read from signalfd: %m");
898 r = -errno;
899 goto finish;
900 }
901 } else {
902
903 if (sfsi.ssi_signo == SIGWINCH) {
904 struct winsize ws;
905
906 /* The window size changed, let's forward that. */
907 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
908 ioctl(master, TIOCSWINSZ, &ws);
909 } else {
910 r = 0;
911 goto finish;
912 }
913 }
914 }
915 }
916
917 while ((stdin_readable && in_buffer_full <= 0) ||
918 (master_writable && in_buffer_full > 0) ||
919 (master_readable && out_buffer_full <= 0) ||
920 (stdout_writable && out_buffer_full > 0)) {
921
922 if (stdin_readable && in_buffer_full < LINE_MAX) {
923
924 if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {
925
926 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
927 stdin_readable = false;
928 else {
929 log_error("read(): %m");
930 r = -errno;
931 goto finish;
932 }
933 } else
934 in_buffer_full += (size_t) k;
935 }
936
937 if (master_writable && in_buffer_full > 0) {
938
939 if ((k = write(master, in_buffer, in_buffer_full)) < 0) {
940
941 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
942 master_writable = false;
943 else {
944 log_error("write(): %m");
945 r = -errno;
946 goto finish;
947 }
948
949 } else {
950 assert(in_buffer_full >= (size_t) k);
951 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
952 in_buffer_full -= k;
953 }
954 }
955
956 if (master_readable && out_buffer_full < LINE_MAX) {
957
958 if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {
959
960 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
961 master_readable = false;
962 else {
963 log_error("read(): %m");
964 r = -errno;
965 goto finish;
966 }
967 } else
968 out_buffer_full += (size_t) k;
969 }
970
971 if (stdout_writable && out_buffer_full > 0) {
972
973 if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {
974
975 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
976 stdout_writable = false;
977 else {
978 log_error("write(): %m");
979 r = -errno;
980 goto finish;
981 }
982
983 } else {
984 assert(out_buffer_full >= (size_t) k);
985 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
986 out_buffer_full -= k;
987 }
988 }
989 }
990 }
991
992 finish:
993 if (ep >= 0)
994 close_nointr_nofail(ep);
995
996 if (signal_fd >= 0)
997 close_nointr_nofail(signal_fd);
998
999 return r;
1000 }
1001
1002 int main(int argc, char *argv[]) {
1003 pid_t pid = 0;
1004 int r = EXIT_FAILURE, k;
1005 char *oldcg = NULL, *newcg = NULL;
1006 char **controller = NULL;
1007 int master = -1;
1008 const char *console = NULL;
1009 struct termios saved_attr, raw_attr;
1010 sigset_t mask;
1011 bool saved_attr_valid = false;
1012 struct winsize ws;
1013 int kmsg_socket_pair[2] = { -1, -1 };
1014
1015 log_parse_environment();
1016 log_open();
1017
1018 if ((r = parse_argv(argc, argv)) <= 0)
1019 goto finish;
1020
1021 if (arg_directory) {
1022 char *p;
1023
1024 p = path_make_absolute_cwd(arg_directory);
1025 free(arg_directory);
1026 arg_directory = p;
1027 } else
1028 arg_directory = get_current_dir_name();
1029
1030 if (!arg_directory) {
1031 log_error("Failed to determine path");
1032 goto finish;
1033 }
1034
1035 path_kill_slashes(arg_directory);
1036
1037 if (geteuid() != 0) {
1038 log_error("Need to be root.");
1039 goto finish;
1040 }
1041
1042 if (sd_booted() <= 0) {
1043 log_error("Not running on a systemd system.");
1044 goto finish;
1045 }
1046
1047 if (path_equal(arg_directory, "/")) {
1048 log_error("Spawning container on root directory not supported.");
1049 goto finish;
1050 }
1051
1052 if (is_os_tree(arg_directory) <= 0) {
1053 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1054 goto finish;
1055 }
1056
1057 if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
1058 log_error("Failed to determine current cgroup: %s", strerror(-k));
1059 goto finish;
1060 }
1061
1062 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1063 log_error("Failed to allocate cgroup path.");
1064 goto finish;
1065 }
1066
1067 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1068 if (k < 0) {
1069 log_error("Failed to create cgroup: %s", strerror(-k));
1070 goto finish;
1071 }
1072
1073 STRV_FOREACH(controller,arg_controllers) {
1074 k = cg_create_and_attach(*controller, newcg, 0);
1075 if (k < 0)
1076 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1077 }
1078
1079 if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
1080 log_error("Failed to acquire pseudo tty: %m");
1081 goto finish;
1082 }
1083
1084 if (!(console = ptsname(master))) {
1085 log_error("Failed to determine tty name: %m");
1086 goto finish;
1087 }
1088
1089 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1090
1091 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1092 ioctl(master, TIOCSWINSZ, &ws);
1093
1094 if (unlockpt(master) < 0) {
1095 log_error("Failed to unlock tty: %m");
1096 goto finish;
1097 }
1098
1099 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1100 log_error("Failed to get terminal attributes: %m");
1101 goto finish;
1102 }
1103
1104 saved_attr_valid = true;
1105
1106 raw_attr = saved_attr;
1107 cfmakeraw(&raw_attr);
1108 raw_attr.c_lflag &= ~ECHO;
1109
1110 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1111 log_error("Failed to set terminal attributes: %m");
1112 goto finish;
1113 }
1114
1115 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1116 log_error("Failed to create kmsg socket pair");
1117 goto finish;
1118 }
1119
1120 assert_se(sigemptyset(&mask) == 0);
1121 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1122 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1123
1124 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1125 if (pid < 0) {
1126 if (errno == EINVAL)
1127 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1128 else
1129 log_error("clone() failed: %m");
1130
1131 goto finish;
1132 }
1133
1134 if (pid == 0) {
1135 /* child */
1136
1137 const char *home = NULL;
1138 uid_t uid = (uid_t) -1;
1139 gid_t gid = (gid_t) -1;
1140 const char *envp[] = {
1141 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1142 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1143 NULL, /* TERM */
1144 NULL, /* HOME */
1145 NULL, /* USER */
1146 NULL, /* LOGNAME */
1147 NULL, /* container_uuid */
1148 NULL
1149 };
1150
1151 envp[2] = strv_find_prefix(environ, "TERM=");
1152
1153 close_nointr_nofail(master);
1154
1155 close_nointr(STDIN_FILENO);
1156 close_nointr(STDOUT_FILENO);
1157 close_nointr(STDERR_FILENO);
1158
1159 close_all_fds(&kmsg_socket_pair[1], 1);
1160
1161 reset_all_signal_handlers();
1162
1163 assert_se(sigemptyset(&mask) == 0);
1164 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1165
1166 if (setsid() < 0)
1167 goto child_fail;
1168
1169 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
1170 goto child_fail;
1171
1172 /* Mark / as private, in case somebody marked it shared */
1173 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
1174 goto child_fail;
1175
1176 /* Turn directory into bind mount */
1177 if (mount(arg_directory, arg_directory, "bind", MS_BIND, NULL) < 0) {
1178 log_error("Failed to make bind mount.");
1179 goto child_fail;
1180 }
1181
1182 if (arg_read_only)
1183 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
1184 log_error("Failed to make read-only.");
1185 goto child_fail;
1186 }
1187
1188 if (mount_all(arg_directory) < 0)
1189 goto child_fail;
1190
1191 if (copy_devnodes(arg_directory) < 0)
1192 goto child_fail;
1193
1194 if (setup_dev_console(arg_directory, console) < 0)
1195 goto child_fail;
1196
1197 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1198 goto child_fail;
1199
1200 close_nointr_nofail(kmsg_socket_pair[1]);
1201
1202 if (setup_timezone(arg_directory) < 0)
1203 goto child_fail;
1204
1205 if (setup_resolv_conf(arg_directory) < 0)
1206 goto child_fail;
1207
1208 if (setup_journal(arg_directory) < 0)
1209 goto child_fail;
1210
1211 if (chdir(arg_directory) < 0) {
1212 log_error("chdir(%s) failed: %m", arg_directory);
1213 goto child_fail;
1214 }
1215
1216 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
1217 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1218 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1219 goto child_fail;
1220
1221 if (mount(arg_directory, "/", "bind", MS_MOVE, NULL) < 0) {
1222 log_error("mount(MS_BIND) failed: %m");
1223 goto child_fail;
1224 }
1225
1226 if (chroot(".") < 0) {
1227 log_error("chroot() failed: %m");
1228 goto child_fail;
1229 }
1230
1231 if (chdir("/") < 0) {
1232 log_error("chdir() failed: %m");
1233 goto child_fail;
1234 }
1235
1236 umask(0022);
1237
1238 loopback_setup();
1239
1240 if (drop_capabilities() < 0) {
1241 log_error("drop_capabilities() failed: %m");
1242 goto child_fail;
1243 }
1244
1245 if (arg_user) {
1246
1247 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1248 log_error("get_user_creds() failed: %m");
1249 goto child_fail;
1250 }
1251
1252 if (mkdir_parents_label(home, 0775) < 0) {
1253 log_error("mkdir_parents_label() failed: %m");
1254 goto child_fail;
1255 }
1256
1257 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1258 log_error("mkdir_safe_label() failed: %m");
1259 goto child_fail;
1260 }
1261
1262 if (initgroups((const char*)arg_user, gid) < 0) {
1263 log_error("initgroups() failed: %m");
1264 goto child_fail;
1265 }
1266
1267 if (setresgid(gid, gid, gid) < 0) {
1268 log_error("setregid() failed: %m");
1269 goto child_fail;
1270 }
1271
1272 if (setresuid(uid, uid, uid) < 0) {
1273 log_error("setreuid() failed: %m");
1274 goto child_fail;
1275 }
1276 }
1277
1278 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1279 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1280 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1281 log_error("Out of memory");
1282 goto child_fail;
1283 }
1284
1285 if (arg_uuid) {
1286 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1287 log_error("Out of memory");
1288 goto child_fail;
1289 }
1290 }
1291
1292 setup_hostname();
1293
1294 if (arg_boot) {
1295 char **a;
1296 size_t l;
1297
1298 /* Automatically search for the init system */
1299
1300 l = 1 + argc - optind;
1301 a = newa(char*, l + 1);
1302 memcpy(a + 1, argv + optind, l * sizeof(char*));
1303
1304 a[0] = (char*) "/usr/lib/systemd/systemd";
1305 execve(a[0], a, (char**) envp);
1306
1307 a[0] = (char*) "/lib/systemd/systemd";
1308 execve(a[0], a, (char**) envp);
1309
1310 a[0] = (char*) "/sbin/init";
1311 execve(a[0], a, (char**) envp);
1312 } else if (argc > optind)
1313 execvpe(argv[optind], argv + optind, (char**) envp);
1314 else {
1315 chdir(home ? home : "/root");
1316 execle("/bin/bash", "-bash", NULL, (char**) envp);
1317 }
1318
1319 log_error("execv() failed: %m");
1320
1321 child_fail:
1322 _exit(EXIT_FAILURE);
1323 }
1324
1325 if (process_pty(master, &mask) < 0)
1326 goto finish;
1327
1328 if (saved_attr_valid) {
1329 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1330 saved_attr_valid = false;
1331 }
1332
1333 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1334
1335 if (r < 0)
1336 r = EXIT_FAILURE;
1337
1338 finish:
1339 if (saved_attr_valid)
1340 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1341
1342 if (master >= 0)
1343 close_nointr_nofail(master);
1344
1345 close_pipe(kmsg_socket_pair);
1346
1347 if (oldcg)
1348 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1349
1350 if (newcg)
1351 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1352
1353 free(arg_directory);
1354 strv_free(arg_controllers);
1355 free(oldcg);
1356 free(newcg);
1357
1358 return r;
1359 }