]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: print PID and show how to enter the namespace
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44
45 #include <systemd/sd-daemon.h>
46
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "sd-id128.h"
58 #include "dev-setup.h"
59 #include "fdset.h"
60 #include "build.h"
61 #include "fileio.h"
62
/* How the journal directory of the container is connected with the
 * one of the host; selected with --link-journal= or -j. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, the default */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest */
} LinkJournal;

/* Command line settings, filled in by parse_argv(). */
static char *arg_directory = NULL;              /* -D: root directory of the container */
static char *arg_user = NULL;                   /* -u: user name or uid to run as */
static char **arg_controllers = NULL;           /* -C: additional cgroup controllers */
static char *arg_uuid = NULL;                   /* --uuid=: points into argv[], not owned */
static bool arg_private_network = false;        /* --private-network */
static bool arg_read_only = false;              /* --read-only */
static bool arg_boot = false;                   /* -b */
static LinkJournal arg_link_journal = LINK_AUTO;

/* Bit mask (one bit per capability number) of the capabilities the
 * container keeps; --capability= ORs further bits into it. Its
 * complement is what drop_capabilities() removes from the bounding
 * set. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL);
104
/* Prints the usage summary to stdout, prefixed with the name this
 * program was invoked as. Always returns 0. */
static int help(void) {

        /* Note that -j selects LINK_GUEST in parse_argv(), hence it is
         * documented here as the short form of --link-journal=guest. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " --version Print version string\n"
               " -D --directory=NAME Root directory for the container\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
               " --uuid=UUID Set a specific machine UUID for the container\n"
               " --private-network Disable network in container\n"
               " --read-only Mount the root directory read-only\n"
               " --capability=CAP In addition to the default, retain specified capability\n"
               " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
               " -j Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
125
126 static int parse_argv(int argc, char *argv[]) {
127
128 enum {
129 ARG_VERSION = 0x100,
130 ARG_PRIVATE_NETWORK,
131 ARG_UUID,
132 ARG_READ_ONLY,
133 ARG_CAPABILITY,
134 ARG_LINK_JOURNAL
135 };
136
137 static const struct option options[] = {
138 { "help", no_argument, NULL, 'h' },
139 { "version", no_argument, NULL, ARG_VERSION },
140 { "directory", required_argument, NULL, 'D' },
141 { "user", required_argument, NULL, 'u' },
142 { "controllers", required_argument, NULL, 'C' },
143 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
144 { "boot", no_argument, NULL, 'b' },
145 { "uuid", required_argument, NULL, ARG_UUID },
146 { "read-only", no_argument, NULL, ARG_READ_ONLY },
147 { "capability", required_argument, NULL, ARG_CAPABILITY },
148 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
149 { NULL, 0, NULL, 0 }
150 };
151
152 int c;
153
154 assert(argc >= 0);
155 assert(argv);
156
157 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
158
159 switch (c) {
160
161 case 'h':
162 help();
163 return 0;
164
165 case ARG_VERSION:
166 puts(PACKAGE_STRING);
167 puts(SYSTEMD_FEATURES);
168 return 0;
169
170 case 'D':
171 free(arg_directory);
172 arg_directory = canonicalize_file_name(optarg);
173 if (!arg_directory) {
174 log_error("Failed to canonicalize root directory.");
175 return -ENOMEM;
176 }
177
178 break;
179
180 case 'u':
181 free(arg_user);
182 if (!(arg_user = strdup(optarg))) {
183 log_error("Failed to duplicate user name.");
184 return -ENOMEM;
185 }
186
187 break;
188
189 case 'C':
190 strv_free(arg_controllers);
191 arg_controllers = strv_split(optarg, ",");
192 if (!arg_controllers) {
193 log_error("Failed to split controllers list.");
194 return -ENOMEM;
195 }
196 strv_uniq(arg_controllers);
197
198 break;
199
200 case ARG_PRIVATE_NETWORK:
201 arg_private_network = true;
202 break;
203
204 case 'b':
205 arg_boot = true;
206 break;
207
208 case ARG_UUID:
209 arg_uuid = optarg;
210 break;
211
212 case ARG_READ_ONLY:
213 arg_read_only = true;
214 break;
215
216 case ARG_CAPABILITY: {
217 char *state, *word;
218 size_t length;
219
220 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
221 cap_value_t cap;
222 char *t;
223
224 t = strndup(word, length);
225 if (!t)
226 return log_oom();
227
228 if (cap_from_name(t, &cap) < 0) {
229 log_error("Failed to parse capability %s.", t);
230 free(t);
231 return -EINVAL;
232 }
233
234 free(t);
235 arg_retain |= 1ULL << (uint64_t) cap;
236 }
237
238 break;
239 }
240
241 case 'j':
242 arg_link_journal = LINK_GUEST;
243 break;
244
245 case ARG_LINK_JOURNAL:
246 if (streq(optarg, "auto"))
247 arg_link_journal = LINK_AUTO;
248 else if (streq(optarg, "no"))
249 arg_link_journal = LINK_NO;
250 else if (streq(optarg, "guest"))
251 arg_link_journal = LINK_GUEST;
252 else if (streq(optarg, "host"))
253 arg_link_journal = LINK_HOST;
254 else {
255 log_error("Failed to parse link journal mode %s", optarg);
256 return -EINVAL;
257 }
258
259 break;
260
261 case '?':
262 return -EINVAL;
263
264 default:
265 log_error("Unknown option code %c", c);
266 return -EINVAL;
267 }
268 }
269
270 return 1;
271 }
272
273 static int mount_all(const char *dest) {
274
275 typedef struct MountPoint {
276 const char *what;
277 const char *where;
278 const char *type;
279 const char *options;
280 unsigned long flags;
281 bool fatal;
282 } MountPoint;
283
284 static const MountPoint mount_table[] = {
285 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
286 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
287 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
288 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
289 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
290 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
291 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
292 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
293 #ifdef HAVE_SELINUX
294 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
295 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
296 #endif
297 };
298
299 unsigned k;
300 int r = 0;
301
302 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
303 char _cleanup_free_ *where = NULL;
304 int t;
305
306 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
307 log_oom();
308
309 if (r == 0)
310 r = -ENOMEM;
311
312 break;
313 }
314
315 t = path_is_mount_point(where, true);
316 if (t < 0) {
317 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
318
319 if (r == 0)
320 r = t;
321
322 continue;
323 }
324
325 /* Skip this entry if it is not a remount. */
326 if (mount_table[k].what && t > 0)
327 continue;
328
329 mkdir_p_label(where, 0755);
330
331 if (mount(mount_table[k].what,
332 where,
333 mount_table[k].type,
334 mount_table[k].flags,
335 mount_table[k].options) < 0 &&
336 mount_table[k].fatal) {
337
338 log_error("mount(%s) failed: %m", where);
339
340 if (r == 0)
341 r = -errno;
342 }
343 }
344
345 return r;
346 }
347
348 static int setup_timezone(const char *dest) {
349 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
350 char *z, *y;
351 int r;
352
353 assert(dest);
354
355 /* Fix the timezone, if possible */
356 r = readlink_malloc("/etc/localtime", &p);
357 if (r < 0) {
358 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
359 return 0;
360 }
361
362 z = path_startswith(p, "../usr/share/zoneinfo/");
363 if (!z)
364 z = path_startswith(p, "/usr/share/zoneinfo/");
365 if (!z) {
366 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
367 return 0;
368 }
369
370 where = strappend(dest, "/etc/localtime");
371 if (!where)
372 return log_oom();
373
374 r = readlink_malloc(where, &q);
375 if (r >= 0) {
376 y = path_startswith(q, "../usr/share/zoneinfo/");
377 if (!y)
378 y = path_startswith(q, "/usr/share/zoneinfo/");
379
380
381 /* Already pointing to the right place? Then do nothing .. */
382 if (y && streq(y, z))
383 return 0;
384 }
385
386 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
387 if (!check)
388 return log_oom();
389
390 if (access(check, F_OK) < 0) {
391 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
392 return 0;
393 }
394
395 what = strappend("../usr/share/zoneinfo/", z);
396 if (!what)
397 return log_oom();
398
399 unlink(where);
400 if (symlink(what, where) < 0) {
401 log_error("Failed to correct timezone of container: %m");
402 return 0;
403 }
404
405 return 0;
406 }
407
408 static int setup_resolv_conf(const char *dest) {
409 char *where;
410
411 assert(dest);
412
413 if (arg_private_network)
414 return 0;
415
416 /* Fix resolv.conf, if possible */
417 where = strappend(dest, "/etc/resolv.conf");
418 if (!where)
419 return log_oom();
420
421 /* We don't really care for the results of this really. If it
422 * fails, it fails, but meh... */
423 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
424 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
425
426 free(where);
427
428 return 0;
429 }
430
431 static int setup_boot_id(const char *dest) {
432 char _cleanup_free_ *from = NULL, *to = NULL;
433 sd_id128_t rnd;
434 char as_uuid[37];
435 int r;
436
437 assert(dest);
438
439 /* Generate a new randomized boot ID, so that each boot-up of
440 * the container gets a new one */
441
442 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
443 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
444 if (!from || !to)
445 return log_oom();
446
447 r = sd_id128_randomize(&rnd);
448 if (r < 0) {
449 log_error("Failed to generate random boot id: %s", strerror(-r));
450 return r;
451 }
452
453 snprintf(as_uuid, sizeof(as_uuid),
454 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
455 SD_ID128_FORMAT_VAL(rnd));
456 char_array_0(as_uuid);
457
458 r = write_one_line_file(from, as_uuid);
459 if (r < 0) {
460 log_error("Failed to write boot id: %s", strerror(-r));
461 return r;
462 }
463
464 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
465 log_error("Failed to bind mount boot id: %m");
466 r = -errno;
467 } else
468 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
469
470 unlink(from);
471 return r;
472 }
473
474 static int copy_devnodes(const char *dest) {
475
476 static const char devnodes[] =
477 "null\0"
478 "zero\0"
479 "full\0"
480 "random\0"
481 "urandom\0"
482 "tty\0"
483 "ptmx\0";
484
485 const char *d;
486 int r = 0;
487 mode_t _cleanup_umask_ u;
488
489 assert(dest);
490
491 u = umask(0000);
492
493 NULSTR_FOREACH(d, devnodes) {
494 struct stat st;
495 char _cleanup_free_ *from = NULL, *to = NULL;
496
497 asprintf(&from, "/dev/%s", d);
498 asprintf(&to, "%s/dev/%s", dest, d);
499
500 if (!from || !to) {
501 log_oom();
502
503 if (r == 0)
504 r = -ENOMEM;
505
506 break;
507 }
508
509 if (stat(from, &st) < 0) {
510
511 if (errno != ENOENT) {
512 log_error("Failed to stat %s: %m", from);
513 if (r == 0)
514 r = -errno;
515 }
516
517 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
518
519 log_error("%s is not a char or block device, cannot copy", from);
520 if (r == 0)
521 r = -EIO;
522
523 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
524
525 log_error("mknod(%s) failed: %m", dest);
526 if (r == 0)
527 r = -errno;
528 }
529 }
530
531 return r;
532 }
533
534 static int setup_dev_console(const char *dest, const char *console) {
535 struct stat st;
536 char _cleanup_free_ *to = NULL;
537 int r;
538 mode_t _cleanup_umask_ u;
539
540 assert(dest);
541 assert(console);
542
543 u = umask(0000);
544
545 if (stat(console, &st) < 0) {
546 log_error("Failed to stat %s: %m", console);
547 return -errno;
548
549 } else if (!S_ISCHR(st.st_mode)) {
550 log_error("/dev/console is not a char device");
551 return -EIO;
552 }
553
554 r = chmod_and_chown(console, 0600, 0, 0);
555 if (r < 0) {
556 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
557 return r;
558 }
559
560 if (asprintf(&to, "%s/dev/console", dest) < 0)
561 return log_oom();
562
563 /* We need to bind mount the right tty to /dev/console since
564 * ptys can only exist on pts file systems. To have something
565 * to bind mount things on we create a device node first, that
566 * has the right major/minor (note that the major minor
567 * doesn't actually matter here, since we mount it over
568 * anyway). */
569
570 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
571 log_error("mknod() for /dev/console failed: %m");
572 return -errno;
573 }
574
575 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
576 log_error("Bind mount for /dev/console failed: %m");
577 return -errno;
578 }
579
580 return 0;
581 }
582
583 static int setup_kmsg(const char *dest, int kmsg_socket) {
584 char _cleanup_free_ *from = NULL, *to = NULL;
585 int r, fd, k;
586 mode_t _cleanup_umask_ u;
587 union {
588 struct cmsghdr cmsghdr;
589 uint8_t buf[CMSG_SPACE(sizeof(int))];
590 } control;
591 struct msghdr mh;
592 struct cmsghdr *cmsg;
593
594 assert(dest);
595 assert(kmsg_socket >= 0);
596
597 u = umask(0000);
598
599 /* We create the kmsg FIFO as /dev/kmsg, but immediately
600 * delete it after bind mounting it to /proc/kmsg. While FIFOs
601 * on the reading side behave very similar to /proc/kmsg,
602 * their writing side behaves differently from /dev/kmsg in
603 * that writing blocks when nothing is reading. In order to
604 * avoid any problems with containers deadlocking due to this
605 * we simply make /dev/kmsg unavailable to the container. */
606 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
607 asprintf(&to, "%s/proc/kmsg", dest) < 0)
608 return log_oom();
609
610 if (mkfifo(from, 0600) < 0) {
611 log_error("mkfifo() for /dev/kmsg failed: %m");
612 return -errno;
613 }
614
615 r = chmod_and_chown(from, 0600, 0, 0);
616 if (r < 0) {
617 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
618 return r;
619 }
620
621 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
622 log_error("Bind mount for /proc/kmsg failed: %m");
623 return -errno;
624 }
625
626 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
627 if (fd < 0) {
628 log_error("Failed to open fifo: %m");
629 return -errno;
630 }
631
632 zero(mh);
633 zero(control);
634
635 mh.msg_control = &control;
636 mh.msg_controllen = sizeof(control);
637
638 cmsg = CMSG_FIRSTHDR(&mh);
639 cmsg->cmsg_level = SOL_SOCKET;
640 cmsg->cmsg_type = SCM_RIGHTS;
641 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
642 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
643
644 mh.msg_controllen = cmsg->cmsg_len;
645
646 /* Store away the fd in the socket, so that it stays open as
647 * long as we run the child */
648 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
649 close_nointr_nofail(fd);
650
651 if (k < 0) {
652 log_error("Failed to send FIFO fd: %m");
653 return -errno;
654 }
655
656 /* And now make the FIFO unavailable as /dev/kmsg... */
657 unlink(from);
658 return 0;
659 }
660
661 static int setup_hostname(void) {
662 char *hn;
663 int r = 0;
664
665 hn = path_get_file_name(arg_directory);
666 if (hn) {
667 hn = strdup(hn);
668 if (!hn)
669 return -ENOMEM;
670
671 hostname_cleanup(hn);
672
673 if (!isempty(hn))
674 if (sethostname(hn, strlen(hn)) < 0)
675 r = -errno;
676
677 free(hn);
678 }
679
680 return r;
681 }
682
683 static int setup_journal(const char *directory) {
684 sd_id128_t machine_id;
685 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
686 char *id;
687 int r;
688
689 if (arg_link_journal == LINK_NO)
690 return 0;
691
692 p = strappend(directory, "/etc/machine-id");
693 if (!p)
694 return log_oom();
695
696 r = read_one_line_file(p, &b);
697 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
698 return 0;
699 else if (r < 0) {
700 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
701 return r;
702 }
703
704 id = strstrip(b);
705 if (isempty(id) && arg_link_journal == LINK_AUTO)
706 return 0;
707
708 /* Verify validity */
709 r = sd_id128_from_string(id, &machine_id);
710 if (r < 0) {
711 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
712 return r;
713 }
714
715 free(p);
716 p = strappend("/var/log/journal/", id);
717 q = strjoin(directory, "/var/log/journal/", id, NULL);
718 if (!p || !q)
719 return log_oom();
720
721 if (path_is_mount_point(p, false) > 0) {
722 if (arg_link_journal != LINK_AUTO) {
723 log_error("%s: already a mount point, refusing to use for journal", p);
724 return -EEXIST;
725 }
726
727 return 0;
728 }
729
730 if (path_is_mount_point(q, false) > 0) {
731 if (arg_link_journal != LINK_AUTO) {
732 log_error("%s: already a mount point, refusing to use for journal", q);
733 return -EEXIST;
734 }
735
736 return 0;
737 }
738
739 r = readlink_and_make_absolute(p, &d);
740 if (r >= 0) {
741 if ((arg_link_journal == LINK_GUEST ||
742 arg_link_journal == LINK_AUTO) &&
743 path_equal(d, q)) {
744
745 r = mkdir_p(q, 0755);
746 if (r < 0)
747 log_warning("failed to create directory %s: %m", q);
748 return 0;
749 }
750
751 if (unlink(p) < 0) {
752 log_error("Failed to remove symlink %s: %m", p);
753 return -errno;
754 }
755 } else if (r == -EINVAL) {
756
757 if (arg_link_journal == LINK_GUEST &&
758 rmdir(p) < 0) {
759
760 if (errno == ENOTDIR) {
761 log_error("%s already exists and is neither a symlink nor a directory", p);
762 return r;
763 } else {
764 log_error("Failed to remove %s: %m", p);
765 return -errno;
766 }
767 }
768 } else if (r != -ENOENT) {
769 log_error("readlink(%s) failed: %m", p);
770 return r;
771 }
772
773 if (arg_link_journal == LINK_GUEST) {
774
775 if (symlink(q, p) < 0) {
776 log_error("Failed to symlink %s to %s: %m", q, p);
777 return -errno;
778 }
779
780 r = mkdir_p(q, 0755);
781 if (r < 0)
782 log_warning("failed to create directory %s: %m", q);
783 return 0;
784 }
785
786 if (arg_link_journal == LINK_HOST) {
787 r = mkdir_p(p, 0755);
788 if (r < 0) {
789 log_error("Failed to create %s: %m", p);
790 return r;
791 }
792
793 } else if (access(p, F_OK) < 0)
794 return 0;
795
796 if (dir_is_empty(q) == 0) {
797 log_error("%s not empty.", q);
798 return -ENOTEMPTY;
799 }
800
801 r = mkdir_p(q, 0755);
802 if (r < 0) {
803 log_error("Failed to create %s: %m", q);
804 return r;
805 }
806
807 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
808 log_error("Failed to bind mount journal from host into guest: %m");
809 return -errno;
810 }
811
812 return 0;
813 }
814
815 static int drop_capabilities(void) {
816 return capability_bounding_set_drop(~arg_retain, false);
817 }
818
/* Checks whether "path" looks like the root directory of an OS tree,
 * with the existence of bin/sh below it serving as flag. Returns 1 if
 * so, 0 if not, and -ENOMEM if we ran out of memory. */
static int is_os_tree(const char *path) {
        char *flag_file = NULL;
        int found;

        if (asprintf(&flag_file, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(flag_file, F_OK) >= 0;
        free(flag_file);

        return found;
}
832
833 static int process_pty(int master, pid_t pid, sigset_t *mask) {
834
835 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
836 size_t in_buffer_full = 0, out_buffer_full = 0;
837 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
838 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
839 int ep = -1, signal_fd = -1, r;
840 bool tried_orderly_shutdown = false;
841
842 assert(master >= 0);
843 assert(pid > 0);
844 assert(mask);
845
846 fd_nonblock(STDIN_FILENO, 1);
847 fd_nonblock(STDOUT_FILENO, 1);
848 fd_nonblock(master, 1);
849
850 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
851 if (signal_fd < 0) {
852 log_error("signalfd(): %m");
853 r = -errno;
854 goto finish;
855 }
856
857 ep = epoll_create1(EPOLL_CLOEXEC);
858 if (ep < 0) {
859 log_error("Failed to create epoll: %m");
860 r = -errno;
861 goto finish;
862 }
863
864 /* We read from STDIN only if this is actually a TTY,
865 * otherwise we assume non-interactivity. */
866 if (isatty(STDIN_FILENO)) {
867 zero(stdin_ev);
868 stdin_ev.events = EPOLLIN|EPOLLET;
869 stdin_ev.data.fd = STDIN_FILENO;
870
871 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
872 log_error("Failed to register STDIN in epoll: %m");
873 r = -errno;
874 goto finish;
875 }
876 }
877
878 zero(stdout_ev);
879 stdout_ev.events = EPOLLOUT|EPOLLET;
880 stdout_ev.data.fd = STDOUT_FILENO;
881
882 zero(master_ev);
883 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
884 master_ev.data.fd = master;
885
886 zero(signal_ev);
887 signal_ev.events = EPOLLIN;
888 signal_ev.data.fd = signal_fd;
889
890 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
891 if (errno != EPERM) {
892 log_error("Failed to register stdout in epoll: %m");
893 r = -errno;
894 goto finish;
895 }
896 /* stdout without epoll support. Likely redirected to regular file. */
897 stdout_writable = true;
898 }
899
900 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
901 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
902 log_error("Failed to register fds in epoll: %m");
903 r = -errno;
904 goto finish;
905 }
906
907 for (;;) {
908 struct epoll_event ev[16];
909 ssize_t k;
910 int i, nfds;
911
912 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
913 if (nfds < 0) {
914
915 if (errno == EINTR || errno == EAGAIN)
916 continue;
917
918 log_error("epoll_wait(): %m");
919 r = -errno;
920 goto finish;
921 }
922
923 assert(nfds >= 1);
924
925 for (i = 0; i < nfds; i++) {
926 if (ev[i].data.fd == STDIN_FILENO) {
927
928 if (ev[i].events & (EPOLLIN|EPOLLHUP))
929 stdin_readable = true;
930
931 } else if (ev[i].data.fd == STDOUT_FILENO) {
932
933 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
934 stdout_writable = true;
935
936 } else if (ev[i].data.fd == master) {
937
938 if (ev[i].events & (EPOLLIN|EPOLLHUP))
939 master_readable = true;
940
941 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
942 master_writable = true;
943
944 } else if (ev[i].data.fd == signal_fd) {
945 struct signalfd_siginfo sfsi;
946 ssize_t n;
947
948 n = read(signal_fd, &sfsi, sizeof(sfsi));
949 if (n != sizeof(sfsi)) {
950
951 if (n >= 0) {
952 log_error("Failed to read from signalfd: invalid block size");
953 r = -EIO;
954 goto finish;
955 }
956
957 if (errno != EINTR && errno != EAGAIN) {
958 log_error("Failed to read from signalfd: %m");
959 r = -errno;
960 goto finish;
961 }
962 } else {
963
964 if (sfsi.ssi_signo == SIGWINCH) {
965 struct winsize ws;
966
967 /* The window size changed, let's forward that. */
968 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
969 ioctl(master, TIOCSWINSZ, &ws);
970 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
971
972 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
973
974 /* This only works for systemd... */
975 tried_orderly_shutdown = true;
976 kill(pid, SIGRTMIN+3);
977
978 } else {
979 r = 0;
980 goto finish;
981 }
982 }
983 }
984 }
985
986 while ((stdin_readable && in_buffer_full <= 0) ||
987 (master_writable && in_buffer_full > 0) ||
988 (master_readable && out_buffer_full <= 0) ||
989 (stdout_writable && out_buffer_full > 0)) {
990
991 if (stdin_readable && in_buffer_full < LINE_MAX) {
992
993 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
994 if (k < 0) {
995
996 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
997 stdin_readable = false;
998 else {
999 log_error("read(): %m");
1000 r = -errno;
1001 goto finish;
1002 }
1003 } else
1004 in_buffer_full += (size_t) k;
1005 }
1006
1007 if (master_writable && in_buffer_full > 0) {
1008
1009 k = write(master, in_buffer, in_buffer_full);
1010 if (k < 0) {
1011
1012 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1013 master_writable = false;
1014 else {
1015 log_error("write(): %m");
1016 r = -errno;
1017 goto finish;
1018 }
1019
1020 } else {
1021 assert(in_buffer_full >= (size_t) k);
1022 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1023 in_buffer_full -= k;
1024 }
1025 }
1026
1027 if (master_readable && out_buffer_full < LINE_MAX) {
1028
1029 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1030 if (k < 0) {
1031
1032 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1033 master_readable = false;
1034 else {
1035 log_error("read(): %m");
1036 r = -errno;
1037 goto finish;
1038 }
1039 } else
1040 out_buffer_full += (size_t) k;
1041 }
1042
1043 if (stdout_writable && out_buffer_full > 0) {
1044
1045 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1046 if (k < 0) {
1047
1048 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1049 stdout_writable = false;
1050 else {
1051 log_error("write(): %m");
1052 r = -errno;
1053 goto finish;
1054 }
1055
1056 } else {
1057 assert(out_buffer_full >= (size_t) k);
1058 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1059 out_buffer_full -= k;
1060 }
1061 }
1062 }
1063 }
1064
1065 finish:
1066 if (ep >= 0)
1067 close_nointr_nofail(ep);
1068
1069 if (signal_fd >= 0)
1070 close_nointr_nofail(signal_fd);
1071
1072 return r;
1073 }
1074
1075 int main(int argc, char *argv[]) {
1076 pid_t pid = 0;
1077 int r = EXIT_FAILURE, k;
1078 char *oldcg = NULL, *newcg = NULL;
1079 char **controller = NULL;
1080 int master = -1, n_fd_passed;
1081 const char *console = NULL;
1082 struct termios saved_attr, raw_attr;
1083 sigset_t mask;
1084 bool saved_attr_valid = false;
1085 struct winsize ws;
1086 int kmsg_socket_pair[2] = { -1, -1 };
1087 FDSet *fds = NULL;
1088
1089 log_parse_environment();
1090 log_open();
1091
1092 r = parse_argv(argc, argv);
1093 if (r <= 0)
1094 goto finish;
1095
1096 if (arg_directory) {
1097 char *p;
1098
1099 p = path_make_absolute_cwd(arg_directory);
1100 free(arg_directory);
1101 arg_directory = p;
1102 } else
1103 arg_directory = get_current_dir_name();
1104
1105 if (!arg_directory) {
1106 log_error("Failed to determine path");
1107 goto finish;
1108 }
1109
1110 path_kill_slashes(arg_directory);
1111
1112 if (geteuid() != 0) {
1113 log_error("Need to be root.");
1114 goto finish;
1115 }
1116
1117 if (sd_booted() <= 0) {
1118 log_error("Not running on a systemd system.");
1119 goto finish;
1120 }
1121
1122 if (path_equal(arg_directory, "/")) {
1123 log_error("Spawning container on root directory not supported.");
1124 goto finish;
1125 }
1126
1127 if (is_os_tree(arg_directory) <= 0) {
1128 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1129 goto finish;
1130 }
1131
1132 log_close();
1133 n_fd_passed = sd_listen_fds(false);
1134 if (n_fd_passed > 0) {
1135 k = fdset_new_listen_fds(&fds, false);
1136 if (k < 0) {
1137 log_error("Failed to collect file descriptors: %s", strerror(-k));
1138 goto finish;
1139 }
1140 }
1141 fdset_close_others(fds);
1142 log_open();
1143
1144 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1145 if (k < 0) {
1146 log_error("Failed to determine current cgroup: %s", strerror(-k));
1147 goto finish;
1148 }
1149
1150 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1151 log_error("Failed to allocate cgroup path.");
1152 goto finish;
1153 }
1154
1155 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1156 if (k < 0) {
1157 log_error("Failed to create cgroup: %s", strerror(-k));
1158 goto finish;
1159 }
1160
1161 STRV_FOREACH(controller, arg_controllers) {
1162 k = cg_create_and_attach(*controller, newcg, 0);
1163 if (k < 0)
1164 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1165 }
1166
1167 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1168 if (master < 0) {
1169 log_error("Failed to acquire pseudo tty: %m");
1170 goto finish;
1171 }
1172
1173 console = ptsname(master);
1174 if (!console) {
1175 log_error("Failed to determine tty name: %m");
1176 goto finish;
1177 }
1178
1179 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1180
1181 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1182 ioctl(master, TIOCSWINSZ, &ws);
1183
1184 if (unlockpt(master) < 0) {
1185 log_error("Failed to unlock tty: %m");
1186 goto finish;
1187 }
1188
1189 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1190 saved_attr_valid = true;
1191
1192 raw_attr = saved_attr;
1193 cfmakeraw(&raw_attr);
1194 raw_attr.c_lflag &= ~ECHO;
1195 }
1196
1197 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1198 log_error("Failed to create kmsg socket pair");
1199 goto finish;
1200 }
1201
1202 assert_se(sigemptyset(&mask) == 0);
1203 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1204 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1205
1206 for (;;) {
1207 siginfo_t status;
1208 int pipefd[2];
1209
1210 if(pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1211 log_error("pipe2(): %m");
1212 goto finish;
1213 }
1214
1215 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1216 if (pid < 0) {
1217 if (errno == EINVAL)
1218 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1219 else
1220 log_error("clone() failed: %m");
1221
1222 goto finish;
1223 }
1224
1225 if (pid == 0) {
1226 /* child */
1227 const char *home = NULL;
1228 uid_t uid = (uid_t) -1;
1229 gid_t gid = (gid_t) -1;
1230 unsigned n_env = 0;
1231 const char *envp[] = {
1232 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1233 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1234 NULL, /* TERM */
1235 NULL, /* HOME */
1236 NULL, /* USER */
1237 NULL, /* LOGNAME */
1238 NULL, /* container_uuid */
1239 NULL, /* LISTEN_FDS */
1240 NULL, /* LISTEN_PID */
1241 NULL
1242 };
1243
1244 envp[2] = strv_find_prefix(environ, "TERM=");
1245 n_env = 3;
1246
1247 close(pipefd[1]);
1248 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1249 close(pipefd[0]);
1250
1251 close_nointr_nofail(master);
1252 master = -1;
1253
1254 if (saved_attr_valid) {
1255 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1256 log_error("Failed to set terminal attributes: %m");
1257 goto child_fail;
1258 }
1259 }
1260
1261 close_nointr(STDIN_FILENO);
1262 close_nointr(STDOUT_FILENO);
1263 close_nointr(STDERR_FILENO);
1264
1265 close_nointr_nofail(kmsg_socket_pair[0]);
1266 kmsg_socket_pair[0] = -1;
1267
1268 reset_all_signal_handlers();
1269
1270 assert_se(sigemptyset(&mask) == 0);
1271 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1272
1273 k = open_terminal(console, O_RDWR);
1274 if (k != STDIN_FILENO) {
1275 if (k >= 0) {
1276 close_nointr_nofail(k);
1277 k = -EINVAL;
1278 }
1279
1280 log_error("Failed to open console: %s", strerror(-k));
1281 goto child_fail;
1282 }
1283
1284 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1285 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1286 log_error("Failed to duplicate console: %m");
1287 goto child_fail;
1288 }
1289
1290 if (setsid() < 0) {
1291 log_error("setsid() failed: %m");
1292 goto child_fail;
1293 }
1294
1295 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1296 log_error("PR_SET_PDEATHSIG failed: %m");
1297 goto child_fail;
1298 }
1299
1300 /* Mark everything as slave, so that we still
1301 * receive mounts from the real root, but don't
1302 * propagate mounts to the real root. */
1303 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1304 log_error("MS_SLAVE|MS_REC failed: %m");
1305 goto child_fail;
1306 }
1307
1308 /* Turn directory into bind mount */
1309 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1310 log_error("Failed to make bind mount.");
1311 goto child_fail;
1312 }
1313
1314 if (arg_read_only)
1315 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1316 log_error("Failed to make read-only.");
1317 goto child_fail;
1318 }
1319
1320 if (mount_all(arg_directory) < 0)
1321 goto child_fail;
1322
1323 if (copy_devnodes(arg_directory) < 0)
1324 goto child_fail;
1325
1326 dev_setup(arg_directory);
1327
1328 if (setup_dev_console(arg_directory, console) < 0)
1329 goto child_fail;
1330
1331 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1332 goto child_fail;
1333
1334 close_nointr_nofail(kmsg_socket_pair[1]);
1335 kmsg_socket_pair[1] = -1;
1336
1337 if (setup_boot_id(arg_directory) < 0)
1338 goto child_fail;
1339
1340 if (setup_timezone(arg_directory) < 0)
1341 goto child_fail;
1342
1343 if (setup_resolv_conf(arg_directory) < 0)
1344 goto child_fail;
1345
1346 if (setup_journal(arg_directory) < 0)
1347 goto child_fail;
1348
1349 if (chdir(arg_directory) < 0) {
1350 log_error("chdir(%s) failed: %m", arg_directory);
1351 goto child_fail;
1352 }
1353
1354 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1355 log_error("mount(MS_MOVE) failed: %m");
1356 goto child_fail;
1357 }
1358
1359 if (chroot(".") < 0) {
1360 log_error("chroot() failed: %m");
1361 goto child_fail;
1362 }
1363
1364 if (chdir("/") < 0) {
1365 log_error("chdir() failed: %m");
1366 goto child_fail;
1367 }
1368
1369 umask(0022);
1370
1371 loopback_setup();
1372
1373 if (drop_capabilities() < 0) {
1374 log_error("drop_capabilities() failed: %m");
1375 goto child_fail;
1376 }
1377
1378 if (arg_user) {
1379
1380 /* Note that this resolves user names
1381 * inside the container, and hence
1382 * accesses the NSS modules from the
1383 * container and not the host. This is
1384 * a bit weird... */
1385
1386 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1387 log_error("get_user_creds() failed: %m");
1388 goto child_fail;
1389 }
1390
1391 if (mkdir_parents_label(home, 0775) < 0) {
1392 log_error("mkdir_parents_label() failed: %m");
1393 goto child_fail;
1394 }
1395
1396 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1397 log_error("mkdir_safe_label() failed: %m");
1398 goto child_fail;
1399 }
1400
1401 if (initgroups((const char*)arg_user, gid) < 0) {
1402 log_error("initgroups() failed: %m");
1403 goto child_fail;
1404 }
1405
1406 if (setresgid(gid, gid, gid) < 0) {
1407 log_error("setregid() failed: %m");
1408 goto child_fail;
1409 }
1410
1411 if (setresuid(uid, uid, uid) < 0) {
1412 log_error("setreuid() failed: %m");
1413 goto child_fail;
1414 }
1415 } else {
1416 /* Reset everything fully to 0, just in case */
1417
1418 if (setgroups(0, NULL) < 0) {
1419 log_error("setgroups() failed: %m");
1420 goto child_fail;
1421 }
1422
1423 if (setresgid(0, 0, 0) < 0) {
1424 log_error("setregid() failed: %m");
1425 goto child_fail;
1426 }
1427
1428 if (setresuid(0, 0, 0) < 0) {
1429 log_error("setreuid() failed: %m");
1430 goto child_fail;
1431 }
1432 }
1433
1434 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1435 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1436 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1437 log_oom();
1438 goto child_fail;
1439 }
1440
1441 if (arg_uuid) {
1442 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1443 log_oom();
1444 goto child_fail;
1445 }
1446 }
1447
1448 if (fdset_size(fds) > 0) {
1449 k = fdset_cloexec(fds, false);
1450 if (k < 0) {
1451 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1452 goto child_fail;
1453 }
1454
1455 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1456 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1457 log_oom();
1458 goto child_fail;
1459 }
1460 }
1461
1462 setup_hostname();
1463
1464 if (arg_boot) {
1465 char **a;
1466 size_t l;
1467
1468 /* Automatically search for the init system */
1469
1470 l = 1 + argc - optind;
1471 a = newa(char*, l + 1);
1472 memcpy(a + 1, argv + optind, l * sizeof(char*));
1473
1474 a[0] = (char*) "/usr/lib/systemd/systemd";
1475 execve(a[0], a, (char**) envp);
1476
1477 a[0] = (char*) "/lib/systemd/systemd";
1478 execve(a[0], a, (char**) envp);
1479
1480 a[0] = (char*) "/sbin/init";
1481 execve(a[0], a, (char**) envp);
1482 } else if (argc > optind)
1483 execvpe(argv[optind], argv + optind, (char**) envp);
1484 else {
1485 chdir(home ? home : "/root");
1486 execle("/bin/bash", "-bash", NULL, (char**) envp);
1487 }
1488
1489 log_error("execv() failed: %m");
1490
1491 child_fail:
1492 _exit(EXIT_FAILURE);
1493 }
1494
1495 log_info("Init process in the container running as PID %d", pid);
1496 close(pipefd[0]);
1497 close(pipefd[1]);
1498
1499 fdset_free(fds);
1500 fds = NULL;
1501
1502 if (process_pty(master, pid, &mask) < 0)
1503 goto finish;
1504
1505 if (saved_attr_valid)
1506 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1507
1508 r = wait_for_terminate(pid, &status);
1509 if (r < 0) {
1510 r = EXIT_FAILURE;
1511 break;
1512 }
1513
1514 if (status.si_code == CLD_EXITED) {
1515 if (status.si_status != 0) {
1516 log_error("Container failed with error code %i.", status.si_status);
1517 r = status.si_status;
1518 break;
1519 }
1520
1521 log_debug("Container exited successfully.");
1522 break;
1523 } else if (status.si_code == CLD_KILLED &&
1524 status.si_status == SIGINT) {
1525 log_info("Container has been shut down.");
1526 r = 0;
1527 break;
1528 } else if (status.si_code == CLD_KILLED &&
1529 status.si_status == SIGHUP) {
1530 log_info("Container is being rebooted.");
1531 continue;
1532 } else if (status.si_code == CLD_KILLED ||
1533 status.si_code == CLD_DUMPED) {
1534
1535 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1536 r = EXIT_FAILURE;
1537 break;
1538 } else {
1539 log_error("Container failed due to unknown reason.");
1540 r = EXIT_FAILURE;
1541 break;
1542 }
1543 }
1544
1545 finish:
1546 if (saved_attr_valid)
1547 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1548
1549 if (master >= 0)
1550 close_nointr_nofail(master);
1551
1552 close_pipe(kmsg_socket_pair);
1553
1554 if (oldcg)
1555 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1556
1557 if (newcg)
1558 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1559
1560 free(arg_directory);
1561 strv_free(arg_controllers);
1562 free(oldcg);
1563 free(newcg);
1564
1565 fdset_free(fds);
1566
1567 return r;
1568 }