git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: add --bind= and --bind-ro= to bind mount host paths into the container
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44
45 #include <systemd/sd-daemon.h>
46
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "sd-id128.h"
58 #include "dev-setup.h"
59 #include "fdset.h"
60 #include "build.h"
61 #include "fileio.h"
62
/* Selects how setup_journal() ties the container's journal directory to the
 * host's /var/log/journal/<machine-id>. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* leave the journal alone */
        LINK_AUTO  = 1,  /* link only if the prerequisites happen to be in place */
        LINK_HOST  = 2,  /* keep the data on the host, bind mount it into the guest */
        LINK_GUEST = 3   /* keep the data in the guest, symlink it from the host */
} LinkJournal;
69
70 static char *arg_directory = NULL;
71 static char *arg_user = NULL;
72 static char **arg_controllers = NULL;
73 static char *arg_uuid = NULL;
74 static bool arg_private_network = false;
75 static bool arg_read_only = false;
76 static bool arg_boot = false;
77 static LinkJournal arg_link_journal = LINK_AUTO;
78 static uint64_t arg_retain =
79 (1ULL << CAP_CHOWN) |
80 (1ULL << CAP_DAC_OVERRIDE) |
81 (1ULL << CAP_DAC_READ_SEARCH) |
82 (1ULL << CAP_FOWNER) |
83 (1ULL << CAP_FSETID) |
84 (1ULL << CAP_IPC_OWNER) |
85 (1ULL << CAP_KILL) |
86 (1ULL << CAP_LEASE) |
87 (1ULL << CAP_LINUX_IMMUTABLE) |
88 (1ULL << CAP_NET_BIND_SERVICE) |
89 (1ULL << CAP_NET_BROADCAST) |
90 (1ULL << CAP_NET_RAW) |
91 (1ULL << CAP_SETGID) |
92 (1ULL << CAP_SETFCAP) |
93 (1ULL << CAP_SETPCAP) |
94 (1ULL << CAP_SETUID) |
95 (1ULL << CAP_SYS_ADMIN) |
96 (1ULL << CAP_SYS_CHROOT) |
97 (1ULL << CAP_SYS_NICE) |
98 (1ULL << CAP_SYS_PTRACE) |
99 (1ULL << CAP_SYS_TTY_CONFIG) |
100 (1ULL << CAP_SYS_RESOURCE) |
101 (1ULL << CAP_SYS_BOOT) |
102 (1ULL << CAP_AUDIT_WRITE) |
103 (1ULL << CAP_AUDIT_CONTROL);
104 static char **arg_bind = NULL;
105 static char **arg_bind_ro = NULL;
106
107 static int help(void) {
108
109 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
110 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
111 " -h --help Show this help\n"
112 " --version Print version string\n"
113 " -D --directory=NAME Root directory for the container\n"
114 " -b --boot Boot up full system (i.e. invoke init)\n"
115 " -u --user=USER Run the command under specified user or uid\n"
116 " -C --controllers=LIST Put the container in specified comma-separated\n"
117 " cgroup hierarchies\n"
118 " --uuid=UUID Set a specific machine UUID for the container\n"
119 " --private-network Disable network in container\n"
120 " --read-only Mount the root directory read-only\n"
121 " --capability=CAP In addition to the default, retain specified\n"
122 " capability\n"
123 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
124 " -j Equivalent to --link-journal=host\n"
125 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
126 " the container\n"
127 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
128 program_invocation_short_name);
129
130 return 0;
131 }
132
133 static int parse_argv(int argc, char *argv[]) {
134
135 enum {
136 ARG_VERSION = 0x100,
137 ARG_PRIVATE_NETWORK,
138 ARG_UUID,
139 ARG_READ_ONLY,
140 ARG_CAPABILITY,
141 ARG_LINK_JOURNAL,
142 ARG_BIND,
143 ARG_BIND_RO
144 };
145
146 static const struct option options[] = {
147 { "help", no_argument, NULL, 'h' },
148 { "version", no_argument, NULL, ARG_VERSION },
149 { "directory", required_argument, NULL, 'D' },
150 { "user", required_argument, NULL, 'u' },
151 { "controllers", required_argument, NULL, 'C' },
152 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
153 { "boot", no_argument, NULL, 'b' },
154 { "uuid", required_argument, NULL, ARG_UUID },
155 { "read-only", no_argument, NULL, ARG_READ_ONLY },
156 { "capability", required_argument, NULL, ARG_CAPABILITY },
157 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
158 { "bind", required_argument, NULL, ARG_BIND },
159 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
160 { NULL, 0, NULL, 0 }
161 };
162
163 int c;
164
165 assert(argc >= 0);
166 assert(argv);
167
168 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
169
170 switch (c) {
171
172 case 'h':
173 help();
174 return 0;
175
176 case ARG_VERSION:
177 puts(PACKAGE_STRING);
178 puts(SYSTEMD_FEATURES);
179 return 0;
180
181 case 'D':
182 free(arg_directory);
183 arg_directory = canonicalize_file_name(optarg);
184 if (!arg_directory) {
185 log_error("Failed to canonicalize root directory.");
186 return -ENOMEM;
187 }
188
189 break;
190
191 case 'u':
192 free(arg_user);
193 if (!(arg_user = strdup(optarg))) {
194 log_error("Failed to duplicate user name.");
195 return -ENOMEM;
196 }
197
198 break;
199
200 case 'C':
201 strv_free(arg_controllers);
202 arg_controllers = strv_split(optarg, ",");
203 if (!arg_controllers) {
204 log_error("Failed to split controllers list.");
205 return -ENOMEM;
206 }
207 strv_uniq(arg_controllers);
208
209 break;
210
211 case ARG_PRIVATE_NETWORK:
212 arg_private_network = true;
213 break;
214
215 case 'b':
216 arg_boot = true;
217 break;
218
219 case ARG_UUID:
220 arg_uuid = optarg;
221 break;
222
223 case ARG_READ_ONLY:
224 arg_read_only = true;
225 break;
226
227 case ARG_CAPABILITY: {
228 char *state, *word;
229 size_t length;
230
231 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
232 cap_value_t cap;
233 char *t;
234
235 t = strndup(word, length);
236 if (!t)
237 return log_oom();
238
239 if (cap_from_name(t, &cap) < 0) {
240 log_error("Failed to parse capability %s.", t);
241 free(t);
242 return -EINVAL;
243 }
244
245 free(t);
246 arg_retain |= 1ULL << (uint64_t) cap;
247 }
248
249 break;
250 }
251
252 case 'j':
253 arg_link_journal = LINK_GUEST;
254 break;
255
256 case ARG_LINK_JOURNAL:
257 if (streq(optarg, "auto"))
258 arg_link_journal = LINK_AUTO;
259 else if (streq(optarg, "no"))
260 arg_link_journal = LINK_NO;
261 else if (streq(optarg, "guest"))
262 arg_link_journal = LINK_GUEST;
263 else if (streq(optarg, "host"))
264 arg_link_journal = LINK_HOST;
265 else {
266 log_error("Failed to parse link journal mode %s", optarg);
267 return -EINVAL;
268 }
269
270 break;
271
272 case ARG_BIND:
273 case ARG_BIND_RO: {
274 _cleanup_free_ char *a = NULL, *b = NULL;
275 char *e;
276 char ***x;
277 int r;
278
279 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
280
281 e = strchr(optarg, ':');
282 if (e) {
283 a = strndup(optarg, e - optarg);
284 b = strdup(e + 1);
285 } else {
286 a = strdup(optarg);
287 b = strdup(optarg);
288 }
289
290 if (!a || !b)
291 return log_oom();
292
293 if (!path_is_absolute(a) || !path_is_absolute(b)) {
294 log_error("Invalid bind mount specification: %s", optarg);
295 return -EINVAL;
296 }
297
298 r = strv_extend(x, a);
299 if (r < 0)
300 return r;
301
302 r = strv_extend(x, b);
303 if (r < 0)
304 return r;
305
306 break;
307 }
308
309 case '?':
310 return -EINVAL;
311
312 default:
313 log_error("Unknown option code %c", c);
314 return -EINVAL;
315 }
316 }
317
318 return 1;
319 }
320
321 static int mount_all(const char *dest) {
322
323 typedef struct MountPoint {
324 const char *what;
325 const char *where;
326 const char *type;
327 const char *options;
328 unsigned long flags;
329 bool fatal;
330 } MountPoint;
331
332 static const MountPoint mount_table[] = {
333 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
334 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
335 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
336 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
337 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
338 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
339 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
340 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
341 #ifdef HAVE_SELINUX
342 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
343 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
344 #endif
345 };
346
347 unsigned k;
348 int r = 0;
349
350 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
351 char _cleanup_free_ *where = NULL;
352 int t;
353
354 where = strjoin(dest, "/", mount_table[k].where, NULL);
355 if (!where)
356 return log_oom();
357
358 t = path_is_mount_point(where, true);
359 if (t < 0) {
360 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
361
362 if (r == 0)
363 r = t;
364
365 continue;
366 }
367
368 /* Skip this entry if it is not a remount. */
369 if (mount_table[k].what && t > 0)
370 continue;
371
372 mkdir_p(where, 0755);
373
374 if (mount(mount_table[k].what,
375 where,
376 mount_table[k].type,
377 mount_table[k].flags,
378 mount_table[k].options) < 0 &&
379 mount_table[k].fatal) {
380
381 log_error("mount(%s) failed: %m", where);
382
383 if (r == 0)
384 r = -errno;
385 }
386 }
387
388 return r;
389 }
390
391 static int mount_binds(const char *dest, char **l, unsigned long flags) {
392 char **x, **y;
393
394 STRV_FOREACH_PAIR(x, y, l) {
395 _cleanup_free_ char *where = NULL;
396
397 where = strjoin(dest, "/", *y, NULL);
398 if (!where)
399 return log_oom();
400
401 mkdir_p_label(where, 0755);
402
403 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
404 log_error("mount(%s) failed: %m", where);
405 return -errno;
406 }
407
408 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
409 log_error("mount(%s) failed: %m", where);
410 return -errno;
411 }
412 }
413
414 return 0;
415 }
416
417 static int setup_timezone(const char *dest) {
418 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
419 char *z, *y;
420 int r;
421
422 assert(dest);
423
424 /* Fix the timezone, if possible */
425 r = readlink_malloc("/etc/localtime", &p);
426 if (r < 0) {
427 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
428 return 0;
429 }
430
431 z = path_startswith(p, "../usr/share/zoneinfo/");
432 if (!z)
433 z = path_startswith(p, "/usr/share/zoneinfo/");
434 if (!z) {
435 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
436 return 0;
437 }
438
439 where = strappend(dest, "/etc/localtime");
440 if (!where)
441 return log_oom();
442
443 r = readlink_malloc(where, &q);
444 if (r >= 0) {
445 y = path_startswith(q, "../usr/share/zoneinfo/");
446 if (!y)
447 y = path_startswith(q, "/usr/share/zoneinfo/");
448
449
450 /* Already pointing to the right place? Then do nothing .. */
451 if (y && streq(y, z))
452 return 0;
453 }
454
455 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
456 if (!check)
457 return log_oom();
458
459 if (access(check, F_OK) < 0) {
460 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
461 return 0;
462 }
463
464 what = strappend("../usr/share/zoneinfo/", z);
465 if (!what)
466 return log_oom();
467
468 unlink(where);
469 if (symlink(what, where) < 0) {
470 log_error("Failed to correct timezone of container: %m");
471 return 0;
472 }
473
474 return 0;
475 }
476
477 static int setup_resolv_conf(const char *dest) {
478 char *where;
479
480 assert(dest);
481
482 if (arg_private_network)
483 return 0;
484
485 /* Fix resolv.conf, if possible */
486 where = strappend(dest, "/etc/resolv.conf");
487 if (!where)
488 return log_oom();
489
490 /* We don't really care for the results of this really. If it
491 * fails, it fails, but meh... */
492 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
493 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
494
495 free(where);
496
497 return 0;
498 }
499
500 static int setup_boot_id(const char *dest) {
501 char _cleanup_free_ *from = NULL, *to = NULL;
502 sd_id128_t rnd;
503 char as_uuid[37];
504 int r;
505
506 assert(dest);
507
508 /* Generate a new randomized boot ID, so that each boot-up of
509 * the container gets a new one */
510
511 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
512 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
513 if (!from || !to)
514 return log_oom();
515
516 r = sd_id128_randomize(&rnd);
517 if (r < 0) {
518 log_error("Failed to generate random boot id: %s", strerror(-r));
519 return r;
520 }
521
522 snprintf(as_uuid, sizeof(as_uuid),
523 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
524 SD_ID128_FORMAT_VAL(rnd));
525 char_array_0(as_uuid);
526
527 r = write_one_line_file(from, as_uuid);
528 if (r < 0) {
529 log_error("Failed to write boot id: %s", strerror(-r));
530 return r;
531 }
532
533 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
534 log_error("Failed to bind mount boot id: %m");
535 r = -errno;
536 } else
537 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
538
539 unlink(from);
540 return r;
541 }
542
543 static int copy_devnodes(const char *dest) {
544
545 static const char devnodes[] =
546 "null\0"
547 "zero\0"
548 "full\0"
549 "random\0"
550 "urandom\0"
551 "tty\0"
552 "ptmx\0";
553
554 const char *d;
555 int r = 0;
556 mode_t _cleanup_umask_ u;
557
558 assert(dest);
559
560 u = umask(0000);
561
562 NULSTR_FOREACH(d, devnodes) {
563 struct stat st;
564 char _cleanup_free_ *from = NULL, *to = NULL;
565
566 asprintf(&from, "/dev/%s", d);
567 asprintf(&to, "%s/dev/%s", dest, d);
568
569 if (!from || !to) {
570 log_oom();
571
572 if (r == 0)
573 r = -ENOMEM;
574
575 break;
576 }
577
578 if (stat(from, &st) < 0) {
579
580 if (errno != ENOENT) {
581 log_error("Failed to stat %s: %m", from);
582 if (r == 0)
583 r = -errno;
584 }
585
586 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
587
588 log_error("%s is not a char or block device, cannot copy", from);
589 if (r == 0)
590 r = -EIO;
591
592 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
593
594 log_error("mknod(%s) failed: %m", dest);
595 if (r == 0)
596 r = -errno;
597 }
598 }
599
600 return r;
601 }
602
603 static int setup_dev_console(const char *dest, const char *console) {
604 struct stat st;
605 char _cleanup_free_ *to = NULL;
606 int r;
607 mode_t _cleanup_umask_ u;
608
609 assert(dest);
610 assert(console);
611
612 u = umask(0000);
613
614 if (stat(console, &st) < 0) {
615 log_error("Failed to stat %s: %m", console);
616 return -errno;
617
618 } else if (!S_ISCHR(st.st_mode)) {
619 log_error("/dev/console is not a char device");
620 return -EIO;
621 }
622
623 r = chmod_and_chown(console, 0600, 0, 0);
624 if (r < 0) {
625 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
626 return r;
627 }
628
629 if (asprintf(&to, "%s/dev/console", dest) < 0)
630 return log_oom();
631
632 /* We need to bind mount the right tty to /dev/console since
633 * ptys can only exist on pts file systems. To have something
634 * to bind mount things on we create a device node first, that
635 * has the right major/minor (note that the major minor
636 * doesn't actually matter here, since we mount it over
637 * anyway). */
638
639 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
640 log_error("mknod() for /dev/console failed: %m");
641 return -errno;
642 }
643
644 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
645 log_error("Bind mount for /dev/console failed: %m");
646 return -errno;
647 }
648
649 return 0;
650 }
651
652 static int setup_kmsg(const char *dest, int kmsg_socket) {
653 char _cleanup_free_ *from = NULL, *to = NULL;
654 int r, fd, k;
655 mode_t _cleanup_umask_ u;
656 union {
657 struct cmsghdr cmsghdr;
658 uint8_t buf[CMSG_SPACE(sizeof(int))];
659 } control;
660 struct msghdr mh;
661 struct cmsghdr *cmsg;
662
663 assert(dest);
664 assert(kmsg_socket >= 0);
665
666 u = umask(0000);
667
668 /* We create the kmsg FIFO as /dev/kmsg, but immediately
669 * delete it after bind mounting it to /proc/kmsg. While FIFOs
670 * on the reading side behave very similar to /proc/kmsg,
671 * their writing side behaves differently from /dev/kmsg in
672 * that writing blocks when nothing is reading. In order to
673 * avoid any problems with containers deadlocking due to this
674 * we simply make /dev/kmsg unavailable to the container. */
675 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
676 asprintf(&to, "%s/proc/kmsg", dest) < 0)
677 return log_oom();
678
679 if (mkfifo(from, 0600) < 0) {
680 log_error("mkfifo() for /dev/kmsg failed: %m");
681 return -errno;
682 }
683
684 r = chmod_and_chown(from, 0600, 0, 0);
685 if (r < 0) {
686 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
687 return r;
688 }
689
690 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
691 log_error("Bind mount for /proc/kmsg failed: %m");
692 return -errno;
693 }
694
695 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
696 if (fd < 0) {
697 log_error("Failed to open fifo: %m");
698 return -errno;
699 }
700
701 zero(mh);
702 zero(control);
703
704 mh.msg_control = &control;
705 mh.msg_controllen = sizeof(control);
706
707 cmsg = CMSG_FIRSTHDR(&mh);
708 cmsg->cmsg_level = SOL_SOCKET;
709 cmsg->cmsg_type = SCM_RIGHTS;
710 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
711 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
712
713 mh.msg_controllen = cmsg->cmsg_len;
714
715 /* Store away the fd in the socket, so that it stays open as
716 * long as we run the child */
717 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
718 close_nointr_nofail(fd);
719
720 if (k < 0) {
721 log_error("Failed to send FIFO fd: %m");
722 return -errno;
723 }
724
725 /* And now make the FIFO unavailable as /dev/kmsg... */
726 unlink(from);
727 return 0;
728 }
729
730 static int setup_hostname(void) {
731 char *hn;
732 int r = 0;
733
734 hn = path_get_file_name(arg_directory);
735 if (hn) {
736 hn = strdup(hn);
737 if (!hn)
738 return -ENOMEM;
739
740 hostname_cleanup(hn);
741
742 if (!isempty(hn))
743 if (sethostname(hn, strlen(hn)) < 0)
744 r = -errno;
745
746 free(hn);
747 }
748
749 return r;
750 }
751
752 static int setup_journal(const char *directory) {
753 sd_id128_t machine_id;
754 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
755 char *id;
756 int r;
757
758 if (arg_link_journal == LINK_NO)
759 return 0;
760
761 p = strappend(directory, "/etc/machine-id");
762 if (!p)
763 return log_oom();
764
765 r = read_one_line_file(p, &b);
766 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
767 return 0;
768 else if (r < 0) {
769 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
770 return r;
771 }
772
773 id = strstrip(b);
774 if (isempty(id) && arg_link_journal == LINK_AUTO)
775 return 0;
776
777 /* Verify validity */
778 r = sd_id128_from_string(id, &machine_id);
779 if (r < 0) {
780 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
781 return r;
782 }
783
784 free(p);
785 p = strappend("/var/log/journal/", id);
786 q = strjoin(directory, "/var/log/journal/", id, NULL);
787 if (!p || !q)
788 return log_oom();
789
790 if (path_is_mount_point(p, false) > 0) {
791 if (arg_link_journal != LINK_AUTO) {
792 log_error("%s: already a mount point, refusing to use for journal", p);
793 return -EEXIST;
794 }
795
796 return 0;
797 }
798
799 if (path_is_mount_point(q, false) > 0) {
800 if (arg_link_journal != LINK_AUTO) {
801 log_error("%s: already a mount point, refusing to use for journal", q);
802 return -EEXIST;
803 }
804
805 return 0;
806 }
807
808 r = readlink_and_make_absolute(p, &d);
809 if (r >= 0) {
810 if ((arg_link_journal == LINK_GUEST ||
811 arg_link_journal == LINK_AUTO) &&
812 path_equal(d, q)) {
813
814 r = mkdir_p(q, 0755);
815 if (r < 0)
816 log_warning("failed to create directory %s: %m", q);
817 return 0;
818 }
819
820 if (unlink(p) < 0) {
821 log_error("Failed to remove symlink %s: %m", p);
822 return -errno;
823 }
824 } else if (r == -EINVAL) {
825
826 if (arg_link_journal == LINK_GUEST &&
827 rmdir(p) < 0) {
828
829 if (errno == ENOTDIR) {
830 log_error("%s already exists and is neither a symlink nor a directory", p);
831 return r;
832 } else {
833 log_error("Failed to remove %s: %m", p);
834 return -errno;
835 }
836 }
837 } else if (r != -ENOENT) {
838 log_error("readlink(%s) failed: %m", p);
839 return r;
840 }
841
842 if (arg_link_journal == LINK_GUEST) {
843
844 if (symlink(q, p) < 0) {
845 log_error("Failed to symlink %s to %s: %m", q, p);
846 return -errno;
847 }
848
849 r = mkdir_p(q, 0755);
850 if (r < 0)
851 log_warning("failed to create directory %s: %m", q);
852 return 0;
853 }
854
855 if (arg_link_journal == LINK_HOST) {
856 r = mkdir_p(p, 0755);
857 if (r < 0) {
858 log_error("Failed to create %s: %m", p);
859 return r;
860 }
861
862 } else if (access(p, F_OK) < 0)
863 return 0;
864
865 if (dir_is_empty(q) == 0) {
866 log_error("%s not empty.", q);
867 return -ENOTEMPTY;
868 }
869
870 r = mkdir_p(q, 0755);
871 if (r < 0) {
872 log_error("Failed to create %s: %m", q);
873 return r;
874 }
875
876 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
877 log_error("Failed to bind mount journal from host into guest: %m");
878 return -errno;
879 }
880
881 return 0;
882 }
883
884 static int drop_capabilities(void) {
885 return capability_bounding_set_drop(~arg_retain, false);
886 }
887
/* Checks whether 'path' looks like the root of an OS installation. The
 * presence of <path>/bin/sh serves as the flag for that.
 *
 * Returns 1 if the file exists, 0 if it cannot be accessed, -ENOMEM if the
 * path to test could not be allocated. */
static int is_os_tree(const char *path) {
        char *marker;
        int found;

        if (asprintf(&marker, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(marker, F_OK) >= 0;
        free(marker);

        return found ? 1 : 0;
}
901
902 static int process_pty(int master, pid_t pid, sigset_t *mask) {
903
904 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
905 size_t in_buffer_full = 0, out_buffer_full = 0;
906 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
907 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
908 int ep = -1, signal_fd = -1, r;
909 bool tried_orderly_shutdown = false;
910
911 assert(master >= 0);
912 assert(pid > 0);
913 assert(mask);
914
915 fd_nonblock(STDIN_FILENO, 1);
916 fd_nonblock(STDOUT_FILENO, 1);
917 fd_nonblock(master, 1);
918
919 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
920 if (signal_fd < 0) {
921 log_error("signalfd(): %m");
922 r = -errno;
923 goto finish;
924 }
925
926 ep = epoll_create1(EPOLL_CLOEXEC);
927 if (ep < 0) {
928 log_error("Failed to create epoll: %m");
929 r = -errno;
930 goto finish;
931 }
932
933 /* We read from STDIN only if this is actually a TTY,
934 * otherwise we assume non-interactivity. */
935 if (isatty(STDIN_FILENO)) {
936 zero(stdin_ev);
937 stdin_ev.events = EPOLLIN|EPOLLET;
938 stdin_ev.data.fd = STDIN_FILENO;
939
940 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
941 log_error("Failed to register STDIN in epoll: %m");
942 r = -errno;
943 goto finish;
944 }
945 }
946
947 zero(stdout_ev);
948 stdout_ev.events = EPOLLOUT|EPOLLET;
949 stdout_ev.data.fd = STDOUT_FILENO;
950
951 zero(master_ev);
952 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
953 master_ev.data.fd = master;
954
955 zero(signal_ev);
956 signal_ev.events = EPOLLIN;
957 signal_ev.data.fd = signal_fd;
958
959 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
960 if (errno != EPERM) {
961 log_error("Failed to register stdout in epoll: %m");
962 r = -errno;
963 goto finish;
964 }
965 /* stdout without epoll support. Likely redirected to regular file. */
966 stdout_writable = true;
967 }
968
969 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
970 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
971 log_error("Failed to register fds in epoll: %m");
972 r = -errno;
973 goto finish;
974 }
975
976 for (;;) {
977 struct epoll_event ev[16];
978 ssize_t k;
979 int i, nfds;
980
981 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
982 if (nfds < 0) {
983
984 if (errno == EINTR || errno == EAGAIN)
985 continue;
986
987 log_error("epoll_wait(): %m");
988 r = -errno;
989 goto finish;
990 }
991
992 assert(nfds >= 1);
993
994 for (i = 0; i < nfds; i++) {
995 if (ev[i].data.fd == STDIN_FILENO) {
996
997 if (ev[i].events & (EPOLLIN|EPOLLHUP))
998 stdin_readable = true;
999
1000 } else if (ev[i].data.fd == STDOUT_FILENO) {
1001
1002 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1003 stdout_writable = true;
1004
1005 } else if (ev[i].data.fd == master) {
1006
1007 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1008 master_readable = true;
1009
1010 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1011 master_writable = true;
1012
1013 } else if (ev[i].data.fd == signal_fd) {
1014 struct signalfd_siginfo sfsi;
1015 ssize_t n;
1016
1017 n = read(signal_fd, &sfsi, sizeof(sfsi));
1018 if (n != sizeof(sfsi)) {
1019
1020 if (n >= 0) {
1021 log_error("Failed to read from signalfd: invalid block size");
1022 r = -EIO;
1023 goto finish;
1024 }
1025
1026 if (errno != EINTR && errno != EAGAIN) {
1027 log_error("Failed to read from signalfd: %m");
1028 r = -errno;
1029 goto finish;
1030 }
1031 } else {
1032
1033 if (sfsi.ssi_signo == SIGWINCH) {
1034 struct winsize ws;
1035
1036 /* The window size changed, let's forward that. */
1037 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1038 ioctl(master, TIOCSWINSZ, &ws);
1039 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1040
1041 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1042
1043 /* This only works for systemd... */
1044 tried_orderly_shutdown = true;
1045 kill(pid, SIGRTMIN+3);
1046
1047 } else {
1048 r = 0;
1049 goto finish;
1050 }
1051 }
1052 }
1053 }
1054
1055 while ((stdin_readable && in_buffer_full <= 0) ||
1056 (master_writable && in_buffer_full > 0) ||
1057 (master_readable && out_buffer_full <= 0) ||
1058 (stdout_writable && out_buffer_full > 0)) {
1059
1060 if (stdin_readable && in_buffer_full < LINE_MAX) {
1061
1062 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1063 if (k < 0) {
1064
1065 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1066 stdin_readable = false;
1067 else {
1068 log_error("read(): %m");
1069 r = -errno;
1070 goto finish;
1071 }
1072 } else
1073 in_buffer_full += (size_t) k;
1074 }
1075
1076 if (master_writable && in_buffer_full > 0) {
1077
1078 k = write(master, in_buffer, in_buffer_full);
1079 if (k < 0) {
1080
1081 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1082 master_writable = false;
1083 else {
1084 log_error("write(): %m");
1085 r = -errno;
1086 goto finish;
1087 }
1088
1089 } else {
1090 assert(in_buffer_full >= (size_t) k);
1091 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1092 in_buffer_full -= k;
1093 }
1094 }
1095
1096 if (master_readable && out_buffer_full < LINE_MAX) {
1097
1098 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1099 if (k < 0) {
1100
1101 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1102 master_readable = false;
1103 else {
1104 log_error("read(): %m");
1105 r = -errno;
1106 goto finish;
1107 }
1108 } else
1109 out_buffer_full += (size_t) k;
1110 }
1111
1112 if (stdout_writable && out_buffer_full > 0) {
1113
1114 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1115 if (k < 0) {
1116
1117 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1118 stdout_writable = false;
1119 else {
1120 log_error("write(): %m");
1121 r = -errno;
1122 goto finish;
1123 }
1124
1125 } else {
1126 assert(out_buffer_full >= (size_t) k);
1127 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1128 out_buffer_full -= k;
1129 }
1130 }
1131 }
1132 }
1133
1134 finish:
1135 if (ep >= 0)
1136 close_nointr_nofail(ep);
1137
1138 if (signal_fd >= 0)
1139 close_nointr_nofail(signal_fd);
1140
1141 return r;
1142 }
1143
1144 int main(int argc, char *argv[]) {
1145 pid_t pid = 0;
1146 int r = EXIT_FAILURE, k;
1147 char *oldcg = NULL, *newcg = NULL;
1148 char **controller = NULL;
1149 int master = -1, n_fd_passed;
1150 const char *console = NULL;
1151 struct termios saved_attr, raw_attr;
1152 sigset_t mask;
1153 bool saved_attr_valid = false;
1154 struct winsize ws;
1155 int kmsg_socket_pair[2] = { -1, -1 };
1156 FDSet *fds = NULL;
1157
1158 log_parse_environment();
1159 log_open();
1160
1161 r = parse_argv(argc, argv);
1162 if (r <= 0)
1163 goto finish;
1164
1165 if (arg_directory) {
1166 char *p;
1167
1168 p = path_make_absolute_cwd(arg_directory);
1169 free(arg_directory);
1170 arg_directory = p;
1171 } else
1172 arg_directory = get_current_dir_name();
1173
1174 if (!arg_directory) {
1175 log_error("Failed to determine path");
1176 goto finish;
1177 }
1178
1179 path_kill_slashes(arg_directory);
1180
1181 if (geteuid() != 0) {
1182 log_error("Need to be root.");
1183 goto finish;
1184 }
1185
1186 if (sd_booted() <= 0) {
1187 log_error("Not running on a systemd system.");
1188 goto finish;
1189 }
1190
1191 if (path_equal(arg_directory, "/")) {
1192 log_error("Spawning container on root directory not supported.");
1193 goto finish;
1194 }
1195
1196 if (is_os_tree(arg_directory) <= 0) {
1197 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1198 goto finish;
1199 }
1200
1201 log_close();
1202 n_fd_passed = sd_listen_fds(false);
1203 if (n_fd_passed > 0) {
1204 k = fdset_new_listen_fds(&fds, false);
1205 if (k < 0) {
1206 log_error("Failed to collect file descriptors: %s", strerror(-k));
1207 goto finish;
1208 }
1209 }
1210 fdset_close_others(fds);
1211 log_open();
1212
1213 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1214 if (k < 0) {
1215 log_error("Failed to determine current cgroup: %s", strerror(-k));
1216 goto finish;
1217 }
1218
1219 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1220 log_error("Failed to allocate cgroup path.");
1221 goto finish;
1222 }
1223
1224 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1225 if (k < 0) {
1226 log_error("Failed to create cgroup: %s", strerror(-k));
1227 goto finish;
1228 }
1229
1230 STRV_FOREACH(controller, arg_controllers) {
1231 k = cg_create_and_attach(*controller, newcg, 0);
1232 if (k < 0)
1233 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1234 }
1235
1236 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1237 if (master < 0) {
1238 log_error("Failed to acquire pseudo tty: %m");
1239 goto finish;
1240 }
1241
1242 console = ptsname(master);
1243 if (!console) {
1244 log_error("Failed to determine tty name: %m");
1245 goto finish;
1246 }
1247
1248 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1249
1250 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1251 ioctl(master, TIOCSWINSZ, &ws);
1252
1253 if (unlockpt(master) < 0) {
1254 log_error("Failed to unlock tty: %m");
1255 goto finish;
1256 }
1257
1258 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1259 saved_attr_valid = true;
1260
1261 raw_attr = saved_attr;
1262 cfmakeraw(&raw_attr);
1263 raw_attr.c_lflag &= ~ECHO;
1264 }
1265
1266 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1267 log_error("Failed to create kmsg socket pair");
1268 goto finish;
1269 }
1270
1271 assert_se(sigemptyset(&mask) == 0);
1272 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1273 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1274
1275 for (;;) {
1276 siginfo_t status;
1277 int pipefd[2];
1278
1279 if(pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1280 log_error("pipe2(): %m");
1281 goto finish;
1282 }
1283
1284 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1285 if (pid < 0) {
1286 if (errno == EINVAL)
1287 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1288 else
1289 log_error("clone() failed: %m");
1290
1291 goto finish;
1292 }
1293
1294 if (pid == 0) {
1295 /* child */
1296 const char *home = NULL;
1297 uid_t uid = (uid_t) -1;
1298 gid_t gid = (gid_t) -1;
1299 unsigned n_env = 0;
1300 const char *envp[] = {
1301 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1302 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1303 NULL, /* TERM */
1304 NULL, /* HOME */
1305 NULL, /* USER */
1306 NULL, /* LOGNAME */
1307 NULL, /* container_uuid */
1308 NULL, /* LISTEN_FDS */
1309 NULL, /* LISTEN_PID */
1310 NULL
1311 };
1312
1313 envp[2] = strv_find_prefix(environ, "TERM=");
1314 n_env = 3;
1315
1316 close_nointr_nofail(pipefd[1]);
1317 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1318 close_nointr_nofail(pipefd[0]);
1319
1320 close_nointr_nofail(master);
1321 master = -1;
1322
1323 if (saved_attr_valid) {
1324 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1325 log_error("Failed to set terminal attributes: %m");
1326 goto child_fail;
1327 }
1328 }
1329
1330 close_nointr(STDIN_FILENO);
1331 close_nointr(STDOUT_FILENO);
1332 close_nointr(STDERR_FILENO);
1333
1334 close_nointr_nofail(kmsg_socket_pair[0]);
1335 kmsg_socket_pair[0] = -1;
1336
1337 reset_all_signal_handlers();
1338
1339 assert_se(sigemptyset(&mask) == 0);
1340 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1341
1342 k = open_terminal(console, O_RDWR);
1343 if (k != STDIN_FILENO) {
1344 if (k >= 0) {
1345 close_nointr_nofail(k);
1346 k = -EINVAL;
1347 }
1348
1349 log_error("Failed to open console: %s", strerror(-k));
1350 goto child_fail;
1351 }
1352
1353 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1354 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1355 log_error("Failed to duplicate console: %m");
1356 goto child_fail;
1357 }
1358
1359 if (setsid() < 0) {
1360 log_error("setsid() failed: %m");
1361 goto child_fail;
1362 }
1363
1364 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1365 log_error("PR_SET_PDEATHSIG failed: %m");
1366 goto child_fail;
1367 }
1368
1369 /* Mark everything as slave, so that we still
1370 * receive mounts from the real root, but don't
1371 * propagate mounts to the real root. */
1372 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1373 log_error("MS_SLAVE|MS_REC failed: %m");
1374 goto child_fail;
1375 }
1376
1377 /* Turn directory into bind mount */
1378 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1379 log_error("Failed to make bind mount.");
1380 goto child_fail;
1381 }
1382
1383 if (arg_read_only)
1384 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1385 log_error("Failed to make read-only.");
1386 goto child_fail;
1387 }
1388
1389 if (mount_all(arg_directory) < 0)
1390 goto child_fail;
1391
1392 if (copy_devnodes(arg_directory) < 0)
1393 goto child_fail;
1394
1395 dev_setup(arg_directory);
1396
1397 if (setup_dev_console(arg_directory, console) < 0)
1398 goto child_fail;
1399
1400 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1401 goto child_fail;
1402
1403 close_nointr_nofail(kmsg_socket_pair[1]);
1404 kmsg_socket_pair[1] = -1;
1405
1406 if (setup_boot_id(arg_directory) < 0)
1407 goto child_fail;
1408
1409 if (setup_timezone(arg_directory) < 0)
1410 goto child_fail;
1411
1412 if (setup_resolv_conf(arg_directory) < 0)
1413 goto child_fail;
1414
1415 if (setup_journal(arg_directory) < 0)
1416 goto child_fail;
1417
1418 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1419 goto child_fail;
1420
1421 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1422 goto child_fail;
1423
1424 if (chdir(arg_directory) < 0) {
1425 log_error("chdir(%s) failed: %m", arg_directory);
1426 goto child_fail;
1427 }
1428
1429 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1430 log_error("mount(MS_MOVE) failed: %m");
1431 goto child_fail;
1432 }
1433
1434 if (chroot(".") < 0) {
1435 log_error("chroot() failed: %m");
1436 goto child_fail;
1437 }
1438
1439 if (chdir("/") < 0) {
1440 log_error("chdir() failed: %m");
1441 goto child_fail;
1442 }
1443
1444 umask(0022);
1445
1446 loopback_setup();
1447
1448 if (drop_capabilities() < 0) {
1449 log_error("drop_capabilities() failed: %m");
1450 goto child_fail;
1451 }
1452
1453 if (arg_user) {
1454
1455 /* Note that this resolves user names
1456 * inside the container, and hence
1457 * accesses the NSS modules from the
1458 * container and not the host. This is
1459 * a bit weird... */
1460
1461 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1462 log_error("get_user_creds() failed: %m");
1463 goto child_fail;
1464 }
1465
1466 if (mkdir_parents_label(home, 0775) < 0) {
1467 log_error("mkdir_parents_label() failed: %m");
1468 goto child_fail;
1469 }
1470
1471 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1472 log_error("mkdir_safe_label() failed: %m");
1473 goto child_fail;
1474 }
1475
1476 if (initgroups((const char*)arg_user, gid) < 0) {
1477 log_error("initgroups() failed: %m");
1478 goto child_fail;
1479 }
1480
1481 if (setresgid(gid, gid, gid) < 0) {
1482 log_error("setregid() failed: %m");
1483 goto child_fail;
1484 }
1485
1486 if (setresuid(uid, uid, uid) < 0) {
1487 log_error("setreuid() failed: %m");
1488 goto child_fail;
1489 }
1490 } else {
1491 /* Reset everything fully to 0, just in case */
1492
1493 if (setgroups(0, NULL) < 0) {
1494 log_error("setgroups() failed: %m");
1495 goto child_fail;
1496 }
1497
1498 if (setresgid(0, 0, 0) < 0) {
1499 log_error("setregid() failed: %m");
1500 goto child_fail;
1501 }
1502
1503 if (setresuid(0, 0, 0) < 0) {
1504 log_error("setreuid() failed: %m");
1505 goto child_fail;
1506 }
1507 }
1508
1509 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1510 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1511 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1512 log_oom();
1513 goto child_fail;
1514 }
1515
1516 if (arg_uuid) {
1517 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1518 log_oom();
1519 goto child_fail;
1520 }
1521 }
1522
1523 if (fdset_size(fds) > 0) {
1524 k = fdset_cloexec(fds, false);
1525 if (k < 0) {
1526 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1527 goto child_fail;
1528 }
1529
1530 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1531 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1532 log_oom();
1533 goto child_fail;
1534 }
1535 }
1536
1537 setup_hostname();
1538
1539 if (arg_boot) {
1540 char **a;
1541 size_t l;
1542
1543 /* Automatically search for the init system */
1544
1545 l = 1 + argc - optind;
1546 a = newa(char*, l + 1);
1547 memcpy(a + 1, argv + optind, l * sizeof(char*));
1548
1549 a[0] = (char*) "/usr/lib/systemd/systemd";
1550 execve(a[0], a, (char**) envp);
1551
1552 a[0] = (char*) "/lib/systemd/systemd";
1553 execve(a[0], a, (char**) envp);
1554
1555 a[0] = (char*) "/sbin/init";
1556 execve(a[0], a, (char**) envp);
1557 } else if (argc > optind)
1558 execvpe(argv[optind], argv + optind, (char**) envp);
1559 else {
1560 chdir(home ? home : "/root");
1561 execle("/bin/bash", "-bash", NULL, (char**) envp);
1562 }
1563
1564 log_error("execv() failed: %m");
1565
1566 child_fail:
1567 _exit(EXIT_FAILURE);
1568 }
1569
1570 log_info("Init process in the container running as PID %d", pid);
1571 close_nointr_nofail(pipefd[0]);
1572 close_nointr_nofail(pipefd[1]);
1573
1574 fdset_free(fds);
1575 fds = NULL;
1576
1577 if (process_pty(master, pid, &mask) < 0)
1578 goto finish;
1579
1580 if (saved_attr_valid)
1581 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1582
1583 r = wait_for_terminate(pid, &status);
1584 if (r < 0) {
1585 r = EXIT_FAILURE;
1586 break;
1587 }
1588
1589 if (status.si_code == CLD_EXITED) {
1590 if (status.si_status != 0) {
1591 log_error("Container failed with error code %i.", status.si_status);
1592 r = status.si_status;
1593 break;
1594 }
1595
1596 log_debug("Container exited successfully.");
1597 break;
1598 } else if (status.si_code == CLD_KILLED &&
1599 status.si_status == SIGINT) {
1600 log_info("Container has been shut down.");
1601 r = 0;
1602 break;
1603 } else if (status.si_code == CLD_KILLED &&
1604 status.si_status == SIGHUP) {
1605 log_info("Container is being rebooted.");
1606 continue;
1607 } else if (status.si_code == CLD_KILLED ||
1608 status.si_code == CLD_DUMPED) {
1609
1610 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1611 r = EXIT_FAILURE;
1612 break;
1613 } else {
1614 log_error("Container failed due to unknown reason.");
1615 r = EXIT_FAILURE;
1616 break;
1617 }
1618 }
1619
1620 finish:
1621 if (saved_attr_valid)
1622 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1623
1624 if (master >= 0)
1625 close_nointr_nofail(master);
1626
1627 close_pipe(kmsg_socket_pair);
1628
1629 if (oldcg)
1630 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1631
1632 if (newcg)
1633 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1634
1635 free(arg_directory);
1636 strv_free(arg_controllers);
1637 free(oldcg);
1638 free(newcg);
1639
1640 fdset_free(fds);
1641
1642 return r;
1643 }