]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: introduce the new /machine/ tree in the cgroup tree and move containers there
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44
45 #include <systemd/sd-daemon.h>
46
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "sd-id128.h"
58 #include "dev-setup.h"
59 #include "fdset.h"
60 #include "build.h"
61 #include "fileio.h"
62
63 #ifndef TTY_GID
64 #define TTY_GID 5
65 #endif
66
/* Selects how setup_journal() connects the container's journal
 * directory with /var/log/journal/<machine-id> on the host. */
typedef enum LinkJournal {
        LINK_NO,        /* Do not touch the journal directories at all */
        LINK_AUTO,      /* Bind mount the host directory into the guest if it
                         * already exists; quietly skip when something is missing */
        LINK_HOST,      /* Create the host directory if needed and bind mount
                         * it into the guest */
        LINK_GUEST      /* Turn the host path into a symlink to the guest's
                         * journal directory */
} LinkJournal;

/* Settings filled in by parse_argv() */
static char *arg_directory = NULL;              /* -D: container root directory */
static char *arg_user = NULL;                   /* -u: user to run the command as */
static char **arg_controllers = NULL;           /* -C: additional cgroup controllers */
static char *arg_uuid = NULL;                   /* --uuid (points into argv, not owned) */
static char *arg_machine = NULL;                /* -M: machine name */
static bool arg_private_network = false;        /* --private-network */
static bool arg_read_only = false;              /* --read-only */
static bool arg_boot = false;                   /* -b */
static LinkJournal arg_link_journal = LINK_AUTO;

/* Capabilities that stay in the container's bounding set; everything
 * not listed here is dropped by drop_capabilities(). --capability=
 * adds further bits. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL);

/* --bind= / --bind-ro=: flat lists of (source, destination) pairs */
static char **arg_bind = NULL;
static char **arg_bind_ro = NULL;
111
/* Prints the command line synopsis to stdout. Always returns 0.
 *
 * Note that -j selects the "guest" journal link mode, matching what
 * parse_argv() does for that switch. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                Show this help\n"
               "     --version             Print version string\n"
               "  -D --directory=NAME      Root directory for the container\n"
               "  -b --boot                Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER           Run the command under specified user or uid\n"
               "  -C --controllers=LIST    Put the container in specified comma-separated\n"
               "                           cgroup hierarchies\n"
               "     --uuid=UUID           Set a specific machine UUID for the container\n"
               "  -M --machine=NAME        Set the machine name for the container\n"
               "     --private-network     Disable network in container\n"
               "     --read-only           Mount the root directory read-only\n"
               "     --capability=CAP      In addition to the default, retain specified\n"
               "                           capability\n"
               "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
               "  -j                       Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
               "                           the container\n"
               "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
               program_invocation_short_name);

        return 0;
}
138
139 static int parse_argv(int argc, char *argv[]) {
140
141 enum {
142 ARG_VERSION = 0x100,
143 ARG_PRIVATE_NETWORK,
144 ARG_UUID,
145 ARG_READ_ONLY,
146 ARG_CAPABILITY,
147 ARG_LINK_JOURNAL,
148 ARG_BIND,
149 ARG_BIND_RO
150 };
151
152 static const struct option options[] = {
153 { "help", no_argument, NULL, 'h' },
154 { "version", no_argument, NULL, ARG_VERSION },
155 { "directory", required_argument, NULL, 'D' },
156 { "user", required_argument, NULL, 'u' },
157 { "controllers", required_argument, NULL, 'C' },
158 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
159 { "boot", no_argument, NULL, 'b' },
160 { "uuid", required_argument, NULL, ARG_UUID },
161 { "read-only", no_argument, NULL, ARG_READ_ONLY },
162 { "capability", required_argument, NULL, ARG_CAPABILITY },
163 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
164 { "bind", required_argument, NULL, ARG_BIND },
165 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
166 { "machine", required_argument, NULL, 'M' },
167 { NULL, 0, NULL, 0 }
168 };
169
170 int c;
171
172 assert(argc >= 0);
173 assert(argv);
174
175 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
176
177 switch (c) {
178
179 case 'h':
180 help();
181 return 0;
182
183 case ARG_VERSION:
184 puts(PACKAGE_STRING);
185 puts(SYSTEMD_FEATURES);
186 return 0;
187
188 case 'D':
189 free(arg_directory);
190 arg_directory = canonicalize_file_name(optarg);
191 if (!arg_directory) {
192 log_error("Failed to canonicalize root directory.");
193 return -ENOMEM;
194 }
195
196 break;
197
198 case 'u':
199 free(arg_user);
200 arg_user = strdup(optarg);
201 if (!arg_user)
202 return log_oom();
203
204 break;
205
206 case 'C':
207 strv_free(arg_controllers);
208 arg_controllers = strv_split(optarg, ",");
209 if (!arg_controllers)
210 return log_oom();
211
212 cg_shorten_controllers(arg_controllers);
213 break;
214
215 case ARG_PRIVATE_NETWORK:
216 arg_private_network = true;
217 break;
218
219 case 'b':
220 arg_boot = true;
221 break;
222
223 case ARG_UUID:
224 arg_uuid = optarg;
225 break;
226
227 case 'M':
228 if (!hostname_is_valid(optarg)) {
229 log_error("Invalid machine name: %s", optarg);
230 return -EINVAL;
231 }
232
233 free(arg_machine);
234 arg_machine = strdup(optarg);
235 if (!arg_machine)
236 return log_oom();
237
238 break;
239
240 case ARG_READ_ONLY:
241 arg_read_only = true;
242 break;
243
244 case ARG_CAPABILITY: {
245 char *state, *word;
246 size_t length;
247
248 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
249 cap_value_t cap;
250 char *t;
251
252 t = strndup(word, length);
253 if (!t)
254 return log_oom();
255
256 if (cap_from_name(t, &cap) < 0) {
257 log_error("Failed to parse capability %s.", t);
258 free(t);
259 return -EINVAL;
260 }
261
262 free(t);
263 arg_retain |= 1ULL << (uint64_t) cap;
264 }
265
266 break;
267 }
268
269 case 'j':
270 arg_link_journal = LINK_GUEST;
271 break;
272
273 case ARG_LINK_JOURNAL:
274 if (streq(optarg, "auto"))
275 arg_link_journal = LINK_AUTO;
276 else if (streq(optarg, "no"))
277 arg_link_journal = LINK_NO;
278 else if (streq(optarg, "guest"))
279 arg_link_journal = LINK_GUEST;
280 else if (streq(optarg, "host"))
281 arg_link_journal = LINK_HOST;
282 else {
283 log_error("Failed to parse link journal mode %s", optarg);
284 return -EINVAL;
285 }
286
287 break;
288
289 case ARG_BIND:
290 case ARG_BIND_RO: {
291 _cleanup_free_ char *a = NULL, *b = NULL;
292 char *e;
293 char ***x;
294 int r;
295
296 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
297
298 e = strchr(optarg, ':');
299 if (e) {
300 a = strndup(optarg, e - optarg);
301 b = strdup(e + 1);
302 } else {
303 a = strdup(optarg);
304 b = strdup(optarg);
305 }
306
307 if (!a || !b)
308 return log_oom();
309
310 if (!path_is_absolute(a) || !path_is_absolute(b)) {
311 log_error("Invalid bind mount specification: %s", optarg);
312 return -EINVAL;
313 }
314
315 r = strv_extend(x, a);
316 if (r < 0)
317 return r;
318
319 r = strv_extend(x, b);
320 if (r < 0)
321 return r;
322
323 break;
324 }
325
326 case '?':
327 return -EINVAL;
328
329 default:
330 log_error("Unknown option code %c", c);
331 return -EINVAL;
332 }
333 }
334
335 return 1;
336 }
337
338 static int mount_all(const char *dest) {
339
340 typedef struct MountPoint {
341 const char *what;
342 const char *where;
343 const char *type;
344 const char *options;
345 unsigned long flags;
346 bool fatal;
347 } MountPoint;
348
349 static const MountPoint mount_table[] = {
350 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
351 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
352 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
353 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
354 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
355 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
356 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
357 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
358 #ifdef HAVE_SELINUX
359 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
360 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
361 #endif
362 };
363
364 unsigned k;
365 int r = 0;
366
367 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
368 char _cleanup_free_ *where = NULL;
369 int t;
370
371 where = strjoin(dest, "/", mount_table[k].where, NULL);
372 if (!where)
373 return log_oom();
374
375 t = path_is_mount_point(where, true);
376 if (t < 0) {
377 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
378
379 if (r == 0)
380 r = t;
381
382 continue;
383 }
384
385 /* Skip this entry if it is not a remount. */
386 if (mount_table[k].what && t > 0)
387 continue;
388
389 mkdir_p(where, 0755);
390
391 if (mount(mount_table[k].what,
392 where,
393 mount_table[k].type,
394 mount_table[k].flags,
395 mount_table[k].options) < 0 &&
396 mount_table[k].fatal) {
397
398 log_error("mount(%s) failed: %m", where);
399
400 if (r == 0)
401 r = -errno;
402 }
403 }
404
405 return r;
406 }
407
408 static int mount_binds(const char *dest, char **l, unsigned long flags) {
409 char **x, **y;
410
411 STRV_FOREACH_PAIR(x, y, l) {
412 _cleanup_free_ char *where = NULL;
413
414 where = strjoin(dest, "/", *y, NULL);
415 if (!where)
416 return log_oom();
417
418 mkdir_p_label(where, 0755);
419
420 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
421 log_error("mount(%s) failed: %m", where);
422 return -errno;
423 }
424
425 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
426 log_error("mount(%s) failed: %m", where);
427 return -errno;
428 }
429 }
430
431 return 0;
432 }
433
434 static int setup_timezone(const char *dest) {
435 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
436 char *z, *y;
437 int r;
438
439 assert(dest);
440
441 /* Fix the timezone, if possible */
442 r = readlink_malloc("/etc/localtime", &p);
443 if (r < 0) {
444 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
445 return 0;
446 }
447
448 z = path_startswith(p, "../usr/share/zoneinfo/");
449 if (!z)
450 z = path_startswith(p, "/usr/share/zoneinfo/");
451 if (!z) {
452 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
453 return 0;
454 }
455
456 where = strappend(dest, "/etc/localtime");
457 if (!where)
458 return log_oom();
459
460 r = readlink_malloc(where, &q);
461 if (r >= 0) {
462 y = path_startswith(q, "../usr/share/zoneinfo/");
463 if (!y)
464 y = path_startswith(q, "/usr/share/zoneinfo/");
465
466
467 /* Already pointing to the right place? Then do nothing .. */
468 if (y && streq(y, z))
469 return 0;
470 }
471
472 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
473 if (!check)
474 return log_oom();
475
476 if (access(check, F_OK) < 0) {
477 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
478 return 0;
479 }
480
481 what = strappend("../usr/share/zoneinfo/", z);
482 if (!what)
483 return log_oom();
484
485 unlink(where);
486 if (symlink(what, where) < 0) {
487 log_error("Failed to correct timezone of container: %m");
488 return 0;
489 }
490
491 return 0;
492 }
493
494 static int setup_resolv_conf(const char *dest) {
495 char *where;
496
497 assert(dest);
498
499 if (arg_private_network)
500 return 0;
501
502 /* Fix resolv.conf, if possible */
503 where = strappend(dest, "/etc/resolv.conf");
504 if (!where)
505 return log_oom();
506
507 /* We don't really care for the results of this really. If it
508 * fails, it fails, but meh... */
509 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
510 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
511
512 free(where);
513
514 return 0;
515 }
516
517 static int setup_boot_id(const char *dest) {
518 char _cleanup_free_ *from = NULL, *to = NULL;
519 sd_id128_t rnd;
520 char as_uuid[37];
521 int r;
522
523 assert(dest);
524
525 /* Generate a new randomized boot ID, so that each boot-up of
526 * the container gets a new one */
527
528 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
529 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
530 if (!from || !to)
531 return log_oom();
532
533 r = sd_id128_randomize(&rnd);
534 if (r < 0) {
535 log_error("Failed to generate random boot id: %s", strerror(-r));
536 return r;
537 }
538
539 snprintf(as_uuid, sizeof(as_uuid),
540 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
541 SD_ID128_FORMAT_VAL(rnd));
542 char_array_0(as_uuid);
543
544 r = write_string_file(from, as_uuid);
545 if (r < 0) {
546 log_error("Failed to write boot id: %s", strerror(-r));
547 return r;
548 }
549
550 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
551 log_error("Failed to bind mount boot id: %m");
552 r = -errno;
553 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
554 log_warning("Failed to make boot id read-only: %m");
555
556 unlink(from);
557 return r;
558 }
559
560 static int copy_devnodes(const char *dest) {
561
562 static const char devnodes[] =
563 "null\0"
564 "zero\0"
565 "full\0"
566 "random\0"
567 "urandom\0"
568 "tty\0";
569
570 const char *d;
571 int r = 0;
572 mode_t _cleanup_umask_ u;
573
574 assert(dest);
575
576 u = umask(0000);
577
578 NULSTR_FOREACH(d, devnodes) {
579 struct stat st;
580 char _cleanup_free_ *from = NULL, *to = NULL;
581
582 asprintf(&from, "/dev/%s", d);
583 asprintf(&to, "%s/dev/%s", dest, d);
584
585 if (!from || !to) {
586 log_oom();
587
588 if (r == 0)
589 r = -ENOMEM;
590
591 break;
592 }
593
594 if (stat(from, &st) < 0) {
595
596 if (errno != ENOENT) {
597 log_error("Failed to stat %s: %m", from);
598 if (r == 0)
599 r = -errno;
600 }
601
602 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
603
604 log_error("%s is not a char or block device, cannot copy", from);
605 if (r == 0)
606 r = -EIO;
607
608 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
609
610 log_error("mknod(%s) failed: %m", dest);
611 if (r == 0)
612 r = -errno;
613 }
614 }
615
616 return r;
617 }
618
619 static int setup_ptmx(const char *dest) {
620 _cleanup_free_ char *p = NULL;
621
622 p = strappend(dest, "/dev/ptmx");
623 if (!p)
624 return log_oom();
625
626 if (symlink("pts/ptmx", p) < 0) {
627 log_error("Failed to create /dev/ptmx symlink: %m");
628 return -errno;
629 }
630
631 return 0;
632 }
633
634 static int setup_dev_console(const char *dest, const char *console) {
635 struct stat st;
636 char _cleanup_free_ *to = NULL;
637 int r;
638 mode_t _cleanup_umask_ u;
639
640 assert(dest);
641 assert(console);
642
643 u = umask(0000);
644
645 if (stat(console, &st) < 0) {
646 log_error("Failed to stat %s: %m", console);
647 return -errno;
648
649 } else if (!S_ISCHR(st.st_mode)) {
650 log_error("/dev/console is not a char device");
651 return -EIO;
652 }
653
654 r = chmod_and_chown(console, 0600, 0, 0);
655 if (r < 0) {
656 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
657 return r;
658 }
659
660 if (asprintf(&to, "%s/dev/console", dest) < 0)
661 return log_oom();
662
663 /* We need to bind mount the right tty to /dev/console since
664 * ptys can only exist on pts file systems. To have something
665 * to bind mount things on we create a device node first, that
666 * has the right major/minor (note that the major minor
667 * doesn't actually matter here, since we mount it over
668 * anyway). */
669
670 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
671 log_error("mknod() for /dev/console failed: %m");
672 return -errno;
673 }
674
675 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
676 log_error("Bind mount for /dev/console failed: %m");
677 return -errno;
678 }
679
680 return 0;
681 }
682
683 static int setup_kmsg(const char *dest, int kmsg_socket) {
684 char _cleanup_free_ *from = NULL, *to = NULL;
685 int r, fd, k;
686 mode_t _cleanup_umask_ u;
687 union {
688 struct cmsghdr cmsghdr;
689 uint8_t buf[CMSG_SPACE(sizeof(int))];
690 } control = {};
691 struct msghdr mh = {
692 .msg_control = &control,
693 .msg_controllen = sizeof(control),
694 };
695 struct cmsghdr *cmsg;
696
697 assert(dest);
698 assert(kmsg_socket >= 0);
699
700 u = umask(0000);
701
702 /* We create the kmsg FIFO as /dev/kmsg, but immediately
703 * delete it after bind mounting it to /proc/kmsg. While FIFOs
704 * on the reading side behave very similar to /proc/kmsg,
705 * their writing side behaves differently from /dev/kmsg in
706 * that writing blocks when nothing is reading. In order to
707 * avoid any problems with containers deadlocking due to this
708 * we simply make /dev/kmsg unavailable to the container. */
709 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
710 asprintf(&to, "%s/proc/kmsg", dest) < 0)
711 return log_oom();
712
713 if (mkfifo(from, 0600) < 0) {
714 log_error("mkfifo() for /dev/kmsg failed: %m");
715 return -errno;
716 }
717
718 r = chmod_and_chown(from, 0600, 0, 0);
719 if (r < 0) {
720 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
721 return r;
722 }
723
724 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
725 log_error("Bind mount for /proc/kmsg failed: %m");
726 return -errno;
727 }
728
729 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
730 if (fd < 0) {
731 log_error("Failed to open fifo: %m");
732 return -errno;
733 }
734
735 cmsg = CMSG_FIRSTHDR(&mh);
736 cmsg->cmsg_level = SOL_SOCKET;
737 cmsg->cmsg_type = SCM_RIGHTS;
738 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
739 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
740
741 mh.msg_controllen = cmsg->cmsg_len;
742
743 /* Store away the fd in the socket, so that it stays open as
744 * long as we run the child */
745 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
746 close_nointr_nofail(fd);
747
748 if (k < 0) {
749 log_error("Failed to send FIFO fd: %m");
750 return -errno;
751 }
752
753 /* And now make the FIFO unavailable as /dev/kmsg... */
754 unlink(from);
755 return 0;
756 }
757
758 static int setup_hostname(void) {
759
760 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
761 return -errno;
762
763 return 0;
764 }
765
766 static int setup_journal(const char *directory) {
767 sd_id128_t machine_id;
768 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
769 char *id;
770 int r;
771
772 if (arg_link_journal == LINK_NO)
773 return 0;
774
775 p = strappend(directory, "/etc/machine-id");
776 if (!p)
777 return log_oom();
778
779 r = read_one_line_file(p, &b);
780 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
781 return 0;
782 else if (r < 0) {
783 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
784 return r;
785 }
786
787 id = strstrip(b);
788 if (isempty(id) && arg_link_journal == LINK_AUTO)
789 return 0;
790
791 /* Verify validity */
792 r = sd_id128_from_string(id, &machine_id);
793 if (r < 0) {
794 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
795 return r;
796 }
797
798 free(p);
799 p = strappend("/var/log/journal/", id);
800 q = strjoin(directory, "/var/log/journal/", id, NULL);
801 if (!p || !q)
802 return log_oom();
803
804 if (path_is_mount_point(p, false) > 0) {
805 if (arg_link_journal != LINK_AUTO) {
806 log_error("%s: already a mount point, refusing to use for journal", p);
807 return -EEXIST;
808 }
809
810 return 0;
811 }
812
813 if (path_is_mount_point(q, false) > 0) {
814 if (arg_link_journal != LINK_AUTO) {
815 log_error("%s: already a mount point, refusing to use for journal", q);
816 return -EEXIST;
817 }
818
819 return 0;
820 }
821
822 r = readlink_and_make_absolute(p, &d);
823 if (r >= 0) {
824 if ((arg_link_journal == LINK_GUEST ||
825 arg_link_journal == LINK_AUTO) &&
826 path_equal(d, q)) {
827
828 r = mkdir_p(q, 0755);
829 if (r < 0)
830 log_warning("failed to create directory %s: %m", q);
831 return 0;
832 }
833
834 if (unlink(p) < 0) {
835 log_error("Failed to remove symlink %s: %m", p);
836 return -errno;
837 }
838 } else if (r == -EINVAL) {
839
840 if (arg_link_journal == LINK_GUEST &&
841 rmdir(p) < 0) {
842
843 if (errno == ENOTDIR) {
844 log_error("%s already exists and is neither a symlink nor a directory", p);
845 return r;
846 } else {
847 log_error("Failed to remove %s: %m", p);
848 return -errno;
849 }
850 }
851 } else if (r != -ENOENT) {
852 log_error("readlink(%s) failed: %m", p);
853 return r;
854 }
855
856 if (arg_link_journal == LINK_GUEST) {
857
858 if (symlink(q, p) < 0) {
859 log_error("Failed to symlink %s to %s: %m", q, p);
860 return -errno;
861 }
862
863 r = mkdir_p(q, 0755);
864 if (r < 0)
865 log_warning("failed to create directory %s: %m", q);
866 return 0;
867 }
868
869 if (arg_link_journal == LINK_HOST) {
870 r = mkdir_p(p, 0755);
871 if (r < 0) {
872 log_error("Failed to create %s: %m", p);
873 return r;
874 }
875
876 } else if (access(p, F_OK) < 0)
877 return 0;
878
879 if (dir_is_empty(q) == 0) {
880 log_error("%s not empty.", q);
881 return -ENOTEMPTY;
882 }
883
884 r = mkdir_p(q, 0755);
885 if (r < 0) {
886 log_error("Failed to create %s: %m", q);
887 return r;
888 }
889
890 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
891 log_error("Failed to bind mount journal from host into guest: %m");
892 return -errno;
893 }
894
895 return 0;
896 }
897
898 static int setup_cgroup(const char *path) {
899 char **c;
900 int r;
901
902 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
903 if (r < 0) {
904 log_error("Failed to create cgroup: %s", strerror(-r));
905 return r;
906 }
907
908 STRV_FOREACH(c, arg_controllers) {
909 r = cg_create_and_attach(*c, path, 1);
910 if (r < 0)
911 log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
912 }
913
914 return 0;
915 }
916
917 static int drop_capabilities(void) {
918 return capability_bounding_set_drop(~arg_retain, false);
919 }
920
/* Checks whether 'path' looks like the root of an OS tree. The
 * existence of bin/sh below it serves as the flag.
 *
 * Returns 1 if it exists, 0 if not, -ENOMEM on allocation failure. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
934
935 static int process_pty(int master, pid_t pid, sigset_t *mask) {
936
937 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
938 size_t in_buffer_full = 0, out_buffer_full = 0;
939 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
940 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
941 int ep = -1, signal_fd = -1, r;
942 bool tried_orderly_shutdown = false;
943
944 assert(master >= 0);
945 assert(pid > 0);
946 assert(mask);
947
948 fd_nonblock(STDIN_FILENO, 1);
949 fd_nonblock(STDOUT_FILENO, 1);
950 fd_nonblock(master, 1);
951
952 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
953 if (signal_fd < 0) {
954 log_error("signalfd(): %m");
955 r = -errno;
956 goto finish;
957 }
958
959 ep = epoll_create1(EPOLL_CLOEXEC);
960 if (ep < 0) {
961 log_error("Failed to create epoll: %m");
962 r = -errno;
963 goto finish;
964 }
965
966 /* We read from STDIN only if this is actually a TTY,
967 * otherwise we assume non-interactivity. */
968 if (isatty(STDIN_FILENO)) {
969 zero(stdin_ev);
970 stdin_ev.events = EPOLLIN|EPOLLET;
971 stdin_ev.data.fd = STDIN_FILENO;
972
973 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
974 log_error("Failed to register STDIN in epoll: %m");
975 r = -errno;
976 goto finish;
977 }
978 }
979
980 zero(stdout_ev);
981 stdout_ev.events = EPOLLOUT|EPOLLET;
982 stdout_ev.data.fd = STDOUT_FILENO;
983
984 zero(master_ev);
985 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
986 master_ev.data.fd = master;
987
988 zero(signal_ev);
989 signal_ev.events = EPOLLIN;
990 signal_ev.data.fd = signal_fd;
991
992 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
993 if (errno != EPERM) {
994 log_error("Failed to register stdout in epoll: %m");
995 r = -errno;
996 goto finish;
997 }
998 /* stdout without epoll support. Likely redirected to regular file. */
999 stdout_writable = true;
1000 }
1001
1002 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1003 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1004 log_error("Failed to register fds in epoll: %m");
1005 r = -errno;
1006 goto finish;
1007 }
1008
1009 for (;;) {
1010 struct epoll_event ev[16];
1011 ssize_t k;
1012 int i, nfds;
1013
1014 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1015 if (nfds < 0) {
1016
1017 if (errno == EINTR || errno == EAGAIN)
1018 continue;
1019
1020 log_error("epoll_wait(): %m");
1021 r = -errno;
1022 goto finish;
1023 }
1024
1025 assert(nfds >= 1);
1026
1027 for (i = 0; i < nfds; i++) {
1028 if (ev[i].data.fd == STDIN_FILENO) {
1029
1030 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1031 stdin_readable = true;
1032
1033 } else if (ev[i].data.fd == STDOUT_FILENO) {
1034
1035 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1036 stdout_writable = true;
1037
1038 } else if (ev[i].data.fd == master) {
1039
1040 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1041 master_readable = true;
1042
1043 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1044 master_writable = true;
1045
1046 } else if (ev[i].data.fd == signal_fd) {
1047 struct signalfd_siginfo sfsi;
1048 ssize_t n;
1049
1050 n = read(signal_fd, &sfsi, sizeof(sfsi));
1051 if (n != sizeof(sfsi)) {
1052
1053 if (n >= 0) {
1054 log_error("Failed to read from signalfd: invalid block size");
1055 r = -EIO;
1056 goto finish;
1057 }
1058
1059 if (errno != EINTR && errno != EAGAIN) {
1060 log_error("Failed to read from signalfd: %m");
1061 r = -errno;
1062 goto finish;
1063 }
1064 } else {
1065
1066 if (sfsi.ssi_signo == SIGWINCH) {
1067 struct winsize ws;
1068
1069 /* The window size changed, let's forward that. */
1070 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1071 ioctl(master, TIOCSWINSZ, &ws);
1072 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1073
1074 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1075
1076 /* This only works for systemd... */
1077 tried_orderly_shutdown = true;
1078 kill(pid, SIGRTMIN+3);
1079
1080 } else {
1081 r = 0;
1082 goto finish;
1083 }
1084 }
1085 }
1086 }
1087
1088 while ((stdin_readable && in_buffer_full <= 0) ||
1089 (master_writable && in_buffer_full > 0) ||
1090 (master_readable && out_buffer_full <= 0) ||
1091 (stdout_writable && out_buffer_full > 0)) {
1092
1093 if (stdin_readable && in_buffer_full < LINE_MAX) {
1094
1095 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1096 if (k < 0) {
1097
1098 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1099 stdin_readable = false;
1100 else {
1101 log_error("read(): %m");
1102 r = -errno;
1103 goto finish;
1104 }
1105 } else
1106 in_buffer_full += (size_t) k;
1107 }
1108
1109 if (master_writable && in_buffer_full > 0) {
1110
1111 k = write(master, in_buffer, in_buffer_full);
1112 if (k < 0) {
1113
1114 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1115 master_writable = false;
1116 else {
1117 log_error("write(): %m");
1118 r = -errno;
1119 goto finish;
1120 }
1121
1122 } else {
1123 assert(in_buffer_full >= (size_t) k);
1124 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1125 in_buffer_full -= k;
1126 }
1127 }
1128
1129 if (master_readable && out_buffer_full < LINE_MAX) {
1130
1131 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1132 if (k < 0) {
1133
1134 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1135 master_readable = false;
1136 else {
1137 log_error("read(): %m");
1138 r = -errno;
1139 goto finish;
1140 }
1141 } else
1142 out_buffer_full += (size_t) k;
1143 }
1144
1145 if (stdout_writable && out_buffer_full > 0) {
1146
1147 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1148 if (k < 0) {
1149
1150 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1151 stdout_writable = false;
1152 else {
1153 log_error("write(): %m");
1154 r = -errno;
1155 goto finish;
1156 }
1157
1158 } else {
1159 assert(out_buffer_full >= (size_t) k);
1160 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1161 out_buffer_full -= k;
1162 }
1163 }
1164 }
1165 }
1166
1167 finish:
1168 if (ep >= 0)
1169 close_nointr_nofail(ep);
1170
1171 if (signal_fd >= 0)
1172 close_nointr_nofail(signal_fd);
1173
1174 return r;
1175 }
1176
1177 int main(int argc, char *argv[]) {
1178 pid_t pid = 0;
1179 int r = EXIT_FAILURE, k;
1180 _cleanup_free_ char *machine_root = NULL, *newcg = NULL;
1181 _cleanup_close_ int master = -1;
1182 int n_fd_passed;
1183 const char *console = NULL;
1184 struct termios saved_attr, raw_attr;
1185 sigset_t mask;
1186 bool saved_attr_valid = false;
1187 struct winsize ws;
1188 int kmsg_socket_pair[2] = { -1, -1 };
1189 FDSet *fds = NULL;
1190
1191 log_parse_environment();
1192 log_open();
1193
1194 r = parse_argv(argc, argv);
1195 if (r <= 0)
1196 goto finish;
1197
1198 if (arg_directory) {
1199 char *p;
1200
1201 p = path_make_absolute_cwd(arg_directory);
1202 free(arg_directory);
1203 arg_directory = p;
1204 } else
1205 arg_directory = get_current_dir_name();
1206
1207 if (!arg_directory) {
1208 log_error("Failed to determine path");
1209 goto finish;
1210 }
1211
1212 path_kill_slashes(arg_directory);
1213
1214 if (!arg_machine) {
1215 arg_machine = strdup(path_get_file_name(arg_directory));
1216 if (!arg_machine) {
1217 log_oom();
1218 goto finish;
1219 }
1220
1221 hostname_cleanup(arg_machine);
1222 if (isempty(arg_machine)) {
1223 log_error("Failed to determine machine name automatically, please use -M.");
1224 goto finish;
1225 }
1226 }
1227
1228 if (geteuid() != 0) {
1229 log_error("Need to be root.");
1230 goto finish;
1231 }
1232
1233 if (sd_booted() <= 0) {
1234 log_error("Not running on a systemd system.");
1235 goto finish;
1236 }
1237
1238 if (path_equal(arg_directory, "/")) {
1239 log_error("Spawning container on root directory not supported.");
1240 goto finish;
1241 }
1242
1243 if (is_os_tree(arg_directory) <= 0) {
1244 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1245 goto finish;
1246 }
1247
1248 log_close();
1249 n_fd_passed = sd_listen_fds(false);
1250 if (n_fd_passed > 0) {
1251 k = fdset_new_listen_fds(&fds, false);
1252 if (k < 0) {
1253 log_error("Failed to collect file descriptors: %s", strerror(-k));
1254 goto finish;
1255 }
1256 }
1257 fdset_close_others(fds);
1258 log_open();
1259
1260 k = cg_get_machine_path(&machine_root);
1261 if (k < 0) {
1262 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1263 goto finish;
1264 }
1265
1266 newcg = strjoin(machine_root, "/", arg_machine, NULL);
1267 if (!newcg) {
1268 log_error("Failed to allocate cgroup path.");
1269 goto finish;
1270 }
1271
1272 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, false);
1273 if (r <= 0 && r != -ENOENT) {
1274 log_error("Container already running.");
1275
1276 free(newcg);
1277 newcg = NULL;
1278
1279 goto finish;
1280 }
1281
1282 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1283 if (master < 0) {
1284 log_error("Failed to acquire pseudo tty: %m");
1285 goto finish;
1286 }
1287
1288 console = ptsname(master);
1289 if (!console) {
1290 log_error("Failed to determine tty name: %m");
1291 goto finish;
1292 }
1293
1294 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1295
1296 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1297 ioctl(master, TIOCSWINSZ, &ws);
1298
1299 if (unlockpt(master) < 0) {
1300 log_error("Failed to unlock tty: %m");
1301 goto finish;
1302 }
1303
1304 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1305 saved_attr_valid = true;
1306
1307 raw_attr = saved_attr;
1308 cfmakeraw(&raw_attr);
1309 raw_attr.c_lflag &= ~ECHO;
1310 }
1311
1312 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1313 log_error("Failed to create kmsg socket pair.");
1314 goto finish;
1315 }
1316
1317 assert_se(sigemptyset(&mask) == 0);
1318 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1319 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1320
1321 for (;;) {
1322 siginfo_t status;
1323 int pipefd[2];
1324
1325 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1326 log_error("pipe2(): %m");
1327 goto finish;
1328 }
1329
1330 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1331 if (pid < 0) {
1332 if (errno == EINVAL)
1333 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1334 else
1335 log_error("clone() failed: %m");
1336
1337 goto finish;
1338 }
1339
1340 if (pid == 0) {
1341 /* child */
1342 const char *home = NULL;
1343 uid_t uid = (uid_t) -1;
1344 gid_t gid = (gid_t) -1;
1345 unsigned n_env = 2;
1346 const char *envp[] = {
1347 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1348 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1349 NULL, /* TERM */
1350 NULL, /* HOME */
1351 NULL, /* USER */
1352 NULL, /* LOGNAME */
1353 NULL, /* container_uuid */
1354 NULL, /* LISTEN_FDS */
1355 NULL, /* LISTEN_PID */
1356 NULL
1357 };
1358
1359 envp[n_env] = strv_find_prefix(environ, "TERM=");
1360 if (envp[n_env])
1361 n_env ++;
1362
1363 close_nointr_nofail(pipefd[1]);
1364 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1365 close_nointr_nofail(pipefd[0]);
1366
1367 close_nointr_nofail(master);
1368 master = -1;
1369
1370 if (saved_attr_valid) {
1371 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1372 log_error("Failed to set terminal attributes: %m");
1373 goto child_fail;
1374 }
1375 }
1376
1377 close_nointr(STDIN_FILENO);
1378 close_nointr(STDOUT_FILENO);
1379 close_nointr(STDERR_FILENO);
1380
1381 close_nointr_nofail(kmsg_socket_pair[0]);
1382 kmsg_socket_pair[0] = -1;
1383
1384 reset_all_signal_handlers();
1385
1386 assert_se(sigemptyset(&mask) == 0);
1387 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1388
1389 k = open_terminal(console, O_RDWR);
1390 if (k != STDIN_FILENO) {
1391 if (k >= 0) {
1392 close_nointr_nofail(k);
1393 k = -EINVAL;
1394 }
1395
1396 log_error("Failed to open console: %s", strerror(-k));
1397 goto child_fail;
1398 }
1399
1400 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1401 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1402 log_error("Failed to duplicate console: %m");
1403 goto child_fail;
1404 }
1405
1406 if (setsid() < 0) {
1407 log_error("setsid() failed: %m");
1408 goto child_fail;
1409 }
1410
1411 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1412 log_error("PR_SET_PDEATHSIG failed: %m");
1413 goto child_fail;
1414 }
1415
1416 if (setup_cgroup(newcg) < 0)
1417 goto child_fail;
1418
1419 /* Mark everything as slave, so that we still
1420 * receive mounts from the real root, but don't
1421 * propagate mounts to the real root. */
1422 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1423 log_error("MS_SLAVE|MS_REC failed: %m");
1424 goto child_fail;
1425 }
1426
1427 /* Turn directory into bind mount */
1428 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1429 log_error("Failed to make bind mount.");
1430 goto child_fail;
1431 }
1432
1433 if (arg_read_only)
1434 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1435 log_error("Failed to make read-only.");
1436 goto child_fail;
1437 }
1438
1439 if (mount_all(arg_directory) < 0)
1440 goto child_fail;
1441
1442 if (copy_devnodes(arg_directory) < 0)
1443 goto child_fail;
1444
1445 if (setup_ptmx(arg_directory) < 0)
1446 goto child_fail;
1447
1448 dev_setup(arg_directory);
1449
1450 if (setup_dev_console(arg_directory, console) < 0)
1451 goto child_fail;
1452
1453 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1454 goto child_fail;
1455
1456 close_nointr_nofail(kmsg_socket_pair[1]);
1457 kmsg_socket_pair[1] = -1;
1458
1459 if (setup_boot_id(arg_directory) < 0)
1460 goto child_fail;
1461
1462 if (setup_timezone(arg_directory) < 0)
1463 goto child_fail;
1464
1465 if (setup_resolv_conf(arg_directory) < 0)
1466 goto child_fail;
1467
1468 if (setup_journal(arg_directory) < 0)
1469 goto child_fail;
1470
1471 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1472 goto child_fail;
1473
1474 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1475 goto child_fail;
1476
1477 if (chdir(arg_directory) < 0) {
1478 log_error("chdir(%s) failed: %m", arg_directory);
1479 goto child_fail;
1480 }
1481
1482 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1483 log_error("mount(MS_MOVE) failed: %m");
1484 goto child_fail;
1485 }
1486
1487 if (chroot(".") < 0) {
1488 log_error("chroot() failed: %m");
1489 goto child_fail;
1490 }
1491
1492 if (chdir("/") < 0) {
1493 log_error("chdir() failed: %m");
1494 goto child_fail;
1495 }
1496
1497 umask(0022);
1498
1499 loopback_setup();
1500
1501 if (drop_capabilities() < 0) {
1502 log_error("drop_capabilities() failed: %m");
1503 goto child_fail;
1504 }
1505
1506 if (arg_user) {
1507
1508 /* Note that this resolves user names
1509 * inside the container, and hence
1510 * accesses the NSS modules from the
1511 * container and not the host. This is
1512 * a bit weird... */
1513
1514 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1515 log_error("get_user_creds() failed: %m");
1516 goto child_fail;
1517 }
1518
1519 if (mkdir_parents_label(home, 0775) < 0) {
1520 log_error("mkdir_parents_label() failed: %m");
1521 goto child_fail;
1522 }
1523
1524 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1525 log_error("mkdir_safe_label() failed: %m");
1526 goto child_fail;
1527 }
1528
1529 if (initgroups((const char*)arg_user, gid) < 0) {
1530 log_error("initgroups() failed: %m");
1531 goto child_fail;
1532 }
1533
1534 if (setresgid(gid, gid, gid) < 0) {
1535 log_error("setregid() failed: %m");
1536 goto child_fail;
1537 }
1538
1539 if (setresuid(uid, uid, uid) < 0) {
1540 log_error("setreuid() failed: %m");
1541 goto child_fail;
1542 }
1543 } else {
1544 /* Reset everything fully to 0, just in case */
1545
1546 if (setgroups(0, NULL) < 0) {
1547 log_error("setgroups() failed: %m");
1548 goto child_fail;
1549 }
1550
1551 if (setresgid(0, 0, 0) < 0) {
1552 log_error("setregid() failed: %m");
1553 goto child_fail;
1554 }
1555
1556 if (setresuid(0, 0, 0) < 0) {
1557 log_error("setreuid() failed: %m");
1558 goto child_fail;
1559 }
1560 }
1561
1562 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1563 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1564 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1565 log_oom();
1566 goto child_fail;
1567 }
1568
1569 if (arg_uuid) {
1570 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1571 log_oom();
1572 goto child_fail;
1573 }
1574 }
1575
1576 if (fdset_size(fds) > 0) {
1577 k = fdset_cloexec(fds, false);
1578 if (k < 0) {
1579 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1580 goto child_fail;
1581 }
1582
1583 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1584 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1585 log_oom();
1586 goto child_fail;
1587 }
1588 }
1589
1590 setup_hostname();
1591
1592 if (arg_boot) {
1593 char **a;
1594 size_t l;
1595
1596 /* Automatically search for the init system */
1597
1598 l = 1 + argc - optind;
1599 a = newa(char*, l + 1);
1600 memcpy(a + 1, argv + optind, l * sizeof(char*));
1601
1602 a[0] = (char*) "/usr/lib/systemd/systemd";
1603 execve(a[0], a, (char**) envp);
1604
1605 a[0] = (char*) "/lib/systemd/systemd";
1606 execve(a[0], a, (char**) envp);
1607
1608 a[0] = (char*) "/sbin/init";
1609 execve(a[0], a, (char**) envp);
1610 } else if (argc > optind)
1611 execvpe(argv[optind], argv + optind, (char**) envp);
1612 else {
1613 chdir(home ? home : "/root");
1614 execle("/bin/bash", "-bash", NULL, (char**) envp);
1615 }
1616
1617 log_error("execv() failed: %m");
1618
1619 child_fail:
1620 _exit(EXIT_FAILURE);
1621 }
1622
1623 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1624 close_nointr_nofail(pipefd[0]);
1625 close_nointr_nofail(pipefd[1]);
1626
1627 fdset_free(fds);
1628 fds = NULL;
1629
1630 if (process_pty(master, pid, &mask) < 0)
1631 goto finish;
1632
1633 if (saved_attr_valid)
1634 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1635
1636 r = wait_for_terminate(pid, &status);
1637 if (r < 0) {
1638 r = EXIT_FAILURE;
1639 break;
1640 }
1641
1642 if (status.si_code == CLD_EXITED) {
1643 if (status.si_status != 0) {
1644 log_error("Container failed with error code %i.", status.si_status);
1645 r = status.si_status;
1646 break;
1647 }
1648
1649 log_debug("Container exited successfully.");
1650 break;
1651 } else if (status.si_code == CLD_KILLED &&
1652 status.si_status == SIGINT) {
1653 log_info("Container has been shut down.");
1654 r = 0;
1655 break;
1656 } else if (status.si_code == CLD_KILLED &&
1657 status.si_status == SIGHUP) {
1658 log_info("Container is being rebooted.");
1659 continue;
1660 } else if (status.si_code == CLD_KILLED ||
1661 status.si_code == CLD_DUMPED) {
1662
1663 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1664 r = EXIT_FAILURE;
1665 break;
1666 } else {
1667 log_error("Container failed due to unknown reason.");
1668 r = EXIT_FAILURE;
1669 break;
1670 }
1671 }
1672
1673 finish:
1674 if (saved_attr_valid)
1675 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1676
1677 close_pipe(kmsg_socket_pair);
1678
1679 if (newcg)
1680 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1681
1682 free(arg_directory);
1683 free(arg_machine);
1684 strv_free(arg_controllers);
1685
1686 fdset_free(fds);
1687
1688 return r;
1689 }