]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Introduce _cleanup_fdset_free_
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44 #include <linux/netlink.h>
45
46 #include <systemd/sd-daemon.h>
47 #include <systemd/sd-bus.h>
48
49 #include "log.h"
50 #include "util.h"
51 #include "mkdir.h"
52 #include "macro.h"
53 #include "audit.h"
54 #include "missing.h"
55 #include "cgroup-util.h"
56 #include "strv.h"
57 #include "path-util.h"
58 #include "loopback-setup.h"
59 #include "sd-id128.h"
60 #include "dev-setup.h"
61 #include "fdset.h"
62 #include "build.h"
63 #include "fileio.h"
64 #include "bus-internal.h"
65 #include "bus-message.h"
66
/* Numeric group id that is stringified into the "gid=" option of the
 * devpts mount in mount_all(). A definition supplied by the build
 * takes precedence over the fallback below. */
#if !defined(TTY_GID)
#  define TTY_GID 5
#endif
70
/* Journal linking policy, chosen with --link-journal= (or -j) and
 * acted upon by setup_journal(). */
typedef enum LinkJournal {
        /* setup_journal() returns right away without touching anything */
        LINK_NO = 0,

        /* Default: link if possible, silently skip when the machine ID is
         * missing/empty or one of the directories is already a mount point */
        LINK_AUTO,

        /* The host's /var/log/journal/<id> directory is created and bind
         * mounted onto the same path inside the container tree */
        LINK_HOST,

        /* The host's /var/log/journal/<id> is made a symlink pointing to
         * the directory inside the container tree */
        LINK_GUEST
} LinkJournal;
77
78 static char *arg_directory = NULL;
79 static char *arg_user = NULL;
80 static sd_id128_t arg_uuid = {};
81 static char *arg_machine = NULL;
82 static const char *arg_slice = NULL;
83 static bool arg_private_network = false;
84 static bool arg_read_only = false;
85 static bool arg_boot = false;
86 static LinkJournal arg_link_journal = LINK_AUTO;
87 static uint64_t arg_retain =
88 (1ULL << CAP_CHOWN) |
89 (1ULL << CAP_DAC_OVERRIDE) |
90 (1ULL << CAP_DAC_READ_SEARCH) |
91 (1ULL << CAP_FOWNER) |
92 (1ULL << CAP_FSETID) |
93 (1ULL << CAP_IPC_OWNER) |
94 (1ULL << CAP_KILL) |
95 (1ULL << CAP_LEASE) |
96 (1ULL << CAP_LINUX_IMMUTABLE) |
97 (1ULL << CAP_NET_BIND_SERVICE) |
98 (1ULL << CAP_NET_BROADCAST) |
99 (1ULL << CAP_NET_RAW) |
100 (1ULL << CAP_SETGID) |
101 (1ULL << CAP_SETFCAP) |
102 (1ULL << CAP_SETPCAP) |
103 (1ULL << CAP_SETUID) |
104 (1ULL << CAP_SYS_ADMIN) |
105 (1ULL << CAP_SYS_CHROOT) |
106 (1ULL << CAP_SYS_NICE) |
107 (1ULL << CAP_SYS_PTRACE) |
108 (1ULL << CAP_SYS_TTY_CONFIG) |
109 (1ULL << CAP_SYS_RESOURCE) |
110 (1ULL << CAP_SYS_BOOT) |
111 (1ULL << CAP_AUDIT_WRITE) |
112 (1ULL << CAP_AUDIT_CONTROL);
113 static char **arg_bind = NULL;
114 static char **arg_bind_ro = NULL;
115
116 static int help(void) {
117
118 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
119 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
120 " -h --help Show this help\n"
121 " --version Print version string\n"
122 " -D --directory=NAME Root directory for the container\n"
123 " -b --boot Boot up full system (i.e. invoke init)\n"
124 " -u --user=USER Run the command under specified user or uid\n"
125 " --uuid=UUID Set a specific machine UUID for the container\n"
126 " -M --machine=NAME Set the machine name for the container\n"
127 " -S --slice=SLICE Place the container in the specified slice\n"
128 " --private-network Disable network in container\n"
129 " --read-only Mount the root directory read-only\n"
130 " --capability=CAP In addition to the default, retain specified\n"
131 " capability\n"
132 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
133 " -j Equivalent to --link-journal=host\n"
134 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
135 " the container\n"
136 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
137 program_invocation_short_name);
138
139 return 0;
140 }
141
142 static int parse_argv(int argc, char *argv[]) {
143
144 enum {
145 ARG_VERSION = 0x100,
146 ARG_PRIVATE_NETWORK,
147 ARG_UUID,
148 ARG_READ_ONLY,
149 ARG_CAPABILITY,
150 ARG_LINK_JOURNAL,
151 ARG_BIND,
152 ARG_BIND_RO
153 };
154
155 static const struct option options[] = {
156 { "help", no_argument, NULL, 'h' },
157 { "version", no_argument, NULL, ARG_VERSION },
158 { "directory", required_argument, NULL, 'D' },
159 { "user", required_argument, NULL, 'u' },
160 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
161 { "boot", no_argument, NULL, 'b' },
162 { "uuid", required_argument, NULL, ARG_UUID },
163 { "read-only", no_argument, NULL, ARG_READ_ONLY },
164 { "capability", required_argument, NULL, ARG_CAPABILITY },
165 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
166 { "bind", required_argument, NULL, ARG_BIND },
167 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
168 { "machine", required_argument, NULL, 'M' },
169 { "slice", required_argument, NULL, 'S' },
170 { NULL, 0, NULL, 0 }
171 };
172
173 int c, r;
174
175 assert(argc >= 0);
176 assert(argv);
177
178 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
179
180 switch (c) {
181
182 case 'h':
183 help();
184 return 0;
185
186 case ARG_VERSION:
187 puts(PACKAGE_STRING);
188 puts(SYSTEMD_FEATURES);
189 return 0;
190
191 case 'D':
192 free(arg_directory);
193 arg_directory = canonicalize_file_name(optarg);
194 if (!arg_directory) {
195 log_error("Failed to canonicalize root directory.");
196 return -ENOMEM;
197 }
198
199 break;
200
201 case 'u':
202 free(arg_user);
203 arg_user = strdup(optarg);
204 if (!arg_user)
205 return log_oom();
206
207 break;
208
209 case ARG_PRIVATE_NETWORK:
210 arg_private_network = true;
211 break;
212
213 case 'b':
214 arg_boot = true;
215 break;
216
217 case ARG_UUID:
218 r = sd_id128_from_string(optarg, &arg_uuid);
219 if (r < 0) {
220 log_error("Invalid UUID: %s", optarg);
221 return r;
222 }
223 break;
224
225 case 'S':
226 arg_slice = strdup(optarg);
227 break;
228
229 case 'M':
230 if (!hostname_is_valid(optarg)) {
231 log_error("Invalid machine name: %s", optarg);
232 return -EINVAL;
233 }
234
235 free(arg_machine);
236 arg_machine = strdup(optarg);
237 if (!arg_machine)
238 return log_oom();
239
240 break;
241
242 case ARG_READ_ONLY:
243 arg_read_only = true;
244 break;
245
246 case ARG_CAPABILITY: {
247 char *state, *word;
248 size_t length;
249
250 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
251 cap_value_t cap;
252 char *t;
253
254 t = strndup(word, length);
255 if (!t)
256 return log_oom();
257
258 if (cap_from_name(t, &cap) < 0) {
259 log_error("Failed to parse capability %s.", t);
260 free(t);
261 return -EINVAL;
262 }
263
264 free(t);
265 arg_retain |= 1ULL << (uint64_t) cap;
266 }
267
268 break;
269 }
270
271 case 'j':
272 arg_link_journal = LINK_GUEST;
273 break;
274
275 case ARG_LINK_JOURNAL:
276 if (streq(optarg, "auto"))
277 arg_link_journal = LINK_AUTO;
278 else if (streq(optarg, "no"))
279 arg_link_journal = LINK_NO;
280 else if (streq(optarg, "guest"))
281 arg_link_journal = LINK_GUEST;
282 else if (streq(optarg, "host"))
283 arg_link_journal = LINK_HOST;
284 else {
285 log_error("Failed to parse link journal mode %s", optarg);
286 return -EINVAL;
287 }
288
289 break;
290
291 case ARG_BIND:
292 case ARG_BIND_RO: {
293 _cleanup_free_ char *a = NULL, *b = NULL;
294 char *e;
295 char ***x;
296
297 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
298
299 e = strchr(optarg, ':');
300 if (e) {
301 a = strndup(optarg, e - optarg);
302 b = strdup(e + 1);
303 } else {
304 a = strdup(optarg);
305 b = strdup(optarg);
306 }
307
308 if (!a || !b)
309 return log_oom();
310
311 if (!path_is_absolute(a) || !path_is_absolute(b)) {
312 log_error("Invalid bind mount specification: %s", optarg);
313 return -EINVAL;
314 }
315
316 r = strv_extend(x, a);
317 if (r < 0)
318 return r;
319
320 r = strv_extend(x, b);
321 if (r < 0)
322 return r;
323
324 break;
325 }
326
327 case '?':
328 return -EINVAL;
329
330 default:
331 log_error("Unknown option code %c", c);
332 return -EINVAL;
333 }
334 }
335
336 return 1;
337 }
338
339 static int mount_all(const char *dest) {
340
341 typedef struct MountPoint {
342 const char *what;
343 const char *where;
344 const char *type;
345 const char *options;
346 unsigned long flags;
347 bool fatal;
348 } MountPoint;
349
350 static const MountPoint mount_table[] = {
351 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
352 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
353 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
354 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
355 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
356 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
357 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
358 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
359 #ifdef HAVE_SELINUX
360 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
361 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
362 #endif
363 };
364
365 unsigned k;
366 int r = 0;
367
368 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
369 _cleanup_free_ char *where = NULL;
370 int t;
371
372 where = strjoin(dest, "/", mount_table[k].where, NULL);
373 if (!where)
374 return log_oom();
375
376 t = path_is_mount_point(where, true);
377 if (t < 0) {
378 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
379
380 if (r == 0)
381 r = t;
382
383 continue;
384 }
385
386 /* Skip this entry if it is not a remount. */
387 if (mount_table[k].what && t > 0)
388 continue;
389
390 mkdir_p(where, 0755);
391
392 if (mount(mount_table[k].what,
393 where,
394 mount_table[k].type,
395 mount_table[k].flags,
396 mount_table[k].options) < 0 &&
397 mount_table[k].fatal) {
398
399 log_error("mount(%s) failed: %m", where);
400
401 if (r == 0)
402 r = -errno;
403 }
404 }
405
406 return r;
407 }
408
409 static int mount_binds(const char *dest, char **l, unsigned long flags) {
410 char **x, **y;
411
412 STRV_FOREACH_PAIR(x, y, l) {
413 _cleanup_free_ char *where = NULL;
414 struct stat source_st, dest_st;
415
416 if (stat(*x, &source_st) < 0) {
417 log_error("failed to stat %s: %m", *x);
418 return -errno;
419 }
420
421 where = strjoin(dest, "/", *y, NULL);
422 if (!where)
423 return log_oom();
424
425 if (stat(where, &dest_st) == 0) {
426 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
427 log_error("The file types of %s and %s do not match. Refusing bind mount",
428 *x, where);
429 return -EINVAL;
430 }
431 } else {
432 /* Create the mount point, but be conservative -- refuse to create block
433 * and char devices. */
434 if (S_ISDIR(source_st.st_mode))
435 mkdir_p_label(where, 0755);
436 else if (S_ISFIFO(source_st.st_mode))
437 mkfifo(where, 0644);
438 else if (S_ISSOCK(source_st.st_mode))
439 mknod(where, 0644 | S_IFSOCK, 0);
440 else if (S_ISREG(source_st.st_mode))
441 touch(where);
442 else {
443 log_error("Refusing to create mountpoint for file: %s", *x);
444 return -ENOTSUP;
445 }
446 }
447
448 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
449 log_error("mount(%s) failed: %m", where);
450 return -errno;
451 }
452
453 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
454 log_error("mount(%s) failed: %m", where);
455 return -errno;
456 }
457 }
458
459 return 0;
460 }
461
462 static int setup_timezone(const char *dest) {
463 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
464 char *z, *y;
465 int r;
466
467 assert(dest);
468
469 /* Fix the timezone, if possible */
470 r = readlink_malloc("/etc/localtime", &p);
471 if (r < 0) {
472 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
473 return 0;
474 }
475
476 z = path_startswith(p, "../usr/share/zoneinfo/");
477 if (!z)
478 z = path_startswith(p, "/usr/share/zoneinfo/");
479 if (!z) {
480 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
481 return 0;
482 }
483
484 where = strappend(dest, "/etc/localtime");
485 if (!where)
486 return log_oom();
487
488 r = readlink_malloc(where, &q);
489 if (r >= 0) {
490 y = path_startswith(q, "../usr/share/zoneinfo/");
491 if (!y)
492 y = path_startswith(q, "/usr/share/zoneinfo/");
493
494
495 /* Already pointing to the right place? Then do nothing .. */
496 if (y && streq(y, z))
497 return 0;
498 }
499
500 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
501 if (!check)
502 return log_oom();
503
504 if (access(check, F_OK) < 0) {
505 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
506 return 0;
507 }
508
509 what = strappend("../usr/share/zoneinfo/", z);
510 if (!what)
511 return log_oom();
512
513 unlink(where);
514 if (symlink(what, where) < 0) {
515 log_error("Failed to correct timezone of container: %m");
516 return 0;
517 }
518
519 return 0;
520 }
521
522 static int setup_resolv_conf(const char *dest) {
523 char _cleanup_free_ *where = NULL;
524
525 assert(dest);
526
527 if (arg_private_network)
528 return 0;
529
530 /* Fix resolv.conf, if possible */
531 where = strappend(dest, "/etc/resolv.conf");
532 if (!where)
533 return log_oom();
534
535 /* We don't really care for the results of this really. If it
536 * fails, it fails, but meh... */
537 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
538
539 return 0;
540 }
541
542 static int setup_boot_id(const char *dest) {
543 _cleanup_free_ char *from = NULL, *to = NULL;
544 sd_id128_t rnd;
545 char as_uuid[37];
546 int r;
547
548 assert(dest);
549
550 /* Generate a new randomized boot ID, so that each boot-up of
551 * the container gets a new one */
552
553 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
554 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
555 if (!from || !to)
556 return log_oom();
557
558 r = sd_id128_randomize(&rnd);
559 if (r < 0) {
560 log_error("Failed to generate random boot id: %s", strerror(-r));
561 return r;
562 }
563
564 snprintf(as_uuid, sizeof(as_uuid),
565 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
566 SD_ID128_FORMAT_VAL(rnd));
567 char_array_0(as_uuid);
568
569 r = write_string_file(from, as_uuid);
570 if (r < 0) {
571 log_error("Failed to write boot id: %s", strerror(-r));
572 return r;
573 }
574
575 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
576 log_error("Failed to bind mount boot id: %m");
577 r = -errno;
578 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
579 log_warning("Failed to make boot id read-only: %m");
580
581 unlink(from);
582 return r;
583 }
584
585 static int copy_devnodes(const char *dest) {
586
587 static const char devnodes[] =
588 "null\0"
589 "zero\0"
590 "full\0"
591 "random\0"
592 "urandom\0"
593 "tty\0";
594
595 const char *d;
596 int r = 0;
597 _cleanup_umask_ mode_t u;
598
599 assert(dest);
600
601 u = umask(0000);
602
603 NULSTR_FOREACH(d, devnodes) {
604 struct stat st;
605 _cleanup_free_ char *from = NULL, *to = NULL;
606
607 asprintf(&from, "/dev/%s", d);
608 asprintf(&to, "%s/dev/%s", dest, d);
609
610 if (!from || !to) {
611 log_oom();
612
613 if (r == 0)
614 r = -ENOMEM;
615
616 break;
617 }
618
619 if (stat(from, &st) < 0) {
620
621 if (errno != ENOENT) {
622 log_error("Failed to stat %s: %m", from);
623 if (r == 0)
624 r = -errno;
625 }
626
627 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
628
629 log_error("%s is not a char or block device, cannot copy", from);
630 if (r == 0)
631 r = -EIO;
632
633 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
634
635 log_error("mknod(%s) failed: %m", dest);
636 if (r == 0)
637 r = -errno;
638 }
639 }
640
641 return r;
642 }
643
644 static int setup_ptmx(const char *dest) {
645 _cleanup_free_ char *p = NULL;
646
647 p = strappend(dest, "/dev/ptmx");
648 if (!p)
649 return log_oom();
650
651 if (symlink("pts/ptmx", p) < 0) {
652 log_error("Failed to create /dev/ptmx symlink: %m");
653 return -errno;
654 }
655
656 return 0;
657 }
658
659 static int setup_dev_console(const char *dest, const char *console) {
660 struct stat st;
661 _cleanup_free_ char *to = NULL;
662 int r;
663 _cleanup_umask_ mode_t u;
664
665 assert(dest);
666 assert(console);
667
668 u = umask(0000);
669
670 if (stat(console, &st) < 0) {
671 log_error("Failed to stat %s: %m", console);
672 return -errno;
673
674 } else if (!S_ISCHR(st.st_mode)) {
675 log_error("/dev/console is not a char device");
676 return -EIO;
677 }
678
679 r = chmod_and_chown(console, 0600, 0, 0);
680 if (r < 0) {
681 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
682 return r;
683 }
684
685 if (asprintf(&to, "%s/dev/console", dest) < 0)
686 return log_oom();
687
688 /* We need to bind mount the right tty to /dev/console since
689 * ptys can only exist on pts file systems. To have something
690 * to bind mount things on we create a device node first, that
691 * has the right major/minor (note that the major minor
692 * doesn't actually matter here, since we mount it over
693 * anyway). */
694
695 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
696 log_error("mknod() for /dev/console failed: %m");
697 return -errno;
698 }
699
700 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
701 log_error("Bind mount for /dev/console failed: %m");
702 return -errno;
703 }
704
705 return 0;
706 }
707
708 static int setup_kmsg(const char *dest, int kmsg_socket) {
709 _cleanup_free_ char *from = NULL, *to = NULL;
710 int r, fd, k;
711 _cleanup_umask_ mode_t u;
712 union {
713 struct cmsghdr cmsghdr;
714 uint8_t buf[CMSG_SPACE(sizeof(int))];
715 } control = {};
716 struct msghdr mh = {
717 .msg_control = &control,
718 .msg_controllen = sizeof(control),
719 };
720 struct cmsghdr *cmsg;
721
722 assert(dest);
723 assert(kmsg_socket >= 0);
724
725 u = umask(0000);
726
727 /* We create the kmsg FIFO as /dev/kmsg, but immediately
728 * delete it after bind mounting it to /proc/kmsg. While FIFOs
729 * on the reading side behave very similar to /proc/kmsg,
730 * their writing side behaves differently from /dev/kmsg in
731 * that writing blocks when nothing is reading. In order to
732 * avoid any problems with containers deadlocking due to this
733 * we simply make /dev/kmsg unavailable to the container. */
734 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
735 asprintf(&to, "%s/proc/kmsg", dest) < 0)
736 return log_oom();
737
738 if (mkfifo(from, 0600) < 0) {
739 log_error("mkfifo() for /dev/kmsg failed: %m");
740 return -errno;
741 }
742
743 r = chmod_and_chown(from, 0600, 0, 0);
744 if (r < 0) {
745 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
746 return r;
747 }
748
749 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
750 log_error("Bind mount for /proc/kmsg failed: %m");
751 return -errno;
752 }
753
754 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
755 if (fd < 0) {
756 log_error("Failed to open fifo: %m");
757 return -errno;
758 }
759
760 cmsg = CMSG_FIRSTHDR(&mh);
761 cmsg->cmsg_level = SOL_SOCKET;
762 cmsg->cmsg_type = SCM_RIGHTS;
763 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
764 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
765
766 mh.msg_controllen = cmsg->cmsg_len;
767
768 /* Store away the fd in the socket, so that it stays open as
769 * long as we run the child */
770 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
771 close_nointr_nofail(fd);
772
773 if (k < 0) {
774 log_error("Failed to send FIFO fd: %m");
775 return -errno;
776 }
777
778 /* And now make the FIFO unavailable as /dev/kmsg... */
779 unlink(from);
780 return 0;
781 }
782
783 static int setup_hostname(void) {
784
785 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
786 return -errno;
787
788 return 0;
789 }
790
791 static int setup_journal(const char *directory) {
792 sd_id128_t machine_id;
793 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
794 char *id;
795 int r;
796
797 if (arg_link_journal == LINK_NO)
798 return 0;
799
800 p = strappend(directory, "/etc/machine-id");
801 if (!p)
802 return log_oom();
803
804 r = read_one_line_file(p, &b);
805 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
806 return 0;
807 else if (r < 0) {
808 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
809 return r;
810 }
811
812 id = strstrip(b);
813 if (isempty(id) && arg_link_journal == LINK_AUTO)
814 return 0;
815
816 /* Verify validity */
817 r = sd_id128_from_string(id, &machine_id);
818 if (r < 0) {
819 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
820 return r;
821 }
822
823 free(p);
824 p = strappend("/var/log/journal/", id);
825 q = strjoin(directory, "/var/log/journal/", id, NULL);
826 if (!p || !q)
827 return log_oom();
828
829 if (path_is_mount_point(p, false) > 0) {
830 if (arg_link_journal != LINK_AUTO) {
831 log_error("%s: already a mount point, refusing to use for journal", p);
832 return -EEXIST;
833 }
834
835 return 0;
836 }
837
838 if (path_is_mount_point(q, false) > 0) {
839 if (arg_link_journal != LINK_AUTO) {
840 log_error("%s: already a mount point, refusing to use for journal", q);
841 return -EEXIST;
842 }
843
844 return 0;
845 }
846
847 r = readlink_and_make_absolute(p, &d);
848 if (r >= 0) {
849 if ((arg_link_journal == LINK_GUEST ||
850 arg_link_journal == LINK_AUTO) &&
851 path_equal(d, q)) {
852
853 r = mkdir_p(q, 0755);
854 if (r < 0)
855 log_warning("failed to create directory %s: %m", q);
856 return 0;
857 }
858
859 if (unlink(p) < 0) {
860 log_error("Failed to remove symlink %s: %m", p);
861 return -errno;
862 }
863 } else if (r == -EINVAL) {
864
865 if (arg_link_journal == LINK_GUEST &&
866 rmdir(p) < 0) {
867
868 if (errno == ENOTDIR) {
869 log_error("%s already exists and is neither a symlink nor a directory", p);
870 return r;
871 } else {
872 log_error("Failed to remove %s: %m", p);
873 return -errno;
874 }
875 }
876 } else if (r != -ENOENT) {
877 log_error("readlink(%s) failed: %m", p);
878 return r;
879 }
880
881 if (arg_link_journal == LINK_GUEST) {
882
883 if (symlink(q, p) < 0) {
884 log_error("Failed to symlink %s to %s: %m", q, p);
885 return -errno;
886 }
887
888 r = mkdir_p(q, 0755);
889 if (r < 0)
890 log_warning("failed to create directory %s: %m", q);
891 return 0;
892 }
893
894 if (arg_link_journal == LINK_HOST) {
895 r = mkdir_p(p, 0755);
896 if (r < 0) {
897 log_error("Failed to create %s: %m", p);
898 return r;
899 }
900
901 } else if (access(p, F_OK) < 0)
902 return 0;
903
904 if (dir_is_empty(q) == 0) {
905 log_error("%s not empty.", q);
906 return -ENOTEMPTY;
907 }
908
909 r = mkdir_p(q, 0755);
910 if (r < 0) {
911 log_error("Failed to create %s: %m", q);
912 return r;
913 }
914
915 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
916 log_error("Failed to bind mount journal from host into guest: %m");
917 return -errno;
918 }
919
920 return 0;
921 }
922
923 static int drop_capabilities(void) {
924 return capability_bounding_set_drop(~arg_retain, false);
925 }
926
927 static int process_pty(int master, pid_t pid, sigset_t *mask) {
928
929 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
930 size_t in_buffer_full = 0, out_buffer_full = 0;
931 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
932 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
933 int ep = -1, signal_fd = -1, r;
934 bool tried_orderly_shutdown = false;
935
936 assert(master >= 0);
937 assert(pid > 0);
938 assert(mask);
939
940 fd_nonblock(STDIN_FILENO, 1);
941 fd_nonblock(STDOUT_FILENO, 1);
942 fd_nonblock(master, 1);
943
944 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
945 if (signal_fd < 0) {
946 log_error("signalfd(): %m");
947 r = -errno;
948 goto finish;
949 }
950
951 ep = epoll_create1(EPOLL_CLOEXEC);
952 if (ep < 0) {
953 log_error("Failed to create epoll: %m");
954 r = -errno;
955 goto finish;
956 }
957
958 /* We read from STDIN only if this is actually a TTY,
959 * otherwise we assume non-interactivity. */
960 if (isatty(STDIN_FILENO)) {
961 zero(stdin_ev);
962 stdin_ev.events = EPOLLIN|EPOLLET;
963 stdin_ev.data.fd = STDIN_FILENO;
964
965 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
966 log_error("Failed to register STDIN in epoll: %m");
967 r = -errno;
968 goto finish;
969 }
970 }
971
972 zero(stdout_ev);
973 stdout_ev.events = EPOLLOUT|EPOLLET;
974 stdout_ev.data.fd = STDOUT_FILENO;
975
976 zero(master_ev);
977 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
978 master_ev.data.fd = master;
979
980 zero(signal_ev);
981 signal_ev.events = EPOLLIN;
982 signal_ev.data.fd = signal_fd;
983
984 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
985 if (errno != EPERM) {
986 log_error("Failed to register stdout in epoll: %m");
987 r = -errno;
988 goto finish;
989 }
990 /* stdout without epoll support. Likely redirected to regular file. */
991 stdout_writable = true;
992 }
993
994 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
995 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
996 log_error("Failed to register fds in epoll: %m");
997 r = -errno;
998 goto finish;
999 }
1000
1001 for (;;) {
1002 struct epoll_event ev[16];
1003 ssize_t k;
1004 int i, nfds;
1005
1006 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1007 if (nfds < 0) {
1008
1009 if (errno == EINTR || errno == EAGAIN)
1010 continue;
1011
1012 log_error("epoll_wait(): %m");
1013 r = -errno;
1014 goto finish;
1015 }
1016
1017 assert(nfds >= 1);
1018
1019 for (i = 0; i < nfds; i++) {
1020 if (ev[i].data.fd == STDIN_FILENO) {
1021
1022 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1023 stdin_readable = true;
1024
1025 } else if (ev[i].data.fd == STDOUT_FILENO) {
1026
1027 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1028 stdout_writable = true;
1029
1030 } else if (ev[i].data.fd == master) {
1031
1032 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1033 master_readable = true;
1034
1035 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1036 master_writable = true;
1037
1038 } else if (ev[i].data.fd == signal_fd) {
1039 struct signalfd_siginfo sfsi;
1040 ssize_t n;
1041
1042 n = read(signal_fd, &sfsi, sizeof(sfsi));
1043 if (n != sizeof(sfsi)) {
1044
1045 if (n >= 0) {
1046 log_error("Failed to read from signalfd: invalid block size");
1047 r = -EIO;
1048 goto finish;
1049 }
1050
1051 if (errno != EINTR && errno != EAGAIN) {
1052 log_error("Failed to read from signalfd: %m");
1053 r = -errno;
1054 goto finish;
1055 }
1056 } else {
1057
1058 if (sfsi.ssi_signo == SIGWINCH) {
1059 struct winsize ws;
1060
1061 /* The window size changed, let's forward that. */
1062 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1063 ioctl(master, TIOCSWINSZ, &ws);
1064 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1065
1066 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1067
1068 /* This only works for systemd... */
1069 tried_orderly_shutdown = true;
1070 kill(pid, SIGRTMIN+3);
1071
1072 } else {
1073 r = 0;
1074 goto finish;
1075 }
1076 }
1077 }
1078 }
1079
1080 while ((stdin_readable && in_buffer_full <= 0) ||
1081 (master_writable && in_buffer_full > 0) ||
1082 (master_readable && out_buffer_full <= 0) ||
1083 (stdout_writable && out_buffer_full > 0)) {
1084
1085 if (stdin_readable && in_buffer_full < LINE_MAX) {
1086
1087 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1088 if (k < 0) {
1089
1090 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1091 stdin_readable = false;
1092 else {
1093 log_error("read(): %m");
1094 r = -errno;
1095 goto finish;
1096 }
1097 } else
1098 in_buffer_full += (size_t) k;
1099 }
1100
1101 if (master_writable && in_buffer_full > 0) {
1102
1103 k = write(master, in_buffer, in_buffer_full);
1104 if (k < 0) {
1105
1106 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1107 master_writable = false;
1108 else {
1109 log_error("write(): %m");
1110 r = -errno;
1111 goto finish;
1112 }
1113
1114 } else {
1115 assert(in_buffer_full >= (size_t) k);
1116 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1117 in_buffer_full -= k;
1118 }
1119 }
1120
1121 if (master_readable && out_buffer_full < LINE_MAX) {
1122
1123 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1124 if (k < 0) {
1125
1126 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1127 master_readable = false;
1128 else {
1129 log_error("read(): %m");
1130 r = -errno;
1131 goto finish;
1132 }
1133 } else
1134 out_buffer_full += (size_t) k;
1135 }
1136
1137 if (stdout_writable && out_buffer_full > 0) {
1138
1139 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1140 if (k < 0) {
1141
1142 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1143 stdout_writable = false;
1144 else {
1145 log_error("write(): %m");
1146 r = -errno;
1147 goto finish;
1148 }
1149
1150 } else {
1151 assert(out_buffer_full >= (size_t) k);
1152 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1153 out_buffer_full -= k;
1154 }
1155 }
1156 }
1157 }
1158
1159 finish:
1160 if (ep >= 0)
1161 close_nointr_nofail(ep);
1162
1163 if (signal_fd >= 0)
1164 close_nointr_nofail(signal_fd);
1165
1166 return r;
1167 }
1168
1169 static int register_machine(void) {
1170 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1171 _cleanup_bus_unref_ sd_bus *bus = NULL;
1172 int r;
1173
1174 r = sd_bus_open_system(&bus);
1175 if (r < 0) {
1176 log_error("Failed to open system bus: %s", strerror(-r));
1177 return r;
1178 }
1179
1180 r = sd_bus_call_method(
1181 bus,
1182 "org.freedesktop.machine1",
1183 "/org/freedesktop/machine1",
1184 "org.freedesktop.machine1.Manager",
1185 "CreateMachine",
1186 &error,
1187 NULL,
1188 "sayssusa(sv)",
1189 arg_machine,
1190 SD_BUS_APPEND_ID128(arg_uuid),
1191 "nspawn",
1192 "container",
1193 (uint32_t) 0,
1194 strempty(arg_directory),
1195 1, "Slice", "s", strempty(arg_slice));
1196 if (r < 0) {
1197 log_error("Failed to register machine: %s", error.message ? error.message : strerror(-r));
1198 return r;
1199 }
1200
1201 return 0;
1202 }
1203
/* Reports whether a NETLINK_AUDIT socket can be created, which main()
 * takes as the sign that kernel auditing is enabled. The probe socket
 * is closed again right away. */
static bool audit_enabled(void) {
        int probe = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

        if (probe < 0)
                return false;

        close_nointr_nofail(probe);
        return true;
}
1214
1215 int main(int argc, char *argv[]) {
1216 pid_t pid = 0;
1217 int r = EXIT_FAILURE, k;
1218 _cleanup_close_ int master = -1;
1219 int n_fd_passed;
1220 const char *console = NULL;
1221 struct termios saved_attr, raw_attr;
1222 sigset_t mask;
1223 bool saved_attr_valid = false;
1224 struct winsize ws;
1225 int kmsg_socket_pair[2] = { -1, -1 };
1226 _cleanup_fdset_free_ FDSet *fds = NULL;
1227
1228 log_parse_environment();
1229 log_open();
1230
1231 k = parse_argv(argc, argv);
1232 if (k < 0)
1233 goto finish;
1234 else if (k == 0) {
1235 r = EXIT_SUCCESS;
1236 goto finish;
1237 }
1238
1239 if (arg_directory) {
1240 char *p;
1241
1242 p = path_make_absolute_cwd(arg_directory);
1243 free(arg_directory);
1244 arg_directory = p;
1245 } else
1246 arg_directory = get_current_dir_name();
1247
1248 if (!arg_directory) {
1249 log_error("Failed to determine path, please use -D.");
1250 goto finish;
1251 }
1252
1253 path_kill_slashes(arg_directory);
1254
1255 if (!arg_machine) {
1256 arg_machine = strdup(path_get_file_name(arg_directory));
1257 if (!arg_machine) {
1258 log_oom();
1259 goto finish;
1260 }
1261
1262 hostname_cleanup(arg_machine, false);
1263 if (isempty(arg_machine)) {
1264 log_error("Failed to determine machine name automatically, please use -M.");
1265 goto finish;
1266 }
1267 }
1268
1269 if (geteuid() != 0) {
1270 log_error("Need to be root.");
1271 goto finish;
1272 }
1273
1274 if (sd_booted() <= 0) {
1275 log_error("Not running on a systemd system.");
1276 goto finish;
1277 }
1278
1279 if (arg_boot && audit_enabled()) {
1280 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1281 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1282 "line before using systemd-nspawn. Sleeping for 5s...\n");
1283 sleep(5);
1284 }
1285
1286 if (path_equal(arg_directory, "/")) {
1287 log_error("Spawning container on root directory not supported.");
1288 goto finish;
1289 }
1290
1291 if (path_is_os_tree(arg_directory) <= 0) {
1292 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1293 goto finish;
1294 }
1295
1296 log_close();
1297 n_fd_passed = sd_listen_fds(false);
1298 if (n_fd_passed > 0) {
1299 k = fdset_new_listen_fds(&fds, false);
1300 if (k < 0) {
1301 log_error("Failed to collect file descriptors: %s", strerror(-k));
1302 goto finish;
1303 }
1304 }
1305 fdset_close_others(fds);
1306 log_open();
1307
1308 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1309 if (master < 0) {
1310 log_error("Failed to acquire pseudo tty: %m");
1311 goto finish;
1312 }
1313
1314 console = ptsname(master);
1315 if (!console) {
1316 log_error("Failed to determine tty name: %m");
1317 goto finish;
1318 }
1319
1320 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1321
1322 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1323 ioctl(master, TIOCSWINSZ, &ws);
1324
1325 if (unlockpt(master) < 0) {
1326 log_error("Failed to unlock tty: %m");
1327 goto finish;
1328 }
1329
1330 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1331 saved_attr_valid = true;
1332
1333 raw_attr = saved_attr;
1334 cfmakeraw(&raw_attr);
1335 raw_attr.c_lflag &= ~ECHO;
1336 }
1337
1338 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1339 log_error("Failed to create kmsg socket pair.");
1340 goto finish;
1341 }
1342
1343 sd_notify(0, "READY=1");
1344
1345 assert_se(sigemptyset(&mask) == 0);
1346 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1347 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1348
1349 for (;;) {
1350 siginfo_t status;
1351 int pipefd[2], pipefd2[2];
1352
1353 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1354 log_error("pipe2(): %m");
1355 goto finish;
1356 }
1357
1358 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1359 log_error("pipe2(): %m");
1360 close_pipe(pipefd);
1361 goto finish;
1362 }
1363
1364 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1365 if (pid < 0) {
1366 if (errno == EINVAL)
1367 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1368 else
1369 log_error("clone() failed: %m");
1370
1371 goto finish;
1372 }
1373
1374 if (pid == 0) {
1375 /* child */
1376 const char *home = NULL;
1377 uid_t uid = (uid_t) -1;
1378 gid_t gid = (gid_t) -1;
1379 unsigned n_env = 2;
1380 const char *envp[] = {
1381 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1382 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1383 NULL, /* TERM */
1384 NULL, /* HOME */
1385 NULL, /* USER */
1386 NULL, /* LOGNAME */
1387 NULL, /* container_uuid */
1388 NULL, /* LISTEN_FDS */
1389 NULL, /* LISTEN_PID */
1390 NULL
1391 };
1392
1393 envp[n_env] = strv_find_prefix(environ, "TERM=");
1394 if (envp[n_env])
1395 n_env ++;
1396
1397 /* Wait for the parent process to log our PID */
1398 close_nointr_nofail(pipefd[1]);
1399 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1400 close_nointr_nofail(pipefd[0]);
1401
1402 close_nointr_nofail(master);
1403 master = -1;
1404
1405 if (saved_attr_valid) {
1406 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1407 log_error("Failed to set terminal attributes: %m");
1408 goto child_fail;
1409 }
1410 }
1411
1412 close_nointr(STDIN_FILENO);
1413 close_nointr(STDOUT_FILENO);
1414 close_nointr(STDERR_FILENO);
1415
1416 close_nointr_nofail(kmsg_socket_pair[0]);
1417 kmsg_socket_pair[0] = -1;
1418
1419 reset_all_signal_handlers();
1420
1421 assert_se(sigemptyset(&mask) == 0);
1422 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1423
1424 k = open_terminal(console, O_RDWR);
1425 if (k != STDIN_FILENO) {
1426 if (k >= 0) {
1427 close_nointr_nofail(k);
1428 k = -EINVAL;
1429 }
1430
1431 log_error("Failed to open console: %s", strerror(-k));
1432 goto child_fail;
1433 }
1434
1435 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1436 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1437 log_error("Failed to duplicate console: %m");
1438 goto child_fail;
1439 }
1440
1441 if (setsid() < 0) {
1442 log_error("setsid() failed: %m");
1443 goto child_fail;
1444 }
1445
1446 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1447 log_error("PR_SET_PDEATHSIG failed: %m");
1448 goto child_fail;
1449 }
1450
1451 close_pipe(pipefd2);
1452
1453 r = register_machine();
1454 if (r < 0)
1455 goto finish;
1456
1457 /* Mark everything as slave, so that we still
1458 * receive mounts from the real root, but don't
1459 * propagate mounts to the real root. */
1460 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1461 log_error("MS_SLAVE|MS_REC failed: %m");
1462 goto child_fail;
1463 }
1464
1465 /* Turn directory into bind mount */
1466 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1467 log_error("Failed to make bind mount.");
1468 goto child_fail;
1469 }
1470
1471 if (arg_read_only)
1472 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1473 log_error("Failed to make read-only.");
1474 goto child_fail;
1475 }
1476
1477 if (mount_all(arg_directory) < 0)
1478 goto child_fail;
1479
1480 if (copy_devnodes(arg_directory) < 0)
1481 goto child_fail;
1482
1483 if (setup_ptmx(arg_directory) < 0)
1484 goto child_fail;
1485
1486 dev_setup(arg_directory);
1487
1488 if (setup_dev_console(arg_directory, console) < 0)
1489 goto child_fail;
1490
1491 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1492 goto child_fail;
1493
1494 close_nointr_nofail(kmsg_socket_pair[1]);
1495 kmsg_socket_pair[1] = -1;
1496
1497 if (setup_boot_id(arg_directory) < 0)
1498 goto child_fail;
1499
1500 if (setup_timezone(arg_directory) < 0)
1501 goto child_fail;
1502
1503 if (setup_resolv_conf(arg_directory) < 0)
1504 goto child_fail;
1505
1506 if (setup_journal(arg_directory) < 0)
1507 goto child_fail;
1508
1509 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1510 goto child_fail;
1511
1512 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1513 goto child_fail;
1514
1515 if (chdir(arg_directory) < 0) {
1516 log_error("chdir(%s) failed: %m", arg_directory);
1517 goto child_fail;
1518 }
1519
1520 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1521 log_error("mount(MS_MOVE) failed: %m");
1522 goto child_fail;
1523 }
1524
1525 if (chroot(".") < 0) {
1526 log_error("chroot() failed: %m");
1527 goto child_fail;
1528 }
1529
1530 if (chdir("/") < 0) {
1531 log_error("chdir() failed: %m");
1532 goto child_fail;
1533 }
1534
1535 umask(0022);
1536
1537 loopback_setup();
1538
1539 if (drop_capabilities() < 0) {
1540 log_error("drop_capabilities() failed: %m");
1541 goto child_fail;
1542 }
1543
1544 if (arg_user) {
1545
1546 /* Note that this resolves user names
1547 * inside the container, and hence
1548 * accesses the NSS modules from the
1549 * container and not the host. This is
1550 * a bit weird... */
1551
1552 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1553 log_error("get_user_creds() failed: %m");
1554 goto child_fail;
1555 }
1556
1557 if (mkdir_parents_label(home, 0775) < 0) {
1558 log_error("mkdir_parents_label() failed: %m");
1559 goto child_fail;
1560 }
1561
1562 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1563 log_error("mkdir_safe_label() failed: %m");
1564 goto child_fail;
1565 }
1566
1567 if (initgroups((const char*)arg_user, gid) < 0) {
1568 log_error("initgroups() failed: %m");
1569 goto child_fail;
1570 }
1571
1572 if (setresgid(gid, gid, gid) < 0) {
1573 log_error("setregid() failed: %m");
1574 goto child_fail;
1575 }
1576
1577 if (setresuid(uid, uid, uid) < 0) {
1578 log_error("setreuid() failed: %m");
1579 goto child_fail;
1580 }
1581 } else {
1582 /* Reset everything fully to 0, just in case */
1583
1584 if (setgroups(0, NULL) < 0) {
1585 log_error("setgroups() failed: %m");
1586 goto child_fail;
1587 }
1588
1589 if (setresgid(0, 0, 0) < 0) {
1590 log_error("setregid() failed: %m");
1591 goto child_fail;
1592 }
1593
1594 if (setresuid(0, 0, 0) < 0) {
1595 log_error("setreuid() failed: %m");
1596 goto child_fail;
1597 }
1598 }
1599
1600 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1601 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1602 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1603 log_oom();
1604 goto child_fail;
1605 }
1606
1607 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1608 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1609 log_oom();
1610 goto child_fail;
1611 }
1612 }
1613
1614 if (fdset_size(fds) > 0) {
1615 k = fdset_cloexec(fds, false);
1616 if (k < 0) {
1617 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1618 goto child_fail;
1619 }
1620
1621 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1622 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1623 log_oom();
1624 goto child_fail;
1625 }
1626 }
1627
1628 setup_hostname();
1629
1630 if (arg_boot) {
1631 char **a;
1632 size_t l;
1633
1634 /* Automatically search for the init system */
1635
1636 l = 1 + argc - optind;
1637 a = newa(char*, l + 1);
1638 memcpy(a + 1, argv + optind, l * sizeof(char*));
1639
1640 a[0] = (char*) "/usr/lib/systemd/systemd";
1641 execve(a[0], a, (char**) envp);
1642
1643 a[0] = (char*) "/lib/systemd/systemd";
1644 execve(a[0], a, (char**) envp);
1645
1646 a[0] = (char*) "/sbin/init";
1647 execve(a[0], a, (char**) envp);
1648 } else if (argc > optind)
1649 execvpe(argv[optind], argv + optind, (char**) envp);
1650 else {
1651 chdir(home ? home : "/root");
1652 execle("/bin/bash", "-bash", NULL, (char**) envp);
1653 }
1654
1655 log_error("execv() failed: %m");
1656
1657 child_fail:
1658 _exit(EXIT_FAILURE);
1659 }
1660
1661 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1662 close_nointr_nofail(pipefd[0]);
1663 close_nointr_nofail(pipefd[1]);
1664
1665 /* Wait for the child process to establish cgroup hierarchy */
1666 close_nointr_nofail(pipefd2[1]);
1667 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1668 close_nointr_nofail(pipefd2[0]);
1669
1670 fdset_free(fds);
1671 fds = NULL;
1672
1673 if (process_pty(master, pid, &mask) < 0)
1674 goto finish;
1675
1676 if (saved_attr_valid)
1677 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1678
1679 k = wait_for_terminate(pid, &status);
1680 if (k < 0) {
1681 r = EXIT_FAILURE;
1682 break;
1683 }
1684
1685 if (status.si_code == CLD_EXITED) {
1686 r = status.si_status;
1687 if (status.si_status != 0) {
1688 log_error("Container failed with error code %i.", status.si_status);
1689 break;
1690 }
1691
1692 log_debug("Container exited successfully.");
1693 break;
1694 } else if (status.si_code == CLD_KILLED &&
1695 status.si_status == SIGINT) {
1696 log_info("Container has been shut down.");
1697 r = 0;
1698 break;
1699 } else if (status.si_code == CLD_KILLED &&
1700 status.si_status == SIGHUP) {
1701 log_info("Container is being rebooted.");
1702 continue;
1703 } else if (status.si_code == CLD_KILLED ||
1704 status.si_code == CLD_DUMPED) {
1705
1706 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1707 r = EXIT_FAILURE;
1708 break;
1709 } else {
1710 log_error("Container failed due to unknown reason.");
1711 r = EXIT_FAILURE;
1712 break;
1713 }
1714 }
1715
1716 finish:
1717 if (saved_attr_valid)
1718 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1719
1720 close_pipe(kmsg_socket_pair);
1721
1722 if (pid > 0)
1723 kill(pid, SIGKILL);
1724
1725 free(arg_directory);
1726 free(arg_machine);
1727
1728 return r;
1729 }