]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
bus: log message parsing errors everywhere with a generalized bus_log_parse_error()
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43
44 #include "sd-daemon.h"
45 #include "sd-bus.h"
46 #include "sd-id128.h"
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "dev-setup.h"
58 #include "fdset.h"
59 #include "build.h"
60 #include "fileio.h"
61 #include "bus-util.h"
62 #include "bus-error.h"
63 #include "ptyfwd.h"
64
65 #ifndef TTY_GID
66 #define TTY_GID 5
67 #endif
68
/* Journal linking mode for the container, chosen with --link-journal= or -j
 * and acted upon by setup_journal(). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no: setup_journal() does nothing */
        LINK_AUTO,      /* --link-journal=auto: the default value of arg_link_journal */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, also selected by -j */
} LinkJournal;
75
76 static char *arg_directory = NULL;
77 static char *arg_user = NULL;
78 static sd_id128_t arg_uuid = {};
79 static char *arg_machine = NULL;
80 static const char *arg_slice = NULL;
81 static bool arg_private_network = false;
82 static bool arg_read_only = false;
83 static bool arg_boot = false;
84 static LinkJournal arg_link_journal = LINK_AUTO;
85 static uint64_t arg_retain =
86 (1ULL << CAP_CHOWN) |
87 (1ULL << CAP_DAC_OVERRIDE) |
88 (1ULL << CAP_DAC_READ_SEARCH) |
89 (1ULL << CAP_FOWNER) |
90 (1ULL << CAP_FSETID) |
91 (1ULL << CAP_IPC_OWNER) |
92 (1ULL << CAP_KILL) |
93 (1ULL << CAP_LEASE) |
94 (1ULL << CAP_LINUX_IMMUTABLE) |
95 (1ULL << CAP_NET_BIND_SERVICE) |
96 (1ULL << CAP_NET_BROADCAST) |
97 (1ULL << CAP_NET_RAW) |
98 (1ULL << CAP_SETGID) |
99 (1ULL << CAP_SETFCAP) |
100 (1ULL << CAP_SETPCAP) |
101 (1ULL << CAP_SETUID) |
102 (1ULL << CAP_SYS_ADMIN) |
103 (1ULL << CAP_SYS_CHROOT) |
104 (1ULL << CAP_SYS_NICE) |
105 (1ULL << CAP_SYS_PTRACE) |
106 (1ULL << CAP_SYS_TTY_CONFIG) |
107 (1ULL << CAP_SYS_RESOURCE) |
108 (1ULL << CAP_SYS_BOOT) |
109 (1ULL << CAP_AUDIT_WRITE) |
110 (1ULL << CAP_AUDIT_CONTROL);
111 static char **arg_bind = NULL;
112 static char **arg_bind_ro = NULL;
113
/* Prints the usage summary to stdout.
 *
 * Always returns 0; parse_argv() passes that value on for -h/--help.
 *
 * The -j line says "guest" because parse_argv() maps -j to LINK_GUEST. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " --version Print version string\n"
               " -D --directory=NAME Root directory for the container\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " --uuid=UUID Set a specific machine UUID for the container\n"
               " -M --machine=NAME Set the machine name for the container\n"
               " -S --slice=SLICE Place the container in the specified slice\n"
               " --private-network Disable network in container\n"
               " --read-only Mount the root directory read-only\n"
               " --capability=CAP In addition to the default, retain specified\n"
               " capability\n"
               " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
               " -j Equivalent to --link-journal=guest\n"
               " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
               " the container\n"
               " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
               program_invocation_short_name);

        return 0;
}
139
140 static int parse_argv(int argc, char *argv[]) {
141
142 enum {
143 ARG_VERSION = 0x100,
144 ARG_PRIVATE_NETWORK,
145 ARG_UUID,
146 ARG_READ_ONLY,
147 ARG_CAPABILITY,
148 ARG_LINK_JOURNAL,
149 ARG_BIND,
150 ARG_BIND_RO
151 };
152
153 static const struct option options[] = {
154 { "help", no_argument, NULL, 'h' },
155 { "version", no_argument, NULL, ARG_VERSION },
156 { "directory", required_argument, NULL, 'D' },
157 { "user", required_argument, NULL, 'u' },
158 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
159 { "boot", no_argument, NULL, 'b' },
160 { "uuid", required_argument, NULL, ARG_UUID },
161 { "read-only", no_argument, NULL, ARG_READ_ONLY },
162 { "capability", required_argument, NULL, ARG_CAPABILITY },
163 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
164 { "bind", required_argument, NULL, ARG_BIND },
165 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
166 { "machine", required_argument, NULL, 'M' },
167 { "slice", required_argument, NULL, 'S' },
168 {}
169 };
170
171 int c, r;
172
173 assert(argc >= 0);
174 assert(argv);
175
176 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
177
178 switch (c) {
179
180 case 'h':
181 return help();
182
183 case ARG_VERSION:
184 puts(PACKAGE_STRING);
185 puts(SYSTEMD_FEATURES);
186 return 0;
187
188 case 'D':
189 free(arg_directory);
190 arg_directory = canonicalize_file_name(optarg);
191 if (!arg_directory) {
192 log_error("Failed to canonicalize root directory.");
193 return -ENOMEM;
194 }
195
196 break;
197
198 case 'u':
199 free(arg_user);
200 arg_user = strdup(optarg);
201 if (!arg_user)
202 return log_oom();
203
204 break;
205
206 case ARG_PRIVATE_NETWORK:
207 arg_private_network = true;
208 break;
209
210 case 'b':
211 arg_boot = true;
212 break;
213
214 case ARG_UUID:
215 r = sd_id128_from_string(optarg, &arg_uuid);
216 if (r < 0) {
217 log_error("Invalid UUID: %s", optarg);
218 return r;
219 }
220 break;
221
222 case 'S':
223 arg_slice = strdup(optarg);
224 if (!arg_slice)
225 return log_oom();
226
227 break;
228
229 case 'M':
230 if (!hostname_is_valid(optarg)) {
231 log_error("Invalid machine name: %s", optarg);
232 return -EINVAL;
233 }
234
235 free(arg_machine);
236 arg_machine = strdup(optarg);
237 if (!arg_machine)
238 return log_oom();
239
240 break;
241
242 case ARG_READ_ONLY:
243 arg_read_only = true;
244 break;
245
246 case ARG_CAPABILITY: {
247 char *state, *word;
248 size_t length;
249
250 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
251 cap_value_t cap;
252 char *t;
253
254 t = strndup(word, length);
255 if (!t)
256 return log_oom();
257
258 if (cap_from_name(t, &cap) < 0) {
259 log_error("Failed to parse capability %s.", t);
260 free(t);
261 return -EINVAL;
262 }
263
264 free(t);
265 arg_retain |= 1ULL << (uint64_t) cap;
266 }
267
268 break;
269 }
270
271 case 'j':
272 arg_link_journal = LINK_GUEST;
273 break;
274
275 case ARG_LINK_JOURNAL:
276 if (streq(optarg, "auto"))
277 arg_link_journal = LINK_AUTO;
278 else if (streq(optarg, "no"))
279 arg_link_journal = LINK_NO;
280 else if (streq(optarg, "guest"))
281 arg_link_journal = LINK_GUEST;
282 else if (streq(optarg, "host"))
283 arg_link_journal = LINK_HOST;
284 else {
285 log_error("Failed to parse link journal mode %s", optarg);
286 return -EINVAL;
287 }
288
289 break;
290
291 case ARG_BIND:
292 case ARG_BIND_RO: {
293 _cleanup_free_ char *a = NULL, *b = NULL;
294 char *e;
295 char ***x;
296
297 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
298
299 e = strchr(optarg, ':');
300 if (e) {
301 a = strndup(optarg, e - optarg);
302 b = strdup(e + 1);
303 } else {
304 a = strdup(optarg);
305 b = strdup(optarg);
306 }
307
308 if (!a || !b)
309 return log_oom();
310
311 if (!path_is_absolute(a) || !path_is_absolute(b)) {
312 log_error("Invalid bind mount specification: %s", optarg);
313 return -EINVAL;
314 }
315
316 r = strv_extend(x, a);
317 if (r < 0)
318 return log_oom();
319
320 r = strv_extend(x, b);
321 if (r < 0)
322 return log_oom();
323
324 break;
325 }
326
327 case '?':
328 return -EINVAL;
329
330 default:
331 assert_not_reached("Unhandled option");
332 }
333 }
334
335 return 1;
336 }
337
338 static int mount_all(const char *dest) {
339
340 typedef struct MountPoint {
341 const char *what;
342 const char *where;
343 const char *type;
344 const char *options;
345 unsigned long flags;
346 bool fatal;
347 } MountPoint;
348
349 static const MountPoint mount_table[] = {
350 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
351 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
352 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
353 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
354 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
355 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
356 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
357 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
358 #ifdef HAVE_SELINUX
359 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
360 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
361 #endif
362 };
363
364 unsigned k;
365 int r = 0;
366
367 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
368 _cleanup_free_ char *where = NULL;
369 int t;
370
371 where = strjoin(dest, "/", mount_table[k].where, NULL);
372 if (!where)
373 return log_oom();
374
375 t = path_is_mount_point(where, true);
376 if (t < 0) {
377 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
378
379 if (r == 0)
380 r = t;
381
382 continue;
383 }
384
385 /* Skip this entry if it is not a remount. */
386 if (mount_table[k].what && t > 0)
387 continue;
388
389 mkdir_p(where, 0755);
390
391 if (mount(mount_table[k].what,
392 where,
393 mount_table[k].type,
394 mount_table[k].flags,
395 mount_table[k].options) < 0 &&
396 mount_table[k].fatal) {
397
398 log_error("mount(%s) failed: %m", where);
399
400 if (r == 0)
401 r = -errno;
402 }
403 }
404
405 return r;
406 }
407
408 static int mount_binds(const char *dest, char **l, unsigned long flags) {
409 char **x, **y;
410
411 STRV_FOREACH_PAIR(x, y, l) {
412 _cleanup_free_ char *where = NULL;
413 struct stat source_st, dest_st;
414
415 if (stat(*x, &source_st) < 0) {
416 log_error("failed to stat %s: %m", *x);
417 return -errno;
418 }
419
420 where = strjoin(dest, "/", *y, NULL);
421 if (!where)
422 return log_oom();
423
424 if (stat(where, &dest_st) == 0) {
425 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
426 log_error("The file types of %s and %s do not match. Refusing bind mount",
427 *x, where);
428 return -EINVAL;
429 }
430 } else {
431 /* Create the mount point, but be conservative -- refuse to create block
432 * and char devices. */
433 if (S_ISDIR(source_st.st_mode))
434 mkdir_p_label(where, 0755);
435 else if (S_ISFIFO(source_st.st_mode))
436 mkfifo(where, 0644);
437 else if (S_ISSOCK(source_st.st_mode))
438 mknod(where, 0644 | S_IFSOCK, 0);
439 else if (S_ISREG(source_st.st_mode))
440 touch(where);
441 else {
442 log_error("Refusing to create mountpoint for file: %s", *x);
443 return -ENOTSUP;
444 }
445 }
446
447 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
448 log_error("mount(%s) failed: %m", where);
449 return -errno;
450 }
451
452 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
453 log_error("mount(%s) failed: %m", where);
454 return -errno;
455 }
456 }
457
458 return 0;
459 }
460
461 static int setup_timezone(const char *dest) {
462 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
463 char *z, *y;
464 int r;
465
466 assert(dest);
467
468 /* Fix the timezone, if possible */
469 r = readlink_malloc("/etc/localtime", &p);
470 if (r < 0) {
471 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
472 return 0;
473 }
474
475 z = path_startswith(p, "../usr/share/zoneinfo/");
476 if (!z)
477 z = path_startswith(p, "/usr/share/zoneinfo/");
478 if (!z) {
479 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
480 return 0;
481 }
482
483 where = strappend(dest, "/etc/localtime");
484 if (!where)
485 return log_oom();
486
487 r = readlink_malloc(where, &q);
488 if (r >= 0) {
489 y = path_startswith(q, "../usr/share/zoneinfo/");
490 if (!y)
491 y = path_startswith(q, "/usr/share/zoneinfo/");
492
493
494 /* Already pointing to the right place? Then do nothing .. */
495 if (y && streq(y, z))
496 return 0;
497 }
498
499 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
500 if (!check)
501 return log_oom();
502
503 if (access(check, F_OK) < 0) {
504 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
505 return 0;
506 }
507
508 what = strappend("../usr/share/zoneinfo/", z);
509 if (!what)
510 return log_oom();
511
512 unlink(where);
513 if (symlink(what, where) < 0) {
514 log_error("Failed to correct timezone of container: %m");
515 return 0;
516 }
517
518 return 0;
519 }
520
521 static int setup_resolv_conf(const char *dest) {
522 char _cleanup_free_ *where = NULL;
523
524 assert(dest);
525
526 if (arg_private_network)
527 return 0;
528
529 /* Fix resolv.conf, if possible */
530 where = strappend(dest, "/etc/resolv.conf");
531 if (!where)
532 return log_oom();
533
534 /* We don't really care for the results of this really. If it
535 * fails, it fails, but meh... */
536 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
537
538 return 0;
539 }
540
541 static int setup_boot_id(const char *dest) {
542 _cleanup_free_ char *from = NULL, *to = NULL;
543 sd_id128_t rnd;
544 char as_uuid[37];
545 int r;
546
547 assert(dest);
548
549 /* Generate a new randomized boot ID, so that each boot-up of
550 * the container gets a new one */
551
552 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
553 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
554 if (!from || !to)
555 return log_oom();
556
557 r = sd_id128_randomize(&rnd);
558 if (r < 0) {
559 log_error("Failed to generate random boot id: %s", strerror(-r));
560 return r;
561 }
562
563 snprintf(as_uuid, sizeof(as_uuid),
564 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
565 SD_ID128_FORMAT_VAL(rnd));
566 char_array_0(as_uuid);
567
568 r = write_string_file(from, as_uuid);
569 if (r < 0) {
570 log_error("Failed to write boot id: %s", strerror(-r));
571 return r;
572 }
573
574 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
575 log_error("Failed to bind mount boot id: %m");
576 r = -errno;
577 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
578 log_warning("Failed to make boot id read-only: %m");
579
580 unlink(from);
581 return r;
582 }
583
584 static int copy_devnodes(const char *dest) {
585
586 static const char devnodes[] =
587 "null\0"
588 "zero\0"
589 "full\0"
590 "random\0"
591 "urandom\0"
592 "tty\0";
593
594 const char *d;
595 int r = 0;
596 _cleanup_umask_ mode_t u;
597
598 assert(dest);
599
600 u = umask(0000);
601
602 NULSTR_FOREACH(d, devnodes) {
603 struct stat st;
604 _cleanup_free_ char *from = NULL, *to = NULL;
605
606 asprintf(&from, "/dev/%s", d);
607 asprintf(&to, "%s/dev/%s", dest, d);
608
609 if (!from || !to) {
610 log_oom();
611
612 if (r == 0)
613 r = -ENOMEM;
614
615 break;
616 }
617
618 if (stat(from, &st) < 0) {
619
620 if (errno != ENOENT) {
621 log_error("Failed to stat %s: %m", from);
622 if (r == 0)
623 r = -errno;
624 }
625
626 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
627
628 log_error("%s is not a char or block device, cannot copy", from);
629 if (r == 0)
630 r = -EIO;
631
632 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
633
634 log_error("mknod(%s) failed: %m", dest);
635 if (r == 0)
636 r = -errno;
637 }
638 }
639
640 return r;
641 }
642
643 static int setup_ptmx(const char *dest) {
644 _cleanup_free_ char *p = NULL;
645
646 p = strappend(dest, "/dev/ptmx");
647 if (!p)
648 return log_oom();
649
650 if (symlink("pts/ptmx", p) < 0) {
651 log_error("Failed to create /dev/ptmx symlink: %m");
652 return -errno;
653 }
654
655 return 0;
656 }
657
658 static int setup_dev_console(const char *dest, const char *console) {
659 struct stat st;
660 _cleanup_free_ char *to = NULL;
661 int r;
662 _cleanup_umask_ mode_t u;
663
664 assert(dest);
665 assert(console);
666
667 u = umask(0000);
668
669 if (stat(console, &st) < 0) {
670 log_error("Failed to stat %s: %m", console);
671 return -errno;
672
673 } else if (!S_ISCHR(st.st_mode)) {
674 log_error("/dev/console is not a char device");
675 return -EIO;
676 }
677
678 r = chmod_and_chown(console, 0600, 0, 0);
679 if (r < 0) {
680 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
681 return r;
682 }
683
684 if (asprintf(&to, "%s/dev/console", dest) < 0)
685 return log_oom();
686
687 /* We need to bind mount the right tty to /dev/console since
688 * ptys can only exist on pts file systems. To have something
689 * to bind mount things on we create a device node first, that
690 * has the right major/minor (note that the major minor
691 * doesn't actually matter here, since we mount it over
692 * anyway). */
693
694 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
695 log_error("mknod() for /dev/console failed: %m");
696 return -errno;
697 }
698
699 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
700 log_error("Bind mount for /dev/console failed: %m");
701 return -errno;
702 }
703
704 return 0;
705 }
706
707 static int setup_kmsg(const char *dest, int kmsg_socket) {
708 _cleanup_free_ char *from = NULL, *to = NULL;
709 int r, fd, k;
710 _cleanup_umask_ mode_t u;
711 union {
712 struct cmsghdr cmsghdr;
713 uint8_t buf[CMSG_SPACE(sizeof(int))];
714 } control = {};
715 struct msghdr mh = {
716 .msg_control = &control,
717 .msg_controllen = sizeof(control),
718 };
719 struct cmsghdr *cmsg;
720
721 assert(dest);
722 assert(kmsg_socket >= 0);
723
724 u = umask(0000);
725
726 /* We create the kmsg FIFO as /dev/kmsg, but immediately
727 * delete it after bind mounting it to /proc/kmsg. While FIFOs
728 * on the reading side behave very similar to /proc/kmsg,
729 * their writing side behaves differently from /dev/kmsg in
730 * that writing blocks when nothing is reading. In order to
731 * avoid any problems with containers deadlocking due to this
732 * we simply make /dev/kmsg unavailable to the container. */
733 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
734 asprintf(&to, "%s/proc/kmsg", dest) < 0)
735 return log_oom();
736
737 if (mkfifo(from, 0600) < 0) {
738 log_error("mkfifo() for /dev/kmsg failed: %m");
739 return -errno;
740 }
741
742 r = chmod_and_chown(from, 0600, 0, 0);
743 if (r < 0) {
744 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
745 return r;
746 }
747
748 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
749 log_error("Bind mount for /proc/kmsg failed: %m");
750 return -errno;
751 }
752
753 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
754 if (fd < 0) {
755 log_error("Failed to open fifo: %m");
756 return -errno;
757 }
758
759 cmsg = CMSG_FIRSTHDR(&mh);
760 cmsg->cmsg_level = SOL_SOCKET;
761 cmsg->cmsg_type = SCM_RIGHTS;
762 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
763 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
764
765 mh.msg_controllen = cmsg->cmsg_len;
766
767 /* Store away the fd in the socket, so that it stays open as
768 * long as we run the child */
769 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
770 close_nointr_nofail(fd);
771
772 if (k < 0) {
773 log_error("Failed to send FIFO fd: %m");
774 return -errno;
775 }
776
777 /* And now make the FIFO unavailable as /dev/kmsg... */
778 unlink(from);
779 return 0;
780 }
781
782 static int setup_hostname(void) {
783
784 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
785 return -errno;
786
787 return 0;
788 }
789
790 static int setup_journal(const char *directory) {
791 sd_id128_t machine_id;
792 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
793 char *id;
794 int r;
795
796 if (arg_link_journal == LINK_NO)
797 return 0;
798
799 p = strappend(directory, "/etc/machine-id");
800 if (!p)
801 return log_oom();
802
803 r = read_one_line_file(p, &b);
804 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
805 return 0;
806 else if (r < 0) {
807 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
808 return r;
809 }
810
811 id = strstrip(b);
812 if (isempty(id) && arg_link_journal == LINK_AUTO)
813 return 0;
814
815 /* Verify validity */
816 r = sd_id128_from_string(id, &machine_id);
817 if (r < 0) {
818 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
819 return r;
820 }
821
822 free(p);
823 p = strappend("/var/log/journal/", id);
824 q = strjoin(directory, "/var/log/journal/", id, NULL);
825 if (!p || !q)
826 return log_oom();
827
828 if (path_is_mount_point(p, false) > 0) {
829 if (arg_link_journal != LINK_AUTO) {
830 log_error("%s: already a mount point, refusing to use for journal", p);
831 return -EEXIST;
832 }
833
834 return 0;
835 }
836
837 if (path_is_mount_point(q, false) > 0) {
838 if (arg_link_journal != LINK_AUTO) {
839 log_error("%s: already a mount point, refusing to use for journal", q);
840 return -EEXIST;
841 }
842
843 return 0;
844 }
845
846 r = readlink_and_make_absolute(p, &d);
847 if (r >= 0) {
848 if ((arg_link_journal == LINK_GUEST ||
849 arg_link_journal == LINK_AUTO) &&
850 path_equal(d, q)) {
851
852 r = mkdir_p(q, 0755);
853 if (r < 0)
854 log_warning("failed to create directory %s: %m", q);
855 return 0;
856 }
857
858 if (unlink(p) < 0) {
859 log_error("Failed to remove symlink %s: %m", p);
860 return -errno;
861 }
862 } else if (r == -EINVAL) {
863
864 if (arg_link_journal == LINK_GUEST &&
865 rmdir(p) < 0) {
866
867 if (errno == ENOTDIR) {
868 log_error("%s already exists and is neither a symlink nor a directory", p);
869 return r;
870 } else {
871 log_error("Failed to remove %s: %m", p);
872 return -errno;
873 }
874 }
875 } else if (r != -ENOENT) {
876 log_error("readlink(%s) failed: %m", p);
877 return r;
878 }
879
880 if (arg_link_journal == LINK_GUEST) {
881
882 if (symlink(q, p) < 0) {
883 log_error("Failed to symlink %s to %s: %m", q, p);
884 return -errno;
885 }
886
887 r = mkdir_p(q, 0755);
888 if (r < 0)
889 log_warning("failed to create directory %s: %m", q);
890 return 0;
891 }
892
893 if (arg_link_journal == LINK_HOST) {
894 r = mkdir_p(p, 0755);
895 if (r < 0) {
896 log_error("Failed to create %s: %m", p);
897 return r;
898 }
899
900 } else if (access(p, F_OK) < 0)
901 return 0;
902
903 if (dir_is_empty(q) == 0) {
904 log_error("%s not empty.", q);
905 return -ENOTEMPTY;
906 }
907
908 r = mkdir_p(q, 0755);
909 if (r < 0) {
910 log_error("Failed to create %s: %m", q);
911 return r;
912 }
913
914 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
915 log_error("Failed to bind mount journal from host into guest: %m");
916 return -errno;
917 }
918
919 return 0;
920 }
921
922 static int drop_capabilities(void) {
923 return capability_bounding_set_drop(~arg_retain, false);
924 }
925
926 static int register_machine(void) {
927 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
928 _cleanup_bus_unref_ sd_bus *bus = NULL;
929 int r;
930
931 r = sd_bus_open_system(&bus);
932 if (r < 0) {
933 log_error("Failed to open system bus: %s", strerror(-r));
934 return r;
935 }
936
937 r = sd_bus_call_method(
938 bus,
939 "org.freedesktop.machine1",
940 "/org/freedesktop/machine1",
941 "org.freedesktop.machine1.Manager",
942 "CreateMachine",
943 &error,
944 NULL,
945 "sayssusa(sv)",
946 arg_machine,
947 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
948 "nspawn",
949 "container",
950 (uint32_t) 0,
951 strempty(arg_directory),
952 !isempty(arg_slice), "Slice", "s", arg_slice);
953 if (r < 0) {
954 log_error("Failed to register machine: %s", bus_error_message(&error, r));
955 return r;
956 }
957
958 return 0;
959 }
960
961 static int terminate_machine(pid_t pid) {
962 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
963 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
964 _cleanup_bus_unref_ sd_bus *bus = NULL;
965 const char *path;
966 int r;
967
968 r = sd_bus_open_system(&bus);
969 if (r < 0) {
970 log_error("Failed to open system bus: %s", strerror(-r));
971 return r;
972 }
973
974 r = sd_bus_call_method(
975 bus,
976 "org.freedesktop.machine1",
977 "/org/freedesktop/machine1",
978 "org.freedesktop.machine1.Manager",
979 "GetMachineByPID",
980 &error,
981 &reply,
982 "u",
983 (uint32_t) pid);
984 if (r < 0) {
985 /* Note that the machine might already have been
986 * cleaned up automatically, hence don't consider it a
987 * failure if we cannot get the machine object. */
988 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
989 return 0;
990 }
991
992 r = sd_bus_message_read(reply, "o", &path);
993 if (r < 0)
994 return bus_log_parse_error(r);
995
996 r = sd_bus_call_method(
997 bus,
998 "org.freedesktop.machine1",
999 path,
1000 "org.freedesktop.machine1.Machine",
1001 "Terminate",
1002 &error,
1003 NULL,
1004 NULL);
1005 if (r < 0) {
1006 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1007 return 0;
1008 }
1009
1010 return 0;
1011 }
1012
/* Probes for the kernel audit subsystem by trying to open a NETLINK_AUDIT
 * socket. Returns true if the socket could be created; the probe socket is
 * closed again right away. */
static bool audit_enabled(void) {
        int probe_fd;

        probe_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
        if (probe_fd < 0)
                return false;

        close_nointr_nofail(probe_fd);
        return true;
}
1023
1024 int main(int argc, char *argv[]) {
1025 pid_t pid = 0;
1026 int r = EXIT_FAILURE, k;
1027 _cleanup_close_ int master = -1;
1028 int n_fd_passed;
1029 const char *console = NULL;
1030 sigset_t mask;
1031 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1032 _cleanup_fdset_free_ FDSet *fds = NULL;
1033
1034 log_parse_environment();
1035 log_open();
1036
1037 k = parse_argv(argc, argv);
1038 if (k < 0)
1039 goto finish;
1040 else if (k == 0) {
1041 r = EXIT_SUCCESS;
1042 goto finish;
1043 }
1044
1045 if (arg_directory) {
1046 char *p;
1047
1048 p = path_make_absolute_cwd(arg_directory);
1049 free(arg_directory);
1050 arg_directory = p;
1051 } else
1052 arg_directory = get_current_dir_name();
1053
1054 if (!arg_directory) {
1055 log_error("Failed to determine path, please use -D.");
1056 goto finish;
1057 }
1058
1059 path_kill_slashes(arg_directory);
1060
1061 if (!arg_machine) {
1062 arg_machine = strdup(path_get_file_name(arg_directory));
1063 if (!arg_machine) {
1064 log_oom();
1065 goto finish;
1066 }
1067
1068 hostname_cleanup(arg_machine, false);
1069 if (isempty(arg_machine)) {
1070 log_error("Failed to determine machine name automatically, please use -M.");
1071 goto finish;
1072 }
1073 }
1074
1075 if (geteuid() != 0) {
1076 log_error("Need to be root.");
1077 goto finish;
1078 }
1079
1080 if (sd_booted() <= 0) {
1081 log_error("Not running on a systemd system.");
1082 goto finish;
1083 }
1084
1085 if (arg_boot && audit_enabled()) {
1086 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1087 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1088 "line before using systemd-nspawn. Sleeping for 5s...\n");
1089 sleep(5);
1090 }
1091
1092 if (path_equal(arg_directory, "/")) {
1093 log_error("Spawning container on root directory not supported.");
1094 goto finish;
1095 }
1096
1097 if (path_is_os_tree(arg_directory) <= 0) {
1098 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1099 goto finish;
1100 }
1101
1102 log_close();
1103 n_fd_passed = sd_listen_fds(false);
1104 if (n_fd_passed > 0) {
1105 k = fdset_new_listen_fds(&fds, false);
1106 if (k < 0) {
1107 log_error("Failed to collect file descriptors: %s", strerror(-k));
1108 goto finish;
1109 }
1110 }
1111 fdset_close_others(fds);
1112 log_open();
1113
1114 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1115 if (master < 0) {
1116 log_error("Failed to acquire pseudo tty: %m");
1117 goto finish;
1118 }
1119
1120 console = ptsname(master);
1121 if (!console) {
1122 log_error("Failed to determine tty name: %m");
1123 goto finish;
1124 }
1125
1126 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1127
1128 if (unlockpt(master) < 0) {
1129 log_error("Failed to unlock tty: %m");
1130 goto finish;
1131 }
1132
1133 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1134 log_error("Failed to create kmsg socket pair.");
1135 goto finish;
1136 }
1137
1138 sd_notify(0, "READY=1");
1139
1140 assert_se(sigemptyset(&mask) == 0);
1141 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1142 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1143
1144 for (;;) {
1145 siginfo_t status;
1146
1147 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1148 if (pid < 0) {
1149 if (errno == EINVAL)
1150 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1151 else
1152 log_error("clone() failed: %m");
1153
1154 goto finish;
1155 }
1156
1157 if (pid == 0) {
1158 /* child */
1159 const char *home = NULL;
1160 uid_t uid = (uid_t) -1;
1161 gid_t gid = (gid_t) -1;
1162 unsigned n_env = 2;
1163 const char *envp[] = {
1164 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1165 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1166 NULL, /* TERM */
1167 NULL, /* HOME */
1168 NULL, /* USER */
1169 NULL, /* LOGNAME */
1170 NULL, /* container_uuid */
1171 NULL, /* LISTEN_FDS */
1172 NULL, /* LISTEN_PID */
1173 NULL
1174 };
1175
1176 envp[n_env] = strv_find_prefix(environ, "TERM=");
1177 if (envp[n_env])
1178 n_env ++;
1179
1180 close_nointr_nofail(master);
1181 master = -1;
1182
1183 close_nointr(STDIN_FILENO);
1184 close_nointr(STDOUT_FILENO);
1185 close_nointr(STDERR_FILENO);
1186
1187 close_nointr_nofail(kmsg_socket_pair[0]);
1188 kmsg_socket_pair[0] = -1;
1189
1190 reset_all_signal_handlers();
1191
1192 assert_se(sigemptyset(&mask) == 0);
1193 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1194
1195 k = open_terminal(console, O_RDWR);
1196 if (k != STDIN_FILENO) {
1197 if (k >= 0) {
1198 close_nointr_nofail(k);
1199 k = -EINVAL;
1200 }
1201
1202 log_error("Failed to open console: %s", strerror(-k));
1203 goto child_fail;
1204 }
1205
1206 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1207 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1208 log_error("Failed to duplicate console: %m");
1209 goto child_fail;
1210 }
1211
1212 if (setsid() < 0) {
1213 log_error("setsid() failed: %m");
1214 goto child_fail;
1215 }
1216
1217 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1218 log_error("PR_SET_PDEATHSIG failed: %m");
1219 goto child_fail;
1220 }
1221
1222 r = register_machine();
1223 if (r < 0)
1224 goto finish;
1225
1226 /* Mark everything as slave, so that we still
1227 * receive mounts from the real root, but don't
1228 * propagate mounts to the real root. */
1229 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1230 log_error("MS_SLAVE|MS_REC failed: %m");
1231 goto child_fail;
1232 }
1233
1234 /* Turn directory into bind mount */
1235 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1236 log_error("Failed to make bind mount.");
1237 goto child_fail;
1238 }
1239
1240 if (arg_read_only)
1241 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1242 log_error("Failed to make read-only.");
1243 goto child_fail;
1244 }
1245
1246 if (mount_all(arg_directory) < 0)
1247 goto child_fail;
1248
1249 if (copy_devnodes(arg_directory) < 0)
1250 goto child_fail;
1251
1252 if (setup_ptmx(arg_directory) < 0)
1253 goto child_fail;
1254
1255 dev_setup(arg_directory);
1256
1257 if (setup_dev_console(arg_directory, console) < 0)
1258 goto child_fail;
1259
1260 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1261 goto child_fail;
1262
1263 close_nointr_nofail(kmsg_socket_pair[1]);
1264 kmsg_socket_pair[1] = -1;
1265
1266 if (setup_boot_id(arg_directory) < 0)
1267 goto child_fail;
1268
1269 if (setup_timezone(arg_directory) < 0)
1270 goto child_fail;
1271
1272 if (setup_resolv_conf(arg_directory) < 0)
1273 goto child_fail;
1274
1275 if (setup_journal(arg_directory) < 0)
1276 goto child_fail;
1277
1278 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1279 goto child_fail;
1280
1281 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1282 goto child_fail;
1283
1284 if (chdir(arg_directory) < 0) {
1285 log_error("chdir(%s) failed: %m", arg_directory);
1286 goto child_fail;
1287 }
1288
1289 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1290 log_error("mount(MS_MOVE) failed: %m");
1291 goto child_fail;
1292 }
1293
1294 if (chroot(".") < 0) {
1295 log_error("chroot() failed: %m");
1296 goto child_fail;
1297 }
1298
1299 if (chdir("/") < 0) {
1300 log_error("chdir() failed: %m");
1301 goto child_fail;
1302 }
1303
1304 umask(0022);
1305
1306 loopback_setup();
1307
1308 if (drop_capabilities() < 0) {
1309 log_error("drop_capabilities() failed: %m");
1310 goto child_fail;
1311 }
1312
1313 if (arg_user) {
1314
1315 /* Note that this resolves user names
1316 * inside the container, and hence
1317 * accesses the NSS modules from the
1318 * container and not the host. This is
1319 * a bit weird... */
1320
1321 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1322 log_error("get_user_creds() failed: %m");
1323 goto child_fail;
1324 }
1325
1326 if (mkdir_parents_label(home, 0775) < 0) {
1327 log_error("mkdir_parents_label() failed: %m");
1328 goto child_fail;
1329 }
1330
1331 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1332 log_error("mkdir_safe_label() failed: %m");
1333 goto child_fail;
1334 }
1335
1336 if (initgroups((const char*)arg_user, gid) < 0) {
1337 log_error("initgroups() failed: %m");
1338 goto child_fail;
1339 }
1340
1341 if (setresgid(gid, gid, gid) < 0) {
1342 log_error("setregid() failed: %m");
1343 goto child_fail;
1344 }
1345
1346 if (setresuid(uid, uid, uid) < 0) {
1347 log_error("setreuid() failed: %m");
1348 goto child_fail;
1349 }
1350 } else {
1351 /* Reset everything fully to 0, just in case */
1352
1353 if (setgroups(0, NULL) < 0) {
1354 log_error("setgroups() failed: %m");
1355 goto child_fail;
1356 }
1357
1358 if (setresgid(0, 0, 0) < 0) {
1359 log_error("setregid() failed: %m");
1360 goto child_fail;
1361 }
1362
1363 if (setresuid(0, 0, 0) < 0) {
1364 log_error("setreuid() failed: %m");
1365 goto child_fail;
1366 }
1367 }
1368
1369 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1370 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1371 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1372 log_oom();
1373 goto child_fail;
1374 }
1375
1376 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1377 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1378 log_oom();
1379 goto child_fail;
1380 }
1381 }
1382
1383 if (fdset_size(fds) > 0) {
1384 k = fdset_cloexec(fds, false);
1385 if (k < 0) {
1386 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1387 goto child_fail;
1388 }
1389
1390 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1391 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1392 log_oom();
1393 goto child_fail;
1394 }
1395 }
1396
1397 setup_hostname();
1398
1399 if (arg_boot) {
1400 char **a;
1401 size_t l;
1402
1403 /* Automatically search for the init system */
1404
1405 l = 1 + argc - optind;
1406 a = newa(char*, l + 1);
1407 memcpy(a + 1, argv + optind, l * sizeof(char*));
1408
1409 a[0] = (char*) "/usr/lib/systemd/systemd";
1410 execve(a[0], a, (char**) envp);
1411
1412 a[0] = (char*) "/lib/systemd/systemd";
1413 execve(a[0], a, (char**) envp);
1414
1415 a[0] = (char*) "/sbin/init";
1416 execve(a[0], a, (char**) envp);
1417 } else if (argc > optind)
1418 execvpe(argv[optind], argv + optind, (char**) envp);
1419 else {
1420 chdir(home ? home : "/root");
1421 execle("/bin/bash", "-bash", NULL, (char**) envp);
1422 }
1423
1424 log_error("execv() failed: %m");
1425
1426 child_fail:
1427 _exit(EXIT_FAILURE);
1428 }
1429
1430 fdset_free(fds);
1431 fds = NULL;
1432
1433 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1434 if (k < 0) {
1435 r = EXIT_FAILURE;
1436 break;
1437 }
1438
1439 putc('\n', stdout);
1440
1441 /* Kill if it is not dead yet anyway */
1442 terminate_machine(pid);
1443
1444 /* Redundant, but better safe than sorry */
1445 kill(pid, SIGKILL);
1446
1447 k = wait_for_terminate(pid, &status);
1448 pid = 0;
1449
1450 if (k < 0) {
1451 r = EXIT_FAILURE;
1452 break;
1453 }
1454
1455 if (status.si_code == CLD_EXITED) {
1456 r = status.si_status;
1457 if (status.si_status != 0) {
1458 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1459 break;
1460 }
1461
1462 log_debug("Container %s exited successfully.", arg_machine);
1463 break;
1464 } else if (status.si_code == CLD_KILLED &&
1465 status.si_status == SIGINT) {
1466 log_info("Container %s has been shut down.", arg_machine);
1467 r = 0;
1468 break;
1469 } else if (status.si_code == CLD_KILLED &&
1470 status.si_status == SIGHUP) {
1471 log_info("Container %s is being rebooted.", arg_machine);
1472 continue;
1473 } else if (status.si_code == CLD_KILLED ||
1474 status.si_code == CLD_DUMPED) {
1475
1476 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1477 r = EXIT_FAILURE;
1478 break;
1479 } else {
1480 log_error("Container %s failed due to unknown reason.", arg_machine);
1481 r = EXIT_FAILURE;
1482 break;
1483 }
1484 }
1485
1486 finish:
1487 if (pid > 0)
1488 kill(pid, SIGKILL);
1489
1490 free(arg_directory);
1491 free(arg_machine);
1492
1493 return r;
1494 }