]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
clients: unify how we invoke getopt_long()
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43
44 #include "sd-daemon.h"
45 #include "sd-bus.h"
46 #include "sd-id128.h"
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "dev-setup.h"
58 #include "fdset.h"
59 #include "build.h"
60 #include "fileio.h"
61 #include "bus-util.h"
62 #include "bus-error.h"
63 #include "ptyfwd.h"
64
65 #ifndef TTY_GID
66 #define TTY_GID 5
67 #endif
68
/* Selects how the journal directory of the container is linked with the one of
 * the host. Chosen with --link-journal= (or -j) on the command line. */
typedef enum LinkJournal {
        LINK_NO = 0,    /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, also the built-in default */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest */
} LinkJournal;
75
76 static char *arg_directory = NULL;
77 static char *arg_user = NULL;
78 static sd_id128_t arg_uuid = {};
79 static char *arg_machine = NULL;
80 static const char *arg_slice = NULL;
81 static bool arg_private_network = false;
82 static bool arg_read_only = false;
83 static bool arg_boot = false;
84 static LinkJournal arg_link_journal = LINK_AUTO;
85 static uint64_t arg_retain =
86 (1ULL << CAP_CHOWN) |
87 (1ULL << CAP_DAC_OVERRIDE) |
88 (1ULL << CAP_DAC_READ_SEARCH) |
89 (1ULL << CAP_FOWNER) |
90 (1ULL << CAP_FSETID) |
91 (1ULL << CAP_IPC_OWNER) |
92 (1ULL << CAP_KILL) |
93 (1ULL << CAP_LEASE) |
94 (1ULL << CAP_LINUX_IMMUTABLE) |
95 (1ULL << CAP_NET_BIND_SERVICE) |
96 (1ULL << CAP_NET_BROADCAST) |
97 (1ULL << CAP_NET_RAW) |
98 (1ULL << CAP_SETGID) |
99 (1ULL << CAP_SETFCAP) |
100 (1ULL << CAP_SETPCAP) |
101 (1ULL << CAP_SETUID) |
102 (1ULL << CAP_SYS_ADMIN) |
103 (1ULL << CAP_SYS_CHROOT) |
104 (1ULL << CAP_SYS_NICE) |
105 (1ULL << CAP_SYS_PTRACE) |
106 (1ULL << CAP_SYS_TTY_CONFIG) |
107 (1ULL << CAP_SYS_RESOURCE) |
108 (1ULL << CAP_SYS_BOOT) |
109 (1ULL << CAP_AUDIT_WRITE) |
110 (1ULL << CAP_AUDIT_CONTROL);
111 static char **arg_bind = NULL;
112 static char **arg_bind_ro = NULL;
113
/* Prints the synopsis and the list of supported options to stdout.
 *
 * Always returns 0. */
static int help(void) {

        /* Note: -j selects LINK_GUEST in parse_argv(), hence it is documented as
         * the short form of --link-journal=guest. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " --version Print version string\n"
               " -D --directory=NAME Root directory for the container\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " --uuid=UUID Set a specific machine UUID for the container\n"
               " -M --machine=NAME Set the machine name for the container\n"
               " -S --slice=SLICE Place the container in the specified slice\n"
               " --private-network Disable network in container\n"
               " --read-only Mount the root directory read-only\n"
               " --capability=CAP In addition to the default, retain specified\n"
               " capability\n"
               " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
               " -j Equivalent to --link-journal=guest\n"
               " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
               " the container\n"
               " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
               program_invocation_short_name);

        return 0;
}
139
140 static int parse_argv(int argc, char *argv[]) {
141
142 enum {
143 ARG_VERSION = 0x100,
144 ARG_PRIVATE_NETWORK,
145 ARG_UUID,
146 ARG_READ_ONLY,
147 ARG_CAPABILITY,
148 ARG_LINK_JOURNAL,
149 ARG_BIND,
150 ARG_BIND_RO
151 };
152
153 static const struct option options[] = {
154 { "help", no_argument, NULL, 'h' },
155 { "version", no_argument, NULL, ARG_VERSION },
156 { "directory", required_argument, NULL, 'D' },
157 { "user", required_argument, NULL, 'u' },
158 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
159 { "boot", no_argument, NULL, 'b' },
160 { "uuid", required_argument, NULL, ARG_UUID },
161 { "read-only", no_argument, NULL, ARG_READ_ONLY },
162 { "capability", required_argument, NULL, ARG_CAPABILITY },
163 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
164 { "bind", required_argument, NULL, ARG_BIND },
165 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
166 { "machine", required_argument, NULL, 'M' },
167 { "slice", required_argument, NULL, 'S' },
168 {}
169 };
170
171 int c, r;
172
173 assert(argc >= 0);
174 assert(argv);
175
176 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
177
178 switch (c) {
179
180 case 'h':
181 return help();
182
183 case ARG_VERSION:
184 puts(PACKAGE_STRING);
185 puts(SYSTEMD_FEATURES);
186 return 0;
187
188 case 'D':
189 free(arg_directory);
190 arg_directory = canonicalize_file_name(optarg);
191 if (!arg_directory) {
192 log_error("Failed to canonicalize root directory.");
193 return -ENOMEM;
194 }
195
196 break;
197
198 case 'u':
199 free(arg_user);
200 arg_user = strdup(optarg);
201 if (!arg_user)
202 return log_oom();
203
204 break;
205
206 case ARG_PRIVATE_NETWORK:
207 arg_private_network = true;
208 break;
209
210 case 'b':
211 arg_boot = true;
212 break;
213
214 case ARG_UUID:
215 r = sd_id128_from_string(optarg, &arg_uuid);
216 if (r < 0) {
217 log_error("Invalid UUID: %s", optarg);
218 return r;
219 }
220 break;
221
222 case 'S':
223 arg_slice = strdup(optarg);
224 if (!arg_slice)
225 return log_oom();
226
227 break;
228
229 case 'M':
230 if (!hostname_is_valid(optarg)) {
231 log_error("Invalid machine name: %s", optarg);
232 return -EINVAL;
233 }
234
235 free(arg_machine);
236 arg_machine = strdup(optarg);
237 if (!arg_machine)
238 return log_oom();
239
240 break;
241
242 case ARG_READ_ONLY:
243 arg_read_only = true;
244 break;
245
246 case ARG_CAPABILITY: {
247 char *state, *word;
248 size_t length;
249
250 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
251 cap_value_t cap;
252 char *t;
253
254 t = strndup(word, length);
255 if (!t)
256 return log_oom();
257
258 if (cap_from_name(t, &cap) < 0) {
259 log_error("Failed to parse capability %s.", t);
260 free(t);
261 return -EINVAL;
262 }
263
264 free(t);
265 arg_retain |= 1ULL << (uint64_t) cap;
266 }
267
268 break;
269 }
270
271 case 'j':
272 arg_link_journal = LINK_GUEST;
273 break;
274
275 case ARG_LINK_JOURNAL:
276 if (streq(optarg, "auto"))
277 arg_link_journal = LINK_AUTO;
278 else if (streq(optarg, "no"))
279 arg_link_journal = LINK_NO;
280 else if (streq(optarg, "guest"))
281 arg_link_journal = LINK_GUEST;
282 else if (streq(optarg, "host"))
283 arg_link_journal = LINK_HOST;
284 else {
285 log_error("Failed to parse link journal mode %s", optarg);
286 return -EINVAL;
287 }
288
289 break;
290
291 case ARG_BIND:
292 case ARG_BIND_RO: {
293 _cleanup_free_ char *a = NULL, *b = NULL;
294 char *e;
295 char ***x;
296
297 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
298
299 e = strchr(optarg, ':');
300 if (e) {
301 a = strndup(optarg, e - optarg);
302 b = strdup(e + 1);
303 } else {
304 a = strdup(optarg);
305 b = strdup(optarg);
306 }
307
308 if (!a || !b)
309 return log_oom();
310
311 if (!path_is_absolute(a) || !path_is_absolute(b)) {
312 log_error("Invalid bind mount specification: %s", optarg);
313 return -EINVAL;
314 }
315
316 r = strv_extend(x, a);
317 if (r < 0)
318 return log_oom();
319
320 r = strv_extend(x, b);
321 if (r < 0)
322 return log_oom();
323
324 break;
325 }
326
327 case '?':
328 return -EINVAL;
329
330 default:
331 assert_not_reached("Unhandled option");
332 }
333 }
334
335 return 1;
336 }
337
338 static int mount_all(const char *dest) {
339
340 typedef struct MountPoint {
341 const char *what;
342 const char *where;
343 const char *type;
344 const char *options;
345 unsigned long flags;
346 bool fatal;
347 } MountPoint;
348
349 static const MountPoint mount_table[] = {
350 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
351 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
352 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
353 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
354 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
355 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
356 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
357 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
358 #ifdef HAVE_SELINUX
359 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
360 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
361 #endif
362 };
363
364 unsigned k;
365 int r = 0;
366
367 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
368 _cleanup_free_ char *where = NULL;
369 int t;
370
371 where = strjoin(dest, "/", mount_table[k].where, NULL);
372 if (!where)
373 return log_oom();
374
375 t = path_is_mount_point(where, true);
376 if (t < 0) {
377 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
378
379 if (r == 0)
380 r = t;
381
382 continue;
383 }
384
385 /* Skip this entry if it is not a remount. */
386 if (mount_table[k].what && t > 0)
387 continue;
388
389 mkdir_p(where, 0755);
390
391 if (mount(mount_table[k].what,
392 where,
393 mount_table[k].type,
394 mount_table[k].flags,
395 mount_table[k].options) < 0 &&
396 mount_table[k].fatal) {
397
398 log_error("mount(%s) failed: %m", where);
399
400 if (r == 0)
401 r = -errno;
402 }
403 }
404
405 return r;
406 }
407
408 static int mount_binds(const char *dest, char **l, unsigned long flags) {
409 char **x, **y;
410
411 STRV_FOREACH_PAIR(x, y, l) {
412 _cleanup_free_ char *where = NULL;
413 struct stat source_st, dest_st;
414
415 if (stat(*x, &source_st) < 0) {
416 log_error("failed to stat %s: %m", *x);
417 return -errno;
418 }
419
420 where = strjoin(dest, "/", *y, NULL);
421 if (!where)
422 return log_oom();
423
424 if (stat(where, &dest_st) == 0) {
425 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
426 log_error("The file types of %s and %s do not match. Refusing bind mount",
427 *x, where);
428 return -EINVAL;
429 }
430 } else {
431 /* Create the mount point, but be conservative -- refuse to create block
432 * and char devices. */
433 if (S_ISDIR(source_st.st_mode))
434 mkdir_p_label(where, 0755);
435 else if (S_ISFIFO(source_st.st_mode))
436 mkfifo(where, 0644);
437 else if (S_ISSOCK(source_st.st_mode))
438 mknod(where, 0644 | S_IFSOCK, 0);
439 else if (S_ISREG(source_st.st_mode))
440 touch(where);
441 else {
442 log_error("Refusing to create mountpoint for file: %s", *x);
443 return -ENOTSUP;
444 }
445 }
446
447 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
448 log_error("mount(%s) failed: %m", where);
449 return -errno;
450 }
451
452 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
453 log_error("mount(%s) failed: %m", where);
454 return -errno;
455 }
456 }
457
458 return 0;
459 }
460
461 static int setup_timezone(const char *dest) {
462 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
463 char *z, *y;
464 int r;
465
466 assert(dest);
467
468 /* Fix the timezone, if possible */
469 r = readlink_malloc("/etc/localtime", &p);
470 if (r < 0) {
471 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
472 return 0;
473 }
474
475 z = path_startswith(p, "../usr/share/zoneinfo/");
476 if (!z)
477 z = path_startswith(p, "/usr/share/zoneinfo/");
478 if (!z) {
479 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
480 return 0;
481 }
482
483 where = strappend(dest, "/etc/localtime");
484 if (!where)
485 return log_oom();
486
487 r = readlink_malloc(where, &q);
488 if (r >= 0) {
489 y = path_startswith(q, "../usr/share/zoneinfo/");
490 if (!y)
491 y = path_startswith(q, "/usr/share/zoneinfo/");
492
493
494 /* Already pointing to the right place? Then do nothing .. */
495 if (y && streq(y, z))
496 return 0;
497 }
498
499 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
500 if (!check)
501 return log_oom();
502
503 if (access(check, F_OK) < 0) {
504 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
505 return 0;
506 }
507
508 what = strappend("../usr/share/zoneinfo/", z);
509 if (!what)
510 return log_oom();
511
512 unlink(where);
513 if (symlink(what, where) < 0) {
514 log_error("Failed to correct timezone of container: %m");
515 return 0;
516 }
517
518 return 0;
519 }
520
521 static int setup_resolv_conf(const char *dest) {
522 char _cleanup_free_ *where = NULL;
523
524 assert(dest);
525
526 if (arg_private_network)
527 return 0;
528
529 /* Fix resolv.conf, if possible */
530 where = strappend(dest, "/etc/resolv.conf");
531 if (!where)
532 return log_oom();
533
534 /* We don't really care for the results of this really. If it
535 * fails, it fails, but meh... */
536 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
537
538 return 0;
539 }
540
541 static int setup_boot_id(const char *dest) {
542 _cleanup_free_ char *from = NULL, *to = NULL;
543 sd_id128_t rnd;
544 char as_uuid[37];
545 int r;
546
547 assert(dest);
548
549 /* Generate a new randomized boot ID, so that each boot-up of
550 * the container gets a new one */
551
552 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
553 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
554 if (!from || !to)
555 return log_oom();
556
557 r = sd_id128_randomize(&rnd);
558 if (r < 0) {
559 log_error("Failed to generate random boot id: %s", strerror(-r));
560 return r;
561 }
562
563 snprintf(as_uuid, sizeof(as_uuid),
564 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
565 SD_ID128_FORMAT_VAL(rnd));
566 char_array_0(as_uuid);
567
568 r = write_string_file(from, as_uuid);
569 if (r < 0) {
570 log_error("Failed to write boot id: %s", strerror(-r));
571 return r;
572 }
573
574 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
575 log_error("Failed to bind mount boot id: %m");
576 r = -errno;
577 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
578 log_warning("Failed to make boot id read-only: %m");
579
580 unlink(from);
581 return r;
582 }
583
584 static int copy_devnodes(const char *dest) {
585
586 static const char devnodes[] =
587 "null\0"
588 "zero\0"
589 "full\0"
590 "random\0"
591 "urandom\0"
592 "tty\0";
593
594 const char *d;
595 int r = 0;
596 _cleanup_umask_ mode_t u;
597
598 assert(dest);
599
600 u = umask(0000);
601
602 NULSTR_FOREACH(d, devnodes) {
603 struct stat st;
604 _cleanup_free_ char *from = NULL, *to = NULL;
605
606 asprintf(&from, "/dev/%s", d);
607 asprintf(&to, "%s/dev/%s", dest, d);
608
609 if (!from || !to) {
610 log_oom();
611
612 if (r == 0)
613 r = -ENOMEM;
614
615 break;
616 }
617
618 if (stat(from, &st) < 0) {
619
620 if (errno != ENOENT) {
621 log_error("Failed to stat %s: %m", from);
622 if (r == 0)
623 r = -errno;
624 }
625
626 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
627
628 log_error("%s is not a char or block device, cannot copy", from);
629 if (r == 0)
630 r = -EIO;
631
632 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
633
634 log_error("mknod(%s) failed: %m", dest);
635 if (r == 0)
636 r = -errno;
637 }
638 }
639
640 return r;
641 }
642
643 static int setup_ptmx(const char *dest) {
644 _cleanup_free_ char *p = NULL;
645
646 p = strappend(dest, "/dev/ptmx");
647 if (!p)
648 return log_oom();
649
650 if (symlink("pts/ptmx", p) < 0) {
651 log_error("Failed to create /dev/ptmx symlink: %m");
652 return -errno;
653 }
654
655 return 0;
656 }
657
658 static int setup_dev_console(const char *dest, const char *console) {
659 struct stat st;
660 _cleanup_free_ char *to = NULL;
661 int r;
662 _cleanup_umask_ mode_t u;
663
664 assert(dest);
665 assert(console);
666
667 u = umask(0000);
668
669 if (stat(console, &st) < 0) {
670 log_error("Failed to stat %s: %m", console);
671 return -errno;
672
673 } else if (!S_ISCHR(st.st_mode)) {
674 log_error("/dev/console is not a char device");
675 return -EIO;
676 }
677
678 r = chmod_and_chown(console, 0600, 0, 0);
679 if (r < 0) {
680 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
681 return r;
682 }
683
684 if (asprintf(&to, "%s/dev/console", dest) < 0)
685 return log_oom();
686
687 /* We need to bind mount the right tty to /dev/console since
688 * ptys can only exist on pts file systems. To have something
689 * to bind mount things on we create a device node first, that
690 * has the right major/minor (note that the major minor
691 * doesn't actually matter here, since we mount it over
692 * anyway). */
693
694 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
695 log_error("mknod() for /dev/console failed: %m");
696 return -errno;
697 }
698
699 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
700 log_error("Bind mount for /dev/console failed: %m");
701 return -errno;
702 }
703
704 return 0;
705 }
706
707 static int setup_kmsg(const char *dest, int kmsg_socket) {
708 _cleanup_free_ char *from = NULL, *to = NULL;
709 int r, fd, k;
710 _cleanup_umask_ mode_t u;
711 union {
712 struct cmsghdr cmsghdr;
713 uint8_t buf[CMSG_SPACE(sizeof(int))];
714 } control = {};
715 struct msghdr mh = {
716 .msg_control = &control,
717 .msg_controllen = sizeof(control),
718 };
719 struct cmsghdr *cmsg;
720
721 assert(dest);
722 assert(kmsg_socket >= 0);
723
724 u = umask(0000);
725
726 /* We create the kmsg FIFO as /dev/kmsg, but immediately
727 * delete it after bind mounting it to /proc/kmsg. While FIFOs
728 * on the reading side behave very similar to /proc/kmsg,
729 * their writing side behaves differently from /dev/kmsg in
730 * that writing blocks when nothing is reading. In order to
731 * avoid any problems with containers deadlocking due to this
732 * we simply make /dev/kmsg unavailable to the container. */
733 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
734 asprintf(&to, "%s/proc/kmsg", dest) < 0)
735 return log_oom();
736
737 if (mkfifo(from, 0600) < 0) {
738 log_error("mkfifo() for /dev/kmsg failed: %m");
739 return -errno;
740 }
741
742 r = chmod_and_chown(from, 0600, 0, 0);
743 if (r < 0) {
744 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
745 return r;
746 }
747
748 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
749 log_error("Bind mount for /proc/kmsg failed: %m");
750 return -errno;
751 }
752
753 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
754 if (fd < 0) {
755 log_error("Failed to open fifo: %m");
756 return -errno;
757 }
758
759 cmsg = CMSG_FIRSTHDR(&mh);
760 cmsg->cmsg_level = SOL_SOCKET;
761 cmsg->cmsg_type = SCM_RIGHTS;
762 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
763 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
764
765 mh.msg_controllen = cmsg->cmsg_len;
766
767 /* Store away the fd in the socket, so that it stays open as
768 * long as we run the child */
769 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
770 close_nointr_nofail(fd);
771
772 if (k < 0) {
773 log_error("Failed to send FIFO fd: %m");
774 return -errno;
775 }
776
777 /* And now make the FIFO unavailable as /dev/kmsg... */
778 unlink(from);
779 return 0;
780 }
781
782 static int setup_hostname(void) {
783
784 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
785 return -errno;
786
787 return 0;
788 }
789
790 static int setup_journal(const char *directory) {
791 sd_id128_t machine_id;
792 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
793 char *id;
794 int r;
795
796 if (arg_link_journal == LINK_NO)
797 return 0;
798
799 p = strappend(directory, "/etc/machine-id");
800 if (!p)
801 return log_oom();
802
803 r = read_one_line_file(p, &b);
804 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
805 return 0;
806 else if (r < 0) {
807 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
808 return r;
809 }
810
811 id = strstrip(b);
812 if (isempty(id) && arg_link_journal == LINK_AUTO)
813 return 0;
814
815 /* Verify validity */
816 r = sd_id128_from_string(id, &machine_id);
817 if (r < 0) {
818 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
819 return r;
820 }
821
822 free(p);
823 p = strappend("/var/log/journal/", id);
824 q = strjoin(directory, "/var/log/journal/", id, NULL);
825 if (!p || !q)
826 return log_oom();
827
828 if (path_is_mount_point(p, false) > 0) {
829 if (arg_link_journal != LINK_AUTO) {
830 log_error("%s: already a mount point, refusing to use for journal", p);
831 return -EEXIST;
832 }
833
834 return 0;
835 }
836
837 if (path_is_mount_point(q, false) > 0) {
838 if (arg_link_journal != LINK_AUTO) {
839 log_error("%s: already a mount point, refusing to use for journal", q);
840 return -EEXIST;
841 }
842
843 return 0;
844 }
845
846 r = readlink_and_make_absolute(p, &d);
847 if (r >= 0) {
848 if ((arg_link_journal == LINK_GUEST ||
849 arg_link_journal == LINK_AUTO) &&
850 path_equal(d, q)) {
851
852 r = mkdir_p(q, 0755);
853 if (r < 0)
854 log_warning("failed to create directory %s: %m", q);
855 return 0;
856 }
857
858 if (unlink(p) < 0) {
859 log_error("Failed to remove symlink %s: %m", p);
860 return -errno;
861 }
862 } else if (r == -EINVAL) {
863
864 if (arg_link_journal == LINK_GUEST &&
865 rmdir(p) < 0) {
866
867 if (errno == ENOTDIR) {
868 log_error("%s already exists and is neither a symlink nor a directory", p);
869 return r;
870 } else {
871 log_error("Failed to remove %s: %m", p);
872 return -errno;
873 }
874 }
875 } else if (r != -ENOENT) {
876 log_error("readlink(%s) failed: %m", p);
877 return r;
878 }
879
880 if (arg_link_journal == LINK_GUEST) {
881
882 if (symlink(q, p) < 0) {
883 log_error("Failed to symlink %s to %s: %m", q, p);
884 return -errno;
885 }
886
887 r = mkdir_p(q, 0755);
888 if (r < 0)
889 log_warning("failed to create directory %s: %m", q);
890 return 0;
891 }
892
893 if (arg_link_journal == LINK_HOST) {
894 r = mkdir_p(p, 0755);
895 if (r < 0) {
896 log_error("Failed to create %s: %m", p);
897 return r;
898 }
899
900 } else if (access(p, F_OK) < 0)
901 return 0;
902
903 if (dir_is_empty(q) == 0) {
904 log_error("%s not empty.", q);
905 return -ENOTEMPTY;
906 }
907
908 r = mkdir_p(q, 0755);
909 if (r < 0) {
910 log_error("Failed to create %s: %m", q);
911 return r;
912 }
913
914 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
915 log_error("Failed to bind mount journal from host into guest: %m");
916 return -errno;
917 }
918
919 return 0;
920 }
921
922 static int drop_capabilities(void) {
923 return capability_bounding_set_drop(~arg_retain, false);
924 }
925
926 static int register_machine(void) {
927 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
928 _cleanup_bus_unref_ sd_bus *bus = NULL;
929 int r;
930
931 r = sd_bus_open_system(&bus);
932 if (r < 0) {
933 log_error("Failed to open system bus: %s", strerror(-r));
934 return r;
935 }
936
937 r = sd_bus_call_method(
938 bus,
939 "org.freedesktop.machine1",
940 "/org/freedesktop/machine1",
941 "org.freedesktop.machine1.Manager",
942 "CreateMachine",
943 &error,
944 NULL,
945 "sayssusa(sv)",
946 arg_machine,
947 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
948 "nspawn",
949 "container",
950 (uint32_t) 0,
951 strempty(arg_directory),
952 !isempty(arg_slice), "Slice", "s", arg_slice);
953 if (r < 0) {
954 log_error("Failed to register machine: %s", bus_error_message(&error, r));
955 return r;
956 }
957
958 return 0;
959 }
960
961 static int terminate_machine(pid_t pid) {
962 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
963 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
964 _cleanup_bus_unref_ sd_bus *bus = NULL;
965 const char *path;
966 int r;
967
968 r = sd_bus_open_system(&bus);
969 if (r < 0) {
970 log_error("Failed to open system bus: %s", strerror(-r));
971 return r;
972 }
973
974 r = sd_bus_call_method(
975 bus,
976 "org.freedesktop.machine1",
977 "/org/freedesktop/machine1",
978 "org.freedesktop.machine1.Manager",
979 "GetMachineByPID",
980 &error,
981 &reply,
982 "u",
983 (uint32_t) pid);
984 if (r < 0) {
985 /* Note that the machine might already have been
986 * cleaned up automatically, hence don't consider it a
987 * failure if we cannot get the machine object. */
988 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
989 return 0;
990 }
991
992 r = sd_bus_message_read(reply, "o", &path);
993 if (r < 0) {
994 log_error("Failed to parse GetMachineByPID() reply: %s", bus_error_message(&error, r));
995 return r;
996 }
997
998 r = sd_bus_call_method(
999 bus,
1000 "org.freedesktop.machine1",
1001 path,
1002 "org.freedesktop.machine1.Machine",
1003 "Terminate",
1004 &error,
1005 NULL,
1006 NULL);
1007 if (r < 0) {
1008 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1009 return 0;
1010 }
1011
1012 return 0;
1013 }
1014
/* Returns true if a NETLINK_AUDIT socket can be created, false otherwise. The
 * socket is only a probe and is closed again right away. */
static bool audit_enabled(void) {
        int probe;

        probe = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
        if (probe < 0)
                return false;

        close_nointr_nofail(probe);
        return true;
}
1025
1026 int main(int argc, char *argv[]) {
1027 pid_t pid = 0;
1028 int r = EXIT_FAILURE, k;
1029 _cleanup_close_ int master = -1;
1030 int n_fd_passed;
1031 const char *console = NULL;
1032 sigset_t mask;
1033 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1034 _cleanup_fdset_free_ FDSet *fds = NULL;
1035
1036 log_parse_environment();
1037 log_open();
1038
1039 k = parse_argv(argc, argv);
1040 if (k < 0)
1041 goto finish;
1042 else if (k == 0) {
1043 r = EXIT_SUCCESS;
1044 goto finish;
1045 }
1046
1047 if (arg_directory) {
1048 char *p;
1049
1050 p = path_make_absolute_cwd(arg_directory);
1051 free(arg_directory);
1052 arg_directory = p;
1053 } else
1054 arg_directory = get_current_dir_name();
1055
1056 if (!arg_directory) {
1057 log_error("Failed to determine path, please use -D.");
1058 goto finish;
1059 }
1060
1061 path_kill_slashes(arg_directory);
1062
1063 if (!arg_machine) {
1064 arg_machine = strdup(path_get_file_name(arg_directory));
1065 if (!arg_machine) {
1066 log_oom();
1067 goto finish;
1068 }
1069
1070 hostname_cleanup(arg_machine, false);
1071 if (isempty(arg_machine)) {
1072 log_error("Failed to determine machine name automatically, please use -M.");
1073 goto finish;
1074 }
1075 }
1076
1077 if (geteuid() != 0) {
1078 log_error("Need to be root.");
1079 goto finish;
1080 }
1081
1082 if (sd_booted() <= 0) {
1083 log_error("Not running on a systemd system.");
1084 goto finish;
1085 }
1086
1087 if (arg_boot && audit_enabled()) {
1088 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1089 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1090 "line before using systemd-nspawn. Sleeping for 5s...\n");
1091 sleep(5);
1092 }
1093
1094 if (path_equal(arg_directory, "/")) {
1095 log_error("Spawning container on root directory not supported.");
1096 goto finish;
1097 }
1098
1099 if (path_is_os_tree(arg_directory) <= 0) {
1100 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1101 goto finish;
1102 }
1103
1104 log_close();
1105 n_fd_passed = sd_listen_fds(false);
1106 if (n_fd_passed > 0) {
1107 k = fdset_new_listen_fds(&fds, false);
1108 if (k < 0) {
1109 log_error("Failed to collect file descriptors: %s", strerror(-k));
1110 goto finish;
1111 }
1112 }
1113 fdset_close_others(fds);
1114 log_open();
1115
1116 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1117 if (master < 0) {
1118 log_error("Failed to acquire pseudo tty: %m");
1119 goto finish;
1120 }
1121
1122 console = ptsname(master);
1123 if (!console) {
1124 log_error("Failed to determine tty name: %m");
1125 goto finish;
1126 }
1127
1128 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1129
1130 if (unlockpt(master) < 0) {
1131 log_error("Failed to unlock tty: %m");
1132 goto finish;
1133 }
1134
1135 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1136 log_error("Failed to create kmsg socket pair.");
1137 goto finish;
1138 }
1139
1140 sd_notify(0, "READY=1");
1141
1142 assert_se(sigemptyset(&mask) == 0);
1143 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1144 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1145
1146 for (;;) {
1147 siginfo_t status;
1148
1149 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1150 if (pid < 0) {
1151 if (errno == EINVAL)
1152 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1153 else
1154 log_error("clone() failed: %m");
1155
1156 goto finish;
1157 }
1158
1159 if (pid == 0) {
1160 /* child */
1161 const char *home = NULL;
1162 uid_t uid = (uid_t) -1;
1163 gid_t gid = (gid_t) -1;
1164 unsigned n_env = 2;
1165 const char *envp[] = {
1166 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1167 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1168 NULL, /* TERM */
1169 NULL, /* HOME */
1170 NULL, /* USER */
1171 NULL, /* LOGNAME */
1172 NULL, /* container_uuid */
1173 NULL, /* LISTEN_FDS */
1174 NULL, /* LISTEN_PID */
1175 NULL
1176 };
1177
1178 envp[n_env] = strv_find_prefix(environ, "TERM=");
1179 if (envp[n_env])
1180 n_env ++;
1181
1182 close_nointr_nofail(master);
1183 master = -1;
1184
1185 close_nointr(STDIN_FILENO);
1186 close_nointr(STDOUT_FILENO);
1187 close_nointr(STDERR_FILENO);
1188
1189 close_nointr_nofail(kmsg_socket_pair[0]);
1190 kmsg_socket_pair[0] = -1;
1191
1192 reset_all_signal_handlers();
1193
1194 assert_se(sigemptyset(&mask) == 0);
1195 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1196
1197 k = open_terminal(console, O_RDWR);
1198 if (k != STDIN_FILENO) {
1199 if (k >= 0) {
1200 close_nointr_nofail(k);
1201 k = -EINVAL;
1202 }
1203
1204 log_error("Failed to open console: %s", strerror(-k));
1205 goto child_fail;
1206 }
1207
1208 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1209 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1210 log_error("Failed to duplicate console: %m");
1211 goto child_fail;
1212 }
1213
1214 if (setsid() < 0) {
1215 log_error("setsid() failed: %m");
1216 goto child_fail;
1217 }
1218
1219 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1220 log_error("PR_SET_PDEATHSIG failed: %m");
1221 goto child_fail;
1222 }
1223
1224 r = register_machine();
1225 if (r < 0)
1226 goto finish;
1227
1228 /* Mark everything as slave, so that we still
1229 * receive mounts from the real root, but don't
1230 * propagate mounts to the real root. */
1231 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1232 log_error("MS_SLAVE|MS_REC failed: %m");
1233 goto child_fail;
1234 }
1235
1236 /* Turn directory into bind mount */
1237 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1238 log_error("Failed to make bind mount.");
1239 goto child_fail;
1240 }
1241
1242 if (arg_read_only)
1243 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1244 log_error("Failed to make read-only.");
1245 goto child_fail;
1246 }
1247
1248 if (mount_all(arg_directory) < 0)
1249 goto child_fail;
1250
1251 if (copy_devnodes(arg_directory) < 0)
1252 goto child_fail;
1253
1254 if (setup_ptmx(arg_directory) < 0)
1255 goto child_fail;
1256
1257 dev_setup(arg_directory);
1258
1259 if (setup_dev_console(arg_directory, console) < 0)
1260 goto child_fail;
1261
1262 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1263 goto child_fail;
1264
1265 close_nointr_nofail(kmsg_socket_pair[1]);
1266 kmsg_socket_pair[1] = -1;
1267
1268 if (setup_boot_id(arg_directory) < 0)
1269 goto child_fail;
1270
1271 if (setup_timezone(arg_directory) < 0)
1272 goto child_fail;
1273
1274 if (setup_resolv_conf(arg_directory) < 0)
1275 goto child_fail;
1276
1277 if (setup_journal(arg_directory) < 0)
1278 goto child_fail;
1279
1280 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1281 goto child_fail;
1282
1283 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1284 goto child_fail;
1285
1286 if (chdir(arg_directory) < 0) {
1287 log_error("chdir(%s) failed: %m", arg_directory);
1288 goto child_fail;
1289 }
1290
1291 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1292 log_error("mount(MS_MOVE) failed: %m");
1293 goto child_fail;
1294 }
1295
1296 if (chroot(".") < 0) {
1297 log_error("chroot() failed: %m");
1298 goto child_fail;
1299 }
1300
1301 if (chdir("/") < 0) {
1302 log_error("chdir() failed: %m");
1303 goto child_fail;
1304 }
1305
1306 umask(0022);
1307
1308 loopback_setup();
1309
1310 if (drop_capabilities() < 0) {
1311 log_error("drop_capabilities() failed: %m");
1312 goto child_fail;
1313 }
1314
1315 if (arg_user) {
1316
1317 /* Note that this resolves user names
1318 * inside the container, and hence
1319 * accesses the NSS modules from the
1320 * container and not the host. This is
1321 * a bit weird... */
1322
1323 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1324 log_error("get_user_creds() failed: %m");
1325 goto child_fail;
1326 }
1327
1328 if (mkdir_parents_label(home, 0775) < 0) {
1329 log_error("mkdir_parents_label() failed: %m");
1330 goto child_fail;
1331 }
1332
1333 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1334 log_error("mkdir_safe_label() failed: %m");
1335 goto child_fail;
1336 }
1337
1338 if (initgroups((const char*)arg_user, gid) < 0) {
1339 log_error("initgroups() failed: %m");
1340 goto child_fail;
1341 }
1342
1343 if (setresgid(gid, gid, gid) < 0) {
1344 log_error("setregid() failed: %m");
1345 goto child_fail;
1346 }
1347
1348 if (setresuid(uid, uid, uid) < 0) {
1349 log_error("setreuid() failed: %m");
1350 goto child_fail;
1351 }
1352 } else {
1353 /* Reset everything fully to 0, just in case */
1354
1355 if (setgroups(0, NULL) < 0) {
1356 log_error("setgroups() failed: %m");
1357 goto child_fail;
1358 }
1359
1360 if (setresgid(0, 0, 0) < 0) {
1361 log_error("setregid() failed: %m");
1362 goto child_fail;
1363 }
1364
1365 if (setresuid(0, 0, 0) < 0) {
1366 log_error("setreuid() failed: %m");
1367 goto child_fail;
1368 }
1369 }
1370
1371 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1372 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1373 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1374 log_oom();
1375 goto child_fail;
1376 }
1377
1378 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1379 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1380 log_oom();
1381 goto child_fail;
1382 }
1383 }
1384
1385 if (fdset_size(fds) > 0) {
1386 k = fdset_cloexec(fds, false);
1387 if (k < 0) {
1388 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1389 goto child_fail;
1390 }
1391
1392 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1393 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1394 log_oom();
1395 goto child_fail;
1396 }
1397 }
1398
1399 setup_hostname();
1400
1401 if (arg_boot) {
1402 char **a;
1403 size_t l;
1404
1405 /* Automatically search for the init system */
1406
1407 l = 1 + argc - optind;
1408 a = newa(char*, l + 1);
1409 memcpy(a + 1, argv + optind, l * sizeof(char*));
1410
1411 a[0] = (char*) "/usr/lib/systemd/systemd";
1412 execve(a[0], a, (char**) envp);
1413
1414 a[0] = (char*) "/lib/systemd/systemd";
1415 execve(a[0], a, (char**) envp);
1416
1417 a[0] = (char*) "/sbin/init";
1418 execve(a[0], a, (char**) envp);
1419 } else if (argc > optind)
1420 execvpe(argv[optind], argv + optind, (char**) envp);
1421 else {
1422 chdir(home ? home : "/root");
1423 execle("/bin/bash", "-bash", NULL, (char**) envp);
1424 }
1425
1426 log_error("execv() failed: %m");
1427
1428 child_fail:
1429 _exit(EXIT_FAILURE);
1430 }
1431
1432 fdset_free(fds);
1433 fds = NULL;
1434
1435 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1436 if (k < 0) {
1437 r = EXIT_FAILURE;
1438 break;
1439 }
1440
1441 putc('\n', stdout);
1442
1443 /* Kill if it is not dead yet anyway */
1444 terminate_machine(pid);
1445
1446 /* Redundant, but better safe than sorry */
1447 kill(pid, SIGKILL);
1448
1449 k = wait_for_terminate(pid, &status);
1450 pid = 0;
1451
1452 if (k < 0) {
1453 r = EXIT_FAILURE;
1454 break;
1455 }
1456
1457 if (status.si_code == CLD_EXITED) {
1458 r = status.si_status;
1459 if (status.si_status != 0) {
1460 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1461 break;
1462 }
1463
1464 log_debug("Container %s exited successfully.", arg_machine);
1465 break;
1466 } else if (status.si_code == CLD_KILLED &&
1467 status.si_status == SIGINT) {
1468 log_info("Container %s has been shut down.", arg_machine);
1469 r = 0;
1470 break;
1471 } else if (status.si_code == CLD_KILLED &&
1472 status.si_status == SIGHUP) {
1473 log_info("Container %s is being rebooted.", arg_machine);
1474 continue;
1475 } else if (status.si_code == CLD_KILLED ||
1476 status.si_code == CLD_DUMPED) {
1477
1478 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1479 r = EXIT_FAILURE;
1480 break;
1481 } else {
1482 log_error("Container %s failed due to unknown reason.", arg_machine);
1483 r = EXIT_FAILURE;
1484 break;
1485 }
1486 }
1487
1488 finish:
1489 if (pid > 0)
1490 kill(pid, SIGKILL);
1491
1492 free(arg_directory);
1493 free(arg_machine);
1494
1495 return r;
1496 }