git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: add new --network-interface= switch to move an existing interface into the...
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <linux/rtnetlink.h>
44 #include <sys/eventfd.h>
45 #include <net/if.h>
46
47 #ifdef HAVE_SELINUX
48 #include <selinux/selinux.h>
49 #endif
50
51 #include "sd-daemon.h"
52 #include "sd-bus.h"
53 #include "sd-id128.h"
54 #include "sd-rtnl.h"
55 #include "log.h"
56 #include "util.h"
57 #include "mkdir.h"
58 #include "macro.h"
59 #include "audit.h"
60 #include "missing.h"
61 #include "cgroup-util.h"
62 #include "strv.h"
63 #include "path-util.h"
64 #include "loopback-setup.h"
65 #include "dev-setup.h"
66 #include "fdset.h"
67 #include "build.h"
68 #include "fileio.h"
69 #include "bus-util.h"
70 #include "bus-error.h"
71 #include "ptyfwd.h"
72 #include "bus-kernel.h"
73 #include "env-util.h"
74 #include "def.h"
75 #include "rtnl-util.h"
76
/* How the guest's journal directory is connected with the host's; selected
 * with --link-journal= (or -j) and acted upon by setup_journal(). */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto, the default */
        LINK_HOST  = 2,  /* --link-journal=host */
        LINK_GUEST = 3   /* --link-journal=guest */
} LinkJournal;
83
84 static char *arg_directory = NULL;
85 static char *arg_user = NULL;
86 static sd_id128_t arg_uuid = {};
87 static char *arg_machine = NULL;
88 static char *arg_selinux_context = NULL;
89 static char *arg_selinux_apifs_context = NULL;
90 static const char *arg_slice = NULL;
91 static bool arg_private_network = false;
92 static bool arg_read_only = false;
93 static bool arg_boot = false;
94 static LinkJournal arg_link_journal = LINK_AUTO;
95 static uint64_t arg_retain =
96 (1ULL << CAP_CHOWN) |
97 (1ULL << CAP_DAC_OVERRIDE) |
98 (1ULL << CAP_DAC_READ_SEARCH) |
99 (1ULL << CAP_FOWNER) |
100 (1ULL << CAP_FSETID) |
101 (1ULL << CAP_IPC_OWNER) |
102 (1ULL << CAP_KILL) |
103 (1ULL << CAP_LEASE) |
104 (1ULL << CAP_LINUX_IMMUTABLE) |
105 (1ULL << CAP_NET_BIND_SERVICE) |
106 (1ULL << CAP_NET_BROADCAST) |
107 (1ULL << CAP_NET_RAW) |
108 (1ULL << CAP_SETGID) |
109 (1ULL << CAP_SETFCAP) |
110 (1ULL << CAP_SETPCAP) |
111 (1ULL << CAP_SETUID) |
112 (1ULL << CAP_SYS_ADMIN) |
113 (1ULL << CAP_SYS_CHROOT) |
114 (1ULL << CAP_SYS_NICE) |
115 (1ULL << CAP_SYS_PTRACE) |
116 (1ULL << CAP_SYS_TTY_CONFIG) |
117 (1ULL << CAP_SYS_RESOURCE) |
118 (1ULL << CAP_SYS_BOOT) |
119 (1ULL << CAP_AUDIT_WRITE) |
120 (1ULL << CAP_AUDIT_CONTROL) |
121 (1ULL << CAP_MKNOD);
122 static char **arg_bind = NULL;
123 static char **arg_bind_ro = NULL;
124 static char **arg_setenv = NULL;
125 static bool arg_quiet = false;
126 static bool arg_share_system = false;
127 static bool arg_register = true;
128 static bool arg_keep_unit = false;
129 static char **arg_network_interfaces = NULL;
130
/* Prints the usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Note: -j selects LINK_GUEST in parse_argv(), hence it is documented
         * as an alias of --link-journal=guest. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " --version Print version string\n"
               " -D --directory=NAME Root directory for the container\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " --uuid=UUID Set a specific machine UUID for the container\n"
               " -M --machine=NAME Set the machine name for the container\n"
               " -S --slice=SLICE Place the container in the specified slice\n"
               " -Z --selinux-context=SECLABEL\n"
               " Set the SELinux security context to be used by\n"
               " processes in the container\n"
               " -L --selinux-apifs-context=SECLABEL\n"
               " Set the SELinux security context to be used by\n"
               " API/tmpfs file systems in the container\n"
               " --private-network Disable network in container\n"
               " --network-interface=INTERFACE\n"
               " Assign an existing network interface to the container\n"
               " --share-system Share system namespaces with host\n"
               " --read-only Mount the root directory read-only\n"
               " --capability=CAP In addition to the default, retain specified\n"
               " capability\n"
               " --drop-capability=CAP Drop the specified capability from the default set\n"
               " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
               " -j Equivalent to --link-journal=guest\n"
               " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
               " the container\n"
               " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
               " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
               " --register=BOOLEAN Register container as machine\n"
               " --keep-unit Do not register a scope for the machine, reuse\n"
               " the service unit nspawn is running in\n"
               " -q --quiet Do not show status information\n",
               program_invocation_short_name);

        return 0;
}
171
172 static int parse_argv(int argc, char *argv[]) {
173
174 enum {
175 ARG_VERSION = 0x100,
176 ARG_PRIVATE_NETWORK,
177 ARG_UUID,
178 ARG_READ_ONLY,
179 ARG_CAPABILITY,
180 ARG_DROP_CAPABILITY,
181 ARG_LINK_JOURNAL,
182 ARG_BIND,
183 ARG_BIND_RO,
184 ARG_SETENV,
185 ARG_SHARE_SYSTEM,
186 ARG_REGISTER,
187 ARG_KEEP_UNIT,
188 ARG_NETWORK_INTERFACE
189 };
190
191 static const struct option options[] = {
192 { "help", no_argument, NULL, 'h' },
193 { "version", no_argument, NULL, ARG_VERSION },
194 { "directory", required_argument, NULL, 'D' },
195 { "user", required_argument, NULL, 'u' },
196 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
197 { "boot", no_argument, NULL, 'b' },
198 { "uuid", required_argument, NULL, ARG_UUID },
199 { "read-only", no_argument, NULL, ARG_READ_ONLY },
200 { "capability", required_argument, NULL, ARG_CAPABILITY },
201 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
202 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
203 { "bind", required_argument, NULL, ARG_BIND },
204 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
205 { "machine", required_argument, NULL, 'M' },
206 { "slice", required_argument, NULL, 'S' },
207 { "setenv", required_argument, NULL, ARG_SETENV },
208 { "selinux-context", required_argument, NULL, 'Z' },
209 { "selinux-apifs-context", required_argument, NULL, 'L' },
210 { "quiet", no_argument, NULL, 'q' },
211 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
212 { "register", required_argument, NULL, ARG_REGISTER },
213 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
214 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
215 {}
216 };
217
218 int c, r;
219
220 assert(argc >= 0);
221 assert(argv);
222
223 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
224
225 switch (c) {
226
227 case 'h':
228 return help();
229
230 case ARG_VERSION:
231 puts(PACKAGE_STRING);
232 puts(SYSTEMD_FEATURES);
233 return 0;
234
235 case 'D':
236 free(arg_directory);
237 arg_directory = canonicalize_file_name(optarg);
238 if (!arg_directory) {
239 log_error("Invalid root directory: %m");
240 return -ENOMEM;
241 }
242
243 break;
244
245 case 'u':
246 free(arg_user);
247 arg_user = strdup(optarg);
248 if (!arg_user)
249 return log_oom();
250
251 break;
252
253 case ARG_NETWORK_INTERFACE:
254 if (strv_push(&arg_network_interfaces, optarg) < 0)
255 return log_oom();
256
257 /* fall through */
258
259 case ARG_PRIVATE_NETWORK:
260 arg_private_network = true;
261 break;
262
263 case 'b':
264 arg_boot = true;
265 break;
266
267 case ARG_UUID:
268 r = sd_id128_from_string(optarg, &arg_uuid);
269 if (r < 0) {
270 log_error("Invalid UUID: %s", optarg);
271 return r;
272 }
273 break;
274
275 case 'S':
276 arg_slice = strdup(optarg);
277 if (!arg_slice)
278 return log_oom();
279
280 break;
281
282 case 'M':
283 if (isempty(optarg)) {
284 free(arg_machine);
285 arg_machine = NULL;
286 } else {
287
288 if (!hostname_is_valid(optarg)) {
289 log_error("Invalid machine name: %s", optarg);
290 return -EINVAL;
291 }
292
293 free(arg_machine);
294 arg_machine = strdup(optarg);
295 if (!arg_machine)
296 return log_oom();
297
298 break;
299 }
300
301 case 'Z':
302 arg_selinux_context = optarg;
303 break;
304
305 case 'L':
306 arg_selinux_apifs_context = optarg;
307 break;
308
309 case ARG_READ_ONLY:
310 arg_read_only = true;
311 break;
312
313 case ARG_CAPABILITY:
314 case ARG_DROP_CAPABILITY: {
315 char *state, *word;
316 size_t length;
317
318 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
319 _cleanup_free_ char *t;
320 cap_value_t cap;
321
322 t = strndup(word, length);
323 if (!t)
324 return log_oom();
325
326 if (streq(t, "all")) {
327 if (c == ARG_CAPABILITY)
328 arg_retain = (uint64_t) -1;
329 else
330 arg_retain = 0;
331 } else {
332 if (cap_from_name(t, &cap) < 0) {
333 log_error("Failed to parse capability %s.", t);
334 return -EINVAL;
335 }
336
337 if (c == ARG_CAPABILITY)
338 arg_retain |= 1ULL << (uint64_t) cap;
339 else
340 arg_retain &= ~(1ULL << (uint64_t) cap);
341 }
342 }
343
344 break;
345 }
346
347 case 'j':
348 arg_link_journal = LINK_GUEST;
349 break;
350
351 case ARG_LINK_JOURNAL:
352 if (streq(optarg, "auto"))
353 arg_link_journal = LINK_AUTO;
354 else if (streq(optarg, "no"))
355 arg_link_journal = LINK_NO;
356 else if (streq(optarg, "guest"))
357 arg_link_journal = LINK_GUEST;
358 else if (streq(optarg, "host"))
359 arg_link_journal = LINK_HOST;
360 else {
361 log_error("Failed to parse link journal mode %s", optarg);
362 return -EINVAL;
363 }
364
365 break;
366
367 case ARG_BIND:
368 case ARG_BIND_RO: {
369 _cleanup_free_ char *a = NULL, *b = NULL;
370 char *e;
371 char ***x;
372
373 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
374
375 e = strchr(optarg, ':');
376 if (e) {
377 a = strndup(optarg, e - optarg);
378 b = strdup(e + 1);
379 } else {
380 a = strdup(optarg);
381 b = strdup(optarg);
382 }
383
384 if (!a || !b)
385 return log_oom();
386
387 if (!path_is_absolute(a) || !path_is_absolute(b)) {
388 log_error("Invalid bind mount specification: %s", optarg);
389 return -EINVAL;
390 }
391
392 r = strv_extend(x, a);
393 if (r < 0)
394 return log_oom();
395
396 r = strv_extend(x, b);
397 if (r < 0)
398 return log_oom();
399
400 break;
401 }
402
403 case ARG_SETENV: {
404 char **n;
405
406 if (!env_assignment_is_valid(optarg)) {
407 log_error("Environment variable assignment '%s' is not valid.", optarg);
408 return -EINVAL;
409 }
410
411 n = strv_env_set(arg_setenv, optarg);
412 if (!n)
413 return log_oom();
414
415 strv_free(arg_setenv);
416 arg_setenv = n;
417 break;
418 }
419
420 case 'q':
421 arg_quiet = true;
422 break;
423
424 case ARG_SHARE_SYSTEM:
425 arg_share_system = true;
426 break;
427
428 case ARG_REGISTER:
429 r = parse_boolean(optarg);
430 if (r < 0) {
431 log_error("Failed to parse --register= argument: %s", optarg);
432 return r;
433 }
434
435 arg_register = r;
436 break;
437
438 case ARG_KEEP_UNIT:
439 arg_keep_unit = true;
440 break;
441
442 case '?':
443 return -EINVAL;
444
445 default:
446 assert_not_reached("Unhandled option");
447 }
448 }
449
450 if (arg_share_system)
451 arg_register = false;
452
453 if (arg_boot && arg_share_system) {
454 log_error("--boot and --share-system may not be combined.");
455 return -EINVAL;
456 }
457
458 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
459 log_error("--keep-unit may not be used when invoked from a user session.");
460 return -EINVAL;
461 }
462
463 return 1;
464 }
465
466 static int mount_all(const char *dest) {
467
468 typedef struct MountPoint {
469 const char *what;
470 const char *where;
471 const char *type;
472 const char *options;
473 unsigned long flags;
474 bool fatal;
475 } MountPoint;
476
477 static const MountPoint mount_table[] = {
478 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
479 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
480 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
481 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
482 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
483 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
484 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
485 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
486 #ifdef HAVE_SELINUX
487 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
488 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
489 #endif
490 };
491
492 unsigned k;
493 int r = 0;
494
495 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
496 _cleanup_free_ char *where = NULL;
497 #ifdef HAVE_SELINUX
498 _cleanup_free_ char *options = NULL;
499 #endif
500 const char *o;
501 int t;
502
503 where = strjoin(dest, "/", mount_table[k].where, NULL);
504 if (!where)
505 return log_oom();
506
507 t = path_is_mount_point(where, true);
508 if (t < 0) {
509 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
510
511 if (r == 0)
512 r = t;
513
514 continue;
515 }
516
517 /* Skip this entry if it is not a remount. */
518 if (mount_table[k].what && t > 0)
519 continue;
520
521 mkdir_p(where, 0755);
522
523 #ifdef HAVE_SELINUX
524 if (arg_selinux_apifs_context &&
525 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
526 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
527 if (!options)
528 return log_oom();
529
530 o = options;
531 } else
532 #endif
533 o = mount_table[k].options;
534
535
536 if (mount(mount_table[k].what,
537 where,
538 mount_table[k].type,
539 mount_table[k].flags,
540 o) < 0 &&
541 mount_table[k].fatal) {
542
543 log_error("mount(%s) failed: %m", where);
544
545 if (r == 0)
546 r = -errno;
547 }
548 }
549
550 return r;
551 }
552
553 static int mount_binds(const char *dest, char **l, unsigned long flags) {
554 char **x, **y;
555
556 STRV_FOREACH_PAIR(x, y, l) {
557 char *where;
558 struct stat source_st, dest_st;
559 int r;
560
561 if (stat(*x, &source_st) < 0) {
562 log_error("failed to stat %s: %m", *x);
563 return -errno;
564 }
565
566 where = strappenda(dest, *y);
567 r = stat(where, &dest_st);
568 if (r == 0) {
569 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
570 log_error("The file types of %s and %s do not match. Refusing bind mount",
571 *x, where);
572 return -EINVAL;
573 }
574 } else if (errno == ENOENT) {
575 r = mkdir_parents_label(where, 0755);
576 if (r < 0) {
577 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
578 return r;
579 }
580 } else {
581 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
582 return -errno;
583 }
584 /* Create the mount point, but be conservative -- refuse to create block
585 * and char devices. */
586 if (S_ISDIR(source_st.st_mode))
587 mkdir_label(where, 0755);
588 else if (S_ISFIFO(source_st.st_mode))
589 mkfifo(where, 0644);
590 else if (S_ISSOCK(source_st.st_mode))
591 mknod(where, 0644 | S_IFSOCK, 0);
592 else if (S_ISREG(source_st.st_mode))
593 touch(where);
594 else {
595 log_error("Refusing to create mountpoint for file: %s", *x);
596 return -ENOTSUP;
597 }
598
599 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
600 log_error("mount(%s) failed: %m", where);
601 return -errno;
602 }
603
604 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
605 log_error("mount(%s) failed: %m", where);
606 return -errno;
607 }
608 }
609
610 return 0;
611 }
612
613 static int setup_timezone(const char *dest) {
614 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
615 char *z, *y;
616 int r;
617
618 assert(dest);
619
620 /* Fix the timezone, if possible */
621 r = readlink_malloc("/etc/localtime", &p);
622 if (r < 0) {
623 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
624 return 0;
625 }
626
627 z = path_startswith(p, "../usr/share/zoneinfo/");
628 if (!z)
629 z = path_startswith(p, "/usr/share/zoneinfo/");
630 if (!z) {
631 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
632 return 0;
633 }
634
635 where = strappend(dest, "/etc/localtime");
636 if (!where)
637 return log_oom();
638
639 r = readlink_malloc(where, &q);
640 if (r >= 0) {
641 y = path_startswith(q, "../usr/share/zoneinfo/");
642 if (!y)
643 y = path_startswith(q, "/usr/share/zoneinfo/");
644
645
646 /* Already pointing to the right place? Then do nothing .. */
647 if (y && streq(y, z))
648 return 0;
649 }
650
651 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
652 if (!check)
653 return log_oom();
654
655 if (access(check, F_OK) < 0) {
656 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
657 return 0;
658 }
659
660 what = strappend("../usr/share/zoneinfo/", z);
661 if (!what)
662 return log_oom();
663
664 unlink(where);
665 if (symlink(what, where) < 0) {
666 log_error("Failed to correct timezone of container: %m");
667 return 0;
668 }
669
670 return 0;
671 }
672
673 static int setup_resolv_conf(const char *dest) {
674 char _cleanup_free_ *where = NULL;
675
676 assert(dest);
677
678 if (arg_private_network)
679 return 0;
680
681 /* Fix resolv.conf, if possible */
682 where = strappend(dest, "/etc/resolv.conf");
683 if (!where)
684 return log_oom();
685
686 /* We don't really care for the results of this really. If it
687 * fails, it fails, but meh... */
688 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
689
690 return 0;
691 }
692
693 static int setup_boot_id(const char *dest) {
694 _cleanup_free_ char *from = NULL, *to = NULL;
695 sd_id128_t rnd;
696 char as_uuid[37];
697 int r;
698
699 assert(dest);
700
701 if (arg_share_system)
702 return 0;
703
704 /* Generate a new randomized boot ID, so that each boot-up of
705 * the container gets a new one */
706
707 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
708 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
709 if (!from || !to)
710 return log_oom();
711
712 r = sd_id128_randomize(&rnd);
713 if (r < 0) {
714 log_error("Failed to generate random boot id: %s", strerror(-r));
715 return r;
716 }
717
718 snprintf(as_uuid, sizeof(as_uuid),
719 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
720 SD_ID128_FORMAT_VAL(rnd));
721 char_array_0(as_uuid);
722
723 r = write_string_file(from, as_uuid);
724 if (r < 0) {
725 log_error("Failed to write boot id: %s", strerror(-r));
726 return r;
727 }
728
729 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
730 log_error("Failed to bind mount boot id: %m");
731 r = -errno;
732 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
733 log_warning("Failed to make boot id read-only: %m");
734
735 unlink(from);
736 return r;
737 }
738
739 static int copy_devnodes(const char *dest) {
740
741 static const char devnodes[] =
742 "null\0"
743 "zero\0"
744 "full\0"
745 "random\0"
746 "urandom\0"
747 "tty\0";
748
749 const char *d;
750 int r = 0;
751 _cleanup_umask_ mode_t u;
752
753 assert(dest);
754
755 u = umask(0000);
756
757 NULSTR_FOREACH(d, devnodes) {
758 _cleanup_free_ char *from = NULL, *to = NULL;
759 struct stat st;
760
761 from = strappend("/dev/", d);
762 to = strjoin(dest, "/dev/", d, NULL);
763 if (!from || !to)
764 return log_oom();
765
766 if (stat(from, &st) < 0) {
767
768 if (errno != ENOENT) {
769 log_error("Failed to stat %s: %m", from);
770 return -errno;
771 }
772
773 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
774
775 log_error("%s is not a char or block device, cannot copy", from);
776 return -EIO;
777
778 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
779
780 log_error("mknod(%s) failed: %m", dest);
781 return -errno;
782 }
783 }
784
785 return r;
786 }
787
788 static int setup_ptmx(const char *dest) {
789 _cleanup_free_ char *p = NULL;
790
791 p = strappend(dest, "/dev/ptmx");
792 if (!p)
793 return log_oom();
794
795 if (symlink("pts/ptmx", p) < 0) {
796 log_error("Failed to create /dev/ptmx symlink: %m");
797 return -errno;
798 }
799
800 return 0;
801 }
802
803 static int setup_dev_console(const char *dest, const char *console) {
804 struct stat st;
805 _cleanup_free_ char *to = NULL;
806 int r;
807 _cleanup_umask_ mode_t u;
808
809 assert(dest);
810 assert(console);
811
812 u = umask(0000);
813
814 if (stat(console, &st) < 0) {
815 log_error("Failed to stat %s: %m", console);
816 return -errno;
817
818 } else if (!S_ISCHR(st.st_mode)) {
819 log_error("/dev/console is not a char device");
820 return -EIO;
821 }
822
823 r = chmod_and_chown(console, 0600, 0, 0);
824 if (r < 0) {
825 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
826 return r;
827 }
828
829 if (asprintf(&to, "%s/dev/console", dest) < 0)
830 return log_oom();
831
832 /* We need to bind mount the right tty to /dev/console since
833 * ptys can only exist on pts file systems. To have something
834 * to bind mount things on we create a device node first, that
835 * has the right major/minor (note that the major minor
836 * doesn't actually matter here, since we mount it over
837 * anyway). */
838
839 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
840 log_error("mknod() for /dev/console failed: %m");
841 return -errno;
842 }
843
844 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
845 log_error("Bind mount for /dev/console failed: %m");
846 return -errno;
847 }
848
849 return 0;
850 }
851
852 static int setup_kmsg(const char *dest, int kmsg_socket) {
853 _cleanup_free_ char *from = NULL, *to = NULL;
854 int r, fd, k;
855 _cleanup_umask_ mode_t u;
856 union {
857 struct cmsghdr cmsghdr;
858 uint8_t buf[CMSG_SPACE(sizeof(int))];
859 } control = {};
860 struct msghdr mh = {
861 .msg_control = &control,
862 .msg_controllen = sizeof(control),
863 };
864 struct cmsghdr *cmsg;
865
866 assert(dest);
867 assert(kmsg_socket >= 0);
868
869 u = umask(0000);
870
871 /* We create the kmsg FIFO as /dev/kmsg, but immediately
872 * delete it after bind mounting it to /proc/kmsg. While FIFOs
873 * on the reading side behave very similar to /proc/kmsg,
874 * their writing side behaves differently from /dev/kmsg in
875 * that writing blocks when nothing is reading. In order to
876 * avoid any problems with containers deadlocking due to this
877 * we simply make /dev/kmsg unavailable to the container. */
878 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
879 asprintf(&to, "%s/proc/kmsg", dest) < 0)
880 return log_oom();
881
882 if (mkfifo(from, 0600) < 0) {
883 log_error("mkfifo() for /dev/kmsg failed: %m");
884 return -errno;
885 }
886
887 r = chmod_and_chown(from, 0600, 0, 0);
888 if (r < 0) {
889 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
890 return r;
891 }
892
893 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
894 log_error("Bind mount for /proc/kmsg failed: %m");
895 return -errno;
896 }
897
898 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
899 if (fd < 0) {
900 log_error("Failed to open fifo: %m");
901 return -errno;
902 }
903
904 cmsg = CMSG_FIRSTHDR(&mh);
905 cmsg->cmsg_level = SOL_SOCKET;
906 cmsg->cmsg_type = SCM_RIGHTS;
907 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
908 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
909
910 mh.msg_controllen = cmsg->cmsg_len;
911
912 /* Store away the fd in the socket, so that it stays open as
913 * long as we run the child */
914 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
915 close_nointr_nofail(fd);
916
917 if (k < 0) {
918 log_error("Failed to send FIFO fd: %m");
919 return -errno;
920 }
921
922 /* And now make the FIFO unavailable as /dev/kmsg... */
923 unlink(from);
924 return 0;
925 }
926
927 static int setup_hostname(void) {
928
929 if (arg_share_system)
930 return 0;
931
932 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
933 return -errno;
934
935 return 0;
936 }
937
938 static int setup_journal(const char *directory) {
939 sd_id128_t machine_id, this_id;
940 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
941 char *id;
942 int r;
943
944 p = strappend(directory, "/etc/machine-id");
945 if (!p)
946 return log_oom();
947
948 r = read_one_line_file(p, &b);
949 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
950 return 0;
951 else if (r < 0) {
952 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
953 return r;
954 }
955
956 id = strstrip(b);
957 if (isempty(id) && arg_link_journal == LINK_AUTO)
958 return 0;
959
960 /* Verify validity */
961 r = sd_id128_from_string(id, &machine_id);
962 if (r < 0) {
963 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
964 return r;
965 }
966
967 r = sd_id128_get_machine(&this_id);
968 if (r < 0) {
969 log_error("Failed to retrieve machine ID: %s", strerror(-r));
970 return r;
971 }
972
973 if (sd_id128_equal(machine_id, this_id)) {
974 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
975 "Host and machine ids are equal (%s): refusing to link journals", id);
976 if (arg_link_journal == LINK_AUTO)
977 return 0;
978 return
979 -EEXIST;
980 }
981
982 if (arg_link_journal == LINK_NO)
983 return 0;
984
985 free(p);
986 p = strappend("/var/log/journal/", id);
987 q = strjoin(directory, "/var/log/journal/", id, NULL);
988 if (!p || !q)
989 return log_oom();
990
991 if (path_is_mount_point(p, false) > 0) {
992 if (arg_link_journal != LINK_AUTO) {
993 log_error("%s: already a mount point, refusing to use for journal", p);
994 return -EEXIST;
995 }
996
997 return 0;
998 }
999
1000 if (path_is_mount_point(q, false) > 0) {
1001 if (arg_link_journal != LINK_AUTO) {
1002 log_error("%s: already a mount point, refusing to use for journal", q);
1003 return -EEXIST;
1004 }
1005
1006 return 0;
1007 }
1008
1009 r = readlink_and_make_absolute(p, &d);
1010 if (r >= 0) {
1011 if ((arg_link_journal == LINK_GUEST ||
1012 arg_link_journal == LINK_AUTO) &&
1013 path_equal(d, q)) {
1014
1015 r = mkdir_p(q, 0755);
1016 if (r < 0)
1017 log_warning("failed to create directory %s: %m", q);
1018 return 0;
1019 }
1020
1021 if (unlink(p) < 0) {
1022 log_error("Failed to remove symlink %s: %m", p);
1023 return -errno;
1024 }
1025 } else if (r == -EINVAL) {
1026
1027 if (arg_link_journal == LINK_GUEST &&
1028 rmdir(p) < 0) {
1029
1030 if (errno == ENOTDIR) {
1031 log_error("%s already exists and is neither a symlink nor a directory", p);
1032 return r;
1033 } else {
1034 log_error("Failed to remove %s: %m", p);
1035 return -errno;
1036 }
1037 }
1038 } else if (r != -ENOENT) {
1039 log_error("readlink(%s) failed: %m", p);
1040 return r;
1041 }
1042
1043 if (arg_link_journal == LINK_GUEST) {
1044
1045 if (symlink(q, p) < 0) {
1046 log_error("Failed to symlink %s to %s: %m", q, p);
1047 return -errno;
1048 }
1049
1050 r = mkdir_p(q, 0755);
1051 if (r < 0)
1052 log_warning("failed to create directory %s: %m", q);
1053 return 0;
1054 }
1055
1056 if (arg_link_journal == LINK_HOST) {
1057 r = mkdir_p(p, 0755);
1058 if (r < 0) {
1059 log_error("Failed to create %s: %m", p);
1060 return r;
1061 }
1062
1063 } else if (access(p, F_OK) < 0)
1064 return 0;
1065
1066 if (dir_is_empty(q) == 0) {
1067 log_error("%s not empty.", q);
1068 return -ENOTEMPTY;
1069 }
1070
1071 r = mkdir_p(q, 0755);
1072 if (r < 0) {
1073 log_error("Failed to create %s: %m", q);
1074 return r;
1075 }
1076
1077 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1078 log_error("Failed to bind mount journal from host into guest: %m");
1079 return -errno;
1080 }
1081
1082 return 0;
1083 }
1084
1085 static int setup_kdbus(const char *dest, const char *path) {
1086 const char *p;
1087
1088 if (!path)
1089 return 0;
1090
1091 p = strappenda(dest, "/dev/kdbus");
1092 if (mkdir(p, 0755) < 0) {
1093 log_error("Failed to create kdbus path: %m");
1094 return -errno;
1095 }
1096
1097 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1098 log_error("Failed to mount kdbus domain path: %m");
1099 return -errno;
1100 }
1101
1102 return 0;
1103 }
1104
1105 static int drop_capabilities(void) {
1106 return capability_bounding_set_drop(~arg_retain, false);
1107 }
1108
1109 static int register_machine(pid_t pid) {
1110 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1111 _cleanup_bus_unref_ sd_bus *bus = NULL;
1112 int r;
1113
1114 if (!arg_register)
1115 return 0;
1116
1117 r = sd_bus_default_system(&bus);
1118 if (r < 0) {
1119 log_error("Failed to open system bus: %s", strerror(-r));
1120 return r;
1121 }
1122
1123 if (arg_keep_unit) {
1124 r = sd_bus_call_method(
1125 bus,
1126 "org.freedesktop.machine1",
1127 "/org/freedesktop/machine1",
1128 "org.freedesktop.machine1.Manager",
1129 "RegisterMachine",
1130 &error,
1131 NULL,
1132 "sayssus",
1133 arg_machine,
1134 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1135 "nspawn",
1136 "container",
1137 (uint32_t) pid,
1138 strempty(arg_directory));
1139 } else {
1140 r = sd_bus_call_method(
1141 bus,
1142 "org.freedesktop.machine1",
1143 "/org/freedesktop/machine1",
1144 "org.freedesktop.machine1.Manager",
1145 "CreateMachine",
1146 &error,
1147 NULL,
1148 "sayssusa(sv)",
1149 arg_machine,
1150 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1151 "nspawn",
1152 "container",
1153 (uint32_t) pid,
1154 strempty(arg_directory),
1155 !isempty(arg_slice), "Slice", "s", arg_slice);
1156 }
1157
1158 if (r < 0) {
1159 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1160 return r;
1161 }
1162
1163 return 0;
1164 }
1165
1166 static int terminate_machine(pid_t pid) {
1167 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1168 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1169 _cleanup_bus_unref_ sd_bus *bus = NULL;
1170 const char *path;
1171 int r;
1172
1173 if (!arg_register)
1174 return 0;
1175
1176 r = sd_bus_default_system(&bus);
1177 if (r < 0) {
1178 log_error("Failed to open system bus: %s", strerror(-r));
1179 return r;
1180 }
1181
1182 r = sd_bus_call_method(
1183 bus,
1184 "org.freedesktop.machine1",
1185 "/org/freedesktop/machine1",
1186 "org.freedesktop.machine1.Manager",
1187 "GetMachineByPID",
1188 &error,
1189 &reply,
1190 "u",
1191 (uint32_t) pid);
1192 if (r < 0) {
1193 /* Note that the machine might already have been
1194 * cleaned up automatically, hence don't consider it a
1195 * failure if we cannot get the machine object. */
1196 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1197 return 0;
1198 }
1199
1200 r = sd_bus_message_read(reply, "o", &path);
1201 if (r < 0)
1202 return bus_log_parse_error(r);
1203
1204 r = sd_bus_call_method(
1205 bus,
1206 "org.freedesktop.machine1",
1207 path,
1208 "org.freedesktop.machine1.Machine",
1209 "Terminate",
1210 &error,
1211 NULL,
1212 NULL);
1213 if (r < 0) {
1214 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1215 return 0;
1216 }
1217
1218 return 0;
1219 }
1220
1221 static int reset_audit_loginuid(void) {
1222 _cleanup_free_ char *p = NULL;
1223 int r;
1224
1225 if (arg_share_system)
1226 return 0;
1227
1228 r = read_one_line_file("/proc/self/loginuid", &p);
1229 if (r == -EEXIST)
1230 return 0;
1231 if (r < 0) {
1232 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1233 return r;
1234 }
1235
1236 /* Already reset? */
1237 if (streq(p, "4294967295"))
1238 return 0;
1239
1240 r = write_string_file("/proc/self/loginuid", "4294967295");
1241 if (r < 0) {
1242 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1243 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1244 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1245 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1246 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1247
1248 sleep(5);
1249 }
1250
1251 return 0;
1252 }
1253
1254 static int move_network_interfaces(pid_t pid) {
1255 _cleanup_sd_rtnl_unref_ sd_rtnl *rtnl = NULL;
1256 char **i;
1257 int r;
1258
1259 if (!arg_private_network)
1260 return 0;
1261
1262 if (strv_isempty(arg_network_interfaces))
1263 return 0;
1264
1265 r = sd_rtnl_open(NETLINK_ROUTE, &rtnl);
1266 if (r < 0) {
1267 log_error("Failed to connect to netlink: %s", strerror(-r));
1268 return r;
1269 }
1270
1271 STRV_FOREACH(i, arg_network_interfaces) {
1272 _cleanup_sd_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1273 unsigned ifi;
1274
1275 ifi = if_nametoindex(*i);
1276 if (ifi == 0) {
1277 log_error("Failed to resolve interface %s: %m", *i);
1278 return -errno;
1279 }
1280
1281 r = sd_rtnl_message_link_new(RTM_NEWLINK, ifi, &m);
1282 if (r < 0) {
1283 log_error("Failed to allocate netlink message: %s", strerror(-r));
1284 return r;
1285 }
1286
1287 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1288 if (r < 0) {
1289 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1290 return r;
1291 }
1292
1293 r = sd_rtnl_call(rtnl, m, 0, NULL);
1294 if (r < 0) {
1295 log_error("Failed to move interface to namespace: %s", strerror(-r));
1296 return r;
1297 }
1298 }
1299
1300 return 0;
1301 }
1302
1303 int main(int argc, char *argv[]) {
1304 pid_t pid = 0;
1305 int r = EXIT_FAILURE, k;
1306 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1307 int n_fd_passed;
1308 const char *console = NULL;
1309 sigset_t mask;
1310 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1311 _cleanup_fdset_free_ FDSet *fds = NULL;
1312 _cleanup_free_ char *kdbus_domain = NULL;
1313
1314 log_parse_environment();
1315 log_open();
1316
1317 k = parse_argv(argc, argv);
1318 if (k < 0)
1319 goto finish;
1320 else if (k == 0) {
1321 r = EXIT_SUCCESS;
1322 goto finish;
1323 }
1324
1325 if (arg_directory) {
1326 char *p;
1327
1328 p = path_make_absolute_cwd(arg_directory);
1329 free(arg_directory);
1330 arg_directory = p;
1331 } else
1332 arg_directory = get_current_dir_name();
1333
1334 if (!arg_directory) {
1335 log_error("Failed to determine path, please use -D.");
1336 goto finish;
1337 }
1338
1339 path_kill_slashes(arg_directory);
1340
1341 if (!arg_machine) {
1342 arg_machine = strdup(basename(arg_directory));
1343 if (!arg_machine) {
1344 log_oom();
1345 goto finish;
1346 }
1347
1348 hostname_cleanup(arg_machine, false);
1349 if (isempty(arg_machine)) {
1350 log_error("Failed to determine machine name automatically, please use -M.");
1351 goto finish;
1352 }
1353 }
1354
1355 if (geteuid() != 0) {
1356 log_error("Need to be root.");
1357 goto finish;
1358 }
1359
1360 if (sd_booted() <= 0) {
1361 log_error("Not running on a systemd system.");
1362 goto finish;
1363 }
1364
1365 if (path_equal(arg_directory, "/")) {
1366 log_error("Spawning container on root directory not supported.");
1367 goto finish;
1368 }
1369
1370 if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1371 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1372 goto finish;
1373 }
1374
1375 log_close();
1376 n_fd_passed = sd_listen_fds(false);
1377 if (n_fd_passed > 0) {
1378 k = fdset_new_listen_fds(&fds, false);
1379 if (k < 0) {
1380 log_error("Failed to collect file descriptors: %s", strerror(-k));
1381 goto finish;
1382 }
1383 }
1384 fdset_close_others(fds);
1385 log_open();
1386
1387 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1388 if (master < 0) {
1389 log_error("Failed to acquire pseudo tty: %m");
1390 goto finish;
1391 }
1392
1393 console = ptsname(master);
1394 if (!console) {
1395 log_error("Failed to determine tty name: %m");
1396 goto finish;
1397 }
1398
1399 if (!arg_quiet)
1400 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1401
1402 if (unlockpt(master) < 0) {
1403 log_error("Failed to unlock tty: %m");
1404 goto finish;
1405 }
1406
1407
1408 if (access("/dev/kdbus/control", F_OK) >= 0) {
1409
1410 if (arg_share_system) {
1411 kdbus_domain = strdup("/dev/kdbus");
1412 if (!kdbus_domain) {
1413 log_oom();
1414 goto finish;
1415 }
1416 } else {
1417 const char *ns;
1418
1419 ns = strappenda("machine-", arg_machine);
1420 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1421 if (r < 0)
1422 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1423 else
1424 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1425 }
1426 }
1427
1428 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1429 log_error("Failed to create kmsg socket pair: %m");
1430 goto finish;
1431 }
1432
1433 sd_notify(0, "READY=1");
1434
1435 assert_se(sigemptyset(&mask) == 0);
1436 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1437 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1438
1439 for (;;) {
1440 siginfo_t status;
1441
1442 sync_fd = eventfd(0, EFD_CLOEXEC);
1443 if (sync_fd < 0) {
1444 log_error("Failed to create event fd: %m");
1445 goto finish;
1446 }
1447
1448 pid = syscall(__NR_clone,
1449 SIGCHLD|CLONE_NEWNS|
1450 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1451 (arg_private_network ? CLONE_NEWNET : 0), NULL);
1452 if (pid < 0) {
1453 if (errno == EINVAL)
1454 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1455 else
1456 log_error("clone() failed: %m");
1457
1458 goto finish;
1459 }
1460
1461 if (pid == 0) {
1462 /* child */
1463 const char *home = NULL;
1464 uid_t uid = (uid_t) -1;
1465 gid_t gid = (gid_t) -1;
1466 unsigned n_env = 2;
1467 const char *envp[] = {
1468 "PATH=" DEFAULT_PATH_SPLIT_USR,
1469 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1470 NULL, /* TERM */
1471 NULL, /* HOME */
1472 NULL, /* USER */
1473 NULL, /* LOGNAME */
1474 NULL, /* container_uuid */
1475 NULL, /* LISTEN_FDS */
1476 NULL, /* LISTEN_PID */
1477 NULL
1478 };
1479 char **env_use;
1480 eventfd_t x;
1481
1482 envp[n_env] = strv_find_prefix(environ, "TERM=");
1483 if (envp[n_env])
1484 n_env ++;
1485
1486 close_nointr_nofail(master);
1487 master = -1;
1488
1489 close_nointr(STDIN_FILENO);
1490 close_nointr(STDOUT_FILENO);
1491 close_nointr(STDERR_FILENO);
1492
1493 close_nointr_nofail(kmsg_socket_pair[0]);
1494 kmsg_socket_pair[0] = -1;
1495
1496 reset_all_signal_handlers();
1497
1498 assert_se(sigemptyset(&mask) == 0);
1499 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1500
1501 k = open_terminal(console, O_RDWR);
1502 if (k != STDIN_FILENO) {
1503 if (k >= 0) {
1504 close_nointr_nofail(k);
1505 k = -EINVAL;
1506 }
1507
1508 log_error("Failed to open console: %s", strerror(-k));
1509 goto child_fail;
1510 }
1511
1512 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1513 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1514 log_error("Failed to duplicate console: %m");
1515 goto child_fail;
1516 }
1517
1518 if (setsid() < 0) {
1519 log_error("setsid() failed: %m");
1520 goto child_fail;
1521 }
1522
1523 if (reset_audit_loginuid() < 0)
1524 goto child_fail;
1525
1526 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1527 log_error("PR_SET_PDEATHSIG failed: %m");
1528 goto child_fail;
1529 }
1530
1531 /* Mark everything as slave, so that we still
1532 * receive mounts from the real root, but don't
1533 * propagate mounts to the real root. */
1534 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1535 log_error("MS_SLAVE|MS_REC failed: %m");
1536 goto child_fail;
1537 }
1538
1539 /* Turn directory into bind mount */
1540 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1541 log_error("Failed to make bind mount.");
1542 goto child_fail;
1543 }
1544
1545 if (arg_read_only)
1546 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1547 log_error("Failed to make read-only.");
1548 goto child_fail;
1549 }
1550
1551 if (mount_all(arg_directory) < 0)
1552 goto child_fail;
1553
1554 if (copy_devnodes(arg_directory) < 0)
1555 goto child_fail;
1556
1557 if (setup_ptmx(arg_directory) < 0)
1558 goto child_fail;
1559
1560 dev_setup(arg_directory);
1561
1562 if (setup_dev_console(arg_directory, console) < 0)
1563 goto child_fail;
1564
1565 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1566 goto child_fail;
1567
1568 close_nointr_nofail(kmsg_socket_pair[1]);
1569 kmsg_socket_pair[1] = -1;
1570
1571 if (setup_boot_id(arg_directory) < 0)
1572 goto child_fail;
1573
1574 if (setup_timezone(arg_directory) < 0)
1575 goto child_fail;
1576
1577 if (setup_resolv_conf(arg_directory) < 0)
1578 goto child_fail;
1579
1580 if (setup_journal(arg_directory) < 0)
1581 goto child_fail;
1582
1583 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1584 goto child_fail;
1585
1586 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1587 goto child_fail;
1588
1589 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1590 goto child_fail;
1591
1592 if (chdir(arg_directory) < 0) {
1593 log_error("chdir(%s) failed: %m", arg_directory);
1594 goto child_fail;
1595 }
1596
1597 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1598 log_error("mount(MS_MOVE) failed: %m");
1599 goto child_fail;
1600 }
1601
1602 if (chroot(".") < 0) {
1603 log_error("chroot() failed: %m");
1604 goto child_fail;
1605 }
1606
1607 if (chdir("/") < 0) {
1608 log_error("chdir() failed: %m");
1609 goto child_fail;
1610 }
1611
1612 umask(0022);
1613
1614 if (arg_private_network)
1615 loopback_setup();
1616
1617 if (drop_capabilities() < 0) {
1618 log_error("drop_capabilities() failed: %m");
1619 goto child_fail;
1620 }
1621
1622 if (arg_user) {
1623
1624 /* Note that this resolves user names
1625 * inside the container, and hence
1626 * accesses the NSS modules from the
1627 * container and not the host. This is
1628 * a bit weird... */
1629
1630 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1631 log_error("get_user_creds() failed: %m");
1632 goto child_fail;
1633 }
1634
1635 if (mkdir_parents_label(home, 0775) < 0) {
1636 log_error("mkdir_parents_label() failed: %m");
1637 goto child_fail;
1638 }
1639
1640 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1641 log_error("mkdir_safe_label() failed: %m");
1642 goto child_fail;
1643 }
1644
1645 if (initgroups((const char*)arg_user, gid) < 0) {
1646 log_error("initgroups() failed: %m");
1647 goto child_fail;
1648 }
1649
1650 if (setresgid(gid, gid, gid) < 0) {
1651 log_error("setregid() failed: %m");
1652 goto child_fail;
1653 }
1654
1655 if (setresuid(uid, uid, uid) < 0) {
1656 log_error("setreuid() failed: %m");
1657 goto child_fail;
1658 }
1659 } else {
1660 /* Reset everything fully to 0, just in case */
1661
1662 if (setgroups(0, NULL) < 0) {
1663 log_error("setgroups() failed: %m");
1664 goto child_fail;
1665 }
1666
1667 if (setresgid(0, 0, 0) < 0) {
1668 log_error("setregid() failed: %m");
1669 goto child_fail;
1670 }
1671
1672 if (setresuid(0, 0, 0) < 0) {
1673 log_error("setreuid() failed: %m");
1674 goto child_fail;
1675 }
1676 }
1677
1678 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1679 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1680 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1681 log_oom();
1682 goto child_fail;
1683 }
1684
1685 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1686 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1687 log_oom();
1688 goto child_fail;
1689 }
1690 }
1691
1692 if (fdset_size(fds) > 0) {
1693 k = fdset_cloexec(fds, false);
1694 if (k < 0) {
1695 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1696 goto child_fail;
1697 }
1698
1699 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1700 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1701 log_oom();
1702 goto child_fail;
1703 }
1704 }
1705
1706 setup_hostname();
1707
1708 eventfd_read(sync_fd, &x);
1709 close_nointr_nofail(sync_fd);
1710 sync_fd = -1;
1711
1712 if (!strv_isempty(arg_setenv)) {
1713 char **n;
1714
1715 n = strv_env_merge(2, envp, arg_setenv);
1716 if (!n) {
1717 log_oom();
1718 goto child_fail;
1719 }
1720
1721 env_use = n;
1722 } else
1723 env_use = (char**) envp;
1724
1725 #ifdef HAVE_SELINUX
1726 if (arg_selinux_context)
1727 if (setexeccon(arg_selinux_context) < 0)
1728 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1729 #endif
1730 if (arg_boot) {
1731 char **a;
1732 size_t l;
1733
1734 /* Automatically search for the init system */
1735
1736 l = 1 + argc - optind;
1737 a = newa(char*, l + 1);
1738 memcpy(a + 1, argv + optind, l * sizeof(char*));
1739
1740 a[0] = (char*) "/usr/lib/systemd/systemd";
1741 execve(a[0], a, env_use);
1742
1743 a[0] = (char*) "/lib/systemd/systemd";
1744 execve(a[0], a, env_use);
1745
1746 a[0] = (char*) "/sbin/init";
1747 execve(a[0], a, env_use);
1748 } else if (argc > optind)
1749 execvpe(argv[optind], argv + optind, env_use);
1750 else {
1751 chdir(home ? home : "/root");
1752 execle("/bin/bash", "-bash", NULL, env_use);
1753 }
1754
1755 log_error("execv() failed: %m");
1756
1757 child_fail:
1758 _exit(EXIT_FAILURE);
1759 }
1760
1761 fdset_free(fds);
1762 fds = NULL;
1763
1764 r = register_machine(pid);
1765 if (r < 0)
1766 goto finish;
1767
1768 r = move_network_interfaces(pid);
1769 if (r < 0)
1770 goto finish;
1771
1772 eventfd_write(sync_fd, 1);
1773 close_nointr_nofail(sync_fd);
1774 sync_fd = -1;
1775
1776 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1777 if (k < 0) {
1778 r = EXIT_FAILURE;
1779 break;
1780 }
1781
1782 if (!arg_quiet)
1783 putc('\n', stdout);
1784
1785 /* Kill if it is not dead yet anyway */
1786 terminate_machine(pid);
1787
1788 /* Redundant, but better safe than sorry */
1789 kill(pid, SIGKILL);
1790
1791 k = wait_for_terminate(pid, &status);
1792 pid = 0;
1793
1794 if (k < 0) {
1795 r = EXIT_FAILURE;
1796 break;
1797 }
1798
1799 if (status.si_code == CLD_EXITED) {
1800 r = status.si_status;
1801 if (status.si_status != 0) {
1802 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1803 break;
1804 }
1805
1806 if (!arg_quiet)
1807 log_debug("Container %s exited successfully.", arg_machine);
1808 break;
1809 } else if (status.si_code == CLD_KILLED &&
1810 status.si_status == SIGINT) {
1811
1812 if (!arg_quiet)
1813 log_info("Container %s has been shut down.", arg_machine);
1814 r = 0;
1815 break;
1816 } else if (status.si_code == CLD_KILLED &&
1817 status.si_status == SIGHUP) {
1818
1819 if (!arg_quiet)
1820 log_info("Container %s is being rebooted.", arg_machine);
1821 continue;
1822 } else if (status.si_code == CLD_KILLED ||
1823 status.si_code == CLD_DUMPED) {
1824
1825 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1826 r = EXIT_FAILURE;
1827 break;
1828 } else {
1829 log_error("Container %s failed due to unknown reason.", arg_machine);
1830 r = EXIT_FAILURE;
1831 break;
1832 }
1833 }
1834
1835 finish:
1836 if (pid > 0)
1837 kill(pid, SIGKILL);
1838
1839 free(arg_directory);
1840 free(arg_machine);
1841 free(arg_setenv);
1842 free(arg_network_interfaces);
1843
1844 return r;
1845 }