]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
event: make sure to possibly disarm the timerfds before we reenter epoll_wait
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
a258bf26
LP
36#include <termios.h>
37#include <sys/signalfd.h>
687d0825 38#include <grp.h>
5ed27dbd 39#include <linux/fs.h>
9537eab0
LP
40#include <sys/un.h>
41#include <sys/socket.h>
aea38d80 42#include <linux/netlink.h>
88213476 43
1f0cd86b
LP
44#include "sd-daemon.h"
45#include "sd-bus.h"
46#include "sd-id128.h"
88213476
LP
47#include "log.h"
48#include "util.h"
49e942b2 49#include "mkdir.h"
6b2d0e85 50#include "macro.h"
d7832d2c 51#include "audit.h"
94d82985 52#include "missing.h"
04d391da 53#include "cgroup-util.h"
a258bf26 54#include "strv.h"
9eb977db 55#include "path-util.h"
a41fe3a2 56#include "loopback-setup.h"
4fc9982c 57#include "dev-setup.h"
842f3b0f 58#include "fdset.h"
acbeb427 59#include "build.h"
a5c32cff 60#include "fileio.h"
40ca29a1 61#include "bus-util.h"
1f0cd86b 62#include "bus-error.h"
4ba93280 63#include "ptyfwd.h"
57fb9fb5 64
f2d88580
LP
65#ifndef TTY_GID
66#define TTY_GID 5
67#endif
68
57fb9fb5
LP
/* Policy for connecting the container's journal directory with the
 * host's /var/log/journal/<machine-id>; selected with --link-journal=
 * (or -j) and acted upon by setup_journal(). */
typedef enum LinkJournal {
        /* Do not touch the journal directories at all. */
        LINK_NO,

        /* Default: link only where possible; setup_journal() silently
         * gives up in several situations that are errors in the other
         * modes (e.g. missing machine ID, already a mount point). */
        LINK_AUTO,

        /* The host directory is created and bind mounted into the guest. */
        LINK_HOST,

        /* The host path is made a symlink pointing into the guest tree. */
        LINK_GUEST
} LinkJournal;
88213476
LP
75
76static char *arg_directory = NULL;
687d0825 77static char *arg_user = NULL;
9444b1f2 78static sd_id128_t arg_uuid = {};
7027ff61 79static char *arg_machine = NULL;
9444b1f2 80static const char *arg_slice = NULL;
ff01d048 81static bool arg_private_network = false;
bc2f673e 82static bool arg_read_only = false;
0f0dbc46 83static bool arg_boot = false;
57fb9fb5 84static LinkJournal arg_link_journal = LINK_AUTO;
5076f0cc
LP
85static uint64_t arg_retain =
86 (1ULL << CAP_CHOWN) |
87 (1ULL << CAP_DAC_OVERRIDE) |
88 (1ULL << CAP_DAC_READ_SEARCH) |
89 (1ULL << CAP_FOWNER) |
90 (1ULL << CAP_FSETID) |
91 (1ULL << CAP_IPC_OWNER) |
92 (1ULL << CAP_KILL) |
93 (1ULL << CAP_LEASE) |
94 (1ULL << CAP_LINUX_IMMUTABLE) |
95 (1ULL << CAP_NET_BIND_SERVICE) |
96 (1ULL << CAP_NET_BROADCAST) |
97 (1ULL << CAP_NET_RAW) |
98 (1ULL << CAP_SETGID) |
99 (1ULL << CAP_SETFCAP) |
100 (1ULL << CAP_SETPCAP) |
101 (1ULL << CAP_SETUID) |
102 (1ULL << CAP_SYS_ADMIN) |
103 (1ULL << CAP_SYS_CHROOT) |
104 (1ULL << CAP_SYS_NICE) |
105 (1ULL << CAP_SYS_PTRACE) |
106 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 107 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
108 (1ULL << CAP_SYS_BOOT) |
109 (1ULL << CAP_AUDIT_WRITE) |
110 (1ULL << CAP_AUDIT_CONTROL);
17fe0523
LP
111static char **arg_bind = NULL;
112static char **arg_bind_ro = NULL;
88213476
LP
113
114static int help(void) {
115
116 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
117 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
17fe0523 118 " -h --help Show this help\n"
7027ff61 119 " --version Print version string\n"
17fe0523
LP
120 " -D --directory=NAME Root directory for the container\n"
121 " -b --boot Boot up full system (i.e. invoke init)\n"
122 " -u --user=USER Run the command under specified user or uid\n"
17fe0523 123 " --uuid=UUID Set a specific machine UUID for the container\n"
7027ff61 124 " -M --machine=NAME Set the machine name for the container\n"
9444b1f2 125 " -S --slice=SLICE Place the container in the specified slice\n"
17fe0523
LP
126 " --private-network Disable network in container\n"
127 " --read-only Mount the root directory read-only\n"
128 " --capability=CAP In addition to the default, retain specified\n"
129 " capability\n"
130 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
131 " -j Equivalent to --link-journal=host\n"
132 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
133 " the container\n"
134 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
88213476
LP
135 program_invocation_short_name);
136
137 return 0;
138}
139
140static int parse_argv(int argc, char *argv[]) {
141
a41fe3a2 142 enum {
acbeb427
ZJS
143 ARG_VERSION = 0x100,
144 ARG_PRIVATE_NETWORK,
bc2f673e 145 ARG_UUID,
5076f0cc 146 ARG_READ_ONLY,
57fb9fb5 147 ARG_CAPABILITY,
17fe0523
LP
148 ARG_LINK_JOURNAL,
149 ARG_BIND,
150 ARG_BIND_RO
a41fe3a2
LP
151 };
152
88213476 153 static const struct option options[] = {
ff01d048 154 { "help", no_argument, NULL, 'h' },
acbeb427 155 { "version", no_argument, NULL, ARG_VERSION },
ff01d048
LP
156 { "directory", required_argument, NULL, 'D' },
157 { "user", required_argument, NULL, 'u' },
158 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
0f0dbc46 159 { "boot", no_argument, NULL, 'b' },
144f0fc0 160 { "uuid", required_argument, NULL, ARG_UUID },
bc2f673e 161 { "read-only", no_argument, NULL, ARG_READ_ONLY },
5076f0cc 162 { "capability", required_argument, NULL, ARG_CAPABILITY },
57fb9fb5 163 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
17fe0523
LP
164 { "bind", required_argument, NULL, ARG_BIND },
165 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
7027ff61 166 { "machine", required_argument, NULL, 'M' },
9444b1f2 167 { "slice", required_argument, NULL, 'S' },
eb9da376 168 {}
88213476
LP
169 };
170
9444b1f2 171 int c, r;
88213476
LP
172
173 assert(argc >= 0);
174 assert(argv);
175
bd5a5458 176 while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
88213476
LP
177
178 switch (c) {
179
180 case 'h':
eb9da376 181 return help();
88213476 182
acbeb427
ZJS
183 case ARG_VERSION:
184 puts(PACKAGE_STRING);
185 puts(SYSTEMD_FEATURES);
186 return 0;
187
88213476
LP
188 case 'D':
189 free(arg_directory);
3a74cea5
LP
190 arg_directory = canonicalize_file_name(optarg);
191 if (!arg_directory) {
192 log_error("Failed to canonicalize root directory.");
88213476
LP
193 return -ENOMEM;
194 }
195
196 break;
197
687d0825
MV
198 case 'u':
199 free(arg_user);
7027ff61
LP
200 arg_user = strdup(optarg);
201 if (!arg_user)
202 return log_oom();
687d0825
MV
203
204 break;
205
ff01d048
LP
206 case ARG_PRIVATE_NETWORK:
207 arg_private_network = true;
a41fe3a2
LP
208 break;
209
0f0dbc46
LP
210 case 'b':
211 arg_boot = true;
212 break;
213
144f0fc0 214 case ARG_UUID:
9444b1f2
LP
215 r = sd_id128_from_string(optarg, &arg_uuid);
216 if (r < 0) {
aa96c6cb 217 log_error("Invalid UUID: %s", optarg);
9444b1f2 218 return r;
aa96c6cb 219 }
9444b1f2 220 break;
aa96c6cb 221
9444b1f2
LP
222 case 'S':
223 arg_slice = strdup(optarg);
b3451bed
DH
224 if (!arg_slice)
225 return log_oom();
226
144f0fc0
LP
227 break;
228
7027ff61
LP
229 case 'M':
230 if (!hostname_is_valid(optarg)) {
231 log_error("Invalid machine name: %s", optarg);
232 return -EINVAL;
233 }
234
235 free(arg_machine);
236 arg_machine = strdup(optarg);
237 if (!arg_machine)
238 return log_oom();
239
240 break;
241
bc2f673e
LP
242 case ARG_READ_ONLY:
243 arg_read_only = true;
244 break;
245
5076f0cc
LP
246 case ARG_CAPABILITY: {
247 char *state, *word;
248 size_t length;
249
250 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
251 cap_value_t cap;
252 char *t;
253
254 t = strndup(word, length);
0d0f0c50
SL
255 if (!t)
256 return log_oom();
5076f0cc
LP
257
258 if (cap_from_name(t, &cap) < 0) {
259 log_error("Failed to parse capability %s.", t);
260 free(t);
261 return -EINVAL;
262 }
263
264 free(t);
265 arg_retain |= 1ULL << (uint64_t) cap;
266 }
267
268 break;
269 }
270
57fb9fb5
LP
271 case 'j':
272 arg_link_journal = LINK_GUEST;
273 break;
274
275 case ARG_LINK_JOURNAL:
276 if (streq(optarg, "auto"))
277 arg_link_journal = LINK_AUTO;
278 else if (streq(optarg, "no"))
279 arg_link_journal = LINK_NO;
280 else if (streq(optarg, "guest"))
281 arg_link_journal = LINK_GUEST;
282 else if (streq(optarg, "host"))
283 arg_link_journal = LINK_HOST;
284 else {
285 log_error("Failed to parse link journal mode %s", optarg);
286 return -EINVAL;
287 }
288
289 break;
290
17fe0523
LP
291 case ARG_BIND:
292 case ARG_BIND_RO: {
293 _cleanup_free_ char *a = NULL, *b = NULL;
294 char *e;
295 char ***x;
17fe0523
LP
296
297 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
298
299 e = strchr(optarg, ':');
300 if (e) {
301 a = strndup(optarg, e - optarg);
302 b = strdup(e + 1);
303 } else {
304 a = strdup(optarg);
305 b = strdup(optarg);
306 }
307
308 if (!a || !b)
309 return log_oom();
310
311 if (!path_is_absolute(a) || !path_is_absolute(b)) {
312 log_error("Invalid bind mount specification: %s", optarg);
313 return -EINVAL;
314 }
315
316 r = strv_extend(x, a);
317 if (r < 0)
b3451bed 318 return log_oom();
17fe0523
LP
319
320 r = strv_extend(x, b);
321 if (r < 0)
b3451bed 322 return log_oom();
17fe0523
LP
323
324 break;
325 }
326
88213476
LP
327 case '?':
328 return -EINVAL;
329
330 default:
eb9da376 331 assert_not_reached("Unhandled option");
88213476
LP
332 }
333 }
334
335 return 1;
336}
337
338static int mount_all(const char *dest) {
339
340 typedef struct MountPoint {
341 const char *what;
342 const char *where;
343 const char *type;
344 const char *options;
345 unsigned long flags;
3bd66c05 346 bool fatal;
88213476
LP
347 } MountPoint;
348
349 static const MountPoint mount_table[] = {
4b7a6af4 350 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
b4c59701
LP
351 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
352 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
e65aec12 353 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
635f7d8c 354 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 355 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
ede89845 356 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
635f7d8c 357 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 358#ifdef HAVE_SELINUX
b4c59701
LP
359 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
360 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 361#endif
88213476
LP
362 };
363
364 unsigned k;
365 int r = 0;
366
367 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 368 _cleanup_free_ char *where = NULL;
88213476
LP
369 int t;
370
17fe0523
LP
371 where = strjoin(dest, "/", mount_table[k].where, NULL);
372 if (!where)
373 return log_oom();
88213476 374
e65aec12 375 t = path_is_mount_point(where, true);
68fb0892 376 if (t < 0) {
88213476 377 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
88213476
LP
378
379 if (r == 0)
380 r = t;
381
382 continue;
383 }
384
9c1c7f71
LP
385 /* Skip this entry if it is not a remount. */
386 if (mount_table[k].what && t > 0)
014a9c77
LP
387 continue;
388
17fe0523 389 mkdir_p(where, 0755);
88213476
LP
390
391 if (mount(mount_table[k].what,
392 where,
393 mount_table[k].type,
394 mount_table[k].flags,
3bd66c05
LP
395 mount_table[k].options) < 0 &&
396 mount_table[k].fatal) {
88213476
LP
397
398 log_error("mount(%s) failed: %m", where);
399
400 if (r == 0)
401 r = -errno;
402 }
88213476
LP
403 }
404
e58a1277
LP
405 return r;
406}
f8440af5 407
17fe0523
LP
408static int mount_binds(const char *dest, char **l, unsigned long flags) {
409 char **x, **y;
410
411 STRV_FOREACH_PAIR(x, y, l) {
412 _cleanup_free_ char *where = NULL;
d2421337
DR
413 struct stat source_st, dest_st;
414
415 if (stat(*x, &source_st) < 0) {
416 log_error("failed to stat %s: %m", *x);
417 return -errno;
418 }
17fe0523
LP
419
420 where = strjoin(dest, "/", *y, NULL);
421 if (!where)
422 return log_oom();
423
d2421337
DR
424 if (stat(where, &dest_st) == 0) {
425 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
cecf24e7 426 log_error("The file types of %s and %s do not match. Refusing bind mount",
d2421337
DR
427 *x, where);
428 return -EINVAL;
429 }
430 } else {
431 /* Create the mount point, but be conservative -- refuse to create block
432 * and char devices. */
433 if (S_ISDIR(source_st.st_mode))
434 mkdir_p_label(where, 0755);
435 else if (S_ISFIFO(source_st.st_mode))
436 mkfifo(where, 0644);
437 else if (S_ISSOCK(source_st.st_mode))
438 mknod(where, 0644 | S_IFSOCK, 0);
439 else if (S_ISREG(source_st.st_mode))
440 touch(where);
441 else {
442 log_error("Refusing to create mountpoint for file: %s", *x);
443 return -ENOTSUP;
444 }
445 }
17fe0523
LP
446
447 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
448 log_error("mount(%s) failed: %m", where);
449 return -errno;
450 }
451
452 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
453 log_error("mount(%s) failed: %m", where);
454 return -errno;
455 }
456 }
457
458 return 0;
459}
460
e58a1277 461static int setup_timezone(const char *dest) {
d4036145
LP
462 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
463 char *z, *y;
464 int r;
f8440af5 465
e58a1277
LP
466 assert(dest);
467
468 /* Fix the timezone, if possible */
d4036145
LP
469 r = readlink_malloc("/etc/localtime", &p);
470 if (r < 0) {
471 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
472 return 0;
473 }
474
475 z = path_startswith(p, "../usr/share/zoneinfo/");
476 if (!z)
477 z = path_startswith(p, "/usr/share/zoneinfo/");
478 if (!z) {
479 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
480 return 0;
481 }
482
04bc4a3f
LP
483 where = strappend(dest, "/etc/localtime");
484 if (!where)
0d0f0c50 485 return log_oom();
715ac17a 486
d4036145
LP
487 r = readlink_malloc(where, &q);
488 if (r >= 0) {
489 y = path_startswith(q, "../usr/share/zoneinfo/");
490 if (!y)
491 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 492
4d1c38b8 493
d4036145
LP
494 /* Already pointing to the right place? Then do nothing .. */
495 if (y && streq(y, z))
496 return 0;
497 }
498
499 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
500 if (!check)
0d0f0c50 501 return log_oom();
4d1c38b8 502
d4036145
LP
503 if (access(check, F_OK) < 0) {
504 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
505 return 0;
506 }
68fb0892 507
d4036145
LP
508 what = strappend("../usr/share/zoneinfo/", z);
509 if (!what)
510 return log_oom();
511
512 unlink(where);
513 if (symlink(what, where) < 0) {
514 log_error("Failed to correct timezone of container: %m");
515 return 0;
516 }
e58a1277
LP
517
518 return 0;
88213476
LP
519}
520
2547bb41 521static int setup_resolv_conf(const char *dest) {
f333fbb1 522 char _cleanup_free_ *where = NULL;
2547bb41
LP
523
524 assert(dest);
525
526 if (arg_private_network)
527 return 0;
528
529 /* Fix resolv.conf, if possible */
04bc4a3f
LP
530 where = strappend(dest, "/etc/resolv.conf");
531 if (!where)
0d0f0c50 532 return log_oom();
2547bb41 533
77e63faf
LP
534 /* We don't really care for the results of this really. If it
535 * fails, it fails, but meh... */
51045322 536 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
2547bb41
LP
537
538 return 0;
539}
540
04bc4a3f 541static int setup_boot_id(const char *dest) {
7fd1b19b 542 _cleanup_free_ char *from = NULL, *to = NULL;
04bc4a3f
LP
543 sd_id128_t rnd;
544 char as_uuid[37];
545 int r;
546
547 assert(dest);
548
549 /* Generate a new randomized boot ID, so that each boot-up of
550 * the container gets a new one */
551
552 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 553 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
554 if (!from || !to)
555 return log_oom();
04bc4a3f
LP
556
557 r = sd_id128_randomize(&rnd);
558 if (r < 0) {
559 log_error("Failed to generate random boot id: %s", strerror(-r));
ed8b7a3e 560 return r;
04bc4a3f
LP
561 }
562
563 snprintf(as_uuid, sizeof(as_uuid),
564 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
565 SD_ID128_FORMAT_VAL(rnd));
566 char_array_0(as_uuid);
567
574d5f2d 568 r = write_string_file(from, as_uuid);
04bc4a3f
LP
569 if (r < 0) {
570 log_error("Failed to write boot id: %s", strerror(-r));
ed8b7a3e 571 return r;
04bc4a3f
LP
572 }
573
574 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
575 log_error("Failed to bind mount boot id: %m");
576 r = -errno;
10d18763
ZJS
577 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
578 log_warning("Failed to make boot id read-only: %m");
04bc4a3f
LP
579
580 unlink(from);
04bc4a3f
LP
581 return r;
582}
583
e58a1277 584static int copy_devnodes(const char *dest) {
88213476
LP
585
586 static const char devnodes[] =
587 "null\0"
588 "zero\0"
589 "full\0"
590 "random\0"
591 "urandom\0"
f2d88580 592 "tty\0";
88213476
LP
593
594 const char *d;
e58a1277 595 int r = 0;
7fd1b19b 596 _cleanup_umask_ mode_t u;
a258bf26
LP
597
598 assert(dest);
124640f1
LP
599
600 u = umask(0000);
88213476
LP
601
602 NULSTR_FOREACH(d, devnodes) {
e58a1277 603 struct stat st;
7fd1b19b 604 _cleanup_free_ char *from = NULL, *to = NULL;
88213476
LP
605
606 asprintf(&from, "/dev/%s", d);
607 asprintf(&to, "%s/dev/%s", dest, d);
608
609 if (!from || !to) {
ed8b7a3e 610 log_oom();
a258bf26 611
88213476
LP
612 if (r == 0)
613 r = -ENOMEM;
614
615 break;
616 }
617
618 if (stat(from, &st) < 0) {
619
620 if (errno != ENOENT) {
621 log_error("Failed to stat %s: %m", from);
88213476
LP
622 if (r == 0)
623 r = -errno;
624 }
625
a258bf26 626 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 627
ed8b7a3e 628 log_error("%s is not a char or block device, cannot copy", from);
a258bf26
LP
629 if (r == 0)
630 r = -EIO;
631
632 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
633
634 log_error("mknod(%s) failed: %m", dest);
635 if (r == 0)
636 r = -errno;
88213476 637 }
88213476
LP
638 }
639
e58a1277
LP
640 return r;
641}
88213476 642
f2d88580
LP
643static int setup_ptmx(const char *dest) {
644 _cleanup_free_ char *p = NULL;
645
646 p = strappend(dest, "/dev/ptmx");
647 if (!p)
648 return log_oom();
649
650 if (symlink("pts/ptmx", p) < 0) {
651 log_error("Failed to create /dev/ptmx symlink: %m");
652 return -errno;
653 }
654
655 return 0;
656}
657
e58a1277
LP
658static int setup_dev_console(const char *dest, const char *console) {
659 struct stat st;
7fd1b19b 660 _cleanup_free_ char *to = NULL;
e58a1277 661 int r;
7fd1b19b 662 _cleanup_umask_ mode_t u;
e58a1277
LP
663
664 assert(dest);
665 assert(console);
666
667 u = umask(0000);
668
669 if (stat(console, &st) < 0) {
670 log_error("Failed to stat %s: %m", console);
25ea79fe 671 return -errno;
88213476 672
a258bf26 673 } else if (!S_ISCHR(st.st_mode)) {
25ea79fe
ZJS
674 log_error("/dev/console is not a char device");
675 return -EIO;
e58a1277 676 }
88213476 677
e58a1277
LP
678 r = chmod_and_chown(console, 0600, 0, 0);
679 if (r < 0) {
680 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
25ea79fe 681 return r;
a258bf26 682 }
88213476 683
25ea79fe
ZJS
684 if (asprintf(&to, "%s/dev/console", dest) < 0)
685 return log_oom();
88213476 686
a258bf26
LP
687 /* We need to bind mount the right tty to /dev/console since
688 * ptys can only exist on pts file systems. To have something
689 * to bind mount things on we create a device node first, that
690 * has the right major/minor (note that the major minor
691 * doesn't actually matter here, since we mount it over
692 * anyway). */
693
e58a1277
LP
694 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
695 log_error("mknod() for /dev/console failed: %m");
25ea79fe 696 return -errno;
e58a1277 697 }
a258bf26
LP
698
699 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
e58a1277 700 log_error("Bind mount for /dev/console failed: %m");
25ea79fe 701 return -errno;
a258bf26
LP
702 }
703
25ea79fe 704 return 0;
e58a1277
LP
705}
706
707static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 708 _cleanup_free_ char *from = NULL, *to = NULL;
e58a1277 709 int r, fd, k;
7fd1b19b 710 _cleanup_umask_ mode_t u;
e58a1277
LP
711 union {
712 struct cmsghdr cmsghdr;
713 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
714 } control = {};
715 struct msghdr mh = {
716 .msg_control = &control,
717 .msg_controllen = sizeof(control),
718 };
e58a1277
LP
719 struct cmsghdr *cmsg;
720
721 assert(dest);
722 assert(kmsg_socket >= 0);
a258bf26 723
e58a1277 724 u = umask(0000);
a258bf26 725
f1e5dfe2
LP
726 /* We create the kmsg FIFO as /dev/kmsg, but immediately
727 * delete it after bind mounting it to /proc/kmsg. While FIFOs
728 * on the reading side behave very similar to /proc/kmsg,
729 * their writing side behaves differently from /dev/kmsg in
730 * that writing blocks when nothing is reading. In order to
731 * avoid any problems with containers deadlocking due to this
732 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
733 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
734 asprintf(&to, "%s/proc/kmsg", dest) < 0)
735 return log_oom();
e58a1277
LP
736
737 if (mkfifo(from, 0600) < 0) {
738 log_error("mkfifo() for /dev/kmsg failed: %m");
25ea79fe 739 return -errno;
e58a1277
LP
740 }
741
742 r = chmod_and_chown(from, 0600, 0, 0);
743 if (r < 0) {
744 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
25ea79fe 745 return r;
e58a1277
LP
746 }
747
748 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
749 log_error("Bind mount for /proc/kmsg failed: %m");
25ea79fe 750 return -errno;
e58a1277
LP
751 }
752
753 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
754 if (fd < 0) {
755 log_error("Failed to open fifo: %m");
25ea79fe 756 return -errno;
e58a1277
LP
757 }
758
e58a1277
LP
759 cmsg = CMSG_FIRSTHDR(&mh);
760 cmsg->cmsg_level = SOL_SOCKET;
761 cmsg->cmsg_type = SCM_RIGHTS;
762 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
763 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
764
765 mh.msg_controllen = cmsg->cmsg_len;
766
767 /* Store away the fd in the socket, so that it stays open as
768 * long as we run the child */
769 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
770 close_nointr_nofail(fd);
771
772 if (k < 0) {
773 log_error("Failed to send FIFO fd: %m");
25ea79fe 774 return -errno;
a258bf26
LP
775 }
776
f1e5dfe2
LP
777 /* And now make the FIFO unavailable as /dev/kmsg... */
778 unlink(from);
25ea79fe 779 return 0;
88213476
LP
780}
781
3a74cea5 782static int setup_hostname(void) {
3a74cea5 783
7027ff61
LP
784 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
785 return -errno;
3a74cea5 786
7027ff61 787 return 0;
3a74cea5
LP
788}
789
57fb9fb5
LP
790static int setup_journal(const char *directory) {
791 sd_id128_t machine_id;
7fd1b19b 792 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 793 char *id;
57fb9fb5
LP
794 int r;
795
796 if (arg_link_journal == LINK_NO)
797 return 0;
798
799 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
800 if (!p)
801 return log_oom();
57fb9fb5
LP
802
803 r = read_one_line_file(p, &b);
27407a01
ZJS
804 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
805 return 0;
806 else if (r < 0) {
807 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
57fb9fb5
LP
808 return r;
809 }
810
27407a01
ZJS
811 id = strstrip(b);
812 if (isempty(id) && arg_link_journal == LINK_AUTO)
813 return 0;
57fb9fb5 814
27407a01
ZJS
815 /* Verify validity */
816 r = sd_id128_from_string(id, &machine_id);
57fb9fb5 817 if (r < 0) {
27407a01
ZJS
818 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
819 return r;
57fb9fb5
LP
820 }
821
822 free(p);
27407a01
ZJS
823 p = strappend("/var/log/journal/", id);
824 q = strjoin(directory, "/var/log/journal/", id, NULL);
825 if (!p || !q)
826 return log_oom();
827
828 if (path_is_mount_point(p, false) > 0) {
829 if (arg_link_journal != LINK_AUTO) {
830 log_error("%s: already a mount point, refusing to use for journal", p);
831 return -EEXIST;
832 }
833
834 return 0;
57fb9fb5
LP
835 }
836
27407a01 837 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 838 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
839 log_error("%s: already a mount point, refusing to use for journal", q);
840 return -EEXIST;
57fb9fb5
LP
841 }
842
27407a01 843 return 0;
57fb9fb5
LP
844 }
845
846 r = readlink_and_make_absolute(p, &d);
847 if (r >= 0) {
848 if ((arg_link_journal == LINK_GUEST ||
849 arg_link_journal == LINK_AUTO) &&
850 path_equal(d, q)) {
851
27407a01
ZJS
852 r = mkdir_p(q, 0755);
853 if (r < 0)
854 log_warning("failed to create directory %s: %m", q);
855 return 0;
57fb9fb5
LP
856 }
857
858 if (unlink(p) < 0) {
859 log_error("Failed to remove symlink %s: %m", p);
27407a01 860 return -errno;
57fb9fb5
LP
861 }
862 } else if (r == -EINVAL) {
863
864 if (arg_link_journal == LINK_GUEST &&
865 rmdir(p) < 0) {
866
27407a01
ZJS
867 if (errno == ENOTDIR) {
868 log_error("%s already exists and is neither a symlink nor a directory", p);
869 return r;
870 } else {
57fb9fb5 871 log_error("Failed to remove %s: %m", p);
27407a01 872 return -errno;
57fb9fb5 873 }
57fb9fb5
LP
874 }
875 } else if (r != -ENOENT) {
876 log_error("readlink(%s) failed: %m", p);
27407a01 877 return r;
57fb9fb5
LP
878 }
879
880 if (arg_link_journal == LINK_GUEST) {
881
882 if (symlink(q, p) < 0) {
883 log_error("Failed to symlink %s to %s: %m", q, p);
27407a01 884 return -errno;
57fb9fb5
LP
885 }
886
27407a01
ZJS
887 r = mkdir_p(q, 0755);
888 if (r < 0)
889 log_warning("failed to create directory %s: %m", q);
890 return 0;
57fb9fb5
LP
891 }
892
893 if (arg_link_journal == LINK_HOST) {
894 r = mkdir_p(p, 0755);
895 if (r < 0) {
896 log_error("Failed to create %s: %m", p);
27407a01 897 return r;
57fb9fb5
LP
898 }
899
27407a01
ZJS
900 } else if (access(p, F_OK) < 0)
901 return 0;
57fb9fb5
LP
902
903 if (dir_is_empty(q) == 0) {
904 log_error("%s not empty.", q);
27407a01 905 return -ENOTEMPTY;
57fb9fb5
LP
906 }
907
908 r = mkdir_p(q, 0755);
909 if (r < 0) {
910 log_error("Failed to create %s: %m", q);
27407a01 911 return r;
57fb9fb5
LP
912 }
913
914 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
915 log_error("Failed to bind mount journal from host into guest: %m");
27407a01 916 return -errno;
57fb9fb5
LP
917 }
918
27407a01 919 return 0;
57fb9fb5
LP
920}
921
88213476 922static int drop_capabilities(void) {
5076f0cc 923 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
924}
925
9444b1f2
LP
926static int register_machine(void) {
927 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
928 _cleanup_bus_unref_ sd_bus *bus = NULL;
929 int r;
930
931 r = sd_bus_open_system(&bus);
932 if (r < 0) {
933 log_error("Failed to open system bus: %s", strerror(-r));
934 return r;
935 }
936
937 r = sd_bus_call_method(
938 bus,
1ee306e1
LP
939 "org.freedesktop.machine1",
940 "/org/freedesktop/machine1",
941 "org.freedesktop.machine1.Manager",
9444b1f2
LP
942 "CreateMachine",
943 &error,
944 NULL,
6a4e0b13 945 "sayssusa(sv)",
9444b1f2 946 arg_machine,
40ca29a1 947 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
9444b1f2
LP
948 "nspawn",
949 "container",
950 (uint32_t) 0,
6a4e0b13 951 strempty(arg_directory),
88212f7b 952 !isempty(arg_slice), "Slice", "s", arg_slice);
9444b1f2 953 if (r < 0) {
1f0cd86b
LP
954 log_error("Failed to register machine: %s", bus_error_message(&error, r));
955 return r;
956 }
957
958 return 0;
959}
960
961static int terminate_machine(pid_t pid) {
962 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
963 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
964 _cleanup_bus_unref_ sd_bus *bus = NULL;
965 const char *path;
966 int r;
967
76b54375 968 r = sd_bus_default_system(&bus);
1f0cd86b
LP
969 if (r < 0) {
970 log_error("Failed to open system bus: %s", strerror(-r));
971 return r;
972 }
973
974 r = sd_bus_call_method(
975 bus,
976 "org.freedesktop.machine1",
977 "/org/freedesktop/machine1",
978 "org.freedesktop.machine1.Manager",
979 "GetMachineByPID",
980 &error,
981 &reply,
982 "u",
983 (uint32_t) pid);
984 if (r < 0) {
985 /* Note that the machine might already have been
986 * cleaned up automatically, hence don't consider it a
987 * failure if we cannot get the machine object. */
988 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
989 return 0;
990 }
991
992 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
993 if (r < 0)
994 return bus_log_parse_error(r);
9444b1f2 995
1f0cd86b
LP
996 r = sd_bus_call_method(
997 bus,
998 "org.freedesktop.machine1",
999 path,
1000 "org.freedesktop.machine1.Machine",
1001 "Terminate",
1002 &error,
1003 NULL,
1004 NULL);
1005 if (r < 0) {
1006 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1007 return 0;
1008 }
1009
9444b1f2
LP
1010 return 0;
1011}
1012
77b6e194
LP
/* Probe for the kernel audit subsystem: returns true if a
 * NETLINK_AUDIT socket can be created, false otherwise. The probe
 * socket is closed again right away. */
static bool audit_enabled(void) {
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

        if (fd < 0)
                return false;

        close_nointr_nofail(fd);
        return true;
}
1023
88213476
LP
1024int main(int argc, char *argv[]) {
1025 pid_t pid = 0;
04d391da 1026 int r = EXIT_FAILURE, k;
7027ff61
LP
1027 _cleanup_close_ int master = -1;
1028 int n_fd_passed;
a258bf26 1029 const char *console = NULL;
a258bf26 1030 sigset_t mask;
04d39279 1031 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
51d122af 1032 _cleanup_fdset_free_ FDSet *fds = NULL;
88213476
LP
1033
1034 log_parse_environment();
1035 log_open();
1036
05947bef
LP
1037 k = parse_argv(argc, argv);
1038 if (k < 0)
88213476 1039 goto finish;
05947bef
LP
1040 else if (k == 0) {
1041 r = EXIT_SUCCESS;
1042 goto finish;
1043 }
88213476
LP
1044
1045 if (arg_directory) {
1046 char *p;
1047
1048 p = path_make_absolute_cwd(arg_directory);
1049 free(arg_directory);
1050 arg_directory = p;
1051 } else
1052 arg_directory = get_current_dir_name();
1053
1054 if (!arg_directory) {
a383724e 1055 log_error("Failed to determine path, please use -D.");
88213476
LP
1056 goto finish;
1057 }
1058
1059 path_kill_slashes(arg_directory);
1060
7027ff61
LP
1061 if (!arg_machine) {
1062 arg_machine = strdup(path_get_file_name(arg_directory));
1063 if (!arg_machine) {
1064 log_oom();
1065 goto finish;
1066 }
1067
e724b063 1068 hostname_cleanup(arg_machine, false);
7027ff61
LP
1069 if (isempty(arg_machine)) {
1070 log_error("Failed to determine machine name automatically, please use -M.");
1071 goto finish;
1072 }
1073 }
1074
88213476
LP
1075 if (geteuid() != 0) {
1076 log_error("Need to be root.");
1077 goto finish;
1078 }
1079
04d391da
LP
1080 if (sd_booted() <= 0) {
1081 log_error("Not running on a systemd system.");
1082 goto finish;
1083 }
1084
c2384970 1085 if (arg_boot && audit_enabled()) {
77b6e194
LP
1086 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1087 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1088 "line before using systemd-nspawn. Sleeping for 5s...\n");
1089 sleep(5);
1090 }
1091
88213476 1092 if (path_equal(arg_directory, "/")) {
6df6b939 1093 log_error("Spawning container on root directory not supported.");
88213476
LP
1094 goto finish;
1095 }
1096
66060897 1097 if (path_is_os_tree(arg_directory) <= 0) {
f8964235 1098 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
88213476
LP
1099 goto finish;
1100 }
1101
842f3b0f
LP
1102 log_close();
1103 n_fd_passed = sd_listen_fds(false);
1104 if (n_fd_passed > 0) {
1105 k = fdset_new_listen_fds(&fds, false);
1106 if (k < 0) {
1107 log_error("Failed to collect file descriptors: %s", strerror(-k));
1108 goto finish;
1109 }
1110 }
1111 fdset_close_others(fds);
1112 log_open();
1113
db7feb7e
LP
1114 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1115 if (master < 0) {
a258bf26
LP
1116 log_error("Failed to acquire pseudo tty: %m");
1117 goto finish;
1118 }
1119
db7feb7e
LP
1120 console = ptsname(master);
1121 if (!console) {
a258bf26
LP
1122 log_error("Failed to determine tty name: %m");
1123 goto finish;
1124 }
1125
04d39279 1126 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
a258bf26
LP
1127
1128 if (unlockpt(master) < 0) {
1129 log_error("Failed to unlock tty: %m");
1130 goto finish;
1131 }
1132
e58a1277 1133 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
7027ff61 1134 log_error("Failed to create kmsg socket pair.");
e58a1277
LP
1135 goto finish;
1136 }
1137
05947bef
LP
1138 sd_notify(0, "READY=1");
1139
a258bf26
LP
1140 assert_se(sigemptyset(&mask) == 0);
1141 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1142 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1143
d87be9b0
LP
1144 for (;;) {
1145 siginfo_t status;
a383724e 1146
d87be9b0
LP
1147 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1148 if (pid < 0) {
1149 if (errno == EINVAL)
1150 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1151 else
1152 log_error("clone() failed: %m");
a258bf26 1153
d87be9b0
LP
1154 goto finish;
1155 }
a258bf26 1156
d87be9b0
LP
1157 if (pid == 0) {
1158 /* child */
d87be9b0
LP
1159 const char *home = NULL;
1160 uid_t uid = (uid_t) -1;
1161 gid_t gid = (gid_t) -1;
5674767e 1162 unsigned n_env = 2;
d87be9b0
LP
1163 const char *envp[] = {
1164 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1165 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1166 NULL, /* TERM */
1167 NULL, /* HOME */
1168 NULL, /* USER */
1169 NULL, /* LOGNAME */
1170 NULL, /* container_uuid */
842f3b0f
LP
1171 NULL, /* LISTEN_FDS */
1172 NULL, /* LISTEN_PID */
d87be9b0
LP
1173 NULL
1174 };
a258bf26 1175
5674767e
ZJS
1176 envp[n_env] = strv_find_prefix(environ, "TERM=");
1177 if (envp[n_env])
1178 n_env ++;
a258bf26 1179
d87be9b0 1180 close_nointr_nofail(master);
842f3b0f 1181 master = -1;
a258bf26 1182
d87be9b0
LP
1183 close_nointr(STDIN_FILENO);
1184 close_nointr(STDOUT_FILENO);
1185 close_nointr(STDERR_FILENO);
db7feb7e 1186
842f3b0f
LP
1187 close_nointr_nofail(kmsg_socket_pair[0]);
1188 kmsg_socket_pair[0] = -1;
a258bf26 1189
d87be9b0 1190 reset_all_signal_handlers();
88213476 1191
d87be9b0
LP
1192 assert_se(sigemptyset(&mask) == 0);
1193 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
f5c1b9ee 1194
842f3b0f
LP
1195 k = open_terminal(console, O_RDWR);
1196 if (k != STDIN_FILENO) {
1197 if (k >= 0) {
1198 close_nointr_nofail(k);
1199 k = -EINVAL;
1200 }
1201
1202 log_error("Failed to open console: %s", strerror(-k));
1203 goto child_fail;
1204 }
1205
1206 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1207 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1208 log_error("Failed to duplicate console: %m");
d87be9b0 1209 goto child_fail;
842f3b0f 1210 }
bc2f673e 1211
d87be9b0
LP
1212 if (setsid() < 0) {
1213 log_error("setsid() failed: %m");
bc2f673e
LP
1214 goto child_fail;
1215 }
1216
d87be9b0
LP
1217 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1218 log_error("PR_SET_PDEATHSIG failed: %m");
1219 goto child_fail;
1220 }
e58a1277 1221
9444b1f2
LP
1222 r = register_machine();
1223 if (r < 0)
1224 goto finish;
1225
d87be9b0
LP
1226 /* Mark everything as slave, so that we still
1227 * receive mounts from the real root, but don't
1228 * propagate mounts to the real root. */
1229 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1230 log_error("MS_SLAVE|MS_REC failed: %m");
1231 goto child_fail;
1232 }
04bc4a3f 1233
d87be9b0
LP
1234 /* Turn directory into bind mount */
1235 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1236 log_error("Failed to make bind mount.");
1237 goto child_fail;
1238 }
88213476 1239
d87be9b0
LP
1240 if (arg_read_only)
1241 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1242 log_error("Failed to make read-only.");
1243 goto child_fail;
1244 }
2547bb41 1245
d87be9b0
LP
1246 if (mount_all(arg_directory) < 0)
1247 goto child_fail;
57fb9fb5 1248
d87be9b0
LP
1249 if (copy_devnodes(arg_directory) < 0)
1250 goto child_fail;
a258bf26 1251
f2d88580
LP
1252 if (setup_ptmx(arg_directory) < 0)
1253 goto child_fail;
1254
d87be9b0 1255 dev_setup(arg_directory);
88213476 1256
d87be9b0
LP
1257 if (setup_dev_console(arg_directory, console) < 0)
1258 goto child_fail;
88213476 1259
d87be9b0
LP
1260 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1261 goto child_fail;
88213476 1262
d87be9b0 1263 close_nointr_nofail(kmsg_socket_pair[1]);
842f3b0f 1264 kmsg_socket_pair[1] = -1;
a258bf26 1265
d87be9b0
LP
1266 if (setup_boot_id(arg_directory) < 0)
1267 goto child_fail;
a41fe3a2 1268
d87be9b0
LP
1269 if (setup_timezone(arg_directory) < 0)
1270 goto child_fail;
88213476 1271
d87be9b0
LP
1272 if (setup_resolv_conf(arg_directory) < 0)
1273 goto child_fail;
687d0825 1274
d87be9b0 1275 if (setup_journal(arg_directory) < 0)
687d0825 1276 goto child_fail;
687d0825 1277
17fe0523
LP
1278 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1279 goto child_fail;
1280
1281 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1282 goto child_fail;
1283
d87be9b0
LP
1284 if (chdir(arg_directory) < 0) {
1285 log_error("chdir(%s) failed: %m", arg_directory);
687d0825
MV
1286 goto child_fail;
1287 }
1288
d87be9b0
LP
1289 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1290 log_error("mount(MS_MOVE) failed: %m");
687d0825
MV
1291 goto child_fail;
1292 }
1293
d87be9b0
LP
1294 if (chroot(".") < 0) {
1295 log_error("chroot() failed: %m");
687d0825
MV
1296 goto child_fail;
1297 }
1298
d87be9b0
LP
1299 if (chdir("/") < 0) {
1300 log_error("chdir() failed: %m");
687d0825
MV
1301 goto child_fail;
1302 }
1303
d87be9b0
LP
1304 umask(0022);
1305
1306 loopback_setup();
1307
1308 if (drop_capabilities() < 0) {
1309 log_error("drop_capabilities() failed: %m");
687d0825
MV
1310 goto child_fail;
1311 }
687d0825 1312
d87be9b0
LP
1313 if (arg_user) {
1314
963ddb91
LP
1315 /* Note that this resolves user names
1316 * inside the container, and hence
1317 * accesses the NSS modules from the
1318 * container and not the host. This is
1319 * a bit weird... */
1320
d87be9b0
LP
1321 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1322 log_error("get_user_creds() failed: %m");
1323 goto child_fail;
1324 }
1325
1326 if (mkdir_parents_label(home, 0775) < 0) {
1327 log_error("mkdir_parents_label() failed: %m");
1328 goto child_fail;
1329 }
1330
1331 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1332 log_error("mkdir_safe_label() failed: %m");
1333 goto child_fail;
1334 }
1335
1336 if (initgroups((const char*)arg_user, gid) < 0) {
1337 log_error("initgroups() failed: %m");
1338 goto child_fail;
1339 }
144f0fc0 1340
d87be9b0
LP
1341 if (setresgid(gid, gid, gid) < 0) {
1342 log_error("setregid() failed: %m");
1343 goto child_fail;
1344 }
1345
1346 if (setresuid(uid, uid, uid) < 0) {
1347 log_error("setreuid() failed: %m");
1348 goto child_fail;
1349 }
3c957acf
LP
1350 } else {
1351 /* Reset everything fully to 0, just in case */
1352
1353 if (setgroups(0, NULL) < 0) {
1354 log_error("setgroups() failed: %m");
1355 goto child_fail;
1356 }
1357
1358 if (setresgid(0, 0, 0) < 0) {
1359 log_error("setregid() failed: %m");
1360 goto child_fail;
1361 }
1362
1363 if (setresuid(0, 0, 0) < 0) {
1364 log_error("setreuid() failed: %m");
1365 goto child_fail;
1366 }
d87be9b0
LP
1367 }
1368
842f3b0f
LP
1369 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1370 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1371 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 1372 log_oom();
144f0fc0
LP
1373 goto child_fail;
1374 }
687d0825 1375
9444b1f2
LP
1376 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1377 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
842f3b0f
LP
1378 log_oom();
1379 goto child_fail;
1380 }
1381 }
1382
1383 if (fdset_size(fds) > 0) {
1384 k = fdset_cloexec(fds, false);
1385 if (k < 0) {
1386 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1387 goto child_fail;
1388 }
1389
1390 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 1391 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0
LP
1392 log_oom();
1393 goto child_fail;
1394 }
1395 }
1396
1397 setup_hostname();
1398
1399 if (arg_boot) {
1400 char **a;
1401 size_t l;
88213476 1402
d87be9b0 1403 /* Automatically search for the init system */
0f0dbc46 1404
d87be9b0
LP
1405 l = 1 + argc - optind;
1406 a = newa(char*, l + 1);
1407 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 1408
d87be9b0
LP
1409 a[0] = (char*) "/usr/lib/systemd/systemd";
1410 execve(a[0], a, (char**) envp);
0f0dbc46 1411
d87be9b0
LP
1412 a[0] = (char*) "/lib/systemd/systemd";
1413 execve(a[0], a, (char**) envp);
0f0dbc46 1414
d87be9b0
LP
1415 a[0] = (char*) "/sbin/init";
1416 execve(a[0], a, (char**) envp);
1417 } else if (argc > optind)
1418 execvpe(argv[optind], argv + optind, (char**) envp);
1419 else {
1420 chdir(home ? home : "/root");
1421 execle("/bin/bash", "-bash", NULL, (char**) envp);
1422 }
1423
1424 log_error("execv() failed: %m");
0f0dbc46 1425
d87be9b0
LP
1426 child_fail:
1427 _exit(EXIT_FAILURE);
da5b3bad 1428 }
88213476 1429
842f3b0f
LP
1430 fdset_free(fds);
1431 fds = NULL;
1432
04d39279
LP
1433 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1434 if (k < 0) {
1435 r = EXIT_FAILURE;
1436 break;
1437 }
88213476 1438
04d39279
LP
1439 putc('\n', stdout);
1440
1441 /* Kill if it is not dead yet anyway */
1f0cd86b
LP
1442 terminate_machine(pid);
1443
1444 /* Redundant, but better safe than sorry */
04d39279 1445 kill(pid, SIGKILL);
a258bf26 1446
05947bef 1447 k = wait_for_terminate(pid, &status);
04d39279
LP
1448 pid = 0;
1449
05947bef 1450 if (k < 0) {
d87be9b0
LP
1451 r = EXIT_FAILURE;
1452 break;
1453 }
a258bf26 1454
d87be9b0 1455 if (status.si_code == CLD_EXITED) {
a5f5f8a0 1456 r = status.si_status;
d87be9b0 1457 if (status.si_status != 0) {
04d39279 1458 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
d87be9b0
LP
1459 break;
1460 }
1461
04d39279 1462 log_debug("Container %s exited successfully.", arg_machine);
d87be9b0
LP
1463 break;
1464 } else if (status.si_code == CLD_KILLED &&
1465 status.si_status == SIGINT) {
04d39279 1466 log_info("Container %s has been shut down.", arg_machine);
d87be9b0
LP
1467 r = 0;
1468 break;
1469 } else if (status.si_code == CLD_KILLED &&
1470 status.si_status == SIGHUP) {
04d39279 1471 log_info("Container %s is being rebooted.", arg_machine);
d87be9b0
LP
1472 continue;
1473 } else if (status.si_code == CLD_KILLED ||
1474 status.si_code == CLD_DUMPED) {
88213476 1475
04d39279 1476 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
d87be9b0
LP
1477 r = EXIT_FAILURE;
1478 break;
1479 } else {
04d39279 1480 log_error("Container %s failed due to unknown reason.", arg_machine);
d87be9b0
LP
1481 r = EXIT_FAILURE;
1482 break;
1483 }
1484 }
88213476
LP
1485
1486finish:
9444b1f2
LP
1487 if (pid > 0)
1488 kill(pid, SIGKILL);
88213476 1489
04d391da 1490 free(arg_directory);
7027ff61 1491 free(arg_machine);
88213476
LP
1492
1493 return r;
1494}