]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
update TODO
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
a258bf26
LP
36#include <termios.h>
37#include <sys/signalfd.h>
687d0825 38#include <grp.h>
5ed27dbd 39#include <linux/fs.h>
9537eab0
LP
40#include <sys/un.h>
41#include <sys/socket.h>
aea38d80 42#include <linux/netlink.h>
354bfd2b 43#include <sys/eventfd.h>
5d63309c 44#ifdef HAVE_SELINUX
a8828ed9
DW
45#include <selinux/selinux.h>
46#endif
88213476 47
1f0cd86b
LP
48#include "sd-daemon.h"
49#include "sd-bus.h"
50#include "sd-id128.h"
88213476
LP
51#include "log.h"
52#include "util.h"
49e942b2 53#include "mkdir.h"
6b2d0e85 54#include "macro.h"
d7832d2c 55#include "audit.h"
94d82985 56#include "missing.h"
04d391da 57#include "cgroup-util.h"
a258bf26 58#include "strv.h"
9eb977db 59#include "path-util.h"
a41fe3a2 60#include "loopback-setup.h"
4fc9982c 61#include "dev-setup.h"
842f3b0f 62#include "fdset.h"
acbeb427 63#include "build.h"
a5c32cff 64#include "fileio.h"
40ca29a1 65#include "bus-util.h"
1f0cd86b 66#include "bus-error.h"
4ba93280 67#include "ptyfwd.h"
9bd37b40 68#include "bus-kernel.h"
f4889f65 69#include "env-util.h"
7f112f50 70#include "def.h"
f2d88580 71
57fb9fb5
LP
/* Selects how setup_journal() connects the container's journal directory
 * with the host's /var/log/journal/<machine-id>. */
typedef enum LinkJournal {
        LINK_NO,        /* leave the journals unconnected */
        LINK_AUTO,      /* default: connect if possible, quietly skip otherwise */
        LINK_HOST,      /* host directory is created and bind mounted into the guest */
        LINK_GUEST      /* host path becomes a symlink to the guest directory */
} LinkJournal;
88213476
LP
78
79static char *arg_directory = NULL;
687d0825 80static char *arg_user = NULL;
9444b1f2 81static sd_id128_t arg_uuid = {};
7027ff61 82static char *arg_machine = NULL;
82adf6af
LP
83static char *arg_selinux_context = NULL;
84static char *arg_selinux_apifs_context = NULL;
9444b1f2 85static const char *arg_slice = NULL;
ff01d048 86static bool arg_private_network = false;
bc2f673e 87static bool arg_read_only = false;
0f0dbc46 88static bool arg_boot = false;
57fb9fb5 89static LinkJournal arg_link_journal = LINK_AUTO;
5076f0cc
LP
90static uint64_t arg_retain =
91 (1ULL << CAP_CHOWN) |
92 (1ULL << CAP_DAC_OVERRIDE) |
93 (1ULL << CAP_DAC_READ_SEARCH) |
94 (1ULL << CAP_FOWNER) |
95 (1ULL << CAP_FSETID) |
96 (1ULL << CAP_IPC_OWNER) |
97 (1ULL << CAP_KILL) |
98 (1ULL << CAP_LEASE) |
99 (1ULL << CAP_LINUX_IMMUTABLE) |
100 (1ULL << CAP_NET_BIND_SERVICE) |
101 (1ULL << CAP_NET_BROADCAST) |
102 (1ULL << CAP_NET_RAW) |
103 (1ULL << CAP_SETGID) |
104 (1ULL << CAP_SETFCAP) |
105 (1ULL << CAP_SETPCAP) |
106 (1ULL << CAP_SETUID) |
107 (1ULL << CAP_SYS_ADMIN) |
108 (1ULL << CAP_SYS_CHROOT) |
109 (1ULL << CAP_SYS_NICE) |
110 (1ULL << CAP_SYS_PTRACE) |
111 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 112 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
113 (1ULL << CAP_SYS_BOOT) |
114 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
115 (1ULL << CAP_AUDIT_CONTROL) |
116 (1ULL << CAP_MKNOD);
17fe0523
LP
117static char **arg_bind = NULL;
118static char **arg_bind_ro = NULL;
f4889f65 119static char **arg_setenv = NULL;
284c0b91 120static bool arg_quiet = false;
88213476
LP
121
122static int help(void) {
123
124 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
125 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
126 " -h --help Show this help\n"
127 " --version Print version string\n"
128 " -D --directory=NAME Root directory for the container\n"
129 " -b --boot Boot up full system (i.e. invoke init)\n"
130 " -u --user=USER Run the command under specified user or uid\n"
131 " --uuid=UUID Set a specific machine UUID for the container\n"
132 " -M --machine=NAME Set the machine name for the container\n"
133 " -S --slice=SLICE Place the container in the specified slice\n"
82adf6af
LP
134 " -Z --selinux-context=SECLABEL\n"
135 " Set the SELinux security context to be used by\n"
136 " processes in the container\n"
137 " -L --selinux-apifs-context=SECLABEL\n"
138 " Set the SELinux security context to be used by\n"
139 " API/tmpfs file systems in the container\n"
a8828ed9
DW
140 " --private-network Disable network in container\n"
141 " --read-only Mount the root directory read-only\n"
142 " --capability=CAP In addition to the default, retain specified\n"
143 " capability\n"
144 " --drop-capability=CAP Drop the specified capability from the default set\n"
145 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
146 " -j Equivalent to --link-journal=host\n"
147 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
148 " the container\n"
149 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
284c0b91
LP
150 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
151 " -q --quiet Do not show status information\n",
88213476
LP
152 program_invocation_short_name);
153
154 return 0;
155}
156
157static int parse_argv(int argc, char *argv[]) {
158
a41fe3a2 159 enum {
acbeb427
ZJS
160 ARG_VERSION = 0x100,
161 ARG_PRIVATE_NETWORK,
bc2f673e 162 ARG_UUID,
5076f0cc 163 ARG_READ_ONLY,
57fb9fb5 164 ARG_CAPABILITY,
420c7379 165 ARG_DROP_CAPABILITY,
17fe0523
LP
166 ARG_LINK_JOURNAL,
167 ARG_BIND,
f4889f65
LP
168 ARG_BIND_RO,
169 ARG_SETENV,
a41fe3a2
LP
170 };
171
88213476 172 static const struct option options[] = {
82adf6af
LP
173 { "help", no_argument, NULL, 'h' },
174 { "version", no_argument, NULL, ARG_VERSION },
175 { "directory", required_argument, NULL, 'D' },
176 { "user", required_argument, NULL, 'u' },
177 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
178 { "boot", no_argument, NULL, 'b' },
179 { "uuid", required_argument, NULL, ARG_UUID },
180 { "read-only", no_argument, NULL, ARG_READ_ONLY },
181 { "capability", required_argument, NULL, ARG_CAPABILITY },
182 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
183 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
184 { "bind", required_argument, NULL, ARG_BIND },
185 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
186 { "machine", required_argument, NULL, 'M' },
187 { "slice", required_argument, NULL, 'S' },
188 { "setenv", required_argument, NULL, ARG_SETENV },
189 { "selinux-context", required_argument, NULL, 'Z' },
190 { "selinux-apifs-context", required_argument, NULL, 'L' },
191 { "quiet", no_argument, NULL, 'q' },
eb9da376 192 {}
88213476
LP
193 };
194
9444b1f2 195 int c, r;
88213476
LP
196
197 assert(argc >= 0);
198 assert(argv);
199
284c0b91 200 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
88213476
LP
201
202 switch (c) {
203
204 case 'h':
eb9da376 205 return help();
88213476 206
acbeb427
ZJS
207 case ARG_VERSION:
208 puts(PACKAGE_STRING);
209 puts(SYSTEMD_FEATURES);
210 return 0;
211
88213476
LP
212 case 'D':
213 free(arg_directory);
3a74cea5
LP
214 arg_directory = canonicalize_file_name(optarg);
215 if (!arg_directory) {
898d5c91 216 log_error("Invalid root directory: %m");
88213476
LP
217 return -ENOMEM;
218 }
219
220 break;
221
687d0825
MV
222 case 'u':
223 free(arg_user);
7027ff61
LP
224 arg_user = strdup(optarg);
225 if (!arg_user)
226 return log_oom();
687d0825
MV
227
228 break;
229
ff01d048
LP
230 case ARG_PRIVATE_NETWORK:
231 arg_private_network = true;
a41fe3a2
LP
232 break;
233
0f0dbc46
LP
234 case 'b':
235 arg_boot = true;
236 break;
237
144f0fc0 238 case ARG_UUID:
9444b1f2
LP
239 r = sd_id128_from_string(optarg, &arg_uuid);
240 if (r < 0) {
aa96c6cb 241 log_error("Invalid UUID: %s", optarg);
9444b1f2 242 return r;
aa96c6cb 243 }
9444b1f2 244 break;
aa96c6cb 245
9444b1f2
LP
246 case 'S':
247 arg_slice = strdup(optarg);
b3451bed
DH
248 if (!arg_slice)
249 return log_oom();
250
144f0fc0
LP
251 break;
252
7027ff61
LP
253 case 'M':
254 if (!hostname_is_valid(optarg)) {
255 log_error("Invalid machine name: %s", optarg);
256 return -EINVAL;
257 }
258
259 free(arg_machine);
260 arg_machine = strdup(optarg);
261 if (!arg_machine)
262 return log_oom();
263
264 break;
265
82adf6af
LP
266 case 'Z':
267 arg_selinux_context = optarg;
a8828ed9
DW
268 break;
269
82adf6af
LP
270 case 'L':
271 arg_selinux_apifs_context = optarg;
a8828ed9
DW
272 break;
273
bc2f673e
LP
274 case ARG_READ_ONLY:
275 arg_read_only = true;
276 break;
277
420c7379
LP
278 case ARG_CAPABILITY:
279 case ARG_DROP_CAPABILITY: {
5076f0cc
LP
280 char *state, *word;
281 size_t length;
282
283 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
284 cap_value_t cap;
285 char *t;
286
287 t = strndup(word, length);
0d0f0c50
SL
288 if (!t)
289 return log_oom();
5076f0cc
LP
290
291 if (cap_from_name(t, &cap) < 0) {
292 log_error("Failed to parse capability %s.", t);
293 free(t);
294 return -EINVAL;
295 }
296
297 free(t);
420c7379
LP
298
299 if (c == ARG_CAPABILITY)
300 arg_retain |= 1ULL << (uint64_t) cap;
301 else
302 arg_retain &= ~(1ULL << (uint64_t) cap);
5076f0cc
LP
303 }
304
305 break;
306 }
307
57fb9fb5
LP
308 case 'j':
309 arg_link_journal = LINK_GUEST;
310 break;
311
312 case ARG_LINK_JOURNAL:
313 if (streq(optarg, "auto"))
314 arg_link_journal = LINK_AUTO;
315 else if (streq(optarg, "no"))
316 arg_link_journal = LINK_NO;
317 else if (streq(optarg, "guest"))
318 arg_link_journal = LINK_GUEST;
319 else if (streq(optarg, "host"))
320 arg_link_journal = LINK_HOST;
321 else {
322 log_error("Failed to parse link journal mode %s", optarg);
323 return -EINVAL;
324 }
325
326 break;
327
17fe0523
LP
328 case ARG_BIND:
329 case ARG_BIND_RO: {
330 _cleanup_free_ char *a = NULL, *b = NULL;
331 char *e;
332 char ***x;
17fe0523
LP
333
334 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
335
336 e = strchr(optarg, ':');
337 if (e) {
338 a = strndup(optarg, e - optarg);
339 b = strdup(e + 1);
340 } else {
341 a = strdup(optarg);
342 b = strdup(optarg);
343 }
344
345 if (!a || !b)
346 return log_oom();
347
348 if (!path_is_absolute(a) || !path_is_absolute(b)) {
349 log_error("Invalid bind mount specification: %s", optarg);
350 return -EINVAL;
351 }
352
353 r = strv_extend(x, a);
354 if (r < 0)
b3451bed 355 return log_oom();
17fe0523
LP
356
357 r = strv_extend(x, b);
358 if (r < 0)
b3451bed 359 return log_oom();
17fe0523
LP
360
361 break;
362 }
363
f4889f65
LP
364 case ARG_SETENV: {
365 char **n;
366
367 if (!env_assignment_is_valid(optarg)) {
368 log_error("Environment variable assignment '%s' is not valid.", optarg);
369 return -EINVAL;
370 }
371
372 n = strv_env_set(arg_setenv, optarg);
373 if (!n)
374 return log_oom();
375
376 strv_free(arg_setenv);
377 arg_setenv = n;
378 break;
379 }
380
284c0b91
LP
381 case 'q':
382 arg_quiet = true;
383 break;
384
88213476
LP
385 case '?':
386 return -EINVAL;
387
388 default:
eb9da376 389 assert_not_reached("Unhandled option");
88213476
LP
390 }
391 }
392
393 return 1;
394}
395
396static int mount_all(const char *dest) {
397
398 typedef struct MountPoint {
399 const char *what;
400 const char *where;
401 const char *type;
402 const char *options;
403 unsigned long flags;
3bd66c05 404 bool fatal;
88213476
LP
405 } MountPoint;
406
407 static const MountPoint mount_table[] = {
4b7a6af4 408 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
b4c59701
LP
409 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
410 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
e65aec12 411 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
635f7d8c 412 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 413 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
ede89845 414 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
635f7d8c 415 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 416#ifdef HAVE_SELINUX
b4c59701
LP
417 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
418 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 419#endif
88213476
LP
420 };
421
422 unsigned k;
423 int r = 0;
424
425 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 426 _cleanup_free_ char *where = NULL;
d002827b 427#ifdef HAVE_SELINUX
a8828ed9 428 _cleanup_free_ char *options = NULL;
d002827b
LP
429#endif
430 const char *o;
88213476
LP
431 int t;
432
17fe0523
LP
433 where = strjoin(dest, "/", mount_table[k].where, NULL);
434 if (!where)
435 return log_oom();
88213476 436
e65aec12 437 t = path_is_mount_point(where, true);
68fb0892 438 if (t < 0) {
88213476 439 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
88213476
LP
440
441 if (r == 0)
442 r = t;
443
444 continue;
445 }
446
9c1c7f71
LP
447 /* Skip this entry if it is not a remount. */
448 if (mount_table[k].what && t > 0)
014a9c77
LP
449 continue;
450
17fe0523 451 mkdir_p(where, 0755);
88213476 452
a8828ed9 453#ifdef HAVE_SELINUX
82adf6af
LP
454 if (arg_selinux_apifs_context &&
455 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
456 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
457 if (!options)
458 return log_oom();
459
460 o = options;
461 } else
a8828ed9 462#endif
d002827b 463 o = mount_table[k].options;
a8828ed9 464
a8828ed9 465
88213476
LP
466 if (mount(mount_table[k].what,
467 where,
468 mount_table[k].type,
469 mount_table[k].flags,
d002827b 470 o) < 0 &&
3bd66c05 471 mount_table[k].fatal) {
88213476
LP
472
473 log_error("mount(%s) failed: %m", where);
474
475 if (r == 0)
476 r = -errno;
477 }
88213476
LP
478 }
479
e58a1277
LP
480 return r;
481}
f8440af5 482
17fe0523
LP
483static int mount_binds(const char *dest, char **l, unsigned long flags) {
484 char **x, **y;
485
486 STRV_FOREACH_PAIR(x, y, l) {
2ed4e5e0 487 char *where;
d2421337 488 struct stat source_st, dest_st;
2ed4e5e0 489 int r;
d2421337
DR
490
491 if (stat(*x, &source_st) < 0) {
492 log_error("failed to stat %s: %m", *x);
493 return -errno;
494 }
17fe0523 495
2ed4e5e0
SL
496 where = strappenda(dest, *y);
497 r = stat(where, &dest_st);
498 if (r == 0) {
d2421337 499 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
cecf24e7 500 log_error("The file types of %s and %s do not match. Refusing bind mount",
d2421337
DR
501 *x, where);
502 return -EINVAL;
503 }
2ed4e5e0
SL
504 } else if (errno == ENOENT) {
505 r = mkdir_parents_label(where, 0755);
506 if (r < 0) {
507 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
508 return r;
d2421337 509 }
2ed4e5e0
SL
510 } else {
511 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
512 return -errno;
513 }
514 /* Create the mount point, but be conservative -- refuse to create block
515 * and char devices. */
516 if (S_ISDIR(source_st.st_mode))
517 mkdir_label(where, 0755);
518 else if (S_ISFIFO(source_st.st_mode))
519 mkfifo(where, 0644);
520 else if (S_ISSOCK(source_st.st_mode))
521 mknod(where, 0644 | S_IFSOCK, 0);
522 else if (S_ISREG(source_st.st_mode))
523 touch(where);
524 else {
525 log_error("Refusing to create mountpoint for file: %s", *x);
526 return -ENOTSUP;
d2421337 527 }
17fe0523
LP
528
529 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
530 log_error("mount(%s) failed: %m", where);
531 return -errno;
532 }
533
534 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
535 log_error("mount(%s) failed: %m", where);
536 return -errno;
537 }
538 }
539
540 return 0;
541}
542
e58a1277 543static int setup_timezone(const char *dest) {
d4036145
LP
544 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
545 char *z, *y;
546 int r;
f8440af5 547
e58a1277
LP
548 assert(dest);
549
550 /* Fix the timezone, if possible */
d4036145
LP
551 r = readlink_malloc("/etc/localtime", &p);
552 if (r < 0) {
553 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
554 return 0;
555 }
556
557 z = path_startswith(p, "../usr/share/zoneinfo/");
558 if (!z)
559 z = path_startswith(p, "/usr/share/zoneinfo/");
560 if (!z) {
561 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
562 return 0;
563 }
564
04bc4a3f
LP
565 where = strappend(dest, "/etc/localtime");
566 if (!where)
0d0f0c50 567 return log_oom();
715ac17a 568
d4036145
LP
569 r = readlink_malloc(where, &q);
570 if (r >= 0) {
571 y = path_startswith(q, "../usr/share/zoneinfo/");
572 if (!y)
573 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 574
4d1c38b8 575
d4036145
LP
576 /* Already pointing to the right place? Then do nothing .. */
577 if (y && streq(y, z))
578 return 0;
579 }
580
581 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
582 if (!check)
0d0f0c50 583 return log_oom();
4d1c38b8 584
d4036145
LP
585 if (access(check, F_OK) < 0) {
586 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
587 return 0;
588 }
68fb0892 589
d4036145
LP
590 what = strappend("../usr/share/zoneinfo/", z);
591 if (!what)
592 return log_oom();
593
594 unlink(where);
595 if (symlink(what, where) < 0) {
596 log_error("Failed to correct timezone of container: %m");
597 return 0;
598 }
e58a1277
LP
599
600 return 0;
88213476
LP
601}
602
2547bb41 603static int setup_resolv_conf(const char *dest) {
f333fbb1 604 char _cleanup_free_ *where = NULL;
2547bb41
LP
605
606 assert(dest);
607
608 if (arg_private_network)
609 return 0;
610
611 /* Fix resolv.conf, if possible */
04bc4a3f
LP
612 where = strappend(dest, "/etc/resolv.conf");
613 if (!where)
0d0f0c50 614 return log_oom();
2547bb41 615
77e63faf
LP
616 /* We don't really care for the results of this really. If it
617 * fails, it fails, but meh... */
51045322 618 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
2547bb41
LP
619
620 return 0;
621}
622
04bc4a3f 623static int setup_boot_id(const char *dest) {
7fd1b19b 624 _cleanup_free_ char *from = NULL, *to = NULL;
04bc4a3f
LP
625 sd_id128_t rnd;
626 char as_uuid[37];
627 int r;
628
629 assert(dest);
630
631 /* Generate a new randomized boot ID, so that each boot-up of
632 * the container gets a new one */
633
634 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 635 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
636 if (!from || !to)
637 return log_oom();
04bc4a3f
LP
638
639 r = sd_id128_randomize(&rnd);
640 if (r < 0) {
641 log_error("Failed to generate random boot id: %s", strerror(-r));
ed8b7a3e 642 return r;
04bc4a3f
LP
643 }
644
645 snprintf(as_uuid, sizeof(as_uuid),
646 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
647 SD_ID128_FORMAT_VAL(rnd));
648 char_array_0(as_uuid);
649
574d5f2d 650 r = write_string_file(from, as_uuid);
04bc4a3f
LP
651 if (r < 0) {
652 log_error("Failed to write boot id: %s", strerror(-r));
ed8b7a3e 653 return r;
04bc4a3f
LP
654 }
655
656 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
657 log_error("Failed to bind mount boot id: %m");
658 r = -errno;
10d18763
ZJS
659 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
660 log_warning("Failed to make boot id read-only: %m");
04bc4a3f
LP
661
662 unlink(from);
04bc4a3f
LP
663 return r;
664}
665
e58a1277 666static int copy_devnodes(const char *dest) {
88213476
LP
667
668 static const char devnodes[] =
669 "null\0"
670 "zero\0"
671 "full\0"
672 "random\0"
673 "urandom\0"
f2d88580 674 "tty\0";
88213476
LP
675
676 const char *d;
e58a1277 677 int r = 0;
7fd1b19b 678 _cleanup_umask_ mode_t u;
a258bf26
LP
679
680 assert(dest);
124640f1
LP
681
682 u = umask(0000);
88213476
LP
683
684 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 685 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 686 struct stat st;
88213476 687
7f112f50
LP
688 from = strappend("/dev/", d);
689 to = strjoin(dest, "/dev/", d, NULL);
690 if (!from || !to)
691 return log_oom();
88213476
LP
692
693 if (stat(from, &st) < 0) {
694
695 if (errno != ENOENT) {
696 log_error("Failed to stat %s: %m", from);
7f112f50 697 return -errno;
88213476
LP
698 }
699
a258bf26 700 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 701
ed8b7a3e 702 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 703 return -EIO;
a258bf26
LP
704
705 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
706
707 log_error("mknod(%s) failed: %m", dest);
7f112f50 708 return -errno;
88213476 709 }
88213476
LP
710 }
711
e58a1277
LP
712 return r;
713}
88213476 714
f2d88580
LP
715static int setup_ptmx(const char *dest) {
716 _cleanup_free_ char *p = NULL;
717
718 p = strappend(dest, "/dev/ptmx");
719 if (!p)
720 return log_oom();
721
722 if (symlink("pts/ptmx", p) < 0) {
723 log_error("Failed to create /dev/ptmx symlink: %m");
724 return -errno;
725 }
726
727 return 0;
728}
729
e58a1277
LP
730static int setup_dev_console(const char *dest, const char *console) {
731 struct stat st;
7fd1b19b 732 _cleanup_free_ char *to = NULL;
e58a1277 733 int r;
7fd1b19b 734 _cleanup_umask_ mode_t u;
e58a1277
LP
735
736 assert(dest);
737 assert(console);
738
739 u = umask(0000);
740
741 if (stat(console, &st) < 0) {
742 log_error("Failed to stat %s: %m", console);
25ea79fe 743 return -errno;
88213476 744
a258bf26 745 } else if (!S_ISCHR(st.st_mode)) {
25ea79fe
ZJS
746 log_error("/dev/console is not a char device");
747 return -EIO;
e58a1277 748 }
88213476 749
e58a1277
LP
750 r = chmod_and_chown(console, 0600, 0, 0);
751 if (r < 0) {
752 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
25ea79fe 753 return r;
a258bf26 754 }
88213476 755
25ea79fe
ZJS
756 if (asprintf(&to, "%s/dev/console", dest) < 0)
757 return log_oom();
88213476 758
a258bf26
LP
759 /* We need to bind mount the right tty to /dev/console since
760 * ptys can only exist on pts file systems. To have something
761 * to bind mount things on we create a device node first, that
762 * has the right major/minor (note that the major minor
763 * doesn't actually matter here, since we mount it over
764 * anyway). */
765
e58a1277
LP
766 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
767 log_error("mknod() for /dev/console failed: %m");
25ea79fe 768 return -errno;
e58a1277 769 }
a258bf26
LP
770
771 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
e58a1277 772 log_error("Bind mount for /dev/console failed: %m");
25ea79fe 773 return -errno;
a258bf26
LP
774 }
775
25ea79fe 776 return 0;
e58a1277
LP
777}
778
779static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 780 _cleanup_free_ char *from = NULL, *to = NULL;
e58a1277 781 int r, fd, k;
7fd1b19b 782 _cleanup_umask_ mode_t u;
e58a1277
LP
783 union {
784 struct cmsghdr cmsghdr;
785 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
786 } control = {};
787 struct msghdr mh = {
788 .msg_control = &control,
789 .msg_controllen = sizeof(control),
790 };
e58a1277
LP
791 struct cmsghdr *cmsg;
792
793 assert(dest);
794 assert(kmsg_socket >= 0);
a258bf26 795
e58a1277 796 u = umask(0000);
a258bf26 797
f1e5dfe2
LP
798 /* We create the kmsg FIFO as /dev/kmsg, but immediately
799 * delete it after bind mounting it to /proc/kmsg. While FIFOs
800 * on the reading side behave very similar to /proc/kmsg,
801 * their writing side behaves differently from /dev/kmsg in
802 * that writing blocks when nothing is reading. In order to
803 * avoid any problems with containers deadlocking due to this
804 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
805 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
806 asprintf(&to, "%s/proc/kmsg", dest) < 0)
807 return log_oom();
e58a1277
LP
808
809 if (mkfifo(from, 0600) < 0) {
810 log_error("mkfifo() for /dev/kmsg failed: %m");
25ea79fe 811 return -errno;
e58a1277
LP
812 }
813
814 r = chmod_and_chown(from, 0600, 0, 0);
815 if (r < 0) {
816 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
25ea79fe 817 return r;
e58a1277
LP
818 }
819
820 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
821 log_error("Bind mount for /proc/kmsg failed: %m");
25ea79fe 822 return -errno;
e58a1277
LP
823 }
824
825 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
826 if (fd < 0) {
827 log_error("Failed to open fifo: %m");
25ea79fe 828 return -errno;
e58a1277
LP
829 }
830
e58a1277
LP
831 cmsg = CMSG_FIRSTHDR(&mh);
832 cmsg->cmsg_level = SOL_SOCKET;
833 cmsg->cmsg_type = SCM_RIGHTS;
834 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
835 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
836
837 mh.msg_controllen = cmsg->cmsg_len;
838
839 /* Store away the fd in the socket, so that it stays open as
840 * long as we run the child */
841 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
842 close_nointr_nofail(fd);
843
844 if (k < 0) {
845 log_error("Failed to send FIFO fd: %m");
25ea79fe 846 return -errno;
a258bf26
LP
847 }
848
f1e5dfe2
LP
849 /* And now make the FIFO unavailable as /dev/kmsg... */
850 unlink(from);
25ea79fe 851 return 0;
88213476
LP
852}
853
3a74cea5 854static int setup_hostname(void) {
3a74cea5 855
7027ff61
LP
856 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
857 return -errno;
3a74cea5 858
7027ff61 859 return 0;
3a74cea5
LP
860}
861
57fb9fb5 862static int setup_journal(const char *directory) {
4d680aee 863 sd_id128_t machine_id, this_id;
7fd1b19b 864 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 865 char *id;
57fb9fb5
LP
866 int r;
867
57fb9fb5 868 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
869 if (!p)
870 return log_oom();
57fb9fb5
LP
871
872 r = read_one_line_file(p, &b);
27407a01
ZJS
873 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
874 return 0;
875 else if (r < 0) {
876 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
57fb9fb5
LP
877 return r;
878 }
879
27407a01
ZJS
880 id = strstrip(b);
881 if (isempty(id) && arg_link_journal == LINK_AUTO)
882 return 0;
57fb9fb5 883
27407a01
ZJS
884 /* Verify validity */
885 r = sd_id128_from_string(id, &machine_id);
57fb9fb5 886 if (r < 0) {
27407a01
ZJS
887 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
888 return r;
57fb9fb5
LP
889 }
890
4d680aee
ZJS
891 r = sd_id128_get_machine(&this_id);
892 if (r < 0) {
893 log_error("Failed to retrieve machine ID: %s", strerror(-r));
894 return r;
895 }
896
897 if (sd_id128_equal(machine_id, this_id)) {
898 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
899 "Host and machine ids are equal (%s): refusing to link journals", id);
900 if (arg_link_journal == LINK_AUTO)
901 return 0;
902 return
903 -EEXIST;
904 }
905
906 if (arg_link_journal == LINK_NO)
907 return 0;
908
57fb9fb5 909 free(p);
27407a01
ZJS
910 p = strappend("/var/log/journal/", id);
911 q = strjoin(directory, "/var/log/journal/", id, NULL);
912 if (!p || !q)
913 return log_oom();
914
915 if (path_is_mount_point(p, false) > 0) {
916 if (arg_link_journal != LINK_AUTO) {
917 log_error("%s: already a mount point, refusing to use for journal", p);
918 return -EEXIST;
919 }
920
921 return 0;
57fb9fb5
LP
922 }
923
27407a01 924 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 925 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
926 log_error("%s: already a mount point, refusing to use for journal", q);
927 return -EEXIST;
57fb9fb5
LP
928 }
929
27407a01 930 return 0;
57fb9fb5
LP
931 }
932
933 r = readlink_and_make_absolute(p, &d);
934 if (r >= 0) {
935 if ((arg_link_journal == LINK_GUEST ||
936 arg_link_journal == LINK_AUTO) &&
937 path_equal(d, q)) {
938
27407a01
ZJS
939 r = mkdir_p(q, 0755);
940 if (r < 0)
941 log_warning("failed to create directory %s: %m", q);
942 return 0;
57fb9fb5
LP
943 }
944
945 if (unlink(p) < 0) {
946 log_error("Failed to remove symlink %s: %m", p);
27407a01 947 return -errno;
57fb9fb5
LP
948 }
949 } else if (r == -EINVAL) {
950
951 if (arg_link_journal == LINK_GUEST &&
952 rmdir(p) < 0) {
953
27407a01
ZJS
954 if (errno == ENOTDIR) {
955 log_error("%s already exists and is neither a symlink nor a directory", p);
956 return r;
957 } else {
57fb9fb5 958 log_error("Failed to remove %s: %m", p);
27407a01 959 return -errno;
57fb9fb5 960 }
57fb9fb5
LP
961 }
962 } else if (r != -ENOENT) {
963 log_error("readlink(%s) failed: %m", p);
27407a01 964 return r;
57fb9fb5
LP
965 }
966
967 if (arg_link_journal == LINK_GUEST) {
968
969 if (symlink(q, p) < 0) {
970 log_error("Failed to symlink %s to %s: %m", q, p);
27407a01 971 return -errno;
57fb9fb5
LP
972 }
973
27407a01
ZJS
974 r = mkdir_p(q, 0755);
975 if (r < 0)
976 log_warning("failed to create directory %s: %m", q);
977 return 0;
57fb9fb5
LP
978 }
979
980 if (arg_link_journal == LINK_HOST) {
981 r = mkdir_p(p, 0755);
982 if (r < 0) {
983 log_error("Failed to create %s: %m", p);
27407a01 984 return r;
57fb9fb5
LP
985 }
986
27407a01
ZJS
987 } else if (access(p, F_OK) < 0)
988 return 0;
57fb9fb5
LP
989
990 if (dir_is_empty(q) == 0) {
991 log_error("%s not empty.", q);
27407a01 992 return -ENOTEMPTY;
57fb9fb5
LP
993 }
994
995 r = mkdir_p(q, 0755);
996 if (r < 0) {
997 log_error("Failed to create %s: %m", q);
27407a01 998 return r;
57fb9fb5
LP
999 }
1000
1001 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1002 log_error("Failed to bind mount journal from host into guest: %m");
27407a01 1003 return -errno;
57fb9fb5
LP
1004 }
1005
27407a01 1006 return 0;
57fb9fb5
LP
1007}
1008
9bd37b40
LP
1009static int setup_kdbus(const char *dest, const char *path) {
1010 const char *p;
1011
1012 if (!path)
1013 return 0;
1014
1015 p = strappenda(dest, "/dev/kdbus");
1016 if (mkdir(p, 0755) < 0) {
1017 log_error("Failed to create kdbus path: %m");
1018 return -errno;
1019 }
1020
1021 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
486e99a3 1022 log_error("Failed to mount kdbus domain path: %m");
9bd37b40
LP
1023 return -errno;
1024 }
1025
1026 return 0;
1027}
1028
88213476 1029static int drop_capabilities(void) {
5076f0cc 1030 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1031}
1032
354bfd2b 1033static int register_machine(pid_t pid) {
9444b1f2
LP
1034 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1035 _cleanup_bus_unref_ sd_bus *bus = NULL;
1036 int r;
1037
1c03020c 1038 r = sd_bus_default_system(&bus);
9444b1f2
LP
1039 if (r < 0) {
1040 log_error("Failed to open system bus: %s", strerror(-r));
1041 return r;
1042 }
1043
1044 r = sd_bus_call_method(
1045 bus,
1ee306e1
LP
1046 "org.freedesktop.machine1",
1047 "/org/freedesktop/machine1",
1048 "org.freedesktop.machine1.Manager",
9444b1f2
LP
1049 "CreateMachine",
1050 &error,
1051 NULL,
6a4e0b13 1052 "sayssusa(sv)",
9444b1f2 1053 arg_machine,
40ca29a1 1054 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
9444b1f2
LP
1055 "nspawn",
1056 "container",
354bfd2b 1057 (uint32_t) pid,
6a4e0b13 1058 strempty(arg_directory),
88212f7b 1059 !isempty(arg_slice), "Slice", "s", arg_slice);
9444b1f2 1060 if (r < 0) {
1f0cd86b
LP
1061 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1062 return r;
1063 }
1064
1065 return 0;
1066}
1067
1068static int terminate_machine(pid_t pid) {
1069 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1070 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1071 _cleanup_bus_unref_ sd_bus *bus = NULL;
1072 const char *path;
1073 int r;
1074
76b54375 1075 r = sd_bus_default_system(&bus);
1f0cd86b
LP
1076 if (r < 0) {
1077 log_error("Failed to open system bus: %s", strerror(-r));
1078 return r;
1079 }
1080
1081 r = sd_bus_call_method(
1082 bus,
1083 "org.freedesktop.machine1",
1084 "/org/freedesktop/machine1",
1085 "org.freedesktop.machine1.Manager",
1086 "GetMachineByPID",
1087 &error,
1088 &reply,
1089 "u",
1090 (uint32_t) pid);
1091 if (r < 0) {
1092 /* Note that the machine might already have been
1093 * cleaned up automatically, hence don't consider it a
1094 * failure if we cannot get the machine object. */
1095 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1096 return 0;
1097 }
1098
1099 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
1100 if (r < 0)
1101 return bus_log_parse_error(r);
9444b1f2 1102
1f0cd86b
LP
1103 r = sd_bus_call_method(
1104 bus,
1105 "org.freedesktop.machine1",
1106 path,
1107 "org.freedesktop.machine1.Machine",
1108 "Terminate",
1109 &error,
1110 NULL,
1111 NULL);
1112 if (r < 0) {
1113 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1114 return 0;
1115 }
1116
9444b1f2
LP
1117 return 0;
1118}
1119
77b6e194
LP
/* Probes for kernel audit support by trying to open a NETLINK_AUDIT
 * raw socket. Returns true if the socket could be created (it is
 * closed again right away), false otherwise. */
static bool audit_enabled(void) {
        int audit_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

        if (audit_fd < 0)
                return false;

        close_nointr_nofail(audit_fd);
        return true;
}
1130
88213476
LP
1131int main(int argc, char *argv[]) {
1132 pid_t pid = 0;
04d391da 1133 int r = EXIT_FAILURE, k;
354bfd2b 1134 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
7027ff61 1135 int n_fd_passed;
a258bf26 1136 const char *console = NULL;
a258bf26 1137 sigset_t mask;
04d39279 1138 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
51d122af 1139 _cleanup_fdset_free_ FDSet *fds = NULL;
486e99a3 1140 _cleanup_free_ char *kdbus_domain = NULL;
9e554864 1141 const char *ns;
88213476
LP
1142
1143 log_parse_environment();
1144 log_open();
1145
05947bef
LP
1146 k = parse_argv(argc, argv);
1147 if (k < 0)
88213476 1148 goto finish;
05947bef
LP
1149 else if (k == 0) {
1150 r = EXIT_SUCCESS;
1151 goto finish;
1152 }
88213476
LP
1153
1154 if (arg_directory) {
1155 char *p;
1156
1157 p = path_make_absolute_cwd(arg_directory);
1158 free(arg_directory);
1159 arg_directory = p;
1160 } else
1161 arg_directory = get_current_dir_name();
1162
1163 if (!arg_directory) {
a383724e 1164 log_error("Failed to determine path, please use -D.");
88213476
LP
1165 goto finish;
1166 }
1167
1168 path_kill_slashes(arg_directory);
1169
7027ff61 1170 if (!arg_machine) {
2b6bf07d 1171 arg_machine = strdup(basename(arg_directory));
7027ff61
LP
1172 if (!arg_machine) {
1173 log_oom();
1174 goto finish;
1175 }
1176
e724b063 1177 hostname_cleanup(arg_machine, false);
7027ff61
LP
1178 if (isempty(arg_machine)) {
1179 log_error("Failed to determine machine name automatically, please use -M.");
1180 goto finish;
1181 }
1182 }
1183
88213476
LP
1184 if (geteuid() != 0) {
1185 log_error("Need to be root.");
1186 goto finish;
1187 }
1188
04d391da
LP
1189 if (sd_booted() <= 0) {
1190 log_error("Not running on a systemd system.");
1191 goto finish;
1192 }
1193
c2384970 1194 if (arg_boot && audit_enabled()) {
77b6e194
LP
1195 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1196 "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1197 "line before using systemd-nspawn. Sleeping for 5s...\n");
1198 sleep(5);
1199 }
1200
88213476 1201 if (path_equal(arg_directory, "/")) {
6df6b939 1202 log_error("Spawning container on root directory not supported.");
88213476
LP
1203 goto finish;
1204 }
1205
fcf90586 1206 if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
f8964235 1207 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
88213476
LP
1208 goto finish;
1209 }
1210
842f3b0f
LP
1211 log_close();
1212 n_fd_passed = sd_listen_fds(false);
1213 if (n_fd_passed > 0) {
1214 k = fdset_new_listen_fds(&fds, false);
1215 if (k < 0) {
1216 log_error("Failed to collect file descriptors: %s", strerror(-k));
1217 goto finish;
1218 }
1219 }
1220 fdset_close_others(fds);
1221 log_open();
1222
db7feb7e
LP
1223 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1224 if (master < 0) {
a258bf26
LP
1225 log_error("Failed to acquire pseudo tty: %m");
1226 goto finish;
1227 }
1228
db7feb7e
LP
1229 console = ptsname(master);
1230 if (!console) {
a258bf26
LP
1231 log_error("Failed to determine tty name: %m");
1232 goto finish;
1233 }
1234
284c0b91
LP
1235 if (!arg_quiet)
1236 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
a258bf26
LP
1237
1238 if (unlockpt(master) < 0) {
1239 log_error("Failed to unlock tty: %m");
1240 goto finish;
1241 }
1242
9e554864 1243 ns = strappenda("machine-", arg_machine);
486e99a3 1244 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
9bd37b40 1245 if (r < 0)
486e99a3 1246 log_debug("Failed to create kdbus domain: %s", strerror(-r));
9bd37b40 1247 else
486e99a3 1248 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
9bd37b40 1249
e58a1277 1250 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
354bfd2b
LP
1251 log_error("Failed to create kmsg socket pair: %m");
1252 goto finish;
1253 }
1254
05947bef
LP
1255 sd_notify(0, "READY=1");
1256
a258bf26
LP
1257 assert_se(sigemptyset(&mask) == 0);
1258 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1259 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1260
d87be9b0
LP
1261 for (;;) {
1262 siginfo_t status;
a383724e 1263
40ddbdf8
LP
1264 sync_fd = eventfd(0, EFD_CLOEXEC);
1265 if (sync_fd < 0) {
1266 log_error("Failed to create event fd: %m");
1267 goto finish;
1268 }
1269
d87be9b0
LP
1270 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1271 if (pid < 0) {
1272 if (errno == EINVAL)
1273 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1274 else
1275 log_error("clone() failed: %m");
a258bf26 1276
d87be9b0
LP
1277 goto finish;
1278 }
a258bf26 1279
d87be9b0
LP
1280 if (pid == 0) {
1281 /* child */
d87be9b0
LP
1282 const char *home = NULL;
1283 uid_t uid = (uid_t) -1;
1284 gid_t gid = (gid_t) -1;
5674767e 1285 unsigned n_env = 2;
d87be9b0 1286 const char *envp[] = {
e10a55fd 1287 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
1288 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1289 NULL, /* TERM */
1290 NULL, /* HOME */
1291 NULL, /* USER */
1292 NULL, /* LOGNAME */
1293 NULL, /* container_uuid */
842f3b0f
LP
1294 NULL, /* LISTEN_FDS */
1295 NULL, /* LISTEN_PID */
d87be9b0
LP
1296 NULL
1297 };
f4889f65 1298 char **env_use;
354bfd2b 1299 eventfd_t x;
a258bf26 1300
5674767e
ZJS
1301 envp[n_env] = strv_find_prefix(environ, "TERM=");
1302 if (envp[n_env])
1303 n_env ++;
a258bf26 1304
d87be9b0 1305 close_nointr_nofail(master);
842f3b0f 1306 master = -1;
a258bf26 1307
d87be9b0
LP
1308 close_nointr(STDIN_FILENO);
1309 close_nointr(STDOUT_FILENO);
1310 close_nointr(STDERR_FILENO);
db7feb7e 1311
842f3b0f
LP
1312 close_nointr_nofail(kmsg_socket_pair[0]);
1313 kmsg_socket_pair[0] = -1;
a258bf26 1314
d87be9b0 1315 reset_all_signal_handlers();
88213476 1316
d87be9b0
LP
1317 assert_se(sigemptyset(&mask) == 0);
1318 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
f5c1b9ee 1319
842f3b0f
LP
1320 k = open_terminal(console, O_RDWR);
1321 if (k != STDIN_FILENO) {
1322 if (k >= 0) {
1323 close_nointr_nofail(k);
1324 k = -EINVAL;
1325 }
1326
1327 log_error("Failed to open console: %s", strerror(-k));
1328 goto child_fail;
1329 }
1330
1331 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1332 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1333 log_error("Failed to duplicate console: %m");
d87be9b0 1334 goto child_fail;
842f3b0f 1335 }
bc2f673e 1336
d87be9b0
LP
1337 if (setsid() < 0) {
1338 log_error("setsid() failed: %m");
bc2f673e
LP
1339 goto child_fail;
1340 }
1341
d87be9b0
LP
1342 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1343 log_error("PR_SET_PDEATHSIG failed: %m");
1344 goto child_fail;
1345 }
e58a1277 1346
d87be9b0
LP
1347 /* Mark everything as slave, so that we still
1348 * receive mounts from the real root, but don't
1349 * propagate mounts to the real root. */
1350 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1351 log_error("MS_SLAVE|MS_REC failed: %m");
1352 goto child_fail;
1353 }
04bc4a3f 1354
d87be9b0
LP
1355 /* Turn directory into bind mount */
1356 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1357 log_error("Failed to make bind mount.");
1358 goto child_fail;
1359 }
88213476 1360
d87be9b0
LP
1361 if (arg_read_only)
1362 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1363 log_error("Failed to make read-only.");
1364 goto child_fail;
1365 }
2547bb41 1366
d87be9b0
LP
1367 if (mount_all(arg_directory) < 0)
1368 goto child_fail;
57fb9fb5 1369
d87be9b0
LP
1370 if (copy_devnodes(arg_directory) < 0)
1371 goto child_fail;
a258bf26 1372
f2d88580
LP
1373 if (setup_ptmx(arg_directory) < 0)
1374 goto child_fail;
1375
d87be9b0 1376 dev_setup(arg_directory);
88213476 1377
d87be9b0
LP
1378 if (setup_dev_console(arg_directory, console) < 0)
1379 goto child_fail;
88213476 1380
d87be9b0
LP
1381 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1382 goto child_fail;
88213476 1383
d87be9b0 1384 close_nointr_nofail(kmsg_socket_pair[1]);
842f3b0f 1385 kmsg_socket_pair[1] = -1;
a258bf26 1386
d87be9b0
LP
1387 if (setup_boot_id(arg_directory) < 0)
1388 goto child_fail;
a41fe3a2 1389
d87be9b0
LP
1390 if (setup_timezone(arg_directory) < 0)
1391 goto child_fail;
88213476 1392
d87be9b0
LP
1393 if (setup_resolv_conf(arg_directory) < 0)
1394 goto child_fail;
687d0825 1395
d87be9b0 1396 if (setup_journal(arg_directory) < 0)
687d0825 1397 goto child_fail;
687d0825 1398
17fe0523
LP
1399 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1400 goto child_fail;
1401
1402 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1403 goto child_fail;
1404
486e99a3 1405 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
9bd37b40
LP
1406 goto child_fail;
1407
d87be9b0
LP
1408 if (chdir(arg_directory) < 0) {
1409 log_error("chdir(%s) failed: %m", arg_directory);
687d0825
MV
1410 goto child_fail;
1411 }
1412
d87be9b0
LP
1413 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1414 log_error("mount(MS_MOVE) failed: %m");
687d0825
MV
1415 goto child_fail;
1416 }
1417
d87be9b0
LP
1418 if (chroot(".") < 0) {
1419 log_error("chroot() failed: %m");
687d0825
MV
1420 goto child_fail;
1421 }
1422
d87be9b0
LP
1423 if (chdir("/") < 0) {
1424 log_error("chdir() failed: %m");
687d0825
MV
1425 goto child_fail;
1426 }
1427
d87be9b0
LP
1428 umask(0022);
1429
1430 loopback_setup();
1431
1432 if (drop_capabilities() < 0) {
1433 log_error("drop_capabilities() failed: %m");
687d0825
MV
1434 goto child_fail;
1435 }
687d0825 1436
d87be9b0
LP
1437 if (arg_user) {
1438
963ddb91
LP
1439 /* Note that this resolves user names
1440 * inside the container, and hence
1441 * accesses the NSS modules from the
1442 * container and not the host. This is
1443 * a bit weird... */
1444
d87be9b0
LP
1445 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1446 log_error("get_user_creds() failed: %m");
1447 goto child_fail;
1448 }
1449
1450 if (mkdir_parents_label(home, 0775) < 0) {
1451 log_error("mkdir_parents_label() failed: %m");
1452 goto child_fail;
1453 }
1454
1455 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1456 log_error("mkdir_safe_label() failed: %m");
1457 goto child_fail;
1458 }
1459
1460 if (initgroups((const char*)arg_user, gid) < 0) {
1461 log_error("initgroups() failed: %m");
1462 goto child_fail;
1463 }
144f0fc0 1464
d87be9b0
LP
1465 if (setresgid(gid, gid, gid) < 0) {
1466 log_error("setregid() failed: %m");
1467 goto child_fail;
1468 }
1469
1470 if (setresuid(uid, uid, uid) < 0) {
1471 log_error("setreuid() failed: %m");
1472 goto child_fail;
1473 }
3c957acf
LP
1474 } else {
1475 /* Reset everything fully to 0, just in case */
1476
1477 if (setgroups(0, NULL) < 0) {
1478 log_error("setgroups() failed: %m");
1479 goto child_fail;
1480 }
1481
1482 if (setresgid(0, 0, 0) < 0) {
1483 log_error("setregid() failed: %m");
1484 goto child_fail;
1485 }
1486
1487 if (setresuid(0, 0, 0) < 0) {
1488 log_error("setreuid() failed: %m");
1489 goto child_fail;
1490 }
d87be9b0
LP
1491 }
1492
842f3b0f
LP
1493 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1494 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1495 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 1496 log_oom();
144f0fc0
LP
1497 goto child_fail;
1498 }
687d0825 1499
9444b1f2
LP
1500 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1501 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
842f3b0f
LP
1502 log_oom();
1503 goto child_fail;
1504 }
1505 }
1506
1507 if (fdset_size(fds) > 0) {
1508 k = fdset_cloexec(fds, false);
1509 if (k < 0) {
1510 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1511 goto child_fail;
1512 }
1513
1514 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 1515 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0
LP
1516 log_oom();
1517 goto child_fail;
1518 }
1519 }
1520
1521 setup_hostname();
1522
354bfd2b
LP
1523 eventfd_read(sync_fd, &x);
1524 close_nointr_nofail(sync_fd);
1525 sync_fd = -1;
1526
f4889f65
LP
1527 if (!strv_isempty(arg_setenv)) {
1528 char **n;
1529
1530 n = strv_env_merge(2, envp, arg_setenv);
1531 if (!n) {
1532 log_oom();
1533 goto child_fail;
1534 }
1535
1536 env_use = n;
1537 } else
1538 env_use = (char**) envp;
1539
5d63309c 1540#ifdef HAVE_SELINUX
82adf6af
LP
1541 if (arg_selinux_context)
1542 if (setexeccon(arg_selinux_context) < 0)
1543 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
a8828ed9 1544#endif
d87be9b0
LP
1545 if (arg_boot) {
1546 char **a;
1547 size_t l;
88213476 1548
d87be9b0 1549 /* Automatically search for the init system */
0f0dbc46 1550
d87be9b0
LP
1551 l = 1 + argc - optind;
1552 a = newa(char*, l + 1);
1553 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 1554
d87be9b0 1555 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 1556 execve(a[0], a, env_use);
0f0dbc46 1557
d87be9b0 1558 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 1559 execve(a[0], a, env_use);
0f0dbc46 1560
d87be9b0 1561 a[0] = (char*) "/sbin/init";
f4889f65 1562 execve(a[0], a, env_use);
d87be9b0 1563 } else if (argc > optind)
f4889f65 1564 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
1565 else {
1566 chdir(home ? home : "/root");
f4889f65 1567 execle("/bin/bash", "-bash", NULL, env_use);
d87be9b0
LP
1568 }
1569
1570 log_error("execv() failed: %m");
0f0dbc46 1571
d87be9b0
LP
1572 child_fail:
1573 _exit(EXIT_FAILURE);
da5b3bad 1574 }
88213476 1575
842f3b0f
LP
1576 fdset_free(fds);
1577 fds = NULL;
1578
354bfd2b
LP
1579 r = register_machine(pid);
1580 if (r < 0)
1581 goto finish;
1582
1583 eventfd_write(sync_fd, 1);
1584 close_nointr_nofail(sync_fd);
1585 sync_fd = -1;
1586
04d39279
LP
1587 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1588 if (k < 0) {
1589 r = EXIT_FAILURE;
1590 break;
1591 }
88213476 1592
284c0b91
LP
1593 if (!arg_quiet)
1594 putc('\n', stdout);
04d39279
LP
1595
1596 /* Kill if it is not dead yet anyway */
1f0cd86b
LP
1597 terminate_machine(pid);
1598
1599 /* Redundant, but better safe than sorry */
04d39279 1600 kill(pid, SIGKILL);
a258bf26 1601
05947bef 1602 k = wait_for_terminate(pid, &status);
04d39279
LP
1603 pid = 0;
1604
05947bef 1605 if (k < 0) {
d87be9b0
LP
1606 r = EXIT_FAILURE;
1607 break;
1608 }
a258bf26 1609
d87be9b0 1610 if (status.si_code == CLD_EXITED) {
a5f5f8a0 1611 r = status.si_status;
d87be9b0 1612 if (status.si_status != 0) {
04d39279 1613 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
d87be9b0
LP
1614 break;
1615 }
1616
284c0b91
LP
1617 if (!arg_quiet)
1618 log_debug("Container %s exited successfully.", arg_machine);
d87be9b0
LP
1619 break;
1620 } else if (status.si_code == CLD_KILLED &&
1621 status.si_status == SIGINT) {
284c0b91
LP
1622
1623 if (!arg_quiet)
1624 log_info("Container %s has been shut down.", arg_machine);
d87be9b0
LP
1625 r = 0;
1626 break;
1627 } else if (status.si_code == CLD_KILLED &&
1628 status.si_status == SIGHUP) {
284c0b91
LP
1629
1630 if (!arg_quiet)
1631 log_info("Container %s is being rebooted.", arg_machine);
d87be9b0
LP
1632 continue;
1633 } else if (status.si_code == CLD_KILLED ||
1634 status.si_code == CLD_DUMPED) {
88213476 1635
04d39279 1636 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
d87be9b0
LP
1637 r = EXIT_FAILURE;
1638 break;
1639 } else {
04d39279 1640 log_error("Container %s failed due to unknown reason.", arg_machine);
d87be9b0
LP
1641 r = EXIT_FAILURE;
1642 break;
1643 }
1644 }
88213476
LP
1645
1646finish:
9444b1f2
LP
1647 if (pid > 0)
1648 kill(pid, SIGKILL);
88213476 1649
04d391da 1650 free(arg_directory);
7027ff61 1651 free(arg_machine);
f4889f65 1652 free(arg_setenv);
88213476
LP
1653
1654 return r;
1655}