]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: explain that we look for /etc/os-release in the container directory
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
a383724e 25#include <attr/xattr.h>
88213476
LP
26#include <sys/types.h>
27#include <sys/syscall.h>
28#include <sys/mount.h>
29#include <sys/wait.h>
30#include <stdlib.h>
31#include <string.h>
32#include <stdio.h>
33#include <errno.h>
34#include <sys/prctl.h>
35#include <sys/capability.h>
36#include <getopt.h>
1fd96121 37#include <sys/poll.h>
a258bf26
LP
38#include <sys/epoll.h>
39#include <termios.h>
40#include <sys/signalfd.h>
687d0825 41#include <grp.h>
5ed27dbd 42#include <linux/fs.h>
9537eab0
LP
43#include <sys/un.h>
44#include <sys/socket.h>
88213476 45
81527be1
LP
46#include <systemd/sd-daemon.h>
47
88213476
LP
48#include "log.h"
49#include "util.h"
49e942b2 50#include "mkdir.h"
6b2d0e85 51#include "macro.h"
d7832d2c 52#include "audit.h"
94d82985 53#include "missing.h"
04d391da 54#include "cgroup-util.h"
a258bf26 55#include "strv.h"
9eb977db 56#include "path-util.h"
a41fe3a2 57#include "loopback-setup.h"
57fb9fb5 58#include "sd-id128.h"
4fc9982c 59#include "dev-setup.h"
842f3b0f 60#include "fdset.h"
acbeb427 61#include "build.h"
a5c32cff 62#include "fileio.h"
57fb9fb5 63
f2d88580
LP
64#ifndef TTY_GID
65#define TTY_GID 5
66#endif
67
57fb9fb5
LP
/* How the journal directory of the container relates to the host
 * (selected with --link-journal= / -j, consumed by setup_journal()). */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* setup_journal() does nothing */
        LINK_AUTO  = 1,  /* default: best effort, missing prerequisites are silently tolerated */
        LINK_HOST  = 2,  /* host journal directory is created and bind mounted into the guest */
        LINK_GUEST = 3   /* host journal path is made a symlink to the guest's directory */
} LinkJournal;
88213476
LP
74
75static char *arg_directory = NULL;
687d0825 76static char *arg_user = NULL;
40c32a4a 77static char **arg_controllers = NULL;
144f0fc0 78static char *arg_uuid = NULL;
7027ff61 79static char *arg_machine = NULL;
ff01d048 80static bool arg_private_network = false;
bc2f673e 81static bool arg_read_only = false;
0f0dbc46 82static bool arg_boot = false;
57fb9fb5 83static LinkJournal arg_link_journal = LINK_AUTO;
5076f0cc
LP
84static uint64_t arg_retain =
85 (1ULL << CAP_CHOWN) |
86 (1ULL << CAP_DAC_OVERRIDE) |
87 (1ULL << CAP_DAC_READ_SEARCH) |
88 (1ULL << CAP_FOWNER) |
89 (1ULL << CAP_FSETID) |
90 (1ULL << CAP_IPC_OWNER) |
91 (1ULL << CAP_KILL) |
92 (1ULL << CAP_LEASE) |
93 (1ULL << CAP_LINUX_IMMUTABLE) |
94 (1ULL << CAP_NET_BIND_SERVICE) |
95 (1ULL << CAP_NET_BROADCAST) |
96 (1ULL << CAP_NET_RAW) |
97 (1ULL << CAP_SETGID) |
98 (1ULL << CAP_SETFCAP) |
99 (1ULL << CAP_SETPCAP) |
100 (1ULL << CAP_SETUID) |
101 (1ULL << CAP_SYS_ADMIN) |
102 (1ULL << CAP_SYS_CHROOT) |
103 (1ULL << CAP_SYS_NICE) |
104 (1ULL << CAP_SYS_PTRACE) |
105 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 106 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
107 (1ULL << CAP_SYS_BOOT) |
108 (1ULL << CAP_AUDIT_WRITE) |
109 (1ULL << CAP_AUDIT_CONTROL);
17fe0523
LP
110static char **arg_bind = NULL;
111static char **arg_bind_ro = NULL;
88213476
LP
112
113static int help(void) {
114
115 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
116 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
17fe0523 117 " -h --help Show this help\n"
7027ff61 118 " --version Print version string\n"
17fe0523
LP
119 " -D --directory=NAME Root directory for the container\n"
120 " -b --boot Boot up full system (i.e. invoke init)\n"
121 " -u --user=USER Run the command under specified user or uid\n"
122 " -C --controllers=LIST Put the container in specified comma-separated\n"
123 " cgroup hierarchies\n"
124 " --uuid=UUID Set a specific machine UUID for the container\n"
7027ff61 125 " -M --machine=NAME Set the machine name for the container\n"
17fe0523
LP
126 " --private-network Disable network in container\n"
127 " --read-only Mount the root directory read-only\n"
128 " --capability=CAP In addition to the default, retain specified\n"
129 " capability\n"
130 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
131 " -j Equivalent to --link-journal=host\n"
132 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
133 " the container\n"
134 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
88213476
LP
135 program_invocation_short_name);
136
137 return 0;
138}
139
140static int parse_argv(int argc, char *argv[]) {
141
a41fe3a2 142 enum {
acbeb427
ZJS
143 ARG_VERSION = 0x100,
144 ARG_PRIVATE_NETWORK,
bc2f673e 145 ARG_UUID,
5076f0cc 146 ARG_READ_ONLY,
57fb9fb5 147 ARG_CAPABILITY,
17fe0523
LP
148 ARG_LINK_JOURNAL,
149 ARG_BIND,
150 ARG_BIND_RO
a41fe3a2
LP
151 };
152
88213476 153 static const struct option options[] = {
ff01d048 154 { "help", no_argument, NULL, 'h' },
acbeb427 155 { "version", no_argument, NULL, ARG_VERSION },
ff01d048
LP
156 { "directory", required_argument, NULL, 'D' },
157 { "user", required_argument, NULL, 'u' },
40c32a4a 158 { "controllers", required_argument, NULL, 'C' },
ff01d048 159 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
0f0dbc46 160 { "boot", no_argument, NULL, 'b' },
144f0fc0 161 { "uuid", required_argument, NULL, ARG_UUID },
bc2f673e 162 { "read-only", no_argument, NULL, ARG_READ_ONLY },
5076f0cc 163 { "capability", required_argument, NULL, ARG_CAPABILITY },
57fb9fb5 164 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
17fe0523
LP
165 { "bind", required_argument, NULL, ARG_BIND },
166 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
7027ff61 167 { "machine", required_argument, NULL, 'M' },
ff01d048 168 { NULL, 0, NULL, 0 }
88213476
LP
169 };
170
171 int c;
172
173 assert(argc >= 0);
174 assert(argv);
175
d7e011e5 176 while ((c = getopt_long(argc, argv, "+hD:u:C:bM:j", options, NULL)) >= 0) {
88213476
LP
177
178 switch (c) {
179
180 case 'h':
181 help();
182 return 0;
183
acbeb427
ZJS
184 case ARG_VERSION:
185 puts(PACKAGE_STRING);
186 puts(SYSTEMD_FEATURES);
187 return 0;
188
88213476
LP
189 case 'D':
190 free(arg_directory);
3a74cea5
LP
191 arg_directory = canonicalize_file_name(optarg);
192 if (!arg_directory) {
193 log_error("Failed to canonicalize root directory.");
88213476
LP
194 return -ENOMEM;
195 }
196
197 break;
198
687d0825
MV
199 case 'u':
200 free(arg_user);
7027ff61
LP
201 arg_user = strdup(optarg);
202 if (!arg_user)
203 return log_oom();
687d0825
MV
204
205 break;
206
40c32a4a
LGL
207 case 'C':
208 strv_free(arg_controllers);
209 arg_controllers = strv_split(optarg, ",");
7027ff61
LP
210 if (!arg_controllers)
211 return log_oom();
40c32a4a 212
7027ff61 213 cg_shorten_controllers(arg_controllers);
40c32a4a
LGL
214 break;
215
ff01d048
LP
216 case ARG_PRIVATE_NETWORK:
217 arg_private_network = true;
a41fe3a2
LP
218 break;
219
0f0dbc46
LP
220 case 'b':
221 arg_boot = true;
222 break;
223
144f0fc0 224 case ARG_UUID:
aa96c6cb
LP
225 if (!id128_is_valid(optarg)) {
226 log_error("Invalid UUID: %s", optarg);
227 return -EINVAL;
228 }
229
144f0fc0
LP
230 arg_uuid = optarg;
231 break;
232
7027ff61
LP
233 case 'M':
234 if (!hostname_is_valid(optarg)) {
235 log_error("Invalid machine name: %s", optarg);
236 return -EINVAL;
237 }
238
239 free(arg_machine);
240 arg_machine = strdup(optarg);
241 if (!arg_machine)
242 return log_oom();
243
244 break;
245
bc2f673e
LP
246 case ARG_READ_ONLY:
247 arg_read_only = true;
248 break;
249
5076f0cc
LP
250 case ARG_CAPABILITY: {
251 char *state, *word;
252 size_t length;
253
254 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
255 cap_value_t cap;
256 char *t;
257
258 t = strndup(word, length);
0d0f0c50
SL
259 if (!t)
260 return log_oom();
5076f0cc
LP
261
262 if (cap_from_name(t, &cap) < 0) {
263 log_error("Failed to parse capability %s.", t);
264 free(t);
265 return -EINVAL;
266 }
267
268 free(t);
269 arg_retain |= 1ULL << (uint64_t) cap;
270 }
271
272 break;
273 }
274
57fb9fb5
LP
275 case 'j':
276 arg_link_journal = LINK_GUEST;
277 break;
278
279 case ARG_LINK_JOURNAL:
280 if (streq(optarg, "auto"))
281 arg_link_journal = LINK_AUTO;
282 else if (streq(optarg, "no"))
283 arg_link_journal = LINK_NO;
284 else if (streq(optarg, "guest"))
285 arg_link_journal = LINK_GUEST;
286 else if (streq(optarg, "host"))
287 arg_link_journal = LINK_HOST;
288 else {
289 log_error("Failed to parse link journal mode %s", optarg);
290 return -EINVAL;
291 }
292
293 break;
294
17fe0523
LP
295 case ARG_BIND:
296 case ARG_BIND_RO: {
297 _cleanup_free_ char *a = NULL, *b = NULL;
298 char *e;
299 char ***x;
300 int r;
301
302 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
303
304 e = strchr(optarg, ':');
305 if (e) {
306 a = strndup(optarg, e - optarg);
307 b = strdup(e + 1);
308 } else {
309 a = strdup(optarg);
310 b = strdup(optarg);
311 }
312
313 if (!a || !b)
314 return log_oom();
315
316 if (!path_is_absolute(a) || !path_is_absolute(b)) {
317 log_error("Invalid bind mount specification: %s", optarg);
318 return -EINVAL;
319 }
320
321 r = strv_extend(x, a);
322 if (r < 0)
323 return r;
324
325 r = strv_extend(x, b);
326 if (r < 0)
327 return r;
328
329 break;
330 }
331
88213476
LP
332 case '?':
333 return -EINVAL;
334
335 default:
336 log_error("Unknown option code %c", c);
337 return -EINVAL;
338 }
339 }
340
341 return 1;
342}
343
344static int mount_all(const char *dest) {
345
346 typedef struct MountPoint {
347 const char *what;
348 const char *where;
349 const char *type;
350 const char *options;
351 unsigned long flags;
3bd66c05 352 bool fatal;
88213476
LP
353 } MountPoint;
354
355 static const MountPoint mount_table[] = {
4b7a6af4 356 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
b4c59701
LP
357 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
358 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
e65aec12 359 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
635f7d8c 360 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 361 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
ede89845 362 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
635f7d8c 363 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 364#ifdef HAVE_SELINUX
b4c59701
LP
365 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
366 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 367#endif
88213476
LP
368 };
369
370 unsigned k;
371 int r = 0;
372
373 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 374 _cleanup_free_ char *where = NULL;
88213476
LP
375 int t;
376
17fe0523
LP
377 where = strjoin(dest, "/", mount_table[k].where, NULL);
378 if (!where)
379 return log_oom();
88213476 380
e65aec12 381 t = path_is_mount_point(where, true);
68fb0892 382 if (t < 0) {
88213476 383 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
88213476
LP
384
385 if (r == 0)
386 r = t;
387
388 continue;
389 }
390
9c1c7f71
LP
391 /* Skip this entry if it is not a remount. */
392 if (mount_table[k].what && t > 0)
014a9c77
LP
393 continue;
394
17fe0523 395 mkdir_p(where, 0755);
88213476
LP
396
397 if (mount(mount_table[k].what,
398 where,
399 mount_table[k].type,
400 mount_table[k].flags,
3bd66c05
LP
401 mount_table[k].options) < 0 &&
402 mount_table[k].fatal) {
88213476
LP
403
404 log_error("mount(%s) failed: %m", where);
405
406 if (r == 0)
407 r = -errno;
408 }
88213476
LP
409 }
410
e58a1277
LP
411 return r;
412}
f8440af5 413
17fe0523
LP
414static int mount_binds(const char *dest, char **l, unsigned long flags) {
415 char **x, **y;
416
417 STRV_FOREACH_PAIR(x, y, l) {
418 _cleanup_free_ char *where = NULL;
419
420 where = strjoin(dest, "/", *y, NULL);
421 if (!where)
422 return log_oom();
423
424 mkdir_p_label(where, 0755);
425
426 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
427 log_error("mount(%s) failed: %m", where);
428 return -errno;
429 }
430
431 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
432 log_error("mount(%s) failed: %m", where);
433 return -errno;
434 }
435 }
436
437 return 0;
438}
439
e58a1277 440static int setup_timezone(const char *dest) {
d4036145
LP
441 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
442 char *z, *y;
443 int r;
f8440af5 444
e58a1277
LP
445 assert(dest);
446
447 /* Fix the timezone, if possible */
d4036145
LP
448 r = readlink_malloc("/etc/localtime", &p);
449 if (r < 0) {
450 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
451 return 0;
452 }
453
454 z = path_startswith(p, "../usr/share/zoneinfo/");
455 if (!z)
456 z = path_startswith(p, "/usr/share/zoneinfo/");
457 if (!z) {
458 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
459 return 0;
460 }
461
04bc4a3f
LP
462 where = strappend(dest, "/etc/localtime");
463 if (!where)
0d0f0c50 464 return log_oom();
715ac17a 465
d4036145
LP
466 r = readlink_malloc(where, &q);
467 if (r >= 0) {
468 y = path_startswith(q, "../usr/share/zoneinfo/");
469 if (!y)
470 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 471
4d1c38b8 472
d4036145
LP
473 /* Already pointing to the right place? Then do nothing .. */
474 if (y && streq(y, z))
475 return 0;
476 }
477
478 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
479 if (!check)
0d0f0c50 480 return log_oom();
4d1c38b8 481
d4036145
LP
482 if (access(check, F_OK) < 0) {
483 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
484 return 0;
485 }
68fb0892 486
d4036145
LP
487 what = strappend("../usr/share/zoneinfo/", z);
488 if (!what)
489 return log_oom();
490
491 unlink(where);
492 if (symlink(what, where) < 0) {
493 log_error("Failed to correct timezone of container: %m");
494 return 0;
495 }
e58a1277
LP
496
497 return 0;
88213476
LP
498}
499
2547bb41 500static int setup_resolv_conf(const char *dest) {
f333fbb1
ZJS
501 char _cleanup_free_ *where = NULL;
502 _cleanup_close_ int fd = -1;
2547bb41
LP
503
504 assert(dest);
505
506 if (arg_private_network)
507 return 0;
508
509 /* Fix resolv.conf, if possible */
04bc4a3f
LP
510 where = strappend(dest, "/etc/resolv.conf");
511 if (!where)
0d0f0c50 512 return log_oom();
2547bb41 513
f333fbb1
ZJS
514 fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
515
77e63faf
LP
516 /* We don't really care for the results of this really. If it
517 * fails, it fails, but meh... */
f333fbb1
ZJS
518 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
519 log_warning("Failed to bind mount /etc/resolv.conf: %m");
520 else
521 if (mount("/etc/resolv.conf", where, "bind",
522 MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
523 log_error("Failed to remount /etc/resolv.conf readonly: %m");
524 return -errno;
525 }
2547bb41
LP
526
527 return 0;
528}
529
04bc4a3f 530static int setup_boot_id(const char *dest) {
7fd1b19b 531 _cleanup_free_ char *from = NULL, *to = NULL;
04bc4a3f
LP
532 sd_id128_t rnd;
533 char as_uuid[37];
534 int r;
535
536 assert(dest);
537
538 /* Generate a new randomized boot ID, so that each boot-up of
539 * the container gets a new one */
540
541 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 542 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
543 if (!from || !to)
544 return log_oom();
04bc4a3f
LP
545
546 r = sd_id128_randomize(&rnd);
547 if (r < 0) {
548 log_error("Failed to generate random boot id: %s", strerror(-r));
ed8b7a3e 549 return r;
04bc4a3f
LP
550 }
551
552 snprintf(as_uuid, sizeof(as_uuid),
553 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
554 SD_ID128_FORMAT_VAL(rnd));
555 char_array_0(as_uuid);
556
574d5f2d 557 r = write_string_file(from, as_uuid);
04bc4a3f
LP
558 if (r < 0) {
559 log_error("Failed to write boot id: %s", strerror(-r));
ed8b7a3e 560 return r;
04bc4a3f
LP
561 }
562
563 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
564 log_error("Failed to bind mount boot id: %m");
565 r = -errno;
10d18763
ZJS
566 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
567 log_warning("Failed to make boot id read-only: %m");
04bc4a3f
LP
568
569 unlink(from);
04bc4a3f
LP
570 return r;
571}
572
e58a1277 573static int copy_devnodes(const char *dest) {
88213476
LP
574
575 static const char devnodes[] =
576 "null\0"
577 "zero\0"
578 "full\0"
579 "random\0"
580 "urandom\0"
f2d88580 581 "tty\0";
88213476
LP
582
583 const char *d;
e58a1277 584 int r = 0;
7fd1b19b 585 _cleanup_umask_ mode_t u;
a258bf26
LP
586
587 assert(dest);
124640f1
LP
588
589 u = umask(0000);
88213476
LP
590
591 NULSTR_FOREACH(d, devnodes) {
e58a1277 592 struct stat st;
7fd1b19b 593 _cleanup_free_ char *from = NULL, *to = NULL;
88213476
LP
594
595 asprintf(&from, "/dev/%s", d);
596 asprintf(&to, "%s/dev/%s", dest, d);
597
598 if (!from || !to) {
ed8b7a3e 599 log_oom();
a258bf26 600
88213476
LP
601 if (r == 0)
602 r = -ENOMEM;
603
604 break;
605 }
606
607 if (stat(from, &st) < 0) {
608
609 if (errno != ENOENT) {
610 log_error("Failed to stat %s: %m", from);
88213476
LP
611 if (r == 0)
612 r = -errno;
613 }
614
a258bf26 615 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 616
ed8b7a3e 617 log_error("%s is not a char or block device, cannot copy", from);
a258bf26
LP
618 if (r == 0)
619 r = -EIO;
620
621 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
622
623 log_error("mknod(%s) failed: %m", dest);
624 if (r == 0)
625 r = -errno;
88213476 626 }
88213476
LP
627 }
628
e58a1277
LP
629 return r;
630}
88213476 631
f2d88580
LP
632static int setup_ptmx(const char *dest) {
633 _cleanup_free_ char *p = NULL;
634
635 p = strappend(dest, "/dev/ptmx");
636 if (!p)
637 return log_oom();
638
639 if (symlink("pts/ptmx", p) < 0) {
640 log_error("Failed to create /dev/ptmx symlink: %m");
641 return -errno;
642 }
643
644 return 0;
645}
646
e58a1277
LP
647static int setup_dev_console(const char *dest, const char *console) {
648 struct stat st;
7fd1b19b 649 _cleanup_free_ char *to = NULL;
e58a1277 650 int r;
7fd1b19b 651 _cleanup_umask_ mode_t u;
e58a1277
LP
652
653 assert(dest);
654 assert(console);
655
656 u = umask(0000);
657
658 if (stat(console, &st) < 0) {
659 log_error("Failed to stat %s: %m", console);
25ea79fe 660 return -errno;
88213476 661
a258bf26 662 } else if (!S_ISCHR(st.st_mode)) {
25ea79fe
ZJS
663 log_error("/dev/console is not a char device");
664 return -EIO;
e58a1277 665 }
88213476 666
e58a1277
LP
667 r = chmod_and_chown(console, 0600, 0, 0);
668 if (r < 0) {
669 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
25ea79fe 670 return r;
a258bf26 671 }
88213476 672
25ea79fe
ZJS
673 if (asprintf(&to, "%s/dev/console", dest) < 0)
674 return log_oom();
88213476 675
a258bf26
LP
676 /* We need to bind mount the right tty to /dev/console since
677 * ptys can only exist on pts file systems. To have something
678 * to bind mount things on we create a device node first, that
679 * has the right major/minor (note that the major minor
680 * doesn't actually matter here, since we mount it over
681 * anyway). */
682
e58a1277
LP
683 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
684 log_error("mknod() for /dev/console failed: %m");
25ea79fe 685 return -errno;
e58a1277 686 }
a258bf26
LP
687
688 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
e58a1277 689 log_error("Bind mount for /dev/console failed: %m");
25ea79fe 690 return -errno;
a258bf26
LP
691 }
692
25ea79fe 693 return 0;
e58a1277
LP
694}
695
696static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 697 _cleanup_free_ char *from = NULL, *to = NULL;
e58a1277 698 int r, fd, k;
7fd1b19b 699 _cleanup_umask_ mode_t u;
e58a1277
LP
700 union {
701 struct cmsghdr cmsghdr;
702 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
703 } control = {};
704 struct msghdr mh = {
705 .msg_control = &control,
706 .msg_controllen = sizeof(control),
707 };
e58a1277
LP
708 struct cmsghdr *cmsg;
709
710 assert(dest);
711 assert(kmsg_socket >= 0);
a258bf26 712
e58a1277 713 u = umask(0000);
a258bf26 714
f1e5dfe2
LP
715 /* We create the kmsg FIFO as /dev/kmsg, but immediately
716 * delete it after bind mounting it to /proc/kmsg. While FIFOs
717 * on the reading side behave very similar to /proc/kmsg,
718 * their writing side behaves differently from /dev/kmsg in
719 * that writing blocks when nothing is reading. In order to
720 * avoid any problems with containers deadlocking due to this
721 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
722 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
723 asprintf(&to, "%s/proc/kmsg", dest) < 0)
724 return log_oom();
e58a1277
LP
725
726 if (mkfifo(from, 0600) < 0) {
727 log_error("mkfifo() for /dev/kmsg failed: %m");
25ea79fe 728 return -errno;
e58a1277
LP
729 }
730
731 r = chmod_and_chown(from, 0600, 0, 0);
732 if (r < 0) {
733 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
25ea79fe 734 return r;
e58a1277
LP
735 }
736
737 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
738 log_error("Bind mount for /proc/kmsg failed: %m");
25ea79fe 739 return -errno;
e58a1277
LP
740 }
741
742 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
743 if (fd < 0) {
744 log_error("Failed to open fifo: %m");
25ea79fe 745 return -errno;
e58a1277
LP
746 }
747
e58a1277
LP
748 cmsg = CMSG_FIRSTHDR(&mh);
749 cmsg->cmsg_level = SOL_SOCKET;
750 cmsg->cmsg_type = SCM_RIGHTS;
751 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
752 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
753
754 mh.msg_controllen = cmsg->cmsg_len;
755
756 /* Store away the fd in the socket, so that it stays open as
757 * long as we run the child */
758 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
759 close_nointr_nofail(fd);
760
761 if (k < 0) {
762 log_error("Failed to send FIFO fd: %m");
25ea79fe 763 return -errno;
a258bf26
LP
764 }
765
f1e5dfe2
LP
766 /* And now make the FIFO unavailable as /dev/kmsg... */
767 unlink(from);
25ea79fe 768 return 0;
88213476
LP
769}
770
3a74cea5 771static int setup_hostname(void) {
3a74cea5 772
7027ff61
LP
773 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
774 return -errno;
3a74cea5 775
7027ff61 776 return 0;
3a74cea5
LP
777}
778
57fb9fb5
LP
779static int setup_journal(const char *directory) {
780 sd_id128_t machine_id;
7fd1b19b 781 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 782 char *id;
57fb9fb5
LP
783 int r;
784
785 if (arg_link_journal == LINK_NO)
786 return 0;
787
788 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
789 if (!p)
790 return log_oom();
57fb9fb5
LP
791
792 r = read_one_line_file(p, &b);
27407a01
ZJS
793 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
794 return 0;
795 else if (r < 0) {
796 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
57fb9fb5
LP
797 return r;
798 }
799
27407a01
ZJS
800 id = strstrip(b);
801 if (isempty(id) && arg_link_journal == LINK_AUTO)
802 return 0;
57fb9fb5 803
27407a01
ZJS
804 /* Verify validity */
805 r = sd_id128_from_string(id, &machine_id);
57fb9fb5 806 if (r < 0) {
27407a01
ZJS
807 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
808 return r;
57fb9fb5
LP
809 }
810
811 free(p);
27407a01
ZJS
812 p = strappend("/var/log/journal/", id);
813 q = strjoin(directory, "/var/log/journal/", id, NULL);
814 if (!p || !q)
815 return log_oom();
816
817 if (path_is_mount_point(p, false) > 0) {
818 if (arg_link_journal != LINK_AUTO) {
819 log_error("%s: already a mount point, refusing to use for journal", p);
820 return -EEXIST;
821 }
822
823 return 0;
57fb9fb5
LP
824 }
825
27407a01 826 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 827 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
828 log_error("%s: already a mount point, refusing to use for journal", q);
829 return -EEXIST;
57fb9fb5
LP
830 }
831
27407a01 832 return 0;
57fb9fb5
LP
833 }
834
835 r = readlink_and_make_absolute(p, &d);
836 if (r >= 0) {
837 if ((arg_link_journal == LINK_GUEST ||
838 arg_link_journal == LINK_AUTO) &&
839 path_equal(d, q)) {
840
27407a01
ZJS
841 r = mkdir_p(q, 0755);
842 if (r < 0)
843 log_warning("failed to create directory %s: %m", q);
844 return 0;
57fb9fb5
LP
845 }
846
847 if (unlink(p) < 0) {
848 log_error("Failed to remove symlink %s: %m", p);
27407a01 849 return -errno;
57fb9fb5
LP
850 }
851 } else if (r == -EINVAL) {
852
853 if (arg_link_journal == LINK_GUEST &&
854 rmdir(p) < 0) {
855
27407a01
ZJS
856 if (errno == ENOTDIR) {
857 log_error("%s already exists and is neither a symlink nor a directory", p);
858 return r;
859 } else {
57fb9fb5 860 log_error("Failed to remove %s: %m", p);
27407a01 861 return -errno;
57fb9fb5 862 }
57fb9fb5
LP
863 }
864 } else if (r != -ENOENT) {
865 log_error("readlink(%s) failed: %m", p);
27407a01 866 return r;
57fb9fb5
LP
867 }
868
869 if (arg_link_journal == LINK_GUEST) {
870
871 if (symlink(q, p) < 0) {
872 log_error("Failed to symlink %s to %s: %m", q, p);
27407a01 873 return -errno;
57fb9fb5
LP
874 }
875
27407a01
ZJS
876 r = mkdir_p(q, 0755);
877 if (r < 0)
878 log_warning("failed to create directory %s: %m", q);
879 return 0;
57fb9fb5
LP
880 }
881
882 if (arg_link_journal == LINK_HOST) {
883 r = mkdir_p(p, 0755);
884 if (r < 0) {
885 log_error("Failed to create %s: %m", p);
27407a01 886 return r;
57fb9fb5
LP
887 }
888
27407a01
ZJS
889 } else if (access(p, F_OK) < 0)
890 return 0;
57fb9fb5
LP
891
892 if (dir_is_empty(q) == 0) {
893 log_error("%s not empty.", q);
27407a01 894 return -ENOTEMPTY;
57fb9fb5
LP
895 }
896
897 r = mkdir_p(q, 0755);
898 if (r < 0) {
899 log_error("Failed to create %s: %m", q);
27407a01 900 return r;
57fb9fb5
LP
901 }
902
903 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
904 log_error("Failed to bind mount journal from host into guest: %m");
27407a01 905 return -errno;
57fb9fb5
LP
906 }
907
27407a01 908 return 0;
57fb9fb5
LP
909}
910
7027ff61
LP
911static int setup_cgroup(const char *path) {
912 char **c;
913 int r;
914
915 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
916 if (r < 0) {
917 log_error("Failed to create cgroup: %s", strerror(-r));
918 return r;
919 }
920
921 STRV_FOREACH(c, arg_controllers) {
922 r = cg_create_and_attach(*c, path, 1);
923 if (r < 0)
924 log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
925 }
926
927 return 0;
928}
929
a383724e 930static int save_attributes(const char *cgroup, pid_t pid, const char *uuid, const char *directory) {
dc2c7560
LP
931 _cleanup_free_ char *path = NULL;
932 char buf[DECIMAL_STR_MAX(pid_t)];
a383724e
ZJS
933 int r = 0, k;
934
935 assert(cgroup);
936 assert(pid >= 0);
937 assert(arg_directory);
938
939#ifdef HAVE_XATTR
940 assert_se(snprintf(buf, sizeof(buf), "%lu", (unsigned long) pid) < (int) sizeof(buf));
941
dc2c7560
LP
942 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup, NULL, &path);
943 if (r < 0) {
944 log_error("Failed to get path: %s", strerror(-r));
945 return r;
a383724e
ZJS
946 }
947
948 r = setxattr(path, "trusted.init_pid", buf, strlen(buf), XATTR_CREATE);
949 if (r < 0)
950 log_warning("Failed to set %s attribute on %s: %m", "trusted.init_pid", path);
951
952 if (uuid) {
953 k = setxattr(path, "trusted.machine_id", uuid, strlen(uuid), XATTR_CREATE);
954 if (k < 0) {
955 log_warning("Failed to set %s attribute on %s: %m", "trusted.machine_id", path);
956 if (r == 0)
957 r = k;
958 }
959 }
960
961 k = setxattr(path, "trusted.root_directory", directory, strlen(directory), XATTR_CREATE);
962 if (k < 0) {
dc2c7560 963 log_warning("Failed to set %s attribute on %s: %m", "trusted.root_directory", path);
a383724e
ZJS
964 if (r == 0)
965 r = k;
966 }
967#endif
968 return r;
969}
970
88213476 971static int drop_capabilities(void) {
5076f0cc 972 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
973}
974
57cb4adf 975static int process_pty(int master, pid_t pid, sigset_t *mask) {
0c749d50 976
b72491a2 977 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
a258bf26
LP
978 size_t in_buffer_full = 0, out_buffer_full = 0;
979 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
980 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
a258bf26 981 int ep = -1, signal_fd = -1, r;
57cb4adf
LP
982 bool tried_orderly_shutdown = false;
983
984 assert(master >= 0);
985 assert(pid > 0);
986 assert(mask);
a258bf26
LP
987
988 fd_nonblock(STDIN_FILENO, 1);
989 fd_nonblock(STDOUT_FILENO, 1);
990 fd_nonblock(master, 1);
991
db7feb7e
LP
992 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
993 if (signal_fd < 0) {
a258bf26
LP
994 log_error("signalfd(): %m");
995 r = -errno;
996 goto finish;
997 }
998
db7feb7e
LP
999 ep = epoll_create1(EPOLL_CLOEXEC);
1000 if (ep < 0) {
a258bf26
LP
1001 log_error("Failed to create epoll: %m");
1002 r = -errno;
1003 goto finish;
1004 }
1005
51d88d1b
LP
1006 /* We read from STDIN only if this is actually a TTY,
1007 * otherwise we assume non-interactivity. */
1008 if (isatty(STDIN_FILENO)) {
1009 zero(stdin_ev);
1010 stdin_ev.events = EPOLLIN|EPOLLET;
1011 stdin_ev.data.fd = STDIN_FILENO;
1012
1013 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
1014 log_error("Failed to register STDIN in epoll: %m");
1015 r = -errno;
1016 goto finish;
1017 }
1018 }
a258bf26
LP
1019
1020 zero(stdout_ev);
1021 stdout_ev.events = EPOLLOUT|EPOLLET;
1022 stdout_ev.data.fd = STDOUT_FILENO;
1023
1024 zero(master_ev);
1025 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
1026 master_ev.data.fd = master;
1027
1028 zero(signal_ev);
1029 signal_ev.events = EPOLLIN;
1030 signal_ev.data.fd = signal_fd;
1031
f2956e80
MS
1032 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
1033 if (errno != EPERM) {
1034 log_error("Failed to register stdout in epoll: %m");
1035 r = -errno;
1036 goto finish;
1037 }
1038 /* stdout without epoll support. Likely redirected to regular file. */
1039 stdout_writable = true;
1040 }
1041
1042 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
a258bf26 1043 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
51d88d1b 1044 log_error("Failed to register fds in epoll: %m");
a258bf26
LP
1045 r = -errno;
1046 goto finish;
1047 }
1048
fd14078a 1049 for (;;) {
a258bf26
LP
1050 struct epoll_event ev[16];
1051 ssize_t k;
1052 int i, nfds;
1053
db7feb7e
LP
1054 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1055 if (nfds < 0) {
a258bf26
LP
1056
1057 if (errno == EINTR || errno == EAGAIN)
1058 continue;
1059
1060 log_error("epoll_wait(): %m");
1061 r = -errno;
1062 goto finish;
1063 }
1064
1065 assert(nfds >= 1);
1066
1067 for (i = 0; i < nfds; i++) {
1068 if (ev[i].data.fd == STDIN_FILENO) {
1069
fd14078a 1070 if (ev[i].events & (EPOLLIN|EPOLLHUP))
a258bf26
LP
1071 stdin_readable = true;
1072
1073 } else if (ev[i].data.fd == STDOUT_FILENO) {
1074
fd14078a 1075 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
a258bf26
LP
1076 stdout_writable = true;
1077
1078 } else if (ev[i].data.fd == master) {
1079
fd14078a 1080 if (ev[i].events & (EPOLLIN|EPOLLHUP))
a258bf26
LP
1081 master_readable = true;
1082
fd14078a 1083 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
a258bf26
LP
1084 master_writable = true;
1085
1086 } else if (ev[i].data.fd == signal_fd) {
1087 struct signalfd_siginfo sfsi;
1088 ssize_t n;
1089
db7feb7e
LP
1090 n = read(signal_fd, &sfsi, sizeof(sfsi));
1091 if (n != sizeof(sfsi)) {
a258bf26
LP
1092
1093 if (n >= 0) {
0c749d50 1094 log_error("Failed to read from signalfd: invalid block size");
a258bf26
LP
1095 r = -EIO;
1096 goto finish;
1097 }
1098
1099 if (errno != EINTR && errno != EAGAIN) {
0c749d50 1100 log_error("Failed to read from signalfd: %m");
a258bf26
LP
1101 r = -errno;
1102 goto finish;
1103 }
1104 } else {
1105
1106 if (sfsi.ssi_signo == SIGWINCH) {
1107 struct winsize ws;
1108
1109 /* The window size changed, let's forward that. */
a258bf26
LP
1110 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1111 ioctl(master, TIOCSWINSZ, &ws);
57cb4adf
LP
1112 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1113
1114 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1115
1116 /* This only works for systemd... */
1117 tried_orderly_shutdown = true;
1118 kill(pid, SIGRTMIN+3);
1119
a258bf26 1120 } else {
0c749d50 1121 r = 0;
a258bf26
LP
1122 goto finish;
1123 }
1124 }
1125 }
1126 }
1127
1128 while ((stdin_readable && in_buffer_full <= 0) ||
1129 (master_writable && in_buffer_full > 0) ||
1130 (master_readable && out_buffer_full <= 0) ||
1131 (stdout_writable && out_buffer_full > 0)) {
1132
b72491a2 1133 if (stdin_readable && in_buffer_full < LINE_MAX) {
a258bf26 1134
db7feb7e
LP
1135 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1136 if (k < 0) {
a258bf26 1137
fd14078a 1138 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 1139 stdin_readable = false;
a258bf26
LP
1140 else {
1141 log_error("read(): %m");
0c749d50 1142 r = -errno;
a258bf26
LP
1143 goto finish;
1144 }
1145 } else
1146 in_buffer_full += (size_t) k;
a258bf26
LP
1147 }
1148
1149 if (master_writable && in_buffer_full > 0) {
1150
db7feb7e
LP
1151 k = write(master, in_buffer, in_buffer_full);
1152 if (k < 0) {
a258bf26 1153
fd14078a 1154 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 1155 master_writable = false;
fd14078a 1156 else {
a258bf26 1157 log_error("write(): %m");
0c749d50 1158 r = -errno;
a258bf26
LP
1159 goto finish;
1160 }
1161
1162 } else {
1163 assert(in_buffer_full >= (size_t) k);
1164 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1165 in_buffer_full -= k;
1166 }
1167 }
1168
b72491a2 1169 if (master_readable && out_buffer_full < LINE_MAX) {
a258bf26 1170
db7feb7e
LP
1171 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1172 if (k < 0) {
a258bf26 1173
fd14078a 1174 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 1175 master_readable = false;
a258bf26
LP
1176 else {
1177 log_error("read(): %m");
0c749d50 1178 r = -errno;
a258bf26
LP
1179 goto finish;
1180 }
1181 } else
1182 out_buffer_full += (size_t) k;
a258bf26
LP
1183 }
1184
1185 if (stdout_writable && out_buffer_full > 0) {
1186
db7feb7e
LP
1187 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1188 if (k < 0) {
a258bf26 1189
fd14078a 1190 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 1191 stdout_writable = false;
fd14078a 1192 else {
a258bf26 1193 log_error("write(): %m");
0c749d50 1194 r = -errno;
a258bf26
LP
1195 goto finish;
1196 }
1197
1198 } else {
1199 assert(out_buffer_full >= (size_t) k);
1200 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1201 out_buffer_full -= k;
1202 }
1203 }
1204 }
fd14078a 1205 }
a258bf26
LP
1206
1207finish:
1208 if (ep >= 0)
1209 close_nointr_nofail(ep);
1210
1211 if (signal_fd >= 0)
1212 close_nointr_nofail(signal_fd);
1213
1214 return r;
1215}
88213476
LP
1216
1217int main(int argc, char *argv[]) {
1218 pid_t pid = 0;
04d391da 1219 int r = EXIT_FAILURE, k;
38158b92 1220 _cleanup_free_ char *newcg = NULL;
7027ff61
LP
1221 _cleanup_close_ int master = -1;
1222 int n_fd_passed;
a258bf26
LP
1223 const char *console = NULL;
1224 struct termios saved_attr, raw_attr;
1225 sigset_t mask;
1226 bool saved_attr_valid = false;
1227 struct winsize ws;
e58a1277 1228 int kmsg_socket_pair[2] = { -1, -1 };
842f3b0f 1229 FDSet *fds = NULL;
88213476
LP
1230
1231 log_parse_environment();
1232 log_open();
1233
05947bef
LP
1234 k = parse_argv(argc, argv);
1235 if (k < 0)
88213476 1236 goto finish;
05947bef
LP
1237 else if (k == 0) {
1238 r = EXIT_SUCCESS;
1239 goto finish;
1240 }
88213476
LP
1241
1242 if (arg_directory) {
1243 char *p;
1244
1245 p = path_make_absolute_cwd(arg_directory);
1246 free(arg_directory);
1247 arg_directory = p;
1248 } else
1249 arg_directory = get_current_dir_name();
1250
1251 if (!arg_directory) {
a383724e 1252 log_error("Failed to determine path, please use -D.");
88213476
LP
1253 goto finish;
1254 }
1255
1256 path_kill_slashes(arg_directory);
1257
7027ff61
LP
1258 if (!arg_machine) {
1259 arg_machine = strdup(path_get_file_name(arg_directory));
1260 if (!arg_machine) {
1261 log_oom();
1262 goto finish;
1263 }
1264
1265 hostname_cleanup(arg_machine);
1266 if (isempty(arg_machine)) {
1267 log_error("Failed to determine machine name automatically, please use -M.");
1268 goto finish;
1269 }
1270 }
1271
88213476
LP
1272 if (geteuid() != 0) {
1273 log_error("Need to be root.");
1274 goto finish;
1275 }
1276
04d391da
LP
1277 if (sd_booted() <= 0) {
1278 log_error("Not running on a systemd system.");
1279 goto finish;
1280 }
1281
88213476 1282 if (path_equal(arg_directory, "/")) {
6df6b939 1283 log_error("Spawning container on root directory not supported.");
88213476
LP
1284 goto finish;
1285 }
1286
66060897 1287 if (path_is_os_tree(arg_directory) <= 0) {
f8964235 1288 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
88213476
LP
1289 goto finish;
1290 }
1291
842f3b0f
LP
1292 log_close();
1293 n_fd_passed = sd_listen_fds(false);
1294 if (n_fd_passed > 0) {
1295 k = fdset_new_listen_fds(&fds, false);
1296 if (k < 0) {
1297 log_error("Failed to collect file descriptors: %s", strerror(-k));
1298 goto finish;
1299 }
1300 }
1301 fdset_close_others(fds);
1302 log_open();
1303
38158b92 1304 k = cg_get_machine_path(arg_machine, &newcg);
db7feb7e 1305 if (k < 0) {
7027ff61 1306 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
04d391da
LP
1307 goto finish;
1308 }
1309
05947bef
LP
1310 k = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1311 if (k <= 0 && k != -ENOENT) {
7027ff61 1312 log_error("Container already running.");
04d391da 1313
7027ff61
LP
1314 free(newcg);
1315 newcg = NULL;
1316
1317 goto finish;
40c32a4a
LGL
1318 }
1319
db7feb7e
LP
1320 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1321 if (master < 0) {
a258bf26
LP
1322 log_error("Failed to acquire pseudo tty: %m");
1323 goto finish;
1324 }
1325
db7feb7e
LP
1326 console = ptsname(master);
1327 if (!console) {
a258bf26
LP
1328 log_error("Failed to determine tty name: %m");
1329 goto finish;
1330 }
1331
1332 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1333
1334 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1335 ioctl(master, TIOCSWINSZ, &ws);
1336
1337 if (unlockpt(master) < 0) {
1338 log_error("Failed to unlock tty: %m");
1339 goto finish;
1340 }
1341
51d88d1b
LP
1342 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1343 saved_attr_valid = true;
a258bf26 1344
51d88d1b
LP
1345 raw_attr = saved_attr;
1346 cfmakeraw(&raw_attr);
1347 raw_attr.c_lflag &= ~ECHO;
1348 }
a258bf26 1349
e58a1277 1350 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
7027ff61 1351 log_error("Failed to create kmsg socket pair.");
e58a1277
LP
1352 goto finish;
1353 }
1354
05947bef
LP
1355 sd_notify(0, "READY=1");
1356
a258bf26
LP
1357 assert_se(sigemptyset(&mask) == 0);
1358 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1359 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1360
d87be9b0
LP
1361 for (;;) {
1362 siginfo_t status;
a383724e 1363 int pipefd[2], pipefd2[2];
52af2106 1364
f2d88580 1365 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1fd96121
ZJS
1366 log_error("pipe2(): %m");
1367 goto finish;
d87be9b0 1368 }
88213476 1369
a383724e
ZJS
1370 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1371 log_error("pipe2(): %m");
1372 close_pipe(pipefd);
1373 goto finish;
1374 }
1375
d87be9b0
LP
1376 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1377 if (pid < 0) {
1378 if (errno == EINVAL)
1379 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1380 else
1381 log_error("clone() failed: %m");
a258bf26 1382
d87be9b0
LP
1383 goto finish;
1384 }
a258bf26 1385
d87be9b0
LP
1386 if (pid == 0) {
1387 /* child */
d87be9b0
LP
1388 const char *home = NULL;
1389 uid_t uid = (uid_t) -1;
1390 gid_t gid = (gid_t) -1;
5674767e 1391 unsigned n_env = 2;
d87be9b0
LP
1392 const char *envp[] = {
1393 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1394 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1395 NULL, /* TERM */
1396 NULL, /* HOME */
1397 NULL, /* USER */
1398 NULL, /* LOGNAME */
1399 NULL, /* container_uuid */
842f3b0f
LP
1400 NULL, /* LISTEN_FDS */
1401 NULL, /* LISTEN_PID */
d87be9b0
LP
1402 NULL
1403 };
a258bf26 1404
5674767e
ZJS
1405 envp[n_env] = strv_find_prefix(environ, "TERM=");
1406 if (envp[n_env])
1407 n_env ++;
a258bf26 1408
a383724e 1409 /* Wait for the parent process to log our PID */
5659774c 1410 close_nointr_nofail(pipefd[1]);
1fd96121 1411 fd_wait_for_event(pipefd[0], POLLHUP, -1);
5659774c 1412 close_nointr_nofail(pipefd[0]);
1fd96121 1413
d87be9b0 1414 close_nointr_nofail(master);
842f3b0f 1415 master = -1;
a258bf26 1416
1fd96121
ZJS
1417 if (saved_attr_valid) {
1418 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1419 log_error("Failed to set terminal attributes: %m");
1420 goto child_fail;
1421 }
1422 }
1423
d87be9b0
LP
1424 close_nointr(STDIN_FILENO);
1425 close_nointr(STDOUT_FILENO);
1426 close_nointr(STDERR_FILENO);
db7feb7e 1427
842f3b0f
LP
1428 close_nointr_nofail(kmsg_socket_pair[0]);
1429 kmsg_socket_pair[0] = -1;
a258bf26 1430
d87be9b0 1431 reset_all_signal_handlers();
88213476 1432
d87be9b0
LP
1433 assert_se(sigemptyset(&mask) == 0);
1434 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
f5c1b9ee 1435
842f3b0f
LP
1436 k = open_terminal(console, O_RDWR);
1437 if (k != STDIN_FILENO) {
1438 if (k >= 0) {
1439 close_nointr_nofail(k);
1440 k = -EINVAL;
1441 }
1442
1443 log_error("Failed to open console: %s", strerror(-k));
1444 goto child_fail;
1445 }
1446
1447 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1448 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1449 log_error("Failed to duplicate console: %m");
d87be9b0 1450 goto child_fail;
842f3b0f 1451 }
bc2f673e 1452
d87be9b0
LP
1453 if (setsid() < 0) {
1454 log_error("setsid() failed: %m");
bc2f673e
LP
1455 goto child_fail;
1456 }
1457
d87be9b0
LP
1458 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1459 log_error("PR_SET_PDEATHSIG failed: %m");
1460 goto child_fail;
1461 }
e58a1277 1462
7027ff61
LP
1463 if (setup_cgroup(newcg) < 0)
1464 goto child_fail;
1465
dc2c7560 1466 close_pipe(pipefd2);
a383724e 1467
d87be9b0
LP
1468 /* Mark everything as slave, so that we still
1469 * receive mounts from the real root, but don't
1470 * propagate mounts to the real root. */
1471 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1472 log_error("MS_SLAVE|MS_REC failed: %m");
1473 goto child_fail;
1474 }
04bc4a3f 1475
d87be9b0
LP
1476 /* Turn directory into bind mount */
1477 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1478 log_error("Failed to make bind mount.");
1479 goto child_fail;
1480 }
88213476 1481
d87be9b0
LP
1482 if (arg_read_only)
1483 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1484 log_error("Failed to make read-only.");
1485 goto child_fail;
1486 }
2547bb41 1487
d87be9b0
LP
1488 if (mount_all(arg_directory) < 0)
1489 goto child_fail;
57fb9fb5 1490
d87be9b0
LP
1491 if (copy_devnodes(arg_directory) < 0)
1492 goto child_fail;
a258bf26 1493
f2d88580
LP
1494 if (setup_ptmx(arg_directory) < 0)
1495 goto child_fail;
1496
d87be9b0 1497 dev_setup(arg_directory);
88213476 1498
d87be9b0
LP
1499 if (setup_dev_console(arg_directory, console) < 0)
1500 goto child_fail;
88213476 1501
d87be9b0
LP
1502 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1503 goto child_fail;
88213476 1504
d87be9b0 1505 close_nointr_nofail(kmsg_socket_pair[1]);
842f3b0f 1506 kmsg_socket_pair[1] = -1;
a258bf26 1507
d87be9b0
LP
1508 if (setup_boot_id(arg_directory) < 0)
1509 goto child_fail;
a41fe3a2 1510
d87be9b0
LP
1511 if (setup_timezone(arg_directory) < 0)
1512 goto child_fail;
88213476 1513
d87be9b0
LP
1514 if (setup_resolv_conf(arg_directory) < 0)
1515 goto child_fail;
687d0825 1516
d87be9b0 1517 if (setup_journal(arg_directory) < 0)
687d0825 1518 goto child_fail;
687d0825 1519
17fe0523
LP
1520 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1521 goto child_fail;
1522
1523 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1524 goto child_fail;
1525
d87be9b0
LP
1526 if (chdir(arg_directory) < 0) {
1527 log_error("chdir(%s) failed: %m", arg_directory);
687d0825
MV
1528 goto child_fail;
1529 }
1530
d87be9b0
LP
1531 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1532 log_error("mount(MS_MOVE) failed: %m");
687d0825
MV
1533 goto child_fail;
1534 }
1535
d87be9b0
LP
1536 if (chroot(".") < 0) {
1537 log_error("chroot() failed: %m");
687d0825
MV
1538 goto child_fail;
1539 }
1540
d87be9b0
LP
1541 if (chdir("/") < 0) {
1542 log_error("chdir() failed: %m");
687d0825
MV
1543 goto child_fail;
1544 }
1545
d87be9b0
LP
1546 umask(0022);
1547
1548 loopback_setup();
1549
1550 if (drop_capabilities() < 0) {
1551 log_error("drop_capabilities() failed: %m");
687d0825
MV
1552 goto child_fail;
1553 }
687d0825 1554
d87be9b0
LP
1555 if (arg_user) {
1556
963ddb91
LP
1557 /* Note that this resolves user names
1558 * inside the container, and hence
1559 * accesses the NSS modules from the
1560 * container and not the host. This is
1561 * a bit weird... */
1562
d87be9b0
LP
1563 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1564 log_error("get_user_creds() failed: %m");
1565 goto child_fail;
1566 }
1567
1568 if (mkdir_parents_label(home, 0775) < 0) {
1569 log_error("mkdir_parents_label() failed: %m");
1570 goto child_fail;
1571 }
1572
1573 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1574 log_error("mkdir_safe_label() failed: %m");
1575 goto child_fail;
1576 }
1577
1578 if (initgroups((const char*)arg_user, gid) < 0) {
1579 log_error("initgroups() failed: %m");
1580 goto child_fail;
1581 }
144f0fc0 1582
d87be9b0
LP
1583 if (setresgid(gid, gid, gid) < 0) {
1584 log_error("setregid() failed: %m");
1585 goto child_fail;
1586 }
1587
1588 if (setresuid(uid, uid, uid) < 0) {
1589 log_error("setreuid() failed: %m");
1590 goto child_fail;
1591 }
3c957acf
LP
1592 } else {
1593 /* Reset everything fully to 0, just in case */
1594
1595 if (setgroups(0, NULL) < 0) {
1596 log_error("setgroups() failed: %m");
1597 goto child_fail;
1598 }
1599
1600 if (setresgid(0, 0, 0) < 0) {
1601 log_error("setregid() failed: %m");
1602 goto child_fail;
1603 }
1604
1605 if (setresuid(0, 0, 0) < 0) {
1606 log_error("setreuid() failed: %m");
1607 goto child_fail;
1608 }
d87be9b0
LP
1609 }
1610
842f3b0f
LP
1611 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1612 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1613 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 1614 log_oom();
144f0fc0
LP
1615 goto child_fail;
1616 }
687d0825 1617
d87be9b0 1618 if (arg_uuid) {
842f3b0f
LP
1619 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1620 log_oom();
1621 goto child_fail;
1622 }
1623 }
1624
1625 if (fdset_size(fds) > 0) {
1626 k = fdset_cloexec(fds, false);
1627 if (k < 0) {
1628 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1629 goto child_fail;
1630 }
1631
1632 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
7027ff61 1633 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
d87be9b0
LP
1634 log_oom();
1635 goto child_fail;
1636 }
1637 }
1638
1639 setup_hostname();
1640
1641 if (arg_boot) {
1642 char **a;
1643 size_t l;
88213476 1644
d87be9b0 1645 /* Automatically search for the init system */
0f0dbc46 1646
d87be9b0
LP
1647 l = 1 + argc - optind;
1648 a = newa(char*, l + 1);
1649 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 1650
d87be9b0
LP
1651 a[0] = (char*) "/usr/lib/systemd/systemd";
1652 execve(a[0], a, (char**) envp);
0f0dbc46 1653
d87be9b0
LP
1654 a[0] = (char*) "/lib/systemd/systemd";
1655 execve(a[0], a, (char**) envp);
0f0dbc46 1656
d87be9b0
LP
1657 a[0] = (char*) "/sbin/init";
1658 execve(a[0], a, (char**) envp);
1659 } else if (argc > optind)
1660 execvpe(argv[optind], argv + optind, (char**) envp);
1661 else {
1662 chdir(home ? home : "/root");
1663 execle("/bin/bash", "-bash", NULL, (char**) envp);
1664 }
1665
1666 log_error("execv() failed: %m");
0f0dbc46 1667
d87be9b0
LP
1668 child_fail:
1669 _exit(EXIT_FAILURE);
da5b3bad 1670 }
88213476 1671
9d60cb63 1672 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
5659774c
ZJS
1673 close_nointr_nofail(pipefd[0]);
1674 close_nointr_nofail(pipefd[1]);
1fd96121 1675
a383724e
ZJS
1676 /* Wait for the child process to establish cgroup hierarchy */
1677 close_nointr_nofail(pipefd2[1]);
1678 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1679 close_nointr_nofail(pipefd2[0]);
1680
1681 save_attributes(newcg, pid, arg_uuid, arg_directory);
1682
842f3b0f
LP
1683 fdset_free(fds);
1684 fds = NULL;
1685
57cb4adf 1686 if (process_pty(master, pid, &mask) < 0)
d87be9b0 1687 goto finish;
88213476 1688
d87be9b0
LP
1689 if (saved_attr_valid)
1690 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
a258bf26 1691
05947bef
LP
1692 k = wait_for_terminate(pid, &status);
1693 if (k < 0) {
d87be9b0
LP
1694 r = EXIT_FAILURE;
1695 break;
1696 }
a258bf26 1697
d87be9b0 1698 if (status.si_code == CLD_EXITED) {
a5f5f8a0 1699 r = status.si_status;
d87be9b0
LP
1700 if (status.si_status != 0) {
1701 log_error("Container failed with error code %i.", status.si_status);
d87be9b0
LP
1702 break;
1703 }
1704
1705 log_debug("Container exited successfully.");
1706 break;
1707 } else if (status.si_code == CLD_KILLED &&
1708 status.si_status == SIGINT) {
1709 log_info("Container has been shut down.");
1710 r = 0;
1711 break;
1712 } else if (status.si_code == CLD_KILLED &&
1713 status.si_status == SIGHUP) {
1714 log_info("Container is being rebooted.");
1715 continue;
1716 } else if (status.si_code == CLD_KILLED ||
1717 status.si_code == CLD_DUMPED) {
88213476 1718
d87be9b0
LP
1719 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1720 r = EXIT_FAILURE;
1721 break;
1722 } else {
1723 log_error("Container failed due to unknown reason.");
1724 r = EXIT_FAILURE;
1725 break;
1726 }
1727 }
88213476
LP
1728
1729finish:
a258bf26
LP
1730 if (saved_attr_valid)
1731 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1732
e58a1277
LP
1733 close_pipe(kmsg_socket_pair);
1734
04d391da
LP
1735 if (newcg)
1736 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
88213476 1737
04d391da 1738 free(arg_directory);
7027ff61 1739 free(arg_machine);
40c32a4a 1740 strv_free(arg_controllers);
88213476 1741
842f3b0f
LP
1742 fdset_free(fds);
1743
88213476
LP
1744 return r;
1745}