]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
bootchart items
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
a258bf26
LP
36#include <sys/epoll.h>
37#include <termios.h>
38#include <sys/signalfd.h>
687d0825 39#include <grp.h>
5ed27dbd 40#include <linux/fs.h>
9537eab0
LP
41#include <sys/un.h>
42#include <sys/socket.h>
88213476 43
81527be1
LP
44#include <systemd/sd-daemon.h>
45
88213476
LP
46#include "log.h"
47#include "util.h"
49e942b2 48#include "mkdir.h"
6b2d0e85 49#include "macro.h"
d7832d2c 50#include "audit.h"
94d82985 51#include "missing.h"
04d391da 52#include "cgroup-util.h"
a258bf26 53#include "strv.h"
9eb977db 54#include "path-util.h"
a41fe3a2 55#include "loopback-setup.h"
57fb9fb5 56#include "sd-id128.h"
4fc9982c 57#include "dev-setup.h"
842f3b0f 58#include "fdset.h"
acbeb427 59#include "build.h"
57fb9fb5
LP
60
/* How the container's journal directory is tied to the host's
 * /var/log/journal/<machine-id>; consumed by setup_journal(). */
typedef enum LinkJournal {
        LINK_NO,    /* leave the journal alone entirely */
        LINK_AUTO,  /* default: bind mount host -> guest if the host directory exists, otherwise skip quietly */
        LINK_HOST,  /* create the directory on the host and bind mount it into the guest */
        LINK_GUEST  /* symlink the host path to the directory inside the guest tree */
} LinkJournal;
88213476
LP
67
68static char *arg_directory = NULL;
687d0825 69static char *arg_user = NULL;
40c32a4a 70static char **arg_controllers = NULL;
144f0fc0 71static char *arg_uuid = NULL;
ff01d048 72static bool arg_private_network = false;
bc2f673e 73static bool arg_read_only = false;
0f0dbc46 74static bool arg_boot = false;
57fb9fb5 75static LinkJournal arg_link_journal = LINK_AUTO;
5076f0cc
LP
76static uint64_t arg_retain =
77 (1ULL << CAP_CHOWN) |
78 (1ULL << CAP_DAC_OVERRIDE) |
79 (1ULL << CAP_DAC_READ_SEARCH) |
80 (1ULL << CAP_FOWNER) |
81 (1ULL << CAP_FSETID) |
82 (1ULL << CAP_IPC_OWNER) |
83 (1ULL << CAP_KILL) |
84 (1ULL << CAP_LEASE) |
85 (1ULL << CAP_LINUX_IMMUTABLE) |
86 (1ULL << CAP_NET_BIND_SERVICE) |
87 (1ULL << CAP_NET_BROADCAST) |
88 (1ULL << CAP_NET_RAW) |
89 (1ULL << CAP_SETGID) |
90 (1ULL << CAP_SETFCAP) |
91 (1ULL << CAP_SETPCAP) |
92 (1ULL << CAP_SETUID) |
93 (1ULL << CAP_SYS_ADMIN) |
94 (1ULL << CAP_SYS_CHROOT) |
95 (1ULL << CAP_SYS_NICE) |
96 (1ULL << CAP_SYS_PTRACE) |
97 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 98 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
99 (1ULL << CAP_SYS_BOOT) |
100 (1ULL << CAP_AUDIT_WRITE) |
101 (1ULL << CAP_AUDIT_CONTROL);
88213476
LP
102
/* Prints the usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Note: -j is documented as "guest" because that is what
         * parse_argv() actually selects for it (LINK_GUEST). */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "     --version            Print version string\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
123
124static int parse_argv(int argc, char *argv[]) {
125
a41fe3a2 126 enum {
acbeb427
ZJS
127 ARG_VERSION = 0x100,
128 ARG_PRIVATE_NETWORK,
bc2f673e 129 ARG_UUID,
5076f0cc 130 ARG_READ_ONLY,
57fb9fb5
LP
131 ARG_CAPABILITY,
132 ARG_LINK_JOURNAL
a41fe3a2
LP
133 };
134
88213476 135 static const struct option options[] = {
ff01d048 136 { "help", no_argument, NULL, 'h' },
acbeb427 137 { "version", no_argument, NULL, ARG_VERSION },
ff01d048
LP
138 { "directory", required_argument, NULL, 'D' },
139 { "user", required_argument, NULL, 'u' },
40c32a4a 140 { "controllers", required_argument, NULL, 'C' },
ff01d048 141 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
0f0dbc46 142 { "boot", no_argument, NULL, 'b' },
144f0fc0 143 { "uuid", required_argument, NULL, ARG_UUID },
bc2f673e 144 { "read-only", no_argument, NULL, ARG_READ_ONLY },
5076f0cc 145 { "capability", required_argument, NULL, ARG_CAPABILITY },
57fb9fb5 146 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
ff01d048 147 { NULL, 0, NULL, 0 }
88213476
LP
148 };
149
150 int c;
151
152 assert(argc >= 0);
153 assert(argv);
154
57fb9fb5 155 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
88213476
LP
156
157 switch (c) {
158
159 case 'h':
160 help();
161 return 0;
162
acbeb427
ZJS
163 case ARG_VERSION:
164 puts(PACKAGE_STRING);
165 puts(SYSTEMD_FEATURES);
166 return 0;
167
88213476
LP
168 case 'D':
169 free(arg_directory);
3a74cea5
LP
170 arg_directory = canonicalize_file_name(optarg);
171 if (!arg_directory) {
172 log_error("Failed to canonicalize root directory.");
88213476
LP
173 return -ENOMEM;
174 }
175
176 break;
177
687d0825
MV
178 case 'u':
179 free(arg_user);
180 if (!(arg_user = strdup(optarg))) {
181 log_error("Failed to duplicate user name.");
182 return -ENOMEM;
183 }
184
185 break;
186
40c32a4a
LGL
187 case 'C':
188 strv_free(arg_controllers);
189 arg_controllers = strv_split(optarg, ",");
190 if (!arg_controllers) {
191 log_error("Failed to split controllers list.");
192 return -ENOMEM;
193 }
194 strv_uniq(arg_controllers);
195
196 break;
197
ff01d048
LP
198 case ARG_PRIVATE_NETWORK:
199 arg_private_network = true;
a41fe3a2
LP
200 break;
201
0f0dbc46
LP
202 case 'b':
203 arg_boot = true;
204 break;
205
144f0fc0
LP
206 case ARG_UUID:
207 arg_uuid = optarg;
208 break;
209
bc2f673e
LP
210 case ARG_READ_ONLY:
211 arg_read_only = true;
212 break;
213
5076f0cc
LP
214 case ARG_CAPABILITY: {
215 char *state, *word;
216 size_t length;
217
218 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
219 cap_value_t cap;
220 char *t;
221
222 t = strndup(word, length);
0d0f0c50
SL
223 if (!t)
224 return log_oom();
5076f0cc
LP
225
226 if (cap_from_name(t, &cap) < 0) {
227 log_error("Failed to parse capability %s.", t);
228 free(t);
229 return -EINVAL;
230 }
231
232 free(t);
233 arg_retain |= 1ULL << (uint64_t) cap;
234 }
235
236 break;
237 }
238
57fb9fb5
LP
239 case 'j':
240 arg_link_journal = LINK_GUEST;
241 break;
242
243 case ARG_LINK_JOURNAL:
244 if (streq(optarg, "auto"))
245 arg_link_journal = LINK_AUTO;
246 else if (streq(optarg, "no"))
247 arg_link_journal = LINK_NO;
248 else if (streq(optarg, "guest"))
249 arg_link_journal = LINK_GUEST;
250 else if (streq(optarg, "host"))
251 arg_link_journal = LINK_HOST;
252 else {
253 log_error("Failed to parse link journal mode %s", optarg);
254 return -EINVAL;
255 }
256
257 break;
258
88213476
LP
259 case '?':
260 return -EINVAL;
261
262 default:
263 log_error("Unknown option code %c", c);
264 return -EINVAL;
265 }
266 }
267
268 return 1;
269}
270
271static int mount_all(const char *dest) {
272
273 typedef struct MountPoint {
274 const char *what;
275 const char *where;
276 const char *type;
277 const char *options;
278 unsigned long flags;
3bd66c05 279 bool fatal;
88213476
LP
280 } MountPoint;
281
282 static const MountPoint mount_table[] = {
4b7a6af4 283 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
b4c59701
LP
284 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
285 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
e65aec12 286 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
635f7d8c 287 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
b4c59701 288 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
ede89845 289 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
635f7d8c 290 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 291#ifdef HAVE_SELINUX
b4c59701
LP
292 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
293 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 294#endif
88213476
LP
295 };
296
297 unsigned k;
298 int r = 0;
299
300 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
89154bd4 301 char _cleanup_free_ *where = NULL;
88213476
LP
302 int t;
303
304 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
0d0f0c50 305 log_oom();
88213476
LP
306
307 if (r == 0)
308 r = -ENOMEM;
309
310 break;
311 }
312
e65aec12 313 t = path_is_mount_point(where, true);
68fb0892 314 if (t < 0) {
88213476 315 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
88213476
LP
316
317 if (r == 0)
318 r = t;
319
320 continue;
321 }
322
9c1c7f71
LP
323 /* Skip this entry if it is not a remount. */
324 if (mount_table[k].what && t > 0)
014a9c77
LP
325 continue;
326
d2e54fae 327 mkdir_p_label(where, 0755);
88213476
LP
328
329 if (mount(mount_table[k].what,
330 where,
331 mount_table[k].type,
332 mount_table[k].flags,
3bd66c05
LP
333 mount_table[k].options) < 0 &&
334 mount_table[k].fatal) {
88213476
LP
335
336 log_error("mount(%s) failed: %m", where);
337
338 if (r == 0)
339 r = -errno;
340 }
88213476
LP
341 }
342
e58a1277
LP
343 return r;
344}
f8440af5 345
e58a1277 346static int setup_timezone(const char *dest) {
d4036145
LP
347 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
348 char *z, *y;
349 int r;
f8440af5 350
e58a1277
LP
351 assert(dest);
352
353 /* Fix the timezone, if possible */
d4036145
LP
354 r = readlink_malloc("/etc/localtime", &p);
355 if (r < 0) {
356 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
357 return 0;
358 }
359
360 z = path_startswith(p, "../usr/share/zoneinfo/");
361 if (!z)
362 z = path_startswith(p, "/usr/share/zoneinfo/");
363 if (!z) {
364 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
365 return 0;
366 }
367
04bc4a3f
LP
368 where = strappend(dest, "/etc/localtime");
369 if (!where)
0d0f0c50 370 return log_oom();
715ac17a 371
d4036145
LP
372 r = readlink_malloc(where, &q);
373 if (r >= 0) {
374 y = path_startswith(q, "../usr/share/zoneinfo/");
375 if (!y)
376 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 377
4d1c38b8 378
d4036145
LP
379 /* Already pointing to the right place? Then do nothing .. */
380 if (y && streq(y, z))
381 return 0;
382 }
383
384 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
385 if (!check)
0d0f0c50 386 return log_oom();
4d1c38b8 387
d4036145
LP
388 if (access(check, F_OK) < 0) {
389 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
390 return 0;
391 }
68fb0892 392
d4036145
LP
393 what = strappend("../usr/share/zoneinfo/", z);
394 if (!what)
395 return log_oom();
396
397 unlink(where);
398 if (symlink(what, where) < 0) {
399 log_error("Failed to correct timezone of container: %m");
400 return 0;
401 }
e58a1277
LP
402
403 return 0;
88213476
LP
404}
405
2547bb41
LP
406static int setup_resolv_conf(const char *dest) {
407 char *where;
408
409 assert(dest);
410
411 if (arg_private_network)
412 return 0;
413
414 /* Fix resolv.conf, if possible */
04bc4a3f
LP
415 where = strappend(dest, "/etc/resolv.conf");
416 if (!where)
0d0f0c50 417 return log_oom();
2547bb41 418
77e63faf
LP
419 /* We don't really care for the results of this really. If it
420 * fails, it fails, but meh... */
2547bb41
LP
421 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
422 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
423
424 free(where);
425
426 return 0;
427}
428
04bc4a3f 429static int setup_boot_id(const char *dest) {
ed8b7a3e 430 char _cleanup_free_ *from = NULL, *to = NULL;
04bc4a3f
LP
431 sd_id128_t rnd;
432 char as_uuid[37];
433 int r;
434
435 assert(dest);
436
437 /* Generate a new randomized boot ID, so that each boot-up of
438 * the container gets a new one */
439
440 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 441 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
442 if (!from || !to)
443 return log_oom();
04bc4a3f
LP
444
445 r = sd_id128_randomize(&rnd);
446 if (r < 0) {
447 log_error("Failed to generate random boot id: %s", strerror(-r));
ed8b7a3e 448 return r;
04bc4a3f
LP
449 }
450
451 snprintf(as_uuid, sizeof(as_uuid),
452 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
453 SD_ID128_FORMAT_VAL(rnd));
454 char_array_0(as_uuid);
455
456 r = write_one_line_file(from, as_uuid);
457 if (r < 0) {
458 log_error("Failed to write boot id: %s", strerror(-r));
ed8b7a3e 459 return r;
04bc4a3f
LP
460 }
461
462 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
463 log_error("Failed to bind mount boot id: %m");
464 r = -errno;
465 } else
466 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
467
468 unlink(from);
04bc4a3f
LP
469 return r;
470}
471
e58a1277 472static int copy_devnodes(const char *dest) {
88213476
LP
473
474 static const char devnodes[] =
475 "null\0"
476 "zero\0"
477 "full\0"
478 "random\0"
479 "urandom\0"
480 "tty\0"
3eabccc4 481 "ptmx\0";
88213476
LP
482
483 const char *d;
e58a1277 484 int r = 0;
25ea79fe 485 mode_t _cleanup_umask_ u;
a258bf26
LP
486
487 assert(dest);
124640f1
LP
488
489 u = umask(0000);
88213476
LP
490
491 NULSTR_FOREACH(d, devnodes) {
e58a1277 492 struct stat st;
ed8b7a3e 493 char _cleanup_free_ *from = NULL, *to = NULL;
88213476
LP
494
495 asprintf(&from, "/dev/%s", d);
496 asprintf(&to, "%s/dev/%s", dest, d);
497
498 if (!from || !to) {
ed8b7a3e 499 log_oom();
a258bf26 500
88213476
LP
501 if (r == 0)
502 r = -ENOMEM;
503
504 break;
505 }
506
507 if (stat(from, &st) < 0) {
508
509 if (errno != ENOENT) {
510 log_error("Failed to stat %s: %m", from);
88213476
LP
511 if (r == 0)
512 r = -errno;
513 }
514
a258bf26 515 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 516
ed8b7a3e 517 log_error("%s is not a char or block device, cannot copy", from);
a258bf26
LP
518 if (r == 0)
519 r = -EIO;
520
521 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
522
523 log_error("mknod(%s) failed: %m", dest);
524 if (r == 0)
525 r = -errno;
88213476 526 }
88213476
LP
527 }
528
e58a1277
LP
529 return r;
530}
88213476 531
e58a1277
LP
532static int setup_dev_console(const char *dest, const char *console) {
533 struct stat st;
ed8b7a3e 534 char _cleanup_free_ *to = NULL;
e58a1277 535 int r;
25ea79fe 536 mode_t _cleanup_umask_ u;
e58a1277
LP
537
538 assert(dest);
539 assert(console);
540
541 u = umask(0000);
542
543 if (stat(console, &st) < 0) {
544 log_error("Failed to stat %s: %m", console);
25ea79fe 545 return -errno;
88213476 546
a258bf26 547 } else if (!S_ISCHR(st.st_mode)) {
25ea79fe
ZJS
548 log_error("/dev/console is not a char device");
549 return -EIO;
e58a1277 550 }
88213476 551
e58a1277
LP
552 r = chmod_and_chown(console, 0600, 0, 0);
553 if (r < 0) {
554 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
25ea79fe 555 return r;
a258bf26 556 }
88213476 557
25ea79fe
ZJS
558 if (asprintf(&to, "%s/dev/console", dest) < 0)
559 return log_oom();
88213476 560
a258bf26
LP
561 /* We need to bind mount the right tty to /dev/console since
562 * ptys can only exist on pts file systems. To have something
563 * to bind mount things on we create a device node first, that
564 * has the right major/minor (note that the major minor
565 * doesn't actually matter here, since we mount it over
566 * anyway). */
567
e58a1277
LP
568 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
569 log_error("mknod() for /dev/console failed: %m");
25ea79fe 570 return -errno;
e58a1277 571 }
a258bf26
LP
572
573 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
e58a1277 574 log_error("Bind mount for /dev/console failed: %m");
25ea79fe 575 return -errno;
a258bf26
LP
576 }
577
25ea79fe 578 return 0;
e58a1277
LP
579}
580
581static int setup_kmsg(const char *dest, int kmsg_socket) {
ed8b7a3e 582 char _cleanup_free_ *from = NULL, *to = NULL;
e58a1277 583 int r, fd, k;
25ea79fe 584 mode_t _cleanup_umask_ u;
e58a1277
LP
585 union {
586 struct cmsghdr cmsghdr;
587 uint8_t buf[CMSG_SPACE(sizeof(int))];
588 } control;
589 struct msghdr mh;
590 struct cmsghdr *cmsg;
591
592 assert(dest);
593 assert(kmsg_socket >= 0);
a258bf26 594
e58a1277 595 u = umask(0000);
a258bf26 596
f1e5dfe2
LP
597 /* We create the kmsg FIFO as /dev/kmsg, but immediately
598 * delete it after bind mounting it to /proc/kmsg. While FIFOs
599 * on the reading side behave very similar to /proc/kmsg,
600 * their writing side behaves differently from /dev/kmsg in
601 * that writing blocks when nothing is reading. In order to
602 * avoid any problems with containers deadlocking due to this
603 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
604 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
605 asprintf(&to, "%s/proc/kmsg", dest) < 0)
606 return log_oom();
e58a1277
LP
607
608 if (mkfifo(from, 0600) < 0) {
609 log_error("mkfifo() for /dev/kmsg failed: %m");
25ea79fe 610 return -errno;
e58a1277
LP
611 }
612
613 r = chmod_and_chown(from, 0600, 0, 0);
614 if (r < 0) {
615 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
25ea79fe 616 return r;
e58a1277
LP
617 }
618
619 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
620 log_error("Bind mount for /proc/kmsg failed: %m");
25ea79fe 621 return -errno;
e58a1277
LP
622 }
623
624 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
625 if (fd < 0) {
626 log_error("Failed to open fifo: %m");
25ea79fe 627 return -errno;
e58a1277
LP
628 }
629
630 zero(mh);
631 zero(control);
632
633 mh.msg_control = &control;
634 mh.msg_controllen = sizeof(control);
635
636 cmsg = CMSG_FIRSTHDR(&mh);
637 cmsg->cmsg_level = SOL_SOCKET;
638 cmsg->cmsg_type = SCM_RIGHTS;
639 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
640 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
641
642 mh.msg_controllen = cmsg->cmsg_len;
643
644 /* Store away the fd in the socket, so that it stays open as
645 * long as we run the child */
646 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
647 close_nointr_nofail(fd);
648
649 if (k < 0) {
650 log_error("Failed to send FIFO fd: %m");
25ea79fe 651 return -errno;
a258bf26
LP
652 }
653
f1e5dfe2
LP
654 /* And now make the FIFO unavailable as /dev/kmsg... */
655 unlink(from);
25ea79fe 656 return 0;
88213476
LP
657}
658
3a74cea5
LP
659static int setup_hostname(void) {
660 char *hn;
661 int r = 0;
662
9eb977db 663 hn = path_get_file_name(arg_directory);
3a74cea5
LP
664 if (hn) {
665 hn = strdup(hn);
666 if (!hn)
667 return -ENOMEM;
668
669 hostname_cleanup(hn);
670
671 if (!isempty(hn))
672 if (sethostname(hn, strlen(hn)) < 0)
673 r = -errno;
674
675 free(hn);
676 }
677
678 return r;
679}
680
57fb9fb5
LP
681static int setup_journal(const char *directory) {
682 sd_id128_t machine_id;
27407a01
ZJS
683 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
684 char *id;
57fb9fb5
LP
685 int r;
686
687 if (arg_link_journal == LINK_NO)
688 return 0;
689
690 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
691 if (!p)
692 return log_oom();
57fb9fb5
LP
693
694 r = read_one_line_file(p, &b);
27407a01
ZJS
695 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
696 return 0;
697 else if (r < 0) {
698 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
57fb9fb5
LP
699 return r;
700 }
701
27407a01
ZJS
702 id = strstrip(b);
703 if (isempty(id) && arg_link_journal == LINK_AUTO)
704 return 0;
57fb9fb5 705
27407a01
ZJS
706 /* Verify validity */
707 r = sd_id128_from_string(id, &machine_id);
57fb9fb5 708 if (r < 0) {
27407a01
ZJS
709 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
710 return r;
57fb9fb5
LP
711 }
712
713 free(p);
27407a01
ZJS
714 p = strappend("/var/log/journal/", id);
715 q = strjoin(directory, "/var/log/journal/", id, NULL);
716 if (!p || !q)
717 return log_oom();
718
719 if (path_is_mount_point(p, false) > 0) {
720 if (arg_link_journal != LINK_AUTO) {
721 log_error("%s: already a mount point, refusing to use for journal", p);
722 return -EEXIST;
723 }
724
725 return 0;
57fb9fb5
LP
726 }
727
27407a01 728 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 729 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
730 log_error("%s: already a mount point, refusing to use for journal", q);
731 return -EEXIST;
57fb9fb5
LP
732 }
733
27407a01 734 return 0;
57fb9fb5
LP
735 }
736
737 r = readlink_and_make_absolute(p, &d);
738 if (r >= 0) {
739 if ((arg_link_journal == LINK_GUEST ||
740 arg_link_journal == LINK_AUTO) &&
741 path_equal(d, q)) {
742
27407a01
ZJS
743 r = mkdir_p(q, 0755);
744 if (r < 0)
745 log_warning("failed to create directory %s: %m", q);
746 return 0;
57fb9fb5
LP
747 }
748
749 if (unlink(p) < 0) {
750 log_error("Failed to remove symlink %s: %m", p);
27407a01 751 return -errno;
57fb9fb5
LP
752 }
753 } else if (r == -EINVAL) {
754
755 if (arg_link_journal == LINK_GUEST &&
756 rmdir(p) < 0) {
757
27407a01
ZJS
758 if (errno == ENOTDIR) {
759 log_error("%s already exists and is neither a symlink nor a directory", p);
760 return r;
761 } else {
57fb9fb5 762 log_error("Failed to remove %s: %m", p);
27407a01 763 return -errno;
57fb9fb5 764 }
57fb9fb5
LP
765 }
766 } else if (r != -ENOENT) {
767 log_error("readlink(%s) failed: %m", p);
27407a01 768 return r;
57fb9fb5
LP
769 }
770
771 if (arg_link_journal == LINK_GUEST) {
772
773 if (symlink(q, p) < 0) {
774 log_error("Failed to symlink %s to %s: %m", q, p);
27407a01 775 return -errno;
57fb9fb5
LP
776 }
777
27407a01
ZJS
778 r = mkdir_p(q, 0755);
779 if (r < 0)
780 log_warning("failed to create directory %s: %m", q);
781 return 0;
57fb9fb5
LP
782 }
783
784 if (arg_link_journal == LINK_HOST) {
785 r = mkdir_p(p, 0755);
786 if (r < 0) {
787 log_error("Failed to create %s: %m", p);
27407a01 788 return r;
57fb9fb5
LP
789 }
790
27407a01
ZJS
791 } else if (access(p, F_OK) < 0)
792 return 0;
57fb9fb5
LP
793
794 if (dir_is_empty(q) == 0) {
795 log_error("%s not empty.", q);
27407a01 796 return -ENOTEMPTY;
57fb9fb5
LP
797 }
798
799 r = mkdir_p(q, 0755);
800 if (r < 0) {
801 log_error("Failed to create %s: %m", q);
27407a01 802 return r;
57fb9fb5
LP
803 }
804
805 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
806 log_error("Failed to bind mount journal from host into guest: %m");
27407a01 807 return -errno;
57fb9fb5
LP
808 }
809
27407a01 810 return 0;
57fb9fb5
LP
811}
812
88213476 813static int drop_capabilities(void) {
5076f0cc 814 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
815}
816
/* Checks whether path looks like the root of an OS installation.
 *
 * Returns 1 if it does, 0 if it does not, -ENOMEM if the path to
 * probe could not be allocated. */
static int is_os_tree(const char *path) {
        char *probe = NULL;
        int found;

        /* We use /bin/sh as flag file if something is an OS */
        if (asprintf(&probe, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(probe, F_OK) >= 0;
        free(probe);

        return found ? 1 : 0;
}
830
57cb4adf 831static int process_pty(int master, pid_t pid, sigset_t *mask) {
0c749d50 832
b72491a2 833 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
a258bf26
LP
834 size_t in_buffer_full = 0, out_buffer_full = 0;
835 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
836 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
a258bf26 837 int ep = -1, signal_fd = -1, r;
57cb4adf
LP
838 bool tried_orderly_shutdown = false;
839
840 assert(master >= 0);
841 assert(pid > 0);
842 assert(mask);
a258bf26
LP
843
844 fd_nonblock(STDIN_FILENO, 1);
845 fd_nonblock(STDOUT_FILENO, 1);
846 fd_nonblock(master, 1);
847
db7feb7e
LP
848 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
849 if (signal_fd < 0) {
a258bf26
LP
850 log_error("signalfd(): %m");
851 r = -errno;
852 goto finish;
853 }
854
db7feb7e
LP
855 ep = epoll_create1(EPOLL_CLOEXEC);
856 if (ep < 0) {
a258bf26
LP
857 log_error("Failed to create epoll: %m");
858 r = -errno;
859 goto finish;
860 }
861
51d88d1b
LP
862 /* We read from STDIN only if this is actually a TTY,
863 * otherwise we assume non-interactivity. */
864 if (isatty(STDIN_FILENO)) {
865 zero(stdin_ev);
866 stdin_ev.events = EPOLLIN|EPOLLET;
867 stdin_ev.data.fd = STDIN_FILENO;
868
869 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
870 log_error("Failed to register STDIN in epoll: %m");
871 r = -errno;
872 goto finish;
873 }
874 }
a258bf26
LP
875
876 zero(stdout_ev);
877 stdout_ev.events = EPOLLOUT|EPOLLET;
878 stdout_ev.data.fd = STDOUT_FILENO;
879
880 zero(master_ev);
881 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
882 master_ev.data.fd = master;
883
884 zero(signal_ev);
885 signal_ev.events = EPOLLIN;
886 signal_ev.data.fd = signal_fd;
887
f2956e80
MS
888 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
889 if (errno != EPERM) {
890 log_error("Failed to register stdout in epoll: %m");
891 r = -errno;
892 goto finish;
893 }
894 /* stdout without epoll support. Likely redirected to regular file. */
895 stdout_writable = true;
896 }
897
898 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
a258bf26 899 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
51d88d1b 900 log_error("Failed to register fds in epoll: %m");
a258bf26
LP
901 r = -errno;
902 goto finish;
903 }
904
fd14078a 905 for (;;) {
a258bf26
LP
906 struct epoll_event ev[16];
907 ssize_t k;
908 int i, nfds;
909
db7feb7e
LP
910 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
911 if (nfds < 0) {
a258bf26
LP
912
913 if (errno == EINTR || errno == EAGAIN)
914 continue;
915
916 log_error("epoll_wait(): %m");
917 r = -errno;
918 goto finish;
919 }
920
921 assert(nfds >= 1);
922
923 for (i = 0; i < nfds; i++) {
924 if (ev[i].data.fd == STDIN_FILENO) {
925
fd14078a 926 if (ev[i].events & (EPOLLIN|EPOLLHUP))
a258bf26
LP
927 stdin_readable = true;
928
929 } else if (ev[i].data.fd == STDOUT_FILENO) {
930
fd14078a 931 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
a258bf26
LP
932 stdout_writable = true;
933
934 } else if (ev[i].data.fd == master) {
935
fd14078a 936 if (ev[i].events & (EPOLLIN|EPOLLHUP))
a258bf26
LP
937 master_readable = true;
938
fd14078a 939 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
a258bf26
LP
940 master_writable = true;
941
942 } else if (ev[i].data.fd == signal_fd) {
943 struct signalfd_siginfo sfsi;
944 ssize_t n;
945
db7feb7e
LP
946 n = read(signal_fd, &sfsi, sizeof(sfsi));
947 if (n != sizeof(sfsi)) {
a258bf26
LP
948
949 if (n >= 0) {
0c749d50 950 log_error("Failed to read from signalfd: invalid block size");
a258bf26
LP
951 r = -EIO;
952 goto finish;
953 }
954
955 if (errno != EINTR && errno != EAGAIN) {
0c749d50 956 log_error("Failed to read from signalfd: %m");
a258bf26
LP
957 r = -errno;
958 goto finish;
959 }
960 } else {
961
962 if (sfsi.ssi_signo == SIGWINCH) {
963 struct winsize ws;
964
965 /* The window size changed, let's forward that. */
a258bf26
LP
966 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
967 ioctl(master, TIOCSWINSZ, &ws);
57cb4adf
LP
968 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
969
970 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
971
972 /* This only works for systemd... */
973 tried_orderly_shutdown = true;
974 kill(pid, SIGRTMIN+3);
975
a258bf26 976 } else {
0c749d50 977 r = 0;
a258bf26
LP
978 goto finish;
979 }
980 }
981 }
982 }
983
984 while ((stdin_readable && in_buffer_full <= 0) ||
985 (master_writable && in_buffer_full > 0) ||
986 (master_readable && out_buffer_full <= 0) ||
987 (stdout_writable && out_buffer_full > 0)) {
988
b72491a2 989 if (stdin_readable && in_buffer_full < LINE_MAX) {
a258bf26 990
db7feb7e
LP
991 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
992 if (k < 0) {
a258bf26 993
fd14078a 994 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 995 stdin_readable = false;
a258bf26
LP
996 else {
997 log_error("read(): %m");
0c749d50 998 r = -errno;
a258bf26
LP
999 goto finish;
1000 }
1001 } else
1002 in_buffer_full += (size_t) k;
a258bf26
LP
1003 }
1004
1005 if (master_writable && in_buffer_full > 0) {
1006
db7feb7e
LP
1007 k = write(master, in_buffer, in_buffer_full);
1008 if (k < 0) {
a258bf26 1009
fd14078a 1010 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 1011 master_writable = false;
fd14078a 1012 else {
a258bf26 1013 log_error("write(): %m");
0c749d50 1014 r = -errno;
a258bf26
LP
1015 goto finish;
1016 }
1017
1018 } else {
1019 assert(in_buffer_full >= (size_t) k);
1020 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1021 in_buffer_full -= k;
1022 }
1023 }
1024
b72491a2 1025 if (master_readable && out_buffer_full < LINE_MAX) {
a258bf26 1026
db7feb7e
LP
1027 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1028 if (k < 0) {
a258bf26 1029
fd14078a 1030 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 1031 master_readable = false;
a258bf26
LP
1032 else {
1033 log_error("read(): %m");
0c749d50 1034 r = -errno;
a258bf26
LP
1035 goto finish;
1036 }
1037 } else
1038 out_buffer_full += (size_t) k;
a258bf26
LP
1039 }
1040
1041 if (stdout_writable && out_buffer_full > 0) {
1042
db7feb7e
LP
1043 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1044 if (k < 0) {
a258bf26 1045
fd14078a 1046 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 1047 stdout_writable = false;
fd14078a 1048 else {
a258bf26 1049 log_error("write(): %m");
0c749d50 1050 r = -errno;
a258bf26
LP
1051 goto finish;
1052 }
1053
1054 } else {
1055 assert(out_buffer_full >= (size_t) k);
1056 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1057 out_buffer_full -= k;
1058 }
1059 }
1060 }
fd14078a 1061 }
a258bf26
LP
1062
1063finish:
1064 if (ep >= 0)
1065 close_nointr_nofail(ep);
1066
1067 if (signal_fd >= 0)
1068 close_nointr_nofail(signal_fd);
1069
1070 return r;
1071}
88213476
LP
1072
1073int main(int argc, char *argv[]) {
1074 pid_t pid = 0;
04d391da
LP
1075 int r = EXIT_FAILURE, k;
1076 char *oldcg = NULL, *newcg = NULL;
40c32a4a 1077 char **controller = NULL;
842f3b0f 1078 int master = -1, n_fd_passed;
a258bf26
LP
1079 const char *console = NULL;
1080 struct termios saved_attr, raw_attr;
1081 sigset_t mask;
1082 bool saved_attr_valid = false;
1083 struct winsize ws;
e58a1277 1084 int kmsg_socket_pair[2] = { -1, -1 };
842f3b0f 1085 FDSet *fds = NULL;
88213476
LP
1086
1087 log_parse_environment();
1088 log_open();
1089
db7feb7e
LP
1090 r = parse_argv(argc, argv);
1091 if (r <= 0)
88213476
LP
1092 goto finish;
1093
1094 if (arg_directory) {
1095 char *p;
1096
1097 p = path_make_absolute_cwd(arg_directory);
1098 free(arg_directory);
1099 arg_directory = p;
1100 } else
1101 arg_directory = get_current_dir_name();
1102
1103 if (!arg_directory) {
1104 log_error("Failed to determine path");
1105 goto finish;
1106 }
1107
1108 path_kill_slashes(arg_directory);
1109
1110 if (geteuid() != 0) {
1111 log_error("Need to be root.");
1112 goto finish;
1113 }
1114
04d391da
LP
1115 if (sd_booted() <= 0) {
1116 log_error("Not running on a systemd system.");
1117 goto finish;
1118 }
1119
88213476 1120 if (path_equal(arg_directory, "/")) {
6df6b939 1121 log_error("Spawning container on root directory not supported.");
88213476
LP
1122 goto finish;
1123 }
1124
1125 if (is_os_tree(arg_directory) <= 0) {
1126 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1127 goto finish;
1128 }
1129
842f3b0f
LP
1130 log_close();
1131 n_fd_passed = sd_listen_fds(false);
1132 if (n_fd_passed > 0) {
1133 k = fdset_new_listen_fds(&fds, false);
1134 if (k < 0) {
1135 log_error("Failed to collect file descriptors: %s", strerror(-k));
1136 goto finish;
1137 }
1138 }
1139 fdset_close_others(fds);
1140 log_open();
1141
db7feb7e
LP
1142 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1143 if (k < 0) {
04d391da
LP
1144 log_error("Failed to determine current cgroup: %s", strerror(-k));
1145 goto finish;
1146 }
1147
1148 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1149 log_error("Failed to allocate cgroup path.");
1150 goto finish;
1151 }
1152
40c32a4a
LGL
1153 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1154 if (k < 0) {
04d391da
LP
1155 log_error("Failed to create cgroup: %s", strerror(-k));
1156 goto finish;
1157 }
1158
db7feb7e 1159 STRV_FOREACH(controller, arg_controllers) {
40c32a4a
LGL
1160 k = cg_create_and_attach(*controller, newcg, 0);
1161 if (k < 0)
1162 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1163 }
1164
db7feb7e
LP
1165 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1166 if (master < 0) {
a258bf26
LP
1167 log_error("Failed to acquire pseudo tty: %m");
1168 goto finish;
1169 }
1170
db7feb7e
LP
1171 console = ptsname(master);
1172 if (!console) {
a258bf26
LP
1173 log_error("Failed to determine tty name: %m");
1174 goto finish;
1175 }
1176
1177 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1178
1179 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1180 ioctl(master, TIOCSWINSZ, &ws);
1181
1182 if (unlockpt(master) < 0) {
1183 log_error("Failed to unlock tty: %m");
1184 goto finish;
1185 }
1186
51d88d1b
LP
1187 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1188 saved_attr_valid = true;
a258bf26 1189
51d88d1b
LP
1190 raw_attr = saved_attr;
1191 cfmakeraw(&raw_attr);
1192 raw_attr.c_lflag &= ~ECHO;
1193 }
a258bf26 1194
e58a1277
LP
1195 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1196 log_error("Failed to create kmsg socket pair");
1197 goto finish;
1198 }
1199
a258bf26
LP
1200 assert_se(sigemptyset(&mask) == 0);
1201 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1202 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1203
d87be9b0
LP
1204 for (;;) {
1205 siginfo_t status;
52af2106 1206
51d88d1b
LP
1207 if (saved_attr_valid) {
1208 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1209 log_error("Failed to set terminal attributes: %m");
1210 goto finish;
1211 }
d87be9b0 1212 }
88213476 1213
d87be9b0
LP
1214 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1215 if (pid < 0) {
1216 if (errno == EINVAL)
1217 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1218 else
1219 log_error("clone() failed: %m");
a258bf26 1220
d87be9b0
LP
1221 goto finish;
1222 }
a258bf26 1223
d87be9b0
LP
1224 if (pid == 0) {
1225 /* child */
a258bf26 1226
d87be9b0
LP
1227 const char *home = NULL;
1228 uid_t uid = (uid_t) -1;
1229 gid_t gid = (gid_t) -1;
842f3b0f 1230 unsigned n_env = 0;
d87be9b0
LP
1231 const char *envp[] = {
1232 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1233 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1234 NULL, /* TERM */
1235 NULL, /* HOME */
1236 NULL, /* USER */
1237 NULL, /* LOGNAME */
1238 NULL, /* container_uuid */
842f3b0f
LP
1239 NULL, /* LISTEN_FDS */
1240 NULL, /* LISTEN_PID */
d87be9b0
LP
1241 NULL
1242 };
a258bf26 1243
d87be9b0 1244 envp[2] = strv_find_prefix(environ, "TERM=");
842f3b0f 1245 n_env = 3;
a258bf26 1246
d87be9b0 1247 close_nointr_nofail(master);
842f3b0f 1248 master = -1;
a258bf26 1249
d87be9b0
LP
1250 close_nointr(STDIN_FILENO);
1251 close_nointr(STDOUT_FILENO);
1252 close_nointr(STDERR_FILENO);
db7feb7e 1253
842f3b0f
LP
1254 close_nointr_nofail(kmsg_socket_pair[0]);
1255 kmsg_socket_pair[0] = -1;
a258bf26 1256
d87be9b0 1257 reset_all_signal_handlers();
88213476 1258
d87be9b0
LP
1259 assert_se(sigemptyset(&mask) == 0);
1260 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
f5c1b9ee 1261
842f3b0f
LP
1262 k = open_terminal(console, O_RDWR);
1263 if (k != STDIN_FILENO) {
1264 if (k >= 0) {
1265 close_nointr_nofail(k);
1266 k = -EINVAL;
1267 }
1268
1269 log_error("Failed to open console: %s", strerror(-k));
1270 goto child_fail;
1271 }
1272
1273 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1274 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1275 log_error("Failed to duplicate console: %m");
d87be9b0 1276 goto child_fail;
842f3b0f 1277 }
bc2f673e 1278
d87be9b0
LP
1279 if (setsid() < 0) {
1280 log_error("setsid() failed: %m");
bc2f673e
LP
1281 goto child_fail;
1282 }
1283
d87be9b0
LP
1284 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1285 log_error("PR_SET_PDEATHSIG failed: %m");
1286 goto child_fail;
1287 }
e58a1277 1288
d87be9b0
LP
1289 /* Mark everything as slave, so that we still
1290 * receive mounts from the real root, but don't
1291 * propagate mounts to the real root. */
1292 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1293 log_error("MS_SLAVE|MS_REC failed: %m");
1294 goto child_fail;
1295 }
04bc4a3f 1296
d87be9b0
LP
1297 /* Turn directory into bind mount */
1298 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1299 log_error("Failed to make bind mount.");
1300 goto child_fail;
1301 }
88213476 1302
d87be9b0
LP
1303 if (arg_read_only)
1304 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1305 log_error("Failed to make read-only.");
1306 goto child_fail;
1307 }
2547bb41 1308
d87be9b0
LP
1309 if (mount_all(arg_directory) < 0)
1310 goto child_fail;
57fb9fb5 1311
d87be9b0
LP
1312 if (copy_devnodes(arg_directory) < 0)
1313 goto child_fail;
a258bf26 1314
d87be9b0 1315 dev_setup(arg_directory);
88213476 1316
d87be9b0
LP
1317 if (setup_dev_console(arg_directory, console) < 0)
1318 goto child_fail;
88213476 1319
d87be9b0
LP
1320 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1321 goto child_fail;
88213476 1322
d87be9b0 1323 close_nointr_nofail(kmsg_socket_pair[1]);
842f3b0f 1324 kmsg_socket_pair[1] = -1;
a258bf26 1325
d87be9b0
LP
1326 if (setup_boot_id(arg_directory) < 0)
1327 goto child_fail;
a41fe3a2 1328
d87be9b0
LP
1329 if (setup_timezone(arg_directory) < 0)
1330 goto child_fail;
88213476 1331
d87be9b0
LP
1332 if (setup_resolv_conf(arg_directory) < 0)
1333 goto child_fail;
687d0825 1334
d87be9b0 1335 if (setup_journal(arg_directory) < 0)
687d0825 1336 goto child_fail;
687d0825 1337
d87be9b0
LP
1338 if (chdir(arg_directory) < 0) {
1339 log_error("chdir(%s) failed: %m", arg_directory);
687d0825
MV
1340 goto child_fail;
1341 }
1342
d87be9b0
LP
1343 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1344 log_error("mount(MS_MOVE) failed: %m");
687d0825
MV
1345 goto child_fail;
1346 }
1347
d87be9b0
LP
1348 if (chroot(".") < 0) {
1349 log_error("chroot() failed: %m");
687d0825
MV
1350 goto child_fail;
1351 }
1352
d87be9b0
LP
1353 if (chdir("/") < 0) {
1354 log_error("chdir() failed: %m");
687d0825
MV
1355 goto child_fail;
1356 }
1357
d87be9b0
LP
1358 umask(0022);
1359
1360 loopback_setup();
1361
1362 if (drop_capabilities() < 0) {
1363 log_error("drop_capabilities() failed: %m");
687d0825
MV
1364 goto child_fail;
1365 }
687d0825 1366
d87be9b0
LP
1367 if (arg_user) {
1368
963ddb91
LP
1369 /* Note that this resolves user names
1370 * inside the container, and hence
1371 * accesses the NSS modules from the
1372 * container and not the host. This is
1373 * a bit weird... */
1374
d87be9b0
LP
1375 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1376 log_error("get_user_creds() failed: %m");
1377 goto child_fail;
1378 }
1379
1380 if (mkdir_parents_label(home, 0775) < 0) {
1381 log_error("mkdir_parents_label() failed: %m");
1382 goto child_fail;
1383 }
1384
1385 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1386 log_error("mkdir_safe_label() failed: %m");
1387 goto child_fail;
1388 }
1389
1390 if (initgroups((const char*)arg_user, gid) < 0) {
1391 log_error("initgroups() failed: %m");
1392 goto child_fail;
1393 }
144f0fc0 1394
d87be9b0
LP
1395 if (setresgid(gid, gid, gid) < 0) {
1396 log_error("setregid() failed: %m");
1397 goto child_fail;
1398 }
1399
1400 if (setresuid(uid, uid, uid) < 0) {
1401 log_error("setreuid() failed: %m");
1402 goto child_fail;
1403 }
3c957acf
LP
1404 } else {
1405 /* Reset everything fully to 0, just in case */
1406
1407 if (setgroups(0, NULL) < 0) {
1408 log_error("setgroups() failed: %m");
1409 goto child_fail;
1410 }
1411
1412 if (setresgid(0, 0, 0) < 0) {
1413 log_error("setregid() failed: %m");
1414 goto child_fail;
1415 }
1416
1417 if (setresuid(0, 0, 0) < 0) {
1418 log_error("setreuid() failed: %m");
1419 goto child_fail;
1420 }
d87be9b0
LP
1421 }
1422
842f3b0f
LP
1423 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1424 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1425 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 1426 log_oom();
144f0fc0
LP
1427 goto child_fail;
1428 }
687d0825 1429
d87be9b0 1430 if (arg_uuid) {
842f3b0f
LP
1431 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1432 log_oom();
1433 goto child_fail;
1434 }
1435 }
1436
1437 if (fdset_size(fds) > 0) {
1438 k = fdset_cloexec(fds, false);
1439 if (k < 0) {
1440 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1441 goto child_fail;
1442 }
1443
1444 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1445 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
d87be9b0
LP
1446 log_oom();
1447 goto child_fail;
1448 }
1449 }
1450
1451 setup_hostname();
1452
1453 if (arg_boot) {
1454 char **a;
1455 size_t l;
88213476 1456
d87be9b0 1457 /* Automatically search for the init system */
0f0dbc46 1458
d87be9b0
LP
1459 l = 1 + argc - optind;
1460 a = newa(char*, l + 1);
1461 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 1462
d87be9b0
LP
1463 a[0] = (char*) "/usr/lib/systemd/systemd";
1464 execve(a[0], a, (char**) envp);
0f0dbc46 1465
d87be9b0
LP
1466 a[0] = (char*) "/lib/systemd/systemd";
1467 execve(a[0], a, (char**) envp);
0f0dbc46 1468
d87be9b0
LP
1469 a[0] = (char*) "/sbin/init";
1470 execve(a[0], a, (char**) envp);
1471 } else if (argc > optind)
1472 execvpe(argv[optind], argv + optind, (char**) envp);
1473 else {
1474 chdir(home ? home : "/root");
1475 execle("/bin/bash", "-bash", NULL, (char**) envp);
1476 }
1477
1478 log_error("execv() failed: %m");
0f0dbc46 1479
d87be9b0
LP
1480 child_fail:
1481 _exit(EXIT_FAILURE);
da5b3bad 1482 }
88213476 1483
842f3b0f
LP
1484 fdset_free(fds);
1485 fds = NULL;
1486
57cb4adf 1487 if (process_pty(master, pid, &mask) < 0)
d87be9b0 1488 goto finish;
88213476 1489
d87be9b0
LP
1490 if (saved_attr_valid)
1491 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
a258bf26 1492
d87be9b0
LP
1493 r = wait_for_terminate(pid, &status);
1494 if (r < 0) {
1495 r = EXIT_FAILURE;
1496 break;
1497 }
a258bf26 1498
d87be9b0
LP
1499 if (status.si_code == CLD_EXITED) {
1500 if (status.si_status != 0) {
1501 log_error("Container failed with error code %i.", status.si_status);
1502 r = status.si_status;
1503 break;
1504 }
1505
1506 log_debug("Container exited successfully.");
1507 break;
1508 } else if (status.si_code == CLD_KILLED &&
1509 status.si_status == SIGINT) {
1510 log_info("Container has been shut down.");
1511 r = 0;
1512 break;
1513 } else if (status.si_code == CLD_KILLED &&
1514 status.si_status == SIGHUP) {
1515 log_info("Container is being rebooted.");
1516 continue;
1517 } else if (status.si_code == CLD_KILLED ||
1518 status.si_code == CLD_DUMPED) {
88213476 1519
d87be9b0
LP
1520 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1521 r = EXIT_FAILURE;
1522 break;
1523 } else {
1524 log_error("Container failed due to unknown reason.");
1525 r = EXIT_FAILURE;
1526 break;
1527 }
1528 }
88213476
LP
1529
1530finish:
a258bf26
LP
1531 if (saved_attr_valid)
1532 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1533
1534 if (master >= 0)
1535 close_nointr_nofail(master);
1536
e58a1277
LP
1537 close_pipe(kmsg_socket_pair);
1538
04d391da
LP
1539 if (oldcg)
1540 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1541
1542 if (newcg)
1543 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
88213476 1544
04d391da 1545 free(arg_directory);
40c32a4a 1546 strv_free(arg_controllers);
04d391da
LP
1547 free(oldcg);
1548 free(newcg);
88213476 1549
842f3b0f
LP
1550 fdset_free(fds);
1551
88213476
LP
1552 return r;
1553}