]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: don't make assumptions about the size of pid_t
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
1fd96121 36#include <sys/poll.h>
a258bf26
LP
37#include <sys/epoll.h>
38#include <termios.h>
39#include <sys/signalfd.h>
687d0825 40#include <grp.h>
5ed27dbd 41#include <linux/fs.h>
9537eab0
LP
42#include <sys/un.h>
43#include <sys/socket.h>
88213476 44
81527be1
LP
45#include <systemd/sd-daemon.h>
46
88213476
LP
47#include "log.h"
48#include "util.h"
49e942b2 49#include "mkdir.h"
6b2d0e85 50#include "macro.h"
d7832d2c 51#include "audit.h"
94d82985 52#include "missing.h"
04d391da 53#include "cgroup-util.h"
a258bf26 54#include "strv.h"
9eb977db 55#include "path-util.h"
a41fe3a2 56#include "loopback-setup.h"
57fb9fb5 57#include "sd-id128.h"
4fc9982c 58#include "dev-setup.h"
842f3b0f 59#include "fdset.h"
acbeb427 60#include "build.h"
a5c32cff 61#include "fileio.h"
57fb9fb5 62
f2d88580
LP
/* Numeric group id spliced (via STRINGIFY) into the "gid=" option of the
 * devpts mount in mount_all(). Only defined here if the build did not
 * already provide a value. */
#ifndef TTY_GID
#define TTY_GID 5
#endif
66
57fb9fb5
LP
/* Journal linking mode selected with --link-journal= (or -j); consumed by
 * setup_journal(). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no: setup_journal() returns immediately */
        LINK_AUTO,      /* --link-journal=auto: the default value of arg_link_journal */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, also what -j selects */
} LinkJournal;
88213476
LP
73
/* Command line state, filled in by parse_argv(). */

/* -D: container root directory; result of canonicalize_file_name(), heap-allocated. */
static char *arg_directory = NULL;
/* -u: user name or uid to run as; heap-allocated copy of the option argument. */
static char *arg_user = NULL;
/* -C: cgroup hierarchies, split on "," and de-duplicated. */
static char **arg_controllers = NULL;
/* --uuid=: points directly at the option argument (not a copy, never freed here). */
static char *arg_uuid = NULL;
/* --private-network */
static bool arg_private_network = false;
/* --read-only */
static bool arg_read_only = false;
/* -b / --boot */
static bool arg_boot = false;
/* --link-journal= / -j */
static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities to keep (bit N == capability N). --capability=
 * ORs further bits in; drop_capabilities() drops the complement of this
 * mask from the bounding set. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL);
/* --bind= and --bind-ro=: each option appends two consecutive entries,
 * the host path followed by the path inside the container. */
static char **arg_bind = NULL;
static char **arg_bind_ro = NULL;
88213476
LP
110
/* Prints the command line synopsis to stdout. Always returns 0.
 *
 * Note that -j is documented as "--link-journal=guest" because that is
 * what parse_argv() actually sets for it (LINK_GUEST). */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                Show this help\n"
               "     --version             Print version string\n"
               "  -D --directory=NAME      Root directory for the container\n"
               "  -b --boot                Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER           Run the command under specified user or uid\n"
               "  -C --controllers=LIST    Put the container in specified comma-separated\n"
               "                           cgroup hierarchies\n"
               "     --uuid=UUID           Set a specific machine UUID for the container\n"
               "     --private-network     Disable network in container\n"
               "     --read-only           Mount the root directory read-only\n"
               "     --capability=CAP      In addition to the default, retain specified\n"
               "                           capability\n"
               "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
               "  -j                       Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
               "                           the container\n"
               "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
               program_invocation_short_name);

        return 0;
}
136
137static int parse_argv(int argc, char *argv[]) {
138
a41fe3a2 139 enum {
acbeb427
ZJS
140 ARG_VERSION = 0x100,
141 ARG_PRIVATE_NETWORK,
bc2f673e 142 ARG_UUID,
5076f0cc 143 ARG_READ_ONLY,
57fb9fb5 144 ARG_CAPABILITY,
17fe0523
LP
145 ARG_LINK_JOURNAL,
146 ARG_BIND,
147 ARG_BIND_RO
a41fe3a2
LP
148 };
149
88213476 150 static const struct option options[] = {
ff01d048 151 { "help", no_argument, NULL, 'h' },
acbeb427 152 { "version", no_argument, NULL, ARG_VERSION },
ff01d048
LP
153 { "directory", required_argument, NULL, 'D' },
154 { "user", required_argument, NULL, 'u' },
40c32a4a 155 { "controllers", required_argument, NULL, 'C' },
ff01d048 156 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
0f0dbc46 157 { "boot", no_argument, NULL, 'b' },
144f0fc0 158 { "uuid", required_argument, NULL, ARG_UUID },
bc2f673e 159 { "read-only", no_argument, NULL, ARG_READ_ONLY },
5076f0cc 160 { "capability", required_argument, NULL, ARG_CAPABILITY },
57fb9fb5 161 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
17fe0523
LP
162 { "bind", required_argument, NULL, ARG_BIND },
163 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
ff01d048 164 { NULL, 0, NULL, 0 }
88213476
LP
165 };
166
167 int c;
168
169 assert(argc >= 0);
170 assert(argv);
171
57fb9fb5 172 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
88213476
LP
173
174 switch (c) {
175
176 case 'h':
177 help();
178 return 0;
179
acbeb427
ZJS
180 case ARG_VERSION:
181 puts(PACKAGE_STRING);
182 puts(SYSTEMD_FEATURES);
183 return 0;
184
88213476
LP
185 case 'D':
186 free(arg_directory);
3a74cea5
LP
187 arg_directory = canonicalize_file_name(optarg);
188 if (!arg_directory) {
189 log_error("Failed to canonicalize root directory.");
88213476
LP
190 return -ENOMEM;
191 }
192
193 break;
194
687d0825
MV
195 case 'u':
196 free(arg_user);
197 if (!(arg_user = strdup(optarg))) {
198 log_error("Failed to duplicate user name.");
199 return -ENOMEM;
200 }
201
202 break;
203
40c32a4a
LGL
204 case 'C':
205 strv_free(arg_controllers);
206 arg_controllers = strv_split(optarg, ",");
207 if (!arg_controllers) {
208 log_error("Failed to split controllers list.");
209 return -ENOMEM;
210 }
211 strv_uniq(arg_controllers);
212
213 break;
214
ff01d048
LP
215 case ARG_PRIVATE_NETWORK:
216 arg_private_network = true;
a41fe3a2
LP
217 break;
218
0f0dbc46
LP
219 case 'b':
220 arg_boot = true;
221 break;
222
144f0fc0
LP
223 case ARG_UUID:
224 arg_uuid = optarg;
225 break;
226
bc2f673e
LP
227 case ARG_READ_ONLY:
228 arg_read_only = true;
229 break;
230
5076f0cc
LP
231 case ARG_CAPABILITY: {
232 char *state, *word;
233 size_t length;
234
235 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
236 cap_value_t cap;
237 char *t;
238
239 t = strndup(word, length);
0d0f0c50
SL
240 if (!t)
241 return log_oom();
5076f0cc
LP
242
243 if (cap_from_name(t, &cap) < 0) {
244 log_error("Failed to parse capability %s.", t);
245 free(t);
246 return -EINVAL;
247 }
248
249 free(t);
250 arg_retain |= 1ULL << (uint64_t) cap;
251 }
252
253 break;
254 }
255
57fb9fb5
LP
256 case 'j':
257 arg_link_journal = LINK_GUEST;
258 break;
259
260 case ARG_LINK_JOURNAL:
261 if (streq(optarg, "auto"))
262 arg_link_journal = LINK_AUTO;
263 else if (streq(optarg, "no"))
264 arg_link_journal = LINK_NO;
265 else if (streq(optarg, "guest"))
266 arg_link_journal = LINK_GUEST;
267 else if (streq(optarg, "host"))
268 arg_link_journal = LINK_HOST;
269 else {
270 log_error("Failed to parse link journal mode %s", optarg);
271 return -EINVAL;
272 }
273
274 break;
275
17fe0523
LP
276 case ARG_BIND:
277 case ARG_BIND_RO: {
278 _cleanup_free_ char *a = NULL, *b = NULL;
279 char *e;
280 char ***x;
281 int r;
282
283 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
284
285 e = strchr(optarg, ':');
286 if (e) {
287 a = strndup(optarg, e - optarg);
288 b = strdup(e + 1);
289 } else {
290 a = strdup(optarg);
291 b = strdup(optarg);
292 }
293
294 if (!a || !b)
295 return log_oom();
296
297 if (!path_is_absolute(a) || !path_is_absolute(b)) {
298 log_error("Invalid bind mount specification: %s", optarg);
299 return -EINVAL;
300 }
301
302 r = strv_extend(x, a);
303 if (r < 0)
304 return r;
305
306 r = strv_extend(x, b);
307 if (r < 0)
308 return r;
309
310 break;
311 }
312
88213476
LP
313 case '?':
314 return -EINVAL;
315
316 default:
317 log_error("Unknown option code %c", c);
318 return -EINVAL;
319 }
320 }
321
322 return 1;
323}
324
325static int mount_all(const char *dest) {
326
327 typedef struct MountPoint {
328 const char *what;
329 const char *where;
330 const char *type;
331 const char *options;
332 unsigned long flags;
3bd66c05 333 bool fatal;
88213476
LP
334 } MountPoint;
335
336 static const MountPoint mount_table[] = {
4b7a6af4 337 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
b4c59701
LP
338 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
339 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
e65aec12 340 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
635f7d8c 341 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 342 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
ede89845 343 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
635f7d8c 344 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 345#ifdef HAVE_SELINUX
b4c59701
LP
346 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
347 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 348#endif
88213476
LP
349 };
350
351 unsigned k;
352 int r = 0;
353
354 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
89154bd4 355 char _cleanup_free_ *where = NULL;
88213476
LP
356 int t;
357
17fe0523
LP
358 where = strjoin(dest, "/", mount_table[k].where, NULL);
359 if (!where)
360 return log_oom();
88213476 361
e65aec12 362 t = path_is_mount_point(where, true);
68fb0892 363 if (t < 0) {
88213476 364 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
88213476
LP
365
366 if (r == 0)
367 r = t;
368
369 continue;
370 }
371
9c1c7f71
LP
372 /* Skip this entry if it is not a remount. */
373 if (mount_table[k].what && t > 0)
014a9c77
LP
374 continue;
375
17fe0523 376 mkdir_p(where, 0755);
88213476
LP
377
378 if (mount(mount_table[k].what,
379 where,
380 mount_table[k].type,
381 mount_table[k].flags,
3bd66c05
LP
382 mount_table[k].options) < 0 &&
383 mount_table[k].fatal) {
88213476
LP
384
385 log_error("mount(%s) failed: %m", where);
386
387 if (r == 0)
388 r = -errno;
389 }
88213476
LP
390 }
391
e58a1277
LP
392 return r;
393}
f8440af5 394
17fe0523
LP
395static int mount_binds(const char *dest, char **l, unsigned long flags) {
396 char **x, **y;
397
398 STRV_FOREACH_PAIR(x, y, l) {
399 _cleanup_free_ char *where = NULL;
400
401 where = strjoin(dest, "/", *y, NULL);
402 if (!where)
403 return log_oom();
404
405 mkdir_p_label(where, 0755);
406
407 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
408 log_error("mount(%s) failed: %m", where);
409 return -errno;
410 }
411
412 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
413 log_error("mount(%s) failed: %m", where);
414 return -errno;
415 }
416 }
417
418 return 0;
419}
420
e58a1277 421static int setup_timezone(const char *dest) {
d4036145
LP
422 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
423 char *z, *y;
424 int r;
f8440af5 425
e58a1277
LP
426 assert(dest);
427
428 /* Fix the timezone, if possible */
d4036145
LP
429 r = readlink_malloc("/etc/localtime", &p);
430 if (r < 0) {
431 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
432 return 0;
433 }
434
435 z = path_startswith(p, "../usr/share/zoneinfo/");
436 if (!z)
437 z = path_startswith(p, "/usr/share/zoneinfo/");
438 if (!z) {
439 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
440 return 0;
441 }
442
04bc4a3f
LP
443 where = strappend(dest, "/etc/localtime");
444 if (!where)
0d0f0c50 445 return log_oom();
715ac17a 446
d4036145
LP
447 r = readlink_malloc(where, &q);
448 if (r >= 0) {
449 y = path_startswith(q, "../usr/share/zoneinfo/");
450 if (!y)
451 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 452
4d1c38b8 453
d4036145
LP
454 /* Already pointing to the right place? Then do nothing .. */
455 if (y && streq(y, z))
456 return 0;
457 }
458
459 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
460 if (!check)
0d0f0c50 461 return log_oom();
4d1c38b8 462
d4036145
LP
463 if (access(check, F_OK) < 0) {
464 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
465 return 0;
466 }
68fb0892 467
d4036145
LP
468 what = strappend("../usr/share/zoneinfo/", z);
469 if (!what)
470 return log_oom();
471
472 unlink(where);
473 if (symlink(what, where) < 0) {
474 log_error("Failed to correct timezone of container: %m");
475 return 0;
476 }
e58a1277
LP
477
478 return 0;
88213476
LP
479}
480
2547bb41
LP
481static int setup_resolv_conf(const char *dest) {
482 char *where;
483
484 assert(dest);
485
486 if (arg_private_network)
487 return 0;
488
489 /* Fix resolv.conf, if possible */
04bc4a3f
LP
490 where = strappend(dest, "/etc/resolv.conf");
491 if (!where)
0d0f0c50 492 return log_oom();
2547bb41 493
77e63faf
LP
494 /* We don't really care for the results of this really. If it
495 * fails, it fails, but meh... */
2547bb41
LP
496 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
497 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
498
499 free(where);
500
501 return 0;
502}
503
04bc4a3f 504static int setup_boot_id(const char *dest) {
ed8b7a3e 505 char _cleanup_free_ *from = NULL, *to = NULL;
04bc4a3f
LP
506 sd_id128_t rnd;
507 char as_uuid[37];
508 int r;
509
510 assert(dest);
511
512 /* Generate a new randomized boot ID, so that each boot-up of
513 * the container gets a new one */
514
515 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 516 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
517 if (!from || !to)
518 return log_oom();
04bc4a3f
LP
519
520 r = sd_id128_randomize(&rnd);
521 if (r < 0) {
522 log_error("Failed to generate random boot id: %s", strerror(-r));
ed8b7a3e 523 return r;
04bc4a3f
LP
524 }
525
526 snprintf(as_uuid, sizeof(as_uuid),
527 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
528 SD_ID128_FORMAT_VAL(rnd));
529 char_array_0(as_uuid);
530
531 r = write_one_line_file(from, as_uuid);
532 if (r < 0) {
533 log_error("Failed to write boot id: %s", strerror(-r));
ed8b7a3e 534 return r;
04bc4a3f
LP
535 }
536
537 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
538 log_error("Failed to bind mount boot id: %m");
539 r = -errno;
540 } else
541 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
542
543 unlink(from);
04bc4a3f
LP
544 return r;
545}
546
e58a1277 547static int copy_devnodes(const char *dest) {
88213476
LP
548
549 static const char devnodes[] =
550 "null\0"
551 "zero\0"
552 "full\0"
553 "random\0"
554 "urandom\0"
f2d88580 555 "tty\0";
88213476
LP
556
557 const char *d;
e58a1277 558 int r = 0;
25ea79fe 559 mode_t _cleanup_umask_ u;
a258bf26
LP
560
561 assert(dest);
124640f1
LP
562
563 u = umask(0000);
88213476
LP
564
565 NULSTR_FOREACH(d, devnodes) {
e58a1277 566 struct stat st;
ed8b7a3e 567 char _cleanup_free_ *from = NULL, *to = NULL;
88213476
LP
568
569 asprintf(&from, "/dev/%s", d);
570 asprintf(&to, "%s/dev/%s", dest, d);
571
572 if (!from || !to) {
ed8b7a3e 573 log_oom();
a258bf26 574
88213476
LP
575 if (r == 0)
576 r = -ENOMEM;
577
578 break;
579 }
580
581 if (stat(from, &st) < 0) {
582
583 if (errno != ENOENT) {
584 log_error("Failed to stat %s: %m", from);
88213476
LP
585 if (r == 0)
586 r = -errno;
587 }
588
a258bf26 589 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 590
ed8b7a3e 591 log_error("%s is not a char or block device, cannot copy", from);
a258bf26
LP
592 if (r == 0)
593 r = -EIO;
594
595 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
596
597 log_error("mknod(%s) failed: %m", dest);
598 if (r == 0)
599 r = -errno;
88213476 600 }
88213476
LP
601 }
602
e58a1277
LP
603 return r;
604}
88213476 605
f2d88580
LP
606static int setup_ptmx(const char *dest) {
607 _cleanup_free_ char *p = NULL;
608
609 p = strappend(dest, "/dev/ptmx");
610 if (!p)
611 return log_oom();
612
613 if (symlink("pts/ptmx", p) < 0) {
614 log_error("Failed to create /dev/ptmx symlink: %m");
615 return -errno;
616 }
617
618 return 0;
619}
620
e58a1277
LP
621static int setup_dev_console(const char *dest, const char *console) {
622 struct stat st;
ed8b7a3e 623 char _cleanup_free_ *to = NULL;
e58a1277 624 int r;
25ea79fe 625 mode_t _cleanup_umask_ u;
e58a1277
LP
626
627 assert(dest);
628 assert(console);
629
630 u = umask(0000);
631
632 if (stat(console, &st) < 0) {
633 log_error("Failed to stat %s: %m", console);
25ea79fe 634 return -errno;
88213476 635
a258bf26 636 } else if (!S_ISCHR(st.st_mode)) {
25ea79fe
ZJS
637 log_error("/dev/console is not a char device");
638 return -EIO;
e58a1277 639 }
88213476 640
e58a1277
LP
641 r = chmod_and_chown(console, 0600, 0, 0);
642 if (r < 0) {
643 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
25ea79fe 644 return r;
a258bf26 645 }
88213476 646
25ea79fe
ZJS
647 if (asprintf(&to, "%s/dev/console", dest) < 0)
648 return log_oom();
88213476 649
a258bf26
LP
650 /* We need to bind mount the right tty to /dev/console since
651 * ptys can only exist on pts file systems. To have something
652 * to bind mount things on we create a device node first, that
653 * has the right major/minor (note that the major minor
654 * doesn't actually matter here, since we mount it over
655 * anyway). */
656
e58a1277
LP
657 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
658 log_error("mknod() for /dev/console failed: %m");
25ea79fe 659 return -errno;
e58a1277 660 }
a258bf26
LP
661
662 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
e58a1277 663 log_error("Bind mount for /dev/console failed: %m");
25ea79fe 664 return -errno;
a258bf26
LP
665 }
666
25ea79fe 667 return 0;
e58a1277
LP
668}
669
670static int setup_kmsg(const char *dest, int kmsg_socket) {
ed8b7a3e 671 char _cleanup_free_ *from = NULL, *to = NULL;
e58a1277 672 int r, fd, k;
25ea79fe 673 mode_t _cleanup_umask_ u;
e58a1277
LP
674 union {
675 struct cmsghdr cmsghdr;
676 uint8_t buf[CMSG_SPACE(sizeof(int))];
677 } control;
678 struct msghdr mh;
679 struct cmsghdr *cmsg;
680
681 assert(dest);
682 assert(kmsg_socket >= 0);
a258bf26 683
e58a1277 684 u = umask(0000);
a258bf26 685
f1e5dfe2
LP
686 /* We create the kmsg FIFO as /dev/kmsg, but immediately
687 * delete it after bind mounting it to /proc/kmsg. While FIFOs
688 * on the reading side behave very similar to /proc/kmsg,
689 * their writing side behaves differently from /dev/kmsg in
690 * that writing blocks when nothing is reading. In order to
691 * avoid any problems with containers deadlocking due to this
692 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
693 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
694 asprintf(&to, "%s/proc/kmsg", dest) < 0)
695 return log_oom();
e58a1277
LP
696
697 if (mkfifo(from, 0600) < 0) {
698 log_error("mkfifo() for /dev/kmsg failed: %m");
25ea79fe 699 return -errno;
e58a1277
LP
700 }
701
702 r = chmod_and_chown(from, 0600, 0, 0);
703 if (r < 0) {
704 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
25ea79fe 705 return r;
e58a1277
LP
706 }
707
708 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
709 log_error("Bind mount for /proc/kmsg failed: %m");
25ea79fe 710 return -errno;
e58a1277
LP
711 }
712
713 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
714 if (fd < 0) {
715 log_error("Failed to open fifo: %m");
25ea79fe 716 return -errno;
e58a1277
LP
717 }
718
719 zero(mh);
720 zero(control);
721
722 mh.msg_control = &control;
723 mh.msg_controllen = sizeof(control);
724
725 cmsg = CMSG_FIRSTHDR(&mh);
726 cmsg->cmsg_level = SOL_SOCKET;
727 cmsg->cmsg_type = SCM_RIGHTS;
728 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
729 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
730
731 mh.msg_controllen = cmsg->cmsg_len;
732
733 /* Store away the fd in the socket, so that it stays open as
734 * long as we run the child */
735 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
736 close_nointr_nofail(fd);
737
738 if (k < 0) {
739 log_error("Failed to send FIFO fd: %m");
25ea79fe 740 return -errno;
a258bf26
LP
741 }
742
f1e5dfe2
LP
743 /* And now make the FIFO unavailable as /dev/kmsg... */
744 unlink(from);
25ea79fe 745 return 0;
88213476
LP
746}
747
3a74cea5
LP
748static int setup_hostname(void) {
749 char *hn;
750 int r = 0;
751
9eb977db 752 hn = path_get_file_name(arg_directory);
3a74cea5
LP
753 if (hn) {
754 hn = strdup(hn);
755 if (!hn)
756 return -ENOMEM;
757
758 hostname_cleanup(hn);
759
760 if (!isempty(hn))
761 if (sethostname(hn, strlen(hn)) < 0)
762 r = -errno;
763
764 free(hn);
765 }
766
767 return r;
768}
769
57fb9fb5
LP
770static int setup_journal(const char *directory) {
771 sd_id128_t machine_id;
27407a01
ZJS
772 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
773 char *id;
57fb9fb5
LP
774 int r;
775
776 if (arg_link_journal == LINK_NO)
777 return 0;
778
779 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
780 if (!p)
781 return log_oom();
57fb9fb5
LP
782
783 r = read_one_line_file(p, &b);
27407a01
ZJS
784 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
785 return 0;
786 else if (r < 0) {
787 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
57fb9fb5
LP
788 return r;
789 }
790
27407a01
ZJS
791 id = strstrip(b);
792 if (isempty(id) && arg_link_journal == LINK_AUTO)
793 return 0;
57fb9fb5 794
27407a01
ZJS
795 /* Verify validity */
796 r = sd_id128_from_string(id, &machine_id);
57fb9fb5 797 if (r < 0) {
27407a01
ZJS
798 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
799 return r;
57fb9fb5
LP
800 }
801
802 free(p);
27407a01
ZJS
803 p = strappend("/var/log/journal/", id);
804 q = strjoin(directory, "/var/log/journal/", id, NULL);
805 if (!p || !q)
806 return log_oom();
807
808 if (path_is_mount_point(p, false) > 0) {
809 if (arg_link_journal != LINK_AUTO) {
810 log_error("%s: already a mount point, refusing to use for journal", p);
811 return -EEXIST;
812 }
813
814 return 0;
57fb9fb5
LP
815 }
816
27407a01 817 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 818 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
819 log_error("%s: already a mount point, refusing to use for journal", q);
820 return -EEXIST;
57fb9fb5
LP
821 }
822
27407a01 823 return 0;
57fb9fb5
LP
824 }
825
826 r = readlink_and_make_absolute(p, &d);
827 if (r >= 0) {
828 if ((arg_link_journal == LINK_GUEST ||
829 arg_link_journal == LINK_AUTO) &&
830 path_equal(d, q)) {
831
27407a01
ZJS
832 r = mkdir_p(q, 0755);
833 if (r < 0)
834 log_warning("failed to create directory %s: %m", q);
835 return 0;
57fb9fb5
LP
836 }
837
838 if (unlink(p) < 0) {
839 log_error("Failed to remove symlink %s: %m", p);
27407a01 840 return -errno;
57fb9fb5
LP
841 }
842 } else if (r == -EINVAL) {
843
844 if (arg_link_journal == LINK_GUEST &&
845 rmdir(p) < 0) {
846
27407a01
ZJS
847 if (errno == ENOTDIR) {
848 log_error("%s already exists and is neither a symlink nor a directory", p);
849 return r;
850 } else {
57fb9fb5 851 log_error("Failed to remove %s: %m", p);
27407a01 852 return -errno;
57fb9fb5 853 }
57fb9fb5
LP
854 }
855 } else if (r != -ENOENT) {
856 log_error("readlink(%s) failed: %m", p);
27407a01 857 return r;
57fb9fb5
LP
858 }
859
860 if (arg_link_journal == LINK_GUEST) {
861
862 if (symlink(q, p) < 0) {
863 log_error("Failed to symlink %s to %s: %m", q, p);
27407a01 864 return -errno;
57fb9fb5
LP
865 }
866
27407a01
ZJS
867 r = mkdir_p(q, 0755);
868 if (r < 0)
869 log_warning("failed to create directory %s: %m", q);
870 return 0;
57fb9fb5
LP
871 }
872
873 if (arg_link_journal == LINK_HOST) {
874 r = mkdir_p(p, 0755);
875 if (r < 0) {
876 log_error("Failed to create %s: %m", p);
27407a01 877 return r;
57fb9fb5
LP
878 }
879
27407a01
ZJS
880 } else if (access(p, F_OK) < 0)
881 return 0;
57fb9fb5
LP
882
883 if (dir_is_empty(q) == 0) {
884 log_error("%s not empty.", q);
27407a01 885 return -ENOTEMPTY;
57fb9fb5
LP
886 }
887
888 r = mkdir_p(q, 0755);
889 if (r < 0) {
890 log_error("Failed to create %s: %m", q);
27407a01 891 return r;
57fb9fb5
LP
892 }
893
894 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
895 log_error("Failed to bind mount journal from host into guest: %m");
27407a01 896 return -errno;
57fb9fb5
LP
897 }
898
27407a01 899 return 0;
57fb9fb5
LP
900}
901
88213476 902static int drop_capabilities(void) {
5076f0cc 903 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
904}
905
/* Checks whether "path" looks like the root of an OS tree. The presence
 * of <path>/bin/sh is used as the indicator.
 *
 * Returns 1 if it exists, 0 if it cannot be accessed, -ENOMEM if the
 * path could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found ? 1 : 0;
}
919
57cb4adf 920static int process_pty(int master, pid_t pid, sigset_t *mask) {
0c749d50 921
b72491a2 922 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
a258bf26
LP
923 size_t in_buffer_full = 0, out_buffer_full = 0;
924 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
925 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
a258bf26 926 int ep = -1, signal_fd = -1, r;
57cb4adf
LP
927 bool tried_orderly_shutdown = false;
928
929 assert(master >= 0);
930 assert(pid > 0);
931 assert(mask);
a258bf26
LP
932
933 fd_nonblock(STDIN_FILENO, 1);
934 fd_nonblock(STDOUT_FILENO, 1);
935 fd_nonblock(master, 1);
936
db7feb7e
LP
937 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
938 if (signal_fd < 0) {
a258bf26
LP
939 log_error("signalfd(): %m");
940 r = -errno;
941 goto finish;
942 }
943
db7feb7e
LP
944 ep = epoll_create1(EPOLL_CLOEXEC);
945 if (ep < 0) {
a258bf26
LP
946 log_error("Failed to create epoll: %m");
947 r = -errno;
948 goto finish;
949 }
950
51d88d1b
LP
951 /* We read from STDIN only if this is actually a TTY,
952 * otherwise we assume non-interactivity. */
953 if (isatty(STDIN_FILENO)) {
954 zero(stdin_ev);
955 stdin_ev.events = EPOLLIN|EPOLLET;
956 stdin_ev.data.fd = STDIN_FILENO;
957
958 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
959 log_error("Failed to register STDIN in epoll: %m");
960 r = -errno;
961 goto finish;
962 }
963 }
a258bf26
LP
964
965 zero(stdout_ev);
966 stdout_ev.events = EPOLLOUT|EPOLLET;
967 stdout_ev.data.fd = STDOUT_FILENO;
968
969 zero(master_ev);
970 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
971 master_ev.data.fd = master;
972
973 zero(signal_ev);
974 signal_ev.events = EPOLLIN;
975 signal_ev.data.fd = signal_fd;
976
f2956e80
MS
977 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
978 if (errno != EPERM) {
979 log_error("Failed to register stdout in epoll: %m");
980 r = -errno;
981 goto finish;
982 }
983 /* stdout without epoll support. Likely redirected to regular file. */
984 stdout_writable = true;
985 }
986
987 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
a258bf26 988 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
51d88d1b 989 log_error("Failed to register fds in epoll: %m");
a258bf26
LP
990 r = -errno;
991 goto finish;
992 }
993
fd14078a 994 for (;;) {
a258bf26
LP
995 struct epoll_event ev[16];
996 ssize_t k;
997 int i, nfds;
998
db7feb7e
LP
999 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1000 if (nfds < 0) {
a258bf26
LP
1001
1002 if (errno == EINTR || errno == EAGAIN)
1003 continue;
1004
1005 log_error("epoll_wait(): %m");
1006 r = -errno;
1007 goto finish;
1008 }
1009
1010 assert(nfds >= 1);
1011
1012 for (i = 0; i < nfds; i++) {
1013 if (ev[i].data.fd == STDIN_FILENO) {
1014
fd14078a 1015 if (ev[i].events & (EPOLLIN|EPOLLHUP))
a258bf26
LP
1016 stdin_readable = true;
1017
1018 } else if (ev[i].data.fd == STDOUT_FILENO) {
1019
fd14078a 1020 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
a258bf26
LP
1021 stdout_writable = true;
1022
1023 } else if (ev[i].data.fd == master) {
1024
fd14078a 1025 if (ev[i].events & (EPOLLIN|EPOLLHUP))
a258bf26
LP
1026 master_readable = true;
1027
fd14078a 1028 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
a258bf26
LP
1029 master_writable = true;
1030
1031 } else if (ev[i].data.fd == signal_fd) {
1032 struct signalfd_siginfo sfsi;
1033 ssize_t n;
1034
db7feb7e
LP
1035 n = read(signal_fd, &sfsi, sizeof(sfsi));
1036 if (n != sizeof(sfsi)) {
a258bf26
LP
1037
1038 if (n >= 0) {
0c749d50 1039 log_error("Failed to read from signalfd: invalid block size");
a258bf26
LP
1040 r = -EIO;
1041 goto finish;
1042 }
1043
1044 if (errno != EINTR && errno != EAGAIN) {
0c749d50 1045 log_error("Failed to read from signalfd: %m");
a258bf26
LP
1046 r = -errno;
1047 goto finish;
1048 }
1049 } else {
1050
1051 if (sfsi.ssi_signo == SIGWINCH) {
1052 struct winsize ws;
1053
1054 /* The window size changed, let's forward that. */
a258bf26
LP
1055 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1056 ioctl(master, TIOCSWINSZ, &ws);
57cb4adf
LP
1057 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1058
1059 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1060
1061 /* This only works for systemd... */
1062 tried_orderly_shutdown = true;
1063 kill(pid, SIGRTMIN+3);
1064
a258bf26 1065 } else {
0c749d50 1066 r = 0;
a258bf26
LP
1067 goto finish;
1068 }
1069 }
1070 }
1071 }
1072
1073 while ((stdin_readable && in_buffer_full <= 0) ||
1074 (master_writable && in_buffer_full > 0) ||
1075 (master_readable && out_buffer_full <= 0) ||
1076 (stdout_writable && out_buffer_full > 0)) {
1077
b72491a2 1078 if (stdin_readable && in_buffer_full < LINE_MAX) {
a258bf26 1079
db7feb7e
LP
1080 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1081 if (k < 0) {
a258bf26 1082
fd14078a 1083 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 1084 stdin_readable = false;
a258bf26
LP
1085 else {
1086 log_error("read(): %m");
0c749d50 1087 r = -errno;
a258bf26
LP
1088 goto finish;
1089 }
1090 } else
1091 in_buffer_full += (size_t) k;
a258bf26
LP
1092 }
1093
1094 if (master_writable && in_buffer_full > 0) {
1095
db7feb7e
LP
1096 k = write(master, in_buffer, in_buffer_full);
1097 if (k < 0) {
a258bf26 1098
fd14078a 1099 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 1100 master_writable = false;
fd14078a 1101 else {
a258bf26 1102 log_error("write(): %m");
0c749d50 1103 r = -errno;
a258bf26
LP
1104 goto finish;
1105 }
1106
1107 } else {
1108 assert(in_buffer_full >= (size_t) k);
1109 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1110 in_buffer_full -= k;
1111 }
1112 }
1113
b72491a2 1114 if (master_readable && out_buffer_full < LINE_MAX) {
a258bf26 1115
db7feb7e
LP
1116 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1117 if (k < 0) {
a258bf26 1118
fd14078a 1119 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 1120 master_readable = false;
a258bf26
LP
1121 else {
1122 log_error("read(): %m");
0c749d50 1123 r = -errno;
a258bf26
LP
1124 goto finish;
1125 }
1126 } else
1127 out_buffer_full += (size_t) k;
a258bf26
LP
1128 }
1129
1130 if (stdout_writable && out_buffer_full > 0) {
1131
db7feb7e
LP
1132 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1133 if (k < 0) {
a258bf26 1134
fd14078a 1135 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 1136 stdout_writable = false;
fd14078a 1137 else {
a258bf26 1138 log_error("write(): %m");
0c749d50 1139 r = -errno;
a258bf26
LP
1140 goto finish;
1141 }
1142
1143 } else {
1144 assert(out_buffer_full >= (size_t) k);
1145 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1146 out_buffer_full -= k;
1147 }
1148 }
1149 }
fd14078a 1150 }
a258bf26
LP
1151
1152finish:
1153 if (ep >= 0)
1154 close_nointr_nofail(ep);
1155
1156 if (signal_fd >= 0)
1157 close_nointr_nofail(signal_fd);
1158
1159 return r;
1160}
88213476
LP
1161
1162int main(int argc, char *argv[]) {
1163 pid_t pid = 0;
04d391da
LP
1164 int r = EXIT_FAILURE, k;
1165 char *oldcg = NULL, *newcg = NULL;
40c32a4a 1166 char **controller = NULL;
842f3b0f 1167 int master = -1, n_fd_passed;
a258bf26
LP
1168 const char *console = NULL;
1169 struct termios saved_attr, raw_attr;
1170 sigset_t mask;
1171 bool saved_attr_valid = false;
1172 struct winsize ws;
e58a1277 1173 int kmsg_socket_pair[2] = { -1, -1 };
842f3b0f 1174 FDSet *fds = NULL;
88213476
LP
1175
1176 log_parse_environment();
1177 log_open();
1178
db7feb7e
LP
1179 r = parse_argv(argc, argv);
1180 if (r <= 0)
88213476
LP
1181 goto finish;
1182
1183 if (arg_directory) {
1184 char *p;
1185
1186 p = path_make_absolute_cwd(arg_directory);
1187 free(arg_directory);
1188 arg_directory = p;
1189 } else
1190 arg_directory = get_current_dir_name();
1191
1192 if (!arg_directory) {
1193 log_error("Failed to determine path");
1194 goto finish;
1195 }
1196
1197 path_kill_slashes(arg_directory);
1198
1199 if (geteuid() != 0) {
1200 log_error("Need to be root.");
1201 goto finish;
1202 }
1203
04d391da
LP
1204 if (sd_booted() <= 0) {
1205 log_error("Not running on a systemd system.");
1206 goto finish;
1207 }
1208
88213476 1209 if (path_equal(arg_directory, "/")) {
6df6b939 1210 log_error("Spawning container on root directory not supported.");
88213476
LP
1211 goto finish;
1212 }
1213
1214 if (is_os_tree(arg_directory) <= 0) {
1215 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1216 goto finish;
1217 }
1218
842f3b0f
LP
1219 log_close();
1220 n_fd_passed = sd_listen_fds(false);
1221 if (n_fd_passed > 0) {
1222 k = fdset_new_listen_fds(&fds, false);
1223 if (k < 0) {
1224 log_error("Failed to collect file descriptors: %s", strerror(-k));
1225 goto finish;
1226 }
1227 }
1228 fdset_close_others(fds);
1229 log_open();
1230
db7feb7e
LP
1231 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1232 if (k < 0) {
04d391da
LP
1233 log_error("Failed to determine current cgroup: %s", strerror(-k));
1234 goto finish;
1235 }
1236
1237 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1238 log_error("Failed to allocate cgroup path.");
1239 goto finish;
1240 }
1241
40c32a4a
LGL
1242 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1243 if (k < 0) {
04d391da
LP
1244 log_error("Failed to create cgroup: %s", strerror(-k));
1245 goto finish;
1246 }
1247
db7feb7e 1248 STRV_FOREACH(controller, arg_controllers) {
40c32a4a
LGL
1249 k = cg_create_and_attach(*controller, newcg, 0);
1250 if (k < 0)
1251 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1252 }
1253
db7feb7e
LP
1254 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1255 if (master < 0) {
a258bf26
LP
1256 log_error("Failed to acquire pseudo tty: %m");
1257 goto finish;
1258 }
1259
db7feb7e
LP
1260 console = ptsname(master);
1261 if (!console) {
a258bf26
LP
1262 log_error("Failed to determine tty name: %m");
1263 goto finish;
1264 }
1265
1266 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1267
1268 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1269 ioctl(master, TIOCSWINSZ, &ws);
1270
1271 if (unlockpt(master) < 0) {
1272 log_error("Failed to unlock tty: %m");
1273 goto finish;
1274 }
1275
51d88d1b
LP
1276 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1277 saved_attr_valid = true;
a258bf26 1278
51d88d1b
LP
1279 raw_attr = saved_attr;
1280 cfmakeraw(&raw_attr);
1281 raw_attr.c_lflag &= ~ECHO;
1282 }
a258bf26 1283
e58a1277
LP
1284 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1285 log_error("Failed to create kmsg socket pair");
1286 goto finish;
1287 }
1288
a258bf26
LP
1289 assert_se(sigemptyset(&mask) == 0);
1290 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1291 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1292
d87be9b0
LP
1293 for (;;) {
1294 siginfo_t status;
1fd96121 1295 int pipefd[2];
52af2106 1296
f2d88580 1297 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1fd96121
ZJS
1298 log_error("pipe2(): %m");
1299 goto finish;
d87be9b0 1300 }
88213476 1301
d87be9b0
LP
1302 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1303 if (pid < 0) {
1304 if (errno == EINVAL)
1305 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1306 else
1307 log_error("clone() failed: %m");
a258bf26 1308
d87be9b0
LP
1309 goto finish;
1310 }
a258bf26 1311
d87be9b0
LP
1312 if (pid == 0) {
1313 /* child */
d87be9b0
LP
1314 const char *home = NULL;
1315 uid_t uid = (uid_t) -1;
1316 gid_t gid = (gid_t) -1;
5674767e 1317 unsigned n_env = 2;
d87be9b0
LP
1318 const char *envp[] = {
1319 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1320 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1321 NULL, /* TERM */
1322 NULL, /* HOME */
1323 NULL, /* USER */
1324 NULL, /* LOGNAME */
1325 NULL, /* container_uuid */
842f3b0f
LP
1326 NULL, /* LISTEN_FDS */
1327 NULL, /* LISTEN_PID */
d87be9b0
LP
1328 NULL
1329 };
a258bf26 1330
5674767e
ZJS
1331 envp[n_env] = strv_find_prefix(environ, "TERM=");
1332 if (envp[n_env])
1333 n_env ++;
a258bf26 1334
5659774c 1335 close_nointr_nofail(pipefd[1]);
1fd96121 1336 fd_wait_for_event(pipefd[0], POLLHUP, -1);
5659774c 1337 close_nointr_nofail(pipefd[0]);
1fd96121 1338
d87be9b0 1339 close_nointr_nofail(master);
842f3b0f 1340 master = -1;
a258bf26 1341
1fd96121
ZJS
1342 if (saved_attr_valid) {
1343 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1344 log_error("Failed to set terminal attributes: %m");
1345 goto child_fail;
1346 }
1347 }
1348
d87be9b0
LP
1349 close_nointr(STDIN_FILENO);
1350 close_nointr(STDOUT_FILENO);
1351 close_nointr(STDERR_FILENO);
db7feb7e 1352
842f3b0f
LP
1353 close_nointr_nofail(kmsg_socket_pair[0]);
1354 kmsg_socket_pair[0] = -1;
a258bf26 1355
d87be9b0 1356 reset_all_signal_handlers();
88213476 1357
d87be9b0
LP
1358 assert_se(sigemptyset(&mask) == 0);
1359 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
f5c1b9ee 1360
842f3b0f
LP
1361 k = open_terminal(console, O_RDWR);
1362 if (k != STDIN_FILENO) {
1363 if (k >= 0) {
1364 close_nointr_nofail(k);
1365 k = -EINVAL;
1366 }
1367
1368 log_error("Failed to open console: %s", strerror(-k));
1369 goto child_fail;
1370 }
1371
1372 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1373 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1374 log_error("Failed to duplicate console: %m");
d87be9b0 1375 goto child_fail;
842f3b0f 1376 }
bc2f673e 1377
d87be9b0
LP
1378 if (setsid() < 0) {
1379 log_error("setsid() failed: %m");
bc2f673e
LP
1380 goto child_fail;
1381 }
1382
d87be9b0
LP
1383 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1384 log_error("PR_SET_PDEATHSIG failed: %m");
1385 goto child_fail;
1386 }
e58a1277 1387
d87be9b0
LP
1388 /* Mark everything as slave, so that we still
1389 * receive mounts from the real root, but don't
1390 * propagate mounts to the real root. */
1391 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1392 log_error("MS_SLAVE|MS_REC failed: %m");
1393 goto child_fail;
1394 }
04bc4a3f 1395
d87be9b0
LP
1396 /* Turn directory into bind mount */
1397 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1398 log_error("Failed to make bind mount.");
1399 goto child_fail;
1400 }
88213476 1401
d87be9b0
LP
1402 if (arg_read_only)
1403 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1404 log_error("Failed to make read-only.");
1405 goto child_fail;
1406 }
2547bb41 1407
d87be9b0
LP
1408 if (mount_all(arg_directory) < 0)
1409 goto child_fail;
57fb9fb5 1410
d87be9b0
LP
1411 if (copy_devnodes(arg_directory) < 0)
1412 goto child_fail;
a258bf26 1413
f2d88580
LP
1414 if (setup_ptmx(arg_directory) < 0)
1415 goto child_fail;
1416
d87be9b0 1417 dev_setup(arg_directory);
88213476 1418
d87be9b0
LP
1419 if (setup_dev_console(arg_directory, console) < 0)
1420 goto child_fail;
88213476 1421
d87be9b0
LP
1422 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1423 goto child_fail;
88213476 1424
d87be9b0 1425 close_nointr_nofail(kmsg_socket_pair[1]);
842f3b0f 1426 kmsg_socket_pair[1] = -1;
a258bf26 1427
d87be9b0
LP
1428 if (setup_boot_id(arg_directory) < 0)
1429 goto child_fail;
a41fe3a2 1430
d87be9b0
LP
1431 if (setup_timezone(arg_directory) < 0)
1432 goto child_fail;
88213476 1433
d87be9b0
LP
1434 if (setup_resolv_conf(arg_directory) < 0)
1435 goto child_fail;
687d0825 1436
d87be9b0 1437 if (setup_journal(arg_directory) < 0)
687d0825 1438 goto child_fail;
687d0825 1439
17fe0523
LP
1440 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1441 goto child_fail;
1442
1443 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1444 goto child_fail;
1445
d87be9b0
LP
1446 if (chdir(arg_directory) < 0) {
1447 log_error("chdir(%s) failed: %m", arg_directory);
687d0825
MV
1448 goto child_fail;
1449 }
1450
d87be9b0
LP
1451 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1452 log_error("mount(MS_MOVE) failed: %m");
687d0825
MV
1453 goto child_fail;
1454 }
1455
d87be9b0
LP
1456 if (chroot(".") < 0) {
1457 log_error("chroot() failed: %m");
687d0825
MV
1458 goto child_fail;
1459 }
1460
d87be9b0
LP
1461 if (chdir("/") < 0) {
1462 log_error("chdir() failed: %m");
687d0825
MV
1463 goto child_fail;
1464 }
1465
d87be9b0
LP
1466 umask(0022);
1467
1468 loopback_setup();
1469
1470 if (drop_capabilities() < 0) {
1471 log_error("drop_capabilities() failed: %m");
687d0825
MV
1472 goto child_fail;
1473 }
687d0825 1474
d87be9b0
LP
1475 if (arg_user) {
1476
963ddb91
LP
1477 /* Note that this resolves user names
1478 * inside the container, and hence
1479 * accesses the NSS modules from the
1480 * container and not the host. This is
1481 * a bit weird... */
1482
d87be9b0
LP
1483 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1484 log_error("get_user_creds() failed: %m");
1485 goto child_fail;
1486 }
1487
1488 if (mkdir_parents_label(home, 0775) < 0) {
1489 log_error("mkdir_parents_label() failed: %m");
1490 goto child_fail;
1491 }
1492
1493 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1494 log_error("mkdir_safe_label() failed: %m");
1495 goto child_fail;
1496 }
1497
1498 if (initgroups((const char*)arg_user, gid) < 0) {
1499 log_error("initgroups() failed: %m");
1500 goto child_fail;
1501 }
144f0fc0 1502
d87be9b0
LP
1503 if (setresgid(gid, gid, gid) < 0) {
1504 log_error("setregid() failed: %m");
1505 goto child_fail;
1506 }
1507
1508 if (setresuid(uid, uid, uid) < 0) {
1509 log_error("setreuid() failed: %m");
1510 goto child_fail;
1511 }
3c957acf
LP
1512 } else {
1513 /* Reset everything fully to 0, just in case */
1514
1515 if (setgroups(0, NULL) < 0) {
1516 log_error("setgroups() failed: %m");
1517 goto child_fail;
1518 }
1519
1520 if (setresgid(0, 0, 0) < 0) {
1521 log_error("setregid() failed: %m");
1522 goto child_fail;
1523 }
1524
1525 if (setresuid(0, 0, 0) < 0) {
1526 log_error("setreuid() failed: %m");
1527 goto child_fail;
1528 }
d87be9b0
LP
1529 }
1530
842f3b0f
LP
1531 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1532 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1533 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 1534 log_oom();
144f0fc0
LP
1535 goto child_fail;
1536 }
687d0825 1537
d87be9b0 1538 if (arg_uuid) {
842f3b0f
LP
1539 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1540 log_oom();
1541 goto child_fail;
1542 }
1543 }
1544
1545 if (fdset_size(fds) > 0) {
1546 k = fdset_cloexec(fds, false);
1547 if (k < 0) {
1548 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1549 goto child_fail;
1550 }
1551
1552 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1553 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
d87be9b0
LP
1554 log_oom();
1555 goto child_fail;
1556 }
1557 }
1558
1559 setup_hostname();
1560
1561 if (arg_boot) {
1562 char **a;
1563 size_t l;
88213476 1564
d87be9b0 1565 /* Automatically search for the init system */
0f0dbc46 1566
d87be9b0
LP
1567 l = 1 + argc - optind;
1568 a = newa(char*, l + 1);
1569 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 1570
d87be9b0
LP
1571 a[0] = (char*) "/usr/lib/systemd/systemd";
1572 execve(a[0], a, (char**) envp);
0f0dbc46 1573
d87be9b0
LP
1574 a[0] = (char*) "/lib/systemd/systemd";
1575 execve(a[0], a, (char**) envp);
0f0dbc46 1576
d87be9b0
LP
1577 a[0] = (char*) "/sbin/init";
1578 execve(a[0], a, (char**) envp);
1579 } else if (argc > optind)
1580 execvpe(argv[optind], argv + optind, (char**) envp);
1581 else {
1582 chdir(home ? home : "/root");
1583 execle("/bin/bash", "-bash", NULL, (char**) envp);
1584 }
1585
1586 log_error("execv() failed: %m");
0f0dbc46 1587
d87be9b0
LP
1588 child_fail:
1589 _exit(EXIT_FAILURE);
da5b3bad 1590 }
88213476 1591
9d60cb63 1592 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
5659774c
ZJS
1593 close_nointr_nofail(pipefd[0]);
1594 close_nointr_nofail(pipefd[1]);
1fd96121 1595
842f3b0f
LP
1596 fdset_free(fds);
1597 fds = NULL;
1598
57cb4adf 1599 if (process_pty(master, pid, &mask) < 0)
d87be9b0 1600 goto finish;
88213476 1601
d87be9b0
LP
1602 if (saved_attr_valid)
1603 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
a258bf26 1604
d87be9b0
LP
1605 r = wait_for_terminate(pid, &status);
1606 if (r < 0) {
1607 r = EXIT_FAILURE;
1608 break;
1609 }
a258bf26 1610
d87be9b0
LP
1611 if (status.si_code == CLD_EXITED) {
1612 if (status.si_status != 0) {
1613 log_error("Container failed with error code %i.", status.si_status);
1614 r = status.si_status;
1615 break;
1616 }
1617
1618 log_debug("Container exited successfully.");
1619 break;
1620 } else if (status.si_code == CLD_KILLED &&
1621 status.si_status == SIGINT) {
1622 log_info("Container has been shut down.");
1623 r = 0;
1624 break;
1625 } else if (status.si_code == CLD_KILLED &&
1626 status.si_status == SIGHUP) {
1627 log_info("Container is being rebooted.");
1628 continue;
1629 } else if (status.si_code == CLD_KILLED ||
1630 status.si_code == CLD_DUMPED) {
88213476 1631
d87be9b0
LP
1632 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1633 r = EXIT_FAILURE;
1634 break;
1635 } else {
1636 log_error("Container failed due to unknown reason.");
1637 r = EXIT_FAILURE;
1638 break;
1639 }
1640 }
88213476
LP
1641
1642finish:
a258bf26
LP
1643 if (saved_attr_valid)
1644 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1645
1646 if (master >= 0)
1647 close_nointr_nofail(master);
1648
e58a1277
LP
1649 close_pipe(kmsg_socket_pair);
1650
04d391da
LP
1651 if (oldcg)
1652 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1653
1654 if (newcg)
1655 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
88213476 1656
04d391da 1657 free(arg_directory);
40c32a4a 1658 strv_free(arg_controllers);
04d391da
LP
1659 free(oldcg);
1660 free(newcg);
88213476 1661
842f3b0f
LP
1662 fdset_free(fds);
1663
88213476
LP
1664 return r;
1665}