]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: introduce the new /machine/ tree in the cgroup tree and move containers there
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
1fd96121 36#include <sys/poll.h>
a258bf26
LP
37#include <sys/epoll.h>
38#include <termios.h>
39#include <sys/signalfd.h>
687d0825 40#include <grp.h>
5ed27dbd 41#include <linux/fs.h>
9537eab0
LP
42#include <sys/un.h>
43#include <sys/socket.h>
88213476 44
81527be1
LP
45#include <systemd/sd-daemon.h>
46
88213476
LP
47#include "log.h"
48#include "util.h"
49e942b2 49#include "mkdir.h"
6b2d0e85 50#include "macro.h"
d7832d2c 51#include "audit.h"
94d82985 52#include "missing.h"
04d391da 53#include "cgroup-util.h"
a258bf26 54#include "strv.h"
9eb977db 55#include "path-util.h"
a41fe3a2 56#include "loopback-setup.h"
57fb9fb5 57#include "sd-id128.h"
4fc9982c 58#include "dev-setup.h"
842f3b0f 59#include "fdset.h"
acbeb427 60#include "build.h"
a5c32cff 61#include "fileio.h"
57fb9fb5 62
f2d88580
LP
63#ifndef TTY_GID
64#define TTY_GID 5
65#endif
66
57fb9fb5
LP
/* Selects how setup_journal() connects the container's journal
 * directory with the host's /var/log/journal/<machine-id>. */
typedef enum LinkJournal {
        /* Do not touch the journal at all. */
        LINK_NO,

        /* Bind mount the host directory into the guest, but only if
         * the host directory already exists; quietly do nothing when
         * preconditions (machine ID, free mount points) are not met. */
        LINK_AUTO,

        /* Create the host directory if needed and bind mount it into
         * the guest. */
        LINK_HOST,

        /* Make the host path a symlink pointing into the guest tree. */
        LINK_GUEST
} LinkJournal;
88213476
LP
73
74static char *arg_directory = NULL;
687d0825 75static char *arg_user = NULL;
40c32a4a 76static char **arg_controllers = NULL;
144f0fc0 77static char *arg_uuid = NULL;
7027ff61 78static char *arg_machine = NULL;
ff01d048 79static bool arg_private_network = false;
bc2f673e 80static bool arg_read_only = false;
0f0dbc46 81static bool arg_boot = false;
57fb9fb5 82static LinkJournal arg_link_journal = LINK_AUTO;
5076f0cc
LP
83static uint64_t arg_retain =
84 (1ULL << CAP_CHOWN) |
85 (1ULL << CAP_DAC_OVERRIDE) |
86 (1ULL << CAP_DAC_READ_SEARCH) |
87 (1ULL << CAP_FOWNER) |
88 (1ULL << CAP_FSETID) |
89 (1ULL << CAP_IPC_OWNER) |
90 (1ULL << CAP_KILL) |
91 (1ULL << CAP_LEASE) |
92 (1ULL << CAP_LINUX_IMMUTABLE) |
93 (1ULL << CAP_NET_BIND_SERVICE) |
94 (1ULL << CAP_NET_BROADCAST) |
95 (1ULL << CAP_NET_RAW) |
96 (1ULL << CAP_SETGID) |
97 (1ULL << CAP_SETFCAP) |
98 (1ULL << CAP_SETPCAP) |
99 (1ULL << CAP_SETUID) |
100 (1ULL << CAP_SYS_ADMIN) |
101 (1ULL << CAP_SYS_CHROOT) |
102 (1ULL << CAP_SYS_NICE) |
103 (1ULL << CAP_SYS_PTRACE) |
104 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 105 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
106 (1ULL << CAP_SYS_BOOT) |
107 (1ULL << CAP_AUDIT_WRITE) |
108 (1ULL << CAP_AUDIT_CONTROL);
17fe0523
LP
109static char **arg_bind = NULL;
110static char **arg_bind_ro = NULL;
88213476
LP
111
112static int help(void) {
113
114 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
115 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
17fe0523 116 " -h --help Show this help\n"
7027ff61 117 " --version Print version string\n"
17fe0523
LP
118 " -D --directory=NAME Root directory for the container\n"
119 " -b --boot Boot up full system (i.e. invoke init)\n"
120 " -u --user=USER Run the command under specified user or uid\n"
121 " -C --controllers=LIST Put the container in specified comma-separated\n"
122 " cgroup hierarchies\n"
123 " --uuid=UUID Set a specific machine UUID for the container\n"
7027ff61 124 " -M --machine=NAME Set the machine name for the container\n"
17fe0523
LP
125 " --private-network Disable network in container\n"
126 " --read-only Mount the root directory read-only\n"
127 " --capability=CAP In addition to the default, retain specified\n"
128 " capability\n"
129 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
130 " -j Equivalent to --link-journal=host\n"
131 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
132 " the container\n"
133 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
88213476
LP
134 program_invocation_short_name);
135
136 return 0;
137}
138
139static int parse_argv(int argc, char *argv[]) {
140
a41fe3a2 141 enum {
acbeb427
ZJS
142 ARG_VERSION = 0x100,
143 ARG_PRIVATE_NETWORK,
bc2f673e 144 ARG_UUID,
5076f0cc 145 ARG_READ_ONLY,
57fb9fb5 146 ARG_CAPABILITY,
17fe0523
LP
147 ARG_LINK_JOURNAL,
148 ARG_BIND,
149 ARG_BIND_RO
a41fe3a2
LP
150 };
151
88213476 152 static const struct option options[] = {
ff01d048 153 { "help", no_argument, NULL, 'h' },
acbeb427 154 { "version", no_argument, NULL, ARG_VERSION },
ff01d048
LP
155 { "directory", required_argument, NULL, 'D' },
156 { "user", required_argument, NULL, 'u' },
40c32a4a 157 { "controllers", required_argument, NULL, 'C' },
ff01d048 158 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
0f0dbc46 159 { "boot", no_argument, NULL, 'b' },
144f0fc0 160 { "uuid", required_argument, NULL, ARG_UUID },
bc2f673e 161 { "read-only", no_argument, NULL, ARG_READ_ONLY },
5076f0cc 162 { "capability", required_argument, NULL, ARG_CAPABILITY },
57fb9fb5 163 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
17fe0523
LP
164 { "bind", required_argument, NULL, ARG_BIND },
165 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
7027ff61 166 { "machine", required_argument, NULL, 'M' },
ff01d048 167 { NULL, 0, NULL, 0 }
88213476
LP
168 };
169
170 int c;
171
172 assert(argc >= 0);
173 assert(argv);
174
57fb9fb5 175 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
88213476
LP
176
177 switch (c) {
178
179 case 'h':
180 help();
181 return 0;
182
acbeb427
ZJS
183 case ARG_VERSION:
184 puts(PACKAGE_STRING);
185 puts(SYSTEMD_FEATURES);
186 return 0;
187
88213476
LP
188 case 'D':
189 free(arg_directory);
3a74cea5
LP
190 arg_directory = canonicalize_file_name(optarg);
191 if (!arg_directory) {
192 log_error("Failed to canonicalize root directory.");
88213476
LP
193 return -ENOMEM;
194 }
195
196 break;
197
687d0825
MV
198 case 'u':
199 free(arg_user);
7027ff61
LP
200 arg_user = strdup(optarg);
201 if (!arg_user)
202 return log_oom();
687d0825
MV
203
204 break;
205
40c32a4a
LGL
206 case 'C':
207 strv_free(arg_controllers);
208 arg_controllers = strv_split(optarg, ",");
7027ff61
LP
209 if (!arg_controllers)
210 return log_oom();
40c32a4a 211
7027ff61 212 cg_shorten_controllers(arg_controllers);
40c32a4a
LGL
213 break;
214
ff01d048
LP
215 case ARG_PRIVATE_NETWORK:
216 arg_private_network = true;
a41fe3a2
LP
217 break;
218
0f0dbc46
LP
219 case 'b':
220 arg_boot = true;
221 break;
222
144f0fc0
LP
223 case ARG_UUID:
224 arg_uuid = optarg;
225 break;
226
7027ff61
LP
227 case 'M':
228 if (!hostname_is_valid(optarg)) {
229 log_error("Invalid machine name: %s", optarg);
230 return -EINVAL;
231 }
232
233 free(arg_machine);
234 arg_machine = strdup(optarg);
235 if (!arg_machine)
236 return log_oom();
237
238 break;
239
bc2f673e
LP
240 case ARG_READ_ONLY:
241 arg_read_only = true;
242 break;
243
5076f0cc
LP
244 case ARG_CAPABILITY: {
245 char *state, *word;
246 size_t length;
247
248 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
249 cap_value_t cap;
250 char *t;
251
252 t = strndup(word, length);
0d0f0c50
SL
253 if (!t)
254 return log_oom();
5076f0cc
LP
255
256 if (cap_from_name(t, &cap) < 0) {
257 log_error("Failed to parse capability %s.", t);
258 free(t);
259 return -EINVAL;
260 }
261
262 free(t);
263 arg_retain |= 1ULL << (uint64_t) cap;
264 }
265
266 break;
267 }
268
57fb9fb5
LP
269 case 'j':
270 arg_link_journal = LINK_GUEST;
271 break;
272
273 case ARG_LINK_JOURNAL:
274 if (streq(optarg, "auto"))
275 arg_link_journal = LINK_AUTO;
276 else if (streq(optarg, "no"))
277 arg_link_journal = LINK_NO;
278 else if (streq(optarg, "guest"))
279 arg_link_journal = LINK_GUEST;
280 else if (streq(optarg, "host"))
281 arg_link_journal = LINK_HOST;
282 else {
283 log_error("Failed to parse link journal mode %s", optarg);
284 return -EINVAL;
285 }
286
287 break;
288
17fe0523
LP
289 case ARG_BIND:
290 case ARG_BIND_RO: {
291 _cleanup_free_ char *a = NULL, *b = NULL;
292 char *e;
293 char ***x;
294 int r;
295
296 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
297
298 e = strchr(optarg, ':');
299 if (e) {
300 a = strndup(optarg, e - optarg);
301 b = strdup(e + 1);
302 } else {
303 a = strdup(optarg);
304 b = strdup(optarg);
305 }
306
307 if (!a || !b)
308 return log_oom();
309
310 if (!path_is_absolute(a) || !path_is_absolute(b)) {
311 log_error("Invalid bind mount specification: %s", optarg);
312 return -EINVAL;
313 }
314
315 r = strv_extend(x, a);
316 if (r < 0)
317 return r;
318
319 r = strv_extend(x, b);
320 if (r < 0)
321 return r;
322
323 break;
324 }
325
88213476
LP
326 case '?':
327 return -EINVAL;
328
329 default:
330 log_error("Unknown option code %c", c);
331 return -EINVAL;
332 }
333 }
334
335 return 1;
336}
337
338static int mount_all(const char *dest) {
339
340 typedef struct MountPoint {
341 const char *what;
342 const char *where;
343 const char *type;
344 const char *options;
345 unsigned long flags;
3bd66c05 346 bool fatal;
88213476
LP
347 } MountPoint;
348
349 static const MountPoint mount_table[] = {
4b7a6af4 350 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
b4c59701
LP
351 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
352 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
e65aec12 353 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
635f7d8c 354 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 355 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
ede89845 356 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
635f7d8c 357 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 358#ifdef HAVE_SELINUX
b4c59701
LP
359 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
360 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 361#endif
88213476
LP
362 };
363
364 unsigned k;
365 int r = 0;
366
367 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
89154bd4 368 char _cleanup_free_ *where = NULL;
88213476
LP
369 int t;
370
17fe0523
LP
371 where = strjoin(dest, "/", mount_table[k].where, NULL);
372 if (!where)
373 return log_oom();
88213476 374
e65aec12 375 t = path_is_mount_point(where, true);
68fb0892 376 if (t < 0) {
88213476 377 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
88213476
LP
378
379 if (r == 0)
380 r = t;
381
382 continue;
383 }
384
9c1c7f71
LP
385 /* Skip this entry if it is not a remount. */
386 if (mount_table[k].what && t > 0)
014a9c77
LP
387 continue;
388
17fe0523 389 mkdir_p(where, 0755);
88213476
LP
390
391 if (mount(mount_table[k].what,
392 where,
393 mount_table[k].type,
394 mount_table[k].flags,
3bd66c05
LP
395 mount_table[k].options) < 0 &&
396 mount_table[k].fatal) {
88213476
LP
397
398 log_error("mount(%s) failed: %m", where);
399
400 if (r == 0)
401 r = -errno;
402 }
88213476
LP
403 }
404
e58a1277
LP
405 return r;
406}
f8440af5 407
17fe0523
LP
408static int mount_binds(const char *dest, char **l, unsigned long flags) {
409 char **x, **y;
410
411 STRV_FOREACH_PAIR(x, y, l) {
412 _cleanup_free_ char *where = NULL;
413
414 where = strjoin(dest, "/", *y, NULL);
415 if (!where)
416 return log_oom();
417
418 mkdir_p_label(where, 0755);
419
420 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
421 log_error("mount(%s) failed: %m", where);
422 return -errno;
423 }
424
425 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
426 log_error("mount(%s) failed: %m", where);
427 return -errno;
428 }
429 }
430
431 return 0;
432}
433
e58a1277 434static int setup_timezone(const char *dest) {
d4036145
LP
435 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
436 char *z, *y;
437 int r;
f8440af5 438
e58a1277
LP
439 assert(dest);
440
441 /* Fix the timezone, if possible */
d4036145
LP
442 r = readlink_malloc("/etc/localtime", &p);
443 if (r < 0) {
444 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
445 return 0;
446 }
447
448 z = path_startswith(p, "../usr/share/zoneinfo/");
449 if (!z)
450 z = path_startswith(p, "/usr/share/zoneinfo/");
451 if (!z) {
452 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
453 return 0;
454 }
455
04bc4a3f
LP
456 where = strappend(dest, "/etc/localtime");
457 if (!where)
0d0f0c50 458 return log_oom();
715ac17a 459
d4036145
LP
460 r = readlink_malloc(where, &q);
461 if (r >= 0) {
462 y = path_startswith(q, "../usr/share/zoneinfo/");
463 if (!y)
464 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 465
4d1c38b8 466
d4036145
LP
467 /* Already pointing to the right place? Then do nothing .. */
468 if (y && streq(y, z))
469 return 0;
470 }
471
472 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
473 if (!check)
0d0f0c50 474 return log_oom();
4d1c38b8 475
d4036145
LP
476 if (access(check, F_OK) < 0) {
477 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
478 return 0;
479 }
68fb0892 480
d4036145
LP
481 what = strappend("../usr/share/zoneinfo/", z);
482 if (!what)
483 return log_oom();
484
485 unlink(where);
486 if (symlink(what, where) < 0) {
487 log_error("Failed to correct timezone of container: %m");
488 return 0;
489 }
e58a1277
LP
490
491 return 0;
88213476
LP
492}
493
2547bb41
LP
494static int setup_resolv_conf(const char *dest) {
495 char *where;
496
497 assert(dest);
498
499 if (arg_private_network)
500 return 0;
501
502 /* Fix resolv.conf, if possible */
04bc4a3f
LP
503 where = strappend(dest, "/etc/resolv.conf");
504 if (!where)
0d0f0c50 505 return log_oom();
2547bb41 506
77e63faf
LP
507 /* We don't really care for the results of this really. If it
508 * fails, it fails, but meh... */
2547bb41
LP
509 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
510 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
511
512 free(where);
513
514 return 0;
515}
516
04bc4a3f 517static int setup_boot_id(const char *dest) {
ed8b7a3e 518 char _cleanup_free_ *from = NULL, *to = NULL;
04bc4a3f
LP
519 sd_id128_t rnd;
520 char as_uuid[37];
521 int r;
522
523 assert(dest);
524
525 /* Generate a new randomized boot ID, so that each boot-up of
526 * the container gets a new one */
527
528 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 529 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
530 if (!from || !to)
531 return log_oom();
04bc4a3f
LP
532
533 r = sd_id128_randomize(&rnd);
534 if (r < 0) {
535 log_error("Failed to generate random boot id: %s", strerror(-r));
ed8b7a3e 536 return r;
04bc4a3f
LP
537 }
538
539 snprintf(as_uuid, sizeof(as_uuid),
540 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
541 SD_ID128_FORMAT_VAL(rnd));
542 char_array_0(as_uuid);
543
574d5f2d 544 r = write_string_file(from, as_uuid);
04bc4a3f
LP
545 if (r < 0) {
546 log_error("Failed to write boot id: %s", strerror(-r));
ed8b7a3e 547 return r;
04bc4a3f
LP
548 }
549
550 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
551 log_error("Failed to bind mount boot id: %m");
552 r = -errno;
10d18763
ZJS
553 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
554 log_warning("Failed to make boot id read-only: %m");
04bc4a3f
LP
555
556 unlink(from);
04bc4a3f
LP
557 return r;
558}
559
e58a1277 560static int copy_devnodes(const char *dest) {
88213476
LP
561
562 static const char devnodes[] =
563 "null\0"
564 "zero\0"
565 "full\0"
566 "random\0"
567 "urandom\0"
f2d88580 568 "tty\0";
88213476
LP
569
570 const char *d;
e58a1277 571 int r = 0;
25ea79fe 572 mode_t _cleanup_umask_ u;
a258bf26
LP
573
574 assert(dest);
124640f1
LP
575
576 u = umask(0000);
88213476
LP
577
578 NULSTR_FOREACH(d, devnodes) {
e58a1277 579 struct stat st;
ed8b7a3e 580 char _cleanup_free_ *from = NULL, *to = NULL;
88213476
LP
581
582 asprintf(&from, "/dev/%s", d);
583 asprintf(&to, "%s/dev/%s", dest, d);
584
585 if (!from || !to) {
ed8b7a3e 586 log_oom();
a258bf26 587
88213476
LP
588 if (r == 0)
589 r = -ENOMEM;
590
591 break;
592 }
593
594 if (stat(from, &st) < 0) {
595
596 if (errno != ENOENT) {
597 log_error("Failed to stat %s: %m", from);
88213476
LP
598 if (r == 0)
599 r = -errno;
600 }
601
a258bf26 602 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 603
ed8b7a3e 604 log_error("%s is not a char or block device, cannot copy", from);
a258bf26
LP
605 if (r == 0)
606 r = -EIO;
607
608 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
609
610 log_error("mknod(%s) failed: %m", dest);
611 if (r == 0)
612 r = -errno;
88213476 613 }
88213476
LP
614 }
615
e58a1277
LP
616 return r;
617}
88213476 618
f2d88580
LP
619static int setup_ptmx(const char *dest) {
620 _cleanup_free_ char *p = NULL;
621
622 p = strappend(dest, "/dev/ptmx");
623 if (!p)
624 return log_oom();
625
626 if (symlink("pts/ptmx", p) < 0) {
627 log_error("Failed to create /dev/ptmx symlink: %m");
628 return -errno;
629 }
630
631 return 0;
632}
633
e58a1277
LP
634static int setup_dev_console(const char *dest, const char *console) {
635 struct stat st;
ed8b7a3e 636 char _cleanup_free_ *to = NULL;
e58a1277 637 int r;
25ea79fe 638 mode_t _cleanup_umask_ u;
e58a1277
LP
639
640 assert(dest);
641 assert(console);
642
643 u = umask(0000);
644
645 if (stat(console, &st) < 0) {
646 log_error("Failed to stat %s: %m", console);
25ea79fe 647 return -errno;
88213476 648
a258bf26 649 } else if (!S_ISCHR(st.st_mode)) {
25ea79fe
ZJS
650 log_error("/dev/console is not a char device");
651 return -EIO;
e58a1277 652 }
88213476 653
e58a1277
LP
654 r = chmod_and_chown(console, 0600, 0, 0);
655 if (r < 0) {
656 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
25ea79fe 657 return r;
a258bf26 658 }
88213476 659
25ea79fe
ZJS
660 if (asprintf(&to, "%s/dev/console", dest) < 0)
661 return log_oom();
88213476 662
a258bf26
LP
663 /* We need to bind mount the right tty to /dev/console since
664 * ptys can only exist on pts file systems. To have something
665 * to bind mount things on we create a device node first, that
666 * has the right major/minor (note that the major minor
667 * doesn't actually matter here, since we mount it over
668 * anyway). */
669
e58a1277
LP
670 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
671 log_error("mknod() for /dev/console failed: %m");
25ea79fe 672 return -errno;
e58a1277 673 }
a258bf26
LP
674
675 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
e58a1277 676 log_error("Bind mount for /dev/console failed: %m");
25ea79fe 677 return -errno;
a258bf26
LP
678 }
679
25ea79fe 680 return 0;
e58a1277
LP
681}
682
683static int setup_kmsg(const char *dest, int kmsg_socket) {
ed8b7a3e 684 char _cleanup_free_ *from = NULL, *to = NULL;
e58a1277 685 int r, fd, k;
25ea79fe 686 mode_t _cleanup_umask_ u;
e58a1277
LP
687 union {
688 struct cmsghdr cmsghdr;
689 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
690 } control = {};
691 struct msghdr mh = {
692 .msg_control = &control,
693 .msg_controllen = sizeof(control),
694 };
e58a1277
LP
695 struct cmsghdr *cmsg;
696
697 assert(dest);
698 assert(kmsg_socket >= 0);
a258bf26 699
e58a1277 700 u = umask(0000);
a258bf26 701
f1e5dfe2
LP
702 /* We create the kmsg FIFO as /dev/kmsg, but immediately
703 * delete it after bind mounting it to /proc/kmsg. While FIFOs
704 * on the reading side behave very similar to /proc/kmsg,
705 * their writing side behaves differently from /dev/kmsg in
706 * that writing blocks when nothing is reading. In order to
707 * avoid any problems with containers deadlocking due to this
708 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
709 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
710 asprintf(&to, "%s/proc/kmsg", dest) < 0)
711 return log_oom();
e58a1277
LP
712
713 if (mkfifo(from, 0600) < 0) {
714 log_error("mkfifo() for /dev/kmsg failed: %m");
25ea79fe 715 return -errno;
e58a1277
LP
716 }
717
718 r = chmod_and_chown(from, 0600, 0, 0);
719 if (r < 0) {
720 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
25ea79fe 721 return r;
e58a1277
LP
722 }
723
724 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
725 log_error("Bind mount for /proc/kmsg failed: %m");
25ea79fe 726 return -errno;
e58a1277
LP
727 }
728
729 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
730 if (fd < 0) {
731 log_error("Failed to open fifo: %m");
25ea79fe 732 return -errno;
e58a1277
LP
733 }
734
e58a1277
LP
735 cmsg = CMSG_FIRSTHDR(&mh);
736 cmsg->cmsg_level = SOL_SOCKET;
737 cmsg->cmsg_type = SCM_RIGHTS;
738 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
739 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
740
741 mh.msg_controllen = cmsg->cmsg_len;
742
743 /* Store away the fd in the socket, so that it stays open as
744 * long as we run the child */
745 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
746 close_nointr_nofail(fd);
747
748 if (k < 0) {
749 log_error("Failed to send FIFO fd: %m");
25ea79fe 750 return -errno;
a258bf26
LP
751 }
752
f1e5dfe2
LP
753 /* And now make the FIFO unavailable as /dev/kmsg... */
754 unlink(from);
25ea79fe 755 return 0;
88213476
LP
756}
757
3a74cea5 758static int setup_hostname(void) {
3a74cea5 759
7027ff61
LP
760 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
761 return -errno;
3a74cea5 762
7027ff61 763 return 0;
3a74cea5
LP
764}
765
57fb9fb5
LP
766static int setup_journal(const char *directory) {
767 sd_id128_t machine_id;
27407a01
ZJS
768 char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
769 char *id;
57fb9fb5
LP
770 int r;
771
772 if (arg_link_journal == LINK_NO)
773 return 0;
774
775 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
776 if (!p)
777 return log_oom();
57fb9fb5
LP
778
779 r = read_one_line_file(p, &b);
27407a01
ZJS
780 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
781 return 0;
782 else if (r < 0) {
783 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
57fb9fb5
LP
784 return r;
785 }
786
27407a01
ZJS
787 id = strstrip(b);
788 if (isempty(id) && arg_link_journal == LINK_AUTO)
789 return 0;
57fb9fb5 790
27407a01
ZJS
791 /* Verify validity */
792 r = sd_id128_from_string(id, &machine_id);
57fb9fb5 793 if (r < 0) {
27407a01
ZJS
794 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
795 return r;
57fb9fb5
LP
796 }
797
798 free(p);
27407a01
ZJS
799 p = strappend("/var/log/journal/", id);
800 q = strjoin(directory, "/var/log/journal/", id, NULL);
801 if (!p || !q)
802 return log_oom();
803
804 if (path_is_mount_point(p, false) > 0) {
805 if (arg_link_journal != LINK_AUTO) {
806 log_error("%s: already a mount point, refusing to use for journal", p);
807 return -EEXIST;
808 }
809
810 return 0;
57fb9fb5
LP
811 }
812
27407a01 813 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 814 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
815 log_error("%s: already a mount point, refusing to use for journal", q);
816 return -EEXIST;
57fb9fb5
LP
817 }
818
27407a01 819 return 0;
57fb9fb5
LP
820 }
821
822 r = readlink_and_make_absolute(p, &d);
823 if (r >= 0) {
824 if ((arg_link_journal == LINK_GUEST ||
825 arg_link_journal == LINK_AUTO) &&
826 path_equal(d, q)) {
827
27407a01
ZJS
828 r = mkdir_p(q, 0755);
829 if (r < 0)
830 log_warning("failed to create directory %s: %m", q);
831 return 0;
57fb9fb5
LP
832 }
833
834 if (unlink(p) < 0) {
835 log_error("Failed to remove symlink %s: %m", p);
27407a01 836 return -errno;
57fb9fb5
LP
837 }
838 } else if (r == -EINVAL) {
839
840 if (arg_link_journal == LINK_GUEST &&
841 rmdir(p) < 0) {
842
27407a01
ZJS
843 if (errno == ENOTDIR) {
844 log_error("%s already exists and is neither a symlink nor a directory", p);
845 return r;
846 } else {
57fb9fb5 847 log_error("Failed to remove %s: %m", p);
27407a01 848 return -errno;
57fb9fb5 849 }
57fb9fb5
LP
850 }
851 } else if (r != -ENOENT) {
852 log_error("readlink(%s) failed: %m", p);
27407a01 853 return r;
57fb9fb5
LP
854 }
855
856 if (arg_link_journal == LINK_GUEST) {
857
858 if (symlink(q, p) < 0) {
859 log_error("Failed to symlink %s to %s: %m", q, p);
27407a01 860 return -errno;
57fb9fb5
LP
861 }
862
27407a01
ZJS
863 r = mkdir_p(q, 0755);
864 if (r < 0)
865 log_warning("failed to create directory %s: %m", q);
866 return 0;
57fb9fb5
LP
867 }
868
869 if (arg_link_journal == LINK_HOST) {
870 r = mkdir_p(p, 0755);
871 if (r < 0) {
872 log_error("Failed to create %s: %m", p);
27407a01 873 return r;
57fb9fb5
LP
874 }
875
27407a01
ZJS
876 } else if (access(p, F_OK) < 0)
877 return 0;
57fb9fb5
LP
878
879 if (dir_is_empty(q) == 0) {
880 log_error("%s not empty.", q);
27407a01 881 return -ENOTEMPTY;
57fb9fb5
LP
882 }
883
884 r = mkdir_p(q, 0755);
885 if (r < 0) {
886 log_error("Failed to create %s: %m", q);
27407a01 887 return r;
57fb9fb5
LP
888 }
889
890 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
891 log_error("Failed to bind mount journal from host into guest: %m");
27407a01 892 return -errno;
57fb9fb5
LP
893 }
894
27407a01 895 return 0;
57fb9fb5
LP
896}
897
7027ff61
LP
898static int setup_cgroup(const char *path) {
899 char **c;
900 int r;
901
902 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
903 if (r < 0) {
904 log_error("Failed to create cgroup: %s", strerror(-r));
905 return r;
906 }
907
908 STRV_FOREACH(c, arg_controllers) {
909 r = cg_create_and_attach(*c, path, 1);
910 if (r < 0)
911 log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
912 }
913
914 return 0;
915}
916
88213476 917static int drop_capabilities(void) {
5076f0cc 918 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
919}
920
/* Heuristically checks whether "path" is the root of an OS tree.
 *
 * Returns 1 if <path>/bin/sh exists, 0 if it does not (or cannot be
 * accessed), and -ENOMEM if the path could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        /* We use /bin/sh as flag file if something is an OS */
        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found ? 1 : 0;
}
934
57cb4adf 935static int process_pty(int master, pid_t pid, sigset_t *mask) {
0c749d50 936
b72491a2 937 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
a258bf26
LP
938 size_t in_buffer_full = 0, out_buffer_full = 0;
939 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
940 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
a258bf26 941 int ep = -1, signal_fd = -1, r;
57cb4adf
LP
942 bool tried_orderly_shutdown = false;
943
944 assert(master >= 0);
945 assert(pid > 0);
946 assert(mask);
a258bf26
LP
947
948 fd_nonblock(STDIN_FILENO, 1);
949 fd_nonblock(STDOUT_FILENO, 1);
950 fd_nonblock(master, 1);
951
db7feb7e
LP
952 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
953 if (signal_fd < 0) {
a258bf26
LP
954 log_error("signalfd(): %m");
955 r = -errno;
956 goto finish;
957 }
958
db7feb7e
LP
959 ep = epoll_create1(EPOLL_CLOEXEC);
960 if (ep < 0) {
a258bf26
LP
961 log_error("Failed to create epoll: %m");
962 r = -errno;
963 goto finish;
964 }
965
51d88d1b
LP
966 /* We read from STDIN only if this is actually a TTY,
967 * otherwise we assume non-interactivity. */
968 if (isatty(STDIN_FILENO)) {
969 zero(stdin_ev);
970 stdin_ev.events = EPOLLIN|EPOLLET;
971 stdin_ev.data.fd = STDIN_FILENO;
972
973 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
974 log_error("Failed to register STDIN in epoll: %m");
975 r = -errno;
976 goto finish;
977 }
978 }
a258bf26
LP
979
980 zero(stdout_ev);
981 stdout_ev.events = EPOLLOUT|EPOLLET;
982 stdout_ev.data.fd = STDOUT_FILENO;
983
984 zero(master_ev);
985 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
986 master_ev.data.fd = master;
987
988 zero(signal_ev);
989 signal_ev.events = EPOLLIN;
990 signal_ev.data.fd = signal_fd;
991
f2956e80
MS
992 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
993 if (errno != EPERM) {
994 log_error("Failed to register stdout in epoll: %m");
995 r = -errno;
996 goto finish;
997 }
998 /* stdout without epoll support. Likely redirected to regular file. */
999 stdout_writable = true;
1000 }
1001
1002 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
a258bf26 1003 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
51d88d1b 1004 log_error("Failed to register fds in epoll: %m");
a258bf26
LP
1005 r = -errno;
1006 goto finish;
1007 }
1008
fd14078a 1009 for (;;) {
a258bf26
LP
1010 struct epoll_event ev[16];
1011 ssize_t k;
1012 int i, nfds;
1013
db7feb7e
LP
1014 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1015 if (nfds < 0) {
a258bf26
LP
1016
1017 if (errno == EINTR || errno == EAGAIN)
1018 continue;
1019
1020 log_error("epoll_wait(): %m");
1021 r = -errno;
1022 goto finish;
1023 }
1024
1025 assert(nfds >= 1);
1026
1027 for (i = 0; i < nfds; i++) {
1028 if (ev[i].data.fd == STDIN_FILENO) {
1029
fd14078a 1030 if (ev[i].events & (EPOLLIN|EPOLLHUP))
a258bf26
LP
1031 stdin_readable = true;
1032
1033 } else if (ev[i].data.fd == STDOUT_FILENO) {
1034
fd14078a 1035 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
a258bf26
LP
1036 stdout_writable = true;
1037
1038 } else if (ev[i].data.fd == master) {
1039
fd14078a 1040 if (ev[i].events & (EPOLLIN|EPOLLHUP))
a258bf26
LP
1041 master_readable = true;
1042
fd14078a 1043 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
a258bf26
LP
1044 master_writable = true;
1045
1046 } else if (ev[i].data.fd == signal_fd) {
1047 struct signalfd_siginfo sfsi;
1048 ssize_t n;
1049
db7feb7e
LP
1050 n = read(signal_fd, &sfsi, sizeof(sfsi));
1051 if (n != sizeof(sfsi)) {
a258bf26
LP
1052
1053 if (n >= 0) {
0c749d50 1054 log_error("Failed to read from signalfd: invalid block size");
a258bf26
LP
1055 r = -EIO;
1056 goto finish;
1057 }
1058
1059 if (errno != EINTR && errno != EAGAIN) {
0c749d50 1060 log_error("Failed to read from signalfd: %m");
a258bf26
LP
1061 r = -errno;
1062 goto finish;
1063 }
1064 } else {
1065
1066 if (sfsi.ssi_signo == SIGWINCH) {
1067 struct winsize ws;
1068
1069 /* The window size changed, let's forward that. */
a258bf26
LP
1070 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1071 ioctl(master, TIOCSWINSZ, &ws);
57cb4adf
LP
1072 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1073
1074 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1075
1076 /* This only works for systemd... */
1077 tried_orderly_shutdown = true;
1078 kill(pid, SIGRTMIN+3);
1079
a258bf26 1080 } else {
0c749d50 1081 r = 0;
a258bf26
LP
1082 goto finish;
1083 }
1084 }
1085 }
1086 }
1087
1088 while ((stdin_readable && in_buffer_full <= 0) ||
1089 (master_writable && in_buffer_full > 0) ||
1090 (master_readable && out_buffer_full <= 0) ||
1091 (stdout_writable && out_buffer_full > 0)) {
1092
b72491a2 1093 if (stdin_readable && in_buffer_full < LINE_MAX) {
a258bf26 1094
db7feb7e
LP
1095 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1096 if (k < 0) {
a258bf26 1097
fd14078a 1098 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 1099 stdin_readable = false;
a258bf26
LP
1100 else {
1101 log_error("read(): %m");
0c749d50 1102 r = -errno;
a258bf26
LP
1103 goto finish;
1104 }
1105 } else
1106 in_buffer_full += (size_t) k;
a258bf26
LP
1107 }
1108
1109 if (master_writable && in_buffer_full > 0) {
1110
db7feb7e
LP
1111 k = write(master, in_buffer, in_buffer_full);
1112 if (k < 0) {
a258bf26 1113
fd14078a 1114 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 1115 master_writable = false;
fd14078a 1116 else {
a258bf26 1117 log_error("write(): %m");
0c749d50 1118 r = -errno;
a258bf26
LP
1119 goto finish;
1120 }
1121
1122 } else {
1123 assert(in_buffer_full >= (size_t) k);
1124 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1125 in_buffer_full -= k;
1126 }
1127 }
1128
b72491a2 1129 if (master_readable && out_buffer_full < LINE_MAX) {
a258bf26 1130
db7feb7e
LP
1131 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1132 if (k < 0) {
a258bf26 1133
fd14078a 1134 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 1135 master_readable = false;
a258bf26
LP
1136 else {
1137 log_error("read(): %m");
0c749d50 1138 r = -errno;
a258bf26
LP
1139 goto finish;
1140 }
1141 } else
1142 out_buffer_full += (size_t) k;
a258bf26
LP
1143 }
1144
1145 if (stdout_writable && out_buffer_full > 0) {
1146
db7feb7e
LP
1147 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1148 if (k < 0) {
a258bf26 1149
fd14078a 1150 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 1151 stdout_writable = false;
fd14078a 1152 else {
a258bf26 1153 log_error("write(): %m");
0c749d50 1154 r = -errno;
a258bf26
LP
1155 goto finish;
1156 }
1157
1158 } else {
1159 assert(out_buffer_full >= (size_t) k);
1160 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1161 out_buffer_full -= k;
1162 }
1163 }
1164 }
fd14078a 1165 }
a258bf26
LP
1166
1167finish:
1168 if (ep >= 0)
1169 close_nointr_nofail(ep);
1170
1171 if (signal_fd >= 0)
1172 close_nointr_nofail(signal_fd);
1173
1174 return r;
1175}
88213476
LP
1176
1177int main(int argc, char *argv[]) {
1178 pid_t pid = 0;
04d391da 1179 int r = EXIT_FAILURE, k;
7027ff61
LP
1180 _cleanup_free_ char *machine_root = NULL, *newcg = NULL;
1181 _cleanup_close_ int master = -1;
1182 int n_fd_passed;
a258bf26
LP
1183 const char *console = NULL;
1184 struct termios saved_attr, raw_attr;
1185 sigset_t mask;
1186 bool saved_attr_valid = false;
1187 struct winsize ws;
e58a1277 1188 int kmsg_socket_pair[2] = { -1, -1 };
842f3b0f 1189 FDSet *fds = NULL;
88213476
LP
1190
1191 log_parse_environment();
1192 log_open();
1193
db7feb7e
LP
1194 r = parse_argv(argc, argv);
1195 if (r <= 0)
88213476
LP
1196 goto finish;
1197
1198 if (arg_directory) {
1199 char *p;
1200
1201 p = path_make_absolute_cwd(arg_directory);
1202 free(arg_directory);
1203 arg_directory = p;
1204 } else
1205 arg_directory = get_current_dir_name();
1206
1207 if (!arg_directory) {
1208 log_error("Failed to determine path");
1209 goto finish;
1210 }
1211
1212 path_kill_slashes(arg_directory);
1213
7027ff61
LP
1214 if (!arg_machine) {
1215 arg_machine = strdup(path_get_file_name(arg_directory));
1216 if (!arg_machine) {
1217 log_oom();
1218 goto finish;
1219 }
1220
1221 hostname_cleanup(arg_machine);
1222 if (isempty(arg_machine)) {
1223 log_error("Failed to determine machine name automatically, please use -M.");
1224 goto finish;
1225 }
1226 }
1227
88213476
LP
1228 if (geteuid() != 0) {
1229 log_error("Need to be root.");
1230 goto finish;
1231 }
1232
04d391da
LP
1233 if (sd_booted() <= 0) {
1234 log_error("Not running on a systemd system.");
1235 goto finish;
1236 }
1237
88213476 1238 if (path_equal(arg_directory, "/")) {
6df6b939 1239 log_error("Spawning container on root directory not supported.");
88213476
LP
1240 goto finish;
1241 }
1242
1243 if (is_os_tree(arg_directory) <= 0) {
1244 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1245 goto finish;
1246 }
1247
842f3b0f
LP
1248 log_close();
1249 n_fd_passed = sd_listen_fds(false);
1250 if (n_fd_passed > 0) {
1251 k = fdset_new_listen_fds(&fds, false);
1252 if (k < 0) {
1253 log_error("Failed to collect file descriptors: %s", strerror(-k));
1254 goto finish;
1255 }
1256 }
1257 fdset_close_others(fds);
1258 log_open();
1259
7027ff61 1260 k = cg_get_machine_path(&machine_root);
db7feb7e 1261 if (k < 0) {
7027ff61 1262 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
04d391da
LP
1263 goto finish;
1264 }
1265
7027ff61
LP
1266 newcg = strjoin(machine_root, "/", arg_machine, NULL);
1267 if (!newcg) {
04d391da
LP
1268 log_error("Failed to allocate cgroup path.");
1269 goto finish;
1270 }
1271
7027ff61
LP
1272 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, false);
1273 if (r <= 0 && r != -ENOENT) {
1274 log_error("Container already running.");
04d391da 1275
7027ff61
LP
1276 free(newcg);
1277 newcg = NULL;
1278
1279 goto finish;
40c32a4a
LGL
1280 }
1281
db7feb7e
LP
1282 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1283 if (master < 0) {
a258bf26
LP
1284 log_error("Failed to acquire pseudo tty: %m");
1285 goto finish;
1286 }
1287
db7feb7e
LP
1288 console = ptsname(master);
1289 if (!console) {
a258bf26
LP
1290 log_error("Failed to determine tty name: %m");
1291 goto finish;
1292 }
1293
1294 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1295
1296 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1297 ioctl(master, TIOCSWINSZ, &ws);
1298
1299 if (unlockpt(master) < 0) {
1300 log_error("Failed to unlock tty: %m");
1301 goto finish;
1302 }
1303
51d88d1b
LP
1304 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1305 saved_attr_valid = true;
a258bf26 1306
51d88d1b
LP
1307 raw_attr = saved_attr;
1308 cfmakeraw(&raw_attr);
1309 raw_attr.c_lflag &= ~ECHO;
1310 }
a258bf26 1311
e58a1277 1312 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
7027ff61 1313 log_error("Failed to create kmsg socket pair.");
e58a1277
LP
1314 goto finish;
1315 }
1316
a258bf26
LP
1317 assert_se(sigemptyset(&mask) == 0);
1318 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1319 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1320
d87be9b0
LP
1321 for (;;) {
1322 siginfo_t status;
1fd96121 1323 int pipefd[2];
52af2106 1324
f2d88580 1325 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1fd96121
ZJS
1326 log_error("pipe2(): %m");
1327 goto finish;
d87be9b0 1328 }
88213476 1329
d87be9b0
LP
1330 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1331 if (pid < 0) {
1332 if (errno == EINVAL)
1333 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1334 else
1335 log_error("clone() failed: %m");
a258bf26 1336
d87be9b0
LP
1337 goto finish;
1338 }
a258bf26 1339
d87be9b0
LP
1340 if (pid == 0) {
1341 /* child */
d87be9b0
LP
1342 const char *home = NULL;
1343 uid_t uid = (uid_t) -1;
1344 gid_t gid = (gid_t) -1;
5674767e 1345 unsigned n_env = 2;
d87be9b0
LP
1346 const char *envp[] = {
1347 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1348 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1349 NULL, /* TERM */
1350 NULL, /* HOME */
1351 NULL, /* USER */
1352 NULL, /* LOGNAME */
1353 NULL, /* container_uuid */
842f3b0f
LP
1354 NULL, /* LISTEN_FDS */
1355 NULL, /* LISTEN_PID */
d87be9b0
LP
1356 NULL
1357 };
a258bf26 1358
5674767e
ZJS
1359 envp[n_env] = strv_find_prefix(environ, "TERM=");
1360 if (envp[n_env])
1361 n_env ++;
a258bf26 1362
5659774c 1363 close_nointr_nofail(pipefd[1]);
1fd96121 1364 fd_wait_for_event(pipefd[0], POLLHUP, -1);
5659774c 1365 close_nointr_nofail(pipefd[0]);
1fd96121 1366
d87be9b0 1367 close_nointr_nofail(master);
842f3b0f 1368 master = -1;
a258bf26 1369
1fd96121
ZJS
1370 if (saved_attr_valid) {
1371 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1372 log_error("Failed to set terminal attributes: %m");
1373 goto child_fail;
1374 }
1375 }
1376
d87be9b0
LP
1377 close_nointr(STDIN_FILENO);
1378 close_nointr(STDOUT_FILENO);
1379 close_nointr(STDERR_FILENO);
db7feb7e 1380
842f3b0f
LP
1381 close_nointr_nofail(kmsg_socket_pair[0]);
1382 kmsg_socket_pair[0] = -1;
a258bf26 1383
d87be9b0 1384 reset_all_signal_handlers();
88213476 1385
d87be9b0
LP
1386 assert_se(sigemptyset(&mask) == 0);
1387 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
f5c1b9ee 1388
842f3b0f
LP
1389 k = open_terminal(console, O_RDWR);
1390 if (k != STDIN_FILENO) {
1391 if (k >= 0) {
1392 close_nointr_nofail(k);
1393 k = -EINVAL;
1394 }
1395
1396 log_error("Failed to open console: %s", strerror(-k));
1397 goto child_fail;
1398 }
1399
1400 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1401 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1402 log_error("Failed to duplicate console: %m");
d87be9b0 1403 goto child_fail;
842f3b0f 1404 }
bc2f673e 1405
d87be9b0
LP
1406 if (setsid() < 0) {
1407 log_error("setsid() failed: %m");
bc2f673e
LP
1408 goto child_fail;
1409 }
1410
d87be9b0
LP
1411 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1412 log_error("PR_SET_PDEATHSIG failed: %m");
1413 goto child_fail;
1414 }
e58a1277 1415
7027ff61
LP
1416 if (setup_cgroup(newcg) < 0)
1417 goto child_fail;
1418
d87be9b0
LP
1419 /* Mark everything as slave, so that we still
1420 * receive mounts from the real root, but don't
1421 * propagate mounts to the real root. */
1422 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1423 log_error("MS_SLAVE|MS_REC failed: %m");
1424 goto child_fail;
1425 }
04bc4a3f 1426
d87be9b0
LP
1427 /* Turn directory into bind mount */
1428 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1429 log_error("Failed to make bind mount.");
1430 goto child_fail;
1431 }
88213476 1432
d87be9b0
LP
1433 if (arg_read_only)
1434 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1435 log_error("Failed to make read-only.");
1436 goto child_fail;
1437 }
2547bb41 1438
d87be9b0
LP
1439 if (mount_all(arg_directory) < 0)
1440 goto child_fail;
57fb9fb5 1441
d87be9b0
LP
1442 if (copy_devnodes(arg_directory) < 0)
1443 goto child_fail;
a258bf26 1444
f2d88580
LP
1445 if (setup_ptmx(arg_directory) < 0)
1446 goto child_fail;
1447
d87be9b0 1448 dev_setup(arg_directory);
88213476 1449
d87be9b0
LP
1450 if (setup_dev_console(arg_directory, console) < 0)
1451 goto child_fail;
88213476 1452
d87be9b0
LP
1453 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1454 goto child_fail;
88213476 1455
d87be9b0 1456 close_nointr_nofail(kmsg_socket_pair[1]);
842f3b0f 1457 kmsg_socket_pair[1] = -1;
a258bf26 1458
d87be9b0
LP
1459 if (setup_boot_id(arg_directory) < 0)
1460 goto child_fail;
a41fe3a2 1461
d87be9b0
LP
1462 if (setup_timezone(arg_directory) < 0)
1463 goto child_fail;
88213476 1464
d87be9b0
LP
1465 if (setup_resolv_conf(arg_directory) < 0)
1466 goto child_fail;
687d0825 1467
d87be9b0 1468 if (setup_journal(arg_directory) < 0)
687d0825 1469 goto child_fail;
687d0825 1470
17fe0523
LP
1471 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1472 goto child_fail;
1473
1474 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1475 goto child_fail;
1476
d87be9b0
LP
1477 if (chdir(arg_directory) < 0) {
1478 log_error("chdir(%s) failed: %m", arg_directory);
687d0825
MV
1479 goto child_fail;
1480 }
1481
d87be9b0
LP
1482 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1483 log_error("mount(MS_MOVE) failed: %m");
687d0825
MV
1484 goto child_fail;
1485 }
1486
d87be9b0
LP
1487 if (chroot(".") < 0) {
1488 log_error("chroot() failed: %m");
687d0825
MV
1489 goto child_fail;
1490 }
1491
d87be9b0
LP
1492 if (chdir("/") < 0) {
1493 log_error("chdir() failed: %m");
687d0825
MV
1494 goto child_fail;
1495 }
1496
d87be9b0
LP
1497 umask(0022);
1498
1499 loopback_setup();
1500
1501 if (drop_capabilities() < 0) {
1502 log_error("drop_capabilities() failed: %m");
687d0825
MV
1503 goto child_fail;
1504 }
687d0825 1505
d87be9b0
LP
1506 if (arg_user) {
1507
963ddb91
LP
1508 /* Note that this resolves user names
1509 * inside the container, and hence
1510 * accesses the NSS modules from the
1511 * container and not the host. This is
1512 * a bit weird... */
1513
d87be9b0
LP
1514 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1515 log_error("get_user_creds() failed: %m");
1516 goto child_fail;
1517 }
1518
1519 if (mkdir_parents_label(home, 0775) < 0) {
1520 log_error("mkdir_parents_label() failed: %m");
1521 goto child_fail;
1522 }
1523
1524 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1525 log_error("mkdir_safe_label() failed: %m");
1526 goto child_fail;
1527 }
1528
1529 if (initgroups((const char*)arg_user, gid) < 0) {
1530 log_error("initgroups() failed: %m");
1531 goto child_fail;
1532 }
144f0fc0 1533
d87be9b0
LP
1534 if (setresgid(gid, gid, gid) < 0) {
1535 log_error("setregid() failed: %m");
1536 goto child_fail;
1537 }
1538
1539 if (setresuid(uid, uid, uid) < 0) {
1540 log_error("setreuid() failed: %m");
1541 goto child_fail;
1542 }
3c957acf
LP
1543 } else {
1544 /* Reset everything fully to 0, just in case */
1545
1546 if (setgroups(0, NULL) < 0) {
1547 log_error("setgroups() failed: %m");
1548 goto child_fail;
1549 }
1550
1551 if (setresgid(0, 0, 0) < 0) {
1552 log_error("setregid() failed: %m");
1553 goto child_fail;
1554 }
1555
1556 if (setresuid(0, 0, 0) < 0) {
1557 log_error("setreuid() failed: %m");
1558 goto child_fail;
1559 }
d87be9b0
LP
1560 }
1561
842f3b0f
LP
1562 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1563 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1564 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 1565 log_oom();
144f0fc0
LP
1566 goto child_fail;
1567 }
687d0825 1568
d87be9b0 1569 if (arg_uuid) {
842f3b0f
LP
1570 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1571 log_oom();
1572 goto child_fail;
1573 }
1574 }
1575
1576 if (fdset_size(fds) > 0) {
1577 k = fdset_cloexec(fds, false);
1578 if (k < 0) {
1579 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1580 goto child_fail;
1581 }
1582
1583 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
7027ff61 1584 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
d87be9b0
LP
1585 log_oom();
1586 goto child_fail;
1587 }
1588 }
1589
1590 setup_hostname();
1591
1592 if (arg_boot) {
1593 char **a;
1594 size_t l;
88213476 1595
d87be9b0 1596 /* Automatically search for the init system */
0f0dbc46 1597
d87be9b0
LP
1598 l = 1 + argc - optind;
1599 a = newa(char*, l + 1);
1600 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 1601
d87be9b0
LP
1602 a[0] = (char*) "/usr/lib/systemd/systemd";
1603 execve(a[0], a, (char**) envp);
0f0dbc46 1604
d87be9b0
LP
1605 a[0] = (char*) "/lib/systemd/systemd";
1606 execve(a[0], a, (char**) envp);
0f0dbc46 1607
d87be9b0
LP
1608 a[0] = (char*) "/sbin/init";
1609 execve(a[0], a, (char**) envp);
1610 } else if (argc > optind)
1611 execvpe(argv[optind], argv + optind, (char**) envp);
1612 else {
1613 chdir(home ? home : "/root");
1614 execle("/bin/bash", "-bash", NULL, (char**) envp);
1615 }
1616
1617 log_error("execv() failed: %m");
0f0dbc46 1618
d87be9b0
LP
1619 child_fail:
1620 _exit(EXIT_FAILURE);
da5b3bad 1621 }
88213476 1622
9d60cb63 1623 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
5659774c
ZJS
1624 close_nointr_nofail(pipefd[0]);
1625 close_nointr_nofail(pipefd[1]);
1fd96121 1626
842f3b0f
LP
1627 fdset_free(fds);
1628 fds = NULL;
1629
57cb4adf 1630 if (process_pty(master, pid, &mask) < 0)
d87be9b0 1631 goto finish;
88213476 1632
d87be9b0
LP
1633 if (saved_attr_valid)
1634 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
a258bf26 1635
d87be9b0
LP
1636 r = wait_for_terminate(pid, &status);
1637 if (r < 0) {
1638 r = EXIT_FAILURE;
1639 break;
1640 }
a258bf26 1641
d87be9b0
LP
1642 if (status.si_code == CLD_EXITED) {
1643 if (status.si_status != 0) {
1644 log_error("Container failed with error code %i.", status.si_status);
1645 r = status.si_status;
1646 break;
1647 }
1648
1649 log_debug("Container exited successfully.");
1650 break;
1651 } else if (status.si_code == CLD_KILLED &&
1652 status.si_status == SIGINT) {
1653 log_info("Container has been shut down.");
1654 r = 0;
1655 break;
1656 } else if (status.si_code == CLD_KILLED &&
1657 status.si_status == SIGHUP) {
1658 log_info("Container is being rebooted.");
1659 continue;
1660 } else if (status.si_code == CLD_KILLED ||
1661 status.si_code == CLD_DUMPED) {
88213476 1662
d87be9b0
LP
1663 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1664 r = EXIT_FAILURE;
1665 break;
1666 } else {
1667 log_error("Container failed due to unknown reason.");
1668 r = EXIT_FAILURE;
1669 break;
1670 }
1671 }
88213476
LP
1672
1673finish:
a258bf26
LP
1674 if (saved_attr_valid)
1675 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1676
e58a1277
LP
1677 close_pipe(kmsg_socket_pair);
1678
04d391da
LP
1679 if (newcg)
1680 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
88213476 1681
04d391da 1682 free(arg_directory);
7027ff61 1683 free(arg_machine);
40c32a4a 1684 strv_free(arg_controllers);
88213476 1685
842f3b0f
LP
1686 fdset_free(fds);
1687
88213476
LP
1688 return r;
1689}