]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: --private-network should imply CAP_NET_ADMIN
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
a258bf26
LP
36#include <termios.h>
37#include <sys/signalfd.h>
687d0825 38#include <grp.h>
5ed27dbd 39#include <linux/fs.h>
9537eab0
LP
40#include <sys/un.h>
41#include <sys/socket.h>
aea38d80 42#include <linux/netlink.h>
aa28aefe 43#include <linux/rtnetlink.h>
354bfd2b 44#include <sys/eventfd.h>
aa28aefe
LP
45#include <net/if.h>
46
5d63309c 47#ifdef HAVE_SELINUX
a8828ed9
DW
48#include <selinux/selinux.h>
49#endif
88213476 50
1f0cd86b
LP
51#include "sd-daemon.h"
52#include "sd-bus.h"
53#include "sd-id128.h"
aa28aefe 54#include "sd-rtnl.h"
88213476
LP
55#include "log.h"
56#include "util.h"
49e942b2 57#include "mkdir.h"
6b2d0e85 58#include "macro.h"
d7832d2c 59#include "audit.h"
94d82985 60#include "missing.h"
04d391da 61#include "cgroup-util.h"
a258bf26 62#include "strv.h"
9eb977db 63#include "path-util.h"
a41fe3a2 64#include "loopback-setup.h"
4fc9982c 65#include "dev-setup.h"
842f3b0f 66#include "fdset.h"
acbeb427 67#include "build.h"
a5c32cff 68#include "fileio.h"
40ca29a1 69#include "bus-util.h"
1f0cd86b 70#include "bus-error.h"
4ba93280 71#include "ptyfwd.h"
9bd37b40 72#include "bus-kernel.h"
f4889f65 73#include "env-util.h"
7f112f50 74#include "def.h"
aa28aefe 75#include "rtnl-util.h"
f2d88580 76
57fb9fb5
LP
/* Journal linking mode, selected on the command line with --link-journal=
 * (no, auto, host, guest) or the -j shortcut. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; also the built-in default */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, and -j */
} LinkJournal;
88213476
LP
83
/* Command line state. Everything below is filled in by parse_argv(). */

/* -D/--directory=: canonicalized root directory of the container */
static char *arg_directory = NULL;

/* -u/--user= */
static char *arg_user = NULL;

/* --uuid= */
static sd_id128_t arg_uuid = {};

/* -M/--machine=: validated with hostname_is_valid(); also passed to sethostname() */
static char *arg_machine = NULL;

/* -Z/--selinux-context= and -L/--selinux-apifs-context= */
static char *arg_selinux_context = NULL;
static char *arg_selinux_apifs_context = NULL;

/* -S/--slice= */
static const char *arg_slice = NULL;

/* --private-network; also switched on implicitly by --network-interface= */
static bool arg_private_network = false;

/* --read-only */
static bool arg_read_only = false;

/* -b/--boot */
static bool arg_boot = false;

/* --link-journal= / -j */
static LinkJournal arg_link_journal = LINK_AUTO;

/* Bit mask of capabilities to keep: drop_capabilities() hands the complement
 * of this mask to capability_bounding_set_drop(). parse_argv() adjusts the
 * defaults below according to --capability=, --drop-capability= and
 * --private-network. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);

/* --bind= and --bind-ro=: flat lists of (host path, container path) pairs */
static char **arg_bind = NULL;
static char **arg_bind_ro = NULL;

/* --setenv= */
static char **arg_setenv = NULL;

/* -q/--quiet */
static bool arg_quiet = false;

/* --share-system; forces arg_register to false */
static bool arg_share_system = false;

/* --register= */
static bool arg_register = true;

/* --keep-unit */
static bool arg_keep_unit = false;

/* --network-interface=, one entry per occurrence */
static char **arg_network_interfaces = NULL;
88213476
LP
130
131static int help(void) {
132
133 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
134 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
135 " -h --help Show this help\n"
136 " --version Print version string\n"
137 " -D --directory=NAME Root directory for the container\n"
138 " -b --boot Boot up full system (i.e. invoke init)\n"
139 " -u --user=USER Run the command under specified user or uid\n"
140 " --uuid=UUID Set a specific machine UUID for the container\n"
141 " -M --machine=NAME Set the machine name for the container\n"
142 " -S --slice=SLICE Place the container in the specified slice\n"
82adf6af
LP
143 " -Z --selinux-context=SECLABEL\n"
144 " Set the SELinux security context to be used by\n"
145 " processes in the container\n"
146 " -L --selinux-apifs-context=SECLABEL\n"
147 " Set the SELinux security context to be used by\n"
148 " API/tmpfs file systems in the container\n"
a8828ed9 149 " --private-network Disable network in container\n"
aa28aefe
LP
150 " --network-interface=INTERFACE\n"
151 " Assign an existing network interface to the container\n"
8a96d94e 152 " --share-system Share system namespaces with host\n"
a8828ed9
DW
153 " --read-only Mount the root directory read-only\n"
154 " --capability=CAP In addition to the default, retain specified\n"
155 " capability\n"
156 " --drop-capability=CAP Drop the specified capability from the default set\n"
157 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
158 " -j Equivalent to --link-journal=host\n"
159 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
160 " the container\n"
161 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
284c0b91 162 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
eb91eb18 163 " --register=BOOLEAN Register container as machine\n"
89f7c846
LP
164 " --keep-unit Do not register a scope for the machine, reuse\n"
165 " the service unit nspawn is running in\n"
284c0b91 166 " -q --quiet Do not show status information\n",
88213476
LP
167 program_invocation_short_name);
168
169 return 0;
170}
171
/* Parses the command line into the arg_* globals.
 *
 * Returns 1 when parsing succeeded, 0 when --help or --version was handled
 * (nothing else is left to do), and a negative errno-style value on invalid
 * input or allocation failure.
 *
 * --capability=/--drop-capability= are collected into local masks first and
 * folded into arg_retain only once all options were seen, so that their order
 * on the command line does not matter and --private-network can add
 * CAP_NET_ADMIN. */
static int parse_argv(int argc, char *argv[]) {

        enum {
                ARG_VERSION = 0x100,
                ARG_PRIVATE_NETWORK,
                ARG_UUID,
                ARG_READ_ONLY,
                ARG_CAPABILITY,
                ARG_DROP_CAPABILITY,
                ARG_LINK_JOURNAL,
                ARG_BIND,
                ARG_BIND_RO,
                ARG_SETENV,
                ARG_SHARE_SYSTEM,
                ARG_REGISTER,
                ARG_KEEP_UNIT,
                ARG_NETWORK_INTERFACE
        };

        static const struct option options[] = {
                { "help",                  no_argument,       NULL, 'h'                   },
                { "version",               no_argument,       NULL, ARG_VERSION           },
                { "directory",             required_argument, NULL, 'D'                   },
                { "user",                  required_argument, NULL, 'u'                   },
                { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
                { "boot",                  no_argument,       NULL, 'b'                   },
                { "uuid",                  required_argument, NULL, ARG_UUID              },
                { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
                { "capability",            required_argument, NULL, ARG_CAPABILITY        },
                { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
                { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
                { "bind",                  required_argument, NULL, ARG_BIND              },
                { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
                { "machine",               required_argument, NULL, 'M'                   },
                { "slice",                 required_argument, NULL, 'S'                   },
                { "setenv",                required_argument, NULL, ARG_SETENV            },
                { "selinux-context",       required_argument, NULL, 'Z'                   },
                { "selinux-apifs-context", required_argument, NULL, 'L'                   },
                { "quiet",                 no_argument,       NULL, 'q'                   },
                { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
                { "register",              required_argument, NULL, ARG_REGISTER          },
                { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
                { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
                {}
        };

        int c, r;
        uint64_t plus = 0, minus = 0;

        assert(argc >= 0);
        assert(argv);

        while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {

                switch (c) {

                case 'h':
                        return help();

                case ARG_VERSION:
                        puts(PACKAGE_STRING);
                        puts(SYSTEMD_FEATURES);
                        return 0;

                case 'D':
                        free(arg_directory);
                        arg_directory = canonicalize_file_name(optarg);
                        if (!arg_directory) {
                                /* Save errno before logging, and report the
                                 * real cause instead of a blanket ENOMEM. */
                                r = -errno;
                                log_error("Invalid root directory: %m");
                                return r;
                        }

                        break;

                case 'u':
                        free(arg_user);
                        arg_user = strdup(optarg);
                        if (!arg_user)
                                return log_oom();

                        break;

                case ARG_NETWORK_INTERFACE:
                        if (strv_push(&arg_network_interfaces, optarg) < 0)
                                return log_oom();

                        /* fall through: passing an interface implies --private-network */

                case ARG_PRIVATE_NETWORK:
                        arg_private_network = true;
                        break;

                case 'b':
                        arg_boot = true;
                        break;

                case ARG_UUID:
                        r = sd_id128_from_string(optarg, &arg_uuid);
                        if (r < 0) {
                                log_error("Invalid UUID: %s", optarg);
                                return r;
                        }
                        break;

                case 'S':
                        arg_slice = strdup(optarg);
                        if (!arg_slice)
                                return log_oom();

                        break;

                case 'M':
                        if (isempty(optarg)) {
                                /* An empty name resets the machine name */
                                free(arg_machine);
                                arg_machine = NULL;
                        } else {

                                if (!hostname_is_valid(optarg)) {
                                        log_error("Invalid machine name: %s", optarg);
                                        return -EINVAL;
                                }

                                free(arg_machine);
                                arg_machine = strdup(optarg);
                                if (!arg_machine)
                                        return log_oom();
                        }

                        /* The break must cover both branches: otherwise an
                         * empty -M argument falls through into 'Z' and
                         * overwrites arg_selinux_context. */
                        break;

                case 'Z':
                        arg_selinux_context = optarg;
                        break;

                case 'L':
                        arg_selinux_apifs_context = optarg;
                        break;

                case ARG_READ_ONLY:
                        arg_read_only = true;
                        break;

                case ARG_CAPABILITY:
                case ARG_DROP_CAPABILITY: {
                        char *state, *word;
                        size_t length;

                        /* Comma separated list of capability names, or "all" */
                        FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
                                _cleanup_free_ char *t = NULL;
                                cap_value_t cap;

                                t = strndup(word, length);
                                if (!t)
                                        return log_oom();

                                if (streq(t, "all")) {
                                        if (c == ARG_CAPABILITY)
                                                plus = (uint64_t) -1;
                                        else
                                                minus = (uint64_t) -1;
                                } else {
                                        if (cap_from_name(t, &cap) < 0) {
                                                log_error("Failed to parse capability %s.", t);
                                                return -EINVAL;
                                        }

                                        if (c == ARG_CAPABILITY)
                                                plus |= 1ULL << (uint64_t) cap;
                                        else
                                                minus |= 1ULL << (uint64_t) cap;
                                }
                        }

                        break;
                }

                case 'j':
                        arg_link_journal = LINK_GUEST;
                        break;

                case ARG_LINK_JOURNAL:
                        if (streq(optarg, "auto"))
                                arg_link_journal = LINK_AUTO;
                        else if (streq(optarg, "no"))
                                arg_link_journal = LINK_NO;
                        else if (streq(optarg, "guest"))
                                arg_link_journal = LINK_GUEST;
                        else if (streq(optarg, "host"))
                                arg_link_journal = LINK_HOST;
                        else {
                                log_error("Failed to parse link journal mode %s", optarg);
                                return -EINVAL;
                        }

                        break;

                case ARG_BIND:
                case ARG_BIND_RO: {
                        _cleanup_free_ char *a = NULL, *b = NULL;
                        char *e;
                        char ***x;

                        x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;

                        /* "SRC:DST", or just "PATH" to use the same path on both sides */
                        e = strchr(optarg, ':');
                        if (e) {
                                a = strndup(optarg, e - optarg);
                                b = strdup(e + 1);
                        } else {
                                a = strdup(optarg);
                                b = strdup(optarg);
                        }

                        if (!a || !b)
                                return log_oom();

                        if (!path_is_absolute(a) || !path_is_absolute(b)) {
                                log_error("Invalid bind mount specification: %s", optarg);
                                return -EINVAL;
                        }

                        /* Stored as consecutive (source, destination) pairs */
                        r = strv_extend(x, a);
                        if (r < 0)
                                return log_oom();

                        r = strv_extend(x, b);
                        if (r < 0)
                                return log_oom();

                        break;
                }

                case ARG_SETENV: {
                        char **n;

                        if (!env_assignment_is_valid(optarg)) {
                                log_error("Environment variable assignment '%s' is not valid.", optarg);
                                return -EINVAL;
                        }

                        n = strv_env_set(arg_setenv, optarg);
                        if (!n)
                                return log_oom();

                        strv_free(arg_setenv);
                        arg_setenv = n;
                        break;
                }

                case 'q':
                        arg_quiet = true;
                        break;

                case ARG_SHARE_SYSTEM:
                        arg_share_system = true;
                        break;

                case ARG_REGISTER:
                        r = parse_boolean(optarg);
                        if (r < 0) {
                                log_error("Failed to parse --register= argument: %s", optarg);
                                return r;
                        }

                        arg_register = r;
                        break;

                case ARG_KEEP_UNIT:
                        arg_keep_unit = true;
                        break;

                case '?':
                        return -EINVAL;

                default:
                        assert_not_reached("Unhandled option");
                }
        }

        if (arg_share_system)
                arg_register = false;

        if (arg_boot && arg_share_system) {
                log_error("--boot and --share-system may not be combined.");
                return -EINVAL;
        }

        if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
                log_error("--keep-unit may not be used when invoked from a user session.");
                return -EINVAL;
        }

        /* Explicit drops win over both explicit additions and the implied CAP_NET_ADMIN */
        arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;

        return 1;
}
468
469static int mount_all(const char *dest) {
470
471 typedef struct MountPoint {
472 const char *what;
473 const char *where;
474 const char *type;
475 const char *options;
476 unsigned long flags;
3bd66c05 477 bool fatal;
88213476
LP
478 } MountPoint;
479
480 static const MountPoint mount_table[] = {
4b7a6af4 481 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
b4c59701
LP
482 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
483 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
e65aec12 484 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
635f7d8c 485 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 486 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
ede89845 487 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
635f7d8c 488 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 489#ifdef HAVE_SELINUX
b4c59701
LP
490 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
491 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 492#endif
88213476
LP
493 };
494
495 unsigned k;
496 int r = 0;
497
498 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 499 _cleanup_free_ char *where = NULL;
d002827b 500#ifdef HAVE_SELINUX
a8828ed9 501 _cleanup_free_ char *options = NULL;
d002827b
LP
502#endif
503 const char *o;
88213476
LP
504 int t;
505
17fe0523
LP
506 where = strjoin(dest, "/", mount_table[k].where, NULL);
507 if (!where)
508 return log_oom();
88213476 509
e65aec12 510 t = path_is_mount_point(where, true);
68fb0892 511 if (t < 0) {
88213476 512 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
88213476
LP
513
514 if (r == 0)
515 r = t;
516
517 continue;
518 }
519
9c1c7f71
LP
520 /* Skip this entry if it is not a remount. */
521 if (mount_table[k].what && t > 0)
014a9c77
LP
522 continue;
523
17fe0523 524 mkdir_p(where, 0755);
88213476 525
a8828ed9 526#ifdef HAVE_SELINUX
82adf6af
LP
527 if (arg_selinux_apifs_context &&
528 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
529 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
530 if (!options)
531 return log_oom();
532
533 o = options;
534 } else
a8828ed9 535#endif
d002827b 536 o = mount_table[k].options;
a8828ed9 537
a8828ed9 538
88213476
LP
539 if (mount(mount_table[k].what,
540 where,
541 mount_table[k].type,
542 mount_table[k].flags,
d002827b 543 o) < 0 &&
3bd66c05 544 mount_table[k].fatal) {
88213476
LP
545
546 log_error("mount(%s) failed: %m", where);
547
548 if (r == 0)
549 r = -errno;
550 }
88213476
LP
551 }
552
e58a1277
LP
553 return r;
554}
f8440af5 555
17fe0523
LP
556static int mount_binds(const char *dest, char **l, unsigned long flags) {
557 char **x, **y;
558
559 STRV_FOREACH_PAIR(x, y, l) {
2ed4e5e0 560 char *where;
d2421337 561 struct stat source_st, dest_st;
2ed4e5e0 562 int r;
d2421337
DR
563
564 if (stat(*x, &source_st) < 0) {
565 log_error("failed to stat %s: %m", *x);
566 return -errno;
567 }
17fe0523 568
2ed4e5e0
SL
569 where = strappenda(dest, *y);
570 r = stat(where, &dest_st);
571 if (r == 0) {
d2421337 572 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
cecf24e7 573 log_error("The file types of %s and %s do not match. Refusing bind mount",
d2421337
DR
574 *x, where);
575 return -EINVAL;
576 }
2ed4e5e0
SL
577 } else if (errno == ENOENT) {
578 r = mkdir_parents_label(where, 0755);
579 if (r < 0) {
580 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
581 return r;
d2421337 582 }
2ed4e5e0
SL
583 } else {
584 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
585 return -errno;
586 }
587 /* Create the mount point, but be conservative -- refuse to create block
588 * and char devices. */
589 if (S_ISDIR(source_st.st_mode))
590 mkdir_label(where, 0755);
591 else if (S_ISFIFO(source_st.st_mode))
592 mkfifo(where, 0644);
593 else if (S_ISSOCK(source_st.st_mode))
594 mknod(where, 0644 | S_IFSOCK, 0);
595 else if (S_ISREG(source_st.st_mode))
596 touch(where);
597 else {
598 log_error("Refusing to create mountpoint for file: %s", *x);
599 return -ENOTSUP;
d2421337 600 }
17fe0523
LP
601
602 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
603 log_error("mount(%s) failed: %m", where);
604 return -errno;
605 }
606
607 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
608 log_error("mount(%s) failed: %m", where);
609 return -errno;
610 }
611 }
612
613 return 0;
614}
615
e58a1277 616static int setup_timezone(const char *dest) {
d4036145
LP
617 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
618 char *z, *y;
619 int r;
f8440af5 620
e58a1277
LP
621 assert(dest);
622
623 /* Fix the timezone, if possible */
d4036145
LP
624 r = readlink_malloc("/etc/localtime", &p);
625 if (r < 0) {
626 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
627 return 0;
628 }
629
630 z = path_startswith(p, "../usr/share/zoneinfo/");
631 if (!z)
632 z = path_startswith(p, "/usr/share/zoneinfo/");
633 if (!z) {
634 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
635 return 0;
636 }
637
04bc4a3f
LP
638 where = strappend(dest, "/etc/localtime");
639 if (!where)
0d0f0c50 640 return log_oom();
715ac17a 641
d4036145
LP
642 r = readlink_malloc(where, &q);
643 if (r >= 0) {
644 y = path_startswith(q, "../usr/share/zoneinfo/");
645 if (!y)
646 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 647
4d1c38b8 648
d4036145
LP
649 /* Already pointing to the right place? Then do nothing .. */
650 if (y && streq(y, z))
651 return 0;
652 }
653
654 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
655 if (!check)
0d0f0c50 656 return log_oom();
4d1c38b8 657
d4036145
LP
658 if (access(check, F_OK) < 0) {
659 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
660 return 0;
661 }
68fb0892 662
d4036145
LP
663 what = strappend("../usr/share/zoneinfo/", z);
664 if (!what)
665 return log_oom();
666
667 unlink(where);
668 if (symlink(what, where) < 0) {
669 log_error("Failed to correct timezone of container: %m");
670 return 0;
671 }
e58a1277
LP
672
673 return 0;
88213476
LP
674}
675
2547bb41 676static int setup_resolv_conf(const char *dest) {
f333fbb1 677 char _cleanup_free_ *where = NULL;
2547bb41
LP
678
679 assert(dest);
680
681 if (arg_private_network)
682 return 0;
683
684 /* Fix resolv.conf, if possible */
04bc4a3f
LP
685 where = strappend(dest, "/etc/resolv.conf");
686 if (!where)
0d0f0c50 687 return log_oom();
2547bb41 688
77e63faf
LP
689 /* We don't really care for the results of this really. If it
690 * fails, it fails, but meh... */
51045322 691 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
2547bb41
LP
692
693 return 0;
694}
695
04bc4a3f 696static int setup_boot_id(const char *dest) {
7fd1b19b 697 _cleanup_free_ char *from = NULL, *to = NULL;
04bc4a3f
LP
698 sd_id128_t rnd;
699 char as_uuid[37];
700 int r;
701
702 assert(dest);
703
eb91eb18
LP
704 if (arg_share_system)
705 return 0;
706
04bc4a3f
LP
707 /* Generate a new randomized boot ID, so that each boot-up of
708 * the container gets a new one */
709
710 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 711 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
712 if (!from || !to)
713 return log_oom();
04bc4a3f
LP
714
715 r = sd_id128_randomize(&rnd);
716 if (r < 0) {
717 log_error("Failed to generate random boot id: %s", strerror(-r));
ed8b7a3e 718 return r;
04bc4a3f
LP
719 }
720
721 snprintf(as_uuid, sizeof(as_uuid),
722 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
723 SD_ID128_FORMAT_VAL(rnd));
724 char_array_0(as_uuid);
725
574d5f2d 726 r = write_string_file(from, as_uuid);
04bc4a3f
LP
727 if (r < 0) {
728 log_error("Failed to write boot id: %s", strerror(-r));
ed8b7a3e 729 return r;
04bc4a3f
LP
730 }
731
732 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
733 log_error("Failed to bind mount boot id: %m");
734 r = -errno;
10d18763
ZJS
735 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
736 log_warning("Failed to make boot id read-only: %m");
04bc4a3f
LP
737
738 unlink(from);
04bc4a3f
LP
739 return r;
740}
741
e58a1277 742static int copy_devnodes(const char *dest) {
88213476
LP
743
744 static const char devnodes[] =
745 "null\0"
746 "zero\0"
747 "full\0"
748 "random\0"
749 "urandom\0"
f2d88580 750 "tty\0";
88213476
LP
751
752 const char *d;
e58a1277 753 int r = 0;
7fd1b19b 754 _cleanup_umask_ mode_t u;
a258bf26
LP
755
756 assert(dest);
124640f1
LP
757
758 u = umask(0000);
88213476
LP
759
760 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 761 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 762 struct stat st;
88213476 763
7f112f50
LP
764 from = strappend("/dev/", d);
765 to = strjoin(dest, "/dev/", d, NULL);
766 if (!from || !to)
767 return log_oom();
88213476
LP
768
769 if (stat(from, &st) < 0) {
770
771 if (errno != ENOENT) {
772 log_error("Failed to stat %s: %m", from);
7f112f50 773 return -errno;
88213476
LP
774 }
775
a258bf26 776 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 777
ed8b7a3e 778 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 779 return -EIO;
a258bf26
LP
780
781 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
782
783 log_error("mknod(%s) failed: %m", dest);
7f112f50 784 return -errno;
88213476 785 }
88213476
LP
786 }
787
e58a1277
LP
788 return r;
789}
88213476 790
f2d88580
LP
791static int setup_ptmx(const char *dest) {
792 _cleanup_free_ char *p = NULL;
793
794 p = strappend(dest, "/dev/ptmx");
795 if (!p)
796 return log_oom();
797
798 if (symlink("pts/ptmx", p) < 0) {
799 log_error("Failed to create /dev/ptmx symlink: %m");
800 return -errno;
801 }
802
803 return 0;
804}
805
e58a1277
LP
806static int setup_dev_console(const char *dest, const char *console) {
807 struct stat st;
7fd1b19b 808 _cleanup_free_ char *to = NULL;
e58a1277 809 int r;
7fd1b19b 810 _cleanup_umask_ mode_t u;
e58a1277
LP
811
812 assert(dest);
813 assert(console);
814
815 u = umask(0000);
816
817 if (stat(console, &st) < 0) {
818 log_error("Failed to stat %s: %m", console);
25ea79fe 819 return -errno;
88213476 820
a258bf26 821 } else if (!S_ISCHR(st.st_mode)) {
25ea79fe
ZJS
822 log_error("/dev/console is not a char device");
823 return -EIO;
e58a1277 824 }
88213476 825
e58a1277
LP
826 r = chmod_and_chown(console, 0600, 0, 0);
827 if (r < 0) {
828 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
25ea79fe 829 return r;
a258bf26 830 }
88213476 831
25ea79fe
ZJS
832 if (asprintf(&to, "%s/dev/console", dest) < 0)
833 return log_oom();
88213476 834
a258bf26
LP
835 /* We need to bind mount the right tty to /dev/console since
836 * ptys can only exist on pts file systems. To have something
837 * to bind mount things on we create a device node first, that
838 * has the right major/minor (note that the major minor
839 * doesn't actually matter here, since we mount it over
840 * anyway). */
841
e58a1277
LP
842 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
843 log_error("mknod() for /dev/console failed: %m");
25ea79fe 844 return -errno;
e58a1277 845 }
a258bf26
LP
846
847 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
e58a1277 848 log_error("Bind mount for /dev/console failed: %m");
25ea79fe 849 return -errno;
a258bf26
LP
850 }
851
25ea79fe 852 return 0;
e58a1277
LP
853}
854
855static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 856 _cleanup_free_ char *from = NULL, *to = NULL;
e58a1277 857 int r, fd, k;
7fd1b19b 858 _cleanup_umask_ mode_t u;
e58a1277
LP
859 union {
860 struct cmsghdr cmsghdr;
861 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
862 } control = {};
863 struct msghdr mh = {
864 .msg_control = &control,
865 .msg_controllen = sizeof(control),
866 };
e58a1277
LP
867 struct cmsghdr *cmsg;
868
869 assert(dest);
870 assert(kmsg_socket >= 0);
a258bf26 871
e58a1277 872 u = umask(0000);
a258bf26 873
f1e5dfe2
LP
874 /* We create the kmsg FIFO as /dev/kmsg, but immediately
875 * delete it after bind mounting it to /proc/kmsg. While FIFOs
876 * on the reading side behave very similar to /proc/kmsg,
877 * their writing side behaves differently from /dev/kmsg in
878 * that writing blocks when nothing is reading. In order to
879 * avoid any problems with containers deadlocking due to this
880 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
881 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
882 asprintf(&to, "%s/proc/kmsg", dest) < 0)
883 return log_oom();
e58a1277
LP
884
885 if (mkfifo(from, 0600) < 0) {
886 log_error("mkfifo() for /dev/kmsg failed: %m");
25ea79fe 887 return -errno;
e58a1277
LP
888 }
889
890 r = chmod_and_chown(from, 0600, 0, 0);
891 if (r < 0) {
892 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
25ea79fe 893 return r;
e58a1277
LP
894 }
895
896 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
897 log_error("Bind mount for /proc/kmsg failed: %m");
25ea79fe 898 return -errno;
e58a1277
LP
899 }
900
901 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
902 if (fd < 0) {
903 log_error("Failed to open fifo: %m");
25ea79fe 904 return -errno;
e58a1277
LP
905 }
906
e58a1277
LP
907 cmsg = CMSG_FIRSTHDR(&mh);
908 cmsg->cmsg_level = SOL_SOCKET;
909 cmsg->cmsg_type = SCM_RIGHTS;
910 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
911 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
912
913 mh.msg_controllen = cmsg->cmsg_len;
914
915 /* Store away the fd in the socket, so that it stays open as
916 * long as we run the child */
917 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
918 close_nointr_nofail(fd);
919
920 if (k < 0) {
921 log_error("Failed to send FIFO fd: %m");
25ea79fe 922 return -errno;
a258bf26
LP
923 }
924
f1e5dfe2
LP
925 /* And now make the FIFO unavailable as /dev/kmsg... */
926 unlink(from);
25ea79fe 927 return 0;
88213476
LP
928}
929
3a74cea5 930static int setup_hostname(void) {
3a74cea5 931
eb91eb18
LP
932 if (arg_share_system)
933 return 0;
934
7027ff61
LP
935 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
936 return -errno;
3a74cea5 937
7027ff61 938 return 0;
3a74cea5
LP
939}
940
57fb9fb5 941static int setup_journal(const char *directory) {
4d680aee 942 sd_id128_t machine_id, this_id;
7fd1b19b 943 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 944 char *id;
57fb9fb5
LP
945 int r;
946
57fb9fb5 947 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
948 if (!p)
949 return log_oom();
57fb9fb5
LP
950
951 r = read_one_line_file(p, &b);
27407a01
ZJS
952 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
953 return 0;
954 else if (r < 0) {
955 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
57fb9fb5
LP
956 return r;
957 }
958
27407a01
ZJS
959 id = strstrip(b);
960 if (isempty(id) && arg_link_journal == LINK_AUTO)
961 return 0;
57fb9fb5 962
27407a01
ZJS
963 /* Verify validity */
964 r = sd_id128_from_string(id, &machine_id);
57fb9fb5 965 if (r < 0) {
27407a01
ZJS
966 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
967 return r;
57fb9fb5
LP
968 }
969
4d680aee
ZJS
970 r = sd_id128_get_machine(&this_id);
971 if (r < 0) {
972 log_error("Failed to retrieve machine ID: %s", strerror(-r));
973 return r;
974 }
975
976 if (sd_id128_equal(machine_id, this_id)) {
977 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
978 "Host and machine ids are equal (%s): refusing to link journals", id);
979 if (arg_link_journal == LINK_AUTO)
980 return 0;
981 return
982 -EEXIST;
983 }
984
985 if (arg_link_journal == LINK_NO)
986 return 0;
987
57fb9fb5 988 free(p);
27407a01
ZJS
989 p = strappend("/var/log/journal/", id);
990 q = strjoin(directory, "/var/log/journal/", id, NULL);
991 if (!p || !q)
992 return log_oom();
993
994 if (path_is_mount_point(p, false) > 0) {
995 if (arg_link_journal != LINK_AUTO) {
996 log_error("%s: already a mount point, refusing to use for journal", p);
997 return -EEXIST;
998 }
999
1000 return 0;
57fb9fb5
LP
1001 }
1002
27407a01 1003 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1004 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1005 log_error("%s: already a mount point, refusing to use for journal", q);
1006 return -EEXIST;
57fb9fb5
LP
1007 }
1008
27407a01 1009 return 0;
57fb9fb5
LP
1010 }
1011
1012 r = readlink_and_make_absolute(p, &d);
1013 if (r >= 0) {
1014 if ((arg_link_journal == LINK_GUEST ||
1015 arg_link_journal == LINK_AUTO) &&
1016 path_equal(d, q)) {
1017
27407a01
ZJS
1018 r = mkdir_p(q, 0755);
1019 if (r < 0)
1020 log_warning("failed to create directory %s: %m", q);
1021 return 0;
57fb9fb5
LP
1022 }
1023
1024 if (unlink(p) < 0) {
1025 log_error("Failed to remove symlink %s: %m", p);
27407a01 1026 return -errno;
57fb9fb5
LP
1027 }
1028 } else if (r == -EINVAL) {
1029
1030 if (arg_link_journal == LINK_GUEST &&
1031 rmdir(p) < 0) {
1032
27407a01
ZJS
1033 if (errno == ENOTDIR) {
1034 log_error("%s already exists and is neither a symlink nor a directory", p);
1035 return r;
1036 } else {
57fb9fb5 1037 log_error("Failed to remove %s: %m", p);
27407a01 1038 return -errno;
57fb9fb5 1039 }
57fb9fb5
LP
1040 }
1041 } else if (r != -ENOENT) {
1042 log_error("readlink(%s) failed: %m", p);
27407a01 1043 return r;
57fb9fb5
LP
1044 }
1045
1046 if (arg_link_journal == LINK_GUEST) {
1047
1048 if (symlink(q, p) < 0) {
1049 log_error("Failed to symlink %s to %s: %m", q, p);
27407a01 1050 return -errno;
57fb9fb5
LP
1051 }
1052
27407a01
ZJS
1053 r = mkdir_p(q, 0755);
1054 if (r < 0)
1055 log_warning("failed to create directory %s: %m", q);
1056 return 0;
57fb9fb5
LP
1057 }
1058
1059 if (arg_link_journal == LINK_HOST) {
1060 r = mkdir_p(p, 0755);
1061 if (r < 0) {
1062 log_error("Failed to create %s: %m", p);
27407a01 1063 return r;
57fb9fb5
LP
1064 }
1065
27407a01
ZJS
1066 } else if (access(p, F_OK) < 0)
1067 return 0;
57fb9fb5
LP
1068
1069 if (dir_is_empty(q) == 0) {
1070 log_error("%s not empty.", q);
27407a01 1071 return -ENOTEMPTY;
57fb9fb5
LP
1072 }
1073
1074 r = mkdir_p(q, 0755);
1075 if (r < 0) {
1076 log_error("Failed to create %s: %m", q);
27407a01 1077 return r;
57fb9fb5
LP
1078 }
1079
1080 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1081 log_error("Failed to bind mount journal from host into guest: %m");
27407a01 1082 return -errno;
57fb9fb5
LP
1083 }
1084
27407a01 1085 return 0;
57fb9fb5
LP
1086}
1087
9bd37b40
LP
/* Makes the kdbus domain directory 'path' visible inside the container tree
 * rooted at 'dest' by bind mounting it onto <dest>/dev/kdbus.
 *
 * A NULL 'path' means there is nothing to set up and is not an error.
 * Returns 0 on success or a negative errno-style value on failure. */
static int setup_kdbus(const char *dest, const char *path) {
        const char *target;

        /* No kdbus domain was passed in, so skip the whole step. */
        if (path == NULL)
                return 0;

        /* Stack-allocated concatenation, valid until this function returns. */
        target = strappenda(dest, "/dev/kdbus");

        /* Create the mount point first ... */
        if (mkdir(target, 0755) < 0) {
                log_error("Failed to create kdbus path: %m");
                return -errno;
        }

        /* ... then bind mount the domain directory on top of it. */
        if (mount(path, target, "bind", MS_BIND, NULL) < 0) {
                log_error("Failed to mount kdbus domain path: %m");
                return -errno;
        }

        return 0;
}
1107
88213476 1108static int drop_capabilities(void) {
5076f0cc 1109 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1110}
1111
354bfd2b 1112static int register_machine(pid_t pid) {
9444b1f2
LP
1113 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1114 _cleanup_bus_unref_ sd_bus *bus = NULL;
1115 int r;
1116
eb91eb18
LP
1117 if (!arg_register)
1118 return 0;
1119
1c03020c 1120 r = sd_bus_default_system(&bus);
9444b1f2
LP
1121 if (r < 0) {
1122 log_error("Failed to open system bus: %s", strerror(-r));
1123 return r;
1124 }
1125
89f7c846
LP
1126 if (arg_keep_unit) {
1127 r = sd_bus_call_method(
1128 bus,
1129 "org.freedesktop.machine1",
1130 "/org/freedesktop/machine1",
1131 "org.freedesktop.machine1.Manager",
1132 "RegisterMachine",
1133 &error,
1134 NULL,
1135 "sayssus",
1136 arg_machine,
1137 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1138 "nspawn",
1139 "container",
1140 (uint32_t) pid,
1141 strempty(arg_directory));
1142 } else {
1143 r = sd_bus_call_method(
1144 bus,
1145 "org.freedesktop.machine1",
1146 "/org/freedesktop/machine1",
1147 "org.freedesktop.machine1.Manager",
1148 "CreateMachine",
1149 &error,
1150 NULL,
1151 "sayssusa(sv)",
1152 arg_machine,
1153 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1154 "nspawn",
1155 "container",
1156 (uint32_t) pid,
1157 strempty(arg_directory),
1158 !isempty(arg_slice), "Slice", "s", arg_slice);
1159 }
1160
9444b1f2 1161 if (r < 0) {
1f0cd86b
LP
1162 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1163 return r;
1164 }
1165
1166 return 0;
1167}
1168
1169static int terminate_machine(pid_t pid) {
1170 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1171 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1172 _cleanup_bus_unref_ sd_bus *bus = NULL;
1173 const char *path;
1174 int r;
1175
eb91eb18
LP
1176 if (!arg_register)
1177 return 0;
1178
76b54375 1179 r = sd_bus_default_system(&bus);
1f0cd86b
LP
1180 if (r < 0) {
1181 log_error("Failed to open system bus: %s", strerror(-r));
1182 return r;
1183 }
1184
1185 r = sd_bus_call_method(
1186 bus,
1187 "org.freedesktop.machine1",
1188 "/org/freedesktop/machine1",
1189 "org.freedesktop.machine1.Manager",
1190 "GetMachineByPID",
1191 &error,
1192 &reply,
1193 "u",
1194 (uint32_t) pid);
1195 if (r < 0) {
1196 /* Note that the machine might already have been
1197 * cleaned up automatically, hence don't consider it a
1198 * failure if we cannot get the machine object. */
1199 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1200 return 0;
1201 }
1202
1203 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
1204 if (r < 0)
1205 return bus_log_parse_error(r);
9444b1f2 1206
1f0cd86b
LP
1207 r = sd_bus_call_method(
1208 bus,
1209 "org.freedesktop.machine1",
1210 path,
1211 "org.freedesktop.machine1.Machine",
1212 "Terminate",
1213 &error,
1214 NULL,
1215 NULL);
1216 if (r < 0) {
1217 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1218 return 0;
1219 }
1220
9444b1f2
LP
1221 return 0;
1222}
1223
db999e0f
LP
1224static int reset_audit_loginuid(void) {
1225 _cleanup_free_ char *p = NULL;
1226 int r;
1227
1228 if (arg_share_system)
1229 return 0;
1230
1231 r = read_one_line_file("/proc/self/loginuid", &p);
1232 if (r == -EEXIST)
1233 return 0;
1234 if (r < 0) {
1235 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1236 return r;
1237 }
1238
1239 /* Already reset? */
1240 if (streq(p, "4294967295"))
1241 return 0;
1242
1243 r = write_string_file("/proc/self/loginuid", "4294967295");
1244 if (r < 0) {
1245 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1246 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1247 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1248 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1249 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 1250
db999e0f 1251 sleep(5);
77b6e194 1252 }
db999e0f
LP
1253
1254 return 0;
77b6e194
LP
1255}
1256
aa28aefe 1257static int move_network_interfaces(pid_t pid) {
cf6a8911 1258 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
1259 char **i;
1260 int r;
1261
1262 if (!arg_private_network)
1263 return 0;
1264
1265 if (strv_isempty(arg_network_interfaces))
1266 return 0;
1267
1268 r = sd_rtnl_open(NETLINK_ROUTE, &rtnl);
1269 if (r < 0) {
1270 log_error("Failed to connect to netlink: %s", strerror(-r));
1271 return r;
1272 }
1273
1274 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 1275 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
aa28aefe
LP
1276 unsigned ifi;
1277
1278 ifi = if_nametoindex(*i);
1279 if (ifi == 0) {
1280 log_error("Failed to resolve interface %s: %m", *i);
1281 return -errno;
1282 }
1283
d595c5cc 1284 r = sd_rtnl_message_new_link(RTM_NEWLINK, ifi, &m);
aa28aefe
LP
1285 if (r < 0) {
1286 log_error("Failed to allocate netlink message: %s", strerror(-r));
1287 return r;
1288 }
1289
1290 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1291 if (r < 0) {
1292 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1293 return r;
1294 }
1295
1296 r = sd_rtnl_call(rtnl, m, 0, NULL);
1297 if (r < 0) {
1298 log_error("Failed to move interface to namespace: %s", strerror(-r));
1299 return r;
1300 }
1301 }
1302
1303 return 0;
1304}
1305
88213476
LP
1306int main(int argc, char *argv[]) {
1307 pid_t pid = 0;
04d391da 1308 int r = EXIT_FAILURE, k;
354bfd2b 1309 _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
7027ff61 1310 int n_fd_passed;
a258bf26 1311 const char *console = NULL;
a258bf26 1312 sigset_t mask;
04d39279 1313 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
51d122af 1314 _cleanup_fdset_free_ FDSet *fds = NULL;
486e99a3 1315 _cleanup_free_ char *kdbus_domain = NULL;
88213476
LP
1316
1317 log_parse_environment();
1318 log_open();
1319
05947bef
LP
1320 k = parse_argv(argc, argv);
1321 if (k < 0)
88213476 1322 goto finish;
05947bef
LP
1323 else if (k == 0) {
1324 r = EXIT_SUCCESS;
1325 goto finish;
1326 }
88213476
LP
1327
1328 if (arg_directory) {
1329 char *p;
1330
1331 p = path_make_absolute_cwd(arg_directory);
1332 free(arg_directory);
1333 arg_directory = p;
1334 } else
1335 arg_directory = get_current_dir_name();
1336
1337 if (!arg_directory) {
a383724e 1338 log_error("Failed to determine path, please use -D.");
88213476
LP
1339 goto finish;
1340 }
1341
1342 path_kill_slashes(arg_directory);
1343
7027ff61 1344 if (!arg_machine) {
2b6bf07d 1345 arg_machine = strdup(basename(arg_directory));
7027ff61
LP
1346 if (!arg_machine) {
1347 log_oom();
1348 goto finish;
1349 }
1350
e724b063 1351 hostname_cleanup(arg_machine, false);
7027ff61
LP
1352 if (isempty(arg_machine)) {
1353 log_error("Failed to determine machine name automatically, please use -M.");
1354 goto finish;
1355 }
1356 }
1357
88213476
LP
1358 if (geteuid() != 0) {
1359 log_error("Need to be root.");
1360 goto finish;
1361 }
1362
04d391da
LP
1363 if (sd_booted() <= 0) {
1364 log_error("Not running on a systemd system.");
1365 goto finish;
1366 }
1367
88213476 1368 if (path_equal(arg_directory, "/")) {
6df6b939 1369 log_error("Spawning container on root directory not supported.");
88213476
LP
1370 goto finish;
1371 }
1372
fcf90586 1373 if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
f8964235 1374 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
88213476
LP
1375 goto finish;
1376 }
1377
842f3b0f
LP
1378 log_close();
1379 n_fd_passed = sd_listen_fds(false);
1380 if (n_fd_passed > 0) {
1381 k = fdset_new_listen_fds(&fds, false);
1382 if (k < 0) {
1383 log_error("Failed to collect file descriptors: %s", strerror(-k));
1384 goto finish;
1385 }
1386 }
1387 fdset_close_others(fds);
1388 log_open();
1389
db7feb7e
LP
1390 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1391 if (master < 0) {
a258bf26
LP
1392 log_error("Failed to acquire pseudo tty: %m");
1393 goto finish;
1394 }
1395
db7feb7e
LP
1396 console = ptsname(master);
1397 if (!console) {
a258bf26
LP
1398 log_error("Failed to determine tty name: %m");
1399 goto finish;
1400 }
1401
284c0b91
LP
1402 if (!arg_quiet)
1403 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
a258bf26
LP
1404
1405 if (unlockpt(master) < 0) {
1406 log_error("Failed to unlock tty: %m");
1407 goto finish;
1408 }
1409
eb91eb18
LP
1410
1411 if (access("/dev/kdbus/control", F_OK) >= 0) {
1412
1413 if (arg_share_system) {
1414 kdbus_domain = strdup("/dev/kdbus");
1415 if (!kdbus_domain) {
1416 log_oom();
1417 goto finish;
1418 }
1419 } else {
1420 const char *ns;
1421
1422 ns = strappenda("machine-", arg_machine);
1423 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1424 if (r < 0)
1425 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1426 else
1427 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1428 }
1429 }
9bd37b40 1430
e58a1277 1431 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
354bfd2b
LP
1432 log_error("Failed to create kmsg socket pair: %m");
1433 goto finish;
1434 }
1435
05947bef
LP
1436 sd_notify(0, "READY=1");
1437
a258bf26
LP
1438 assert_se(sigemptyset(&mask) == 0);
1439 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1440 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1441
d87be9b0
LP
1442 for (;;) {
1443 siginfo_t status;
a383724e 1444
40ddbdf8
LP
1445 sync_fd = eventfd(0, EFD_CLOEXEC);
1446 if (sync_fd < 0) {
1447 log_error("Failed to create event fd: %m");
1448 goto finish;
1449 }
1450
8a96d94e
LP
1451 pid = syscall(__NR_clone,
1452 SIGCHLD|CLONE_NEWNS|
1453 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1454 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
1455 if (pid < 0) {
1456 if (errno == EINVAL)
1457 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1458 else
1459 log_error("clone() failed: %m");
a258bf26 1460
d87be9b0
LP
1461 goto finish;
1462 }
a258bf26 1463
d87be9b0
LP
1464 if (pid == 0) {
1465 /* child */
d87be9b0
LP
1466 const char *home = NULL;
1467 uid_t uid = (uid_t) -1;
1468 gid_t gid = (gid_t) -1;
5674767e 1469 unsigned n_env = 2;
d87be9b0 1470 const char *envp[] = {
e10a55fd 1471 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
1472 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1473 NULL, /* TERM */
1474 NULL, /* HOME */
1475 NULL, /* USER */
1476 NULL, /* LOGNAME */
1477 NULL, /* container_uuid */
842f3b0f
LP
1478 NULL, /* LISTEN_FDS */
1479 NULL, /* LISTEN_PID */
d87be9b0
LP
1480 NULL
1481 };
f4889f65 1482 char **env_use;
354bfd2b 1483 eventfd_t x;
a258bf26 1484
5674767e
ZJS
1485 envp[n_env] = strv_find_prefix(environ, "TERM=");
1486 if (envp[n_env])
1487 n_env ++;
a258bf26 1488
d87be9b0 1489 close_nointr_nofail(master);
842f3b0f 1490 master = -1;
a258bf26 1491
d87be9b0
LP
1492 close_nointr(STDIN_FILENO);
1493 close_nointr(STDOUT_FILENO);
1494 close_nointr(STDERR_FILENO);
db7feb7e 1495
842f3b0f
LP
1496 close_nointr_nofail(kmsg_socket_pair[0]);
1497 kmsg_socket_pair[0] = -1;
a258bf26 1498
d87be9b0 1499 reset_all_signal_handlers();
88213476 1500
d87be9b0
LP
1501 assert_se(sigemptyset(&mask) == 0);
1502 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
f5c1b9ee 1503
842f3b0f
LP
1504 k = open_terminal(console, O_RDWR);
1505 if (k != STDIN_FILENO) {
1506 if (k >= 0) {
1507 close_nointr_nofail(k);
1508 k = -EINVAL;
1509 }
1510
1511 log_error("Failed to open console: %s", strerror(-k));
1512 goto child_fail;
1513 }
1514
1515 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1516 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1517 log_error("Failed to duplicate console: %m");
d87be9b0 1518 goto child_fail;
842f3b0f 1519 }
bc2f673e 1520
d87be9b0
LP
1521 if (setsid() < 0) {
1522 log_error("setsid() failed: %m");
bc2f673e
LP
1523 goto child_fail;
1524 }
1525
db999e0f
LP
1526 if (reset_audit_loginuid() < 0)
1527 goto child_fail;
1528
d87be9b0
LP
1529 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1530 log_error("PR_SET_PDEATHSIG failed: %m");
1531 goto child_fail;
1532 }
e58a1277 1533
d87be9b0
LP
1534 /* Mark everything as slave, so that we still
1535 * receive mounts from the real root, but don't
1536 * propagate mounts to the real root. */
1537 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1538 log_error("MS_SLAVE|MS_REC failed: %m");
1539 goto child_fail;
1540 }
04bc4a3f 1541
d87be9b0
LP
1542 /* Turn directory into bind mount */
1543 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1544 log_error("Failed to make bind mount.");
1545 goto child_fail;
1546 }
88213476 1547
d87be9b0
LP
1548 if (arg_read_only)
1549 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1550 log_error("Failed to make read-only.");
1551 goto child_fail;
1552 }
2547bb41 1553
d87be9b0
LP
1554 if (mount_all(arg_directory) < 0)
1555 goto child_fail;
57fb9fb5 1556
d87be9b0
LP
1557 if (copy_devnodes(arg_directory) < 0)
1558 goto child_fail;
a258bf26 1559
f2d88580
LP
1560 if (setup_ptmx(arg_directory) < 0)
1561 goto child_fail;
1562
d87be9b0 1563 dev_setup(arg_directory);
88213476 1564
d87be9b0
LP
1565 if (setup_dev_console(arg_directory, console) < 0)
1566 goto child_fail;
88213476 1567
d87be9b0
LP
1568 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1569 goto child_fail;
88213476 1570
d87be9b0 1571 close_nointr_nofail(kmsg_socket_pair[1]);
842f3b0f 1572 kmsg_socket_pair[1] = -1;
a258bf26 1573
d87be9b0
LP
1574 if (setup_boot_id(arg_directory) < 0)
1575 goto child_fail;
a41fe3a2 1576
d87be9b0
LP
1577 if (setup_timezone(arg_directory) < 0)
1578 goto child_fail;
88213476 1579
d87be9b0
LP
1580 if (setup_resolv_conf(arg_directory) < 0)
1581 goto child_fail;
687d0825 1582
d87be9b0 1583 if (setup_journal(arg_directory) < 0)
687d0825 1584 goto child_fail;
687d0825 1585
17fe0523
LP
1586 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1587 goto child_fail;
1588
1589 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1590 goto child_fail;
1591
486e99a3 1592 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
9bd37b40
LP
1593 goto child_fail;
1594
d87be9b0
LP
1595 if (chdir(arg_directory) < 0) {
1596 log_error("chdir(%s) failed: %m", arg_directory);
687d0825
MV
1597 goto child_fail;
1598 }
1599
d87be9b0
LP
1600 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1601 log_error("mount(MS_MOVE) failed: %m");
687d0825
MV
1602 goto child_fail;
1603 }
1604
d87be9b0
LP
1605 if (chroot(".") < 0) {
1606 log_error("chroot() failed: %m");
687d0825
MV
1607 goto child_fail;
1608 }
1609
d87be9b0
LP
1610 if (chdir("/") < 0) {
1611 log_error("chdir() failed: %m");
687d0825
MV
1612 goto child_fail;
1613 }
1614
d87be9b0
LP
1615 umask(0022);
1616
eb91eb18
LP
1617 if (arg_private_network)
1618 loopback_setup();
d87be9b0
LP
1619
1620 if (drop_capabilities() < 0) {
1621 log_error("drop_capabilities() failed: %m");
687d0825
MV
1622 goto child_fail;
1623 }
687d0825 1624
d87be9b0
LP
1625 if (arg_user) {
1626
963ddb91
LP
1627 /* Note that this resolves user names
1628 * inside the container, and hence
1629 * accesses the NSS modules from the
1630 * container and not the host. This is
1631 * a bit weird... */
1632
d87be9b0
LP
1633 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1634 log_error("get_user_creds() failed: %m");
1635 goto child_fail;
1636 }
1637
1638 if (mkdir_parents_label(home, 0775) < 0) {
1639 log_error("mkdir_parents_label() failed: %m");
1640 goto child_fail;
1641 }
1642
1643 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1644 log_error("mkdir_safe_label() failed: %m");
1645 goto child_fail;
1646 }
1647
1648 if (initgroups((const char*)arg_user, gid) < 0) {
1649 log_error("initgroups() failed: %m");
1650 goto child_fail;
1651 }
144f0fc0 1652
d87be9b0
LP
1653 if (setresgid(gid, gid, gid) < 0) {
1654 log_error("setregid() failed: %m");
1655 goto child_fail;
1656 }
1657
1658 if (setresuid(uid, uid, uid) < 0) {
1659 log_error("setreuid() failed: %m");
1660 goto child_fail;
1661 }
3c957acf
LP
1662 } else {
1663 /* Reset everything fully to 0, just in case */
1664
1665 if (setgroups(0, NULL) < 0) {
1666 log_error("setgroups() failed: %m");
1667 goto child_fail;
1668 }
1669
1670 if (setresgid(0, 0, 0) < 0) {
1671 log_error("setregid() failed: %m");
1672 goto child_fail;
1673 }
1674
1675 if (setresuid(0, 0, 0) < 0) {
1676 log_error("setreuid() failed: %m");
1677 goto child_fail;
1678 }
d87be9b0
LP
1679 }
1680
842f3b0f
LP
1681 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1682 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1683 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 1684 log_oom();
144f0fc0
LP
1685 goto child_fail;
1686 }
687d0825 1687
9444b1f2
LP
1688 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1689 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
842f3b0f
LP
1690 log_oom();
1691 goto child_fail;
1692 }
1693 }
1694
1695 if (fdset_size(fds) > 0) {
1696 k = fdset_cloexec(fds, false);
1697 if (k < 0) {
1698 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1699 goto child_fail;
1700 }
1701
1702 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 1703 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0
LP
1704 log_oom();
1705 goto child_fail;
1706 }
1707 }
1708
1709 setup_hostname();
1710
354bfd2b
LP
1711 eventfd_read(sync_fd, &x);
1712 close_nointr_nofail(sync_fd);
1713 sync_fd = -1;
1714
f4889f65
LP
1715 if (!strv_isempty(arg_setenv)) {
1716 char **n;
1717
1718 n = strv_env_merge(2, envp, arg_setenv);
1719 if (!n) {
1720 log_oom();
1721 goto child_fail;
1722 }
1723
1724 env_use = n;
1725 } else
1726 env_use = (char**) envp;
1727
5d63309c 1728#ifdef HAVE_SELINUX
82adf6af
LP
1729 if (arg_selinux_context)
1730 if (setexeccon(arg_selinux_context) < 0)
1731 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
a8828ed9 1732#endif
d87be9b0
LP
1733 if (arg_boot) {
1734 char **a;
1735 size_t l;
88213476 1736
d87be9b0 1737 /* Automatically search for the init system */
0f0dbc46 1738
d87be9b0
LP
1739 l = 1 + argc - optind;
1740 a = newa(char*, l + 1);
1741 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 1742
d87be9b0 1743 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 1744 execve(a[0], a, env_use);
0f0dbc46 1745
d87be9b0 1746 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 1747 execve(a[0], a, env_use);
0f0dbc46 1748
d87be9b0 1749 a[0] = (char*) "/sbin/init";
f4889f65 1750 execve(a[0], a, env_use);
d87be9b0 1751 } else if (argc > optind)
f4889f65 1752 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
1753 else {
1754 chdir(home ? home : "/root");
f4889f65 1755 execle("/bin/bash", "-bash", NULL, env_use);
d87be9b0
LP
1756 }
1757
1758 log_error("execv() failed: %m");
0f0dbc46 1759
d87be9b0
LP
1760 child_fail:
1761 _exit(EXIT_FAILURE);
da5b3bad 1762 }
88213476 1763
842f3b0f
LP
1764 fdset_free(fds);
1765 fds = NULL;
1766
354bfd2b
LP
1767 r = register_machine(pid);
1768 if (r < 0)
1769 goto finish;
1770
aa28aefe
LP
1771 r = move_network_interfaces(pid);
1772 if (r < 0)
1773 goto finish;
1774
354bfd2b
LP
1775 eventfd_write(sync_fd, 1);
1776 close_nointr_nofail(sync_fd);
1777 sync_fd = -1;
1778
04d39279
LP
1779 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1780 if (k < 0) {
1781 r = EXIT_FAILURE;
1782 break;
1783 }
88213476 1784
284c0b91
LP
1785 if (!arg_quiet)
1786 putc('\n', stdout);
04d39279
LP
1787
1788 /* Kill if it is not dead yet anyway */
1f0cd86b
LP
1789 terminate_machine(pid);
1790
1791 /* Redundant, but better safe than sorry */
04d39279 1792 kill(pid, SIGKILL);
a258bf26 1793
05947bef 1794 k = wait_for_terminate(pid, &status);
04d39279
LP
1795 pid = 0;
1796
05947bef 1797 if (k < 0) {
d87be9b0
LP
1798 r = EXIT_FAILURE;
1799 break;
1800 }
a258bf26 1801
d87be9b0 1802 if (status.si_code == CLD_EXITED) {
a5f5f8a0 1803 r = status.si_status;
d87be9b0 1804 if (status.si_status != 0) {
04d39279 1805 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
d87be9b0
LP
1806 break;
1807 }
1808
284c0b91
LP
1809 if (!arg_quiet)
1810 log_debug("Container %s exited successfully.", arg_machine);
d87be9b0
LP
1811 break;
1812 } else if (status.si_code == CLD_KILLED &&
1813 status.si_status == SIGINT) {
284c0b91
LP
1814
1815 if (!arg_quiet)
1816 log_info("Container %s has been shut down.", arg_machine);
d87be9b0
LP
1817 r = 0;
1818 break;
1819 } else if (status.si_code == CLD_KILLED &&
1820 status.si_status == SIGHUP) {
284c0b91
LP
1821
1822 if (!arg_quiet)
1823 log_info("Container %s is being rebooted.", arg_machine);
d87be9b0
LP
1824 continue;
1825 } else if (status.si_code == CLD_KILLED ||
1826 status.si_code == CLD_DUMPED) {
88213476 1827
eb91eb18 1828 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
d87be9b0
LP
1829 r = EXIT_FAILURE;
1830 break;
1831 } else {
04d39279 1832 log_error("Container %s failed due to unknown reason.", arg_machine);
d87be9b0
LP
1833 r = EXIT_FAILURE;
1834 break;
1835 }
1836 }
88213476
LP
1837
1838finish:
9444b1f2
LP
1839 if (pid > 0)
1840 kill(pid, SIGKILL);
88213476 1841
04d391da 1842 free(arg_directory);
7027ff61 1843 free(arg_machine);
f4889f65 1844 free(arg_setenv);
aa28aefe 1845 free(arg_network_interfaces);
88213476
LP
1846
1847 return r;
1848}