]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
shared: add process-util.[ch]
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
88213476 26#include <sys/mount.h>
88213476
LP
27#include <stdlib.h>
28#include <string.h>
29#include <stdio.h>
30#include <errno.h>
31#include <sys/prctl.h>
88213476 32#include <getopt.h>
687d0825 33#include <grp.h>
5ed27dbd 34#include <linux/fs.h>
9537eab0 35#include <sys/socket.h>
aea38d80 36#include <linux/netlink.h>
aa28aefe 37#include <net/if.h>
69c79d3c 38#include <linux/veth.h>
6afc95b7 39#include <sys/personality.h>
1b9e5b12 40#include <linux/loop.h>
2fbe4296 41#include <sys/file.h>
aa28aefe 42
5d63309c 43#ifdef HAVE_SELINUX
a8828ed9
DW
44#include <selinux/selinux.h>
45#endif
88213476 46
24fb1112
LP
47#ifdef HAVE_SECCOMP
48#include <seccomp.h>
49#endif
50
1b9e5b12
LP
51#ifdef HAVE_BLKID
52#include <blkid/blkid.h>
53#endif
54
1f0cd86b
LP
55#include "sd-daemon.h"
56#include "sd-bus.h"
57#include "sd-id128.h"
aa28aefe 58#include "sd-rtnl.h"
88213476
LP
59#include "log.h"
60#include "util.h"
49e942b2 61#include "mkdir.h"
c6878637 62#include "rm-rf.h"
6b2d0e85 63#include "macro.h"
94d82985 64#include "missing.h"
04d391da 65#include "cgroup-util.h"
a258bf26 66#include "strv.h"
9eb977db 67#include "path-util.h"
a41fe3a2 68#include "loopback-setup.h"
4fc9982c 69#include "dev-setup.h"
842f3b0f 70#include "fdset.h"
acbeb427 71#include "build.h"
a5c32cff 72#include "fileio.h"
40ca29a1 73#include "bus-util.h"
1f0cd86b 74#include "bus-error.h"
4ba93280 75#include "ptyfwd.h"
f4889f65 76#include "env-util.h"
aa28aefe 77#include "rtnl-util.h"
7e227024 78#include "udev-util.h"
1b9e5b12
LP
79#include "blkid-util.h"
80#include "gpt.h"
01dde061 81#include "siphash24.h"
849958d1 82#include "copy.h"
3577de7a 83#include "base-filesystem.h"
a2da110b 84#include "barrier.h"
023fb90b 85#include "event-util.h"
f01ae826 86#include "capability.h"
2822da4f 87#include "cap-list.h"
ec16945e 88#include "btrfs-util.h"
1b9cebf6 89#include "machine-image.h"
6d0b55c2
LP
90#include "list.h"
91#include "in-addr-util.h"
92#include "fw-util.h"
93#include "local-addresses.h"
6482f626 94#include "formats-util.h"
0b452006 95#include "process-util.h"
f2d88580 96
e9642be2
LP
97#ifdef HAVE_SECCOMP
98#include "seccomp-util.h"
99#endif
100
6d0b55c2
LP
101typedef struct ExposePort {
102 int protocol;
103 uint16_t host_port;
104 uint16_t container_port;
105 LIST_FIELDS(struct ExposePort, ports);
106} ExposePort;
107
113cea80
DH
/* How a container run ended.
 * NOTE(review): not referenced in this part of the file; meaning taken from the names only. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
112
57fb9fb5
LP
/* Journal linking mode selected with --link-journal= (or -j). The "try-"
 * variants map to LINK_GUEST/LINK_HOST plus the arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto (the default) */
        LINK_HOST  = 2,  /* --link-journal=host / try-host */
        LINK_GUEST = 3   /* --link-journal=guest / try-guest / -j */
} LinkJournal;
88213476 119
4d9f07b4
LP
/* Volatile mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,  /* option absent, or a false boolean argument */
        VOLATILE_YES   = 1,  /* bare --volatile, or a true boolean argument */
        VOLATILE_STATE = 2,  /* --volatile=state */
} Volatile;
125
88213476 126static char *arg_directory = NULL;
ec16945e 127static char *arg_template = NULL;
687d0825 128static char *arg_user = NULL;
9444b1f2 129static sd_id128_t arg_uuid = {};
7027ff61 130static char *arg_machine = NULL;
c74e630d
LP
131static const char *arg_selinux_context = NULL;
132static const char *arg_selinux_apifs_context = NULL;
9444b1f2 133static const char *arg_slice = NULL;
ff01d048 134static bool arg_private_network = false;
bc2f673e 135static bool arg_read_only = false;
0f0dbc46 136static bool arg_boot = false;
ec16945e 137static bool arg_ephemeral = false;
57fb9fb5 138static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 139static bool arg_link_journal_try = false;
5076f0cc
LP
140static uint64_t arg_retain =
141 (1ULL << CAP_CHOWN) |
142 (1ULL << CAP_DAC_OVERRIDE) |
143 (1ULL << CAP_DAC_READ_SEARCH) |
144 (1ULL << CAP_FOWNER) |
145 (1ULL << CAP_FSETID) |
146 (1ULL << CAP_IPC_OWNER) |
147 (1ULL << CAP_KILL) |
148 (1ULL << CAP_LEASE) |
149 (1ULL << CAP_LINUX_IMMUTABLE) |
150 (1ULL << CAP_NET_BIND_SERVICE) |
151 (1ULL << CAP_NET_BROADCAST) |
152 (1ULL << CAP_NET_RAW) |
153 (1ULL << CAP_SETGID) |
154 (1ULL << CAP_SETFCAP) |
155 (1ULL << CAP_SETPCAP) |
156 (1ULL << CAP_SETUID) |
157 (1ULL << CAP_SYS_ADMIN) |
158 (1ULL << CAP_SYS_CHROOT) |
159 (1ULL << CAP_SYS_NICE) |
160 (1ULL << CAP_SYS_PTRACE) |
161 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 162 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
163 (1ULL << CAP_SYS_BOOT) |
164 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
165 (1ULL << CAP_AUDIT_CONTROL) |
166 (1ULL << CAP_MKNOD);
17fe0523
LP
167static char **arg_bind = NULL;
168static char **arg_bind_ro = NULL;
06c17c39 169static char **arg_tmpfs = NULL;
f4889f65 170static char **arg_setenv = NULL;
284c0b91 171static bool arg_quiet = false;
8a96d94e 172static bool arg_share_system = false;
eb91eb18 173static bool arg_register = true;
89f7c846 174static bool arg_keep_unit = false;
aa28aefe 175static char **arg_network_interfaces = NULL;
c74e630d 176static char **arg_network_macvlan = NULL;
4bbfe7ad 177static char **arg_network_ipvlan = NULL;
69c79d3c 178static bool arg_network_veth = false;
c74e630d 179static const char *arg_network_bridge = NULL;
6afc95b7 180static unsigned long arg_personality = 0xffffffffLU;
ec16945e 181static char *arg_image = NULL;
4d9f07b4 182static Volatile arg_volatile = VOLATILE_NO;
6d0b55c2 183static ExposePort *arg_expose_ports = NULL;
f36933fe 184static char **arg_property = NULL;
6dac160c
LP
185static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
186static bool arg_userns = false;
c6c8f6e2 187static int arg_kill_signal = 0;
88213476 188
/* Print the usage summary to stdout.
 *
 * Layout: the option column starts at column 2 (short option) or column 5
 * (long option only); descriptions start at column 28. Options too long for
 * the option column get their description on the following line(s). */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "     --template=PATH        Initialize root directory from template directory,\n"
               "                            if missing\n"
               "  -x --ephemeral            Run container with snapshot of root directory, and\n"
               "                            remove it after exit\n"
               "  -i --image=PATH           File system device or disk image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --property=NAME=VALUE  Set scope unit property\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-ipvlan=INTERFACE\n"
               "                            Create an ipvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "  -n --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "     --private-users[=UIDBASE[:NUIDS]]\n"
               "                            Run within user namespace\n"
               "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               "                            Expose a container IP port on the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
               "                            try-guest, try-host\n"
               "  -j                        Equivalent to --link-journal=try-guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n"
               , program_invocation_short_name);
}
253
ec16945e
LP
/* Replace *b with a cleaned-up, absolute form of 'path'.
 *
 * The path is resolved with canonicalize_file_name(). If that fails only
 * because the path does not exist (ENOENT), it is instead made absolute
 * relative to the current working directory. The previous value of *b is
 * freed.
 *
 * Returns 0 on success, -ENOMEM if the fallback allocation fails, or the
 * negated errno of canonicalize_file_name() for any other failure. On failure
 * *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is final */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
274
88213476
LP
275static int parse_argv(int argc, char *argv[]) {
276
a41fe3a2 277 enum {
acbeb427
ZJS
278 ARG_VERSION = 0x100,
279 ARG_PRIVATE_NETWORK,
bc2f673e 280 ARG_UUID,
5076f0cc 281 ARG_READ_ONLY,
57fb9fb5 282 ARG_CAPABILITY,
420c7379 283 ARG_DROP_CAPABILITY,
17fe0523
LP
284 ARG_LINK_JOURNAL,
285 ARG_BIND,
f4889f65 286 ARG_BIND_RO,
06c17c39 287 ARG_TMPFS,
f4889f65 288 ARG_SETENV,
eb91eb18 289 ARG_SHARE_SYSTEM,
89f7c846 290 ARG_REGISTER,
aa28aefe 291 ARG_KEEP_UNIT,
69c79d3c 292 ARG_NETWORK_INTERFACE,
c74e630d 293 ARG_NETWORK_MACVLAN,
4bbfe7ad 294 ARG_NETWORK_IPVLAN,
ab046dde 295 ARG_NETWORK_BRIDGE,
6afc95b7 296 ARG_PERSONALITY,
4d9f07b4 297 ARG_VOLATILE,
ec16945e 298 ARG_TEMPLATE,
f36933fe 299 ARG_PROPERTY,
6dac160c 300 ARG_PRIVATE_USERS,
c6c8f6e2 301 ARG_KILL_SIGNAL,
a41fe3a2
LP
302 };
303
88213476 304 static const struct option options[] = {
aa28aefe
LP
305 { "help", no_argument, NULL, 'h' },
306 { "version", no_argument, NULL, ARG_VERSION },
307 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
308 { "template", required_argument, NULL, ARG_TEMPLATE },
309 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
310 { "user", required_argument, NULL, 'u' },
311 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
312 { "boot", no_argument, NULL, 'b' },
313 { "uuid", required_argument, NULL, ARG_UUID },
314 { "read-only", no_argument, NULL, ARG_READ_ONLY },
315 { "capability", required_argument, NULL, ARG_CAPABILITY },
316 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
317 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
318 { "bind", required_argument, NULL, ARG_BIND },
319 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 320 { "tmpfs", required_argument, NULL, ARG_TMPFS },
aa28aefe
LP
321 { "machine", required_argument, NULL, 'M' },
322 { "slice", required_argument, NULL, 'S' },
323 { "setenv", required_argument, NULL, ARG_SETENV },
324 { "selinux-context", required_argument, NULL, 'Z' },
325 { "selinux-apifs-context", required_argument, NULL, 'L' },
326 { "quiet", no_argument, NULL, 'q' },
327 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
328 { "register", required_argument, NULL, ARG_REGISTER },
329 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
330 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 331 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 332 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 333 { "network-veth", no_argument, NULL, 'n' },
ab046dde 334 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 335 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 336 { "image", required_argument, NULL, 'i' },
4d9f07b4 337 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 338 { "port", required_argument, NULL, 'p' },
f36933fe 339 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 340 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 341 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
eb9da376 342 {}
88213476
LP
343 };
344
9444b1f2 345 int c, r;
a42c8b54 346 uint64_t plus = 0, minus = 0;
88213476
LP
347
348 assert(argc >= 0);
349 assert(argv);
350
0dfaa006 351 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
352
353 switch (c) {
354
355 case 'h':
601185b4
ZJS
356 help();
357 return 0;
88213476 358
acbeb427
ZJS
359 case ARG_VERSION:
360 puts(PACKAGE_STRING);
361 puts(SYSTEMD_FEATURES);
362 return 0;
363
88213476 364 case 'D':
ec16945e
LP
365 r = set_sanitized_path(&arg_directory, optarg);
366 if (r < 0)
367 return log_error_errno(r, "Invalid root directory: %m");
368
369 break;
370
371 case ARG_TEMPLATE:
372 r = set_sanitized_path(&arg_template, optarg);
373 if (r < 0)
374 return log_error_errno(r, "Invalid template directory: %m");
88213476
LP
375
376 break;
377
1b9e5b12 378 case 'i':
ec16945e
LP
379 r = set_sanitized_path(&arg_image, optarg);
380 if (r < 0)
381 return log_error_errno(r, "Invalid image path: %m");
382
383 break;
384
385 case 'x':
386 arg_ephemeral = true;
1b9e5b12
LP
387 break;
388
687d0825
MV
389 case 'u':
390 free(arg_user);
7027ff61
LP
391 arg_user = strdup(optarg);
392 if (!arg_user)
393 return log_oom();
687d0825
MV
394
395 break;
396
ab046dde 397 case ARG_NETWORK_BRIDGE:
c74e630d 398 arg_network_bridge = optarg;
ab046dde
TG
399
400 /* fall through */
401
0dfaa006 402 case 'n':
69c79d3c
LP
403 arg_network_veth = true;
404 arg_private_network = true;
405 break;
406
aa28aefe 407 case ARG_NETWORK_INTERFACE:
c74e630d
LP
408 if (strv_extend(&arg_network_interfaces, optarg) < 0)
409 return log_oom();
410
411 arg_private_network = true;
412 break;
413
414 case ARG_NETWORK_MACVLAN:
415 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
416 return log_oom();
417
4bbfe7ad
TG
418 arg_private_network = true;
419 break;
420
421 case ARG_NETWORK_IPVLAN:
422 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
423 return log_oom();
424
aa28aefe
LP
425 /* fall through */
426
ff01d048
LP
427 case ARG_PRIVATE_NETWORK:
428 arg_private_network = true;
a41fe3a2
LP
429 break;
430
0f0dbc46
LP
431 case 'b':
432 arg_boot = true;
433 break;
434
144f0fc0 435 case ARG_UUID:
9444b1f2
LP
436 r = sd_id128_from_string(optarg, &arg_uuid);
437 if (r < 0) {
aa96c6cb 438 log_error("Invalid UUID: %s", optarg);
9444b1f2 439 return r;
aa96c6cb 440 }
9444b1f2 441 break;
aa96c6cb 442
9444b1f2 443 case 'S':
c74e630d 444 arg_slice = optarg;
144f0fc0
LP
445 break;
446
7027ff61 447 case 'M':
eb91eb18
LP
448 if (isempty(optarg)) {
449 free(arg_machine);
450 arg_machine = NULL;
451 } else {
0c3c4284 452 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
453 log_error("Invalid machine name: %s", optarg);
454 return -EINVAL;
455 }
7027ff61 456
0c3c4284
LP
457 r = free_and_strdup(&arg_machine, optarg);
458 if (r < 0)
eb91eb18
LP
459 return log_oom();
460
461 break;
462 }
7027ff61 463
82adf6af
LP
464 case 'Z':
465 arg_selinux_context = optarg;
a8828ed9
DW
466 break;
467
82adf6af
LP
468 case 'L':
469 arg_selinux_apifs_context = optarg;
a8828ed9
DW
470 break;
471
bc2f673e
LP
472 case ARG_READ_ONLY:
473 arg_read_only = true;
474 break;
475
420c7379
LP
476 case ARG_CAPABILITY:
477 case ARG_DROP_CAPABILITY: {
a2a5291b 478 const char *state, *word;
5076f0cc
LP
479 size_t length;
480
481 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 482 _cleanup_free_ char *t;
5076f0cc
LP
483
484 t = strndup(word, length);
0d0f0c50
SL
485 if (!t)
486 return log_oom();
5076f0cc 487
39ed67d1
LP
488 if (streq(t, "all")) {
489 if (c == ARG_CAPABILITY)
a42c8b54 490 plus = (uint64_t) -1;
39ed67d1 491 else
a42c8b54 492 minus = (uint64_t) -1;
39ed67d1 493 } else {
2822da4f
LP
494 int cap;
495
496 cap = capability_from_name(t);
497 if (cap < 0) {
39ed67d1
LP
498 log_error("Failed to parse capability %s.", t);
499 return -EINVAL;
500 }
501
502 if (c == ARG_CAPABILITY)
a42c8b54 503 plus |= 1ULL << (uint64_t) cap;
39ed67d1 504 else
a42c8b54 505 minus |= 1ULL << (uint64_t) cap;
5076f0cc 506 }
5076f0cc
LP
507 }
508
509 break;
510 }
511
57fb9fb5
LP
512 case 'j':
513 arg_link_journal = LINK_GUEST;
574edc90 514 arg_link_journal_try = true;
57fb9fb5
LP
515 break;
516
517 case ARG_LINK_JOURNAL:
53e438e3 518 if (streq(optarg, "auto")) {
57fb9fb5 519 arg_link_journal = LINK_AUTO;
53e438e3
LP
520 arg_link_journal_try = false;
521 } else if (streq(optarg, "no")) {
57fb9fb5 522 arg_link_journal = LINK_NO;
53e438e3
LP
523 arg_link_journal_try = false;
524 } else if (streq(optarg, "guest")) {
57fb9fb5 525 arg_link_journal = LINK_GUEST;
53e438e3
LP
526 arg_link_journal_try = false;
527 } else if (streq(optarg, "host")) {
57fb9fb5 528 arg_link_journal = LINK_HOST;
53e438e3
LP
529 arg_link_journal_try = false;
530 } else if (streq(optarg, "try-guest")) {
574edc90
MP
531 arg_link_journal = LINK_GUEST;
532 arg_link_journal_try = true;
533 } else if (streq(optarg, "try-host")) {
534 arg_link_journal = LINK_HOST;
535 arg_link_journal_try = true;
536 } else {
57fb9fb5
LP
537 log_error("Failed to parse link journal mode %s", optarg);
538 return -EINVAL;
539 }
540
541 break;
542
17fe0523
LP
543 case ARG_BIND:
544 case ARG_BIND_RO: {
545 _cleanup_free_ char *a = NULL, *b = NULL;
546 char *e;
547 char ***x;
17fe0523
LP
548
549 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
550
551 e = strchr(optarg, ':');
552 if (e) {
553 a = strndup(optarg, e - optarg);
554 b = strdup(e + 1);
555 } else {
556 a = strdup(optarg);
557 b = strdup(optarg);
558 }
559
560 if (!a || !b)
561 return log_oom();
562
563 if (!path_is_absolute(a) || !path_is_absolute(b)) {
564 log_error("Invalid bind mount specification: %s", optarg);
565 return -EINVAL;
566 }
567
568 r = strv_extend(x, a);
569 if (r < 0)
b3451bed 570 return log_oom();
17fe0523
LP
571
572 r = strv_extend(x, b);
573 if (r < 0)
b3451bed 574 return log_oom();
17fe0523
LP
575
576 break;
577 }
578
06c17c39
LP
579 case ARG_TMPFS: {
580 _cleanup_free_ char *a = NULL, *b = NULL;
581 char *e;
582
583 e = strchr(optarg, ':');
584 if (e) {
585 a = strndup(optarg, e - optarg);
586 b = strdup(e + 1);
587 } else {
588 a = strdup(optarg);
589 b = strdup("mode=0755");
590 }
591
592 if (!a || !b)
593 return log_oom();
594
595 if (!path_is_absolute(a)) {
596 log_error("Invalid tmpfs specification: %s", optarg);
597 return -EINVAL;
598 }
599
600 r = strv_push(&arg_tmpfs, a);
601 if (r < 0)
602 return log_oom();
603
604 a = NULL;
605
606 r = strv_push(&arg_tmpfs, b);
607 if (r < 0)
608 return log_oom();
609
610 b = NULL;
611
612 break;
613 }
614
f4889f65
LP
615 case ARG_SETENV: {
616 char **n;
617
618 if (!env_assignment_is_valid(optarg)) {
619 log_error("Environment variable assignment '%s' is not valid.", optarg);
620 return -EINVAL;
621 }
622
623 n = strv_env_set(arg_setenv, optarg);
624 if (!n)
625 return log_oom();
626
627 strv_free(arg_setenv);
628 arg_setenv = n;
629 break;
630 }
631
284c0b91
LP
632 case 'q':
633 arg_quiet = true;
634 break;
635
8a96d94e
LP
636 case ARG_SHARE_SYSTEM:
637 arg_share_system = true;
638 break;
639
eb91eb18
LP
640 case ARG_REGISTER:
641 r = parse_boolean(optarg);
642 if (r < 0) {
643 log_error("Failed to parse --register= argument: %s", optarg);
644 return r;
645 }
646
647 arg_register = r;
648 break;
649
89f7c846
LP
650 case ARG_KEEP_UNIT:
651 arg_keep_unit = true;
652 break;
653
6afc95b7
LP
654 case ARG_PERSONALITY:
655
ac45f971 656 arg_personality = personality_from_string(optarg);
6afc95b7
LP
657 if (arg_personality == 0xffffffffLU) {
658 log_error("Unknown or unsupported personality '%s'.", optarg);
659 return -EINVAL;
660 }
661
662 break;
663
4d9f07b4
LP
664 case ARG_VOLATILE:
665
666 if (!optarg)
667 arg_volatile = VOLATILE_YES;
668 else {
669 r = parse_boolean(optarg);
670 if (r < 0) {
671 if (streq(optarg, "state"))
672 arg_volatile = VOLATILE_STATE;
673 else {
674 log_error("Failed to parse --volatile= argument: %s", optarg);
675 return r;
676 }
677 } else
678 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
679 }
680
681 break;
682
6d0b55c2
LP
683 case 'p': {
684 const char *split, *e;
685 uint16_t container_port, host_port;
686 int protocol;
687 ExposePort *p;
688
689 if ((e = startswith(optarg, "tcp:")))
690 protocol = IPPROTO_TCP;
691 else if ((e = startswith(optarg, "udp:")))
692 protocol = IPPROTO_UDP;
693 else {
694 e = optarg;
695 protocol = IPPROTO_TCP;
696 }
697
698 split = strchr(e, ':');
699 if (split) {
700 char v[split - e + 1];
701
702 memcpy(v, e, split - e);
703 v[split - e] = 0;
704
705 r = safe_atou16(v, &host_port);
706 if (r < 0 || host_port <= 0) {
707 log_error("Failed to parse host port: %s", optarg);
708 return -EINVAL;
709 }
710
711 r = safe_atou16(split + 1, &container_port);
712 } else {
713 r = safe_atou16(e, &container_port);
714 host_port = container_port;
715 }
716
717 if (r < 0 || container_port <= 0) {
718 log_error("Failed to parse host port: %s", optarg);
719 return -EINVAL;
720 }
721
722 LIST_FOREACH(ports, p, arg_expose_ports) {
723 if (p->protocol == protocol && p->host_port == host_port) {
724 log_error("Duplicate port specification: %s", optarg);
725 return -EINVAL;
726 }
727 }
728
729 p = new(ExposePort, 1);
730 if (!p)
731 return log_oom();
732
733 p->protocol = protocol;
734 p->host_port = host_port;
735 p->container_port = container_port;
736
737 LIST_PREPEND(ports, arg_expose_ports, p);
738
739 break;
740 }
741
f36933fe
LP
742 case ARG_PROPERTY:
743 if (strv_extend(&arg_property, optarg) < 0)
744 return log_oom();
745
746 break;
747
6dac160c
LP
748 case ARG_PRIVATE_USERS:
749 if (optarg) {
750 _cleanup_free_ char *buffer = NULL;
751 const char *range, *shift;
752
753 range = strchr(optarg, ':');
754 if (range) {
755 buffer = strndup(optarg, range - optarg);
756 if (!buffer)
757 return log_oom();
758 shift = buffer;
759
760 range++;
761 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
762 log_error("Failed to parse UID range: %s", range);
763 return -EINVAL;
764 }
765 } else
766 shift = optarg;
767
768 if (parse_uid(shift, &arg_uid_shift) < 0) {
769 log_error("Failed to parse UID: %s", optarg);
770 return -EINVAL;
771 }
772 }
773
774 arg_userns = true;
775 break;
776
c6c8f6e2
LP
777 case ARG_KILL_SIGNAL:
778 arg_kill_signal = signal_from_string_try_harder(optarg);
779 if (arg_kill_signal < 0) {
780 log_error("Cannot parse signal: %s", optarg);
781 return -EINVAL;
782 }
783
784 break;
785
88213476
LP
786 case '?':
787 return -EINVAL;
788
789 default:
eb9da376 790 assert_not_reached("Unhandled option");
88213476 791 }
88213476 792
eb91eb18
LP
793 if (arg_share_system)
794 arg_register = false;
795
796 if (arg_boot && arg_share_system) {
797 log_error("--boot and --share-system may not be combined.");
798 return -EINVAL;
799 }
800
89f7c846
LP
801 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
802 log_error("--keep-unit may not be used when invoked from a user session.");
803 return -EINVAL;
804 }
805
1b9e5b12
LP
806 if (arg_directory && arg_image) {
807 log_error("--directory= and --image= may not be combined.");
808 return -EINVAL;
809 }
810
ec16945e
LP
811 if (arg_template && arg_image) {
812 log_error("--template= and --image= may not be combined.");
813 return -EINVAL;
814 }
815
816 if (arg_template && !(arg_directory || arg_machine)) {
817 log_error("--template= needs --directory= or --machine=.");
818 return -EINVAL;
819 }
820
821 if (arg_ephemeral && arg_template) {
822 log_error("--ephemeral and --template= may not be combined.");
823 return -EINVAL;
824 }
825
826 if (arg_ephemeral && arg_image) {
827 log_error("--ephemeral and --image= may not be combined.");
828 return -EINVAL;
829 }
830
df9a75e4
LP
831 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
832 log_error("--ephemeral and --link-journal= may not be combined.");
833 return -EINVAL;
834 }
835
4d9f07b4
LP
836 if (arg_volatile != VOLATILE_NO && arg_read_only) {
837 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
838 return -EINVAL;
839 }
840
6d0b55c2
LP
841 if (arg_expose_ports && !arg_private_network) {
842 log_error("Cannot use --port= without private networking.");
843 return -EINVAL;
844 }
845
a42c8b54
LP
846 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
847
c6c8f6e2
LP
848 if (arg_boot && arg_kill_signal <= 0)
849 arg_kill_signal = SIGRTMIN+3;
850
88213476
LP
851 return 1;
852}
853
854static int mount_all(const char *dest) {
855
856 typedef struct MountPoint {
857 const char *what;
858 const char *where;
859 const char *type;
860 const char *options;
861 unsigned long flags;
3bd66c05 862 bool fatal;
88213476
LP
863 } MountPoint;
864
865 static const MountPoint mount_table[] = {
06c17c39
LP
866 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
867 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
868 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
869 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
870 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 871 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
06c17c39
LP
872 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
873 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
bbb99c30 874 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
9b634ea5 875#ifdef HAVE_SELINUX
06c17c39
LP
876 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
877 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 878#endif
88213476
LP
879 };
880
881 unsigned k;
882 int r = 0;
883
884 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
d15d65a0 885 _cleanup_free_ char *where = NULL, *options = NULL;
d002827b 886 const char *o;
88213476
LP
887 int t;
888
17fe0523
LP
889 where = strjoin(dest, "/", mount_table[k].where, NULL);
890 if (!where)
891 return log_oom();
88213476 892
e65aec12 893 t = path_is_mount_point(where, true);
da00518b 894 if (t < 0 && t != -ENOENT) {
da927ba9 895 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
88213476
LP
896
897 if (r == 0)
898 r = t;
899
900 continue;
901 }
902
9c1c7f71
LP
903 /* Skip this entry if it is not a remount. */
904 if (mount_table[k].what && t > 0)
014a9c77
LP
905 continue;
906
79d80fc1
TG
907 t = mkdir_p(where, 0755);
908 if (t < 0) {
909 if (mount_table[k].fatal) {
da927ba9 910 log_error_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
911
912 if (r == 0)
913 r = t;
914 } else
da927ba9 915 log_warning_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
916
917 continue;
918 }
88213476 919
a8828ed9 920#ifdef HAVE_SELINUX
82adf6af
LP
921 if (arg_selinux_apifs_context &&
922 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
923 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
924 if (!options)
925 return log_oom();
926
927 o = options;
928 } else
a8828ed9 929#endif
d002827b 930 o = mount_table[k].options;
a8828ed9 931
6dac160c
LP
932 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
933 char *uid_options = NULL;
934
935 if (o)
936 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
937 else
938 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
939 if (!uid_options)
940 return log_oom();
941
942 free(options);
943 o = options = uid_options;
944 }
a8828ed9 945
88213476
LP
946 if (mount(mount_table[k].what,
947 where,
948 mount_table[k].type,
949 mount_table[k].flags,
79d80fc1 950 o) < 0) {
88213476 951
79d80fc1 952 if (mount_table[k].fatal) {
56f64d95 953 log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 954
79d80fc1
TG
955 if (r == 0)
956 r = -errno;
957 } else
56f64d95 958 log_warning_errno(errno, "mount(%s) failed: %m", where);
88213476 959 }
88213476
LP
960 }
961
e58a1277
LP
962 return r;
963}
f8440af5 964
d6797c92 965static int mount_binds(const char *dest, char **l, bool ro) {
17fe0523
LP
966 char **x, **y;
967
968 STRV_FOREACH_PAIR(x, y, l) {
06c17c39 969 _cleanup_free_ char *where = NULL;
d2421337 970 struct stat source_st, dest_st;
2ed4e5e0 971 int r;
d2421337 972
4a62c710
MS
973 if (stat(*x, &source_st) < 0)
974 return log_error_errno(errno, "Failed to stat %s: %m", *x);
17fe0523 975
06c17c39
LP
976 where = strappend(dest, *y);
977 if (!where)
978 return log_oom();
979
2ed4e5e0
SL
980 r = stat(where, &dest_st);
981 if (r == 0) {
05e7da5a
AC
982 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
983 log_error("Cannot bind mount directory %s on file %s.", *x, where);
984 return -EINVAL;
985 }
986 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
987 log_error("Cannot bind mount file %s on directory %s.", *x, where);
d2421337
DR
988 return -EINVAL;
989 }
2ed4e5e0
SL
990 } else if (errno == ENOENT) {
991 r = mkdir_parents_label(where, 0755);
f647962d
MS
992 if (r < 0)
993 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
2ed4e5e0 994 } else {
56f64d95 995 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
2ed4e5e0
SL
996 return -errno;
997 }
06c17c39 998
05e7da5a
AC
999 /* Create the mount point. Any non-directory file can be
1000 * mounted on any non-directory file (regular, fifo, socket,
1001 * char, block).
1002 */
79d80fc1
TG
1003 if (S_ISDIR(source_st.st_mode)) {
1004 r = mkdir_label(where, 0755);
f647962d
MS
1005 if (r < 0 && errno != EEXIST)
1006 return log_error_errno(r, "Failed to create mount point %s: %m", where);
05e7da5a 1007 } else {
79d80fc1 1008 r = touch(where);
f647962d
MS
1009 if (r < 0)
1010 return log_error_errno(r, "Failed to create mount point %s: %m", where);
d2421337 1011 }
17fe0523 1012
4543768d 1013 if (mount(*x, where, NULL, MS_BIND, NULL) < 0)
4a62c710 1014 return log_error_errno(errno, "mount(%s) failed: %m", where);
17fe0523 1015
d6797c92
LP
1016 if (ro) {
1017 r = bind_remount_recursive(where, true);
f647962d
MS
1018 if (r < 0)
1019 return log_error_errno(r, "Read-Only bind mount failed: %m");
17fe0523
LP
1020 }
1021 }
1022
1023 return 0;
1024}
1025
b12afc8c
LP
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        /* Mounts the cgroup hierarchy selected by "controller" at
         * <dest>/sys/fs/cgroup/<hierarchy>, unless that path is a
         * mount point already. Returns 1 if a new mount was
         * established, 0 if the path was mounted before, and a
         * negative error value on failure. */

        const unsigned long base_flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        const char *target;
        int mounted;

        target = strjoina(dest, "/sys/fs/cgroup/", hierarchy);

        mounted = path_is_mount_point(target, false);
        if (mounted > 0)
                return 0;
        if (mounted < 0 && mounted != -ENOENT)
                return log_error_errno(mounted, "Failed to determine if %s is mounted already: %m", target);

        /* The result is deliberately not checked here; if the
         * directory is missing the mount() below fails and reports
         * the problem. */
        mkdir_p(target, 0755);

        /* The superblock mount options of the mount point need to be
         * identical to the hosts', and hence writable... */
        if (mount("cgroup", target, "cgroup", base_flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", target);

        /* ... hence let's only make the bind mount read-only, not the
         * superblock. */
        if (read_only &&
            mount(NULL, target, NULL, MS_BIND|MS_REMOUNT|base_flags|MS_RDONLY, NULL) < 0)
                return log_error_errno(errno, "Failed to remount %s read-only: %m", target);

        return 1;
}
1053
1054static int mount_cgroup(const char *dest) {
1055 _cleanup_set_free_free_ Set *controllers = NULL;
1056 _cleanup_free_ char *own_cgroup_path = NULL;
1057 const char *cgroup_root, *systemd_root, *systemd_own;
1058 int r;
1059
1060 controllers = set_new(&string_hash_ops);
1061 if (!controllers)
1062 return log_oom();
1063
1064 r = cg_kernel_controllers(controllers);
1065 if (r < 0)
1066 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1067
1068 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1069 if (r < 0)
1070 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1071
63c372cb 1072 cgroup_root = strjoina(dest, "/sys/fs/cgroup");
b12afc8c
LP
1073 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1074 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1075
1076 for (;;) {
1077 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1078
1079 controller = set_steal_first(controllers);
1080 if (!controller)
1081 break;
1082
1083 origin = strappend("/sys/fs/cgroup/", controller);
1084 if (!origin)
1085 return log_oom();
1086
1087 r = readlink_malloc(origin, &combined);
1088 if (r == -EINVAL) {
1089 /* Not a symbolic link, but directly a single cgroup hierarchy */
1090
1091 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1092 if (r < 0)
1093 return r;
1094
1095 } else if (r < 0)
1096 return log_error_errno(r, "Failed to read link %s: %m", origin);
1097 else {
1098 _cleanup_free_ char *target = NULL;
1099
1100 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1101 if (!target)
1102 return log_oom();
1103
1104 /* A symbolic link, a combination of controllers in one hierarchy */
1105
1106 if (!filename_is_valid(combined)) {
1107 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1108 continue;
1109 }
1110
1111 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1112 if (r < 0)
1113 return r;
1114
1115 if (symlink(combined, target) < 0)
83521414 1116 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
b12afc8c
LP
1117 }
1118 }
1119
c0534580 1120 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
b12afc8c
LP
1121 if (r < 0)
1122 return r;
1123
1124 /* Make our own cgroup a (writable) bind mount */
63c372cb 1125 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
b12afc8c
LP
1126 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1127 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1128
1129 /* And then remount the systemd cgroup root read-only */
63c372cb 1130 systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
b12afc8c
LP
1131 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1132 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1133
1134 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1135 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1136
1137 return 0;
1138}
1139
06c17c39
LP
1140static int mount_tmpfs(const char *dest) {
1141 char **i, **o;
1142
1143 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1144 _cleanup_free_ char *where = NULL;
79d80fc1 1145 int r;
06c17c39
LP
1146
1147 where = strappend(dest, *i);
1148 if (!where)
1149 return log_oom();
1150
79d80fc1 1151 r = mkdir_label(where, 0755);
04a91939
LP
1152 if (r < 0 && r != -EEXIST)
1153 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
06c17c39 1154
4a62c710
MS
1155 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1156 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
06c17c39
LP
1157 }
1158
1159 return 0;
1160}
1161
e58a1277 1162static int setup_timezone(const char *dest) {
d4036145
LP
1163 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1164 char *z, *y;
1165 int r;
f8440af5 1166
e58a1277
LP
1167 assert(dest);
1168
1169 /* Fix the timezone, if possible */
d4036145
LP
1170 r = readlink_malloc("/etc/localtime", &p);
1171 if (r < 0) {
1172 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1173 return 0;
1174 }
1175
1176 z = path_startswith(p, "../usr/share/zoneinfo/");
1177 if (!z)
1178 z = path_startswith(p, "/usr/share/zoneinfo/");
1179 if (!z) {
1180 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1181 return 0;
1182 }
1183
04bc4a3f
LP
1184 where = strappend(dest, "/etc/localtime");
1185 if (!where)
0d0f0c50 1186 return log_oom();
715ac17a 1187
d4036145
LP
1188 r = readlink_malloc(where, &q);
1189 if (r >= 0) {
1190 y = path_startswith(q, "../usr/share/zoneinfo/");
1191 if (!y)
1192 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1193
d4036145
LP
1194 /* Already pointing to the right place? Then do nothing .. */
1195 if (y && streq(y, z))
1196 return 0;
1197 }
1198
1199 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1200 if (!check)
0d0f0c50 1201 return log_oom();
4d1c38b8 1202
d4036145
LP
1203 if (access(check, F_OK) < 0) {
1204 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1205 return 0;
1206 }
68fb0892 1207
d4036145
LP
1208 what = strappend("../usr/share/zoneinfo/", z);
1209 if (!what)
1210 return log_oom();
1211
79d80fc1
TG
1212 r = mkdir_parents(where, 0755);
1213 if (r < 0) {
da927ba9 1214 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
79d80fc1
TG
1215
1216 return 0;
1217 }
1218
1219 r = unlink(where);
1220 if (r < 0 && errno != ENOENT) {
56f64d95 1221 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1222
1223 return 0;
1224 }
4d9f07b4 1225
d4036145 1226 if (symlink(what, where) < 0) {
56f64d95 1227 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1228 return 0;
1229 }
e58a1277
LP
1230
1231 return 0;
88213476
LP
1232}
1233
2547bb41 1234static int setup_resolv_conf(const char *dest) {
c8b32e11 1235 _cleanup_free_ char *where = NULL;
79d80fc1 1236 int r;
2547bb41
LP
1237
1238 assert(dest);
1239
1240 if (arg_private_network)
1241 return 0;
1242
1243 /* Fix resolv.conf, if possible */
04bc4a3f
LP
1244 where = strappend(dest, "/etc/resolv.conf");
1245 if (!where)
0d0f0c50 1246 return log_oom();
2547bb41 1247
77e63faf
LP
1248 /* We don't really care for the results of this really. If it
1249 * fails, it fails, but meh... */
79d80fc1
TG
1250 r = mkdir_parents(where, 0755);
1251 if (r < 0) {
da927ba9 1252 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
79d80fc1
TG
1253
1254 return 0;
1255 }
1256
f2068bcc 1257 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1258 if (r < 0) {
da927ba9 1259 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1260
1261 return 0;
1262 }
2547bb41
LP
1263
1264 return 0;
1265}
1266
4d9f07b4
LP
1267static int setup_volatile_state(const char *directory) {
1268 const char *p;
1269 int r;
1270
1271 assert(directory);
1272
1273 if (arg_volatile != VOLATILE_STATE)
1274 return 0;
1275
1276 /* --volatile=state means we simply overmount /var
1277 with a tmpfs, and the rest read-only. */
1278
1279 r = bind_remount_recursive(directory, true);
f647962d
MS
1280 if (r < 0)
1281 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4 1282
63c372cb 1283 p = strjoina(directory, "/var");
79d80fc1 1284 r = mkdir(p, 0755);
4a62c710
MS
1285 if (r < 0 && errno != EEXIST)
1286 return log_error_errno(errno, "Failed to create %s: %m", directory);
4d9f07b4 1287
4a62c710
MS
1288 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1289 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
1290
1291 return 0;
1292}
1293
1294static int setup_volatile(const char *directory) {
1295 bool tmpfs_mounted = false, bind_mounted = false;
1296 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1297 const char *f, *t;
1298 int r;
1299
1300 assert(directory);
1301
1302 if (arg_volatile != VOLATILE_YES)
1303 return 0;
1304
1305 /* --volatile=yes means we mount a tmpfs to the root dir, and
1306 the original /usr to use inside it, and that read-only. */
1307
4a62c710
MS
1308 if (!mkdtemp(template))
1309 return log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4
LP
1310
1311 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
56f64d95 1312 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
1313 r = -errno;
1314 goto fail;
1315 }
1316
1317 tmpfs_mounted = true;
1318
63c372cb
LP
1319 f = strjoina(directory, "/usr");
1320 t = strjoina(template, "/usr");
4d9f07b4 1321
79d80fc1
TG
1322 r = mkdir(t, 0755);
1323 if (r < 0 && errno != EEXIST) {
56f64d95 1324 log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
1325 r = -errno;
1326 goto fail;
1327 }
1328
4543768d 1329 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
56f64d95 1330 log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
1331 r = -errno;
1332 goto fail;
1333 }
1334
1335 bind_mounted = true;
1336
1337 r = bind_remount_recursive(t, true);
1338 if (r < 0) {
da927ba9 1339 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
1340 goto fail;
1341 }
1342
1343 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
56f64d95 1344 log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1345 r = -errno;
1346 goto fail;
1347 }
1348
1349 rmdir(template);
1350
1351 return 0;
1352
1353fail:
1354 if (bind_mounted)
1355 umount(t);
1356 if (tmpfs_mounted)
1357 umount(template);
1358 rmdir(template);
1359 return r;
1360}
1361
9f24adc2
LP
1362static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1363
1364 snprintf(s, 37,
1365 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1366 SD_ID128_FORMAT_VAL(id));
1367
1368 return s;
1369}
1370
04bc4a3f 1371static int setup_boot_id(const char *dest) {
7fd1b19b 1372 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 1373 sd_id128_t rnd = {};
04bc4a3f
LP
1374 char as_uuid[37];
1375 int r;
1376
1377 assert(dest);
1378
eb91eb18
LP
1379 if (arg_share_system)
1380 return 0;
1381
04bc4a3f
LP
1382 /* Generate a new randomized boot ID, so that each boot-up of
1383 * the container gets a new one */
1384
1385 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 1386 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
1387 if (!from || !to)
1388 return log_oom();
04bc4a3f
LP
1389
1390 r = sd_id128_randomize(&rnd);
f647962d
MS
1391 if (r < 0)
1392 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1393
9f24adc2 1394 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1395
574d5f2d 1396 r = write_string_file(from, as_uuid);
f647962d
MS
1397 if (r < 0)
1398 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1399
4543768d 1400 if (mount(from, to, NULL, MS_BIND, NULL) < 0) {
56f64d95 1401 log_error_errno(errno, "Failed to bind mount boot id: %m");
04bc4a3f 1402 r = -errno;
4543768d 1403 } else if (mount(from, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
56f64d95 1404 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1405
1406 unlink(from);
04bc4a3f
LP
1407 return r;
1408}
1409
e58a1277 1410static int copy_devnodes(const char *dest) {
88213476
LP
1411
1412 static const char devnodes[] =
1413 "null\0"
1414 "zero\0"
1415 "full\0"
1416 "random\0"
1417 "urandom\0"
85614d66
TG
1418 "tty\0"
1419 "net/tun\0";
88213476
LP
1420
1421 const char *d;
e58a1277 1422 int r = 0;
7fd1b19b 1423 _cleanup_umask_ mode_t u;
a258bf26
LP
1424
1425 assert(dest);
124640f1
LP
1426
1427 u = umask(0000);
88213476
LP
1428
1429 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1430 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1431 struct stat st;
88213476 1432
7f112f50
LP
1433 from = strappend("/dev/", d);
1434 to = strjoin(dest, "/dev/", d, NULL);
1435 if (!from || !to)
1436 return log_oom();
88213476
LP
1437
1438 if (stat(from, &st) < 0) {
1439
4a62c710
MS
1440 if (errno != ENOENT)
1441 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1442
a258bf26 1443 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1444
ed8b7a3e 1445 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 1446 return -EIO;
a258bf26 1447
85614d66
TG
1448 } else {
1449 r = mkdir_parents(to, 0775);
1450 if (r < 0) {
da927ba9 1451 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
85614d66
TG
1452 return -r;
1453 }
a258bf26 1454
81f5049b
AC
1455 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1456 if (errno != EPERM)
1457 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1458
1459 /* Some systems abusively restrict mknod but
1460 * allow bind mounts. */
1461 r = touch(to);
1462 if (r < 0)
1463 return log_error_errno(r, "touch (%s) failed: %m", to);
1464 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1465 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1466 }
6278cf60
LP
1467
1468 if (arg_userns && arg_uid_shift != UID_INVALID)
1469 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1470 return log_error_errno(errno, "chown() of device node %s failed: %m", to);
88213476 1471 }
88213476
LP
1472 }
1473
e58a1277
LP
1474 return r;
1475}
88213476 1476
f2d88580
LP
1477static int setup_ptmx(const char *dest) {
1478 _cleanup_free_ char *p = NULL;
1479
1480 p = strappend(dest, "/dev/ptmx");
1481 if (!p)
1482 return log_oom();
1483
4a62c710
MS
1484 if (symlink("pts/ptmx", p) < 0)
1485 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
f2d88580 1486
6278cf60
LP
1487 if (arg_userns && arg_uid_shift != UID_INVALID)
1488 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1489 return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
1490
f2d88580
LP
1491 return 0;
1492}
1493
e58a1277 1494static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1495 _cleanup_umask_ mode_t u;
1496 const char *to;
e58a1277 1497 int r;
e58a1277
LP
1498
1499 assert(dest);
1500 assert(console);
1501
1502 u = umask(0000);
1503
e58a1277 1504 r = chmod_and_chown(console, 0600, 0, 0);
f647962d
MS
1505 if (r < 0)
1506 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1507
a258bf26
LP
1508 /* We need to bind mount the right tty to /dev/console since
1509 * ptys can only exist on pts file systems. To have something
81f5049b 1510 * to bind mount things on we create a empty regular file. */
a258bf26 1511
63c372cb 1512 to = strjoina(dest, "/dev/console");
81f5049b
AC
1513 r = touch(to);
1514 if (r < 0)
1515 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1516
4543768d 1517 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1518 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1519
25ea79fe 1520 return 0;
e58a1277
LP
1521}
1522
1523static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1524 _cleanup_free_ char *from = NULL, *to = NULL;
7fd1b19b 1525 _cleanup_umask_ mode_t u;
6d0b55c2 1526 int r, fd, k;
e58a1277
LP
1527 union {
1528 struct cmsghdr cmsghdr;
1529 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1530 } control = {};
1531 struct msghdr mh = {
1532 .msg_control = &control,
1533 .msg_controllen = sizeof(control),
1534 };
e58a1277
LP
1535 struct cmsghdr *cmsg;
1536
1537 assert(dest);
1538 assert(kmsg_socket >= 0);
a258bf26 1539
e58a1277 1540 u = umask(0000);
a258bf26 1541
f1e5dfe2
LP
1542 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1543 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1544 * on the reading side behave very similar to /proc/kmsg,
1545 * their writing side behaves differently from /dev/kmsg in
1546 * that writing blocks when nothing is reading. In order to
1547 * avoid any problems with containers deadlocking due to this
1548 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1549 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1550 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1551 return log_oom();
e58a1277 1552
4a62c710
MS
1553 if (mkfifo(from, 0600) < 0)
1554 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
e58a1277
LP
1555
1556 r = chmod_and_chown(from, 0600, 0, 0);
f647962d
MS
1557 if (r < 0)
1558 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
e58a1277 1559
4543768d 1560 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1561 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1562
1563 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1564 if (fd < 0)
1565 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1566
e58a1277
LP
1567 cmsg = CMSG_FIRSTHDR(&mh);
1568 cmsg->cmsg_level = SOL_SOCKET;
1569 cmsg->cmsg_type = SCM_RIGHTS;
1570 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1571 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1572
1573 mh.msg_controllen = cmsg->cmsg_len;
1574
1575 /* Store away the fd in the socket, so that it stays open as
1576 * long as we run the child */
6d0b55c2 1577 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
03e334a1 1578 safe_close(fd);
e58a1277 1579
4a62c710
MS
1580 if (k < 0)
1581 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 1582
f1e5dfe2
LP
1583 /* And now make the FIFO unavailable as /dev/kmsg... */
1584 unlink(from);
25ea79fe 1585 return 0;
88213476
LP
1586}
1587
6d0b55c2
LP
1588static int send_rtnl(int send_fd) {
1589 union {
1590 struct cmsghdr cmsghdr;
1591 uint8_t buf[CMSG_SPACE(sizeof(int))];
1592 } control = {};
1593 struct msghdr mh = {
1594 .msg_control = &control,
1595 .msg_controllen = sizeof(control),
1596 };
1597 struct cmsghdr *cmsg;
1598 _cleanup_close_ int fd = -1;
1599 ssize_t k;
1600
1601 assert(send_fd >= 0);
1602
1603 if (!arg_expose_ports)
1604 return 0;
1605
1606 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1607 if (fd < 0)
1608 return log_error_errno(errno, "failed to allocate container netlink: %m");
1609
1610 cmsg = CMSG_FIRSTHDR(&mh);
1611 cmsg->cmsg_level = SOL_SOCKET;
1612 cmsg->cmsg_type = SCM_RIGHTS;
1613 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1614 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1615
1616 mh.msg_controllen = cmsg->cmsg_len;
1617
1618 /* Store away the fd in the socket, so that it stays open as
1619 * long as we run the child */
1620 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1621 if (k < 0)
1622 return log_error_errno(errno, "Failed to send netlink fd: %m");
1623
1624 return 0;
1625}
1626
1627static int flush_ports(union in_addr_union *exposed) {
1628 ExposePort *p;
1629 int r, af = AF_INET;
1630
1631 assert(exposed);
1632
1633 if (!arg_expose_ports)
1634 return 0;
1635
1636 if (in_addr_is_null(af, exposed))
1637 return 0;
1638
1639 log_debug("Lost IP address.");
1640
1641 LIST_FOREACH(ports, p, arg_expose_ports) {
1642 r = fw_add_local_dnat(false,
1643 af,
1644 p->protocol,
1645 NULL,
1646 NULL, 0,
1647 NULL, 0,
1648 p->host_port,
1649 exposed,
1650 p->container_port,
1651 NULL);
1652 if (r < 0)
1653 log_warning_errno(r, "Failed to modify firewall: %m");
1654 }
1655
1656 *exposed = IN_ADDR_NULL;
1657 return 0;
1658}
1659
1660static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1661 _cleanup_free_ struct local_address *addresses = NULL;
1662 _cleanup_free_ char *pretty = NULL;
1663 union in_addr_union new_exposed;
1664 ExposePort *p;
1665 bool add;
1666 int af = AF_INET, r;
1667
1668 assert(exposed);
1669
1670 /* Invoked each time an address is added or removed inside the
1671 * container */
1672
1673 if (!arg_expose_ports)
1674 return 0;
1675
1676 r = local_addresses(rtnl, 0, af, &addresses);
1677 if (r < 0)
1678 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1679
1680 add = r > 0 &&
1681 addresses[0].family == af &&
1682 addresses[0].scope < RT_SCOPE_LINK;
1683
1684 if (!add)
1685 return flush_ports(exposed);
1686
1687 new_exposed = addresses[0].address;
1688 if (in_addr_equal(af, exposed, &new_exposed))
1689 return 0;
1690
1691 in_addr_to_string(af, &new_exposed, &pretty);
1692 log_debug("New container IP is %s.", strna(pretty));
1693
1694 LIST_FOREACH(ports, p, arg_expose_ports) {
1695
1696 r = fw_add_local_dnat(true,
1697 af,
1698 p->protocol,
1699 NULL,
1700 NULL, 0,
1701 NULL, 0,
1702 p->host_port,
1703 &new_exposed,
1704 p->container_port,
1705 in_addr_is_null(af, exposed) ? NULL : exposed);
1706 if (r < 0)
1707 log_warning_errno(r, "Failed to modify firewall: %m");
1708 }
1709
1710 *exposed = new_exposed;
1711 return 0;
1712}
1713
1714static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1715 union in_addr_union *exposed = userdata;
1716
1717 assert(rtnl);
1718 assert(m);
1719 assert(exposed);
1720
1721 expose_ports(rtnl, exposed);
1722 return 0;
1723}
1724
1725static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1726 union {
1727 struct cmsghdr cmsghdr;
1728 uint8_t buf[CMSG_SPACE(sizeof(int))];
1729 } control = {};
1730 struct msghdr mh = {
1731 .msg_control = &control,
1732 .msg_controllen = sizeof(control),
1733 };
1734 struct cmsghdr *cmsg;
1735 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1736 int fd, r;
1737 ssize_t k;
1738
1739 assert(event);
1740 assert(recv_fd >= 0);
1741 assert(ret);
1742
1743 if (!arg_expose_ports)
1744 return 0;
1745
1746 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1747 if (k < 0)
1748 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1749
1750 cmsg = CMSG_FIRSTHDR(&mh);
1751 assert(cmsg->cmsg_level == SOL_SOCKET);
1752 assert(cmsg->cmsg_type == SCM_RIGHTS);
657bdca9 1753 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
6d0b55c2
LP
1754 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1755
1756 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1757 if (r < 0) {
1758 safe_close(fd);
1759 return log_error_errno(r, "Failed to create rtnl object: %m");
1760 }
1761
1762 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1763 if (r < 0)
1764 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1765
1766 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1767 if (r < 0)
1768 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1769
1770 r = sd_rtnl_attach_event(rtnl, event, 0);
1771 if (r < 0)
1772 return log_error_errno(r, "Failed to add to even loop: %m");
1773
1774 *ret = rtnl;
1775 rtnl = NULL;
1776
1777 return 0;
1778}
1779
3a74cea5 1780static int setup_hostname(void) {
3a74cea5 1781
eb91eb18
LP
1782 if (arg_share_system)
1783 return 0;
1784
605f81a8 1785 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1786 return -errno;
3a74cea5 1787
7027ff61 1788 return 0;
3a74cea5
LP
1789}
1790
57fb9fb5 1791static int setup_journal(const char *directory) {
4d680aee 1792 sd_id128_t machine_id, this_id;
7fd1b19b 1793 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1794 char *id;
57fb9fb5
LP
1795 int r;
1796
df9a75e4
LP
1797 /* Don't link journals in ephemeral mode */
1798 if (arg_ephemeral)
1799 return 0;
1800
57fb9fb5 1801 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1802 if (!p)
1803 return log_oom();
57fb9fb5
LP
1804
1805 r = read_one_line_file(p, &b);
27407a01
ZJS
1806 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1807 return 0;
f647962d
MS
1808 else if (r < 0)
1809 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
57fb9fb5 1810
27407a01
ZJS
1811 id = strstrip(b);
1812 if (isempty(id) && arg_link_journal == LINK_AUTO)
1813 return 0;
57fb9fb5 1814
27407a01
ZJS
1815 /* Verify validity */
1816 r = sd_id128_from_string(id, &machine_id);
f647962d
MS
1817 if (r < 0)
1818 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
57fb9fb5 1819
4d680aee 1820 r = sd_id128_get_machine(&this_id);
f647962d
MS
1821 if (r < 0)
1822 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1823
1824 if (sd_id128_equal(machine_id, this_id)) {
1825 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1826 "Host and machine ids are equal (%s): refusing to link journals", id);
1827 if (arg_link_journal == LINK_AUTO)
1828 return 0;
df9a75e4 1829 return -EEXIST;
4d680aee
ZJS
1830 }
1831
1832 if (arg_link_journal == LINK_NO)
1833 return 0;
1834
57fb9fb5 1835 free(p);
27407a01
ZJS
1836 p = strappend("/var/log/journal/", id);
1837 q = strjoin(directory, "/var/log/journal/", id, NULL);
1838 if (!p || !q)
1839 return log_oom();
1840
1841 if (path_is_mount_point(p, false) > 0) {
1842 if (arg_link_journal != LINK_AUTO) {
1843 log_error("%s: already a mount point, refusing to use for journal", p);
1844 return -EEXIST;
1845 }
1846
1847 return 0;
57fb9fb5
LP
1848 }
1849
27407a01 1850 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1851 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1852 log_error("%s: already a mount point, refusing to use for journal", q);
1853 return -EEXIST;
57fb9fb5
LP
1854 }
1855
27407a01 1856 return 0;
57fb9fb5
LP
1857 }
1858
1859 r = readlink_and_make_absolute(p, &d);
1860 if (r >= 0) {
1861 if ((arg_link_journal == LINK_GUEST ||
1862 arg_link_journal == LINK_AUTO) &&
1863 path_equal(d, q)) {
1864
27407a01
ZJS
1865 r = mkdir_p(q, 0755);
1866 if (r < 0)
56f64d95 1867 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1868 return 0;
57fb9fb5
LP
1869 }
1870
4a62c710
MS
1871 if (unlink(p) < 0)
1872 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1873 } else if (r == -EINVAL) {
1874
1875 if (arg_link_journal == LINK_GUEST &&
1876 rmdir(p) < 0) {
1877
27407a01
ZJS
1878 if (errno == ENOTDIR) {
1879 log_error("%s already exists and is neither a symlink nor a directory", p);
1880 return r;
1881 } else {
56f64d95 1882 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1883 return -errno;
57fb9fb5 1884 }
57fb9fb5
LP
1885 }
1886 } else if (r != -ENOENT) {
56f64d95 1887 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1888 return r;
57fb9fb5
LP
1889 }
1890
1891 if (arg_link_journal == LINK_GUEST) {
1892
1893 if (symlink(q, p) < 0) {
574edc90 1894 if (arg_link_journal_try) {
56f64d95 1895 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1896 return 0;
1897 } else {
56f64d95 1898 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1899 return -errno;
1900 }
57fb9fb5
LP
1901 }
1902
27407a01
ZJS
1903 r = mkdir_p(q, 0755);
1904 if (r < 0)
56f64d95 1905 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1906 return 0;
57fb9fb5
LP
1907 }
1908
1909 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1910 /* don't create parents here -- if the host doesn't have
1911 * permanent journal set up, don't force it here */
1912 r = mkdir(p, 0755);
57fb9fb5 1913 if (r < 0) {
574edc90 1914 if (arg_link_journal_try) {
56f64d95 1915 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1916 return 0;
1917 } else {
56f64d95 1918 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1919 return r;
1920 }
57fb9fb5
LP
1921 }
1922
27407a01
ZJS
1923 } else if (access(p, F_OK) < 0)
1924 return 0;
57fb9fb5 1925
cdb2b9d0
LP
1926 if (dir_is_empty(q) == 0)
1927 log_warning("%s is not empty, proceeding anyway.", q);
1928
57fb9fb5
LP
1929 r = mkdir_p(q, 0755);
1930 if (r < 0) {
56f64d95 1931 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1932 return r;
57fb9fb5
LP
1933 }
1934
4543768d 1935 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 1936 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1937
27407a01 1938 return 0;
57fb9fb5
LP
1939}
1940
88213476 1941static int drop_capabilities(void) {
5076f0cc 1942 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1943}
1944
5aa4bb6b 1945static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 1946 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
24996861 1947 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
1948 int r;
1949
eb91eb18
LP
1950 if (!arg_register)
1951 return 0;
1952
1c03020c 1953 r = sd_bus_default_system(&bus);
f647962d
MS
1954 if (r < 0)
1955 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 1956
89f7c846
LP
1957 if (arg_keep_unit) {
1958 r = sd_bus_call_method(
1959 bus,
1960 "org.freedesktop.machine1",
1961 "/org/freedesktop/machine1",
1962 "org.freedesktop.machine1.Manager",
5aa4bb6b 1963 "RegisterMachineWithNetwork",
89f7c846
LP
1964 &error,
1965 NULL,
5aa4bb6b 1966 "sayssusai",
89f7c846
LP
1967 arg_machine,
1968 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1969 "nspawn",
1970 "container",
1971 (uint32_t) pid,
5aa4bb6b
LP
1972 strempty(arg_directory),
1973 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 1974 } else {
9457ac5b 1975 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
f36933fe 1976 char **i;
9457ac5b
LP
1977
1978 r = sd_bus_message_new_method_call(
89f7c846 1979 bus,
9457ac5b 1980 &m,
89f7c846
LP
1981 "org.freedesktop.machine1",
1982 "/org/freedesktop/machine1",
1983 "org.freedesktop.machine1.Manager",
5aa4bb6b 1984 "CreateMachineWithNetwork");
f647962d 1985 if (r < 0)
f36933fe 1986 return bus_log_create_error(r);
9457ac5b
LP
1987
1988 r = sd_bus_message_append(
1989 m,
5aa4bb6b 1990 "sayssusai",
89f7c846
LP
1991 arg_machine,
1992 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1993 "nspawn",
1994 "container",
1995 (uint32_t) pid,
5aa4bb6b
LP
1996 strempty(arg_directory),
1997 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d 1998 if (r < 0)
f36933fe 1999 return bus_log_create_error(r);
9457ac5b
LP
2000
2001 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d 2002 if (r < 0)
f36933fe 2003 return bus_log_create_error(r);
9457ac5b
LP
2004
2005 if (!isempty(arg_slice)) {
2006 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d 2007 if (r < 0)
f36933fe 2008 return bus_log_create_error(r);
9457ac5b
LP
2009 }
2010
2011 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d 2012 if (r < 0)
f36933fe 2013 return bus_log_create_error(r);
9457ac5b 2014
63cc4c31 2015 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
2016 /* Allow the container to
2017 * access and create the API
2018 * device nodes, so that
2019 * PrivateDevices= in the
2020 * container can work
2021 * fine */
2022 "/dev/null", "rwm",
2023 "/dev/zero", "rwm",
2024 "/dev/full", "rwm",
2025 "/dev/random", "rwm",
2026 "/dev/urandom", "rwm",
2027 "/dev/tty", "rwm",
864e1706 2028 "/dev/net/tun", "rwm",
9457ac5b
LP
2029 /* Allow the container
2030 * access to ptys. However,
2031 * do not permit the
2032 * container to ever create
2033 * these device nodes. */
2034 "/dev/pts/ptmx", "rw",
63cc4c31 2035 "char-pts", "rw");
f647962d
MS
2036 if (r < 0)
2037 return log_error_errno(r, "Failed to add device whitelist: %m");
9457ac5b 2038
f36933fe
LP
2039 STRV_FOREACH(i, arg_property) {
2040 r = sd_bus_message_open_container(m, 'r', "sv");
2041 if (r < 0)
2042 return bus_log_create_error(r);
2043
2044 r = bus_append_unit_property_assignment(m, *i);
2045 if (r < 0)
2046 return r;
2047
2048 r = sd_bus_message_close_container(m);
2049 if (r < 0)
2050 return bus_log_create_error(r);
2051 }
2052
9457ac5b 2053 r = sd_bus_message_close_container(m);
f647962d 2054 if (r < 0)
f36933fe 2055 return bus_log_create_error(r);
9457ac5b
LP
2056
2057 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
2058 }
2059
9444b1f2 2060 if (r < 0) {
1f0cd86b
LP
2061 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2062 return r;
2063 }
2064
2065 return 0;
2066}
2067
2068static int terminate_machine(pid_t pid) {
2069 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2070 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
24996861 2071 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
2072 const char *path;
2073 int r;
2074
eb91eb18
LP
2075 if (!arg_register)
2076 return 0;
2077
76b54375 2078 r = sd_bus_default_system(&bus);
f647962d
MS
2079 if (r < 0)
2080 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
2081
2082 r = sd_bus_call_method(
2083 bus,
2084 "org.freedesktop.machine1",
2085 "/org/freedesktop/machine1",
2086 "org.freedesktop.machine1.Manager",
2087 "GetMachineByPID",
2088 &error,
2089 &reply,
2090 "u",
2091 (uint32_t) pid);
2092 if (r < 0) {
2093 /* Note that the machine might already have been
2094 * cleaned up automatically, hence don't consider it a
2095 * failure if we cannot get the machine object. */
2096 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2097 return 0;
2098 }
2099
2100 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
2101 if (r < 0)
2102 return bus_log_parse_error(r);
9444b1f2 2103
1f0cd86b
LP
2104 r = sd_bus_call_method(
2105 bus,
2106 "org.freedesktop.machine1",
2107 path,
2108 "org.freedesktop.machine1.Machine",
2109 "Terminate",
2110 &error,
2111 NULL,
2112 NULL);
2113 if (r < 0) {
2114 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2115 return 0;
2116 }
2117
9444b1f2
LP
2118 return 0;
2119}
2120
db999e0f
LP
2121static int reset_audit_loginuid(void) {
2122 _cleanup_free_ char *p = NULL;
2123 int r;
2124
2125 if (arg_share_system)
2126 return 0;
2127
2128 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2129 if (r == -ENOENT)
db999e0f 2130 return 0;
f647962d
MS
2131 if (r < 0)
2132 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2133
2134 /* Already reset? */
2135 if (streq(p, "4294967295"))
2136 return 0;
2137
2138 r = write_string_file("/proc/self/loginuid", "4294967295");
2139 if (r < 0) {
2140 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2141 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2142 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2143 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2144 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 2145
db999e0f 2146 sleep(5);
77b6e194 2147 }
db999e0f
LP
2148
2149 return 0;
77b6e194
LP
2150}
2151
4f758c23
LP
/* Fixed 128bit keys handed to generate_mac() as hash key, one per kind of
 * interface a MAC address is derived for. */
#define HOST_HASH_KEY \
        SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY \
        SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY \
        SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 2155
a90e2305 2156static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
2157 uint8_t result[8];
2158 size_t l, sz;
a90e2305
LP
2159 uint8_t *v, *i;
2160 int r;
01dde061
TG
2161
2162 l = strlen(arg_machine);
2163 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
2164 if (idx > 0)
2165 sz += sizeof(idx);
a90e2305 2166
01dde061
TG
2167 v = alloca(sz);
2168
2169 /* fetch some persistent data unique to the host */
2170 r = sd_id128_get_machine((sd_id128_t*) v);
2171 if (r < 0)
2172 return r;
2173
2174 /* combine with some data unique (on this host) to this
2175 * container instance */
a90e2305
LP
2176 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2177 if (idx > 0) {
2178 idx = htole64(idx);
2179 memcpy(i, &idx, sizeof(idx));
2180 }
01dde061
TG
2181
2182 /* Let's hash the host machine ID plus the container name. We
2183 * use a fixed, but originally randomly created hash key here. */
4f758c23 2184 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
2185
2186 assert_cc(ETH_ALEN <= sizeof(result));
2187 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2188
2189 /* see eth_random_addr in the kernel */
2190 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2191 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2192
2193 return 0;
2194}
2195
5aa4bb6b 2196static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
69c79d3c 2197 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 2198 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4f758c23 2199 struct ether_addr mac_host, mac_container;
5aa4bb6b 2200 int r, i;
69c79d3c
LP
2201
2202 if (!arg_private_network)
2203 return 0;
2204
2205 if (!arg_network_veth)
2206 return 0;
2207
08af0da2
LP
2208 /* Use two different interface name prefixes depending whether
2209 * we are in bridge mode or not. */
c00524c9 2210 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 2211 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 2212
e867ceb6
LP
2213 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2214 if (r < 0)
2215 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 2216
e867ceb6
LP
2217 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2218 if (r < 0)
2219 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 2220
151b9b96 2221 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2222 if (r < 0)
2223 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 2224
151b9b96 2225 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2226 if (r < 0)
2227 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 2228
ab046dde 2229 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
2230 if (r < 0)
2231 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 2232
4f758c23 2233 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
2234 if (r < 0)
2235 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 2236
ee3a6a51 2237 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2238 if (r < 0)
2239 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2240
d8e538ec 2241 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
2242 if (r < 0)
2243 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2244
ee3a6a51 2245 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
2246 if (r < 0)
2247 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2248
ab046dde 2249 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
2250 if (r < 0)
2251 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 2252
4f758c23 2253 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
2254 if (r < 0)
2255 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 2256
ab046dde 2257 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2258 if (r < 0)
2259 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c
LP
2260
2261 r = sd_rtnl_message_close_container(m);
f647962d
MS
2262 if (r < 0)
2263 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2264
2265 r = sd_rtnl_message_close_container(m);
f647962d
MS
2266 if (r < 0)
2267 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2268
2269 r = sd_rtnl_message_close_container(m);
f647962d
MS
2270 if (r < 0)
2271 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2272
2273 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2274 if (r < 0)
2275 return log_error_errno(r, "Failed to add new veth interfaces: %m");
69c79d3c 2276
5aa4bb6b 2277 i = (int) if_nametoindex(iface_name);
4a62c710
MS
2278 if (i <= 0)
2279 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
2280
2281 *ifi = i;
2282
69c79d3c
LP
2283 return 0;
2284}
2285
5aa4bb6b 2286static int setup_bridge(const char veth_name[], int *ifi) {
ab046dde
TG
2287 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2288 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2289 int r, bridge;
2290
2291 if (!arg_private_network)
2292 return 0;
2293
2294 if (!arg_network_veth)
2295 return 0;
2296
2297 if (!arg_network_bridge)
2298 return 0;
2299
2300 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
2301 if (bridge <= 0)
2302 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 2303
5aa4bb6b
LP
2304 *ifi = bridge;
2305
151b9b96 2306 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2307 if (r < 0)
2308 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 2309
151b9b96 2310 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
2311 if (r < 0)
2312 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 2313
039dd4af 2314 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
2315 if (r < 0)
2316 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 2317
ab046dde 2318 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
2319 if (r < 0)
2320 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde
TG
2321
2322 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
2323 if (r < 0)
2324 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde
TG
2325
2326 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2327 if (r < 0)
2328 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
2329
2330 return 0;
2331}
2332
c74e630d
LP
2333static int parse_interface(struct udev *udev, const char *name) {
2334 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2335 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2336 int ifi;
2337
2338 ifi = (int) if_nametoindex(name);
4a62c710
MS
2339 if (ifi <= 0)
2340 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
2341
2342 sprintf(ifi_str, "n%i", ifi);
2343 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
2344 if (!d)
2345 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
2346
2347 if (udev_device_get_is_initialized(d) <= 0) {
2348 log_error("Network interface %s is not initialized yet.", name);
2349 return -EBUSY;
2350 }
2351
2352 return ifi;
2353}
2354
69c79d3c 2355static int move_network_interfaces(pid_t pid) {
7e227024 2356 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 2357 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
2358 char **i;
2359 int r;
2360
2361 if (!arg_private_network)
2362 return 0;
2363
2364 if (strv_isempty(arg_network_interfaces))
2365 return 0;
2366
151b9b96 2367 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2368 if (r < 0)
2369 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 2370
7e227024
LP
2371 udev = udev_new();
2372 if (!udev) {
2373 log_error("Failed to connect to udev.");
2374 return -ENOMEM;
2375 }
2376
aa28aefe 2377 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 2378 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 2379 int ifi;
aa28aefe 2380
c74e630d
LP
2381 ifi = parse_interface(udev, *i);
2382 if (ifi < 0)
2383 return ifi;
2384
3125b3ef 2385 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
2386 if (r < 0)
2387 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2388
c74e630d 2389 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2390 if (r < 0)
2391 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 2392
c74e630d 2393 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2394 if (r < 0)
2395 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 2396 }
7e227024 2397
c74e630d
LP
2398 return 0;
2399}
2400
2401static int setup_macvlan(pid_t pid) {
2402 _cleanup_udev_unref_ struct udev *udev = NULL;
2403 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
e867ceb6 2404 unsigned idx = 0;
c74e630d
LP
2405 char **i;
2406 int r;
2407
2408 if (!arg_private_network)
2409 return 0;
2410
2411 if (strv_isempty(arg_network_macvlan))
2412 return 0;
2413
2414 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2415 if (r < 0)
2416 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
2417
2418 udev = udev_new();
2419 if (!udev) {
2420 log_error("Failed to connect to udev.");
2421 return -ENOMEM;
2422 }
2423
2424 STRV_FOREACH(i, arg_network_macvlan) {
2425 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2426 _cleanup_free_ char *n = NULL;
e867ceb6 2427 struct ether_addr mac;
c74e630d
LP
2428 int ifi;
2429
2430 ifi = parse_interface(udev, *i);
2431 if (ifi < 0)
2432 return ifi;
2433
e867ceb6
LP
2434 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2435 if (r < 0)
2436 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2437
c74e630d 2438 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2439 if (r < 0)
2440 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2441
c74e630d 2442 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
2443 if (r < 0)
2444 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
2445
2446 n = strappend("mv-", *i);
2447 if (!n)
2448 return log_oom();
2449
2450 strshorten(n, IFNAMSIZ-1);
2451
2452 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
2453 if (r < 0)
2454 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 2455
e867ceb6
LP
2456 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2457 if (r < 0)
2458 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2459
aa28aefe 2460 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2461 if (r < 0)
2462 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d
LP
2463
2464 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2465 if (r < 0)
2466 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2467
d8e538ec 2468 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
2469 if (r < 0)
2470 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d
LP
2471
2472 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
2473 if (r < 0)
2474 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d
LP
2475
2476 r = sd_rtnl_message_close_container(m);
f647962d
MS
2477 if (r < 0)
2478 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d
LP
2479
2480 r = sd_rtnl_message_close_container(m);
f647962d
MS
2481 if (r < 0)
2482 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe
LP
2483
2484 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2485 if (r < 0)
2486 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
2487 }
2488
2489 return 0;
2490}
2491
4bbfe7ad
TG
2492static int setup_ipvlan(pid_t pid) {
2493 _cleanup_udev_unref_ struct udev *udev = NULL;
2494 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2495 char **i;
2496 int r;
2497
2498 if (!arg_private_network)
2499 return 0;
2500
2501 if (strv_isempty(arg_network_ipvlan))
2502 return 0;
2503
2504 r = sd_rtnl_open(&rtnl, 0);
2505 if (r < 0)
2506 return log_error_errno(r, "Failed to connect to netlink: %m");
2507
2508 udev = udev_new();
2509 if (!udev) {
2510 log_error("Failed to connect to udev.");
2511 return -ENOMEM;
2512 }
2513
2514 STRV_FOREACH(i, arg_network_ipvlan) {
2515 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2516 _cleanup_free_ char *n = NULL;
2517 int ifi;
2518
2519 ifi = parse_interface(udev, *i);
2520 if (ifi < 0)
2521 return ifi;
2522
2523 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2524 if (r < 0)
2525 return log_error_errno(r, "Failed to allocate netlink message: %m");
2526
2527 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2528 if (r < 0)
2529 return log_error_errno(r, "Failed to add netlink interface index: %m");
2530
2531 n = strappend("iv-", *i);
2532 if (!n)
2533 return log_oom();
2534
2535 strshorten(n, IFNAMSIZ-1);
2536
2537 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2538 if (r < 0)
2539 return log_error_errno(r, "Failed to add netlink interface name: %m");
2540
2541 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2542 if (r < 0)
2543 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2544
2545 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2546 if (r < 0)
2547 return log_error_errno(r, "Failed to open netlink container: %m");
2548
2549 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2550 if (r < 0)
2551 return log_error_errno(r, "Failed to open netlink container: %m");
2552
2553 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2554 if (r < 0)
2555 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2556
2557 r = sd_rtnl_message_close_container(m);
2558 if (r < 0)
2559 return log_error_errno(r, "Failed to close netlink container: %m");
2560
2561 r = sd_rtnl_message_close_container(m);
2562 if (r < 0)
2563 return log_error_errno(r, "Failed to close netlink container: %m");
2564
2565 r = sd_rtnl_call(rtnl, m, 0, NULL);
2566 if (r < 0)
2567 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2568 }
2569
2570 return 0;
2571}
2572
/* Builds and loads a seccomp filter (default action: allow) that
 *
 *   - makes the syscalls in the table below fail with EPERM, unless the
 *     capability listed next to them is part of arg_retain, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns 0 on success (always 0 when built without HAVE_SECCOMP) or a
 * negative errno-style value after logging. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {

                /* Leave the syscall alone if its capability is retained */
                if ((arg_retain & (1ULL << blacklist[k].capability)) != 0)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);

                /* -EFAULT means the syscall is unknown, which is tolerated */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers, much of the userspace audit
         * hookup will fail if running inside a container. We don't care
         * and just turn off creation of audit sockets.
         *
         * This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace uses as indication that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(ctx,
                             SCMP_ACT_ERRNO(EAFNOSUPPORT),
                             SCMP_SYS(socket),
                             2,
                             SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                             SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2659
785890ac
LP
2660static int setup_propagate(const char *root) {
2661 const char *p, *q;
2662
2663 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2664 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2665 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2666 (void) mkdir_p(p, 0600);
2667
63c372cb 2668 q = strjoina(root, "/run/systemd/nspawn/incoming");
785890ac
LP
2669 mkdir_parents(q, 0755);
2670 mkdir_p(q, 0600);
2671
2672 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2673 return log_error_errno(errno, "Failed to install propagation bind mount.");
2674
2675 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2676 return log_error_errno(errno, "Failed to make propagation mount read-only");
2677
2678 return 0;
2679}
2680
1b9e5b12
LP
2681static int setup_image(char **device_path, int *loop_nr) {
2682 struct loop_info64 info = {
2683 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2684 };
2685 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2686 _cleanup_free_ char* loopdev = NULL;
2687 struct stat st;
2688 int r, nr;
2689
2690 assert(device_path);
2691 assert(loop_nr);
ec16945e 2692 assert(arg_image);
1b9e5b12
LP
2693
2694 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2695 if (fd < 0)
2696 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 2697
4a62c710
MS
2698 if (fstat(fd, &st) < 0)
2699 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
2700
2701 if (S_ISBLK(st.st_mode)) {
2702 char *p;
2703
2704 p = strdup(arg_image);
2705 if (!p)
2706 return log_oom();
2707
2708 *device_path = p;
2709
2710 *loop_nr = -1;
2711
2712 r = fd;
2713 fd = -1;
2714
2715 return r;
2716 }
2717
2718 if (!S_ISREG(st.st_mode)) {
56f64d95 2719 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
2720 return -EINVAL;
2721 }
2722
2723 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
2724 if (control < 0)
2725 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
2726
2727 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
2728 if (nr < 0)
2729 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
2730
2731 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2732 return log_oom();
2733
2734 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2735 if (loop < 0)
2736 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 2737
4a62c710
MS
2738 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2739 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
2740
2741 if (arg_read_only)
2742 info.lo_flags |= LO_FLAGS_READ_ONLY;
2743
4a62c710
MS
2744 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2745 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
2746
2747 *device_path = loopdev;
2748 loopdev = NULL;
2749
2750 *loop_nr = nr;
2751
2752 r = loop;
2753 loop = -1;
2754
2755 return r;
2756}
2757
ada4799a
LP
/* Explanatory text appended to the error messages that are logged when no
 * suitable partition table can be identified on the image. */
#define PARTITION_TABLE_BLURB                                                                   \
        "Note that the disk image needs to either contain only a single MBR partition of\n"     \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"         \
        "to be bootable with systemd-nspawn."
2764
1b9e5b12
LP
2765static int dissect_image(
2766 int fd,
727fd4fd
LP
2767 char **root_device, bool *root_device_rw,
2768 char **home_device, bool *home_device_rw,
2769 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
2770 bool *secondary) {
2771
2772#ifdef HAVE_BLKID
01dc33ce
ZJS
2773 int home_nr = -1, srv_nr = -1;
2774#ifdef GPT_ROOT_NATIVE
2775 int root_nr = -1;
2776#endif
2777#ifdef GPT_ROOT_SECONDARY
2778 int secondary_root_nr = -1;
2779#endif
f6c51a81 2780 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1b9e5b12
LP
2781 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2782 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2783 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2784 _cleanup_udev_unref_ struct udev *udev = NULL;
2785 struct udev_list_entry *first, *item;
f6c51a81 2786 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 2787 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
2788 const char *pttype = NULL;
2789 blkid_partlist pl;
2790 struct stat st;
c09ef2e4 2791 unsigned i;
1b9e5b12
LP
2792 int r;
2793
2794 assert(fd >= 0);
2795 assert(root_device);
2796 assert(home_device);
2797 assert(srv_device);
2798 assert(secondary);
ec16945e 2799 assert(arg_image);
1b9e5b12
LP
2800
2801 b = blkid_new_probe();
2802 if (!b)
2803 return log_oom();
2804
2805 errno = 0;
2806 r = blkid_probe_set_device(b, fd, 0, 0);
2807 if (r != 0) {
2808 if (errno == 0)
2809 return log_oom();
2810
56f64d95 2811 log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
2812 return -errno;
2813 }
2814
2815 blkid_probe_enable_partitions(b, 1);
2816 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2817
2818 errno = 0;
2819 r = blkid_do_safeprobe(b);
2820 if (r == -2 || r == 1) {
ada4799a
LP
2821 log_error("Failed to identify any partition table on\n"
2822 " %s\n"
2823 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
2824 return -EINVAL;
2825 } else if (r != 0) {
2826 if (errno == 0)
2827 errno = EIO;
56f64d95 2828 log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
2829 return -errno;
2830 }
2831
48861960 2832 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
2833
2834 is_gpt = streq_ptr(pttype, "gpt");
2835 is_mbr = streq_ptr(pttype, "dos");
2836
2837 if (!is_gpt && !is_mbr) {
2838 log_error("No GPT or MBR partition table discovered on\n"
2839 " %s\n"
2840 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
2841 return -EINVAL;
2842 }
2843
2844 errno = 0;
2845 pl = blkid_probe_get_partitions(b);
2846 if (!pl) {
2847 if (errno == 0)
2848 return log_oom();
2849
2850 log_error("Failed to list partitions of %s", arg_image);
2851 return -errno;
2852 }
2853
2854 udev = udev_new();
2855 if (!udev)
2856 return log_oom();
2857
4a62c710
MS
2858 if (fstat(fd, &st) < 0)
2859 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 2860
c09ef2e4
LP
2861 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2862 if (!d)
1b9e5b12
LP
2863 return log_oom();
2864
c09ef2e4
LP
2865 for (i = 0;; i++) {
2866 int n, m;
1b9e5b12 2867
c09ef2e4
LP
2868 if (i >= 10) {
2869 log_error("Kernel partitions never appeared.");
2870 return -ENXIO;
2871 }
2872
2873 e = udev_enumerate_new(udev);
2874 if (!e)
2875 return log_oom();
2876
2877 r = udev_enumerate_add_match_parent(e, d);
2878 if (r < 0)
2879 return log_oom();
2880
2881 r = udev_enumerate_scan_devices(e);
2882 if (r < 0)
2883 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2884
2885 /* Count the partitions enumerated by the kernel */
2886 n = 0;
2887 first = udev_enumerate_get_list_entry(e);
2888 udev_list_entry_foreach(item, first)
2889 n++;
2890
2891 /* Count the partitions enumerated by blkid */
2892 m = blkid_partlist_numof_partitions(pl);
2893 if (n == m + 1)
2894 break;
2895 if (n > m + 1) {
2896 log_error("blkid and kernel partition list do not match.");
2897 return -EIO;
2898 }
2899 if (n < m + 1) {
2900 unsigned j;
2901
2902 /* The kernel has probed fewer partitions than
2903 * blkid? Maybe the kernel prober is still
2904 * running or it got EBUSY because udev
2905 * already opened the device. Let's reprobe
2906 * the device, which is a synchronous call
2907 * that waits until probing is complete. */
2908
2909 for (j = 0; j < 20; j++) {
2910
2911 r = ioctl(fd, BLKRRPART, 0);
2912 if (r < 0)
2913 r = -errno;
2914 if (r >= 0 || r != -EBUSY)
2915 break;
2916
2917 /* If something else has the device
2918 * open, such as an udev rule, the
2919 * ioctl will return EBUSY. Since
2920 * there's no way to wait until it
2921 * isn't busy anymore, let's just wait
2922 * a bit, and try again.
2923 *
2924 * This is really something they
2925 * should fix in the kernel! */
2926
2927 usleep(50 * USEC_PER_MSEC);
2928 }
2929
2930 if (r < 0)
2931 return log_error_errno(r, "Failed to reread partition table: %m");
2932 }
2933
2934 e = udev_enumerate_unref(e);
2935 }
1b9e5b12
LP
2936
2937 first = udev_enumerate_get_list_entry(e);
2938 udev_list_entry_foreach(item, first) {
2939 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 2940 const char *node;
727fd4fd 2941 unsigned long long flags;
1b9e5b12
LP
2942 blkid_partition pp;
2943 dev_t qn;
2944 int nr;
2945
2946 errno = 0;
2947 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2948 if (!q) {
2949 if (!errno)
2950 errno = ENOMEM;
2951
56f64d95 2952 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
2953 return -errno;
2954 }
2955
2956 qn = udev_device_get_devnum(q);
2957 if (major(qn) == 0)
2958 continue;
2959
2960 if (st.st_rdev == qn)
2961 continue;
2962
2963 node = udev_device_get_devnode(q);
2964 if (!node)
2965 continue;
2966
2967 pp = blkid_partlist_devno_to_partition(pl, qn);
2968 if (!pp)
2969 continue;
2970
727fd4fd 2971 flags = blkid_partition_get_flags(pp);
727fd4fd 2972
1b9e5b12
LP
2973 nr = blkid_partition_get_partno(pp);
2974 if (nr < 0)
2975 continue;
2976
ada4799a
LP
2977 if (is_gpt) {
2978 sd_id128_t type_id;
2979 const char *stype;
1b9e5b12 2980
f6c51a81
LP
2981 if (flags & GPT_FLAG_NO_AUTO)
2982 continue;
2983
ada4799a
LP
2984 stype = blkid_partition_get_type_string(pp);
2985 if (!stype)
2986 continue;
1b9e5b12 2987
ada4799a 2988 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
2989 continue;
2990
ada4799a 2991 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 2992
ada4799a
LP
2993 if (home && nr >= home_nr)
2994 continue;
1b9e5b12 2995
ada4799a
LP
2996 home_nr = nr;
2997 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 2998
ada4799a
LP
2999 r = free_and_strdup(&home, node);
3000 if (r < 0)
3001 return log_oom();
727fd4fd 3002
ada4799a
LP
3003 } else if (sd_id128_equal(type_id, GPT_SRV)) {
3004
3005 if (srv && nr >= srv_nr)
3006 continue;
3007
3008 srv_nr = nr;
3009 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3010
3011 r = free_and_strdup(&srv, node);
3012 if (r < 0)
3013 return log_oom();
3014 }
1b9e5b12 3015#ifdef GPT_ROOT_NATIVE
ada4799a 3016 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 3017
ada4799a
LP
3018 if (root && nr >= root_nr)
3019 continue;
1b9e5b12 3020
ada4799a
LP
3021 root_nr = nr;
3022 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 3023
ada4799a
LP
3024 r = free_and_strdup(&root, node);
3025 if (r < 0)
3026 return log_oom();
3027 }
1b9e5b12
LP
3028#endif
3029#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
3030 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3031
3032 if (secondary_root && nr >= secondary_root_nr)
3033 continue;
3034
3035 secondary_root_nr = nr;
3036 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3037
3038 r = free_and_strdup(&secondary_root, node);
3039 if (r < 0)
3040 return log_oom();
3041 }
3042#endif
f6c51a81
LP
3043 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3044
3045 if (generic)
3046 multiple_generic = true;
3047 else {
3048 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3049
3050 r = free_and_strdup(&generic, node);
3051 if (r < 0)
3052 return log_oom();
3053 }
3054 }
ada4799a
LP
3055
3056 } else if (is_mbr) {
3057 int type;
1b9e5b12 3058
f6c51a81
LP
3059 if (flags != 0x80) /* Bootable flag */
3060 continue;
3061
ada4799a
LP
3062 type = blkid_partition_get_type(pp);
3063 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
3064 continue;
3065
f6c51a81
LP
3066 if (generic)
3067 multiple_generic = true;
3068 else {
3069 generic_rw = true;
727fd4fd 3070
f6c51a81
LP
3071 r = free_and_strdup(&root, node);
3072 if (r < 0)
3073 return log_oom();
3074 }
1b9e5b12 3075 }
1b9e5b12
LP
3076 }
3077
1b9e5b12
LP
3078 if (root) {
3079 *root_device = root;
3080 root = NULL;
727fd4fd
LP
3081
3082 *root_device_rw = root_rw;
1b9e5b12
LP
3083 *secondary = false;
3084 } else if (secondary_root) {
3085 *root_device = secondary_root;
3086 secondary_root = NULL;
727fd4fd
LP
3087
3088 *root_device_rw = secondary_root_rw;
1b9e5b12 3089 *secondary = true;
f6c51a81
LP
3090 } else if (generic) {
3091
3092 /* There were no partitions with precise meanings
3093 * around, but we found generic partitions. In this
3094 * case, if there's only one, we can go ahead and boot
3095 * it, otherwise we bail out, because we really cannot
3096 * make any sense of it. */
3097
3098 if (multiple_generic) {
3099 log_error("Identified multiple bootable Linux partitions on\n"
3100 " %s\n"
3101 PARTITION_TABLE_BLURB, arg_image);
3102 return -EINVAL;
3103 }
3104
3105 *root_device = generic;
3106 generic = NULL;
3107
3108 *root_device_rw = generic_rw;
3109 *secondary = false;
3110 } else {
3111 log_error("Failed to identify root partition in disk image\n"
3112 " %s\n"
3113 PARTITION_TABLE_BLURB, arg_image);
3114 return -EINVAL;
1b9e5b12
LP
3115 }
3116
3117 if (home) {
3118 *home_device = home;
3119 home = NULL;
727fd4fd
LP
3120
3121 *home_device_rw = home_rw;
1b9e5b12
LP
3122 }
3123
3124 if (srv) {
3125 *srv_device = srv;
3126 srv = NULL;
727fd4fd
LP
3127
3128 *srv_device_rw = srv_rw;
1b9e5b12
LP
3129 }
3130
3131 return 0;
3132#else
3133 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 3134 return -EOPNOTSUPP;
1b9e5b12
LP
3135#endif
3136}
3137
/* Probes the file system type of the block device "what" with libblkid and
 * mounts it on "where", or on "where" with "directory" appended when
 * "directory" is non-NULL. The mount is read-only when "rw" is false or when
 * arg_read_only is set. LUKS containers are refused.
 *
 * Returns 0 on success and a negative errno-style value on failure. When
 * compiled without HAVE_BLKID it always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally read-only container overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                /* No errno set is treated as an allocation failure */
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* libblkid: 1 means nothing was detected, -2 means the
                 * result is ambivalent. Either way there is no usable file
                 * system type. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Everything else (notably -1) is a genuine probe error */
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Latch the error code before logging, so that the return
                 * value does not depend on errno surviving the log call */
                r = errno != 0 ? -errno : -EINVAL;

                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3201
727fd4fd
LP
3202static int mount_devices(
3203 const char *where,
3204 const char *root_device, bool root_device_rw,
3205 const char *home_device, bool home_device_rw,
3206 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
3207 int r;
3208
3209 assert(where);
3210
3211 if (root_device) {
727fd4fd 3212 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
3213 if (r < 0)
3214 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
3215 }
3216
3217 if (home_device) {
727fd4fd 3218 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
3219 if (r < 0)
3220 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
3221 }
3222
3223 if (srv_device) {
727fd4fd 3224 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
3225 if (r < 0)
3226 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
3227 }
3228
3229 return 0;
3230}
3231
3232static void loop_remove(int nr, int *image_fd) {
3233 _cleanup_close_ int control = -1;
e8c8ddcc 3234 int r;
1b9e5b12
LP
3235
3236 if (nr < 0)
3237 return;
3238
3239 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
3240 r = ioctl(*image_fd, LOOP_CLR_FD);
3241 if (r < 0)
5e4074aa 3242 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 3243 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
3244 }
3245
3246 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 3247 if (control < 0) {
56f64d95 3248 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 3249 return;
e8c8ddcc 3250 }
1b9e5b12 3251
e8c8ddcc
TG
3252 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3253 if (r < 0)
5e4074aa 3254 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
3255}
3256
0cb9fbcd
LP
3257static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3258 int pipe_fds[2];
3259 pid_t pid;
3260
3261 assert(database);
3262 assert(key);
3263 assert(rpid);
3264
4a62c710
MS
3265 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3266 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
3267
3268 pid = fork();
4a62c710
MS
3269 if (pid < 0)
3270 return log_error_errno(errno, "Failed to fork getent child: %m");
3271 else if (pid == 0) {
0cb9fbcd
LP
3272 int nullfd;
3273 char *empty_env = NULL;
3274
3275 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3276 _exit(EXIT_FAILURE);
3277
3278 if (pipe_fds[0] > 2)
03e334a1 3279 safe_close(pipe_fds[0]);
0cb9fbcd 3280 if (pipe_fds[1] > 2)
03e334a1 3281 safe_close(pipe_fds[1]);
0cb9fbcd
LP
3282
3283 nullfd = open("/dev/null", O_RDWR);
3284 if (nullfd < 0)
3285 _exit(EXIT_FAILURE);
3286
3287 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3288 _exit(EXIT_FAILURE);
3289
3290 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3291 _exit(EXIT_FAILURE);
3292
3293 if (nullfd > 2)
03e334a1 3294 safe_close(nullfd);
0cb9fbcd
LP
3295
3296 reset_all_signal_handlers();
3297 close_all_fds(NULL, 0);
3298
4de82926
MM
3299 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3300 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
3301 _exit(EXIT_FAILURE);
3302 }
3303
03e334a1 3304 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
3305
3306 *rpid = pid;
3307
3308 return pipe_fds[0];
3309}
3310
3311static int change_uid_gid(char **_home) {
a2a5291b
ZJS
3312 char line[LINE_MAX], *x, *u, *g, *h;
3313 const char *word, *state;
0cb9fbcd
LP
3314 _cleanup_free_ uid_t *uids = NULL;
3315 _cleanup_free_ char *home = NULL;
3316 _cleanup_fclose_ FILE *f = NULL;
3317 _cleanup_close_ int fd = -1;
3318 unsigned n_uids = 0;
70f539ca 3319 size_t sz = 0, l;
0cb9fbcd
LP
3320 uid_t uid;
3321 gid_t gid;
3322 pid_t pid;
3323 int r;
3324
3325 assert(_home);
3326
3327 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3328 /* Reset everything fully to 0, just in case */
3329
4a62c710
MS
3330 if (setgroups(0, NULL) < 0)
3331 return log_error_errno(errno, "setgroups() failed: %m");
0cb9fbcd 3332
4a62c710
MS
3333 if (setresgid(0, 0, 0) < 0)
3334 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3335
4a62c710
MS
3336 if (setresuid(0, 0, 0) < 0)
3337 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3338
3339 *_home = NULL;
3340 return 0;
3341 }
3342
3343 /* First, get user credentials */
3344 fd = spawn_getent("passwd", arg_user, &pid);
3345 if (fd < 0)
3346 return fd;
3347
3348 f = fdopen(fd, "r");
3349 if (!f)
3350 return log_oom();
3351 fd = -1;
3352
3353 if (!fgets(line, sizeof(line), f)) {
3354
3355 if (!ferror(f)) {
3356 log_error("Failed to resolve user %s.", arg_user);
3357 return -ESRCH;
3358 }
3359
56f64d95 3360 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3361 return -errno;
3362 }
3363
3364 truncate_nl(line);
3365
820d3acf 3366 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
3367
3368 x = strchr(line, ':');
3369 if (!x) {
3370 log_error("/etc/passwd entry has invalid user field.");
3371 return -EIO;
3372 }
3373
3374 u = strchr(x+1, ':');
3375 if (!u) {
3376 log_error("/etc/passwd entry has invalid password field.");
3377 return -EIO;
3378 }
3379
3380 u++;
3381 g = strchr(u, ':');
3382 if (!g) {
3383 log_error("/etc/passwd entry has invalid UID field.");
3384 return -EIO;
3385 }
3386
3387 *g = 0;
3388 g++;
3389 x = strchr(g, ':');
3390 if (!x) {
3391 log_error("/etc/passwd entry has invalid GID field.");
3392 return -EIO;
3393 }
3394
3395 *x = 0;
3396 h = strchr(x+1, ':');
3397 if (!h) {
3398 log_error("/etc/passwd entry has invalid GECOS field.");
3399 return -EIO;
3400 }
3401
3402 h++;
3403 x = strchr(h, ':');
3404 if (!x) {
3405 log_error("/etc/passwd entry has invalid home directory field.");
3406 return -EIO;
3407 }
3408
3409 *x = 0;
3410
3411 r = parse_uid(u, &uid);
3412 if (r < 0) {
3413 log_error("Failed to parse UID of user.");
3414 return -EIO;
3415 }
3416
3417 r = parse_gid(g, &gid);
3418 if (r < 0) {
3419 log_error("Failed to parse GID of user.");
3420 return -EIO;
3421 }
3422
3423 home = strdup(h);
3424 if (!home)
3425 return log_oom();
3426
3427 /* Second, get group memberships */
3428 fd = spawn_getent("initgroups", arg_user, &pid);
3429 if (fd < 0)
3430 return fd;
3431
3432 fclose(f);
3433 f = fdopen(fd, "r");
3434 if (!f)
3435 return log_oom();
3436 fd = -1;
3437
3438 if (!fgets(line, sizeof(line), f)) {
3439 if (!ferror(f)) {
3440 log_error("Failed to resolve user %s.", arg_user);
3441 return -ESRCH;
3442 }
3443
56f64d95 3444 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3445 return -errno;
3446 }
3447
3448 truncate_nl(line);
3449
820d3acf 3450 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
3451
3452 /* Skip over the username and subsequent separator whitespace */
3453 x = line;
3454 x += strcspn(x, WHITESPACE);
3455 x += strspn(x, WHITESPACE);
3456
a2a5291b 3457 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
3458 char c[l+1];
3459
a2a5291b 3460 memcpy(c, word, l);
0cb9fbcd
LP
3461 c[l] = 0;
3462
3463 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3464 return log_oom();
3465
3466 r = parse_uid(c, &uids[n_uids++]);
3467 if (r < 0) {
3468 log_error("Failed to parse group data from getent.");
3469 return -EIO;
3470 }
3471 }
3472
3473 r = mkdir_parents(home, 0775);
f647962d
MS
3474 if (r < 0)
3475 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
3476
3477 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
3478 if (r < 0 && r != -EEXIST)
3479 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd
LP
3480
3481 fchown(STDIN_FILENO, uid, gid);
3482 fchown(STDOUT_FILENO, uid, gid);
3483 fchown(STDERR_FILENO, uid, gid);
3484
4a62c710
MS
3485 if (setgroups(n_uids, uids) < 0)
3486 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 3487
4a62c710
MS
3488 if (setresgid(gid, gid, gid) < 0)
3489 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3490
4a62c710
MS
3491 if (setresuid(uid, uid, uid) < 0)
3492 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3493
3494 if (_home) {
3495 *_home = home;
3496 home = NULL;
3497 }
3498
3499 return 0;
3500}
3501
113cea80 3502/*
6d416b9c
LS
3503 * Return values:
3504 * < 0 : wait_for_terminate() failed to get the state of the
3505 * container, the container was terminated by a signal, or
3506 * failed for an unknown reason. No change is made to the
3507 * container argument.
3508 * > 0 : The program executed in the container terminated with an
3509 * error. The exit code of the program executed in the
919699ec
LP
3510 * container is returned. The container argument has been set
3511 * to CONTAINER_TERMINATED.
6d416b9c
LS
3512 * 0 : The container is being rebooted, has been shut down or exited
3513 * successfully. The container argument has been set to either
3514 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 3515 *
6d416b9c
LS
3516 * That is, success is indicated by a return value of zero, and an
3517 * error is indicated by a non-zero value.
113cea80
DH
3518 */
3519static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 3520 siginfo_t status;
919699ec 3521 int r;
113cea80
DH
3522
3523 r = wait_for_terminate(pid, &status);
f647962d
MS
3524 if (r < 0)
3525 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
3526
3527 switch (status.si_code) {
fddbb89c 3528
113cea80 3529 case CLD_EXITED:
919699ec
LP
3530 if (status.si_status == 0) {
3531 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 3532
fddbb89c 3533 } else
919699ec 3534 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 3535
919699ec
LP
3536 *container = CONTAINER_TERMINATED;
3537 return status.si_status;
113cea80
DH
3538
3539 case CLD_KILLED:
3540 if (status.si_status == SIGINT) {
113cea80 3541
919699ec 3542 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 3543 *container = CONTAINER_TERMINATED;
919699ec
LP
3544 return 0;
3545
113cea80 3546 } else if (status.si_status == SIGHUP) {
113cea80 3547
919699ec 3548 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 3549 *container = CONTAINER_REBOOTED;
919699ec 3550 return 0;
113cea80 3551 }
919699ec 3552
113cea80
DH
3553 /* CLD_KILLED fallthrough */
3554
3555 case CLD_DUMPED:
fddbb89c 3556 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 3557 return -EIO;
113cea80
DH
3558
3559 default:
fddbb89c 3560 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 3561 return -EIO;
113cea80
DH
3562 }
3563
3564 return r;
3565}
3566
e866af3a
DH
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
}
3568
023fb90b
LP
3569static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3570 pid_t pid;
3571
3572 pid = PTR_TO_UINT32(userdata);
3573 if (pid > 0) {
c6c8f6e2 3574 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
3575 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3576 sd_event_source_set_userdata(s, NULL);
3577 return 0;
3578 }
3579 }
3580
3581 sd_event_exit(sd_event_source_get_event(s), 0);
3582 return 0;
3583}
3584
ec16945e 3585static int determine_names(void) {
1b9cebf6 3586 int r;
ec16945e
LP
3587
3588 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3589 if (arg_machine) {
3590 _cleanup_(image_unrefp) Image *i = NULL;
3591
3592 r = image_find(arg_machine, &i);
3593 if (r < 0)
3594 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3595 else if (r == 0) {
3596 log_error("No image for machine '%s': %m", arg_machine);
3597 return -ENOENT;
3598 }
3599
aceac2f0 3600 if (i->type == IMAGE_RAW)
1b9cebf6
LP
3601 r = set_sanitized_path(&arg_image, i->path);
3602 else
3603 r = set_sanitized_path(&arg_directory, i->path);
3604 if (r < 0)
3605 return log_error_errno(r, "Invalid image directory: %m");
3606
3607 arg_read_only = arg_read_only || i->read_only;
3608 } else
ec16945e
LP
3609 arg_directory = get_current_dir_name();
3610
1b9cebf6
LP
3611 if (!arg_directory && !arg_machine) {
3612 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
3613 return -EINVAL;
3614 }
3615 }
3616
3617 if (!arg_machine) {
b9ba4dab
LP
3618 if (arg_directory && path_equal(arg_directory, "/"))
3619 arg_machine = gethostname_malloc();
3620 else
3621 arg_machine = strdup(basename(arg_image ?: arg_directory));
3622
ec16945e
LP
3623 if (!arg_machine)
3624 return log_oom();
3625
3626 hostname_cleanup(arg_machine, false);
3627 if (!machine_name_is_valid(arg_machine)) {
3628 log_error("Failed to determine machine name automatically, please use -M.");
3629 return -EINVAL;
3630 }
b9ba4dab
LP
3631
3632 if (arg_ephemeral) {
3633 char *b;
3634
3635 /* Add a random suffix when this is an
3636 * ephemeral machine, so that we can run many
3637 * instances at once without manually having
3638 * to specify -M each time. */
3639
3640 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3641 return log_oom();
3642
3643 free(arg_machine);
3644 arg_machine = b;
3645 }
ec16945e
LP
3646 }
3647
3648 return 0;
3649}
3650
6dac160c
LP
3651static int determine_uid_shift(void) {
3652 int r;
3653
3654 if (!arg_userns)
3655 return 0;
3656
3657 if (arg_uid_shift == UID_INVALID) {
3658 struct stat st;
3659
3660 r = stat(arg_directory, &st);
3661 if (r < 0)
3662 return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
3663
3664 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3665
3666 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3667 log_error("UID and GID base of %s don't match.", arg_directory);
3668 return -EINVAL;
3669 }
3670
3671 arg_uid_range = UINT32_C(0x10000);
3672 }
3673
3674 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3675 log_error("UID base too high for UID range.");
3676 return -EINVAL;
3677 }
3678
3679 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3680 return 0;
3681}
3682
88213476 3683int main(int argc, char *argv[]) {
69c79d3c 3684
611b312b 3685 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
727fd4fd 3686 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
63cc4c31 3687 _cleanup_close_ int master = -1, image_fd = -1;
69c79d3c 3688 _cleanup_fdset_free_ FDSet *fds = NULL;
ec16945e 3689 int r, n_fd_passed, loop_nr = -1;
1b9e5b12 3690 char veth_name[IFNAMSIZ];
ec16945e 3691 bool secondary = false, remove_subvol = false;
e866af3a 3692 sigset_t mask, mask_chld;
69c79d3c 3693 pid_t pid = 0;
ec16945e 3694 int ret = EXIT_SUCCESS;
6d0b55c2 3695 union in_addr_union exposed = {};
30535c16 3696 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
9c857b9d 3697 bool interactive;
88213476
LP
3698
3699 log_parse_environment();
3700 log_open();
3701
ec16945e
LP
3702 r = parse_argv(argc, argv);
3703 if (r <= 0)
88213476 3704 goto finish;
88213476 3705
ec16945e
LP
3706 r = determine_names();
3707 if (r < 0)
3708 goto finish;
7027ff61 3709
88213476
LP
3710 if (geteuid() != 0) {
3711 log_error("Need to be root.");
ec16945e 3712 r = -EPERM;
88213476
LP
3713 goto finish;
3714 }
3715
1b9e5b12
LP
3716 log_close();
3717 n_fd_passed = sd_listen_fds(false);
3718 if (n_fd_passed > 0) {
ec16945e
LP
3719 r = fdset_new_listen_fds(&fds, false);
3720 if (r < 0) {
3721 log_error_errno(r, "Failed to collect file descriptors: %m");
1b9e5b12
LP
3722 goto finish;
3723 }
88213476 3724 }
1b9e5b12
LP
3725 fdset_close_others(fds);
3726 log_open();
88213476 3727
1b9e5b12 3728 if (arg_directory) {
ec16945e
LP
3729 assert(!arg_image);
3730
c4e34a61
LP
3731 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3732 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
ec16945e 3733 r = -EINVAL;
6b9132a9
LP
3734 goto finish;
3735 }
1b9e5b12 3736
30535c16 3737 if (arg_ephemeral) {
8a16a7b4 3738 _cleanup_free_ char *np = NULL;
ec16945e 3739
c4e34a61
LP
3740 /* If the specified path is a mount point we
3741 * generate the new snapshot immediately
3742 * inside it under a random name. However if
3743 * the specified is not a mount point we
3744 * create the new snapshot in the parent
3745 * directory, just next to it. */
3746 r = path_is_mount_point(arg_directory, false);
3747 if (r < 0) {
3748 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3749 goto finish;
3750 }
3751 if (r > 0)
3752 r = tempfn_random_child(arg_directory, &np);
3753 else
3754 r = tempfn_random(arg_directory, &np);
ec16945e
LP
3755 if (r < 0) {
3756 log_error_errno(r, "Failed to generate name for snapshot: %m");
3757 goto finish;
3758 }
3759
30535c16
LP
3760 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3761 if (r < 0) {
3762 log_error_errno(r, "Failed to lock %s: %m", np);
3763 goto finish;
3764 }
3765
f70a17f8 3766 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
ec16945e 3767 if (r < 0) {
ec16945e
LP
3768 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3769 goto finish;
3770 }
3771
3772 free(arg_directory);
3773 arg_directory = np;
8a16a7b4 3774 np = NULL;
ec16945e
LP
3775
3776 remove_subvol = true;
30535c16
LP
3777
3778 } else {
3779 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3780 if (r == -EBUSY) {
3781 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3782 goto finish;
3783 }
3784 if (r < 0) {
3785 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3786 return r;
3787 }
3788
3789 if (arg_template) {
f70a17f8 3790 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
30535c16
LP
3791 if (r == -EEXIST) {
3792 if (!arg_quiet)
3793 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3794 } else if (r < 0) {
83521414 3795 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3796 goto finish;
3797 } else {
3798 if (!arg_quiet)
3799 log_info("Populated %s from template %s.", arg_directory, arg_template);
3800 }
3801 }
ec16945e
LP
3802 }
3803
1b9e5b12
LP
3804 if (arg_boot) {
3805 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3806 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3807 r = -EINVAL;
1b9e5b12
LP
3808 goto finish;
3809 }
3810 } else {
3811 const char *p;
3812
63c372cb 3813 p = strjoina(arg_directory,
1b9e5b12
LP
3814 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3815 if (access(p, F_OK) < 0) {
3816 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
ec16945e 3817 r = -EINVAL;
1b9e5b12 3818 goto finish;
1b9e5b12
LP
3819 }
3820 }
ec16945e 3821
6b9132a9 3822 } else {
1b9e5b12 3823 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3824
ec16945e
LP
3825 assert(arg_image);
3826 assert(!arg_template);
3827
30535c16
LP
3828 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3829 if (r == -EBUSY) {
3830 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3831 goto finish;
3832 }
3833 if (r < 0) {
3834 r = log_error_errno(r, "Failed to create image lock: %m");
3835 goto finish;
3836 }
3837
1b9e5b12 3838 if (!mkdtemp(template)) {
56f64d95 3839 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3840 r = -errno;
6b9132a9 3841 goto finish;
1b9e5b12 3842 }
6b9132a9 3843
1b9e5b12
LP
3844 arg_directory = strdup(template);
3845 if (!arg_directory) {
3846 r = log_oom();
3847 goto finish;
6b9132a9 3848 }
88213476 3849
1b9e5b12
LP
3850 image_fd = setup_image(&device_path, &loop_nr);
3851 if (image_fd < 0) {
3852 r = image_fd;
842f3b0f
LP
3853 goto finish;
3854 }
1b9e5b12 3855
4d9f07b4
LP
3856 r = dissect_image(image_fd,
3857 &root_device, &root_device_rw,
3858 &home_device, &home_device_rw,
3859 &srv_device, &srv_device_rw,
3860 &secondary);
1b9e5b12
LP
3861 if (r < 0)
3862 goto finish;
842f3b0f 3863 }
842f3b0f 3864
6dac160c
LP
3865 r = determine_uid_shift();
3866 if (r < 0)
3867 goto finish;
3868
9c857b9d
LP
3869 interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
3870
db7feb7e
LP
3871 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3872 if (master < 0) {
ec16945e 3873 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3874 goto finish;
3875 }
3876
611b312b
LP
3877 r = ptsname_malloc(master, &console);
3878 if (r < 0) {
3879 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3880 goto finish;
3881 }
3882
a258bf26 3883 if (unlockpt(master) < 0) {
ec16945e 3884 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3885 goto finish;
3886 }
3887
9c857b9d
LP
3888 if (!arg_quiet)
3889 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3890 arg_machine, arg_image ?: arg_directory);
3891
a258bf26
LP
3892 assert_se(sigemptyset(&mask) == 0);
3893 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3894 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3895
023fb90b
LP
3896 assert_se(sigemptyset(&mask_chld) == 0);
3897 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3898
d87be9b0 3899 for (;;) {
6d0b55c2 3900 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
113cea80 3901 ContainerStatus container_status;
7566e267 3902 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
e866af3a
DH
3903 struct sigaction sa = {
3904 .sa_handler = nop_handler,
3905 .sa_flags = SA_NOCLDSTOP,
3906 };
3907
7566e267 3908 r = barrier_create(&barrier);
a2da110b 3909 if (r < 0) {
da927ba9 3910 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3911 goto finish;
3912 }
3913
6d0b55c2
LP
3914 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3915 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3916 goto finish;
3917 }
3918
3919 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3920 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3921 goto finish;
3922 }
3923
e866af3a
DH
3924 /* Child can be killed before execv(), so handle SIGCHLD
3925 * in order to interrupt parent's blocking calls and
3926 * give it a chance to call wait() and terminate. */
3927 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3928 if (r < 0) {
ec16945e 3929 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3930 goto finish;
3931 }
3932
e866af3a
DH
3933 r = sigaction(SIGCHLD, &sa, NULL);
3934 if (r < 0) {
ec16945e 3935 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3936 goto finish;
3937 }
3938
60e1651a
KW
3939 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3940 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3941 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
3942 if (pid < 0) {
3943 if (errno == EINVAL)
ec16945e 3944 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3945 else
ec16945e 3946 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3947
d87be9b0
LP
3948 goto finish;
3949 }
a258bf26 3950
d87be9b0
LP
3951 if (pid == 0) {
3952 /* child */
0cb9fbcd 3953 _cleanup_free_ char *home = NULL;
5674767e 3954 unsigned n_env = 2;
d87be9b0 3955 const char *envp[] = {
e10a55fd 3956 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
3957 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3958 NULL, /* TERM */
3959 NULL, /* HOME */
3960 NULL, /* USER */
3961 NULL, /* LOGNAME */
3962 NULL, /* container_uuid */
842f3b0f
LP
3963 NULL, /* LISTEN_FDS */
3964 NULL, /* LISTEN_PID */
d87be9b0
LP
3965 NULL
3966 };
f4889f65 3967 char **env_use;
a258bf26 3968
a2da110b
DH
3969 barrier_set_role(&barrier, BARRIER_CHILD);
3970
5674767e
ZJS
3971 envp[n_env] = strv_find_prefix(environ, "TERM=");
3972 if (envp[n_env])
3973 n_env ++;
a258bf26 3974
03e334a1 3975 master = safe_close(master);
a258bf26 3976
03e334a1 3977 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3978 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
a258bf26 3979
d87be9b0 3980 reset_all_signal_handlers();
1b6d7fa7 3981 reset_signal_mask();
f5c1b9ee 3982
9c857b9d
LP
3983 if (interactive) {
3984 close_nointr(STDIN_FILENO);
3985 close_nointr(STDOUT_FILENO);
3986 close_nointr(STDERR_FILENO);
842f3b0f 3987
9c857b9d
LP
3988 r = open_terminal(console, O_RDWR);
3989 if (r != STDIN_FILENO) {
3990 if (r >= 0) {
3991 safe_close(r);
3992 r = -EINVAL;
3993 }
842f3b0f 3994
9c857b9d
LP
3995 log_error_errno(r, "Failed to open console: %m");
3996 _exit(EXIT_FAILURE);
3997 }
3998
3999 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4000 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
4001 log_error_errno(errno, "Failed to duplicate console: %m");
4002 _exit(EXIT_FAILURE);
4003 }
842f3b0f 4004 }
bc2f673e 4005
d87be9b0 4006 if (setsid() < 0) {
56f64d95 4007 log_error_errno(errno, "setsid() failed: %m");
a2da110b 4008 _exit(EXIT_FAILURE);
bc2f673e
LP
4009 }
4010
db999e0f 4011 if (reset_audit_loginuid() < 0)
a2da110b 4012 _exit(EXIT_FAILURE);
db999e0f 4013
d87be9b0 4014 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
56f64d95 4015 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
a2da110b 4016 _exit(EXIT_FAILURE);
d87be9b0 4017 }
e58a1277 4018
6dac160c
LP
4019 if (arg_private_network)
4020 loopback_setup();
4021
d87be9b0
LP
4022 /* Mark everything as slave, so that we still
4023 * receive mounts from the real root, but don't
4024 * propagate mounts to the real root. */
4025 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
56f64d95 4026 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
a2da110b 4027 _exit(EXIT_FAILURE);
d87be9b0 4028 }
04bc4a3f 4029
727fd4fd
LP
4030 if (mount_devices(arg_directory,
4031 root_device, root_device_rw,
4032 home_device, home_device_rw,
4033 srv_device, srv_device_rw) < 0)
a2da110b 4034 _exit(EXIT_FAILURE);
1b9e5b12 4035
d87be9b0 4036 /* Turn directory into bind mount */
4543768d 4037 if (mount(arg_directory, arg_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
56f64d95 4038 log_error_errno(errno, "Failed to make bind mount: %m");
a2da110b 4039 _exit(EXIT_FAILURE);
d87be9b0 4040 }
88213476 4041
4d9f07b4
LP
4042 r = setup_volatile(arg_directory);
4043 if (r < 0)
a2da110b 4044 _exit(EXIT_FAILURE);
4d9f07b4
LP
4045
4046 if (setup_volatile_state(arg_directory) < 0)
a2da110b 4047 _exit(EXIT_FAILURE);
4d9f07b4
LP
4048
4049 r = base_filesystem_create(arg_directory);
4050 if (r < 0)
a2da110b 4051 _exit(EXIT_FAILURE);
4d9f07b4 4052
d6797c92 4053 if (arg_read_only) {
ec16945e
LP
4054 r = bind_remount_recursive(arg_directory, true);
4055 if (r < 0) {
4056 log_error_errno(r, "Failed to make tree read-only: %m");
a2da110b 4057 _exit(EXIT_FAILURE);
d87be9b0 4058 }
d6797c92 4059 }
2547bb41 4060
d87be9b0 4061 if (mount_all(arg_directory) < 0)
a2da110b 4062 _exit(EXIT_FAILURE);
57fb9fb5 4063
d87be9b0 4064 if (copy_devnodes(arg_directory) < 0)
a2da110b 4065 _exit(EXIT_FAILURE);
a258bf26 4066
f2d88580 4067 if (setup_ptmx(arg_directory) < 0)
a2da110b 4068 _exit(EXIT_FAILURE);
f2d88580 4069
d87be9b0 4070 dev_setup(arg_directory);
88213476 4071
785890ac
LP
4072 if (setup_propagate(arg_directory) < 0)
4073 _exit(EXIT_FAILURE);
4074
28650077 4075 if (setup_seccomp() < 0)
a2da110b 4076 _exit(EXIT_FAILURE);
24fb1112 4077
d87be9b0 4078 if (setup_dev_console(arg_directory, console) < 0)
a2da110b 4079 _exit(EXIT_FAILURE);
88213476 4080
d87be9b0 4081 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
a2da110b 4082 _exit(EXIT_FAILURE);
03e334a1 4083 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 4084
6d0b55c2
LP
4085 if (send_rtnl(rtnl_socket_pair[1]) < 0)
4086 _exit(EXIT_FAILURE);
4087 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4088
b12afc8c
LP
4089 /* Tell the parent that we are ready, and that
4090 * it can cgroupify us so that we lack access
4091 * to certain devices and resources. */
6dac160c 4092 (void) barrier_place(&barrier); /* #1 */
b12afc8c 4093
d87be9b0 4094 if (setup_boot_id(arg_directory) < 0)
a2da110b 4095 _exit(EXIT_FAILURE);
a41fe3a2 4096
d87be9b0 4097 if (setup_timezone(arg_directory) < 0)
a2da110b 4098 _exit(EXIT_FAILURE);
88213476 4099
d87be9b0 4100 if (setup_resolv_conf(arg_directory) < 0)
a2da110b 4101 _exit(EXIT_FAILURE);
687d0825 4102
d87be9b0 4103 if (setup_journal(arg_directory) < 0)
a2da110b 4104 _exit(EXIT_FAILURE);
687d0825 4105
d6797c92 4106 if (mount_binds(arg_directory, arg_bind, false) < 0)
a2da110b 4107 _exit(EXIT_FAILURE);
17fe0523 4108
d6797c92 4109 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
a2da110b 4110 _exit(EXIT_FAILURE);
17fe0523 4111
06c17c39 4112 if (mount_tmpfs(arg_directory) < 0)
a2da110b 4113 _exit(EXIT_FAILURE);
06c17c39 4114
b12afc8c
LP
4115 /* Wait until we are cgroup-ified, so that we
4116 * can mount the right cgroup path writable */
6dac160c 4117 (void) barrier_place_and_sync(&barrier); /* #2 */
b12afc8c
LP
4118
4119 if (mount_cgroup(arg_directory) < 0)
4120 _exit(EXIT_FAILURE);
d96c1ecf 4121
d87be9b0 4122 if (chdir(arg_directory) < 0) {
56f64d95 4123 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
a2da110b 4124 _exit(EXIT_FAILURE);
687d0825
MV
4125 }
4126
d87be9b0 4127 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
56f64d95 4128 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
a2da110b 4129 _exit(EXIT_FAILURE);
687d0825
MV
4130 }
4131
d87be9b0 4132 if (chroot(".") < 0) {
56f64d95 4133 log_error_errno(errno, "chroot() failed: %m");
a2da110b 4134 _exit(EXIT_FAILURE);
687d0825
MV
4135 }
4136
d87be9b0 4137 if (chdir("/") < 0) {
56f64d95 4138 log_error_errno(errno, "chdir() failed: %m");
a2da110b 4139 _exit(EXIT_FAILURE);
687d0825
MV
4140 }
4141
6dac160c
LP
4142 if (arg_userns) {
4143 if (unshare(CLONE_NEWUSER) < 0) {
4144 log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4145 _exit(EXIT_FAILURE);
4146 }
d87be9b0 4147
6dac160c
LP
4148 /* Tell the parent, that it now can
4149 * write the UID map. */
4150 (void) barrier_place(&barrier); /* #3 */
4151
4152 /* Wait until the parent wrote the UID
4153 * map */
4154 (void) barrier_place_and_sync(&barrier); /* #4 */
4155 }
4156
4157 umask(0022);
d87be9b0
LP
4158
4159 if (drop_capabilities() < 0) {
56f64d95 4160 log_error_errno(errno, "drop_capabilities() failed: %m");
a2da110b 4161 _exit(EXIT_FAILURE);
687d0825 4162 }
687d0825 4163
6dac160c
LP
4164 setup_hostname();
4165
4166 if (arg_personality != 0xffffffffLU) {
4167 if (personality(arg_personality) < 0) {
4168 log_error_errno(errno, "personality() failed: %m");
4169 _exit(EXIT_FAILURE);
4170 }
4171 } else if (secondary) {
4172 if (personality(PER_LINUX32) < 0) {
4173 log_error_errno(errno, "personality() failed: %m");
4174 _exit(EXIT_FAILURE);
4175 }
4176 }
4177
4178#ifdef HAVE_SELINUX
4179 if (arg_selinux_context)
4180 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4181 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4182 _exit(EXIT_FAILURE);
4183 }
4184#endif
4185
0cb9fbcd
LP
4186 r = change_uid_gid(&home);
4187 if (r < 0)
a2da110b 4188 _exit(EXIT_FAILURE);
d87be9b0 4189
842f3b0f
LP
4190 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4191 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4192 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 4193 log_oom();
a2da110b 4194 _exit(EXIT_FAILURE);
144f0fc0 4195 }
687d0825 4196
9444b1f2 4197 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
4198 char as_uuid[37];
4199
4200 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f 4201 log_oom();
a2da110b 4202 _exit(EXIT_FAILURE);
842f3b0f
LP
4203 }
4204 }
4205
4206 if (fdset_size(fds) > 0) {
ec16945e
LP
4207 r = fdset_cloexec(fds, false);
4208 if (r < 0) {
4209 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
a2da110b 4210 _exit(EXIT_FAILURE);
842f3b0f
LP
4211 }
4212
4213 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 4214 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0 4215 log_oom();
a2da110b 4216 _exit(EXIT_FAILURE);
d87be9b0
LP
4217 }
4218 }
4219
f4889f65
LP
4220 if (!strv_isempty(arg_setenv)) {
4221 char **n;
4222
4223 n = strv_env_merge(2, envp, arg_setenv);
4224 if (!n) {
4225 log_oom();
a2da110b 4226 _exit(EXIT_FAILURE);
f4889f65
LP
4227 }
4228
4229 env_use = n;
4230 } else
4231 env_use = (char**) envp;
4232
6dac160c
LP
4233 /* Let the parent know that we are ready and
4234 * wait until the parent is ready with the
4235 * setup, too... */
4236 (void) barrier_place_and_sync(&barrier); /* #5 */
d96c1ecf 4237
d87be9b0
LP
4238 if (arg_boot) {
4239 char **a;
4240 size_t l;
88213476 4241
d87be9b0 4242 /* Automatically search for the init system */
0f0dbc46 4243
d87be9b0
LP
4244 l = 1 + argc - optind;
4245 a = newa(char*, l + 1);
4246 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 4247
d87be9b0 4248 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 4249 execve(a[0], a, env_use);
0f0dbc46 4250
d87be9b0 4251 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 4252 execve(a[0], a, env_use);
0f0dbc46 4253
d87be9b0 4254 a[0] = (char*) "/sbin/init";
f4889f65 4255 execve(a[0], a, env_use);
d87be9b0 4256 } else if (argc > optind)
f4889f65 4257 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
4258 else {
4259 chdir(home ? home : "/root");
f4889f65 4260 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 4261 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
4262 }
4263
56f64d95 4264 log_error_errno(errno, "execv() failed: %m");
d87be9b0 4265 _exit(EXIT_FAILURE);
da5b3bad 4266 }
88213476 4267
a2da110b 4268 barrier_set_role(&barrier, BARRIER_PARENT);
842f3b0f
LP
4269 fdset_free(fds);
4270 fds = NULL;
4271
6d0b55c2
LP
4272 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4273 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4274
6dac160c
LP
4275 (void) barrier_place(&barrier); /* #1 */
4276
b12afc8c
LP
4277 /* Wait for the most basic Child-setup to be done,
4278 * before we add hardware to it, and place it in a
4279 * cgroup. */
6dac160c 4280 if (barrier_sync(&barrier)) { /* #1 */
5aa4bb6b 4281 int ifi = 0;
354bfd2b 4282
840295fc
LP
4283 r = move_network_interfaces(pid);
4284 if (r < 0)
4285 goto finish;
aa28aefe 4286
5aa4bb6b 4287 r = setup_veth(pid, veth_name, &ifi);
840295fc
LP
4288 if (r < 0)
4289 goto finish;
ab046dde 4290
5aa4bb6b 4291 r = setup_bridge(veth_name, &ifi);
840295fc
LP
4292 if (r < 0)
4293 goto finish;
ab046dde 4294
840295fc
LP
4295 r = setup_macvlan(pid);
4296 if (r < 0)
4297 goto finish;
c74e630d 4298
4bbfe7ad
TG
4299 r = setup_ipvlan(pid);
4300 if (r < 0)
4301 goto finish;
4302
5aa4bb6b
LP
4303 r = register_machine(pid, ifi);
4304 if (r < 0)
4305 goto finish;
4306
6dac160c
LP
4307 /* Notify the child that the parent is ready with all
4308 * its setup, and that the child can now hand over
4309 * control to the code to run inside the container. */
4310 (void) barrier_place(&barrier); /* #2 */
4311
4312 if (arg_userns) {
4313 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4314
4315 (void) barrier_place_and_sync(&barrier); /* #3 */
4316
4317 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4318 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4319 r = write_string_file(uid_map, line);
4320 if (r < 0) {
4321 log_error_errno(r, "Failed to write UID map: %m");
4322 goto finish;
4323 }
4324
4325 /* We always assign the same UID and GID ranges */
4326 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4327 r = write_string_file(uid_map, line);
4328 if (r < 0) {
4329 log_error_errno(r, "Failed to write GID map: %m");
4330 goto finish;
4331 }
4332
4333 (void) barrier_place(&barrier); /* #4 */
4334 }
4335
840295fc
LP
4336 /* Block SIGCHLD here, before notifying child.
4337 * process_pty() will handle it with the other signals. */
4338 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4339 if (r < 0)
4340 goto finish;
e866af3a 4341
840295fc
LP
4342 /* Reset signal to default */
4343 r = default_signals(SIGCHLD, -1);
4344 if (r < 0)
4345 goto finish;
e866af3a 4346
6dac160c
LP
4347 /* Let the child know that we are ready and wait until the child is completely ready, too. */
4348 if (barrier_place_and_sync(&barrier)) { /* #5 */
6d0b55c2
LP
4349 _cleanup_event_unref_ sd_event *event = NULL;
4350 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4351 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4352 char last_char = 0;
b12afc8c 4353
733d15ac
LP
4354 sd_notifyf(false,
4355 "READY=1\n"
4356 "STATUS=Container running.\n"
4357 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 4358
6d0b55c2
LP
4359 r = sd_event_new(&event);
4360 if (r < 0) {
4361 log_error_errno(r, "Failed to get default event source: %m");
4362 goto finish;
4363 }
88213476 4364
c6c8f6e2 4365 if (arg_kill_signal > 0) {
6d0b55c2
LP
4366 /* Try to kill the init system on SIGINT or SIGTERM */
4367 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4368 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4369 } else {
4370 /* Immediately exit */
4371 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4372 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4373 }
023fb90b 4374
6d0b55c2
LP
4375 /* simply exit on sigchld */
4376 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 4377
6d0b55c2
LP
4378 if (arg_expose_ports) {
4379 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4380 if (r < 0)
4381 goto finish;
023fb90b 4382
6d0b55c2
LP
4383 (void) expose_ports(rtnl, &exposed);
4384 }
023fb90b 4385
6d0b55c2 4386 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 4387
9c857b9d 4388 r = pty_forward_new(event, master, true, !interactive, &forward);
6d0b55c2
LP
4389 if (r < 0) {
4390 log_error_errno(r, "Failed to create PTY forwarder: %m");
4391 goto finish;
4392 }
023fb90b 4393
6d0b55c2
LP
4394 r = sd_event_loop(event);
4395 if (r < 0) {
4396 log_error_errno(r, "Failed to run event loop: %m");
4397 goto finish;
4398 }
4399
4400 pty_forward_get_last_char(forward, &last_char);
4401
4402 forward = pty_forward_free(forward);
4403
4404 if (!arg_quiet && last_char != '\n')
4405 putc('\n', stdout);
04d39279 4406
6d0b55c2
LP
4407 /* Kill if it is not dead yet anyway */
4408 terminate_machine(pid);
4409 }
840295fc 4410 }
1f0cd86b 4411
840295fc 4412 /* Normally redundant, but better safe than sorry */
04d39279 4413 kill(pid, SIGKILL);
a258bf26 4414
113cea80 4415 r = wait_for_container(pid, &container_status);
04d39279
LP
4416 pid = 0;
4417
ec16945e 4418 if (r < 0)
ce9f1527
LP
4419 /* We failed to wait for the container, or the
4420 * container exited abnormally */
ec16945e
LP
4421 goto finish;
4422 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
4423 /* The container exited with a non-zero
4424 * status, or with zero status and no reboot
4425 * was requested. */
ec16945e 4426 ret = r;
d87be9b0 4427 break;
ec16945e 4428 }
88213476 4429
113cea80 4430 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
4431
4432 if (arg_keep_unit) {
4433 /* Special handling if we are running as a
4434 * service: instead of simply restarting the
4435 * machine we want to restart the entire
4436 * service, so let's inform systemd about this
4437 * with the special exit code 133. The service
4438 * file uses RestartForceExitStatus=133 so
4439 * that this results in a full nspawn
4440 * restart. This is necessary since we might
4441 * have cgroup parameters set we want to have
4442 * flushed out. */
ec16945e
LP
4443 ret = 133;
4444 r = 0;
ce38dbc8
LP
4445 break;
4446 }
6d0b55c2
LP
4447
4448 flush_ports(&exposed);
d87be9b0 4449 }
88213476
LP
4450
4451finish:
af4ec430
LP
4452 sd_notify(false,
4453 "STOPPING=1\n"
4454 "STATUS=Terminating...");
4455
1b9e5b12
LP
4456 loop_remove(loop_nr, &image_fd);
4457
9444b1f2
LP
4458 if (pid > 0)
4459 kill(pid, SIGKILL);
88213476 4460
ec16945e
LP
4461 if (remove_subvol && arg_directory) {
4462 int k;
4463
d9e2daaf 4464 k = btrfs_subvol_remove(arg_directory, true);
ec16945e
LP
4465 if (k < 0)
4466 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4467 }
4468
785890ac
LP
4469 if (arg_machine) {
4470 const char *p;
4471
63c372cb 4472 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4473 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4474 }
4475
04d391da 4476 free(arg_directory);
ec16945e
LP
4477 free(arg_template);
4478 free(arg_image);
7027ff61 4479 free(arg_machine);
c74e630d
LP
4480 free(arg_user);
4481 strv_free(arg_setenv);
4482 strv_free(arg_network_interfaces);
4483 strv_free(arg_network_macvlan);
4bbfe7ad 4484 strv_free(arg_network_ipvlan);
c74e630d
LP
4485 strv_free(arg_bind);
4486 strv_free(arg_bind_ro);
06c17c39 4487 strv_free(arg_tmpfs);
88213476 4488
6d0b55c2
LP
4489 flush_ports(&exposed);
4490
4491 while (arg_expose_ports) {
4492 ExposePort *p = arg_expose_ports;
4493 LIST_REMOVE(ports, arg_expose_ports, p);
4494 free(p);
4495 }
4496
ec16945e 4497 return r < 0 ? EXIT_FAILURE : ret;
88213476 4498}