]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
util: rework rm_rf() logic
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
88213476 26#include <sys/mount.h>
88213476
LP
27#include <stdlib.h>
28#include <string.h>
29#include <stdio.h>
30#include <errno.h>
31#include <sys/prctl.h>
88213476 32#include <getopt.h>
687d0825 33#include <grp.h>
5ed27dbd 34#include <linux/fs.h>
9537eab0 35#include <sys/socket.h>
aea38d80 36#include <linux/netlink.h>
aa28aefe 37#include <net/if.h>
69c79d3c 38#include <linux/veth.h>
6afc95b7 39#include <sys/personality.h>
1b9e5b12 40#include <linux/loop.h>
2fbe4296 41#include <sys/file.h>
aa28aefe 42
5d63309c 43#ifdef HAVE_SELINUX
a8828ed9
DW
44#include <selinux/selinux.h>
45#endif
88213476 46
24fb1112
LP
47#ifdef HAVE_SECCOMP
48#include <seccomp.h>
49#endif
50
1b9e5b12
LP
51#ifdef HAVE_BLKID
52#include <blkid/blkid.h>
53#endif
54
1f0cd86b
LP
55#include "sd-daemon.h"
56#include "sd-bus.h"
57#include "sd-id128.h"
aa28aefe 58#include "sd-rtnl.h"
88213476
LP
59#include "log.h"
60#include "util.h"
49e942b2 61#include "mkdir.h"
c6878637 62#include "rm-rf.h"
6b2d0e85 63#include "macro.h"
94d82985 64#include "missing.h"
04d391da 65#include "cgroup-util.h"
a258bf26 66#include "strv.h"
9eb977db 67#include "path-util.h"
a41fe3a2 68#include "loopback-setup.h"
4fc9982c 69#include "dev-setup.h"
842f3b0f 70#include "fdset.h"
acbeb427 71#include "build.h"
a5c32cff 72#include "fileio.h"
40ca29a1 73#include "bus-util.h"
1f0cd86b 74#include "bus-error.h"
4ba93280 75#include "ptyfwd.h"
f4889f65 76#include "env-util.h"
aa28aefe 77#include "rtnl-util.h"
7e227024 78#include "udev-util.h"
1b9e5b12
LP
79#include "blkid-util.h"
80#include "gpt.h"
01dde061 81#include "siphash24.h"
849958d1 82#include "copy.h"
3577de7a 83#include "base-filesystem.h"
a2da110b 84#include "barrier.h"
023fb90b 85#include "event-util.h"
f01ae826 86#include "capability.h"
2822da4f 87#include "cap-list.h"
ec16945e 88#include "btrfs-util.h"
1b9cebf6 89#include "machine-image.h"
6d0b55c2
LP
90#include "list.h"
91#include "in-addr-util.h"
92#include "fw-util.h"
93#include "local-addresses.h"
f2d88580 94
e9642be2
LP
95#ifdef HAVE_SECCOMP
96#include "seccomp-util.h"
97#endif
98
6d0b55c2
LP
99typedef struct ExposePort {
100 int protocol;
101 uint16_t host_port;
102 uint16_t container_port;
103 LIST_FIELDS(struct ExposePort, ports);
104} ExposePort;
105
113cea80
DH
/* Outcome of a container run.
 * NOTE(review): presumably distinguishes "exited for good" from "asked to be
 * restarted"; the consumer is outside this part of the file — confirm. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
110
57fb9fb5
LP
/* Journal linking mode selected with --link-journal= / -j. */
typedef enum LinkJournal {
        LINK_NO    = 0,         /* --link-journal=no */
        LINK_AUTO  = 1,         /* --link-journal=auto (the default) */
        LINK_HOST  = 2,         /* --link-journal=host / try-host */
        LINK_GUEST = 3          /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
88213476 117
4d9f07b4
LP
/* Volatile mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,     /* default, or --volatile=<false> */
        VOLATILE_YES   = 1,     /* --volatile without argument, or --volatile=<true> */
        VOLATILE_STATE = 2,     /* --volatile=state */
} Volatile;
123
88213476 124static char *arg_directory = NULL;
ec16945e 125static char *arg_template = NULL;
687d0825 126static char *arg_user = NULL;
9444b1f2 127static sd_id128_t arg_uuid = {};
7027ff61 128static char *arg_machine = NULL;
c74e630d
LP
129static const char *arg_selinux_context = NULL;
130static const char *arg_selinux_apifs_context = NULL;
9444b1f2 131static const char *arg_slice = NULL;
ff01d048 132static bool arg_private_network = false;
bc2f673e 133static bool arg_read_only = false;
0f0dbc46 134static bool arg_boot = false;
ec16945e 135static bool arg_ephemeral = false;
57fb9fb5 136static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 137static bool arg_link_journal_try = false;
5076f0cc
LP
138static uint64_t arg_retain =
139 (1ULL << CAP_CHOWN) |
140 (1ULL << CAP_DAC_OVERRIDE) |
141 (1ULL << CAP_DAC_READ_SEARCH) |
142 (1ULL << CAP_FOWNER) |
143 (1ULL << CAP_FSETID) |
144 (1ULL << CAP_IPC_OWNER) |
145 (1ULL << CAP_KILL) |
146 (1ULL << CAP_LEASE) |
147 (1ULL << CAP_LINUX_IMMUTABLE) |
148 (1ULL << CAP_NET_BIND_SERVICE) |
149 (1ULL << CAP_NET_BROADCAST) |
150 (1ULL << CAP_NET_RAW) |
151 (1ULL << CAP_SETGID) |
152 (1ULL << CAP_SETFCAP) |
153 (1ULL << CAP_SETPCAP) |
154 (1ULL << CAP_SETUID) |
155 (1ULL << CAP_SYS_ADMIN) |
156 (1ULL << CAP_SYS_CHROOT) |
157 (1ULL << CAP_SYS_NICE) |
158 (1ULL << CAP_SYS_PTRACE) |
159 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 160 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
161 (1ULL << CAP_SYS_BOOT) |
162 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
163 (1ULL << CAP_AUDIT_CONTROL) |
164 (1ULL << CAP_MKNOD);
17fe0523
LP
165static char **arg_bind = NULL;
166static char **arg_bind_ro = NULL;
06c17c39 167static char **arg_tmpfs = NULL;
f4889f65 168static char **arg_setenv = NULL;
284c0b91 169static bool arg_quiet = false;
8a96d94e 170static bool arg_share_system = false;
eb91eb18 171static bool arg_register = true;
89f7c846 172static bool arg_keep_unit = false;
aa28aefe 173static char **arg_network_interfaces = NULL;
c74e630d 174static char **arg_network_macvlan = NULL;
4bbfe7ad 175static char **arg_network_ipvlan = NULL;
69c79d3c 176static bool arg_network_veth = false;
c74e630d 177static const char *arg_network_bridge = NULL;
6afc95b7 178static unsigned long arg_personality = 0xffffffffLU;
ec16945e 179static char *arg_image = NULL;
4d9f07b4 180static Volatile arg_volatile = VOLATILE_NO;
6d0b55c2 181static ExposePort *arg_expose_ports = NULL;
f36933fe 182static char **arg_property = NULL;
6dac160c
LP
183static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
184static bool arg_userns = false;
c6c8f6e2 185static int arg_kill_signal = 0;
88213476 186
601185b4 187static void help(void) {
88213476
LP
188 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
189 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
190 " -h --help Show this help\n"
191 " --version Print version string\n"
69c79d3c 192 " -q --quiet Do not show status information\n"
1b9e5b12 193 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
194 " --template=PATH Initialize root directory from template directory,\n"
195 " if missing\n"
196 " -x --ephemeral Run container with snapshot of root directory, and\n"
197 " remove it after exit\n"
198 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
199 " -b --boot Boot up full system (i.e. invoke init)\n"
200 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 201 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 202 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 203 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 204 " --property=NAME=VALUE Set scope unit property\n"
69c79d3c
LP
205 " --private-network Disable network in container\n"
206 " --network-interface=INTERFACE\n"
207 " Assign an existing network interface to the\n"
208 " container\n"
c74e630d
LP
209 " --network-macvlan=INTERFACE\n"
210 " Create a macvlan network interface based on an\n"
211 " existing network interface to the container\n"
4bbfe7ad
TG
212 " --network-ipvlan=INTERFACE\n"
213 " Create a ipvlan network interface based on an\n"
214 " existing network interface to the container\n"
0dfaa006 215 " -n --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 216 " and container\n"
ab046dde 217 " --network-bridge=INTERFACE\n"
32457153 218 " Add a virtual ethernet connection between host\n"
ab046dde
TG
219 " and container and add it to an existing bridge on\n"
220 " the host\n"
6dac160c
LP
221 " --private-users[=UIDBASE[:NUIDS]]\n"
222 " Run within user namespace\n"
6d0b55c2 223 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 224 " Expose a container IP port on the host\n"
82adf6af
LP
225 " -Z --selinux-context=SECLABEL\n"
226 " Set the SELinux security context to be used by\n"
227 " processes in the container\n"
228 " -L --selinux-apifs-context=SECLABEL\n"
229 " Set the SELinux security context to be used by\n"
230 " API/tmpfs file systems in the container\n"
a8828ed9
DW
231 " --capability=CAP In addition to the default, retain specified\n"
232 " capability\n"
233 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 234 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
574edc90
MP
235 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
236 " try-guest, try-host\n"
237 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 238 " --read-only Mount the root directory read-only\n"
a8828ed9
DW
239 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
240 " the container\n"
241 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
06c17c39 242 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
284c0b91 243 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 244 " --share-system Share system namespaces with host\n"
eb91eb18 245 " --register=BOOLEAN Register container as machine\n"
89f7c846 246 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 247 " the service unit nspawn is running in\n"
6d0b55c2
LP
248 " --volatile[=MODE] Run the system in volatile mode\n"
249 , program_invocation_short_name);
88213476
LP
250}
251
ec16945e
LP
/* Replaces *b with a cleaned-up, absolute version of "path".
 *
 * The path is canonicalized if it exists. If it does not exist (ENOENT) it
 * is merely made absolute relative to the current working directory, so that
 * not-yet-existing paths are accepted too. The previous value of *b is
 * freed.
 *
 * Returns 0 on success, -ENOMEM if the absolute path cannot be allocated, or
 * the negated errno left by canonicalize_file_name() for any failure other
 * than ENOENT. On failure *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Does not exist (yet): settle for an absolute path */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
272
88213476
LP
273static int parse_argv(int argc, char *argv[]) {
274
a41fe3a2 275 enum {
acbeb427
ZJS
276 ARG_VERSION = 0x100,
277 ARG_PRIVATE_NETWORK,
bc2f673e 278 ARG_UUID,
5076f0cc 279 ARG_READ_ONLY,
57fb9fb5 280 ARG_CAPABILITY,
420c7379 281 ARG_DROP_CAPABILITY,
17fe0523
LP
282 ARG_LINK_JOURNAL,
283 ARG_BIND,
f4889f65 284 ARG_BIND_RO,
06c17c39 285 ARG_TMPFS,
f4889f65 286 ARG_SETENV,
eb91eb18 287 ARG_SHARE_SYSTEM,
89f7c846 288 ARG_REGISTER,
aa28aefe 289 ARG_KEEP_UNIT,
69c79d3c 290 ARG_NETWORK_INTERFACE,
c74e630d 291 ARG_NETWORK_MACVLAN,
4bbfe7ad 292 ARG_NETWORK_IPVLAN,
ab046dde 293 ARG_NETWORK_BRIDGE,
6afc95b7 294 ARG_PERSONALITY,
4d9f07b4 295 ARG_VOLATILE,
ec16945e 296 ARG_TEMPLATE,
f36933fe 297 ARG_PROPERTY,
6dac160c 298 ARG_PRIVATE_USERS,
c6c8f6e2 299 ARG_KILL_SIGNAL,
a41fe3a2
LP
300 };
301
88213476 302 static const struct option options[] = {
aa28aefe
LP
303 { "help", no_argument, NULL, 'h' },
304 { "version", no_argument, NULL, ARG_VERSION },
305 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
306 { "template", required_argument, NULL, ARG_TEMPLATE },
307 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
308 { "user", required_argument, NULL, 'u' },
309 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
310 { "boot", no_argument, NULL, 'b' },
311 { "uuid", required_argument, NULL, ARG_UUID },
312 { "read-only", no_argument, NULL, ARG_READ_ONLY },
313 { "capability", required_argument, NULL, ARG_CAPABILITY },
314 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
315 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
316 { "bind", required_argument, NULL, ARG_BIND },
317 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 318 { "tmpfs", required_argument, NULL, ARG_TMPFS },
aa28aefe
LP
319 { "machine", required_argument, NULL, 'M' },
320 { "slice", required_argument, NULL, 'S' },
321 { "setenv", required_argument, NULL, ARG_SETENV },
322 { "selinux-context", required_argument, NULL, 'Z' },
323 { "selinux-apifs-context", required_argument, NULL, 'L' },
324 { "quiet", no_argument, NULL, 'q' },
325 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
326 { "register", required_argument, NULL, ARG_REGISTER },
327 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
328 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 329 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 330 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 331 { "network-veth", no_argument, NULL, 'n' },
ab046dde 332 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 333 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 334 { "image", required_argument, NULL, 'i' },
4d9f07b4 335 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 336 { "port", required_argument, NULL, 'p' },
f36933fe 337 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 338 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 339 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
eb9da376 340 {}
88213476
LP
341 };
342
9444b1f2 343 int c, r;
a42c8b54 344 uint64_t plus = 0, minus = 0;
88213476
LP
345
346 assert(argc >= 0);
347 assert(argv);
348
0dfaa006 349 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
350
351 switch (c) {
352
353 case 'h':
601185b4
ZJS
354 help();
355 return 0;
88213476 356
acbeb427
ZJS
357 case ARG_VERSION:
358 puts(PACKAGE_STRING);
359 puts(SYSTEMD_FEATURES);
360 return 0;
361
88213476 362 case 'D':
ec16945e
LP
363 r = set_sanitized_path(&arg_directory, optarg);
364 if (r < 0)
365 return log_error_errno(r, "Invalid root directory: %m");
366
367 break;
368
369 case ARG_TEMPLATE:
370 r = set_sanitized_path(&arg_template, optarg);
371 if (r < 0)
372 return log_error_errno(r, "Invalid template directory: %m");
88213476
LP
373
374 break;
375
1b9e5b12 376 case 'i':
ec16945e
LP
377 r = set_sanitized_path(&arg_image, optarg);
378 if (r < 0)
379 return log_error_errno(r, "Invalid image path: %m");
380
381 break;
382
383 case 'x':
384 arg_ephemeral = true;
1b9e5b12
LP
385 break;
386
687d0825
MV
387 case 'u':
388 free(arg_user);
7027ff61
LP
389 arg_user = strdup(optarg);
390 if (!arg_user)
391 return log_oom();
687d0825
MV
392
393 break;
394
ab046dde 395 case ARG_NETWORK_BRIDGE:
c74e630d 396 arg_network_bridge = optarg;
ab046dde
TG
397
398 /* fall through */
399
0dfaa006 400 case 'n':
69c79d3c
LP
401 arg_network_veth = true;
402 arg_private_network = true;
403 break;
404
aa28aefe 405 case ARG_NETWORK_INTERFACE:
c74e630d
LP
406 if (strv_extend(&arg_network_interfaces, optarg) < 0)
407 return log_oom();
408
409 arg_private_network = true;
410 break;
411
412 case ARG_NETWORK_MACVLAN:
413 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
414 return log_oom();
415
4bbfe7ad
TG
416 arg_private_network = true;
417 break;
418
419 case ARG_NETWORK_IPVLAN:
420 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
421 return log_oom();
422
aa28aefe
LP
423 /* fall through */
424
ff01d048
LP
425 case ARG_PRIVATE_NETWORK:
426 arg_private_network = true;
a41fe3a2
LP
427 break;
428
0f0dbc46
LP
429 case 'b':
430 arg_boot = true;
431 break;
432
144f0fc0 433 case ARG_UUID:
9444b1f2
LP
434 r = sd_id128_from_string(optarg, &arg_uuid);
435 if (r < 0) {
aa96c6cb 436 log_error("Invalid UUID: %s", optarg);
9444b1f2 437 return r;
aa96c6cb 438 }
9444b1f2 439 break;
aa96c6cb 440
9444b1f2 441 case 'S':
c74e630d 442 arg_slice = optarg;
144f0fc0
LP
443 break;
444
7027ff61 445 case 'M':
eb91eb18
LP
446 if (isempty(optarg)) {
447 free(arg_machine);
448 arg_machine = NULL;
449 } else {
0c3c4284 450 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
451 log_error("Invalid machine name: %s", optarg);
452 return -EINVAL;
453 }
7027ff61 454
0c3c4284
LP
455 r = free_and_strdup(&arg_machine, optarg);
456 if (r < 0)
eb91eb18
LP
457 return log_oom();
458
459 break;
460 }
7027ff61 461
82adf6af
LP
462 case 'Z':
463 arg_selinux_context = optarg;
a8828ed9
DW
464 break;
465
82adf6af
LP
466 case 'L':
467 arg_selinux_apifs_context = optarg;
a8828ed9
DW
468 break;
469
bc2f673e
LP
470 case ARG_READ_ONLY:
471 arg_read_only = true;
472 break;
473
420c7379
LP
474 case ARG_CAPABILITY:
475 case ARG_DROP_CAPABILITY: {
a2a5291b 476 const char *state, *word;
5076f0cc
LP
477 size_t length;
478
479 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 480 _cleanup_free_ char *t;
5076f0cc
LP
481
482 t = strndup(word, length);
0d0f0c50
SL
483 if (!t)
484 return log_oom();
5076f0cc 485
39ed67d1
LP
486 if (streq(t, "all")) {
487 if (c == ARG_CAPABILITY)
a42c8b54 488 plus = (uint64_t) -1;
39ed67d1 489 else
a42c8b54 490 minus = (uint64_t) -1;
39ed67d1 491 } else {
2822da4f
LP
492 int cap;
493
494 cap = capability_from_name(t);
495 if (cap < 0) {
39ed67d1
LP
496 log_error("Failed to parse capability %s.", t);
497 return -EINVAL;
498 }
499
500 if (c == ARG_CAPABILITY)
a42c8b54 501 plus |= 1ULL << (uint64_t) cap;
39ed67d1 502 else
a42c8b54 503 minus |= 1ULL << (uint64_t) cap;
5076f0cc 504 }
5076f0cc
LP
505 }
506
507 break;
508 }
509
57fb9fb5
LP
510 case 'j':
511 arg_link_journal = LINK_GUEST;
574edc90 512 arg_link_journal_try = true;
57fb9fb5
LP
513 break;
514
515 case ARG_LINK_JOURNAL:
53e438e3 516 if (streq(optarg, "auto")) {
57fb9fb5 517 arg_link_journal = LINK_AUTO;
53e438e3
LP
518 arg_link_journal_try = false;
519 } else if (streq(optarg, "no")) {
57fb9fb5 520 arg_link_journal = LINK_NO;
53e438e3
LP
521 arg_link_journal_try = false;
522 } else if (streq(optarg, "guest")) {
57fb9fb5 523 arg_link_journal = LINK_GUEST;
53e438e3
LP
524 arg_link_journal_try = false;
525 } else if (streq(optarg, "host")) {
57fb9fb5 526 arg_link_journal = LINK_HOST;
53e438e3
LP
527 arg_link_journal_try = false;
528 } else if (streq(optarg, "try-guest")) {
574edc90
MP
529 arg_link_journal = LINK_GUEST;
530 arg_link_journal_try = true;
531 } else if (streq(optarg, "try-host")) {
532 arg_link_journal = LINK_HOST;
533 arg_link_journal_try = true;
534 } else {
57fb9fb5
LP
535 log_error("Failed to parse link journal mode %s", optarg);
536 return -EINVAL;
537 }
538
539 break;
540
17fe0523
LP
541 case ARG_BIND:
542 case ARG_BIND_RO: {
543 _cleanup_free_ char *a = NULL, *b = NULL;
544 char *e;
545 char ***x;
17fe0523
LP
546
547 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
548
549 e = strchr(optarg, ':');
550 if (e) {
551 a = strndup(optarg, e - optarg);
552 b = strdup(e + 1);
553 } else {
554 a = strdup(optarg);
555 b = strdup(optarg);
556 }
557
558 if (!a || !b)
559 return log_oom();
560
561 if (!path_is_absolute(a) || !path_is_absolute(b)) {
562 log_error("Invalid bind mount specification: %s", optarg);
563 return -EINVAL;
564 }
565
566 r = strv_extend(x, a);
567 if (r < 0)
b3451bed 568 return log_oom();
17fe0523
LP
569
570 r = strv_extend(x, b);
571 if (r < 0)
b3451bed 572 return log_oom();
17fe0523
LP
573
574 break;
575 }
576
06c17c39
LP
577 case ARG_TMPFS: {
578 _cleanup_free_ char *a = NULL, *b = NULL;
579 char *e;
580
581 e = strchr(optarg, ':');
582 if (e) {
583 a = strndup(optarg, e - optarg);
584 b = strdup(e + 1);
585 } else {
586 a = strdup(optarg);
587 b = strdup("mode=0755");
588 }
589
590 if (!a || !b)
591 return log_oom();
592
593 if (!path_is_absolute(a)) {
594 log_error("Invalid tmpfs specification: %s", optarg);
595 return -EINVAL;
596 }
597
598 r = strv_push(&arg_tmpfs, a);
599 if (r < 0)
600 return log_oom();
601
602 a = NULL;
603
604 r = strv_push(&arg_tmpfs, b);
605 if (r < 0)
606 return log_oom();
607
608 b = NULL;
609
610 break;
611 }
612
f4889f65
LP
613 case ARG_SETENV: {
614 char **n;
615
616 if (!env_assignment_is_valid(optarg)) {
617 log_error("Environment variable assignment '%s' is not valid.", optarg);
618 return -EINVAL;
619 }
620
621 n = strv_env_set(arg_setenv, optarg);
622 if (!n)
623 return log_oom();
624
625 strv_free(arg_setenv);
626 arg_setenv = n;
627 break;
628 }
629
284c0b91
LP
630 case 'q':
631 arg_quiet = true;
632 break;
633
8a96d94e
LP
634 case ARG_SHARE_SYSTEM:
635 arg_share_system = true;
636 break;
637
eb91eb18
LP
638 case ARG_REGISTER:
639 r = parse_boolean(optarg);
640 if (r < 0) {
641 log_error("Failed to parse --register= argument: %s", optarg);
642 return r;
643 }
644
645 arg_register = r;
646 break;
647
89f7c846
LP
648 case ARG_KEEP_UNIT:
649 arg_keep_unit = true;
650 break;
651
6afc95b7
LP
652 case ARG_PERSONALITY:
653
ac45f971 654 arg_personality = personality_from_string(optarg);
6afc95b7
LP
655 if (arg_personality == 0xffffffffLU) {
656 log_error("Unknown or unsupported personality '%s'.", optarg);
657 return -EINVAL;
658 }
659
660 break;
661
4d9f07b4
LP
662 case ARG_VOLATILE:
663
664 if (!optarg)
665 arg_volatile = VOLATILE_YES;
666 else {
667 r = parse_boolean(optarg);
668 if (r < 0) {
669 if (streq(optarg, "state"))
670 arg_volatile = VOLATILE_STATE;
671 else {
672 log_error("Failed to parse --volatile= argument: %s", optarg);
673 return r;
674 }
675 } else
676 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
677 }
678
679 break;
680
6d0b55c2
LP
681 case 'p': {
682 const char *split, *e;
683 uint16_t container_port, host_port;
684 int protocol;
685 ExposePort *p;
686
687 if ((e = startswith(optarg, "tcp:")))
688 protocol = IPPROTO_TCP;
689 else if ((e = startswith(optarg, "udp:")))
690 protocol = IPPROTO_UDP;
691 else {
692 e = optarg;
693 protocol = IPPROTO_TCP;
694 }
695
696 split = strchr(e, ':');
697 if (split) {
698 char v[split - e + 1];
699
700 memcpy(v, e, split - e);
701 v[split - e] = 0;
702
703 r = safe_atou16(v, &host_port);
704 if (r < 0 || host_port <= 0) {
705 log_error("Failed to parse host port: %s", optarg);
706 return -EINVAL;
707 }
708
709 r = safe_atou16(split + 1, &container_port);
710 } else {
711 r = safe_atou16(e, &container_port);
712 host_port = container_port;
713 }
714
715 if (r < 0 || container_port <= 0) {
716 log_error("Failed to parse host port: %s", optarg);
717 return -EINVAL;
718 }
719
720 LIST_FOREACH(ports, p, arg_expose_ports) {
721 if (p->protocol == protocol && p->host_port == host_port) {
722 log_error("Duplicate port specification: %s", optarg);
723 return -EINVAL;
724 }
725 }
726
727 p = new(ExposePort, 1);
728 if (!p)
729 return log_oom();
730
731 p->protocol = protocol;
732 p->host_port = host_port;
733 p->container_port = container_port;
734
735 LIST_PREPEND(ports, arg_expose_ports, p);
736
737 break;
738 }
739
f36933fe
LP
740 case ARG_PROPERTY:
741 if (strv_extend(&arg_property, optarg) < 0)
742 return log_oom();
743
744 break;
745
6dac160c
LP
746 case ARG_PRIVATE_USERS:
747 if (optarg) {
748 _cleanup_free_ char *buffer = NULL;
749 const char *range, *shift;
750
751 range = strchr(optarg, ':');
752 if (range) {
753 buffer = strndup(optarg, range - optarg);
754 if (!buffer)
755 return log_oom();
756 shift = buffer;
757
758 range++;
759 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
760 log_error("Failed to parse UID range: %s", range);
761 return -EINVAL;
762 }
763 } else
764 shift = optarg;
765
766 if (parse_uid(shift, &arg_uid_shift) < 0) {
767 log_error("Failed to parse UID: %s", optarg);
768 return -EINVAL;
769 }
770 }
771
772 arg_userns = true;
773 break;
774
c6c8f6e2
LP
775 case ARG_KILL_SIGNAL:
776 arg_kill_signal = signal_from_string_try_harder(optarg);
777 if (arg_kill_signal < 0) {
778 log_error("Cannot parse signal: %s", optarg);
779 return -EINVAL;
780 }
781
782 break;
783
88213476
LP
784 case '?':
785 return -EINVAL;
786
787 default:
eb9da376 788 assert_not_reached("Unhandled option");
88213476 789 }
88213476 790
eb91eb18
LP
791 if (arg_share_system)
792 arg_register = false;
793
794 if (arg_boot && arg_share_system) {
795 log_error("--boot and --share-system may not be combined.");
796 return -EINVAL;
797 }
798
89f7c846
LP
799 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
800 log_error("--keep-unit may not be used when invoked from a user session.");
801 return -EINVAL;
802 }
803
1b9e5b12
LP
804 if (arg_directory && arg_image) {
805 log_error("--directory= and --image= may not be combined.");
806 return -EINVAL;
807 }
808
ec16945e
LP
809 if (arg_template && arg_image) {
810 log_error("--template= and --image= may not be combined.");
811 return -EINVAL;
812 }
813
814 if (arg_template && !(arg_directory || arg_machine)) {
815 log_error("--template= needs --directory= or --machine=.");
816 return -EINVAL;
817 }
818
819 if (arg_ephemeral && arg_template) {
820 log_error("--ephemeral and --template= may not be combined.");
821 return -EINVAL;
822 }
823
824 if (arg_ephemeral && arg_image) {
825 log_error("--ephemeral and --image= may not be combined.");
826 return -EINVAL;
827 }
828
df9a75e4
LP
829 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
830 log_error("--ephemeral and --link-journal= may not be combined.");
831 return -EINVAL;
832 }
833
4d9f07b4
LP
834 if (arg_volatile != VOLATILE_NO && arg_read_only) {
835 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
836 return -EINVAL;
837 }
838
6d0b55c2
LP
839 if (arg_expose_ports && !arg_private_network) {
840 log_error("Cannot use --port= without private networking.");
841 return -EINVAL;
842 }
843
a42c8b54
LP
844 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
845
c6c8f6e2
LP
846 if (arg_boot && arg_kill_signal <= 0)
847 arg_kill_signal = SIGRTMIN+3;
848
88213476
LP
849 return 1;
850}
851
852static int mount_all(const char *dest) {
853
854 typedef struct MountPoint {
855 const char *what;
856 const char *where;
857 const char *type;
858 const char *options;
859 unsigned long flags;
3bd66c05 860 bool fatal;
88213476
LP
861 } MountPoint;
862
863 static const MountPoint mount_table[] = {
06c17c39
LP
864 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
865 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
866 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
867 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
868 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 869 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
06c17c39
LP
870 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
871 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
bbb99c30 872 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
9b634ea5 873#ifdef HAVE_SELINUX
06c17c39
LP
874 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
875 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 876#endif
88213476
LP
877 };
878
879 unsigned k;
880 int r = 0;
881
882 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
d15d65a0 883 _cleanup_free_ char *where = NULL, *options = NULL;
d002827b 884 const char *o;
88213476
LP
885 int t;
886
17fe0523
LP
887 where = strjoin(dest, "/", mount_table[k].where, NULL);
888 if (!where)
889 return log_oom();
88213476 890
e65aec12 891 t = path_is_mount_point(where, true);
68fb0892 892 if (t < 0) {
da927ba9 893 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
88213476
LP
894
895 if (r == 0)
896 r = t;
897
898 continue;
899 }
900
9c1c7f71
LP
901 /* Skip this entry if it is not a remount. */
902 if (mount_table[k].what && t > 0)
014a9c77
LP
903 continue;
904
79d80fc1
TG
905 t = mkdir_p(where, 0755);
906 if (t < 0) {
907 if (mount_table[k].fatal) {
da927ba9 908 log_error_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
909
910 if (r == 0)
911 r = t;
912 } else
da927ba9 913 log_warning_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
914
915 continue;
916 }
88213476 917
a8828ed9 918#ifdef HAVE_SELINUX
82adf6af
LP
919 if (arg_selinux_apifs_context &&
920 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
921 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
922 if (!options)
923 return log_oom();
924
925 o = options;
926 } else
a8828ed9 927#endif
d002827b 928 o = mount_table[k].options;
a8828ed9 929
6dac160c
LP
930 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
931 char *uid_options = NULL;
932
933 if (o)
934 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
935 else
936 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
937 if (!uid_options)
938 return log_oom();
939
940 free(options);
941 o = options = uid_options;
942 }
a8828ed9 943
88213476
LP
944 if (mount(mount_table[k].what,
945 where,
946 mount_table[k].type,
947 mount_table[k].flags,
79d80fc1 948 o) < 0) {
88213476 949
79d80fc1 950 if (mount_table[k].fatal) {
56f64d95 951 log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 952
79d80fc1
TG
953 if (r == 0)
954 r = -errno;
955 } else
56f64d95 956 log_warning_errno(errno, "mount(%s) failed: %m", where);
88213476 957 }
88213476
LP
958 }
959
e58a1277
LP
960 return r;
961}
f8440af5 962
d6797c92 963static int mount_binds(const char *dest, char **l, bool ro) {
17fe0523
LP
964 char **x, **y;
965
966 STRV_FOREACH_PAIR(x, y, l) {
06c17c39 967 _cleanup_free_ char *where = NULL;
d2421337 968 struct stat source_st, dest_st;
2ed4e5e0 969 int r;
d2421337 970
4a62c710
MS
971 if (stat(*x, &source_st) < 0)
972 return log_error_errno(errno, "Failed to stat %s: %m", *x);
17fe0523 973
06c17c39
LP
974 where = strappend(dest, *y);
975 if (!where)
976 return log_oom();
977
2ed4e5e0
SL
978 r = stat(where, &dest_st);
979 if (r == 0) {
05e7da5a
AC
980 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
981 log_error("Cannot bind mount directory %s on file %s.", *x, where);
982 return -EINVAL;
983 }
984 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
985 log_error("Cannot bind mount file %s on directory %s.", *x, where);
d2421337
DR
986 return -EINVAL;
987 }
2ed4e5e0
SL
988 } else if (errno == ENOENT) {
989 r = mkdir_parents_label(where, 0755);
f647962d
MS
990 if (r < 0)
991 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
2ed4e5e0 992 } else {
56f64d95 993 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
2ed4e5e0
SL
994 return -errno;
995 }
06c17c39 996
05e7da5a
AC
997 /* Create the mount point. Any non-directory file can be
998 * mounted on any non-directory file (regular, fifo, socket,
999 * char, block).
1000 */
79d80fc1
TG
1001 if (S_ISDIR(source_st.st_mode)) {
1002 r = mkdir_label(where, 0755);
f647962d
MS
1003 if (r < 0 && errno != EEXIST)
1004 return log_error_errno(r, "Failed to create mount point %s: %m", where);
05e7da5a 1005 } else {
79d80fc1 1006 r = touch(where);
f647962d
MS
1007 if (r < 0)
1008 return log_error_errno(r, "Failed to create mount point %s: %m", where);
d2421337 1009 }
17fe0523 1010
4543768d 1011 if (mount(*x, where, NULL, MS_BIND, NULL) < 0)
4a62c710 1012 return log_error_errno(errno, "mount(%s) failed: %m", where);
17fe0523 1013
d6797c92
LP
1014 if (ro) {
1015 r = bind_remount_recursive(where, true);
f647962d
MS
1016 if (r < 0)
1017 return log_error_errno(r, "Read-Only bind mount failed: %m");
17fe0523
LP
1018 }
1019 }
1020
1021 return 0;
1022}
1023
b12afc8c
LP
1024static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1025 char *to;
1026 int r;
1027
63c372cb 1028 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
b12afc8c
LP
1029
1030 r = path_is_mount_point(to, false);
1031 if (r < 0)
1032 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1033 if (r > 0)
1034 return 0;
1035
1036 mkdir_p(to, 0755);
1037
c0534580
LP
1038 /* The superblock mount options of the mount point need to be
1039 * identical to the hosts', and hence writable... */
1040 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
b12afc8c
LP
1041 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1042
c0534580
LP
1043 /* ... hence let's only make the bind mount read-only, not the
1044 * superblock. */
1045 if (read_only) {
1046 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1047 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1048 }
b12afc8c
LP
1049 return 1;
1050}
1051
1052static int mount_cgroup(const char *dest) {
1053 _cleanup_set_free_free_ Set *controllers = NULL;
1054 _cleanup_free_ char *own_cgroup_path = NULL;
1055 const char *cgroup_root, *systemd_root, *systemd_own;
1056 int r;
1057
1058 controllers = set_new(&string_hash_ops);
1059 if (!controllers)
1060 return log_oom();
1061
1062 r = cg_kernel_controllers(controllers);
1063 if (r < 0)
1064 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1065
1066 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1067 if (r < 0)
1068 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1069
63c372cb 1070 cgroup_root = strjoina(dest, "/sys/fs/cgroup");
b12afc8c
LP
1071 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1072 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1073
1074 for (;;) {
1075 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1076
1077 controller = set_steal_first(controllers);
1078 if (!controller)
1079 break;
1080
1081 origin = strappend("/sys/fs/cgroup/", controller);
1082 if (!origin)
1083 return log_oom();
1084
1085 r = readlink_malloc(origin, &combined);
1086 if (r == -EINVAL) {
1087 /* Not a symbolic link, but directly a single cgroup hierarchy */
1088
1089 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1090 if (r < 0)
1091 return r;
1092
1093 } else if (r < 0)
1094 return log_error_errno(r, "Failed to read link %s: %m", origin);
1095 else {
1096 _cleanup_free_ char *target = NULL;
1097
1098 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1099 if (!target)
1100 return log_oom();
1101
1102 /* A symbolic link, a combination of controllers in one hierarchy */
1103
1104 if (!filename_is_valid(combined)) {
1105 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1106 continue;
1107 }
1108
1109 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1110 if (r < 0)
1111 return r;
1112
1113 if (symlink(combined, target) < 0)
83521414 1114 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
b12afc8c
LP
1115 }
1116 }
1117
c0534580 1118 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
b12afc8c
LP
1119 if (r < 0)
1120 return r;
1121
1122 /* Make our own cgroup a (writable) bind mount */
63c372cb 1123 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
b12afc8c
LP
1124 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1125 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1126
1127 /* And then remount the systemd cgroup root read-only */
63c372cb 1128 systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
b12afc8c
LP
1129 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1130 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1131
1132 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1133 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1134
1135 return 0;
1136}
1137
06c17c39
LP
1138static int mount_tmpfs(const char *dest) {
1139 char **i, **o;
1140
1141 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1142 _cleanup_free_ char *where = NULL;
79d80fc1 1143 int r;
06c17c39
LP
1144
1145 where = strappend(dest, *i);
1146 if (!where)
1147 return log_oom();
1148
79d80fc1 1149 r = mkdir_label(where, 0755);
04a91939
LP
1150 if (r < 0 && r != -EEXIST)
1151 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
06c17c39 1152
4a62c710
MS
1153 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1154 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
06c17c39
LP
1155 }
1156
1157 return 0;
1158}
1159
e58a1277 1160static int setup_timezone(const char *dest) {
d4036145
LP
1161 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1162 char *z, *y;
1163 int r;
f8440af5 1164
e58a1277
LP
1165 assert(dest);
1166
1167 /* Fix the timezone, if possible */
d4036145
LP
1168 r = readlink_malloc("/etc/localtime", &p);
1169 if (r < 0) {
1170 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1171 return 0;
1172 }
1173
1174 z = path_startswith(p, "../usr/share/zoneinfo/");
1175 if (!z)
1176 z = path_startswith(p, "/usr/share/zoneinfo/");
1177 if (!z) {
1178 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1179 return 0;
1180 }
1181
04bc4a3f
LP
1182 where = strappend(dest, "/etc/localtime");
1183 if (!where)
0d0f0c50 1184 return log_oom();
715ac17a 1185
d4036145
LP
1186 r = readlink_malloc(where, &q);
1187 if (r >= 0) {
1188 y = path_startswith(q, "../usr/share/zoneinfo/");
1189 if (!y)
1190 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1191
d4036145
LP
1192 /* Already pointing to the right place? Then do nothing .. */
1193 if (y && streq(y, z))
1194 return 0;
1195 }
1196
1197 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1198 if (!check)
0d0f0c50 1199 return log_oom();
4d1c38b8 1200
d4036145
LP
1201 if (access(check, F_OK) < 0) {
1202 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1203 return 0;
1204 }
68fb0892 1205
d4036145
LP
1206 what = strappend("../usr/share/zoneinfo/", z);
1207 if (!what)
1208 return log_oom();
1209
79d80fc1
TG
1210 r = mkdir_parents(where, 0755);
1211 if (r < 0) {
da927ba9 1212 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
79d80fc1
TG
1213
1214 return 0;
1215 }
1216
1217 r = unlink(where);
1218 if (r < 0 && errno != ENOENT) {
56f64d95 1219 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1220
1221 return 0;
1222 }
4d9f07b4 1223
d4036145 1224 if (symlink(what, where) < 0) {
56f64d95 1225 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1226 return 0;
1227 }
e58a1277
LP
1228
1229 return 0;
88213476
LP
1230}
1231
2547bb41 1232static int setup_resolv_conf(const char *dest) {
c8b32e11 1233 _cleanup_free_ char *where = NULL;
79d80fc1 1234 int r;
2547bb41
LP
1235
1236 assert(dest);
1237
1238 if (arg_private_network)
1239 return 0;
1240
1241 /* Fix resolv.conf, if possible */
04bc4a3f
LP
1242 where = strappend(dest, "/etc/resolv.conf");
1243 if (!where)
0d0f0c50 1244 return log_oom();
2547bb41 1245
77e63faf
LP
1246 /* We don't really care for the results of this really. If it
1247 * fails, it fails, but meh... */
79d80fc1
TG
1248 r = mkdir_parents(where, 0755);
1249 if (r < 0) {
da927ba9 1250 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
79d80fc1
TG
1251
1252 return 0;
1253 }
1254
f2068bcc 1255 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1256 if (r < 0) {
da927ba9 1257 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1258
1259 return 0;
1260 }
2547bb41
LP
1261
1262 return 0;
1263}
1264
4d9f07b4
LP
1265static int setup_volatile_state(const char *directory) {
1266 const char *p;
1267 int r;
1268
1269 assert(directory);
1270
1271 if (arg_volatile != VOLATILE_STATE)
1272 return 0;
1273
1274 /* --volatile=state means we simply overmount /var
1275 with a tmpfs, and the rest read-only. */
1276
1277 r = bind_remount_recursive(directory, true);
f647962d
MS
1278 if (r < 0)
1279 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4 1280
63c372cb 1281 p = strjoina(directory, "/var");
79d80fc1 1282 r = mkdir(p, 0755);
4a62c710
MS
1283 if (r < 0 && errno != EEXIST)
1284 return log_error_errno(errno, "Failed to create %s: %m", directory);
4d9f07b4 1285
4a62c710
MS
1286 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1287 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
1288
1289 return 0;
1290}
1291
1292static int setup_volatile(const char *directory) {
1293 bool tmpfs_mounted = false, bind_mounted = false;
1294 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1295 const char *f, *t;
1296 int r;
1297
1298 assert(directory);
1299
1300 if (arg_volatile != VOLATILE_YES)
1301 return 0;
1302
1303 /* --volatile=yes means we mount a tmpfs to the root dir, and
1304 the original /usr to use inside it, and that read-only. */
1305
4a62c710
MS
1306 if (!mkdtemp(template))
1307 return log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4
LP
1308
1309 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
56f64d95 1310 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
1311 r = -errno;
1312 goto fail;
1313 }
1314
1315 tmpfs_mounted = true;
1316
63c372cb
LP
1317 f = strjoina(directory, "/usr");
1318 t = strjoina(template, "/usr");
4d9f07b4 1319
79d80fc1
TG
1320 r = mkdir(t, 0755);
1321 if (r < 0 && errno != EEXIST) {
56f64d95 1322 log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
1323 r = -errno;
1324 goto fail;
1325 }
1326
4543768d 1327 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
56f64d95 1328 log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
1329 r = -errno;
1330 goto fail;
1331 }
1332
1333 bind_mounted = true;
1334
1335 r = bind_remount_recursive(t, true);
1336 if (r < 0) {
da927ba9 1337 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
1338 goto fail;
1339 }
1340
1341 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
56f64d95 1342 log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1343 r = -errno;
1344 goto fail;
1345 }
1346
1347 rmdir(template);
1348
1349 return 0;
1350
1351fail:
1352 if (bind_mounted)
1353 umount(t);
1354 if (tmpfs_mounted)
1355 umount(template);
1356 rmdir(template);
1357 return r;
1358}
1359
9f24adc2
LP
1360static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1361
1362 snprintf(s, 37,
1363 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1364 SD_ID128_FORMAT_VAL(id));
1365
1366 return s;
1367}
1368
04bc4a3f 1369static int setup_boot_id(const char *dest) {
7fd1b19b 1370 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 1371 sd_id128_t rnd = {};
04bc4a3f
LP
1372 char as_uuid[37];
1373 int r;
1374
1375 assert(dest);
1376
eb91eb18
LP
1377 if (arg_share_system)
1378 return 0;
1379
04bc4a3f
LP
1380 /* Generate a new randomized boot ID, so that each boot-up of
1381 * the container gets a new one */
1382
1383 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 1384 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
1385 if (!from || !to)
1386 return log_oom();
04bc4a3f
LP
1387
1388 r = sd_id128_randomize(&rnd);
f647962d
MS
1389 if (r < 0)
1390 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1391
9f24adc2 1392 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1393
574d5f2d 1394 r = write_string_file(from, as_uuid);
f647962d
MS
1395 if (r < 0)
1396 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1397
4543768d 1398 if (mount(from, to, NULL, MS_BIND, NULL) < 0) {
56f64d95 1399 log_error_errno(errno, "Failed to bind mount boot id: %m");
04bc4a3f 1400 r = -errno;
4543768d 1401 } else if (mount(from, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
56f64d95 1402 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1403
1404 unlink(from);
04bc4a3f
LP
1405 return r;
1406}
1407
e58a1277 1408static int copy_devnodes(const char *dest) {
88213476
LP
1409
1410 static const char devnodes[] =
1411 "null\0"
1412 "zero\0"
1413 "full\0"
1414 "random\0"
1415 "urandom\0"
85614d66
TG
1416 "tty\0"
1417 "net/tun\0";
88213476
LP
1418
1419 const char *d;
e58a1277 1420 int r = 0;
7fd1b19b 1421 _cleanup_umask_ mode_t u;
a258bf26
LP
1422
1423 assert(dest);
124640f1
LP
1424
1425 u = umask(0000);
88213476
LP
1426
1427 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1428 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1429 struct stat st;
88213476 1430
7f112f50
LP
1431 from = strappend("/dev/", d);
1432 to = strjoin(dest, "/dev/", d, NULL);
1433 if (!from || !to)
1434 return log_oom();
88213476
LP
1435
1436 if (stat(from, &st) < 0) {
1437
4a62c710
MS
1438 if (errno != ENOENT)
1439 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1440
a258bf26 1441 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1442
ed8b7a3e 1443 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 1444 return -EIO;
a258bf26 1445
85614d66
TG
1446 } else {
1447 r = mkdir_parents(to, 0775);
1448 if (r < 0) {
da927ba9 1449 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
85614d66
TG
1450 return -r;
1451 }
a258bf26 1452
81f5049b
AC
1453 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1454 if (errno != EPERM)
1455 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1456
1457 /* Some systems abusively restrict mknod but
1458 * allow bind mounts. */
1459 r = touch(to);
1460 if (r < 0)
1461 return log_error_errno(r, "touch (%s) failed: %m", to);
1462 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1463 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1464 }
6278cf60
LP
1465
1466 if (arg_userns && arg_uid_shift != UID_INVALID)
1467 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1468 return log_error_errno(errno, "chown() of device node %s failed: %m", to);
88213476 1469 }
88213476
LP
1470 }
1471
e58a1277
LP
1472 return r;
1473}
88213476 1474
f2d88580
LP
1475static int setup_ptmx(const char *dest) {
1476 _cleanup_free_ char *p = NULL;
1477
1478 p = strappend(dest, "/dev/ptmx");
1479 if (!p)
1480 return log_oom();
1481
4a62c710
MS
1482 if (symlink("pts/ptmx", p) < 0)
1483 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
f2d88580 1484
6278cf60
LP
1485 if (arg_userns && arg_uid_shift != UID_INVALID)
1486 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1487 return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
1488
f2d88580
LP
1489 return 0;
1490}
1491
e58a1277 1492static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1493 _cleanup_umask_ mode_t u;
1494 const char *to;
e58a1277 1495 int r;
e58a1277
LP
1496
1497 assert(dest);
1498 assert(console);
1499
1500 u = umask(0000);
1501
e58a1277 1502 r = chmod_and_chown(console, 0600, 0, 0);
f647962d
MS
1503 if (r < 0)
1504 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1505
a258bf26
LP
1506 /* We need to bind mount the right tty to /dev/console since
1507 * ptys can only exist on pts file systems. To have something
81f5049b 1508 * to bind mount things on we create a empty regular file. */
a258bf26 1509
63c372cb 1510 to = strjoina(dest, "/dev/console");
81f5049b
AC
1511 r = touch(to);
1512 if (r < 0)
1513 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1514
4543768d 1515 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1516 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1517
25ea79fe 1518 return 0;
e58a1277
LP
1519}
1520
1521static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1522 _cleanup_free_ char *from = NULL, *to = NULL;
7fd1b19b 1523 _cleanup_umask_ mode_t u;
6d0b55c2 1524 int r, fd, k;
e58a1277
LP
1525 union {
1526 struct cmsghdr cmsghdr;
1527 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1528 } control = {};
1529 struct msghdr mh = {
1530 .msg_control = &control,
1531 .msg_controllen = sizeof(control),
1532 };
e58a1277
LP
1533 struct cmsghdr *cmsg;
1534
1535 assert(dest);
1536 assert(kmsg_socket >= 0);
a258bf26 1537
e58a1277 1538 u = umask(0000);
a258bf26 1539
f1e5dfe2
LP
1540 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1541 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1542 * on the reading side behave very similar to /proc/kmsg,
1543 * their writing side behaves differently from /dev/kmsg in
1544 * that writing blocks when nothing is reading. In order to
1545 * avoid any problems with containers deadlocking due to this
1546 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1547 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1548 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1549 return log_oom();
e58a1277 1550
4a62c710
MS
1551 if (mkfifo(from, 0600) < 0)
1552 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
e58a1277
LP
1553
1554 r = chmod_and_chown(from, 0600, 0, 0);
f647962d
MS
1555 if (r < 0)
1556 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
e58a1277 1557
4543768d 1558 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1559 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1560
1561 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1562 if (fd < 0)
1563 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1564
e58a1277
LP
1565 cmsg = CMSG_FIRSTHDR(&mh);
1566 cmsg->cmsg_level = SOL_SOCKET;
1567 cmsg->cmsg_type = SCM_RIGHTS;
1568 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1569 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1570
1571 mh.msg_controllen = cmsg->cmsg_len;
1572
1573 /* Store away the fd in the socket, so that it stays open as
1574 * long as we run the child */
6d0b55c2 1575 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
03e334a1 1576 safe_close(fd);
e58a1277 1577
4a62c710
MS
1578 if (k < 0)
1579 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 1580
f1e5dfe2
LP
1581 /* And now make the FIFO unavailable as /dev/kmsg... */
1582 unlink(from);
25ea79fe 1583 return 0;
88213476
LP
1584}
1585
6d0b55c2
LP
1586static int send_rtnl(int send_fd) {
1587 union {
1588 struct cmsghdr cmsghdr;
1589 uint8_t buf[CMSG_SPACE(sizeof(int))];
1590 } control = {};
1591 struct msghdr mh = {
1592 .msg_control = &control,
1593 .msg_controllen = sizeof(control),
1594 };
1595 struct cmsghdr *cmsg;
1596 _cleanup_close_ int fd = -1;
1597 ssize_t k;
1598
1599 assert(send_fd >= 0);
1600
1601 if (!arg_expose_ports)
1602 return 0;
1603
1604 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1605 if (fd < 0)
1606 return log_error_errno(errno, "failed to allocate container netlink: %m");
1607
1608 cmsg = CMSG_FIRSTHDR(&mh);
1609 cmsg->cmsg_level = SOL_SOCKET;
1610 cmsg->cmsg_type = SCM_RIGHTS;
1611 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1612 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1613
1614 mh.msg_controllen = cmsg->cmsg_len;
1615
1616 /* Store away the fd in the socket, so that it stays open as
1617 * long as we run the child */
1618 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1619 if (k < 0)
1620 return log_error_errno(errno, "Failed to send netlink fd: %m");
1621
1622 return 0;
1623}
1624
1625static int flush_ports(union in_addr_union *exposed) {
1626 ExposePort *p;
1627 int r, af = AF_INET;
1628
1629 assert(exposed);
1630
1631 if (!arg_expose_ports)
1632 return 0;
1633
1634 if (in_addr_is_null(af, exposed))
1635 return 0;
1636
1637 log_debug("Lost IP address.");
1638
1639 LIST_FOREACH(ports, p, arg_expose_ports) {
1640 r = fw_add_local_dnat(false,
1641 af,
1642 p->protocol,
1643 NULL,
1644 NULL, 0,
1645 NULL, 0,
1646 p->host_port,
1647 exposed,
1648 p->container_port,
1649 NULL);
1650 if (r < 0)
1651 log_warning_errno(r, "Failed to modify firewall: %m");
1652 }
1653
1654 *exposed = IN_ADDR_NULL;
1655 return 0;
1656}
1657
1658static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1659 _cleanup_free_ struct local_address *addresses = NULL;
1660 _cleanup_free_ char *pretty = NULL;
1661 union in_addr_union new_exposed;
1662 ExposePort *p;
1663 bool add;
1664 int af = AF_INET, r;
1665
1666 assert(exposed);
1667
1668 /* Invoked each time an address is added or removed inside the
1669 * container */
1670
1671 if (!arg_expose_ports)
1672 return 0;
1673
1674 r = local_addresses(rtnl, 0, af, &addresses);
1675 if (r < 0)
1676 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1677
1678 add = r > 0 &&
1679 addresses[0].family == af &&
1680 addresses[0].scope < RT_SCOPE_LINK;
1681
1682 if (!add)
1683 return flush_ports(exposed);
1684
1685 new_exposed = addresses[0].address;
1686 if (in_addr_equal(af, exposed, &new_exposed))
1687 return 0;
1688
1689 in_addr_to_string(af, &new_exposed, &pretty);
1690 log_debug("New container IP is %s.", strna(pretty));
1691
1692 LIST_FOREACH(ports, p, arg_expose_ports) {
1693
1694 r = fw_add_local_dnat(true,
1695 af,
1696 p->protocol,
1697 NULL,
1698 NULL, 0,
1699 NULL, 0,
1700 p->host_port,
1701 &new_exposed,
1702 p->container_port,
1703 in_addr_is_null(af, exposed) ? NULL : exposed);
1704 if (r < 0)
1705 log_warning_errno(r, "Failed to modify firewall: %m");
1706 }
1707
1708 *exposed = new_exposed;
1709 return 0;
1710}
1711
1712static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1713 union in_addr_union *exposed = userdata;
1714
1715 assert(rtnl);
1716 assert(m);
1717 assert(exposed);
1718
1719 expose_ports(rtnl, exposed);
1720 return 0;
1721}
1722
1723static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1724 union {
1725 struct cmsghdr cmsghdr;
1726 uint8_t buf[CMSG_SPACE(sizeof(int))];
1727 } control = {};
1728 struct msghdr mh = {
1729 .msg_control = &control,
1730 .msg_controllen = sizeof(control),
1731 };
1732 struct cmsghdr *cmsg;
1733 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1734 int fd, r;
1735 ssize_t k;
1736
1737 assert(event);
1738 assert(recv_fd >= 0);
1739 assert(ret);
1740
1741 if (!arg_expose_ports)
1742 return 0;
1743
1744 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1745 if (k < 0)
1746 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1747
1748 cmsg = CMSG_FIRSTHDR(&mh);
1749 assert(cmsg->cmsg_level == SOL_SOCKET);
1750 assert(cmsg->cmsg_type == SCM_RIGHTS);
657bdca9 1751 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
6d0b55c2
LP
1752 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1753
1754 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1755 if (r < 0) {
1756 safe_close(fd);
1757 return log_error_errno(r, "Failed to create rtnl object: %m");
1758 }
1759
1760 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1761 if (r < 0)
1762 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1763
1764 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1765 if (r < 0)
1766 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1767
1768 r = sd_rtnl_attach_event(rtnl, event, 0);
1769 if (r < 0)
1770 return log_error_errno(r, "Failed to add to even loop: %m");
1771
1772 *ret = rtnl;
1773 rtnl = NULL;
1774
1775 return 0;
1776}
1777
3a74cea5 1778static int setup_hostname(void) {
3a74cea5 1779
eb91eb18
LP
1780 if (arg_share_system)
1781 return 0;
1782
605f81a8 1783 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1784 return -errno;
3a74cea5 1785
7027ff61 1786 return 0;
3a74cea5
LP
1787}
1788
57fb9fb5 1789static int setup_journal(const char *directory) {
4d680aee 1790 sd_id128_t machine_id, this_id;
7fd1b19b 1791 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1792 char *id;
57fb9fb5
LP
1793 int r;
1794
df9a75e4
LP
1795 /* Don't link journals in ephemeral mode */
1796 if (arg_ephemeral)
1797 return 0;
1798
57fb9fb5 1799 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1800 if (!p)
1801 return log_oom();
57fb9fb5
LP
1802
1803 r = read_one_line_file(p, &b);
27407a01
ZJS
1804 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1805 return 0;
f647962d
MS
1806 else if (r < 0)
1807 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
57fb9fb5 1808
27407a01
ZJS
1809 id = strstrip(b);
1810 if (isempty(id) && arg_link_journal == LINK_AUTO)
1811 return 0;
57fb9fb5 1812
27407a01
ZJS
1813 /* Verify validity */
1814 r = sd_id128_from_string(id, &machine_id);
f647962d
MS
1815 if (r < 0)
1816 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
57fb9fb5 1817
4d680aee 1818 r = sd_id128_get_machine(&this_id);
f647962d
MS
1819 if (r < 0)
1820 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1821
1822 if (sd_id128_equal(machine_id, this_id)) {
1823 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1824 "Host and machine ids are equal (%s): refusing to link journals", id);
1825 if (arg_link_journal == LINK_AUTO)
1826 return 0;
df9a75e4 1827 return -EEXIST;
4d680aee
ZJS
1828 }
1829
1830 if (arg_link_journal == LINK_NO)
1831 return 0;
1832
57fb9fb5 1833 free(p);
27407a01
ZJS
1834 p = strappend("/var/log/journal/", id);
1835 q = strjoin(directory, "/var/log/journal/", id, NULL);
1836 if (!p || !q)
1837 return log_oom();
1838
1839 if (path_is_mount_point(p, false) > 0) {
1840 if (arg_link_journal != LINK_AUTO) {
1841 log_error("%s: already a mount point, refusing to use for journal", p);
1842 return -EEXIST;
1843 }
1844
1845 return 0;
57fb9fb5
LP
1846 }
1847
27407a01 1848 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1849 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1850 log_error("%s: already a mount point, refusing to use for journal", q);
1851 return -EEXIST;
57fb9fb5
LP
1852 }
1853
27407a01 1854 return 0;
57fb9fb5
LP
1855 }
1856
1857 r = readlink_and_make_absolute(p, &d);
1858 if (r >= 0) {
1859 if ((arg_link_journal == LINK_GUEST ||
1860 arg_link_journal == LINK_AUTO) &&
1861 path_equal(d, q)) {
1862
27407a01
ZJS
1863 r = mkdir_p(q, 0755);
1864 if (r < 0)
56f64d95 1865 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1866 return 0;
57fb9fb5
LP
1867 }
1868
4a62c710
MS
1869 if (unlink(p) < 0)
1870 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1871 } else if (r == -EINVAL) {
1872
1873 if (arg_link_journal == LINK_GUEST &&
1874 rmdir(p) < 0) {
1875
27407a01
ZJS
1876 if (errno == ENOTDIR) {
1877 log_error("%s already exists and is neither a symlink nor a directory", p);
1878 return r;
1879 } else {
56f64d95 1880 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1881 return -errno;
57fb9fb5 1882 }
57fb9fb5
LP
1883 }
1884 } else if (r != -ENOENT) {
56f64d95 1885 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1886 return r;
57fb9fb5
LP
1887 }
1888
1889 if (arg_link_journal == LINK_GUEST) {
1890
1891 if (symlink(q, p) < 0) {
574edc90 1892 if (arg_link_journal_try) {
56f64d95 1893 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1894 return 0;
1895 } else {
56f64d95 1896 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1897 return -errno;
1898 }
57fb9fb5
LP
1899 }
1900
27407a01
ZJS
1901 r = mkdir_p(q, 0755);
1902 if (r < 0)
56f64d95 1903 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1904 return 0;
57fb9fb5
LP
1905 }
1906
1907 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1908 /* don't create parents here -- if the host doesn't have
1909 * permanent journal set up, don't force it here */
1910 r = mkdir(p, 0755);
57fb9fb5 1911 if (r < 0) {
574edc90 1912 if (arg_link_journal_try) {
56f64d95 1913 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1914 return 0;
1915 } else {
56f64d95 1916 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1917 return r;
1918 }
57fb9fb5
LP
1919 }
1920
27407a01
ZJS
1921 } else if (access(p, F_OK) < 0)
1922 return 0;
57fb9fb5 1923
cdb2b9d0
LP
1924 if (dir_is_empty(q) == 0)
1925 log_warning("%s is not empty, proceeding anyway.", q);
1926
57fb9fb5
LP
1927 r = mkdir_p(q, 0755);
1928 if (r < 0) {
56f64d95 1929 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1930 return r;
57fb9fb5
LP
1931 }
1932
4543768d 1933 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 1934 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1935
27407a01 1936 return 0;
57fb9fb5
LP
1937}
1938
88213476 1939static int drop_capabilities(void) {
5076f0cc 1940 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1941}
1942
5aa4bb6b 1943static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 1944 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
24996861 1945 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
1946 int r;
1947
eb91eb18
LP
1948 if (!arg_register)
1949 return 0;
1950
1c03020c 1951 r = sd_bus_default_system(&bus);
f647962d
MS
1952 if (r < 0)
1953 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 1954
89f7c846
LP
1955 if (arg_keep_unit) {
1956 r = sd_bus_call_method(
1957 bus,
1958 "org.freedesktop.machine1",
1959 "/org/freedesktop/machine1",
1960 "org.freedesktop.machine1.Manager",
5aa4bb6b 1961 "RegisterMachineWithNetwork",
89f7c846
LP
1962 &error,
1963 NULL,
5aa4bb6b 1964 "sayssusai",
89f7c846
LP
1965 arg_machine,
1966 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1967 "nspawn",
1968 "container",
1969 (uint32_t) pid,
5aa4bb6b
LP
1970 strempty(arg_directory),
1971 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 1972 } else {
9457ac5b 1973 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
f36933fe 1974 char **i;
9457ac5b
LP
1975
1976 r = sd_bus_message_new_method_call(
89f7c846 1977 bus,
9457ac5b 1978 &m,
89f7c846
LP
1979 "org.freedesktop.machine1",
1980 "/org/freedesktop/machine1",
1981 "org.freedesktop.machine1.Manager",
5aa4bb6b 1982 "CreateMachineWithNetwork");
f647962d 1983 if (r < 0)
f36933fe 1984 return bus_log_create_error(r);
9457ac5b
LP
1985
1986 r = sd_bus_message_append(
1987 m,
5aa4bb6b 1988 "sayssusai",
89f7c846
LP
1989 arg_machine,
1990 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1991 "nspawn",
1992 "container",
1993 (uint32_t) pid,
5aa4bb6b
LP
1994 strempty(arg_directory),
1995 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d 1996 if (r < 0)
f36933fe 1997 return bus_log_create_error(r);
9457ac5b
LP
1998
1999 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d 2000 if (r < 0)
f36933fe 2001 return bus_log_create_error(r);
9457ac5b
LP
2002
2003 if (!isempty(arg_slice)) {
2004 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d 2005 if (r < 0)
f36933fe 2006 return bus_log_create_error(r);
9457ac5b
LP
2007 }
2008
2009 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d 2010 if (r < 0)
f36933fe 2011 return bus_log_create_error(r);
9457ac5b 2012
63cc4c31 2013 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
2014 /* Allow the container to
2015 * access and create the API
2016 * device nodes, so that
2017 * PrivateDevices= in the
2018 * container can work
2019 * fine */
2020 "/dev/null", "rwm",
2021 "/dev/zero", "rwm",
2022 "/dev/full", "rwm",
2023 "/dev/random", "rwm",
2024 "/dev/urandom", "rwm",
2025 "/dev/tty", "rwm",
864e1706 2026 "/dev/net/tun", "rwm",
9457ac5b
LP
2027 /* Allow the container
2028 * access to ptys. However,
2029 * do not permit the
2030 * container to ever create
2031 * these device nodes. */
2032 "/dev/pts/ptmx", "rw",
63cc4c31 2033 "char-pts", "rw");
f647962d
MS
2034 if (r < 0)
2035 return log_error_errno(r, "Failed to add device whitelist: %m");
9457ac5b 2036
f36933fe
LP
2037 STRV_FOREACH(i, arg_property) {
2038 r = sd_bus_message_open_container(m, 'r', "sv");
2039 if (r < 0)
2040 return bus_log_create_error(r);
2041
2042 r = bus_append_unit_property_assignment(m, *i);
2043 if (r < 0)
2044 return r;
2045
2046 r = sd_bus_message_close_container(m);
2047 if (r < 0)
2048 return bus_log_create_error(r);
2049 }
2050
9457ac5b 2051 r = sd_bus_message_close_container(m);
f647962d 2052 if (r < 0)
f36933fe 2053 return bus_log_create_error(r);
9457ac5b
LP
2054
2055 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
2056 }
2057
9444b1f2 2058 if (r < 0) {
1f0cd86b
LP
2059 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2060 return r;
2061 }
2062
2063 return 0;
2064}
2065
2066static int terminate_machine(pid_t pid) {
2067 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2068 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
24996861 2069 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
2070 const char *path;
2071 int r;
2072
eb91eb18
LP
2073 if (!arg_register)
2074 return 0;
2075
76b54375 2076 r = sd_bus_default_system(&bus);
f647962d
MS
2077 if (r < 0)
2078 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
2079
2080 r = sd_bus_call_method(
2081 bus,
2082 "org.freedesktop.machine1",
2083 "/org/freedesktop/machine1",
2084 "org.freedesktop.machine1.Manager",
2085 "GetMachineByPID",
2086 &error,
2087 &reply,
2088 "u",
2089 (uint32_t) pid);
2090 if (r < 0) {
2091 /* Note that the machine might already have been
2092 * cleaned up automatically, hence don't consider it a
2093 * failure if we cannot get the machine object. */
2094 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2095 return 0;
2096 }
2097
2098 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
2099 if (r < 0)
2100 return bus_log_parse_error(r);
9444b1f2 2101
1f0cd86b
LP
2102 r = sd_bus_call_method(
2103 bus,
2104 "org.freedesktop.machine1",
2105 path,
2106 "org.freedesktop.machine1.Machine",
2107 "Terminate",
2108 &error,
2109 NULL,
2110 NULL);
2111 if (r < 0) {
2112 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2113 return 0;
2114 }
2115
9444b1f2
LP
2116 return 0;
2117}
2118
db999e0f
LP
2119static int reset_audit_loginuid(void) {
2120 _cleanup_free_ char *p = NULL;
2121 int r;
2122
2123 if (arg_share_system)
2124 return 0;
2125
2126 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2127 if (r == -ENOENT)
db999e0f 2128 return 0;
f647962d
MS
2129 if (r < 0)
2130 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2131
2132 /* Already reset? */
2133 if (streq(p, "4294967295"))
2134 return 0;
2135
2136 r = write_string_file("/proc/self/loginuid", "4294967295");
2137 if (r < 0) {
2138 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2139 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2140 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2141 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2142 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 2143
db999e0f 2144 sleep(5);
77b6e194 2145 }
db999e0f
LP
2146
2147 return 0;
77b6e194
LP
2148}
2149
4f758c23
LP
2150#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2151#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
e867ceb6 2152#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 2153
a90e2305 2154static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
2155 uint8_t result[8];
2156 size_t l, sz;
a90e2305
LP
2157 uint8_t *v, *i;
2158 int r;
01dde061
TG
2159
2160 l = strlen(arg_machine);
2161 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
2162 if (idx > 0)
2163 sz += sizeof(idx);
a90e2305 2164
01dde061
TG
2165 v = alloca(sz);
2166
2167 /* fetch some persistent data unique to the host */
2168 r = sd_id128_get_machine((sd_id128_t*) v);
2169 if (r < 0)
2170 return r;
2171
2172 /* combine with some data unique (on this host) to this
2173 * container instance */
a90e2305
LP
2174 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2175 if (idx > 0) {
2176 idx = htole64(idx);
2177 memcpy(i, &idx, sizeof(idx));
2178 }
01dde061
TG
2179
2180 /* Let's hash the host machine ID plus the container name. We
2181 * use a fixed, but originally randomly created hash key here. */
4f758c23 2182 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
2183
2184 assert_cc(ETH_ALEN <= sizeof(result));
2185 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2186
2187 /* see eth_random_addr in the kernel */
2188 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2189 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2190
2191 return 0;
2192}
2193
5aa4bb6b 2194static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
69c79d3c 2195 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 2196 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4f758c23 2197 struct ether_addr mac_host, mac_container;
5aa4bb6b 2198 int r, i;
69c79d3c
LP
2199
2200 if (!arg_private_network)
2201 return 0;
2202
2203 if (!arg_network_veth)
2204 return 0;
2205
08af0da2
LP
2206 /* Use two different interface name prefixes depending whether
2207 * we are in bridge mode or not. */
c00524c9 2208 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 2209 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 2210
e867ceb6
LP
2211 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2212 if (r < 0)
2213 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 2214
e867ceb6
LP
2215 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2216 if (r < 0)
2217 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 2218
151b9b96 2219 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2220 if (r < 0)
2221 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 2222
151b9b96 2223 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2224 if (r < 0)
2225 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 2226
ab046dde 2227 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
2228 if (r < 0)
2229 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 2230
4f758c23 2231 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
2232 if (r < 0)
2233 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 2234
ee3a6a51 2235 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2236 if (r < 0)
2237 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2238
d8e538ec 2239 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
2240 if (r < 0)
2241 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2242
ee3a6a51 2243 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
2244 if (r < 0)
2245 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2246
ab046dde 2247 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
2248 if (r < 0)
2249 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 2250
4f758c23 2251 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
2252 if (r < 0)
2253 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 2254
ab046dde 2255 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2256 if (r < 0)
2257 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c
LP
2258
2259 r = sd_rtnl_message_close_container(m);
f647962d
MS
2260 if (r < 0)
2261 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2262
2263 r = sd_rtnl_message_close_container(m);
f647962d
MS
2264 if (r < 0)
2265 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2266
2267 r = sd_rtnl_message_close_container(m);
f647962d
MS
2268 if (r < 0)
2269 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2270
2271 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2272 if (r < 0)
2273 return log_error_errno(r, "Failed to add new veth interfaces: %m");
69c79d3c 2274
5aa4bb6b 2275 i = (int) if_nametoindex(iface_name);
4a62c710
MS
2276 if (i <= 0)
2277 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
2278
2279 *ifi = i;
2280
69c79d3c
LP
2281 return 0;
2282}
2283
5aa4bb6b 2284static int setup_bridge(const char veth_name[], int *ifi) {
ab046dde
TG
2285 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2286 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2287 int r, bridge;
2288
2289 if (!arg_private_network)
2290 return 0;
2291
2292 if (!arg_network_veth)
2293 return 0;
2294
2295 if (!arg_network_bridge)
2296 return 0;
2297
2298 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
2299 if (bridge <= 0)
2300 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 2301
5aa4bb6b
LP
2302 *ifi = bridge;
2303
151b9b96 2304 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2305 if (r < 0)
2306 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 2307
151b9b96 2308 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
2309 if (r < 0)
2310 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 2311
039dd4af 2312 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
2313 if (r < 0)
2314 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 2315
ab046dde 2316 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
2317 if (r < 0)
2318 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde
TG
2319
2320 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
2321 if (r < 0)
2322 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde
TG
2323
2324 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2325 if (r < 0)
2326 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
2327
2328 return 0;
2329}
2330
c74e630d
LP
2331static int parse_interface(struct udev *udev, const char *name) {
2332 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2333 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2334 int ifi;
2335
2336 ifi = (int) if_nametoindex(name);
4a62c710
MS
2337 if (ifi <= 0)
2338 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
2339
2340 sprintf(ifi_str, "n%i", ifi);
2341 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
2342 if (!d)
2343 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
2344
2345 if (udev_device_get_is_initialized(d) <= 0) {
2346 log_error("Network interface %s is not initialized yet.", name);
2347 return -EBUSY;
2348 }
2349
2350 return ifi;
2351}
2352
69c79d3c 2353static int move_network_interfaces(pid_t pid) {
7e227024 2354 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 2355 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
2356 char **i;
2357 int r;
2358
2359 if (!arg_private_network)
2360 return 0;
2361
2362 if (strv_isempty(arg_network_interfaces))
2363 return 0;
2364
151b9b96 2365 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2366 if (r < 0)
2367 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 2368
7e227024
LP
2369 udev = udev_new();
2370 if (!udev) {
2371 log_error("Failed to connect to udev.");
2372 return -ENOMEM;
2373 }
2374
aa28aefe 2375 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 2376 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 2377 int ifi;
aa28aefe 2378
c74e630d
LP
2379 ifi = parse_interface(udev, *i);
2380 if (ifi < 0)
2381 return ifi;
2382
3125b3ef 2383 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
2384 if (r < 0)
2385 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2386
c74e630d 2387 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2388 if (r < 0)
2389 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 2390
c74e630d 2391 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2392 if (r < 0)
2393 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 2394 }
7e227024 2395
c74e630d
LP
2396 return 0;
2397}
2398
2399static int setup_macvlan(pid_t pid) {
2400 _cleanup_udev_unref_ struct udev *udev = NULL;
2401 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
e867ceb6 2402 unsigned idx = 0;
c74e630d
LP
2403 char **i;
2404 int r;
2405
2406 if (!arg_private_network)
2407 return 0;
2408
2409 if (strv_isempty(arg_network_macvlan))
2410 return 0;
2411
2412 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2413 if (r < 0)
2414 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
2415
2416 udev = udev_new();
2417 if (!udev) {
2418 log_error("Failed to connect to udev.");
2419 return -ENOMEM;
2420 }
2421
2422 STRV_FOREACH(i, arg_network_macvlan) {
2423 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2424 _cleanup_free_ char *n = NULL;
e867ceb6 2425 struct ether_addr mac;
c74e630d
LP
2426 int ifi;
2427
2428 ifi = parse_interface(udev, *i);
2429 if (ifi < 0)
2430 return ifi;
2431
e867ceb6
LP
2432 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2433 if (r < 0)
2434 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2435
c74e630d 2436 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2437 if (r < 0)
2438 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2439
c74e630d 2440 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
2441 if (r < 0)
2442 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
2443
2444 n = strappend("mv-", *i);
2445 if (!n)
2446 return log_oom();
2447
2448 strshorten(n, IFNAMSIZ-1);
2449
2450 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
2451 if (r < 0)
2452 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 2453
e867ceb6
LP
2454 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2455 if (r < 0)
2456 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2457
aa28aefe 2458 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2459 if (r < 0)
2460 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d
LP
2461
2462 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2463 if (r < 0)
2464 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2465
d8e538ec 2466 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
2467 if (r < 0)
2468 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d
LP
2469
2470 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
2471 if (r < 0)
2472 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d
LP
2473
2474 r = sd_rtnl_message_close_container(m);
f647962d
MS
2475 if (r < 0)
2476 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d
LP
2477
2478 r = sd_rtnl_message_close_container(m);
f647962d
MS
2479 if (r < 0)
2480 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe
LP
2481
2482 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2483 if (r < 0)
2484 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
2485 }
2486
2487 return 0;
2488}
2489
4bbfe7ad
TG
2490static int setup_ipvlan(pid_t pid) {
2491 _cleanup_udev_unref_ struct udev *udev = NULL;
2492 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2493 char **i;
2494 int r;
2495
2496 if (!arg_private_network)
2497 return 0;
2498
2499 if (strv_isempty(arg_network_ipvlan))
2500 return 0;
2501
2502 r = sd_rtnl_open(&rtnl, 0);
2503 if (r < 0)
2504 return log_error_errno(r, "Failed to connect to netlink: %m");
2505
2506 udev = udev_new();
2507 if (!udev) {
2508 log_error("Failed to connect to udev.");
2509 return -ENOMEM;
2510 }
2511
2512 STRV_FOREACH(i, arg_network_ipvlan) {
2513 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2514 _cleanup_free_ char *n = NULL;
2515 int ifi;
2516
2517 ifi = parse_interface(udev, *i);
2518 if (ifi < 0)
2519 return ifi;
2520
2521 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2522 if (r < 0)
2523 return log_error_errno(r, "Failed to allocate netlink message: %m");
2524
2525 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2526 if (r < 0)
2527 return log_error_errno(r, "Failed to add netlink interface index: %m");
2528
2529 n = strappend("iv-", *i);
2530 if (!n)
2531 return log_oom();
2532
2533 strshorten(n, IFNAMSIZ-1);
2534
2535 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2536 if (r < 0)
2537 return log_error_errno(r, "Failed to add netlink interface name: %m");
2538
2539 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2540 if (r < 0)
2541 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2542
2543 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2544 if (r < 0)
2545 return log_error_errno(r, "Failed to open netlink container: %m");
2546
2547 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2548 if (r < 0)
2549 return log_error_errno(r, "Failed to open netlink container: %m");
2550
2551 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2552 if (r < 0)
2553 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2554
2555 r = sd_rtnl_message_close_container(m);
2556 if (r < 0)
2557 return log_error_errno(r, "Failed to close netlink container: %m");
2558
2559 r = sd_rtnl_message_close_container(m);
2560 if (r < 0)
2561 return log_error_errno(r, "Failed to close netlink container: %m");
2562
2563 r = sd_rtnl_call(rtnl, m, 0, NULL);
2564 if (r < 0)
2565 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2566 }
2567
2568 return 0;
2569}
2570
/* Installs the seccomp filter for the container (only when built with
 * HAVE_SECCOMP; otherwise this is a no-op returning 0).
 *
 * The filter allows everything by default and then
 *   - makes each syscall in the table below fail with EPERM, unless the
 *     capability listed next to it is part of arg_retain, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns the result of the last libseccomp call: negative on failure
 * (after logging), otherwise the return value of seccomp_load(). */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                /* A retained capability keeps its syscalls available. */
                if (arg_retain & (1ULL << blacklist[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */
        r = seccomp_rule_add(ctx,
                             SCMP_ACT_ERRNO(EAFNOSUPPORT),
                             SCMP_SYS(socket),
                             2,
                             SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                             SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Turn the filter's NO_NEW_PRIVS control attribute off. */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2657
785890ac
LP
2658static int setup_propagate(const char *root) {
2659 const char *p, *q;
2660
2661 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2662 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2663 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2664 (void) mkdir_p(p, 0600);
2665
63c372cb 2666 q = strjoina(root, "/run/systemd/nspawn/incoming");
785890ac
LP
2667 mkdir_parents(q, 0755);
2668 mkdir_p(q, 0600);
2669
2670 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2671 return log_error_errno(errno, "Failed to install propagation bind mount.");
2672
2673 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2674 return log_error_errno(errno, "Failed to make propagation mount read-only");
2675
2676 return 0;
2677}
2678
1b9e5b12
LP
2679static int setup_image(char **device_path, int *loop_nr) {
2680 struct loop_info64 info = {
2681 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2682 };
2683 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2684 _cleanup_free_ char* loopdev = NULL;
2685 struct stat st;
2686 int r, nr;
2687
2688 assert(device_path);
2689 assert(loop_nr);
ec16945e 2690 assert(arg_image);
1b9e5b12
LP
2691
2692 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2693 if (fd < 0)
2694 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 2695
4a62c710
MS
2696 if (fstat(fd, &st) < 0)
2697 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
2698
2699 if (S_ISBLK(st.st_mode)) {
2700 char *p;
2701
2702 p = strdup(arg_image);
2703 if (!p)
2704 return log_oom();
2705
2706 *device_path = p;
2707
2708 *loop_nr = -1;
2709
2710 r = fd;
2711 fd = -1;
2712
2713 return r;
2714 }
2715
2716 if (!S_ISREG(st.st_mode)) {
56f64d95 2717 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
2718 return -EINVAL;
2719 }
2720
2721 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
2722 if (control < 0)
2723 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
2724
2725 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
2726 if (nr < 0)
2727 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
2728
2729 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2730 return log_oom();
2731
2732 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2733 if (loop < 0)
2734 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 2735
4a62c710
MS
2736 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2737 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
2738
2739 if (arg_read_only)
2740 info.lo_flags |= LO_FLAGS_READ_ONLY;
2741
4a62c710
MS
2742 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2743 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
2744
2745 *device_path = loopdev;
2746 loopdev = NULL;
2747
2748 *loop_nr = nr;
2749
2750 r = loop;
2751 loop = -1;
2752
2753 return r;
2754}
2755
ada4799a
LP
2756#define PARTITION_TABLE_BLURB \
2757 "Note that the disk image needs to either contain only a single MBR partition of\n" \
4aab5d0c 2758 "type 0x83 that is marked bootable, or a single GPT partition of type " \
f6c51a81 2759 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
ada4799a
LP
2760 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
2761 "to be bootable with systemd-nspawn."
2762
1b9e5b12
LP
2763static int dissect_image(
2764 int fd,
727fd4fd
LP
2765 char **root_device, bool *root_device_rw,
2766 char **home_device, bool *home_device_rw,
2767 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
2768 bool *secondary) {
2769
2770#ifdef HAVE_BLKID
01dc33ce
ZJS
2771 int home_nr = -1, srv_nr = -1;
2772#ifdef GPT_ROOT_NATIVE
2773 int root_nr = -1;
2774#endif
2775#ifdef GPT_ROOT_SECONDARY
2776 int secondary_root_nr = -1;
2777#endif
f6c51a81 2778 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1b9e5b12
LP
2779 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2780 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2781 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2782 _cleanup_udev_unref_ struct udev *udev = NULL;
2783 struct udev_list_entry *first, *item;
f6c51a81 2784 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 2785 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
2786 const char *pttype = NULL;
2787 blkid_partlist pl;
2788 struct stat st;
c09ef2e4 2789 unsigned i;
1b9e5b12
LP
2790 int r;
2791
2792 assert(fd >= 0);
2793 assert(root_device);
2794 assert(home_device);
2795 assert(srv_device);
2796 assert(secondary);
ec16945e 2797 assert(arg_image);
1b9e5b12
LP
2798
2799 b = blkid_new_probe();
2800 if (!b)
2801 return log_oom();
2802
2803 errno = 0;
2804 r = blkid_probe_set_device(b, fd, 0, 0);
2805 if (r != 0) {
2806 if (errno == 0)
2807 return log_oom();
2808
56f64d95 2809 log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
2810 return -errno;
2811 }
2812
2813 blkid_probe_enable_partitions(b, 1);
2814 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2815
2816 errno = 0;
2817 r = blkid_do_safeprobe(b);
2818 if (r == -2 || r == 1) {
ada4799a
LP
2819 log_error("Failed to identify any partition table on\n"
2820 " %s\n"
2821 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
2822 return -EINVAL;
2823 } else if (r != 0) {
2824 if (errno == 0)
2825 errno = EIO;
56f64d95 2826 log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
2827 return -errno;
2828 }
2829
48861960 2830 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
2831
2832 is_gpt = streq_ptr(pttype, "gpt");
2833 is_mbr = streq_ptr(pttype, "dos");
2834
2835 if (!is_gpt && !is_mbr) {
2836 log_error("No GPT or MBR partition table discovered on\n"
2837 " %s\n"
2838 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
2839 return -EINVAL;
2840 }
2841
2842 errno = 0;
2843 pl = blkid_probe_get_partitions(b);
2844 if (!pl) {
2845 if (errno == 0)
2846 return log_oom();
2847
2848 log_error("Failed to list partitions of %s", arg_image);
2849 return -errno;
2850 }
2851
2852 udev = udev_new();
2853 if (!udev)
2854 return log_oom();
2855
4a62c710
MS
2856 if (fstat(fd, &st) < 0)
2857 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 2858
c09ef2e4
LP
2859 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2860 if (!d)
1b9e5b12
LP
2861 return log_oom();
2862
c09ef2e4
LP
2863 for (i = 0;; i++) {
2864 int n, m;
1b9e5b12 2865
c09ef2e4
LP
2866 if (i >= 10) {
2867 log_error("Kernel partitions never appeared.");
2868 return -ENXIO;
2869 }
2870
2871 e = udev_enumerate_new(udev);
2872 if (!e)
2873 return log_oom();
2874
2875 r = udev_enumerate_add_match_parent(e, d);
2876 if (r < 0)
2877 return log_oom();
2878
2879 r = udev_enumerate_scan_devices(e);
2880 if (r < 0)
2881 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2882
2883 /* Count the partitions enumerated by the kernel */
2884 n = 0;
2885 first = udev_enumerate_get_list_entry(e);
2886 udev_list_entry_foreach(item, first)
2887 n++;
2888
2889 /* Count the partitions enumerated by blkid */
2890 m = blkid_partlist_numof_partitions(pl);
2891 if (n == m + 1)
2892 break;
2893 if (n > m + 1) {
2894 log_error("blkid and kernel partition list do not match.");
2895 return -EIO;
2896 }
2897 if (n < m + 1) {
2898 unsigned j;
2899
2900 /* The kernel has probed fewer partitions than
2901 * blkid? Maybe the kernel prober is still
2902 * running or it got EBUSY because udev
2903 * already opened the device. Let's reprobe
2904 * the device, which is a synchronous call
2905 * that waits until probing is complete. */
2906
2907 for (j = 0; j < 20; j++) {
2908
2909 r = ioctl(fd, BLKRRPART, 0);
2910 if (r < 0)
2911 r = -errno;
2912 if (r >= 0 || r != -EBUSY)
2913 break;
2914
2915 /* If something else has the device
2916 * open, such as an udev rule, the
2917 * ioctl will return EBUSY. Since
2918 * there's no way to wait until it
2919 * isn't busy anymore, let's just wait
2920 * a bit, and try again.
2921 *
2922 * This is really something they
2923 * should fix in the kernel! */
2924
2925 usleep(50 * USEC_PER_MSEC);
2926 }
2927
2928 if (r < 0)
2929 return log_error_errno(r, "Failed to reread partition table: %m");
2930 }
2931
2932 e = udev_enumerate_unref(e);
2933 }
1b9e5b12
LP
2934
2935 first = udev_enumerate_get_list_entry(e);
2936 udev_list_entry_foreach(item, first) {
2937 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 2938 const char *node;
727fd4fd 2939 unsigned long long flags;
1b9e5b12
LP
2940 blkid_partition pp;
2941 dev_t qn;
2942 int nr;
2943
2944 errno = 0;
2945 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2946 if (!q) {
2947 if (!errno)
2948 errno = ENOMEM;
2949
56f64d95 2950 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
2951 return -errno;
2952 }
2953
2954 qn = udev_device_get_devnum(q);
2955 if (major(qn) == 0)
2956 continue;
2957
2958 if (st.st_rdev == qn)
2959 continue;
2960
2961 node = udev_device_get_devnode(q);
2962 if (!node)
2963 continue;
2964
2965 pp = blkid_partlist_devno_to_partition(pl, qn);
2966 if (!pp)
2967 continue;
2968
727fd4fd 2969 flags = blkid_partition_get_flags(pp);
727fd4fd 2970
1b9e5b12
LP
2971 nr = blkid_partition_get_partno(pp);
2972 if (nr < 0)
2973 continue;
2974
ada4799a
LP
2975 if (is_gpt) {
2976 sd_id128_t type_id;
2977 const char *stype;
1b9e5b12 2978
f6c51a81
LP
2979 if (flags & GPT_FLAG_NO_AUTO)
2980 continue;
2981
ada4799a
LP
2982 stype = blkid_partition_get_type_string(pp);
2983 if (!stype)
2984 continue;
1b9e5b12 2985
ada4799a 2986 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
2987 continue;
2988
ada4799a 2989 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 2990
ada4799a
LP
2991 if (home && nr >= home_nr)
2992 continue;
1b9e5b12 2993
ada4799a
LP
2994 home_nr = nr;
2995 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 2996
ada4799a
LP
2997 r = free_and_strdup(&home, node);
2998 if (r < 0)
2999 return log_oom();
727fd4fd 3000
ada4799a
LP
3001 } else if (sd_id128_equal(type_id, GPT_SRV)) {
3002
3003 if (srv && nr >= srv_nr)
3004 continue;
3005
3006 srv_nr = nr;
3007 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3008
3009 r = free_and_strdup(&srv, node);
3010 if (r < 0)
3011 return log_oom();
3012 }
1b9e5b12 3013#ifdef GPT_ROOT_NATIVE
ada4799a 3014 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 3015
ada4799a
LP
3016 if (root && nr >= root_nr)
3017 continue;
1b9e5b12 3018
ada4799a
LP
3019 root_nr = nr;
3020 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 3021
ada4799a
LP
3022 r = free_and_strdup(&root, node);
3023 if (r < 0)
3024 return log_oom();
3025 }
1b9e5b12
LP
3026#endif
3027#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
3028 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3029
3030 if (secondary_root && nr >= secondary_root_nr)
3031 continue;
3032
3033 secondary_root_nr = nr;
3034 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3035
3036 r = free_and_strdup(&secondary_root, node);
3037 if (r < 0)
3038 return log_oom();
3039 }
3040#endif
f6c51a81
LP
3041 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3042
3043 if (generic)
3044 multiple_generic = true;
3045 else {
3046 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3047
3048 r = free_and_strdup(&generic, node);
3049 if (r < 0)
3050 return log_oom();
3051 }
3052 }
ada4799a
LP
3053
3054 } else if (is_mbr) {
3055 int type;
1b9e5b12 3056
f6c51a81
LP
3057 if (flags != 0x80) /* Bootable flag */
3058 continue;
3059
ada4799a
LP
3060 type = blkid_partition_get_type(pp);
3061 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
3062 continue;
3063
f6c51a81
LP
3064 if (generic)
3065 multiple_generic = true;
3066 else {
3067 generic_rw = true;
727fd4fd 3068
f6c51a81
LP
3069 r = free_and_strdup(&root, node);
3070 if (r < 0)
3071 return log_oom();
3072 }
1b9e5b12 3073 }
1b9e5b12
LP
3074 }
3075
1b9e5b12
LP
3076 if (root) {
3077 *root_device = root;
3078 root = NULL;
727fd4fd
LP
3079
3080 *root_device_rw = root_rw;
1b9e5b12
LP
3081 *secondary = false;
3082 } else if (secondary_root) {
3083 *root_device = secondary_root;
3084 secondary_root = NULL;
727fd4fd
LP
3085
3086 *root_device_rw = secondary_root_rw;
1b9e5b12 3087 *secondary = true;
f6c51a81
LP
3088 } else if (generic) {
3089
3090 /* There were no partitions with precise meanings
3091 * around, but we found generic partitions. In this
3092 * case, if there's only one, we can go ahead and boot
3093 * it, otherwise we bail out, because we really cannot
3094 * make any sense of it. */
3095
3096 if (multiple_generic) {
3097 log_error("Identified multiple bootable Linux partitions on\n"
3098 " %s\n"
3099 PARTITION_TABLE_BLURB, arg_image);
3100 return -EINVAL;
3101 }
3102
3103 *root_device = generic;
3104 generic = NULL;
3105
3106 *root_device_rw = generic_rw;
3107 *secondary = false;
3108 } else {
3109 log_error("Failed to identify root partition in disk image\n"
3110 " %s\n"
3111 PARTITION_TABLE_BLURB, arg_image);
3112 return -EINVAL;
1b9e5b12
LP
3113 }
3114
3115 if (home) {
3116 *home_device = home;
3117 home = NULL;
727fd4fd
LP
3118
3119 *home_device_rw = home_rw;
1b9e5b12
LP
3120 }
3121
3122 if (srv) {
3123 *srv_device = srv;
3124 srv = NULL;
727fd4fd
LP
3125
3126 *srv_device_rw = srv_rw;
1b9e5b12
LP
3127 }
3128
3129 return 0;
3130#else
3131 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 3132 return -EOPNOTSUPP;
1b9e5b12
LP
3133#endif
3134}
3135
/* Probes the file system type of the block device 'what' with libblkid and
 * mounts it on 'where', or on 'where' with 'directory' appended if
 * 'directory' is non-NULL. The mount is made read-only if 'rw' is false or
 * if the global arg_read_only is set. LUKS volumes are refused.
 *
 * Returns 0 on success, a negative errno-style value on failure. When built
 * without blkid support this always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally read-only container overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1: nothing detected, -2: ambivalent result. In both
                 * cases the probe ran, but gave us no usable type. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* -1 (or anything else unexpected): the probe itself failed */
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Pick the return value before logging, so that it cannot
                 * be affected by whatever the logging call does to errno */
                r = errno != 0 ? -errno : -EINVAL;

                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3199
727fd4fd
LP
3200static int mount_devices(
3201 const char *where,
3202 const char *root_device, bool root_device_rw,
3203 const char *home_device, bool home_device_rw,
3204 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
3205 int r;
3206
3207 assert(where);
3208
3209 if (root_device) {
727fd4fd 3210 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
3211 if (r < 0)
3212 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
3213 }
3214
3215 if (home_device) {
727fd4fd 3216 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
3217 if (r < 0)
3218 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
3219 }
3220
3221 if (srv_device) {
727fd4fd 3222 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
3223 if (r < 0)
3224 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
3225 }
3226
3227 return 0;
3228}
3229
3230static void loop_remove(int nr, int *image_fd) {
3231 _cleanup_close_ int control = -1;
e8c8ddcc 3232 int r;
1b9e5b12
LP
3233
3234 if (nr < 0)
3235 return;
3236
3237 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
3238 r = ioctl(*image_fd, LOOP_CLR_FD);
3239 if (r < 0)
5e4074aa 3240 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 3241 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
3242 }
3243
3244 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 3245 if (control < 0) {
56f64d95 3246 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 3247 return;
e8c8ddcc 3248 }
1b9e5b12 3249
e8c8ddcc
TG
3250 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3251 if (r < 0)
5e4074aa 3252 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
3253}
3254
0cb9fbcd
LP
3255static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3256 int pipe_fds[2];
3257 pid_t pid;
3258
3259 assert(database);
3260 assert(key);
3261 assert(rpid);
3262
4a62c710
MS
3263 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3264 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
3265
3266 pid = fork();
4a62c710
MS
3267 if (pid < 0)
3268 return log_error_errno(errno, "Failed to fork getent child: %m");
3269 else if (pid == 0) {
0cb9fbcd
LP
3270 int nullfd;
3271 char *empty_env = NULL;
3272
3273 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3274 _exit(EXIT_FAILURE);
3275
3276 if (pipe_fds[0] > 2)
03e334a1 3277 safe_close(pipe_fds[0]);
0cb9fbcd 3278 if (pipe_fds[1] > 2)
03e334a1 3279 safe_close(pipe_fds[1]);
0cb9fbcd
LP
3280
3281 nullfd = open("/dev/null", O_RDWR);
3282 if (nullfd < 0)
3283 _exit(EXIT_FAILURE);
3284
3285 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3286 _exit(EXIT_FAILURE);
3287
3288 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3289 _exit(EXIT_FAILURE);
3290
3291 if (nullfd > 2)
03e334a1 3292 safe_close(nullfd);
0cb9fbcd
LP
3293
3294 reset_all_signal_handlers();
3295 close_all_fds(NULL, 0);
3296
4de82926
MM
3297 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3298 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
3299 _exit(EXIT_FAILURE);
3300 }
3301
03e334a1 3302 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
3303
3304 *rpid = pid;
3305
3306 return pipe_fds[0];
3307}
3308
3309static int change_uid_gid(char **_home) {
a2a5291b
ZJS
3310 char line[LINE_MAX], *x, *u, *g, *h;
3311 const char *word, *state;
0cb9fbcd
LP
3312 _cleanup_free_ uid_t *uids = NULL;
3313 _cleanup_free_ char *home = NULL;
3314 _cleanup_fclose_ FILE *f = NULL;
3315 _cleanup_close_ int fd = -1;
3316 unsigned n_uids = 0;
70f539ca 3317 size_t sz = 0, l;
0cb9fbcd
LP
3318 uid_t uid;
3319 gid_t gid;
3320 pid_t pid;
3321 int r;
3322
3323 assert(_home);
3324
3325 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3326 /* Reset everything fully to 0, just in case */
3327
4a62c710
MS
3328 if (setgroups(0, NULL) < 0)
3329 return log_error_errno(errno, "setgroups() failed: %m");
0cb9fbcd 3330
4a62c710
MS
3331 if (setresgid(0, 0, 0) < 0)
3332 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3333
4a62c710
MS
3334 if (setresuid(0, 0, 0) < 0)
3335 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3336
3337 *_home = NULL;
3338 return 0;
3339 }
3340
3341 /* First, get user credentials */
3342 fd = spawn_getent("passwd", arg_user, &pid);
3343 if (fd < 0)
3344 return fd;
3345
3346 f = fdopen(fd, "r");
3347 if (!f)
3348 return log_oom();
3349 fd = -1;
3350
3351 if (!fgets(line, sizeof(line), f)) {
3352
3353 if (!ferror(f)) {
3354 log_error("Failed to resolve user %s.", arg_user);
3355 return -ESRCH;
3356 }
3357
56f64d95 3358 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3359 return -errno;
3360 }
3361
3362 truncate_nl(line);
3363
820d3acf 3364 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
3365
3366 x = strchr(line, ':');
3367 if (!x) {
3368 log_error("/etc/passwd entry has invalid user field.");
3369 return -EIO;
3370 }
3371
3372 u = strchr(x+1, ':');
3373 if (!u) {
3374 log_error("/etc/passwd entry has invalid password field.");
3375 return -EIO;
3376 }
3377
3378 u++;
3379 g = strchr(u, ':');
3380 if (!g) {
3381 log_error("/etc/passwd entry has invalid UID field.");
3382 return -EIO;
3383 }
3384
3385 *g = 0;
3386 g++;
3387 x = strchr(g, ':');
3388 if (!x) {
3389 log_error("/etc/passwd entry has invalid GID field.");
3390 return -EIO;
3391 }
3392
3393 *x = 0;
3394 h = strchr(x+1, ':');
3395 if (!h) {
3396 log_error("/etc/passwd entry has invalid GECOS field.");
3397 return -EIO;
3398 }
3399
3400 h++;
3401 x = strchr(h, ':');
3402 if (!x) {
3403 log_error("/etc/passwd entry has invalid home directory field.");
3404 return -EIO;
3405 }
3406
3407 *x = 0;
3408
3409 r = parse_uid(u, &uid);
3410 if (r < 0) {
3411 log_error("Failed to parse UID of user.");
3412 return -EIO;
3413 }
3414
3415 r = parse_gid(g, &gid);
3416 if (r < 0) {
3417 log_error("Failed to parse GID of user.");
3418 return -EIO;
3419 }
3420
3421 home = strdup(h);
3422 if (!home)
3423 return log_oom();
3424
3425 /* Second, get group memberships */
3426 fd = spawn_getent("initgroups", arg_user, &pid);
3427 if (fd < 0)
3428 return fd;
3429
3430 fclose(f);
3431 f = fdopen(fd, "r");
3432 if (!f)
3433 return log_oom();
3434 fd = -1;
3435
3436 if (!fgets(line, sizeof(line), f)) {
3437 if (!ferror(f)) {
3438 log_error("Failed to resolve user %s.", arg_user);
3439 return -ESRCH;
3440 }
3441
56f64d95 3442 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3443 return -errno;
3444 }
3445
3446 truncate_nl(line);
3447
820d3acf 3448 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
3449
3450 /* Skip over the username and subsequent separator whitespace */
3451 x = line;
3452 x += strcspn(x, WHITESPACE);
3453 x += strspn(x, WHITESPACE);
3454
a2a5291b 3455 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
3456 char c[l+1];
3457
a2a5291b 3458 memcpy(c, word, l);
0cb9fbcd
LP
3459 c[l] = 0;
3460
3461 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3462 return log_oom();
3463
3464 r = parse_uid(c, &uids[n_uids++]);
3465 if (r < 0) {
3466 log_error("Failed to parse group data from getent.");
3467 return -EIO;
3468 }
3469 }
3470
3471 r = mkdir_parents(home, 0775);
f647962d
MS
3472 if (r < 0)
3473 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
3474
3475 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
3476 if (r < 0 && r != -EEXIST)
3477 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd
LP
3478
3479 fchown(STDIN_FILENO, uid, gid);
3480 fchown(STDOUT_FILENO, uid, gid);
3481 fchown(STDERR_FILENO, uid, gid);
3482
4a62c710
MS
3483 if (setgroups(n_uids, uids) < 0)
3484 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 3485
4a62c710
MS
3486 if (setresgid(gid, gid, gid) < 0)
3487 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3488
4a62c710
MS
3489 if (setresuid(uid, uid, uid) < 0)
3490 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3491
3492 if (_home) {
3493 *_home = home;
3494 home = NULL;
3495 }
3496
3497 return 0;
3498}
3499
113cea80 3500/*
6d416b9c
LS
3501 * Return values:
3502 * < 0 : wait_for_terminate() failed to get the state of the
3503 * container, the container was terminated by a signal, or
3504 * failed for an unknown reason. No change is made to the
3505 * container argument.
3506 * > 0 : The program executed in the container terminated with an
3507 * error. The exit code of the program executed in the
919699ec
LP
3508 * container is returned. The container argument has been set
3509 * to CONTAINER_TERMINATED.
6d416b9c
LS
3510 * 0 : The container is being rebooted, has been shut down or exited
3511 * successfully. The container argument has been set to either
3512 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 3513 *
6d416b9c
LS
3514 * That is, success is indicated by a return value of zero, and an
3515 * error is indicated by a non-zero value.
113cea80
DH
3516 */
3517static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 3518 siginfo_t status;
919699ec 3519 int r;
113cea80
DH
3520
3521 r = wait_for_terminate(pid, &status);
f647962d
MS
3522 if (r < 0)
3523 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
3524
3525 switch (status.si_code) {
fddbb89c 3526
113cea80 3527 case CLD_EXITED:
919699ec
LP
3528 if (status.si_status == 0) {
3529 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 3530
fddbb89c 3531 } else
919699ec 3532 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 3533
919699ec
LP
3534 *container = CONTAINER_TERMINATED;
3535 return status.si_status;
113cea80
DH
3536
3537 case CLD_KILLED:
3538 if (status.si_status == SIGINT) {
113cea80 3539
919699ec 3540 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 3541 *container = CONTAINER_TERMINATED;
919699ec
LP
3542 return 0;
3543
113cea80 3544 } else if (status.si_status == SIGHUP) {
113cea80 3545
919699ec 3546 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 3547 *container = CONTAINER_REBOOTED;
919699ec 3548 return 0;
113cea80 3549 }
919699ec 3550
113cea80
DH
3551 /* CLD_KILLED fallthrough */
3552
3553 case CLD_DUMPED:
fddbb89c 3554 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 3555 return -EIO;
113cea80
DH
3556
3557 default:
fddbb89c 3558 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 3559 return -EIO;
113cea80
DH
3560 }
3561
3562 return r;
3563}
3564
e866af3a
DH
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
3566
023fb90b
LP
3567static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3568 pid_t pid;
3569
3570 pid = PTR_TO_UINT32(userdata);
3571 if (pid > 0) {
c6c8f6e2 3572 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
3573 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3574 sd_event_source_set_userdata(s, NULL);
3575 return 0;
3576 }
3577 }
3578
3579 sd_event_exit(sd_event_source_get_event(s), 0);
3580 return 0;
3581}
3582
ec16945e 3583static int determine_names(void) {
1b9cebf6 3584 int r;
ec16945e
LP
3585
3586 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3587 if (arg_machine) {
3588 _cleanup_(image_unrefp) Image *i = NULL;
3589
3590 r = image_find(arg_machine, &i);
3591 if (r < 0)
3592 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3593 else if (r == 0) {
3594 log_error("No image for machine '%s': %m", arg_machine);
3595 return -ENOENT;
3596 }
3597
aceac2f0 3598 if (i->type == IMAGE_RAW)
1b9cebf6
LP
3599 r = set_sanitized_path(&arg_image, i->path);
3600 else
3601 r = set_sanitized_path(&arg_directory, i->path);
3602 if (r < 0)
3603 return log_error_errno(r, "Invalid image directory: %m");
3604
3605 arg_read_only = arg_read_only || i->read_only;
3606 } else
ec16945e
LP
3607 arg_directory = get_current_dir_name();
3608
1b9cebf6
LP
3609 if (!arg_directory && !arg_machine) {
3610 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
3611 return -EINVAL;
3612 }
3613 }
3614
3615 if (!arg_machine) {
b9ba4dab
LP
3616 if (arg_directory && path_equal(arg_directory, "/"))
3617 arg_machine = gethostname_malloc();
3618 else
3619 arg_machine = strdup(basename(arg_image ?: arg_directory));
3620
ec16945e
LP
3621 if (!arg_machine)
3622 return log_oom();
3623
3624 hostname_cleanup(arg_machine, false);
3625 if (!machine_name_is_valid(arg_machine)) {
3626 log_error("Failed to determine machine name automatically, please use -M.");
3627 return -EINVAL;
3628 }
b9ba4dab
LP
3629
3630 if (arg_ephemeral) {
3631 char *b;
3632
3633 /* Add a random suffix when this is an
3634 * ephemeral machine, so that we can run many
3635 * instances at once without manually having
3636 * to specify -M each time. */
3637
3638 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3639 return log_oom();
3640
3641 free(arg_machine);
3642 arg_machine = b;
3643 }
ec16945e
LP
3644 }
3645
3646 return 0;
3647}
3648
6dac160c
LP
3649static int determine_uid_shift(void) {
3650 int r;
3651
3652 if (!arg_userns)
3653 return 0;
3654
3655 if (arg_uid_shift == UID_INVALID) {
3656 struct stat st;
3657
3658 r = stat(arg_directory, &st);
3659 if (r < 0)
3660 return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
3661
3662 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3663
3664 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3665 log_error("UID and GID base of %s don't match.", arg_directory);
3666 return -EINVAL;
3667 }
3668
3669 arg_uid_range = UINT32_C(0x10000);
3670 }
3671
3672 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3673 log_error("UID base too high for UID range.");
3674 return -EINVAL;
3675 }
3676
3677 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3678 return 0;
3679}
3680
88213476 3681int main(int argc, char *argv[]) {
69c79d3c 3682
611b312b 3683 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
727fd4fd 3684 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
63cc4c31 3685 _cleanup_close_ int master = -1, image_fd = -1;
69c79d3c 3686 _cleanup_fdset_free_ FDSet *fds = NULL;
ec16945e 3687 int r, n_fd_passed, loop_nr = -1;
1b9e5b12 3688 char veth_name[IFNAMSIZ];
ec16945e 3689 bool secondary = false, remove_subvol = false;
e866af3a 3690 sigset_t mask, mask_chld;
69c79d3c 3691 pid_t pid = 0;
ec16945e 3692 int ret = EXIT_SUCCESS;
6d0b55c2 3693 union in_addr_union exposed = {};
30535c16 3694 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
9c857b9d 3695 bool interactive;
88213476
LP
3696
3697 log_parse_environment();
3698 log_open();
3699
ec16945e
LP
3700 r = parse_argv(argc, argv);
3701 if (r <= 0)
88213476 3702 goto finish;
88213476 3703
ec16945e
LP
3704 r = determine_names();
3705 if (r < 0)
3706 goto finish;
7027ff61 3707
88213476
LP
3708 if (geteuid() != 0) {
3709 log_error("Need to be root.");
ec16945e 3710 r = -EPERM;
88213476
LP
3711 goto finish;
3712 }
3713
1b9e5b12
LP
3714 log_close();
3715 n_fd_passed = sd_listen_fds(false);
3716 if (n_fd_passed > 0) {
ec16945e
LP
3717 r = fdset_new_listen_fds(&fds, false);
3718 if (r < 0) {
3719 log_error_errno(r, "Failed to collect file descriptors: %m");
1b9e5b12
LP
3720 goto finish;
3721 }
88213476 3722 }
1b9e5b12
LP
3723 fdset_close_others(fds);
3724 log_open();
88213476 3725
1b9e5b12 3726 if (arg_directory) {
ec16945e
LP
3727 assert(!arg_image);
3728
c4e34a61
LP
3729 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3730 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
ec16945e 3731 r = -EINVAL;
6b9132a9
LP
3732 goto finish;
3733 }
1b9e5b12 3734
30535c16 3735 if (arg_ephemeral) {
8a16a7b4 3736 _cleanup_free_ char *np = NULL;
ec16945e 3737
c4e34a61
LP
3738 /* If the specified path is a mount point we
3739 * generate the new snapshot immediately
3740 * inside it under a random name. However if
3741 * the specified is not a mount point we
3742 * create the new snapshot in the parent
3743 * directory, just next to it. */
3744 r = path_is_mount_point(arg_directory, false);
3745 if (r < 0) {
3746 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3747 goto finish;
3748 }
3749 if (r > 0)
3750 r = tempfn_random_child(arg_directory, &np);
3751 else
3752 r = tempfn_random(arg_directory, &np);
ec16945e
LP
3753 if (r < 0) {
3754 log_error_errno(r, "Failed to generate name for snapshot: %m");
3755 goto finish;
3756 }
3757
30535c16
LP
3758 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3759 if (r < 0) {
3760 log_error_errno(r, "Failed to lock %s: %m", np);
3761 goto finish;
3762 }
3763
ec16945e
LP
3764 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3765 if (r < 0) {
ec16945e
LP
3766 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3767 goto finish;
3768 }
3769
3770 free(arg_directory);
3771 arg_directory = np;
8a16a7b4 3772 np = NULL;
ec16945e
LP
3773
3774 remove_subvol = true;
30535c16
LP
3775
3776 } else {
3777 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3778 if (r == -EBUSY) {
3779 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3780 goto finish;
3781 }
3782 if (r < 0) {
3783 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3784 return r;
3785 }
3786
3787 if (arg_template) {
3788 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3789 if (r == -EEXIST) {
3790 if (!arg_quiet)
3791 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3792 } else if (r < 0) {
83521414 3793 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3794 goto finish;
3795 } else {
3796 if (!arg_quiet)
3797 log_info("Populated %s from template %s.", arg_directory, arg_template);
3798 }
3799 }
ec16945e
LP
3800 }
3801
1b9e5b12
LP
3802 if (arg_boot) {
3803 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3804 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3805 r = -EINVAL;
1b9e5b12
LP
3806 goto finish;
3807 }
3808 } else {
3809 const char *p;
3810
63c372cb 3811 p = strjoina(arg_directory,
1b9e5b12
LP
3812 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3813 if (access(p, F_OK) < 0) {
3814 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
ec16945e 3815 r = -EINVAL;
1b9e5b12 3816 goto finish;
1b9e5b12
LP
3817 }
3818 }
ec16945e 3819
6b9132a9 3820 } else {
1b9e5b12 3821 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3822
ec16945e
LP
3823 assert(arg_image);
3824 assert(!arg_template);
3825
30535c16
LP
3826 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3827 if (r == -EBUSY) {
3828 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3829 goto finish;
3830 }
3831 if (r < 0) {
3832 r = log_error_errno(r, "Failed to create image lock: %m");
3833 goto finish;
3834 }
3835
1b9e5b12 3836 if (!mkdtemp(template)) {
56f64d95 3837 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3838 r = -errno;
6b9132a9 3839 goto finish;
1b9e5b12 3840 }
6b9132a9 3841
1b9e5b12
LP
3842 arg_directory = strdup(template);
3843 if (!arg_directory) {
3844 r = log_oom();
3845 goto finish;
6b9132a9 3846 }
88213476 3847
1b9e5b12
LP
3848 image_fd = setup_image(&device_path, &loop_nr);
3849 if (image_fd < 0) {
3850 r = image_fd;
842f3b0f
LP
3851 goto finish;
3852 }
1b9e5b12 3853
4d9f07b4
LP
3854 r = dissect_image(image_fd,
3855 &root_device, &root_device_rw,
3856 &home_device, &home_device_rw,
3857 &srv_device, &srv_device_rw,
3858 &secondary);
1b9e5b12
LP
3859 if (r < 0)
3860 goto finish;
842f3b0f 3861 }
842f3b0f 3862
6dac160c
LP
3863 r = determine_uid_shift();
3864 if (r < 0)
3865 goto finish;
3866
9c857b9d
LP
3867 interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
3868
db7feb7e
LP
3869 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3870 if (master < 0) {
ec16945e 3871 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3872 goto finish;
3873 }
3874
611b312b
LP
3875 r = ptsname_malloc(master, &console);
3876 if (r < 0) {
3877 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3878 goto finish;
3879 }
3880
a258bf26 3881 if (unlockpt(master) < 0) {
ec16945e 3882 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3883 goto finish;
3884 }
3885
9c857b9d
LP
3886 if (!arg_quiet)
3887 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3888 arg_machine, arg_image ?: arg_directory);
3889
a258bf26
LP
3890 assert_se(sigemptyset(&mask) == 0);
3891 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3892 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3893
023fb90b
LP
3894 assert_se(sigemptyset(&mask_chld) == 0);
3895 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3896
d87be9b0 3897 for (;;) {
6d0b55c2 3898 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
113cea80 3899 ContainerStatus container_status;
7566e267 3900 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
e866af3a
DH
3901 struct sigaction sa = {
3902 .sa_handler = nop_handler,
3903 .sa_flags = SA_NOCLDSTOP,
3904 };
3905
7566e267 3906 r = barrier_create(&barrier);
a2da110b 3907 if (r < 0) {
da927ba9 3908 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3909 goto finish;
3910 }
3911
6d0b55c2
LP
3912 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3913 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3914 goto finish;
3915 }
3916
3917 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3918 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3919 goto finish;
3920 }
3921
e866af3a
DH
3922 /* Child can be killed before execv(), so handle SIGCHLD
3923 * in order to interrupt parent's blocking calls and
3924 * give it a chance to call wait() and terminate. */
3925 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3926 if (r < 0) {
ec16945e 3927 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3928 goto finish;
3929 }
3930
e866af3a
DH
3931 r = sigaction(SIGCHLD, &sa, NULL);
3932 if (r < 0) {
ec16945e 3933 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3934 goto finish;
3935 }
3936
60e1651a
KW
3937 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3938 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3939 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
3940 if (pid < 0) {
3941 if (errno == EINVAL)
ec16945e 3942 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3943 else
ec16945e 3944 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3945
d87be9b0
LP
3946 goto finish;
3947 }
a258bf26 3948
d87be9b0
LP
3949 if (pid == 0) {
3950 /* child */
0cb9fbcd 3951 _cleanup_free_ char *home = NULL;
5674767e 3952 unsigned n_env = 2;
d87be9b0 3953 const char *envp[] = {
e10a55fd 3954 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
3955 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3956 NULL, /* TERM */
3957 NULL, /* HOME */
3958 NULL, /* USER */
3959 NULL, /* LOGNAME */
3960 NULL, /* container_uuid */
842f3b0f
LP
3961 NULL, /* LISTEN_FDS */
3962 NULL, /* LISTEN_PID */
d87be9b0
LP
3963 NULL
3964 };
f4889f65 3965 char **env_use;
a258bf26 3966
a2da110b
DH
3967 barrier_set_role(&barrier, BARRIER_CHILD);
3968
5674767e
ZJS
3969 envp[n_env] = strv_find_prefix(environ, "TERM=");
3970 if (envp[n_env])
3971 n_env ++;
a258bf26 3972
03e334a1 3973 master = safe_close(master);
a258bf26 3974
03e334a1 3975 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3976 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
a258bf26 3977
d87be9b0 3978 reset_all_signal_handlers();
1b6d7fa7 3979 reset_signal_mask();
f5c1b9ee 3980
9c857b9d
LP
3981 if (interactive) {
3982 close_nointr(STDIN_FILENO);
3983 close_nointr(STDOUT_FILENO);
3984 close_nointr(STDERR_FILENO);
842f3b0f 3985
9c857b9d
LP
3986 r = open_terminal(console, O_RDWR);
3987 if (r != STDIN_FILENO) {
3988 if (r >= 0) {
3989 safe_close(r);
3990 r = -EINVAL;
3991 }
842f3b0f 3992
9c857b9d
LP
3993 log_error_errno(r, "Failed to open console: %m");
3994 _exit(EXIT_FAILURE);
3995 }
3996
3997 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3998 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3999 log_error_errno(errno, "Failed to duplicate console: %m");
4000 _exit(EXIT_FAILURE);
4001 }
842f3b0f 4002 }
bc2f673e 4003
d87be9b0 4004 if (setsid() < 0) {
56f64d95 4005 log_error_errno(errno, "setsid() failed: %m");
a2da110b 4006 _exit(EXIT_FAILURE);
bc2f673e
LP
4007 }
4008
db999e0f 4009 if (reset_audit_loginuid() < 0)
a2da110b 4010 _exit(EXIT_FAILURE);
db999e0f 4011
d87be9b0 4012 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
56f64d95 4013 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
a2da110b 4014 _exit(EXIT_FAILURE);
d87be9b0 4015 }
e58a1277 4016
6dac160c
LP
4017 if (arg_private_network)
4018 loopback_setup();
4019
d87be9b0
LP
4020 /* Mark everything as slave, so that we still
4021 * receive mounts from the real root, but don't
4022 * propagate mounts to the real root. */
4023 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
56f64d95 4024 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
a2da110b 4025 _exit(EXIT_FAILURE);
d87be9b0 4026 }
04bc4a3f 4027
727fd4fd
LP
4028 if (mount_devices(arg_directory,
4029 root_device, root_device_rw,
4030 home_device, home_device_rw,
4031 srv_device, srv_device_rw) < 0)
a2da110b 4032 _exit(EXIT_FAILURE);
1b9e5b12 4033
d87be9b0 4034 /* Turn directory into bind mount */
4543768d 4035 if (mount(arg_directory, arg_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
56f64d95 4036 log_error_errno(errno, "Failed to make bind mount: %m");
a2da110b 4037 _exit(EXIT_FAILURE);
d87be9b0 4038 }
88213476 4039
4d9f07b4
LP
4040 r = setup_volatile(arg_directory);
4041 if (r < 0)
a2da110b 4042 _exit(EXIT_FAILURE);
4d9f07b4
LP
4043
4044 if (setup_volatile_state(arg_directory) < 0)
a2da110b 4045 _exit(EXIT_FAILURE);
4d9f07b4
LP
4046
4047 r = base_filesystem_create(arg_directory);
4048 if (r < 0)
a2da110b 4049 _exit(EXIT_FAILURE);
4d9f07b4 4050
d6797c92 4051 if (arg_read_only) {
ec16945e
LP
4052 r = bind_remount_recursive(arg_directory, true);
4053 if (r < 0) {
4054 log_error_errno(r, "Failed to make tree read-only: %m");
a2da110b 4055 _exit(EXIT_FAILURE);
d87be9b0 4056 }
d6797c92 4057 }
2547bb41 4058
d87be9b0 4059 if (mount_all(arg_directory) < 0)
a2da110b 4060 _exit(EXIT_FAILURE);
57fb9fb5 4061
d87be9b0 4062 if (copy_devnodes(arg_directory) < 0)
a2da110b 4063 _exit(EXIT_FAILURE);
a258bf26 4064
f2d88580 4065 if (setup_ptmx(arg_directory) < 0)
a2da110b 4066 _exit(EXIT_FAILURE);
f2d88580 4067
d87be9b0 4068 dev_setup(arg_directory);
88213476 4069
785890ac
LP
4070 if (setup_propagate(arg_directory) < 0)
4071 _exit(EXIT_FAILURE);
4072
28650077 4073 if (setup_seccomp() < 0)
a2da110b 4074 _exit(EXIT_FAILURE);
24fb1112 4075
d87be9b0 4076 if (setup_dev_console(arg_directory, console) < 0)
a2da110b 4077 _exit(EXIT_FAILURE);
88213476 4078
d87be9b0 4079 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
a2da110b 4080 _exit(EXIT_FAILURE);
03e334a1 4081 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 4082
6d0b55c2
LP
4083 if (send_rtnl(rtnl_socket_pair[1]) < 0)
4084 _exit(EXIT_FAILURE);
4085 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4086
b12afc8c
LP
4087 /* Tell the parent that we are ready, and that
4088 * it can cgroupify us so that we lack access
4089 * to certain devices and resources. */
6dac160c 4090 (void) barrier_place(&barrier); /* #1 */
b12afc8c 4091
d87be9b0 4092 if (setup_boot_id(arg_directory) < 0)
a2da110b 4093 _exit(EXIT_FAILURE);
a41fe3a2 4094
d87be9b0 4095 if (setup_timezone(arg_directory) < 0)
a2da110b 4096 _exit(EXIT_FAILURE);
88213476 4097
d87be9b0 4098 if (setup_resolv_conf(arg_directory) < 0)
a2da110b 4099 _exit(EXIT_FAILURE);
687d0825 4100
d87be9b0 4101 if (setup_journal(arg_directory) < 0)
a2da110b 4102 _exit(EXIT_FAILURE);
687d0825 4103
d6797c92 4104 if (mount_binds(arg_directory, arg_bind, false) < 0)
a2da110b 4105 _exit(EXIT_FAILURE);
17fe0523 4106
d6797c92 4107 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
a2da110b 4108 _exit(EXIT_FAILURE);
17fe0523 4109
06c17c39 4110 if (mount_tmpfs(arg_directory) < 0)
a2da110b 4111 _exit(EXIT_FAILURE);
06c17c39 4112
b12afc8c
LP
4113 /* Wait until we are cgroup-ified, so that we
4114 * can mount the right cgroup path writable */
6dac160c 4115 (void) barrier_place_and_sync(&barrier); /* #2 */
b12afc8c
LP
4116
4117 if (mount_cgroup(arg_directory) < 0)
4118 _exit(EXIT_FAILURE);
d96c1ecf 4119
d87be9b0 4120 if (chdir(arg_directory) < 0) {
56f64d95 4121 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
a2da110b 4122 _exit(EXIT_FAILURE);
687d0825
MV
4123 }
4124
d87be9b0 4125 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
56f64d95 4126 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
a2da110b 4127 _exit(EXIT_FAILURE);
687d0825
MV
4128 }
4129
d87be9b0 4130 if (chroot(".") < 0) {
56f64d95 4131 log_error_errno(errno, "chroot() failed: %m");
a2da110b 4132 _exit(EXIT_FAILURE);
687d0825
MV
4133 }
4134
d87be9b0 4135 if (chdir("/") < 0) {
56f64d95 4136 log_error_errno(errno, "chdir() failed: %m");
a2da110b 4137 _exit(EXIT_FAILURE);
687d0825
MV
4138 }
4139
6dac160c
LP
4140 if (arg_userns) {
4141 if (unshare(CLONE_NEWUSER) < 0) {
4142 log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4143 _exit(EXIT_FAILURE);
4144 }
d87be9b0 4145
6dac160c
LP
4146 /* Tell the parent, that it now can
4147 * write the UID map. */
4148 (void) barrier_place(&barrier); /* #3 */
4149
4150 /* Wait until the parent wrote the UID
4151 * map */
4152 (void) barrier_place_and_sync(&barrier); /* #4 */
4153 }
4154
4155 umask(0022);
d87be9b0
LP
4156
4157 if (drop_capabilities() < 0) {
56f64d95 4158 log_error_errno(errno, "drop_capabilities() failed: %m");
a2da110b 4159 _exit(EXIT_FAILURE);
687d0825 4160 }
687d0825 4161
6dac160c
LP
4162 setup_hostname();
4163
4164 if (arg_personality != 0xffffffffLU) {
4165 if (personality(arg_personality) < 0) {
4166 log_error_errno(errno, "personality() failed: %m");
4167 _exit(EXIT_FAILURE);
4168 }
4169 } else if (secondary) {
4170 if (personality(PER_LINUX32) < 0) {
4171 log_error_errno(errno, "personality() failed: %m");
4172 _exit(EXIT_FAILURE);
4173 }
4174 }
4175
4176#ifdef HAVE_SELINUX
4177 if (arg_selinux_context)
4178 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4179 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4180 _exit(EXIT_FAILURE);
4181 }
4182#endif
4183
0cb9fbcd
LP
4184 r = change_uid_gid(&home);
4185 if (r < 0)
a2da110b 4186 _exit(EXIT_FAILURE);
d87be9b0 4187
842f3b0f
LP
4188 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4189 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4190 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 4191 log_oom();
a2da110b 4192 _exit(EXIT_FAILURE);
144f0fc0 4193 }
687d0825 4194
9444b1f2 4195 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
4196 char as_uuid[37];
4197
4198 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f 4199 log_oom();
a2da110b 4200 _exit(EXIT_FAILURE);
842f3b0f
LP
4201 }
4202 }
4203
4204 if (fdset_size(fds) > 0) {
ec16945e
LP
4205 r = fdset_cloexec(fds, false);
4206 if (r < 0) {
4207 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
a2da110b 4208 _exit(EXIT_FAILURE);
842f3b0f
LP
4209 }
4210
4211 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 4212 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0 4213 log_oom();
a2da110b 4214 _exit(EXIT_FAILURE);
d87be9b0
LP
4215 }
4216 }
4217
f4889f65
LP
4218 if (!strv_isempty(arg_setenv)) {
4219 char **n;
4220
4221 n = strv_env_merge(2, envp, arg_setenv);
4222 if (!n) {
4223 log_oom();
a2da110b 4224 _exit(EXIT_FAILURE);
f4889f65
LP
4225 }
4226
4227 env_use = n;
4228 } else
4229 env_use = (char**) envp;
4230
6dac160c
LP
4231 /* Let the parent know that we are ready and
4232 * wait until the parent is ready with the
4233 * setup, too... */
4234 (void) barrier_place_and_sync(&barrier); /* #5 */
d96c1ecf 4235
d87be9b0
LP
4236 if (arg_boot) {
4237 char **a;
4238 size_t l;
88213476 4239
d87be9b0 4240 /* Automatically search for the init system */
0f0dbc46 4241
d87be9b0
LP
4242 l = 1 + argc - optind;
4243 a = newa(char*, l + 1);
4244 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 4245
d87be9b0 4246 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 4247 execve(a[0], a, env_use);
0f0dbc46 4248
d87be9b0 4249 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 4250 execve(a[0], a, env_use);
0f0dbc46 4251
d87be9b0 4252 a[0] = (char*) "/sbin/init";
f4889f65 4253 execve(a[0], a, env_use);
d87be9b0 4254 } else if (argc > optind)
f4889f65 4255 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
4256 else {
4257 chdir(home ? home : "/root");
f4889f65 4258 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 4259 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
4260 }
4261
56f64d95 4262 log_error_errno(errno, "execv() failed: %m");
d87be9b0 4263 _exit(EXIT_FAILURE);
da5b3bad 4264 }
88213476 4265
a2da110b 4266 barrier_set_role(&barrier, BARRIER_PARENT);
842f3b0f
LP
4267 fdset_free(fds);
4268 fds = NULL;
4269
6d0b55c2
LP
4270 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4271 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4272
6dac160c
LP
4273 (void) barrier_place(&barrier); /* #1 */
4274
b12afc8c
LP
4275 /* Wait for the most basic Child-setup to be done,
4276 * before we add hardware to it, and place it in a
4277 * cgroup. */
6dac160c 4278 if (barrier_sync(&barrier)) { /* #1 */
5aa4bb6b 4279 int ifi = 0;
354bfd2b 4280
840295fc
LP
4281 r = move_network_interfaces(pid);
4282 if (r < 0)
4283 goto finish;
aa28aefe 4284
5aa4bb6b 4285 r = setup_veth(pid, veth_name, &ifi);
840295fc
LP
4286 if (r < 0)
4287 goto finish;
ab046dde 4288
5aa4bb6b 4289 r = setup_bridge(veth_name, &ifi);
840295fc
LP
4290 if (r < 0)
4291 goto finish;
ab046dde 4292
840295fc
LP
4293 r = setup_macvlan(pid);
4294 if (r < 0)
4295 goto finish;
c74e630d 4296
4bbfe7ad
TG
4297 r = setup_ipvlan(pid);
4298 if (r < 0)
4299 goto finish;
4300
5aa4bb6b
LP
4301 r = register_machine(pid, ifi);
4302 if (r < 0)
4303 goto finish;
4304
6dac160c
LP
4305 /* Notify the child that the parent is ready with all
4306 * its setup, and that the child can now hand over
4307 * control to the code to run inside the container. */
4308 (void) barrier_place(&barrier); /* #2 */
4309
4310 if (arg_userns) {
4311 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4312
4313 (void) barrier_place_and_sync(&barrier); /* #3 */
4314
4315 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4316 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4317 r = write_string_file(uid_map, line);
4318 if (r < 0) {
4319 log_error_errno(r, "Failed to write UID map: %m");
4320 goto finish;
4321 }
4322
4323 /* We always assign the same UID and GID ranges */
4324 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4325 r = write_string_file(uid_map, line);
4326 if (r < 0) {
4327 log_error_errno(r, "Failed to write GID map: %m");
4328 goto finish;
4329 }
4330
4331 (void) barrier_place(&barrier); /* #4 */
4332 }
4333
840295fc
LP
4334 /* Block SIGCHLD here, before notifying child.
4335 * process_pty() will handle it with the other signals. */
4336 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4337 if (r < 0)
4338 goto finish;
e866af3a 4339
840295fc
LP
4340 /* Reset signal to default */
4341 r = default_signals(SIGCHLD, -1);
4342 if (r < 0)
4343 goto finish;
e866af3a 4344
6dac160c
LP
4345 /* Let the child know that we are ready and wait until the child is completely ready. */
4346 if (barrier_place_and_sync(&barrier)) { /* #5 */
6d0b55c2
LP
4347 _cleanup_event_unref_ sd_event *event = NULL;
4348 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4349 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4350 char last_char = 0;
b12afc8c 4351
733d15ac
LP
4352 sd_notifyf(false,
4353 "READY=1\n"
4354 "STATUS=Container running.\n"
4355 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 4356
6d0b55c2
LP
4357 r = sd_event_new(&event);
4358 if (r < 0) {
4359 log_error_errno(r, "Failed to get default event source: %m");
4360 goto finish;
4361 }
88213476 4362
c6c8f6e2 4363 if (arg_kill_signal > 0) {
6d0b55c2
LP
4364 /* Try to kill the init system on SIGINT or SIGTERM */
4365 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4366 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4367 } else {
4368 /* Immediately exit */
4369 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4370 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4371 }
023fb90b 4372
6d0b55c2
LP
4373 /* simply exit on sigchld */
4374 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 4375
6d0b55c2
LP
4376 if (arg_expose_ports) {
4377 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4378 if (r < 0)
4379 goto finish;
023fb90b 4380
6d0b55c2
LP
4381 (void) expose_ports(rtnl, &exposed);
4382 }
023fb90b 4383
6d0b55c2 4384 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 4385
9c857b9d 4386 r = pty_forward_new(event, master, true, !interactive, &forward);
6d0b55c2
LP
4387 if (r < 0) {
4388 log_error_errno(r, "Failed to create PTY forwarder: %m");
4389 goto finish;
4390 }
023fb90b 4391
6d0b55c2
LP
4392 r = sd_event_loop(event);
4393 if (r < 0) {
4394 log_error_errno(r, "Failed to run event loop: %m");
4395 goto finish;
4396 }
4397
4398 pty_forward_get_last_char(forward, &last_char);
4399
4400 forward = pty_forward_free(forward);
4401
4402 if (!arg_quiet && last_char != '\n')
4403 putc('\n', stdout);
04d39279 4404
6d0b55c2
LP
4405 /* Kill if it is not dead yet anyway */
4406 terminate_machine(pid);
4407 }
840295fc 4408 }
1f0cd86b 4409
840295fc 4410 /* Normally redundant, but better safe than sorry */
04d39279 4411 kill(pid, SIGKILL);
a258bf26 4412
113cea80 4413 r = wait_for_container(pid, &container_status);
04d39279
LP
4414 pid = 0;
4415
ec16945e 4416 if (r < 0)
ce9f1527
LP
4417 /* We failed to wait for the container, or the
4418 * container exited abnormally */
ec16945e
LP
4419 goto finish;
4420 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
4421 /* The container exited with a non-zero
4422 * status, or with zero status and no reboot
4423 * was requested. */
ec16945e 4424 ret = r;
d87be9b0 4425 break;
ec16945e 4426 }
88213476 4427
113cea80 4428 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
4429
4430 if (arg_keep_unit) {
4431 /* Special handling if we are running as a
4432 * service: instead of simply restarting the
4433 * machine we want to restart the entire
4434 * service, so let's inform systemd about this
4435 * with the special exit code 133. The service
4436 * file uses RestartForceExitStatus=133 so
4437 * that this results in a full nspawn
4438 * restart. This is necessary since we might
4439 * have cgroup parameters set we want to have
4440 * flushed out. */
ec16945e
LP
4441 ret = 133;
4442 r = 0;
ce38dbc8
LP
4443 break;
4444 }
6d0b55c2
LP
4445
4446 flush_ports(&exposed);
d87be9b0 4447 }
88213476
LP
4448
4449finish:
af4ec430
LP
4450 sd_notify(false,
4451 "STOPPING=1\n"
4452 "STATUS=Terminating...");
4453
1b9e5b12
LP
4454 loop_remove(loop_nr, &image_fd);
4455
9444b1f2
LP
4456 if (pid > 0)
4457 kill(pid, SIGKILL);
88213476 4458
ec16945e
LP
4459 if (remove_subvol && arg_directory) {
4460 int k;
4461
4462 k = btrfs_subvol_remove(arg_directory);
4463 if (k < 0)
4464 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4465 }
4466
785890ac
LP
4467 if (arg_machine) {
4468 const char *p;
4469
63c372cb 4470 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4471 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4472 }
4473
04d391da 4474 free(arg_directory);
ec16945e
LP
4475 free(arg_template);
4476 free(arg_image);
7027ff61 4477 free(arg_machine);
c74e630d
LP
4478 free(arg_user);
4479 strv_free(arg_setenv);
4480 strv_free(arg_network_interfaces);
4481 strv_free(arg_network_macvlan);
4bbfe7ad 4482 strv_free(arg_network_ipvlan);
c74e630d
LP
4483 strv_free(arg_bind);
4484 strv_free(arg_bind_ro);
06c17c39 4485 strv_free(arg_tmpfs);
88213476 4486
6d0b55c2
LP
4487 flush_ports(&exposed);
4488
4489 while (arg_expose_ports) {
4490 ExposePort *p = arg_expose_ports;
4491 LIST_REMOVE(ports, arg_expose_ports, p);
4492 free(p);
4493 }
4494
ec16945e 4495 return r < 0 ? EXIT_FAILURE : ret;
88213476 4496}