]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
tree-wide: there is no ENOTSUP on linux
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
88213476 26#include <sys/mount.h>
88213476
LP
27#include <stdlib.h>
28#include <string.h>
29#include <stdio.h>
30#include <errno.h>
31#include <sys/prctl.h>
88213476 32#include <getopt.h>
687d0825 33#include <grp.h>
5ed27dbd 34#include <linux/fs.h>
9537eab0 35#include <sys/socket.h>
aea38d80 36#include <linux/netlink.h>
aa28aefe 37#include <net/if.h>
69c79d3c 38#include <linux/veth.h>
6afc95b7 39#include <sys/personality.h>
1b9e5b12 40#include <linux/loop.h>
2fbe4296 41#include <sys/file.h>
aa28aefe 42
5d63309c 43#ifdef HAVE_SELINUX
a8828ed9
DW
44#include <selinux/selinux.h>
45#endif
88213476 46
24fb1112
LP
47#ifdef HAVE_SECCOMP
48#include <seccomp.h>
49#endif
50
1b9e5b12
LP
51#ifdef HAVE_BLKID
52#include <blkid/blkid.h>
53#endif
54
1f0cd86b
LP
55#include "sd-daemon.h"
56#include "sd-bus.h"
57#include "sd-id128.h"
aa28aefe 58#include "sd-rtnl.h"
88213476
LP
59#include "log.h"
60#include "util.h"
49e942b2 61#include "mkdir.h"
6b2d0e85 62#include "macro.h"
94d82985 63#include "missing.h"
04d391da 64#include "cgroup-util.h"
a258bf26 65#include "strv.h"
9eb977db 66#include "path-util.h"
a41fe3a2 67#include "loopback-setup.h"
4fc9982c 68#include "dev-setup.h"
842f3b0f 69#include "fdset.h"
acbeb427 70#include "build.h"
a5c32cff 71#include "fileio.h"
40ca29a1 72#include "bus-util.h"
1f0cd86b 73#include "bus-error.h"
4ba93280 74#include "ptyfwd.h"
f4889f65 75#include "env-util.h"
aa28aefe 76#include "rtnl-util.h"
7e227024 77#include "udev-util.h"
1b9e5b12
LP
78#include "blkid-util.h"
79#include "gpt.h"
01dde061 80#include "siphash24.h"
849958d1 81#include "copy.h"
3577de7a 82#include "base-filesystem.h"
a2da110b 83#include "barrier.h"
023fb90b 84#include "event-util.h"
f01ae826 85#include "capability.h"
2822da4f 86#include "cap-list.h"
ec16945e 87#include "btrfs-util.h"
1b9cebf6 88#include "machine-image.h"
6d0b55c2
LP
89#include "list.h"
90#include "in-addr-util.h"
91#include "fw-util.h"
92#include "local-addresses.h"
f2d88580 93
e9642be2
LP
94#ifdef HAVE_SECCOMP
95#include "seccomp-util.h"
96#endif
97
6d0b55c2
LP
/* One --port= mapping requested on the command line. Entries are chained
 * into the arg_expose_ports list through the embedded "ports" list fields. */
typedef struct ExposePort {
        int protocol;                 /* IPPROTO_TCP or IPPROTO_UDP, as chosen in parse_argv() */
        uint16_t host_port;           /* port on the host side */
        uint16_t container_port;      /* port inside the container */
        LIST_FIELDS(struct ExposePort, ports);
} ExposePort;
104
113cea80
DH
/* Possible outcomes of a container run. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
109
57fb9fb5
LP
/* Journal linking mode selected with --link-journal= / -j. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
88213476 116
4d9f07b4
LP
/* Volatile mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2,
} Volatile;
122
/* Command line state, filled in by parse_argv(). */

/* Paths set via set_sanitized_path() for -D, --template= and (further below) -i. */
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_user = NULL;                           /* -u, heap copy of the argument */
static sd_id128_t arg_uuid = {};                        /* --uuid= */
static char *arg_machine = NULL;                        /* -M, heap copy; NULL when reset with an empty name */
/* The following three point straight into argv (optarg) and are not owned. */
static const char *arg_selinux_context = NULL;          /* -Z */
static const char *arg_selinux_apifs_context = NULL;    /* -L */
static const char *arg_slice = NULL;                    /* -S */
static bool arg_private_network = false;                /* set by --private-network and by every --network-* option */
static bool arg_read_only = false;
static bool arg_boot = false;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;               /* true for -j and the "try-" modes of --link-journal= */
/* Default capability mask. parse_argv() ORs in the --capability= set (plus
 * CAP_NET_ADMIN when private networking is on) and then masks out the
 * --drop-capability= set. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
/* Flat string vectors holding consecutive pairs: (source, destination) for
 * the bind lists, (path, mount options) for the tmpfs list. */
static char **arg_bind = NULL;
static char **arg_bind_ro = NULL;
static char **arg_tmpfs = NULL;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_share_system = false;
static bool arg_register = true;                        /* forced to false by parse_argv() when --share-system is given */
static bool arg_keep_unit = false;
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static const char *arg_network_bridge = NULL;           /* points into argv (optarg) */
/* 0xffffffffLU is also the value parse_argv() treats as "unknown personality". */
static unsigned long arg_personality = 0xffffffffLU;
static char *arg_image = NULL;
static Volatile arg_volatile = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;             /* head of the --port= list */
static char **arg_property = NULL;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns = false;
/* 0 means "not set"; parse_argv() defaults it to SIGRTMIN+3 when --boot is used. */
static int arg_kill_signal = 0;
88213476 185
/* Print the command line usage summary to stdout, prefixed with the name
 * the program was invoked as.
 *
 * NOTE(review): the runs of blanks that align the description column look
 * collapsed in this copy of the file -- confirm against the repository
 * before relying on the exact spacing of these literals. */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " --version Print version string\n"
               " -q --quiet Do not show status information\n"
               " -D --directory=PATH Root directory for the container\n"
               " --template=PATH Initialize root directory from template directory,\n"
               " if missing\n"
               " -x --ephemeral Run container with snapshot of root directory, and\n"
               " remove it after exit\n"
               " -i --image=PATH File system device or disk image for the container\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " -M --machine=NAME Set the machine name for the container\n"
               " --uuid=UUID Set a specific machine UUID for the container\n"
               " -S --slice=SLICE Place the container in the specified slice\n"
               " --property=NAME=VALUE Set scope unit property\n"
               " --private-network Disable network in container\n"
               " --network-interface=INTERFACE\n"
               " Assign an existing network interface to the\n"
               " container\n"
               " --network-macvlan=INTERFACE\n"
               " Create a macvlan network interface based on an\n"
               " existing network interface to the container\n"
               " --network-ipvlan=INTERFACE\n"
               " Create a ipvlan network interface based on an\n"
               " existing network interface to the container\n"
               " -n --network-veth Add a virtual ethernet connection between host\n"
               " and container\n"
               " --network-bridge=INTERFACE\n"
               " Add a virtual ethernet connection between host\n"
               " and container and add it to an existing bridge on\n"
               " the host\n"
               " --private-users[=UIDBASE[:NUIDS]]\n"
               " Run within user namespace\n"
               " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               " Expose a container IP port on the host\n"
               " -Z --selinux-context=SECLABEL\n"
               " Set the SELinux security context to be used by\n"
               " processes in the container\n"
               " -L --selinux-apifs-context=SECLABEL\n"
               " Set the SELinux security context to be used by\n"
               " API/tmpfs file systems in the container\n"
               " --capability=CAP In addition to the default, retain specified\n"
               " capability\n"
               " --drop-capability=CAP Drop the specified capability from the default set\n"
               " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
               " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
               " try-guest, try-host\n"
               " -j Equivalent to --link-journal=try-guest\n"
               " --read-only Mount the root directory read-only\n"
               " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
               " the container\n"
               " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
               " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
               " --share-system Share system namespaces with host\n"
               " --register=BOOLEAN Register container as machine\n"
               " --keep-unit Do not register a scope for the machine, reuse\n"
               " the service unit nspawn is running in\n"
               " --volatile[=MODE] Run the system in volatile mode\n"
               , program_invocation_short_name);
}
250
ec16945e
LP
/* Replace *b with a cleaned-up, absolute form of "path".
 *
 * The path is canonicalized if possible. If that fails only because the
 * path does not exist (ENOENT), it is instead made absolute relative to
 * the current working directory. Any previous string in *b is freed.
 *
 * Returns 0 on success, -errno if canonicalization fails for any other
 * reason, or -ENOMEM if the fallback could not produce a path. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Anything but "does not exist" is a hard failure. */
        if (resolved == NULL && errno != ENOENT)
                return -errno;

        if (resolved == NULL) {
                resolved = path_make_absolute_cwd(path);
                if (resolved == NULL)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
271
88213476
LP
272static int parse_argv(int argc, char *argv[]) {
273
a41fe3a2 274 enum {
acbeb427
ZJS
275 ARG_VERSION = 0x100,
276 ARG_PRIVATE_NETWORK,
bc2f673e 277 ARG_UUID,
5076f0cc 278 ARG_READ_ONLY,
57fb9fb5 279 ARG_CAPABILITY,
420c7379 280 ARG_DROP_CAPABILITY,
17fe0523
LP
281 ARG_LINK_JOURNAL,
282 ARG_BIND,
f4889f65 283 ARG_BIND_RO,
06c17c39 284 ARG_TMPFS,
f4889f65 285 ARG_SETENV,
eb91eb18 286 ARG_SHARE_SYSTEM,
89f7c846 287 ARG_REGISTER,
aa28aefe 288 ARG_KEEP_UNIT,
69c79d3c 289 ARG_NETWORK_INTERFACE,
c74e630d 290 ARG_NETWORK_MACVLAN,
4bbfe7ad 291 ARG_NETWORK_IPVLAN,
ab046dde 292 ARG_NETWORK_BRIDGE,
6afc95b7 293 ARG_PERSONALITY,
4d9f07b4 294 ARG_VOLATILE,
ec16945e 295 ARG_TEMPLATE,
f36933fe 296 ARG_PROPERTY,
6dac160c 297 ARG_PRIVATE_USERS,
c6c8f6e2 298 ARG_KILL_SIGNAL,
a41fe3a2
LP
299 };
300
88213476 301 static const struct option options[] = {
aa28aefe
LP
302 { "help", no_argument, NULL, 'h' },
303 { "version", no_argument, NULL, ARG_VERSION },
304 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
305 { "template", required_argument, NULL, ARG_TEMPLATE },
306 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
307 { "user", required_argument, NULL, 'u' },
308 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
309 { "boot", no_argument, NULL, 'b' },
310 { "uuid", required_argument, NULL, ARG_UUID },
311 { "read-only", no_argument, NULL, ARG_READ_ONLY },
312 { "capability", required_argument, NULL, ARG_CAPABILITY },
313 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
314 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
315 { "bind", required_argument, NULL, ARG_BIND },
316 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 317 { "tmpfs", required_argument, NULL, ARG_TMPFS },
aa28aefe
LP
318 { "machine", required_argument, NULL, 'M' },
319 { "slice", required_argument, NULL, 'S' },
320 { "setenv", required_argument, NULL, ARG_SETENV },
321 { "selinux-context", required_argument, NULL, 'Z' },
322 { "selinux-apifs-context", required_argument, NULL, 'L' },
323 { "quiet", no_argument, NULL, 'q' },
324 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
325 { "register", required_argument, NULL, ARG_REGISTER },
326 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
327 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 328 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 329 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 330 { "network-veth", no_argument, NULL, 'n' },
ab046dde 331 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 332 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 333 { "image", required_argument, NULL, 'i' },
4d9f07b4 334 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 335 { "port", required_argument, NULL, 'p' },
f36933fe 336 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 337 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 338 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
eb9da376 339 {}
88213476
LP
340 };
341
9444b1f2 342 int c, r;
a42c8b54 343 uint64_t plus = 0, minus = 0;
88213476
LP
344
345 assert(argc >= 0);
346 assert(argv);
347
0dfaa006 348 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
349
350 switch (c) {
351
352 case 'h':
601185b4
ZJS
353 help();
354 return 0;
88213476 355
acbeb427
ZJS
356 case ARG_VERSION:
357 puts(PACKAGE_STRING);
358 puts(SYSTEMD_FEATURES);
359 return 0;
360
88213476 361 case 'D':
ec16945e
LP
362 r = set_sanitized_path(&arg_directory, optarg);
363 if (r < 0)
364 return log_error_errno(r, "Invalid root directory: %m");
365
366 break;
367
368 case ARG_TEMPLATE:
369 r = set_sanitized_path(&arg_template, optarg);
370 if (r < 0)
371 return log_error_errno(r, "Invalid template directory: %m");
88213476
LP
372
373 break;
374
1b9e5b12 375 case 'i':
ec16945e
LP
376 r = set_sanitized_path(&arg_image, optarg);
377 if (r < 0)
378 return log_error_errno(r, "Invalid image path: %m");
379
380 break;
381
382 case 'x':
383 arg_ephemeral = true;
1b9e5b12
LP
384 break;
385
687d0825
MV
386 case 'u':
387 free(arg_user);
7027ff61
LP
388 arg_user = strdup(optarg);
389 if (!arg_user)
390 return log_oom();
687d0825
MV
391
392 break;
393
ab046dde 394 case ARG_NETWORK_BRIDGE:
c74e630d 395 arg_network_bridge = optarg;
ab046dde
TG
396
397 /* fall through */
398
0dfaa006 399 case 'n':
69c79d3c
LP
400 arg_network_veth = true;
401 arg_private_network = true;
402 break;
403
aa28aefe 404 case ARG_NETWORK_INTERFACE:
c74e630d
LP
405 if (strv_extend(&arg_network_interfaces, optarg) < 0)
406 return log_oom();
407
408 arg_private_network = true;
409 break;
410
411 case ARG_NETWORK_MACVLAN:
412 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
413 return log_oom();
414
4bbfe7ad
TG
415 arg_private_network = true;
416 break;
417
418 case ARG_NETWORK_IPVLAN:
419 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
420 return log_oom();
421
aa28aefe
LP
422 /* fall through */
423
ff01d048
LP
424 case ARG_PRIVATE_NETWORK:
425 arg_private_network = true;
a41fe3a2
LP
426 break;
427
0f0dbc46
LP
428 case 'b':
429 arg_boot = true;
430 break;
431
144f0fc0 432 case ARG_UUID:
9444b1f2
LP
433 r = sd_id128_from_string(optarg, &arg_uuid);
434 if (r < 0) {
aa96c6cb 435 log_error("Invalid UUID: %s", optarg);
9444b1f2 436 return r;
aa96c6cb 437 }
9444b1f2 438 break;
aa96c6cb 439
9444b1f2 440 case 'S':
c74e630d 441 arg_slice = optarg;
144f0fc0
LP
442 break;
443
7027ff61 444 case 'M':
eb91eb18
LP
445 if (isempty(optarg)) {
446 free(arg_machine);
447 arg_machine = NULL;
448 } else {
0c3c4284 449 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
450 log_error("Invalid machine name: %s", optarg);
451 return -EINVAL;
452 }
7027ff61 453
0c3c4284
LP
454 r = free_and_strdup(&arg_machine, optarg);
455 if (r < 0)
eb91eb18
LP
456 return log_oom();
457
458 break;
459 }
7027ff61 460
82adf6af
LP
461 case 'Z':
462 arg_selinux_context = optarg;
a8828ed9
DW
463 break;
464
82adf6af
LP
465 case 'L':
466 arg_selinux_apifs_context = optarg;
a8828ed9
DW
467 break;
468
bc2f673e
LP
469 case ARG_READ_ONLY:
470 arg_read_only = true;
471 break;
472
420c7379
LP
473 case ARG_CAPABILITY:
474 case ARG_DROP_CAPABILITY: {
a2a5291b 475 const char *state, *word;
5076f0cc
LP
476 size_t length;
477
478 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 479 _cleanup_free_ char *t;
5076f0cc
LP
480
481 t = strndup(word, length);
0d0f0c50
SL
482 if (!t)
483 return log_oom();
5076f0cc 484
39ed67d1
LP
485 if (streq(t, "all")) {
486 if (c == ARG_CAPABILITY)
a42c8b54 487 plus = (uint64_t) -1;
39ed67d1 488 else
a42c8b54 489 minus = (uint64_t) -1;
39ed67d1 490 } else {
2822da4f
LP
491 int cap;
492
493 cap = capability_from_name(t);
494 if (cap < 0) {
39ed67d1
LP
495 log_error("Failed to parse capability %s.", t);
496 return -EINVAL;
497 }
498
499 if (c == ARG_CAPABILITY)
a42c8b54 500 plus |= 1ULL << (uint64_t) cap;
39ed67d1 501 else
a42c8b54 502 minus |= 1ULL << (uint64_t) cap;
5076f0cc 503 }
5076f0cc
LP
504 }
505
506 break;
507 }
508
57fb9fb5
LP
509 case 'j':
510 arg_link_journal = LINK_GUEST;
574edc90 511 arg_link_journal_try = true;
57fb9fb5
LP
512 break;
513
514 case ARG_LINK_JOURNAL:
53e438e3 515 if (streq(optarg, "auto")) {
57fb9fb5 516 arg_link_journal = LINK_AUTO;
53e438e3
LP
517 arg_link_journal_try = false;
518 } else if (streq(optarg, "no")) {
57fb9fb5 519 arg_link_journal = LINK_NO;
53e438e3
LP
520 arg_link_journal_try = false;
521 } else if (streq(optarg, "guest")) {
57fb9fb5 522 arg_link_journal = LINK_GUEST;
53e438e3
LP
523 arg_link_journal_try = false;
524 } else if (streq(optarg, "host")) {
57fb9fb5 525 arg_link_journal = LINK_HOST;
53e438e3
LP
526 arg_link_journal_try = false;
527 } else if (streq(optarg, "try-guest")) {
574edc90
MP
528 arg_link_journal = LINK_GUEST;
529 arg_link_journal_try = true;
530 } else if (streq(optarg, "try-host")) {
531 arg_link_journal = LINK_HOST;
532 arg_link_journal_try = true;
533 } else {
57fb9fb5
LP
534 log_error("Failed to parse link journal mode %s", optarg);
535 return -EINVAL;
536 }
537
538 break;
539
17fe0523
LP
540 case ARG_BIND:
541 case ARG_BIND_RO: {
542 _cleanup_free_ char *a = NULL, *b = NULL;
543 char *e;
544 char ***x;
17fe0523
LP
545
546 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
547
548 e = strchr(optarg, ':');
549 if (e) {
550 a = strndup(optarg, e - optarg);
551 b = strdup(e + 1);
552 } else {
553 a = strdup(optarg);
554 b = strdup(optarg);
555 }
556
557 if (!a || !b)
558 return log_oom();
559
560 if (!path_is_absolute(a) || !path_is_absolute(b)) {
561 log_error("Invalid bind mount specification: %s", optarg);
562 return -EINVAL;
563 }
564
565 r = strv_extend(x, a);
566 if (r < 0)
b3451bed 567 return log_oom();
17fe0523
LP
568
569 r = strv_extend(x, b);
570 if (r < 0)
b3451bed 571 return log_oom();
17fe0523
LP
572
573 break;
574 }
575
06c17c39
LP
576 case ARG_TMPFS: {
577 _cleanup_free_ char *a = NULL, *b = NULL;
578 char *e;
579
580 e = strchr(optarg, ':');
581 if (e) {
582 a = strndup(optarg, e - optarg);
583 b = strdup(e + 1);
584 } else {
585 a = strdup(optarg);
586 b = strdup("mode=0755");
587 }
588
589 if (!a || !b)
590 return log_oom();
591
592 if (!path_is_absolute(a)) {
593 log_error("Invalid tmpfs specification: %s", optarg);
594 return -EINVAL;
595 }
596
597 r = strv_push(&arg_tmpfs, a);
598 if (r < 0)
599 return log_oom();
600
601 a = NULL;
602
603 r = strv_push(&arg_tmpfs, b);
604 if (r < 0)
605 return log_oom();
606
607 b = NULL;
608
609 break;
610 }
611
f4889f65
LP
612 case ARG_SETENV: {
613 char **n;
614
615 if (!env_assignment_is_valid(optarg)) {
616 log_error("Environment variable assignment '%s' is not valid.", optarg);
617 return -EINVAL;
618 }
619
620 n = strv_env_set(arg_setenv, optarg);
621 if (!n)
622 return log_oom();
623
624 strv_free(arg_setenv);
625 arg_setenv = n;
626 break;
627 }
628
284c0b91
LP
629 case 'q':
630 arg_quiet = true;
631 break;
632
8a96d94e
LP
633 case ARG_SHARE_SYSTEM:
634 arg_share_system = true;
635 break;
636
eb91eb18
LP
637 case ARG_REGISTER:
638 r = parse_boolean(optarg);
639 if (r < 0) {
640 log_error("Failed to parse --register= argument: %s", optarg);
641 return r;
642 }
643
644 arg_register = r;
645 break;
646
89f7c846
LP
647 case ARG_KEEP_UNIT:
648 arg_keep_unit = true;
649 break;
650
6afc95b7
LP
651 case ARG_PERSONALITY:
652
ac45f971 653 arg_personality = personality_from_string(optarg);
6afc95b7
LP
654 if (arg_personality == 0xffffffffLU) {
655 log_error("Unknown or unsupported personality '%s'.", optarg);
656 return -EINVAL;
657 }
658
659 break;
660
4d9f07b4
LP
661 case ARG_VOLATILE:
662
663 if (!optarg)
664 arg_volatile = VOLATILE_YES;
665 else {
666 r = parse_boolean(optarg);
667 if (r < 0) {
668 if (streq(optarg, "state"))
669 arg_volatile = VOLATILE_STATE;
670 else {
671 log_error("Failed to parse --volatile= argument: %s", optarg);
672 return r;
673 }
674 } else
675 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
676 }
677
678 break;
679
6d0b55c2
LP
680 case 'p': {
681 const char *split, *e;
682 uint16_t container_port, host_port;
683 int protocol;
684 ExposePort *p;
685
686 if ((e = startswith(optarg, "tcp:")))
687 protocol = IPPROTO_TCP;
688 else if ((e = startswith(optarg, "udp:")))
689 protocol = IPPROTO_UDP;
690 else {
691 e = optarg;
692 protocol = IPPROTO_TCP;
693 }
694
695 split = strchr(e, ':');
696 if (split) {
697 char v[split - e + 1];
698
699 memcpy(v, e, split - e);
700 v[split - e] = 0;
701
702 r = safe_atou16(v, &host_port);
703 if (r < 0 || host_port <= 0) {
704 log_error("Failed to parse host port: %s", optarg);
705 return -EINVAL;
706 }
707
708 r = safe_atou16(split + 1, &container_port);
709 } else {
710 r = safe_atou16(e, &container_port);
711 host_port = container_port;
712 }
713
714 if (r < 0 || container_port <= 0) {
715 log_error("Failed to parse host port: %s", optarg);
716 return -EINVAL;
717 }
718
719 LIST_FOREACH(ports, p, arg_expose_ports) {
720 if (p->protocol == protocol && p->host_port == host_port) {
721 log_error("Duplicate port specification: %s", optarg);
722 return -EINVAL;
723 }
724 }
725
726 p = new(ExposePort, 1);
727 if (!p)
728 return log_oom();
729
730 p->protocol = protocol;
731 p->host_port = host_port;
732 p->container_port = container_port;
733
734 LIST_PREPEND(ports, arg_expose_ports, p);
735
736 break;
737 }
738
f36933fe
LP
739 case ARG_PROPERTY:
740 if (strv_extend(&arg_property, optarg) < 0)
741 return log_oom();
742
743 break;
744
6dac160c
LP
745 case ARG_PRIVATE_USERS:
746 if (optarg) {
747 _cleanup_free_ char *buffer = NULL;
748 const char *range, *shift;
749
750 range = strchr(optarg, ':');
751 if (range) {
752 buffer = strndup(optarg, range - optarg);
753 if (!buffer)
754 return log_oom();
755 shift = buffer;
756
757 range++;
758 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
759 log_error("Failed to parse UID range: %s", range);
760 return -EINVAL;
761 }
762 } else
763 shift = optarg;
764
765 if (parse_uid(shift, &arg_uid_shift) < 0) {
766 log_error("Failed to parse UID: %s", optarg);
767 return -EINVAL;
768 }
769 }
770
771 arg_userns = true;
772 break;
773
c6c8f6e2
LP
774 case ARG_KILL_SIGNAL:
775 arg_kill_signal = signal_from_string_try_harder(optarg);
776 if (arg_kill_signal < 0) {
777 log_error("Cannot parse signal: %s", optarg);
778 return -EINVAL;
779 }
780
781 break;
782
88213476
LP
783 case '?':
784 return -EINVAL;
785
786 default:
eb9da376 787 assert_not_reached("Unhandled option");
88213476 788 }
88213476 789
eb91eb18
LP
790 if (arg_share_system)
791 arg_register = false;
792
793 if (arg_boot && arg_share_system) {
794 log_error("--boot and --share-system may not be combined.");
795 return -EINVAL;
796 }
797
89f7c846
LP
798 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
799 log_error("--keep-unit may not be used when invoked from a user session.");
800 return -EINVAL;
801 }
802
1b9e5b12
LP
803 if (arg_directory && arg_image) {
804 log_error("--directory= and --image= may not be combined.");
805 return -EINVAL;
806 }
807
ec16945e
LP
808 if (arg_template && arg_image) {
809 log_error("--template= and --image= may not be combined.");
810 return -EINVAL;
811 }
812
813 if (arg_template && !(arg_directory || arg_machine)) {
814 log_error("--template= needs --directory= or --machine=.");
815 return -EINVAL;
816 }
817
818 if (arg_ephemeral && arg_template) {
819 log_error("--ephemeral and --template= may not be combined.");
820 return -EINVAL;
821 }
822
823 if (arg_ephemeral && arg_image) {
824 log_error("--ephemeral and --image= may not be combined.");
825 return -EINVAL;
826 }
827
df9a75e4
LP
828 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
829 log_error("--ephemeral and --link-journal= may not be combined.");
830 return -EINVAL;
831 }
832
4d9f07b4
LP
833 if (arg_volatile != VOLATILE_NO && arg_read_only) {
834 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
835 return -EINVAL;
836 }
837
6d0b55c2
LP
838 if (arg_expose_ports && !arg_private_network) {
839 log_error("Cannot use --port= without private networking.");
840 return -EINVAL;
841 }
842
a42c8b54
LP
843 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
844
c6c8f6e2
LP
845 if (arg_boot && arg_kill_signal <= 0)
846 arg_kill_signal = SIGRTMIN+3;
847
88213476
LP
848 return 1;
849}
850
851static int mount_all(const char *dest) {
852
853 typedef struct MountPoint {
854 const char *what;
855 const char *where;
856 const char *type;
857 const char *options;
858 unsigned long flags;
3bd66c05 859 bool fatal;
88213476
LP
860 } MountPoint;
861
862 static const MountPoint mount_table[] = {
06c17c39
LP
863 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
864 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
865 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
866 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
867 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 868 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
06c17c39
LP
869 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
870 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
bbb99c30 871 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
9b634ea5 872#ifdef HAVE_SELINUX
06c17c39
LP
873 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
874 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 875#endif
88213476
LP
876 };
877
878 unsigned k;
879 int r = 0;
880
881 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
d15d65a0 882 _cleanup_free_ char *where = NULL, *options = NULL;
d002827b 883 const char *o;
88213476
LP
884 int t;
885
17fe0523
LP
886 where = strjoin(dest, "/", mount_table[k].where, NULL);
887 if (!where)
888 return log_oom();
88213476 889
e65aec12 890 t = path_is_mount_point(where, true);
68fb0892 891 if (t < 0) {
da927ba9 892 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
88213476
LP
893
894 if (r == 0)
895 r = t;
896
897 continue;
898 }
899
9c1c7f71
LP
900 /* Skip this entry if it is not a remount. */
901 if (mount_table[k].what && t > 0)
014a9c77
LP
902 continue;
903
79d80fc1
TG
904 t = mkdir_p(where, 0755);
905 if (t < 0) {
906 if (mount_table[k].fatal) {
da927ba9 907 log_error_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
908
909 if (r == 0)
910 r = t;
911 } else
da927ba9 912 log_warning_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
913
914 continue;
915 }
88213476 916
a8828ed9 917#ifdef HAVE_SELINUX
82adf6af
LP
918 if (arg_selinux_apifs_context &&
919 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
920 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
921 if (!options)
922 return log_oom();
923
924 o = options;
925 } else
a8828ed9 926#endif
d002827b 927 o = mount_table[k].options;
a8828ed9 928
6dac160c
LP
929 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
930 char *uid_options = NULL;
931
932 if (o)
933 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
934 else
935 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
936 if (!uid_options)
937 return log_oom();
938
939 free(options);
940 o = options = uid_options;
941 }
a8828ed9 942
88213476
LP
943 if (mount(mount_table[k].what,
944 where,
945 mount_table[k].type,
946 mount_table[k].flags,
79d80fc1 947 o) < 0) {
88213476 948
79d80fc1 949 if (mount_table[k].fatal) {
56f64d95 950 log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 951
79d80fc1
TG
952 if (r == 0)
953 r = -errno;
954 } else
56f64d95 955 log_warning_errno(errno, "mount(%s) failed: %m", where);
88213476 956 }
88213476
LP
957 }
958
e58a1277
LP
959 return r;
960}
f8440af5 961
d6797c92 962static int mount_binds(const char *dest, char **l, bool ro) {
17fe0523
LP
963 char **x, **y;
964
965 STRV_FOREACH_PAIR(x, y, l) {
06c17c39 966 _cleanup_free_ char *where = NULL;
d2421337 967 struct stat source_st, dest_st;
2ed4e5e0 968 int r;
d2421337 969
4a62c710
MS
970 if (stat(*x, &source_st) < 0)
971 return log_error_errno(errno, "Failed to stat %s: %m", *x);
17fe0523 972
06c17c39
LP
973 where = strappend(dest, *y);
974 if (!where)
975 return log_oom();
976
2ed4e5e0
SL
977 r = stat(where, &dest_st);
978 if (r == 0) {
05e7da5a
AC
979 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
980 log_error("Cannot bind mount directory %s on file %s.", *x, where);
981 return -EINVAL;
982 }
983 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
984 log_error("Cannot bind mount file %s on directory %s.", *x, where);
d2421337
DR
985 return -EINVAL;
986 }
2ed4e5e0
SL
987 } else if (errno == ENOENT) {
988 r = mkdir_parents_label(where, 0755);
f647962d
MS
989 if (r < 0)
990 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
2ed4e5e0 991 } else {
56f64d95 992 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
2ed4e5e0
SL
993 return -errno;
994 }
06c17c39 995
05e7da5a
AC
996 /* Create the mount point. Any non-directory file can be
997 * mounted on any non-directory file (regular, fifo, socket,
998 * char, block).
999 */
79d80fc1
TG
1000 if (S_ISDIR(source_st.st_mode)) {
1001 r = mkdir_label(where, 0755);
f647962d
MS
1002 if (r < 0 && errno != EEXIST)
1003 return log_error_errno(r, "Failed to create mount point %s: %m", where);
05e7da5a 1004 } else {
79d80fc1 1005 r = touch(where);
f647962d
MS
1006 if (r < 0)
1007 return log_error_errno(r, "Failed to create mount point %s: %m", where);
d2421337 1008 }
17fe0523 1009
4a62c710
MS
1010 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
1011 return log_error_errno(errno, "mount(%s) failed: %m", where);
17fe0523 1012
d6797c92
LP
1013 if (ro) {
1014 r = bind_remount_recursive(where, true);
f647962d
MS
1015 if (r < 0)
1016 return log_error_errno(r, "Read-Only bind mount failed: %m");
17fe0523
LP
1017 }
1018 }
1019
1020 return 0;
1021}
1022
b12afc8c
LP
1023static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1024 char *to;
1025 int r;
1026
63c372cb 1027 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
b12afc8c
LP
1028
1029 r = path_is_mount_point(to, false);
1030 if (r < 0)
1031 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1032 if (r > 0)
1033 return 0;
1034
1035 mkdir_p(to, 0755);
1036
c0534580
LP
1037 /* The superblock mount options of the mount point need to be
1038 * identical to the hosts', and hence writable... */
1039 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
b12afc8c
LP
1040 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1041
c0534580
LP
1042 /* ... hence let's only make the bind mount read-only, not the
1043 * superblock. */
1044 if (read_only) {
1045 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1046 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1047 }
b12afc8c
LP
1048 return 1;
1049}
1050
1051static int mount_cgroup(const char *dest) {
1052 _cleanup_set_free_free_ Set *controllers = NULL;
1053 _cleanup_free_ char *own_cgroup_path = NULL;
1054 const char *cgroup_root, *systemd_root, *systemd_own;
1055 int r;
1056
1057 controllers = set_new(&string_hash_ops);
1058 if (!controllers)
1059 return log_oom();
1060
1061 r = cg_kernel_controllers(controllers);
1062 if (r < 0)
1063 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1064
1065 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1066 if (r < 0)
1067 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1068
63c372cb 1069 cgroup_root = strjoina(dest, "/sys/fs/cgroup");
b12afc8c
LP
1070 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1071 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1072
1073 for (;;) {
1074 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1075
1076 controller = set_steal_first(controllers);
1077 if (!controller)
1078 break;
1079
1080 origin = strappend("/sys/fs/cgroup/", controller);
1081 if (!origin)
1082 return log_oom();
1083
1084 r = readlink_malloc(origin, &combined);
1085 if (r == -EINVAL) {
1086 /* Not a symbolic link, but directly a single cgroup hierarchy */
1087
1088 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1089 if (r < 0)
1090 return r;
1091
1092 } else if (r < 0)
1093 return log_error_errno(r, "Failed to read link %s: %m", origin);
1094 else {
1095 _cleanup_free_ char *target = NULL;
1096
1097 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1098 if (!target)
1099 return log_oom();
1100
1101 /* A symbolic link, a combination of controllers in one hierarchy */
1102
1103 if (!filename_is_valid(combined)) {
1104 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1105 continue;
1106 }
1107
1108 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1109 if (r < 0)
1110 return r;
1111
1112 if (symlink(combined, target) < 0)
83521414 1113 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
b12afc8c
LP
1114 }
1115 }
1116
c0534580 1117 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
b12afc8c
LP
1118 if (r < 0)
1119 return r;
1120
1121 /* Make our own cgroup a (writable) bind mount */
63c372cb 1122 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
b12afc8c
LP
1123 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1124 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1125
1126 /* And then remount the systemd cgroup root read-only */
63c372cb 1127 systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
b12afc8c
LP
1128 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1129 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1130
1131 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1132 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1133
1134 return 0;
1135}
1136
06c17c39
LP
1137static int mount_tmpfs(const char *dest) {
1138 char **i, **o;
1139
1140 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1141 _cleanup_free_ char *where = NULL;
79d80fc1 1142 int r;
06c17c39
LP
1143
1144 where = strappend(dest, *i);
1145 if (!where)
1146 return log_oom();
1147
79d80fc1 1148 r = mkdir_label(where, 0755);
04a91939
LP
1149 if (r < 0 && r != -EEXIST)
1150 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
06c17c39 1151
4a62c710
MS
1152 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1153 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
06c17c39
LP
1154 }
1155
1156 return 0;
1157}
1158
e58a1277 1159static int setup_timezone(const char *dest) {
d4036145
LP
1160 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1161 char *z, *y;
1162 int r;
f8440af5 1163
e58a1277
LP
1164 assert(dest);
1165
1166 /* Fix the timezone, if possible */
d4036145
LP
1167 r = readlink_malloc("/etc/localtime", &p);
1168 if (r < 0) {
1169 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1170 return 0;
1171 }
1172
1173 z = path_startswith(p, "../usr/share/zoneinfo/");
1174 if (!z)
1175 z = path_startswith(p, "/usr/share/zoneinfo/");
1176 if (!z) {
1177 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1178 return 0;
1179 }
1180
04bc4a3f
LP
1181 where = strappend(dest, "/etc/localtime");
1182 if (!where)
0d0f0c50 1183 return log_oom();
715ac17a 1184
d4036145
LP
1185 r = readlink_malloc(where, &q);
1186 if (r >= 0) {
1187 y = path_startswith(q, "../usr/share/zoneinfo/");
1188 if (!y)
1189 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1190
d4036145
LP
1191 /* Already pointing to the right place? Then do nothing .. */
1192 if (y && streq(y, z))
1193 return 0;
1194 }
1195
1196 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1197 if (!check)
0d0f0c50 1198 return log_oom();
4d1c38b8 1199
d4036145
LP
1200 if (access(check, F_OK) < 0) {
1201 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1202 return 0;
1203 }
68fb0892 1204
d4036145
LP
1205 what = strappend("../usr/share/zoneinfo/", z);
1206 if (!what)
1207 return log_oom();
1208
79d80fc1
TG
1209 r = mkdir_parents(where, 0755);
1210 if (r < 0) {
da927ba9 1211 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
79d80fc1
TG
1212
1213 return 0;
1214 }
1215
1216 r = unlink(where);
1217 if (r < 0 && errno != ENOENT) {
56f64d95 1218 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1219
1220 return 0;
1221 }
4d9f07b4 1222
d4036145 1223 if (symlink(what, where) < 0) {
56f64d95 1224 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1225 return 0;
1226 }
e58a1277
LP
1227
1228 return 0;
88213476
LP
1229}
1230
2547bb41 1231static int setup_resolv_conf(const char *dest) {
c8b32e11 1232 _cleanup_free_ char *where = NULL;
79d80fc1 1233 int r;
2547bb41
LP
1234
1235 assert(dest);
1236
1237 if (arg_private_network)
1238 return 0;
1239
1240 /* Fix resolv.conf, if possible */
04bc4a3f
LP
1241 where = strappend(dest, "/etc/resolv.conf");
1242 if (!where)
0d0f0c50 1243 return log_oom();
2547bb41 1244
77e63faf
LP
1245 /* We don't really care for the results of this really. If it
1246 * fails, it fails, but meh... */
79d80fc1
TG
1247 r = mkdir_parents(where, 0755);
1248 if (r < 0) {
da927ba9 1249 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
79d80fc1
TG
1250
1251 return 0;
1252 }
1253
f2068bcc 1254 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1255 if (r < 0) {
da927ba9 1256 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1257
1258 return 0;
1259 }
2547bb41
LP
1260
1261 return 0;
1262}
1263
4d9f07b4
LP
1264static int setup_volatile_state(const char *directory) {
1265 const char *p;
1266 int r;
1267
1268 assert(directory);
1269
1270 if (arg_volatile != VOLATILE_STATE)
1271 return 0;
1272
1273 /* --volatile=state means we simply overmount /var
1274 with a tmpfs, and the rest read-only. */
1275
1276 r = bind_remount_recursive(directory, true);
f647962d
MS
1277 if (r < 0)
1278 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4 1279
63c372cb 1280 p = strjoina(directory, "/var");
79d80fc1 1281 r = mkdir(p, 0755);
4a62c710
MS
1282 if (r < 0 && errno != EEXIST)
1283 return log_error_errno(errno, "Failed to create %s: %m", directory);
4d9f07b4 1284
4a62c710
MS
1285 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1286 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
1287
1288 return 0;
1289}
1290
1291static int setup_volatile(const char *directory) {
1292 bool tmpfs_mounted = false, bind_mounted = false;
1293 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1294 const char *f, *t;
1295 int r;
1296
1297 assert(directory);
1298
1299 if (arg_volatile != VOLATILE_YES)
1300 return 0;
1301
1302 /* --volatile=yes means we mount a tmpfs to the root dir, and
1303 the original /usr to use inside it, and that read-only. */
1304
4a62c710
MS
1305 if (!mkdtemp(template))
1306 return log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4
LP
1307
1308 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
56f64d95 1309 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
1310 r = -errno;
1311 goto fail;
1312 }
1313
1314 tmpfs_mounted = true;
1315
63c372cb
LP
1316 f = strjoina(directory, "/usr");
1317 t = strjoina(template, "/usr");
4d9f07b4 1318
79d80fc1
TG
1319 r = mkdir(t, 0755);
1320 if (r < 0 && errno != EEXIST) {
56f64d95 1321 log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
1322 r = -errno;
1323 goto fail;
1324 }
1325
4d9f07b4 1326 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
56f64d95 1327 log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
1328 r = -errno;
1329 goto fail;
1330 }
1331
1332 bind_mounted = true;
1333
1334 r = bind_remount_recursive(t, true);
1335 if (r < 0) {
da927ba9 1336 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
1337 goto fail;
1338 }
1339
1340 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
56f64d95 1341 log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1342 r = -errno;
1343 goto fail;
1344 }
1345
1346 rmdir(template);
1347
1348 return 0;
1349
1350fail:
1351 if (bind_mounted)
1352 umount(t);
1353 if (tmpfs_mounted)
1354 umount(template);
1355 rmdir(template);
1356 return r;
1357}
1358
9f24adc2
LP
1359static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1360
1361 snprintf(s, 37,
1362 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1363 SD_ID128_FORMAT_VAL(id));
1364
1365 return s;
1366}
1367
04bc4a3f 1368static int setup_boot_id(const char *dest) {
7fd1b19b 1369 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 1370 sd_id128_t rnd = {};
04bc4a3f
LP
1371 char as_uuid[37];
1372 int r;
1373
1374 assert(dest);
1375
eb91eb18
LP
1376 if (arg_share_system)
1377 return 0;
1378
04bc4a3f
LP
1379 /* Generate a new randomized boot ID, so that each boot-up of
1380 * the container gets a new one */
1381
1382 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 1383 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
1384 if (!from || !to)
1385 return log_oom();
04bc4a3f
LP
1386
1387 r = sd_id128_randomize(&rnd);
f647962d
MS
1388 if (r < 0)
1389 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1390
9f24adc2 1391 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1392
574d5f2d 1393 r = write_string_file(from, as_uuid);
f647962d
MS
1394 if (r < 0)
1395 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f
LP
1396
1397 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
56f64d95 1398 log_error_errno(errno, "Failed to bind mount boot id: %m");
04bc4a3f 1399 r = -errno;
10d18763 1400 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
56f64d95 1401 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1402
1403 unlink(from);
04bc4a3f
LP
1404 return r;
1405}
1406
e58a1277 1407static int copy_devnodes(const char *dest) {
88213476
LP
1408
1409 static const char devnodes[] =
1410 "null\0"
1411 "zero\0"
1412 "full\0"
1413 "random\0"
1414 "urandom\0"
85614d66
TG
1415 "tty\0"
1416 "net/tun\0";
88213476
LP
1417
1418 const char *d;
e58a1277 1419 int r = 0;
7fd1b19b 1420 _cleanup_umask_ mode_t u;
a258bf26
LP
1421
1422 assert(dest);
124640f1
LP
1423
1424 u = umask(0000);
88213476
LP
1425
1426 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1427 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1428 struct stat st;
88213476 1429
7f112f50
LP
1430 from = strappend("/dev/", d);
1431 to = strjoin(dest, "/dev/", d, NULL);
1432 if (!from || !to)
1433 return log_oom();
88213476
LP
1434
1435 if (stat(from, &st) < 0) {
1436
4a62c710
MS
1437 if (errno != ENOENT)
1438 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1439
a258bf26 1440 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1441
ed8b7a3e 1442 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 1443 return -EIO;
a258bf26 1444
85614d66
TG
1445 } else {
1446 r = mkdir_parents(to, 0775);
1447 if (r < 0) {
da927ba9 1448 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
85614d66
TG
1449 return -r;
1450 }
a258bf26 1451
4a62c710 1452 if (mknod(to, st.st_mode, st.st_rdev) < 0)
080e7832 1453 return log_error_errno(errno, "mknod(%s) failed: %m", to);
6278cf60
LP
1454
1455 if (arg_userns && arg_uid_shift != UID_INVALID)
1456 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1457 return log_error_errno(errno, "chown() of device node %s failed: %m", to);
88213476 1458 }
88213476
LP
1459 }
1460
e58a1277
LP
1461 return r;
1462}
88213476 1463
f2d88580
LP
1464static int setup_ptmx(const char *dest) {
1465 _cleanup_free_ char *p = NULL;
1466
1467 p = strappend(dest, "/dev/ptmx");
1468 if (!p)
1469 return log_oom();
1470
4a62c710
MS
1471 if (symlink("pts/ptmx", p) < 0)
1472 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
f2d88580 1473
6278cf60
LP
1474 if (arg_userns && arg_uid_shift != UID_INVALID)
1475 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1476 return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
1477
f2d88580
LP
1478 return 0;
1479}
1480
e58a1277 1481static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1482 _cleanup_umask_ mode_t u;
1483 const char *to;
e58a1277 1484 struct stat st;
e58a1277 1485 int r;
e58a1277
LP
1486
1487 assert(dest);
1488 assert(console);
1489
1490 u = umask(0000);
1491
4a62c710
MS
1492 if (stat("/dev/null", &st) < 0)
1493 return log_error_errno(errno, "Failed to stat /dev/null: %m");
88213476 1494
e58a1277 1495 r = chmod_and_chown(console, 0600, 0, 0);
f647962d
MS
1496 if (r < 0)
1497 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1498
a258bf26
LP
1499 /* We need to bind mount the right tty to /dev/console since
1500 * ptys can only exist on pts file systems. To have something
eb0f0863
LP
1501 * to bind mount things on we create a device node first, and
1502 * use /dev/null for that since we the cgroups device policy
1503 * allows us to create that freely, while we cannot create
1504 * /dev/console. (Note that the major minor doesn't actually
1505 * matter here, since we mount it over anyway). */
a258bf26 1506
63c372cb 1507 to = strjoina(dest, "/dev/console");
4a62c710
MS
1508 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1509 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
a258bf26 1510
4a62c710
MS
1511 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1512 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1513
25ea79fe 1514 return 0;
e58a1277
LP
1515}
1516
1517static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1518 _cleanup_free_ char *from = NULL, *to = NULL;
7fd1b19b 1519 _cleanup_umask_ mode_t u;
6d0b55c2 1520 int r, fd, k;
e58a1277
LP
1521 union {
1522 struct cmsghdr cmsghdr;
1523 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1524 } control = {};
1525 struct msghdr mh = {
1526 .msg_control = &control,
1527 .msg_controllen = sizeof(control),
1528 };
e58a1277
LP
1529 struct cmsghdr *cmsg;
1530
1531 assert(dest);
1532 assert(kmsg_socket >= 0);
a258bf26 1533
e58a1277 1534 u = umask(0000);
a258bf26 1535
f1e5dfe2
LP
1536 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1537 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1538 * on the reading side behave very similar to /proc/kmsg,
1539 * their writing side behaves differently from /dev/kmsg in
1540 * that writing blocks when nothing is reading. In order to
1541 * avoid any problems with containers deadlocking due to this
1542 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1543 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1544 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1545 return log_oom();
e58a1277 1546
4a62c710
MS
1547 if (mkfifo(from, 0600) < 0)
1548 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
e58a1277
LP
1549
1550 r = chmod_and_chown(from, 0600, 0, 0);
f647962d
MS
1551 if (r < 0)
1552 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
e58a1277 1553
4a62c710
MS
1554 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1555 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1556
1557 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1558 if (fd < 0)
1559 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1560
e58a1277
LP
1561 cmsg = CMSG_FIRSTHDR(&mh);
1562 cmsg->cmsg_level = SOL_SOCKET;
1563 cmsg->cmsg_type = SCM_RIGHTS;
1564 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1565 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1566
1567 mh.msg_controllen = cmsg->cmsg_len;
1568
1569 /* Store away the fd in the socket, so that it stays open as
1570 * long as we run the child */
6d0b55c2 1571 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
03e334a1 1572 safe_close(fd);
e58a1277 1573
4a62c710
MS
1574 if (k < 0)
1575 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 1576
f1e5dfe2
LP
1577 /* And now make the FIFO unavailable as /dev/kmsg... */
1578 unlink(from);
25ea79fe 1579 return 0;
88213476
LP
1580}
1581
6d0b55c2
LP
1582static int send_rtnl(int send_fd) {
1583 union {
1584 struct cmsghdr cmsghdr;
1585 uint8_t buf[CMSG_SPACE(sizeof(int))];
1586 } control = {};
1587 struct msghdr mh = {
1588 .msg_control = &control,
1589 .msg_controllen = sizeof(control),
1590 };
1591 struct cmsghdr *cmsg;
1592 _cleanup_close_ int fd = -1;
1593 ssize_t k;
1594
1595 assert(send_fd >= 0);
1596
1597 if (!arg_expose_ports)
1598 return 0;
1599
1600 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1601 if (fd < 0)
1602 return log_error_errno(errno, "failed to allocate container netlink: %m");
1603
1604 cmsg = CMSG_FIRSTHDR(&mh);
1605 cmsg->cmsg_level = SOL_SOCKET;
1606 cmsg->cmsg_type = SCM_RIGHTS;
1607 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1608 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1609
1610 mh.msg_controllen = cmsg->cmsg_len;
1611
1612 /* Store away the fd in the socket, so that it stays open as
1613 * long as we run the child */
1614 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1615 if (k < 0)
1616 return log_error_errno(errno, "Failed to send netlink fd: %m");
1617
1618 return 0;
1619}
1620
1621static int flush_ports(union in_addr_union *exposed) {
1622 ExposePort *p;
1623 int r, af = AF_INET;
1624
1625 assert(exposed);
1626
1627 if (!arg_expose_ports)
1628 return 0;
1629
1630 if (in_addr_is_null(af, exposed))
1631 return 0;
1632
1633 log_debug("Lost IP address.");
1634
1635 LIST_FOREACH(ports, p, arg_expose_ports) {
1636 r = fw_add_local_dnat(false,
1637 af,
1638 p->protocol,
1639 NULL,
1640 NULL, 0,
1641 NULL, 0,
1642 p->host_port,
1643 exposed,
1644 p->container_port,
1645 NULL);
1646 if (r < 0)
1647 log_warning_errno(r, "Failed to modify firewall: %m");
1648 }
1649
1650 *exposed = IN_ADDR_NULL;
1651 return 0;
1652}
1653
1654static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1655 _cleanup_free_ struct local_address *addresses = NULL;
1656 _cleanup_free_ char *pretty = NULL;
1657 union in_addr_union new_exposed;
1658 ExposePort *p;
1659 bool add;
1660 int af = AF_INET, r;
1661
1662 assert(exposed);
1663
1664 /* Invoked each time an address is added or removed inside the
1665 * container */
1666
1667 if (!arg_expose_ports)
1668 return 0;
1669
1670 r = local_addresses(rtnl, 0, af, &addresses);
1671 if (r < 0)
1672 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1673
1674 add = r > 0 &&
1675 addresses[0].family == af &&
1676 addresses[0].scope < RT_SCOPE_LINK;
1677
1678 if (!add)
1679 return flush_ports(exposed);
1680
1681 new_exposed = addresses[0].address;
1682 if (in_addr_equal(af, exposed, &new_exposed))
1683 return 0;
1684
1685 in_addr_to_string(af, &new_exposed, &pretty);
1686 log_debug("New container IP is %s.", strna(pretty));
1687
1688 LIST_FOREACH(ports, p, arg_expose_ports) {
1689
1690 r = fw_add_local_dnat(true,
1691 af,
1692 p->protocol,
1693 NULL,
1694 NULL, 0,
1695 NULL, 0,
1696 p->host_port,
1697 &new_exposed,
1698 p->container_port,
1699 in_addr_is_null(af, exposed) ? NULL : exposed);
1700 if (r < 0)
1701 log_warning_errno(r, "Failed to modify firewall: %m");
1702 }
1703
1704 *exposed = new_exposed;
1705 return 0;
1706}
1707
1708static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1709 union in_addr_union *exposed = userdata;
1710
1711 assert(rtnl);
1712 assert(m);
1713 assert(exposed);
1714
1715 expose_ports(rtnl, exposed);
1716 return 0;
1717}
1718
1719static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1720 union {
1721 struct cmsghdr cmsghdr;
1722 uint8_t buf[CMSG_SPACE(sizeof(int))];
1723 } control = {};
1724 struct msghdr mh = {
1725 .msg_control = &control,
1726 .msg_controllen = sizeof(control),
1727 };
1728 struct cmsghdr *cmsg;
1729 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1730 int fd, r;
1731 ssize_t k;
1732
1733 assert(event);
1734 assert(recv_fd >= 0);
1735 assert(ret);
1736
1737 if (!arg_expose_ports)
1738 return 0;
1739
1740 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1741 if (k < 0)
1742 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1743
1744 cmsg = CMSG_FIRSTHDR(&mh);
1745 assert(cmsg->cmsg_level == SOL_SOCKET);
1746 assert(cmsg->cmsg_type == SCM_RIGHTS);
657bdca9 1747 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
6d0b55c2
LP
1748 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1749
1750 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1751 if (r < 0) {
1752 safe_close(fd);
1753 return log_error_errno(r, "Failed to create rtnl object: %m");
1754 }
1755
1756 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1757 if (r < 0)
1758 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1759
1760 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1761 if (r < 0)
1762 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1763
1764 r = sd_rtnl_attach_event(rtnl, event, 0);
1765 if (r < 0)
1766 return log_error_errno(r, "Failed to add to even loop: %m");
1767
1768 *ret = rtnl;
1769 rtnl = NULL;
1770
1771 return 0;
1772}
1773
3a74cea5 1774static int setup_hostname(void) {
3a74cea5 1775
eb91eb18
LP
1776 if (arg_share_system)
1777 return 0;
1778
605f81a8 1779 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1780 return -errno;
3a74cea5 1781
7027ff61 1782 return 0;
3a74cea5
LP
1783}
1784
57fb9fb5 1785static int setup_journal(const char *directory) {
4d680aee 1786 sd_id128_t machine_id, this_id;
7fd1b19b 1787 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1788 char *id;
57fb9fb5
LP
1789 int r;
1790
df9a75e4
LP
1791 /* Don't link journals in ephemeral mode */
1792 if (arg_ephemeral)
1793 return 0;
1794
57fb9fb5 1795 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1796 if (!p)
1797 return log_oom();
57fb9fb5
LP
1798
1799 r = read_one_line_file(p, &b);
27407a01
ZJS
1800 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1801 return 0;
f647962d
MS
1802 else if (r < 0)
1803 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
57fb9fb5 1804
27407a01
ZJS
1805 id = strstrip(b);
1806 if (isempty(id) && arg_link_journal == LINK_AUTO)
1807 return 0;
57fb9fb5 1808
27407a01
ZJS
1809 /* Verify validity */
1810 r = sd_id128_from_string(id, &machine_id);
f647962d
MS
1811 if (r < 0)
1812 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
57fb9fb5 1813
4d680aee 1814 r = sd_id128_get_machine(&this_id);
f647962d
MS
1815 if (r < 0)
1816 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1817
1818 if (sd_id128_equal(machine_id, this_id)) {
1819 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1820 "Host and machine ids are equal (%s): refusing to link journals", id);
1821 if (arg_link_journal == LINK_AUTO)
1822 return 0;
df9a75e4 1823 return -EEXIST;
4d680aee
ZJS
1824 }
1825
1826 if (arg_link_journal == LINK_NO)
1827 return 0;
1828
57fb9fb5 1829 free(p);
27407a01
ZJS
1830 p = strappend("/var/log/journal/", id);
1831 q = strjoin(directory, "/var/log/journal/", id, NULL);
1832 if (!p || !q)
1833 return log_oom();
1834
1835 if (path_is_mount_point(p, false) > 0) {
1836 if (arg_link_journal != LINK_AUTO) {
1837 log_error("%s: already a mount point, refusing to use for journal", p);
1838 return -EEXIST;
1839 }
1840
1841 return 0;
57fb9fb5
LP
1842 }
1843
27407a01 1844 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1845 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1846 log_error("%s: already a mount point, refusing to use for journal", q);
1847 return -EEXIST;
57fb9fb5
LP
1848 }
1849
27407a01 1850 return 0;
57fb9fb5
LP
1851 }
1852
1853 r = readlink_and_make_absolute(p, &d);
1854 if (r >= 0) {
1855 if ((arg_link_journal == LINK_GUEST ||
1856 arg_link_journal == LINK_AUTO) &&
1857 path_equal(d, q)) {
1858
27407a01
ZJS
1859 r = mkdir_p(q, 0755);
1860 if (r < 0)
56f64d95 1861 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1862 return 0;
57fb9fb5
LP
1863 }
1864
4a62c710
MS
1865 if (unlink(p) < 0)
1866 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1867 } else if (r == -EINVAL) {
1868
1869 if (arg_link_journal == LINK_GUEST &&
1870 rmdir(p) < 0) {
1871
27407a01
ZJS
1872 if (errno == ENOTDIR) {
1873 log_error("%s already exists and is neither a symlink nor a directory", p);
1874 return r;
1875 } else {
56f64d95 1876 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1877 return -errno;
57fb9fb5 1878 }
57fb9fb5
LP
1879 }
1880 } else if (r != -ENOENT) {
56f64d95 1881 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1882 return r;
57fb9fb5
LP
1883 }
1884
1885 if (arg_link_journal == LINK_GUEST) {
1886
1887 if (symlink(q, p) < 0) {
574edc90 1888 if (arg_link_journal_try) {
56f64d95 1889 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1890 return 0;
1891 } else {
56f64d95 1892 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1893 return -errno;
1894 }
57fb9fb5
LP
1895 }
1896
27407a01
ZJS
1897 r = mkdir_p(q, 0755);
1898 if (r < 0)
56f64d95 1899 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1900 return 0;
57fb9fb5
LP
1901 }
1902
1903 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1904 /* don't create parents here -- if the host doesn't have
1905 * permanent journal set up, don't force it here */
1906 r = mkdir(p, 0755);
57fb9fb5 1907 if (r < 0) {
574edc90 1908 if (arg_link_journal_try) {
56f64d95 1909 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1910 return 0;
1911 } else {
56f64d95 1912 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1913 return r;
1914 }
57fb9fb5
LP
1915 }
1916
27407a01
ZJS
1917 } else if (access(p, F_OK) < 0)
1918 return 0;
57fb9fb5 1919
cdb2b9d0
LP
1920 if (dir_is_empty(q) == 0)
1921 log_warning("%s is not empty, proceeding anyway.", q);
1922
57fb9fb5
LP
1923 r = mkdir_p(q, 0755);
1924 if (r < 0) {
56f64d95 1925 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1926 return r;
57fb9fb5
LP
1927 }
1928
4a62c710
MS
1929 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1930 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1931
27407a01 1932 return 0;
57fb9fb5
LP
1933}
1934
88213476 1935static int drop_capabilities(void) {
5076f0cc 1936 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1937}
1938
5aa4bb6b 1939static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 1940 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
24996861 1941 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
1942 int r;
1943
eb91eb18
LP
1944 if (!arg_register)
1945 return 0;
1946
1c03020c 1947 r = sd_bus_default_system(&bus);
f647962d
MS
1948 if (r < 0)
1949 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 1950
89f7c846
LP
1951 if (arg_keep_unit) {
1952 r = sd_bus_call_method(
1953 bus,
1954 "org.freedesktop.machine1",
1955 "/org/freedesktop/machine1",
1956 "org.freedesktop.machine1.Manager",
5aa4bb6b 1957 "RegisterMachineWithNetwork",
89f7c846
LP
1958 &error,
1959 NULL,
5aa4bb6b 1960 "sayssusai",
89f7c846
LP
1961 arg_machine,
1962 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1963 "nspawn",
1964 "container",
1965 (uint32_t) pid,
5aa4bb6b
LP
1966 strempty(arg_directory),
1967 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 1968 } else {
9457ac5b 1969 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
f36933fe 1970 char **i;
9457ac5b
LP
1971
1972 r = sd_bus_message_new_method_call(
89f7c846 1973 bus,
9457ac5b 1974 &m,
89f7c846
LP
1975 "org.freedesktop.machine1",
1976 "/org/freedesktop/machine1",
1977 "org.freedesktop.machine1.Manager",
5aa4bb6b 1978 "CreateMachineWithNetwork");
f647962d 1979 if (r < 0)
f36933fe 1980 return bus_log_create_error(r);
9457ac5b
LP
1981
1982 r = sd_bus_message_append(
1983 m,
5aa4bb6b 1984 "sayssusai",
89f7c846
LP
1985 arg_machine,
1986 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1987 "nspawn",
1988 "container",
1989 (uint32_t) pid,
5aa4bb6b
LP
1990 strempty(arg_directory),
1991 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d 1992 if (r < 0)
f36933fe 1993 return bus_log_create_error(r);
9457ac5b
LP
1994
1995 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d 1996 if (r < 0)
f36933fe 1997 return bus_log_create_error(r);
9457ac5b
LP
1998
1999 if (!isempty(arg_slice)) {
2000 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d 2001 if (r < 0)
f36933fe 2002 return bus_log_create_error(r);
9457ac5b
LP
2003 }
2004
2005 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d 2006 if (r < 0)
f36933fe 2007 return bus_log_create_error(r);
9457ac5b 2008
63cc4c31 2009 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
2010 /* Allow the container to
2011 * access and create the API
2012 * device nodes, so that
2013 * PrivateDevices= in the
2014 * container can work
2015 * fine */
2016 "/dev/null", "rwm",
2017 "/dev/zero", "rwm",
2018 "/dev/full", "rwm",
2019 "/dev/random", "rwm",
2020 "/dev/urandom", "rwm",
2021 "/dev/tty", "rwm",
864e1706 2022 "/dev/net/tun", "rwm",
9457ac5b
LP
2023 /* Allow the container
2024 * access to ptys. However,
2025 * do not permit the
2026 * container to ever create
2027 * these device nodes. */
2028 "/dev/pts/ptmx", "rw",
63cc4c31 2029 "char-pts", "rw");
f647962d
MS
2030 if (r < 0)
2031 return log_error_errno(r, "Failed to add device whitelist: %m");
9457ac5b 2032
f36933fe
LP
2033 STRV_FOREACH(i, arg_property) {
2034 r = sd_bus_message_open_container(m, 'r', "sv");
2035 if (r < 0)
2036 return bus_log_create_error(r);
2037
2038 r = bus_append_unit_property_assignment(m, *i);
2039 if (r < 0)
2040 return r;
2041
2042 r = sd_bus_message_close_container(m);
2043 if (r < 0)
2044 return bus_log_create_error(r);
2045 }
2046
9457ac5b 2047 r = sd_bus_message_close_container(m);
f647962d 2048 if (r < 0)
f36933fe 2049 return bus_log_create_error(r);
9457ac5b
LP
2050
2051 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
2052 }
2053
9444b1f2 2054 if (r < 0) {
1f0cd86b
LP
2055 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2056 return r;
2057 }
2058
2059 return 0;
2060}
2061
2062static int terminate_machine(pid_t pid) {
2063 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2064 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
24996861 2065 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
2066 const char *path;
2067 int r;
2068
eb91eb18
LP
2069 if (!arg_register)
2070 return 0;
2071
76b54375 2072 r = sd_bus_default_system(&bus);
f647962d
MS
2073 if (r < 0)
2074 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
2075
2076 r = sd_bus_call_method(
2077 bus,
2078 "org.freedesktop.machine1",
2079 "/org/freedesktop/machine1",
2080 "org.freedesktop.machine1.Manager",
2081 "GetMachineByPID",
2082 &error,
2083 &reply,
2084 "u",
2085 (uint32_t) pid);
2086 if (r < 0) {
2087 /* Note that the machine might already have been
2088 * cleaned up automatically, hence don't consider it a
2089 * failure if we cannot get the machine object. */
2090 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2091 return 0;
2092 }
2093
2094 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
2095 if (r < 0)
2096 return bus_log_parse_error(r);
9444b1f2 2097
1f0cd86b
LP
2098 r = sd_bus_call_method(
2099 bus,
2100 "org.freedesktop.machine1",
2101 path,
2102 "org.freedesktop.machine1.Machine",
2103 "Terminate",
2104 &error,
2105 NULL,
2106 NULL);
2107 if (r < 0) {
2108 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2109 return 0;
2110 }
2111
9444b1f2
LP
2112 return 0;
2113}
2114
db999e0f
LP
2115static int reset_audit_loginuid(void) {
2116 _cleanup_free_ char *p = NULL;
2117 int r;
2118
2119 if (arg_share_system)
2120 return 0;
2121
2122 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2123 if (r == -ENOENT)
db999e0f 2124 return 0;
f647962d
MS
2125 if (r < 0)
2126 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2127
2128 /* Already reset? */
2129 if (streq(p, "4294967295"))
2130 return 0;
2131
2132 r = write_string_file("/proc/self/loginuid", "4294967295");
2133 if (r < 0) {
2134 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2135 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2136 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2137 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2138 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 2139
db999e0f 2140 sleep(5);
77b6e194 2141 }
db999e0f
LP
2142
2143 return 0;
77b6e194
LP
2144}
2145
4f758c23
LP
/* Fixed siphash keys handed to generate_mac() as its hash_key argument: one
 * for the host side of the veth pair, one for the container side, and one
 * for macvlan interfaces. */
#define HOST_HASH_KEY      SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY   SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 2149
a90e2305 2150static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
2151 uint8_t result[8];
2152 size_t l, sz;
a90e2305
LP
2153 uint8_t *v, *i;
2154 int r;
01dde061
TG
2155
2156 l = strlen(arg_machine);
2157 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
2158 if (idx > 0)
2159 sz += sizeof(idx);
a90e2305 2160
01dde061
TG
2161 v = alloca(sz);
2162
2163 /* fetch some persistent data unique to the host */
2164 r = sd_id128_get_machine((sd_id128_t*) v);
2165 if (r < 0)
2166 return r;
2167
2168 /* combine with some data unique (on this host) to this
2169 * container instance */
a90e2305
LP
2170 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2171 if (idx > 0) {
2172 idx = htole64(idx);
2173 memcpy(i, &idx, sizeof(idx));
2174 }
01dde061
TG
2175
2176 /* Let's hash the host machine ID plus the container name. We
2177 * use a fixed, but originally randomly created hash key here. */
4f758c23 2178 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
2179
2180 assert_cc(ETH_ALEN <= sizeof(result));
2181 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2182
2183 /* see eth_random_addr in the kernel */
2184 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2185 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2186
2187 return 0;
2188}
2189
5aa4bb6b 2190static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
69c79d3c 2191 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 2192 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4f758c23 2193 struct ether_addr mac_host, mac_container;
5aa4bb6b 2194 int r, i;
69c79d3c
LP
2195
2196 if (!arg_private_network)
2197 return 0;
2198
2199 if (!arg_network_veth)
2200 return 0;
2201
08af0da2
LP
2202 /* Use two different interface name prefixes depending whether
2203 * we are in bridge mode or not. */
c00524c9 2204 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 2205 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 2206
e867ceb6
LP
2207 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2208 if (r < 0)
2209 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 2210
e867ceb6
LP
2211 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2212 if (r < 0)
2213 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 2214
151b9b96 2215 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2216 if (r < 0)
2217 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 2218
151b9b96 2219 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2220 if (r < 0)
2221 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 2222
ab046dde 2223 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
2224 if (r < 0)
2225 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 2226
4f758c23 2227 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
2228 if (r < 0)
2229 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 2230
ee3a6a51 2231 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2232 if (r < 0)
2233 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2234
d8e538ec 2235 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
2236 if (r < 0)
2237 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2238
ee3a6a51 2239 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
2240 if (r < 0)
2241 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2242
ab046dde 2243 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
2244 if (r < 0)
2245 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 2246
4f758c23 2247 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
2248 if (r < 0)
2249 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 2250
ab046dde 2251 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2252 if (r < 0)
2253 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c
LP
2254
2255 r = sd_rtnl_message_close_container(m);
f647962d
MS
2256 if (r < 0)
2257 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2258
2259 r = sd_rtnl_message_close_container(m);
f647962d
MS
2260 if (r < 0)
2261 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2262
2263 r = sd_rtnl_message_close_container(m);
f647962d
MS
2264 if (r < 0)
2265 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2266
2267 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2268 if (r < 0)
2269 return log_error_errno(r, "Failed to add new veth interfaces: %m");
69c79d3c 2270
5aa4bb6b 2271 i = (int) if_nametoindex(iface_name);
4a62c710
MS
2272 if (i <= 0)
2273 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
2274
2275 *ifi = i;
2276
69c79d3c
LP
2277 return 0;
2278}
2279
5aa4bb6b 2280static int setup_bridge(const char veth_name[], int *ifi) {
ab046dde
TG
2281 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2282 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2283 int r, bridge;
2284
2285 if (!arg_private_network)
2286 return 0;
2287
2288 if (!arg_network_veth)
2289 return 0;
2290
2291 if (!arg_network_bridge)
2292 return 0;
2293
2294 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
2295 if (bridge <= 0)
2296 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 2297
5aa4bb6b
LP
2298 *ifi = bridge;
2299
151b9b96 2300 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2301 if (r < 0)
2302 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 2303
151b9b96 2304 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
2305 if (r < 0)
2306 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 2307
039dd4af 2308 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
2309 if (r < 0)
2310 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 2311
ab046dde 2312 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
2313 if (r < 0)
2314 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde
TG
2315
2316 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
2317 if (r < 0)
2318 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde
TG
2319
2320 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2321 if (r < 0)
2322 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
2323
2324 return 0;
2325}
2326
c74e630d
LP
2327static int parse_interface(struct udev *udev, const char *name) {
2328 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2329 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2330 int ifi;
2331
2332 ifi = (int) if_nametoindex(name);
4a62c710
MS
2333 if (ifi <= 0)
2334 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
2335
2336 sprintf(ifi_str, "n%i", ifi);
2337 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
2338 if (!d)
2339 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
2340
2341 if (udev_device_get_is_initialized(d) <= 0) {
2342 log_error("Network interface %s is not initialized yet.", name);
2343 return -EBUSY;
2344 }
2345
2346 return ifi;
2347}
2348
69c79d3c 2349static int move_network_interfaces(pid_t pid) {
7e227024 2350 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 2351 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
2352 char **i;
2353 int r;
2354
2355 if (!arg_private_network)
2356 return 0;
2357
2358 if (strv_isempty(arg_network_interfaces))
2359 return 0;
2360
151b9b96 2361 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2362 if (r < 0)
2363 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 2364
7e227024
LP
2365 udev = udev_new();
2366 if (!udev) {
2367 log_error("Failed to connect to udev.");
2368 return -ENOMEM;
2369 }
2370
aa28aefe 2371 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 2372 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 2373 int ifi;
aa28aefe 2374
c74e630d
LP
2375 ifi = parse_interface(udev, *i);
2376 if (ifi < 0)
2377 return ifi;
2378
3125b3ef 2379 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
2380 if (r < 0)
2381 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2382
c74e630d 2383 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2384 if (r < 0)
2385 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 2386
c74e630d 2387 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2388 if (r < 0)
2389 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 2390 }
7e227024 2391
c74e630d
LP
2392 return 0;
2393}
2394
2395static int setup_macvlan(pid_t pid) {
2396 _cleanup_udev_unref_ struct udev *udev = NULL;
2397 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
e867ceb6 2398 unsigned idx = 0;
c74e630d
LP
2399 char **i;
2400 int r;
2401
2402 if (!arg_private_network)
2403 return 0;
2404
2405 if (strv_isempty(arg_network_macvlan))
2406 return 0;
2407
2408 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2409 if (r < 0)
2410 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
2411
2412 udev = udev_new();
2413 if (!udev) {
2414 log_error("Failed to connect to udev.");
2415 return -ENOMEM;
2416 }
2417
2418 STRV_FOREACH(i, arg_network_macvlan) {
2419 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2420 _cleanup_free_ char *n = NULL;
e867ceb6 2421 struct ether_addr mac;
c74e630d
LP
2422 int ifi;
2423
2424 ifi = parse_interface(udev, *i);
2425 if (ifi < 0)
2426 return ifi;
2427
e867ceb6
LP
2428 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2429 if (r < 0)
2430 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2431
c74e630d 2432 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2433 if (r < 0)
2434 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2435
c74e630d 2436 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
2437 if (r < 0)
2438 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
2439
2440 n = strappend("mv-", *i);
2441 if (!n)
2442 return log_oom();
2443
2444 strshorten(n, IFNAMSIZ-1);
2445
2446 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
2447 if (r < 0)
2448 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 2449
e867ceb6
LP
2450 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2451 if (r < 0)
2452 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2453
aa28aefe 2454 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2455 if (r < 0)
2456 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d
LP
2457
2458 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2459 if (r < 0)
2460 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2461
d8e538ec 2462 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
2463 if (r < 0)
2464 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d
LP
2465
2466 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
2467 if (r < 0)
2468 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d
LP
2469
2470 r = sd_rtnl_message_close_container(m);
f647962d
MS
2471 if (r < 0)
2472 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d
LP
2473
2474 r = sd_rtnl_message_close_container(m);
f647962d
MS
2475 if (r < 0)
2476 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe
LP
2477
2478 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2479 if (r < 0)
2480 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
2481 }
2482
2483 return 0;
2484}
2485
4bbfe7ad
TG
2486static int setup_ipvlan(pid_t pid) {
2487 _cleanup_udev_unref_ struct udev *udev = NULL;
2488 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2489 char **i;
2490 int r;
2491
2492 if (!arg_private_network)
2493 return 0;
2494
2495 if (strv_isempty(arg_network_ipvlan))
2496 return 0;
2497
2498 r = sd_rtnl_open(&rtnl, 0);
2499 if (r < 0)
2500 return log_error_errno(r, "Failed to connect to netlink: %m");
2501
2502 udev = udev_new();
2503 if (!udev) {
2504 log_error("Failed to connect to udev.");
2505 return -ENOMEM;
2506 }
2507
2508 STRV_FOREACH(i, arg_network_ipvlan) {
2509 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2510 _cleanup_free_ char *n = NULL;
2511 int ifi;
2512
2513 ifi = parse_interface(udev, *i);
2514 if (ifi < 0)
2515 return ifi;
2516
2517 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2518 if (r < 0)
2519 return log_error_errno(r, "Failed to allocate netlink message: %m");
2520
2521 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2522 if (r < 0)
2523 return log_error_errno(r, "Failed to add netlink interface index: %m");
2524
2525 n = strappend("iv-", *i);
2526 if (!n)
2527 return log_oom();
2528
2529 strshorten(n, IFNAMSIZ-1);
2530
2531 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2532 if (r < 0)
2533 return log_error_errno(r, "Failed to add netlink interface name: %m");
2534
2535 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2536 if (r < 0)
2537 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2538
2539 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2540 if (r < 0)
2541 return log_error_errno(r, "Failed to open netlink container: %m");
2542
2543 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2544 if (r < 0)
2545 return log_error_errno(r, "Failed to open netlink container: %m");
2546
2547 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2548 if (r < 0)
2549 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2550
2551 r = sd_rtnl_message_close_container(m);
2552 if (r < 0)
2553 return log_error_errno(r, "Failed to close netlink container: %m");
2554
2555 r = sd_rtnl_message_close_container(m);
2556 if (r < 0)
2557 return log_error_errno(r, "Failed to close netlink container: %m");
2558
2559 r = sd_rtnl_call(rtnl, m, 0, NULL);
2560 if (r < 0)
2561 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2562 }
2563
2564 return 0;
2565}
2566
/* Build and load a seccomp filter with a default action of "allow" that
 *
 *   - makes each syscall in the table below fail with EPERM, unless the
 *     capability listed next to it is present in arg_retain, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Compiled to a no-op returning 0 when HAVE_SECCOMP is not defined.
 * Otherwise returns the result of the last libseccomp call made (negative on
 * failure), after releasing the filter context. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } gated_syscalls[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(gated_syscalls); k++) {
                /* Leave the syscall alone if its capability is retained. */
                if (arg_retain & (1ULL << gated_syscalls[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), gated_syscalls[k].syscall_num, 0);

                /* -EFAULT means "unknown syscall" and is skipped over;
                 * anything else negative aborts. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers: much of the userspace audit
         * hookup fails when run inside one. We don't care and simply
         * turn off creation of audit sockets.
         *
         * socket(AF_NETLINK, *, NETLINK_AUDIT) then fails with
         * EAFNOSUPPORT, which audit userspace takes as the sign that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2653
785890ac
LP
2654static int setup_propagate(const char *root) {
2655 const char *p, *q;
2656
2657 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2658 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2659 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2660 (void) mkdir_p(p, 0600);
2661
63c372cb 2662 q = strjoina(root, "/run/systemd/nspawn/incoming");
785890ac
LP
2663 mkdir_parents(q, 0755);
2664 mkdir_p(q, 0600);
2665
2666 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2667 return log_error_errno(errno, "Failed to install propagation bind mount.");
2668
2669 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2670 return log_error_errno(errno, "Failed to make propagation mount read-only");
2671
2672 return 0;
2673}
2674
1b9e5b12
LP
2675static int setup_image(char **device_path, int *loop_nr) {
2676 struct loop_info64 info = {
2677 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2678 };
2679 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2680 _cleanup_free_ char* loopdev = NULL;
2681 struct stat st;
2682 int r, nr;
2683
2684 assert(device_path);
2685 assert(loop_nr);
ec16945e 2686 assert(arg_image);
1b9e5b12
LP
2687
2688 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2689 if (fd < 0)
2690 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 2691
4a62c710
MS
2692 if (fstat(fd, &st) < 0)
2693 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
2694
2695 if (S_ISBLK(st.st_mode)) {
2696 char *p;
2697
2698 p = strdup(arg_image);
2699 if (!p)
2700 return log_oom();
2701
2702 *device_path = p;
2703
2704 *loop_nr = -1;
2705
2706 r = fd;
2707 fd = -1;
2708
2709 return r;
2710 }
2711
2712 if (!S_ISREG(st.st_mode)) {
56f64d95 2713 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
2714 return -EINVAL;
2715 }
2716
2717 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
2718 if (control < 0)
2719 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
2720
2721 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
2722 if (nr < 0)
2723 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
2724
2725 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2726 return log_oom();
2727
2728 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2729 if (loop < 0)
2730 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 2731
4a62c710
MS
2732 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2733 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
2734
2735 if (arg_read_only)
2736 info.lo_flags |= LO_FLAGS_READ_ONLY;
2737
4a62c710
MS
2738 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2739 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
2740
2741 *device_path = loopdev;
2742 loopdev = NULL;
2743
2744 *loop_nr = nr;
2745
2746 r = loop;
2747 loop = -1;
2748
2749 return r;
2750}
2751
ada4799a
LP
/* Explanatory text appended to the "no usable partition table" error
 * messages; it describes the image layouts that are accepted. */
#define PARTITION_TABLE_BLURB                                                                 \
        "Note that the disk image needs to either contain only a single MBR "               \
        "partition of\n"                                                                     \
        "type 0x83 that is marked bootable, or a single GPT partition of type "              \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"                                   \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"      \
        "to be bootable with systemd-nspawn."
2758
1b9e5b12
LP
2759static int dissect_image(
2760 int fd,
727fd4fd
LP
2761 char **root_device, bool *root_device_rw,
2762 char **home_device, bool *home_device_rw,
2763 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
2764 bool *secondary) {
2765
2766#ifdef HAVE_BLKID
01dc33ce
ZJS
2767 int home_nr = -1, srv_nr = -1;
2768#ifdef GPT_ROOT_NATIVE
2769 int root_nr = -1;
2770#endif
2771#ifdef GPT_ROOT_SECONDARY
2772 int secondary_root_nr = -1;
2773#endif
f6c51a81 2774 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1b9e5b12
LP
2775 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2776 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2777 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2778 _cleanup_udev_unref_ struct udev *udev = NULL;
2779 struct udev_list_entry *first, *item;
f6c51a81 2780 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 2781 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
2782 const char *pttype = NULL;
2783 blkid_partlist pl;
2784 struct stat st;
c09ef2e4 2785 unsigned i;
1b9e5b12
LP
2786 int r;
2787
2788 assert(fd >= 0);
2789 assert(root_device);
2790 assert(home_device);
2791 assert(srv_device);
2792 assert(secondary);
ec16945e 2793 assert(arg_image);
1b9e5b12
LP
2794
2795 b = blkid_new_probe();
2796 if (!b)
2797 return log_oom();
2798
2799 errno = 0;
2800 r = blkid_probe_set_device(b, fd, 0, 0);
2801 if (r != 0) {
2802 if (errno == 0)
2803 return log_oom();
2804
56f64d95 2805 log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
2806 return -errno;
2807 }
2808
2809 blkid_probe_enable_partitions(b, 1);
2810 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2811
2812 errno = 0;
2813 r = blkid_do_safeprobe(b);
2814 if (r == -2 || r == 1) {
ada4799a
LP
2815 log_error("Failed to identify any partition table on\n"
2816 " %s\n"
2817 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
2818 return -EINVAL;
2819 } else if (r != 0) {
2820 if (errno == 0)
2821 errno = EIO;
56f64d95 2822 log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
2823 return -errno;
2824 }
2825
2826 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
2827
2828 is_gpt = streq_ptr(pttype, "gpt");
2829 is_mbr = streq_ptr(pttype, "dos");
2830
2831 if (!is_gpt && !is_mbr) {
2832 log_error("No GPT or MBR partition table discovered on\n"
2833 " %s\n"
2834 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
2835 return -EINVAL;
2836 }
2837
2838 errno = 0;
2839 pl = blkid_probe_get_partitions(b);
2840 if (!pl) {
2841 if (errno == 0)
2842 return log_oom();
2843
2844 log_error("Failed to list partitions of %s", arg_image);
2845 return -errno;
2846 }
2847
2848 udev = udev_new();
2849 if (!udev)
2850 return log_oom();
2851
4a62c710
MS
2852 if (fstat(fd, &st) < 0)
2853 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 2854
c09ef2e4
LP
2855 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2856 if (!d)
1b9e5b12
LP
2857 return log_oom();
2858
c09ef2e4
LP
2859 for (i = 0;; i++) {
2860 int n, m;
1b9e5b12 2861
c09ef2e4
LP
2862 if (i >= 10) {
2863 log_error("Kernel partitions never appeared.");
2864 return -ENXIO;
2865 }
2866
2867 e = udev_enumerate_new(udev);
2868 if (!e)
2869 return log_oom();
2870
2871 r = udev_enumerate_add_match_parent(e, d);
2872 if (r < 0)
2873 return log_oom();
2874
2875 r = udev_enumerate_scan_devices(e);
2876 if (r < 0)
2877 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2878
2879 /* Count the partitions enumerated by the kernel */
2880 n = 0;
2881 first = udev_enumerate_get_list_entry(e);
2882 udev_list_entry_foreach(item, first)
2883 n++;
2884
2885 /* Count the partitions enumerated by blkid */
2886 m = blkid_partlist_numof_partitions(pl);
2887 if (n == m + 1)
2888 break;
2889 if (n > m + 1) {
2890 log_error("blkid and kernel partition list do not match.");
2891 return -EIO;
2892 }
2893 if (n < m + 1) {
2894 unsigned j;
2895
2896 /* The kernel has probed fewer partitions than
2897 * blkid? Maybe the kernel prober is still
2898 * running or it got EBUSY because udev
2899 * already opened the device. Let's reprobe
2900 * the device, which is a synchronous call
2901 * that waits until probing is complete. */
2902
2903 for (j = 0; j < 20; j++) {
2904
2905 r = ioctl(fd, BLKRRPART, 0);
2906 if (r < 0)
2907 r = -errno;
2908 if (r >= 0 || r != -EBUSY)
2909 break;
2910
2911 /* If something else has the device
2912 * open, such as an udev rule, the
2913 * ioctl will return EBUSY. Since
2914 * there's no way to wait until it
2915 * isn't busy anymore, let's just wait
2916 * a bit, and try again.
2917 *
2918 * This is really something they
2919 * should fix in the kernel! */
2920
2921 usleep(50 * USEC_PER_MSEC);
2922 }
2923
2924 if (r < 0)
2925 return log_error_errno(r, "Failed to reread partition table: %m");
2926 }
2927
2928 e = udev_enumerate_unref(e);
2929 }
1b9e5b12
LP
2930
2931 first = udev_enumerate_get_list_entry(e);
2932 udev_list_entry_foreach(item, first) {
2933 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 2934 const char *node;
727fd4fd 2935 unsigned long long flags;
1b9e5b12
LP
2936 blkid_partition pp;
2937 dev_t qn;
2938 int nr;
2939
2940 errno = 0;
2941 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2942 if (!q) {
2943 if (!errno)
2944 errno = ENOMEM;
2945
56f64d95 2946 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
2947 return -errno;
2948 }
2949
2950 qn = udev_device_get_devnum(q);
2951 if (major(qn) == 0)
2952 continue;
2953
2954 if (st.st_rdev == qn)
2955 continue;
2956
2957 node = udev_device_get_devnode(q);
2958 if (!node)
2959 continue;
2960
2961 pp = blkid_partlist_devno_to_partition(pl, qn);
2962 if (!pp)
2963 continue;
2964
727fd4fd 2965 flags = blkid_partition_get_flags(pp);
727fd4fd 2966
1b9e5b12
LP
2967 nr = blkid_partition_get_partno(pp);
2968 if (nr < 0)
2969 continue;
2970
ada4799a
LP
2971 if (is_gpt) {
2972 sd_id128_t type_id;
2973 const char *stype;
1b9e5b12 2974
f6c51a81
LP
2975 if (flags & GPT_FLAG_NO_AUTO)
2976 continue;
2977
ada4799a
LP
2978 stype = blkid_partition_get_type_string(pp);
2979 if (!stype)
2980 continue;
1b9e5b12 2981
ada4799a 2982 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
2983 continue;
2984
ada4799a 2985 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 2986
ada4799a
LP
2987 if (home && nr >= home_nr)
2988 continue;
1b9e5b12 2989
ada4799a
LP
2990 home_nr = nr;
2991 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 2992
ada4799a
LP
2993 r = free_and_strdup(&home, node);
2994 if (r < 0)
2995 return log_oom();
727fd4fd 2996
ada4799a
LP
2997 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2998
2999 if (srv && nr >= srv_nr)
3000 continue;
3001
3002 srv_nr = nr;
3003 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3004
3005 r = free_and_strdup(&srv, node);
3006 if (r < 0)
3007 return log_oom();
3008 }
1b9e5b12 3009#ifdef GPT_ROOT_NATIVE
ada4799a 3010 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 3011
ada4799a
LP
3012 if (root && nr >= root_nr)
3013 continue;
1b9e5b12 3014
ada4799a
LP
3015 root_nr = nr;
3016 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 3017
ada4799a
LP
3018 r = free_and_strdup(&root, node);
3019 if (r < 0)
3020 return log_oom();
3021 }
1b9e5b12
LP
3022#endif
3023#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
3024 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3025
3026 if (secondary_root && nr >= secondary_root_nr)
3027 continue;
3028
3029 secondary_root_nr = nr;
3030 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3031
3032 r = free_and_strdup(&secondary_root, node);
3033 if (r < 0)
3034 return log_oom();
3035 }
3036#endif
f6c51a81
LP
3037 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3038
3039 if (generic)
3040 multiple_generic = true;
3041 else {
3042 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3043
3044 r = free_and_strdup(&generic, node);
3045 if (r < 0)
3046 return log_oom();
3047 }
3048 }
ada4799a
LP
3049
3050 } else if (is_mbr) {
3051 int type;
1b9e5b12 3052
f6c51a81
LP
3053 if (flags != 0x80) /* Bootable flag */
3054 continue;
3055
ada4799a
LP
3056 type = blkid_partition_get_type(pp);
3057 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
3058 continue;
3059
f6c51a81
LP
3060 if (generic)
3061 multiple_generic = true;
3062 else {
3063 generic_rw = true;
727fd4fd 3064
f6c51a81
LP
3065 r = free_and_strdup(&root, node);
3066 if (r < 0)
3067 return log_oom();
3068 }
1b9e5b12 3069 }
1b9e5b12
LP
3070 }
3071
1b9e5b12
LP
3072 if (root) {
3073 *root_device = root;
3074 root = NULL;
727fd4fd
LP
3075
3076 *root_device_rw = root_rw;
1b9e5b12
LP
3077 *secondary = false;
3078 } else if (secondary_root) {
3079 *root_device = secondary_root;
3080 secondary_root = NULL;
727fd4fd
LP
3081
3082 *root_device_rw = secondary_root_rw;
1b9e5b12 3083 *secondary = true;
f6c51a81
LP
3084 } else if (generic) {
3085
3086 /* There were no partitions with precise meanings
3087 * around, but we found generic partitions. In this
3088 * case, if there's only one, we can go ahead and boot
3089 * it, otherwise we bail out, because we really cannot
3090 * make any sense of it. */
3091
3092 if (multiple_generic) {
3093 log_error("Identified multiple bootable Linux partitions on\n"
3094 " %s\n"
3095 PARTITION_TABLE_BLURB, arg_image);
3096 return -EINVAL;
3097 }
3098
3099 *root_device = generic;
3100 generic = NULL;
3101
3102 *root_device_rw = generic_rw;
3103 *secondary = false;
3104 } else {
3105 log_error("Failed to identify root partition in disk image\n"
3106 " %s\n"
3107 PARTITION_TABLE_BLURB, arg_image);
3108 return -EINVAL;
1b9e5b12
LP
3109 }
3110
3111 if (home) {
3112 *home_device = home;
3113 home = NULL;
727fd4fd
LP
3114
3115 *home_device_rw = home_rw;
1b9e5b12
LP
3116 }
3117
3118 if (srv) {
3119 *srv_device = srv;
3120 srv = NULL;
727fd4fd
LP
3121
3122 *srv_device_rw = srv_rw;
1b9e5b12
LP
3123 }
3124
3125 return 0;
3126#else
3127 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 3128 return -EOPNOTSUPP;
1b9e5b12
LP
3129#endif
3130}
3131
/*
 * Probe the file system type of the block device "what" with libblkid
 * and mount it on "where" (with "directory" appended when non-NULL).
 *
 * The mount is read-only when "rw" is false or when arg_read_only is
 * set. LUKS containers are refused.
 *
 * Returns 0 on success and a negative errno-style value on failure.
 * When built without HAVE_BLKID this always fails with -EOPNOTSUPP.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally read-only container overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                /* Use the logger's return value, logging may clobber errno */
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1 means nothing was detected, -2 means the result
                 * was ambivalent; neither is an I/O level failure. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Anything else (notably -1) is a real probing error */
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Pick the return value before logging touches errno */
                r = errno != 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3195
727fd4fd
LP
/*
 * Mount the partitions found in a disk image below "where": the root
 * device on "where" itself, the home device on "where"/home and the
 * server data device on "where"/srv. Devices passed as NULL are
 * skipped; each "*_rw" flag selects a writable mount for its device.
 *
 * Returns 0 on success or the negative error of the first mount that
 * failed.
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the "where" argument rather than reaching
         * for the global arg_directory, so the parameter is honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
3225
3226static void loop_remove(int nr, int *image_fd) {
3227 _cleanup_close_ int control = -1;
e8c8ddcc 3228 int r;
1b9e5b12
LP
3229
3230 if (nr < 0)
3231 return;
3232
3233 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
3234 r = ioctl(*image_fd, LOOP_CLR_FD);
3235 if (r < 0)
5e4074aa 3236 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 3237 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
3238 }
3239
3240 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 3241 if (control < 0) {
56f64d95 3242 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 3243 return;
e8c8ddcc 3244 }
1b9e5b12 3245
e8c8ddcc
TG
3246 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3247 if (r < 0)
5e4074aa 3248 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
3249}
3250
0cb9fbcd
LP
3251static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3252 int pipe_fds[2];
3253 pid_t pid;
3254
3255 assert(database);
3256 assert(key);
3257 assert(rpid);
3258
4a62c710
MS
3259 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3260 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
3261
3262 pid = fork();
4a62c710
MS
3263 if (pid < 0)
3264 return log_error_errno(errno, "Failed to fork getent child: %m");
3265 else if (pid == 0) {
0cb9fbcd
LP
3266 int nullfd;
3267 char *empty_env = NULL;
3268
3269 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3270 _exit(EXIT_FAILURE);
3271
3272 if (pipe_fds[0] > 2)
03e334a1 3273 safe_close(pipe_fds[0]);
0cb9fbcd 3274 if (pipe_fds[1] > 2)
03e334a1 3275 safe_close(pipe_fds[1]);
0cb9fbcd
LP
3276
3277 nullfd = open("/dev/null", O_RDWR);
3278 if (nullfd < 0)
3279 _exit(EXIT_FAILURE);
3280
3281 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3282 _exit(EXIT_FAILURE);
3283
3284 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3285 _exit(EXIT_FAILURE);
3286
3287 if (nullfd > 2)
03e334a1 3288 safe_close(nullfd);
0cb9fbcd
LP
3289
3290 reset_all_signal_handlers();
3291 close_all_fds(NULL, 0);
3292
4de82926
MM
3293 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3294 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
3295 _exit(EXIT_FAILURE);
3296 }
3297
03e334a1 3298 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
3299
3300 *rpid = pid;
3301
3302 return pipe_fds[0];
3303}
3304
3305static int change_uid_gid(char **_home) {
a2a5291b
ZJS
3306 char line[LINE_MAX], *x, *u, *g, *h;
3307 const char *word, *state;
0cb9fbcd
LP
3308 _cleanup_free_ uid_t *uids = NULL;
3309 _cleanup_free_ char *home = NULL;
3310 _cleanup_fclose_ FILE *f = NULL;
3311 _cleanup_close_ int fd = -1;
3312 unsigned n_uids = 0;
70f539ca 3313 size_t sz = 0, l;
0cb9fbcd
LP
3314 uid_t uid;
3315 gid_t gid;
3316 pid_t pid;
3317 int r;
3318
3319 assert(_home);
3320
3321 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3322 /* Reset everything fully to 0, just in case */
3323
4a62c710
MS
3324 if (setgroups(0, NULL) < 0)
3325 return log_error_errno(errno, "setgroups() failed: %m");
0cb9fbcd 3326
4a62c710
MS
3327 if (setresgid(0, 0, 0) < 0)
3328 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3329
4a62c710
MS
3330 if (setresuid(0, 0, 0) < 0)
3331 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3332
3333 *_home = NULL;
3334 return 0;
3335 }
3336
3337 /* First, get user credentials */
3338 fd = spawn_getent("passwd", arg_user, &pid);
3339 if (fd < 0)
3340 return fd;
3341
3342 f = fdopen(fd, "r");
3343 if (!f)
3344 return log_oom();
3345 fd = -1;
3346
3347 if (!fgets(line, sizeof(line), f)) {
3348
3349 if (!ferror(f)) {
3350 log_error("Failed to resolve user %s.", arg_user);
3351 return -ESRCH;
3352 }
3353
56f64d95 3354 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3355 return -errno;
3356 }
3357
3358 truncate_nl(line);
3359
820d3acf 3360 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
3361
3362 x = strchr(line, ':');
3363 if (!x) {
3364 log_error("/etc/passwd entry has invalid user field.");
3365 return -EIO;
3366 }
3367
3368 u = strchr(x+1, ':');
3369 if (!u) {
3370 log_error("/etc/passwd entry has invalid password field.");
3371 return -EIO;
3372 }
3373
3374 u++;
3375 g = strchr(u, ':');
3376 if (!g) {
3377 log_error("/etc/passwd entry has invalid UID field.");
3378 return -EIO;
3379 }
3380
3381 *g = 0;
3382 g++;
3383 x = strchr(g, ':');
3384 if (!x) {
3385 log_error("/etc/passwd entry has invalid GID field.");
3386 return -EIO;
3387 }
3388
3389 *x = 0;
3390 h = strchr(x+1, ':');
3391 if (!h) {
3392 log_error("/etc/passwd entry has invalid GECOS field.");
3393 return -EIO;
3394 }
3395
3396 h++;
3397 x = strchr(h, ':');
3398 if (!x) {
3399 log_error("/etc/passwd entry has invalid home directory field.");
3400 return -EIO;
3401 }
3402
3403 *x = 0;
3404
3405 r = parse_uid(u, &uid);
3406 if (r < 0) {
3407 log_error("Failed to parse UID of user.");
3408 return -EIO;
3409 }
3410
3411 r = parse_gid(g, &gid);
3412 if (r < 0) {
3413 log_error("Failed to parse GID of user.");
3414 return -EIO;
3415 }
3416
3417 home = strdup(h);
3418 if (!home)
3419 return log_oom();
3420
3421 /* Second, get group memberships */
3422 fd = spawn_getent("initgroups", arg_user, &pid);
3423 if (fd < 0)
3424 return fd;
3425
3426 fclose(f);
3427 f = fdopen(fd, "r");
3428 if (!f)
3429 return log_oom();
3430 fd = -1;
3431
3432 if (!fgets(line, sizeof(line), f)) {
3433 if (!ferror(f)) {
3434 log_error("Failed to resolve user %s.", arg_user);
3435 return -ESRCH;
3436 }
3437
56f64d95 3438 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3439 return -errno;
3440 }
3441
3442 truncate_nl(line);
3443
820d3acf 3444 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
3445
3446 /* Skip over the username and subsequent separator whitespace */
3447 x = line;
3448 x += strcspn(x, WHITESPACE);
3449 x += strspn(x, WHITESPACE);
3450
a2a5291b 3451 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
3452 char c[l+1];
3453
a2a5291b 3454 memcpy(c, word, l);
0cb9fbcd
LP
3455 c[l] = 0;
3456
3457 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3458 return log_oom();
3459
3460 r = parse_uid(c, &uids[n_uids++]);
3461 if (r < 0) {
3462 log_error("Failed to parse group data from getent.");
3463 return -EIO;
3464 }
3465 }
3466
3467 r = mkdir_parents(home, 0775);
f647962d
MS
3468 if (r < 0)
3469 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
3470
3471 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
3472 if (r < 0 && r != -EEXIST)
3473 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd
LP
3474
3475 fchown(STDIN_FILENO, uid, gid);
3476 fchown(STDOUT_FILENO, uid, gid);
3477 fchown(STDERR_FILENO, uid, gid);
3478
4a62c710
MS
3479 if (setgroups(n_uids, uids) < 0)
3480 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 3481
4a62c710
MS
3482 if (setresgid(gid, gid, gid) < 0)
3483 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3484
4a62c710
MS
3485 if (setresuid(uid, uid, uid) < 0)
3486 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3487
3488 if (_home) {
3489 *_home = home;
3490 home = NULL;
3491 }
3492
3493 return 0;
3494}
3495
113cea80 3496/*
6d416b9c
LS
3497 * Return values:
3498 * < 0 : wait_for_terminate() failed to get the state of the
3499 * container, the container was terminated by a signal, or
3500 * failed for an unknown reason. No change is made to the
3501 * container argument.
3502 * > 0 : The program executed in the container terminated with an
3503 * error. The exit code of the program executed in the
919699ec
LP
3504 * container is returned. The container argument has been set
3505 * to CONTAINER_TERMINATED.
6d416b9c
LS
3506 * 0 : The container is being rebooted, has been shut down or exited
3507 * successfully. The container argument has been set to either
3508 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 3509 *
6d416b9c
LS
3510 * That is, success is indicated by a return value of zero, and an
3511 * error is indicated by a non-zero value.
113cea80
DH
3512 */
3513static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 3514 siginfo_t status;
919699ec 3515 int r;
113cea80
DH
3516
3517 r = wait_for_terminate(pid, &status);
f647962d
MS
3518 if (r < 0)
3519 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
3520
3521 switch (status.si_code) {
fddbb89c 3522
113cea80 3523 case CLD_EXITED:
919699ec
LP
3524 if (status.si_status == 0) {
3525 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 3526
fddbb89c 3527 } else
919699ec 3528 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 3529
919699ec
LP
3530 *container = CONTAINER_TERMINATED;
3531 return status.si_status;
113cea80
DH
3532
3533 case CLD_KILLED:
3534 if (status.si_status == SIGINT) {
113cea80 3535
919699ec 3536 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 3537 *container = CONTAINER_TERMINATED;
919699ec
LP
3538 return 0;
3539
113cea80 3540 } else if (status.si_status == SIGHUP) {
113cea80 3541
919699ec 3542 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 3543 *container = CONTAINER_REBOOTED;
919699ec 3544 return 0;
113cea80 3545 }
919699ec 3546
113cea80
DH
3547 /* CLD_KILLED fallthrough */
3548
3549 case CLD_DUMPED:
fddbb89c 3550 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 3551 return -EIO;
113cea80
DH
3552
3553 default:
fddbb89c 3554 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 3555 return -EIO;
113cea80
DH
3556 }
3557
3558 return r;
3559}
3560
e866af3a
DH
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
}
3562
023fb90b
LP
3563static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3564 pid_t pid;
3565
3566 pid = PTR_TO_UINT32(userdata);
3567 if (pid > 0) {
c6c8f6e2 3568 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
3569 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3570 sd_event_source_set_userdata(s, NULL);
3571 return 0;
3572 }
3573 }
3574
3575 sd_event_exit(sd_event_source_get_event(s), 0);
3576 return 0;
3577}
3578
ec16945e 3579static int determine_names(void) {
1b9cebf6 3580 int r;
ec16945e
LP
3581
3582 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3583 if (arg_machine) {
3584 _cleanup_(image_unrefp) Image *i = NULL;
3585
3586 r = image_find(arg_machine, &i);
3587 if (r < 0)
3588 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3589 else if (r == 0) {
3590 log_error("No image for machine '%s': %m", arg_machine);
3591 return -ENOENT;
3592 }
3593
aceac2f0 3594 if (i->type == IMAGE_RAW)
1b9cebf6
LP
3595 r = set_sanitized_path(&arg_image, i->path);
3596 else
3597 r = set_sanitized_path(&arg_directory, i->path);
3598 if (r < 0)
3599 return log_error_errno(r, "Invalid image directory: %m");
3600
3601 arg_read_only = arg_read_only || i->read_only;
3602 } else
ec16945e
LP
3603 arg_directory = get_current_dir_name();
3604
1b9cebf6
LP
3605 if (!arg_directory && !arg_machine) {
3606 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
3607 return -EINVAL;
3608 }
3609 }
3610
3611 if (!arg_machine) {
b9ba4dab
LP
3612 if (arg_directory && path_equal(arg_directory, "/"))
3613 arg_machine = gethostname_malloc();
3614 else
3615 arg_machine = strdup(basename(arg_image ?: arg_directory));
3616
ec16945e
LP
3617 if (!arg_machine)
3618 return log_oom();
3619
3620 hostname_cleanup(arg_machine, false);
3621 if (!machine_name_is_valid(arg_machine)) {
3622 log_error("Failed to determine machine name automatically, please use -M.");
3623 return -EINVAL;
3624 }
b9ba4dab
LP
3625
3626 if (arg_ephemeral) {
3627 char *b;
3628
3629 /* Add a random suffix when this is an
3630 * ephemeral machine, so that we can run many
3631 * instances at once without manually having
3632 * to specify -M each time. */
3633
3634 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3635 return log_oom();
3636
3637 free(arg_machine);
3638 arg_machine = b;
3639 }
ec16945e
LP
3640 }
3641
3642 return 0;
3643}
3644
6dac160c
LP
3645static int determine_uid_shift(void) {
3646 int r;
3647
3648 if (!arg_userns)
3649 return 0;
3650
3651 if (arg_uid_shift == UID_INVALID) {
3652 struct stat st;
3653
3654 r = stat(arg_directory, &st);
3655 if (r < 0)
3656 return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
3657
3658 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3659
3660 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3661 log_error("UID and GID base of %s don't match.", arg_directory);
3662 return -EINVAL;
3663 }
3664
3665 arg_uid_range = UINT32_C(0x10000);
3666 }
3667
3668 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3669 log_error("UID base too high for UID range.");
3670 return -EINVAL;
3671 }
3672
3673 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3674 return 0;
3675}
3676
88213476 3677int main(int argc, char *argv[]) {
69c79d3c 3678
611b312b 3679 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
727fd4fd 3680 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
63cc4c31 3681 _cleanup_close_ int master = -1, image_fd = -1;
69c79d3c 3682 _cleanup_fdset_free_ FDSet *fds = NULL;
ec16945e 3683 int r, n_fd_passed, loop_nr = -1;
1b9e5b12 3684 char veth_name[IFNAMSIZ];
ec16945e 3685 bool secondary = false, remove_subvol = false;
e866af3a 3686 sigset_t mask, mask_chld;
69c79d3c 3687 pid_t pid = 0;
ec16945e 3688 int ret = EXIT_SUCCESS;
6d0b55c2 3689 union in_addr_union exposed = {};
30535c16 3690 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
9c857b9d 3691 bool interactive;
88213476
LP
3692
3693 log_parse_environment();
3694 log_open();
3695
ec16945e
LP
3696 r = parse_argv(argc, argv);
3697 if (r <= 0)
88213476 3698 goto finish;
88213476 3699
ec16945e
LP
3700 r = determine_names();
3701 if (r < 0)
3702 goto finish;
7027ff61 3703
88213476
LP
3704 if (geteuid() != 0) {
3705 log_error("Need to be root.");
ec16945e 3706 r = -EPERM;
88213476
LP
3707 goto finish;
3708 }
3709
04d391da
LP
3710 if (sd_booted() <= 0) {
3711 log_error("Not running on a systemd system.");
ec16945e 3712 r = -EINVAL;
04d391da
LP
3713 goto finish;
3714 }
3715
1b9e5b12
LP
3716 log_close();
3717 n_fd_passed = sd_listen_fds(false);
3718 if (n_fd_passed > 0) {
ec16945e
LP
3719 r = fdset_new_listen_fds(&fds, false);
3720 if (r < 0) {
3721 log_error_errno(r, "Failed to collect file descriptors: %m");
1b9e5b12
LP
3722 goto finish;
3723 }
88213476 3724 }
1b9e5b12
LP
3725 fdset_close_others(fds);
3726 log_open();
88213476 3727
1b9e5b12 3728 if (arg_directory) {
ec16945e
LP
3729 assert(!arg_image);
3730
c4e34a61
LP
3731 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3732 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
ec16945e 3733 r = -EINVAL;
6b9132a9
LP
3734 goto finish;
3735 }
1b9e5b12 3736
30535c16 3737 if (arg_ephemeral) {
8a16a7b4 3738 _cleanup_free_ char *np = NULL;
ec16945e 3739
c4e34a61
LP
3740 /* If the specified path is a mount point we
3741 * generate the new snapshot immediately
3742 * inside it under a random name. However if
3743 * the specified is not a mount point we
3744 * create the new snapshot in the parent
3745 * directory, just next to it. */
3746 r = path_is_mount_point(arg_directory, false);
3747 if (r < 0) {
3748 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3749 goto finish;
3750 }
3751 if (r > 0)
3752 r = tempfn_random_child(arg_directory, &np);
3753 else
3754 r = tempfn_random(arg_directory, &np);
ec16945e
LP
3755 if (r < 0) {
3756 log_error_errno(r, "Failed to generate name for snapshot: %m");
3757 goto finish;
3758 }
3759
30535c16
LP
3760 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3761 if (r < 0) {
3762 log_error_errno(r, "Failed to lock %s: %m", np);
3763 goto finish;
3764 }
3765
ec16945e
LP
3766 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3767 if (r < 0) {
ec16945e
LP
3768 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3769 goto finish;
3770 }
3771
3772 free(arg_directory);
3773 arg_directory = np;
8a16a7b4 3774 np = NULL;
ec16945e
LP
3775
3776 remove_subvol = true;
30535c16
LP
3777
3778 } else {
3779 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3780 if (r == -EBUSY) {
3781 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3782 goto finish;
3783 }
3784 if (r < 0) {
3785 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3786 return r;
3787 }
3788
3789 if (arg_template) {
3790 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3791 if (r == -EEXIST) {
3792 if (!arg_quiet)
3793 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3794 } else if (r < 0) {
83521414 3795 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3796 goto finish;
3797 } else {
3798 if (!arg_quiet)
3799 log_info("Populated %s from template %s.", arg_directory, arg_template);
3800 }
3801 }
ec16945e
LP
3802 }
3803
1b9e5b12
LP
3804 if (arg_boot) {
3805 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3806 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3807 r = -EINVAL;
1b9e5b12
LP
3808 goto finish;
3809 }
3810 } else {
3811 const char *p;
3812
63c372cb 3813 p = strjoina(arg_directory,
1b9e5b12
LP
3814 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3815 if (access(p, F_OK) < 0) {
3816 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
ec16945e 3817 r = -EINVAL;
1b9e5b12 3818 goto finish;
1b9e5b12
LP
3819 }
3820 }
ec16945e 3821
6b9132a9 3822 } else {
1b9e5b12 3823 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3824
ec16945e
LP
3825 assert(arg_image);
3826 assert(!arg_template);
3827
30535c16
LP
3828 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3829 if (r == -EBUSY) {
3830 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3831 goto finish;
3832 }
3833 if (r < 0) {
3834 r = log_error_errno(r, "Failed to create image lock: %m");
3835 goto finish;
3836 }
3837
1b9e5b12 3838 if (!mkdtemp(template)) {
56f64d95 3839 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3840 r = -errno;
6b9132a9 3841 goto finish;
1b9e5b12 3842 }
6b9132a9 3843
1b9e5b12
LP
3844 arg_directory = strdup(template);
3845 if (!arg_directory) {
3846 r = log_oom();
3847 goto finish;
6b9132a9 3848 }
88213476 3849
1b9e5b12
LP
3850 image_fd = setup_image(&device_path, &loop_nr);
3851 if (image_fd < 0) {
3852 r = image_fd;
842f3b0f
LP
3853 goto finish;
3854 }
1b9e5b12 3855
4d9f07b4
LP
3856 r = dissect_image(image_fd,
3857 &root_device, &root_device_rw,
3858 &home_device, &home_device_rw,
3859 &srv_device, &srv_device_rw,
3860 &secondary);
1b9e5b12
LP
3861 if (r < 0)
3862 goto finish;
842f3b0f 3863 }
842f3b0f 3864
6dac160c
LP
3865 r = determine_uid_shift();
3866 if (r < 0)
3867 goto finish;
3868
9c857b9d
LP
3869 interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
3870
db7feb7e
LP
3871 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3872 if (master < 0) {
ec16945e 3873 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3874 goto finish;
3875 }
3876
611b312b
LP
3877 r = ptsname_malloc(master, &console);
3878 if (r < 0) {
3879 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3880 goto finish;
3881 }
3882
a258bf26 3883 if (unlockpt(master) < 0) {
ec16945e 3884 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3885 goto finish;
3886 }
3887
9c857b9d
LP
3888 if (!arg_quiet)
3889 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3890 arg_machine, arg_image ?: arg_directory);
3891
a258bf26
LP
3892 assert_se(sigemptyset(&mask) == 0);
3893 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3894 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3895
023fb90b
LP
3896 assert_se(sigemptyset(&mask_chld) == 0);
3897 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3898
d87be9b0 3899 for (;;) {
6d0b55c2 3900 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
113cea80 3901 ContainerStatus container_status;
7566e267 3902 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
e866af3a
DH
3903 struct sigaction sa = {
3904 .sa_handler = nop_handler,
3905 .sa_flags = SA_NOCLDSTOP,
3906 };
3907
7566e267 3908 r = barrier_create(&barrier);
a2da110b 3909 if (r < 0) {
da927ba9 3910 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3911 goto finish;
3912 }
3913
6d0b55c2
LP
3914 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3915 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3916 goto finish;
3917 }
3918
3919 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3920 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3921 goto finish;
3922 }
3923
e866af3a
DH
3924 /* Child can be killed before execv(), so handle SIGCHLD
3925 * in order to interrupt parent's blocking calls and
3926 * give it a chance to call wait() and terminate. */
3927 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3928 if (r < 0) {
ec16945e 3929 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3930 goto finish;
3931 }
3932
e866af3a
DH
3933 r = sigaction(SIGCHLD, &sa, NULL);
3934 if (r < 0) {
ec16945e 3935 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3936 goto finish;
3937 }
3938
60e1651a
KW
3939 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3940 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3941 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
3942 if (pid < 0) {
3943 if (errno == EINVAL)
ec16945e 3944 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3945 else
ec16945e 3946 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3947
d87be9b0
LP
3948 goto finish;
3949 }
a258bf26 3950
d87be9b0
LP
3951 if (pid == 0) {
3952 /* child */
0cb9fbcd 3953 _cleanup_free_ char *home = NULL;
5674767e 3954 unsigned n_env = 2;
d87be9b0 3955 const char *envp[] = {
e10a55fd 3956 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
3957 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3958 NULL, /* TERM */
3959 NULL, /* HOME */
3960 NULL, /* USER */
3961 NULL, /* LOGNAME */
3962 NULL, /* container_uuid */
842f3b0f
LP
3963 NULL, /* LISTEN_FDS */
3964 NULL, /* LISTEN_PID */
d87be9b0
LP
3965 NULL
3966 };
f4889f65 3967 char **env_use;
a258bf26 3968
a2da110b
DH
3969 barrier_set_role(&barrier, BARRIER_CHILD);
3970
5674767e
ZJS
3971 envp[n_env] = strv_find_prefix(environ, "TERM=");
3972 if (envp[n_env])
3973 n_env ++;
a258bf26 3974
03e334a1 3975 master = safe_close(master);
a258bf26 3976
03e334a1 3977 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3978 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
a258bf26 3979
d87be9b0 3980 reset_all_signal_handlers();
1b6d7fa7 3981 reset_signal_mask();
f5c1b9ee 3982
9c857b9d
LP
3983 if (interactive) {
3984 close_nointr(STDIN_FILENO);
3985 close_nointr(STDOUT_FILENO);
3986 close_nointr(STDERR_FILENO);
842f3b0f 3987
9c857b9d
LP
3988 r = open_terminal(console, O_RDWR);
3989 if (r != STDIN_FILENO) {
3990 if (r >= 0) {
3991 safe_close(r);
3992 r = -EINVAL;
3993 }
842f3b0f 3994
9c857b9d
LP
3995 log_error_errno(r, "Failed to open console: %m");
3996 _exit(EXIT_FAILURE);
3997 }
3998
3999 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4000 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
4001 log_error_errno(errno, "Failed to duplicate console: %m");
4002 _exit(EXIT_FAILURE);
4003 }
842f3b0f 4004 }
bc2f673e 4005
d87be9b0 4006 if (setsid() < 0) {
56f64d95 4007 log_error_errno(errno, "setsid() failed: %m");
a2da110b 4008 _exit(EXIT_FAILURE);
bc2f673e
LP
4009 }
4010
db999e0f 4011 if (reset_audit_loginuid() < 0)
a2da110b 4012 _exit(EXIT_FAILURE);
db999e0f 4013
d87be9b0 4014 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
56f64d95 4015 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
a2da110b 4016 _exit(EXIT_FAILURE);
d87be9b0 4017 }
e58a1277 4018
6dac160c
LP
4019 if (arg_private_network)
4020 loopback_setup();
4021
d87be9b0
LP
4022 /* Mark everything as slave, so that we still
4023 * receive mounts from the real root, but don't
4024 * propagate mounts to the real root. */
4025 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
56f64d95 4026 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
a2da110b 4027 _exit(EXIT_FAILURE);
d87be9b0 4028 }
04bc4a3f 4029
727fd4fd
LP
4030 if (mount_devices(arg_directory,
4031 root_device, root_device_rw,
4032 home_device, home_device_rw,
4033 srv_device, srv_device_rw) < 0)
a2da110b 4034 _exit(EXIT_FAILURE);
1b9e5b12 4035
d87be9b0
LP
4036 /* Turn directory into bind mount */
4037 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
56f64d95 4038 log_error_errno(errno, "Failed to make bind mount: %m");
a2da110b 4039 _exit(EXIT_FAILURE);
d87be9b0 4040 }
88213476 4041
4d9f07b4
LP
4042 r = setup_volatile(arg_directory);
4043 if (r < 0)
a2da110b 4044 _exit(EXIT_FAILURE);
4d9f07b4
LP
4045
4046 if (setup_volatile_state(arg_directory) < 0)
a2da110b 4047 _exit(EXIT_FAILURE);
4d9f07b4
LP
4048
4049 r = base_filesystem_create(arg_directory);
4050 if (r < 0)
a2da110b 4051 _exit(EXIT_FAILURE);
4d9f07b4 4052
d6797c92 4053 if (arg_read_only) {
ec16945e
LP
4054 r = bind_remount_recursive(arg_directory, true);
4055 if (r < 0) {
4056 log_error_errno(r, "Failed to make tree read-only: %m");
a2da110b 4057 _exit(EXIT_FAILURE);
d87be9b0 4058 }
d6797c92 4059 }
2547bb41 4060
d87be9b0 4061 if (mount_all(arg_directory) < 0)
a2da110b 4062 _exit(EXIT_FAILURE);
57fb9fb5 4063
d87be9b0 4064 if (copy_devnodes(arg_directory) < 0)
a2da110b 4065 _exit(EXIT_FAILURE);
a258bf26 4066
f2d88580 4067 if (setup_ptmx(arg_directory) < 0)
a2da110b 4068 _exit(EXIT_FAILURE);
f2d88580 4069
d87be9b0 4070 dev_setup(arg_directory);
88213476 4071
785890ac
LP
4072 if (setup_propagate(arg_directory) < 0)
4073 _exit(EXIT_FAILURE);
4074
28650077 4075 if (setup_seccomp() < 0)
a2da110b 4076 _exit(EXIT_FAILURE);
24fb1112 4077
d87be9b0 4078 if (setup_dev_console(arg_directory, console) < 0)
a2da110b 4079 _exit(EXIT_FAILURE);
88213476 4080
d87be9b0 4081 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
a2da110b 4082 _exit(EXIT_FAILURE);
03e334a1 4083 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 4084
6d0b55c2
LP
4085 if (send_rtnl(rtnl_socket_pair[1]) < 0)
4086 _exit(EXIT_FAILURE);
4087 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4088
b12afc8c
LP
4089 /* Tell the parent that we are ready, and that
4090 * it can cgroupify us so that we lack access
4091 * to certain devices and resources. */
6dac160c 4092 (void) barrier_place(&barrier); /* #1 */
b12afc8c 4093
d87be9b0 4094 if (setup_boot_id(arg_directory) < 0)
a2da110b 4095 _exit(EXIT_FAILURE);
a41fe3a2 4096
d87be9b0 4097 if (setup_timezone(arg_directory) < 0)
a2da110b 4098 _exit(EXIT_FAILURE);
88213476 4099
d87be9b0 4100 if (setup_resolv_conf(arg_directory) < 0)
a2da110b 4101 _exit(EXIT_FAILURE);
687d0825 4102
d87be9b0 4103 if (setup_journal(arg_directory) < 0)
a2da110b 4104 _exit(EXIT_FAILURE);
687d0825 4105
d6797c92 4106 if (mount_binds(arg_directory, arg_bind, false) < 0)
a2da110b 4107 _exit(EXIT_FAILURE);
17fe0523 4108
d6797c92 4109 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
a2da110b 4110 _exit(EXIT_FAILURE);
17fe0523 4111
06c17c39 4112 if (mount_tmpfs(arg_directory) < 0)
a2da110b 4113 _exit(EXIT_FAILURE);
06c17c39 4114
b12afc8c
LP
4115 /* Wait until we are cgroup-ified, so that we
4116 * can mount the right cgroup path writable */
6dac160c 4117 (void) barrier_place_and_sync(&barrier); /* #2 */
b12afc8c
LP
4118
4119 if (mount_cgroup(arg_directory) < 0)
4120 _exit(EXIT_FAILURE);
d96c1ecf 4121
d87be9b0 4122 if (chdir(arg_directory) < 0) {
56f64d95 4123 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
a2da110b 4124 _exit(EXIT_FAILURE);
687d0825
MV
4125 }
4126
d87be9b0 4127 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
56f64d95 4128 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
a2da110b 4129 _exit(EXIT_FAILURE);
687d0825
MV
4130 }
4131
d87be9b0 4132 if (chroot(".") < 0) {
56f64d95 4133 log_error_errno(errno, "chroot() failed: %m");
a2da110b 4134 _exit(EXIT_FAILURE);
687d0825
MV
4135 }
4136
d87be9b0 4137 if (chdir("/") < 0) {
56f64d95 4138 log_error_errno(errno, "chdir() failed: %m");
a2da110b 4139 _exit(EXIT_FAILURE);
687d0825
MV
4140 }
4141
6dac160c
LP
4142 if (arg_userns) {
4143 if (unshare(CLONE_NEWUSER) < 0) {
4144 log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4145 _exit(EXIT_FAILURE);
4146 }
d87be9b0 4147
6dac160c
LP
4148 /* Tell the parent, that it now can
4149 * write the UID map. */
4150 (void) barrier_place(&barrier); /* #3 */
4151
4152 /* Wait until the parent wrote the UID
4153 * map */
4154 (void) barrier_place_and_sync(&barrier); /* #4 */
4155 }
4156
4157 umask(0022);
d87be9b0
LP
4158
4159 if (drop_capabilities() < 0) {
56f64d95 4160 log_error_errno(errno, "drop_capabilities() failed: %m");
a2da110b 4161 _exit(EXIT_FAILURE);
687d0825 4162 }
687d0825 4163
6dac160c
LP
4164 setup_hostname();
4165
4166 if (arg_personality != 0xffffffffLU) {
4167 if (personality(arg_personality) < 0) {
4168 log_error_errno(errno, "personality() failed: %m");
4169 _exit(EXIT_FAILURE);
4170 }
4171 } else if (secondary) {
4172 if (personality(PER_LINUX32) < 0) {
4173 log_error_errno(errno, "personality() failed: %m");
4174 _exit(EXIT_FAILURE);
4175 }
4176 }
4177
4178#ifdef HAVE_SELINUX
4179 if (arg_selinux_context)
4180 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4181 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4182 _exit(EXIT_FAILURE);
4183 }
4184#endif
4185
0cb9fbcd
LP
4186 r = change_uid_gid(&home);
4187 if (r < 0)
a2da110b 4188 _exit(EXIT_FAILURE);
d87be9b0 4189
842f3b0f
LP
4190 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4191 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4192 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 4193 log_oom();
a2da110b 4194 _exit(EXIT_FAILURE);
144f0fc0 4195 }
687d0825 4196
9444b1f2 4197 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
4198 char as_uuid[37];
4199
4200 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f 4201 log_oom();
a2da110b 4202 _exit(EXIT_FAILURE);
842f3b0f
LP
4203 }
4204 }
4205
4206 if (fdset_size(fds) > 0) {
ec16945e
LP
4207 r = fdset_cloexec(fds, false);
4208 if (r < 0) {
4209 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
a2da110b 4210 _exit(EXIT_FAILURE);
842f3b0f
LP
4211 }
4212
4213 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 4214 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0 4215 log_oom();
a2da110b 4216 _exit(EXIT_FAILURE);
d87be9b0
LP
4217 }
4218 }
4219
f4889f65
LP
4220 if (!strv_isempty(arg_setenv)) {
4221 char **n;
4222
4223 n = strv_env_merge(2, envp, arg_setenv);
4224 if (!n) {
4225 log_oom();
a2da110b 4226 _exit(EXIT_FAILURE);
f4889f65
LP
4227 }
4228
4229 env_use = n;
4230 } else
4231 env_use = (char**) envp;
4232
6dac160c
LP
4233 /* Let the parent know that we are ready and
4234 * wait until the parent is ready with the
4235 * setup, too... */
4236 (void) barrier_place_and_sync(&barrier); /* #5 */
d96c1ecf 4237
d87be9b0
LP
4238 if (arg_boot) {
4239 char **a;
4240 size_t l;
88213476 4241
d87be9b0 4242 /* Automatically search for the init system */
0f0dbc46 4243
d87be9b0
LP
4244 l = 1 + argc - optind;
4245 a = newa(char*, l + 1);
4246 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 4247
d87be9b0 4248 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 4249 execve(a[0], a, env_use);
0f0dbc46 4250
d87be9b0 4251 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 4252 execve(a[0], a, env_use);
0f0dbc46 4253
d87be9b0 4254 a[0] = (char*) "/sbin/init";
f4889f65 4255 execve(a[0], a, env_use);
d87be9b0 4256 } else if (argc > optind)
f4889f65 4257 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
4258 else {
4259 chdir(home ? home : "/root");
f4889f65 4260 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 4261 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
4262 }
4263
56f64d95 4264 log_error_errno(errno, "execv() failed: %m");
d87be9b0 4265 _exit(EXIT_FAILURE);
da5b3bad 4266 }
88213476 4267
a2da110b 4268 barrier_set_role(&barrier, BARRIER_PARENT);
842f3b0f
LP
4269 fdset_free(fds);
4270 fds = NULL;
4271
6d0b55c2
LP
4272 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4273 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4274
6dac160c
LP
4275 (void) barrier_place(&barrier); /* #1 */
4276
b12afc8c
LP
4277 /* Wait for the most basic Child-setup to be done,
4278 * before we add hardware to it, and place it in a
4279 * cgroup. */
6dac160c 4280 if (barrier_sync(&barrier)) { /* #1 */
5aa4bb6b 4281 int ifi = 0;
354bfd2b 4282
840295fc
LP
4283 r = move_network_interfaces(pid);
4284 if (r < 0)
4285 goto finish;
aa28aefe 4286
5aa4bb6b 4287 r = setup_veth(pid, veth_name, &ifi);
840295fc
LP
4288 if (r < 0)
4289 goto finish;
ab046dde 4290
5aa4bb6b 4291 r = setup_bridge(veth_name, &ifi);
840295fc
LP
4292 if (r < 0)
4293 goto finish;
ab046dde 4294
840295fc
LP
4295 r = setup_macvlan(pid);
4296 if (r < 0)
4297 goto finish;
c74e630d 4298
4bbfe7ad
TG
4299 r = setup_ipvlan(pid);
4300 if (r < 0)
4301 goto finish;
4302
5aa4bb6b
LP
4303 r = register_machine(pid, ifi);
4304 if (r < 0)
4305 goto finish;
4306
6dac160c
LP
4307 /* Notify the child that the parent is ready with all
4308 * its setup, and that the child can now hand over
4309 * control to the code to run inside the container. */
4310 (void) barrier_place(&barrier); /* #2 */
4311
4312 if (arg_userns) {
4313 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4314
4315 (void) barrier_place_and_sync(&barrier); /* #3 */
4316
4317 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4318 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4319 r = write_string_file(uid_map, line);
4320 if (r < 0) {
4321 log_error_errno(r, "Failed to write UID map: %m");
4322 goto finish;
4323 }
4324
4325 /* We always assign the same UID and GID ranges */
4326 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4327 r = write_string_file(uid_map, line);
4328 if (r < 0) {
4329 log_error_errno(r, "Failed to write GID map: %m");
4330 goto finish;
4331 }
4332
4333 (void) barrier_place(&barrier); /* #4 */
4334 }
4335
840295fc
LP
4336 /* Block SIGCHLD here, before notifying child.
4337 * process_pty() will handle it with the other signals. */
4338 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4339 if (r < 0)
4340 goto finish;
e866af3a 4341
840295fc
LP
4342 /* Reset signal to default */
4343 r = default_signals(SIGCHLD, -1);
4344 if (r < 0)
4345 goto finish;
e866af3a 4346
6dac160c
LP
4347 /* Let the child know that we are ready, and wait until the child is completely ready, too. */
4348 if (barrier_place_and_sync(&barrier)) { /* #5 */
6d0b55c2
LP
4349 _cleanup_event_unref_ sd_event *event = NULL;
4350 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4351 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4352 char last_char = 0;
b12afc8c 4353
733d15ac
LP
4354 sd_notifyf(false,
4355 "READY=1\n"
4356 "STATUS=Container running.\n"
4357 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 4358
6d0b55c2
LP
4359 r = sd_event_new(&event);
4360 if (r < 0) {
4361 log_error_errno(r, "Failed to get default event source: %m");
4362 goto finish;
4363 }
88213476 4364
c6c8f6e2 4365 if (arg_kill_signal > 0) {
6d0b55c2
LP
4366 /* Try to kill the init system on SIGINT or SIGTERM */
4367 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4368 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4369 } else {
4370 /* Immediately exit */
4371 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4372 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4373 }
023fb90b 4374
6d0b55c2
LP
4375 /* simply exit on sigchld */
4376 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 4377
6d0b55c2
LP
4378 if (arg_expose_ports) {
4379 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4380 if (r < 0)
4381 goto finish;
023fb90b 4382
6d0b55c2
LP
4383 (void) expose_ports(rtnl, &exposed);
4384 }
023fb90b 4385
6d0b55c2 4386 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 4387
9c857b9d 4388 r = pty_forward_new(event, master, true, !interactive, &forward);
6d0b55c2
LP
4389 if (r < 0) {
4390 log_error_errno(r, "Failed to create PTY forwarder: %m");
4391 goto finish;
4392 }
023fb90b 4393
6d0b55c2
LP
4394 r = sd_event_loop(event);
4395 if (r < 0) {
4396 log_error_errno(r, "Failed to run event loop: %m");
4397 goto finish;
4398 }
4399
4400 pty_forward_get_last_char(forward, &last_char);
4401
4402 forward = pty_forward_free(forward);
4403
4404 if (!arg_quiet && last_char != '\n')
4405 putc('\n', stdout);
04d39279 4406
6d0b55c2
LP
4407 /* Kill if it is not dead yet anyway */
4408 terminate_machine(pid);
4409 }
840295fc 4410 }
1f0cd86b 4411
840295fc 4412 /* Normally redundant, but better safe than sorry */
04d39279 4413 kill(pid, SIGKILL);
a258bf26 4414
113cea80 4415 r = wait_for_container(pid, &container_status);
04d39279
LP
4416 pid = 0;
4417
ec16945e 4418 if (r < 0)
ce9f1527
LP
4419 /* We failed to wait for the container, or the
4420 * container exited abnormally */
ec16945e
LP
4421 goto finish;
4422 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
4423 /* The container exited with a non-zero
4424 * status, or with zero status and no reboot
4425 * was requested. */
ec16945e 4426 ret = r;
d87be9b0 4427 break;
ec16945e 4428 }
88213476 4429
113cea80 4430 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
4431
4432 if (arg_keep_unit) {
4433 /* Special handling if we are running as a
4434 * service: instead of simply restarting the
4435 * machine we want to restart the entire
4436 * service, so let's inform systemd about this
4437 * with the special exit code 133. The service
4438 * file uses RestartForceExitStatus=133 so
4439 * that this results in a full nspawn
4440 * restart. This is necessary since we might
4441 * have cgroup parameters set we want to have
4442 * flushed out. */
ec16945e
LP
4443 ret = 133;
4444 r = 0;
ce38dbc8
LP
4445 break;
4446 }
6d0b55c2
LP
4447
4448 flush_ports(&exposed);
d87be9b0 4449 }
88213476
LP
4450
4451finish:
af4ec430
LP
4452 sd_notify(false,
4453 "STOPPING=1\n"
4454 "STATUS=Terminating...");
4455
1b9e5b12
LP
4456 loop_remove(loop_nr, &image_fd);
4457
9444b1f2
LP
4458 if (pid > 0)
4459 kill(pid, SIGKILL);
88213476 4460
ec16945e
LP
4461 if (remove_subvol && arg_directory) {
4462 int k;
4463
4464 k = btrfs_subvol_remove(arg_directory);
4465 if (k < 0)
4466 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4467 }
4468
785890ac
LP
4469 if (arg_machine) {
4470 const char *p;
4471
63c372cb 4472 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
4473 (void) rm_rf(p, false, true, false);
4474 }
4475
04d391da 4476 free(arg_directory);
ec16945e
LP
4477 free(arg_template);
4478 free(arg_image);
7027ff61 4479 free(arg_machine);
c74e630d
LP
4480 free(arg_user);
4481 strv_free(arg_setenv);
4482 strv_free(arg_network_interfaces);
4483 strv_free(arg_network_macvlan);
4bbfe7ad 4484 strv_free(arg_network_ipvlan);
c74e630d
LP
4485 strv_free(arg_bind);
4486 strv_free(arg_bind_ro);
06c17c39 4487 strv_free(arg_tmpfs);
88213476 4488
6d0b55c2
LP
4489 flush_ports(&exposed);
4490
4491 while (arg_expose_ports) {
4492 ExposePort *p = arg_expose_ports;
4493 LIST_REMOVE(ports, arg_expose_ports, p);
4494 free(p);
4495 }
4496
ec16945e 4497 return r < 0 ? EXIT_FAILURE : ret;
88213476 4498}