]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: make kill signal to use for PID 1 configurable
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2010 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
88213476 26#include <sys/mount.h>
88213476
LP
27#include <stdlib.h>
28#include <string.h>
29#include <stdio.h>
30#include <errno.h>
31#include <sys/prctl.h>
88213476 32#include <getopt.h>
687d0825 33#include <grp.h>
5ed27dbd 34#include <linux/fs.h>
9537eab0 35#include <sys/socket.h>
aea38d80 36#include <linux/netlink.h>
aa28aefe 37#include <net/if.h>
69c79d3c 38#include <linux/veth.h>
6afc95b7 39#include <sys/personality.h>
1b9e5b12 40#include <linux/loop.h>
2fbe4296 41#include <sys/file.h>
aa28aefe 42
5d63309c 43#ifdef HAVE_SELINUX
a8828ed9
DW
44#include <selinux/selinux.h>
45#endif
88213476 46
24fb1112
LP
47#ifdef HAVE_SECCOMP
48#include <seccomp.h>
49#endif
50
1b9e5b12
LP
51#ifdef HAVE_BLKID
52#include <blkid/blkid.h>
53#endif
54
1f0cd86b
LP
55#include "sd-daemon.h"
56#include "sd-bus.h"
57#include "sd-id128.h"
aa28aefe 58#include "sd-rtnl.h"
88213476
LP
59#include "log.h"
60#include "util.h"
49e942b2 61#include "mkdir.h"
6b2d0e85 62#include "macro.h"
94d82985 63#include "missing.h"
04d391da 64#include "cgroup-util.h"
a258bf26 65#include "strv.h"
9eb977db 66#include "path-util.h"
a41fe3a2 67#include "loopback-setup.h"
4fc9982c 68#include "dev-setup.h"
842f3b0f 69#include "fdset.h"
acbeb427 70#include "build.h"
a5c32cff 71#include "fileio.h"
40ca29a1 72#include "bus-util.h"
1f0cd86b 73#include "bus-error.h"
4ba93280 74#include "ptyfwd.h"
f4889f65 75#include "env-util.h"
aa28aefe 76#include "rtnl-util.h"
7e227024 77#include "udev-util.h"
1b9e5b12
LP
78#include "blkid-util.h"
79#include "gpt.h"
01dde061 80#include "siphash24.h"
849958d1 81#include "copy.h"
3577de7a 82#include "base-filesystem.h"
a2da110b 83#include "barrier.h"
023fb90b 84#include "event-util.h"
f01ae826 85#include "capability.h"
2822da4f 86#include "cap-list.h"
ec16945e 87#include "btrfs-util.h"
1b9cebf6 88#include "machine-image.h"
6d0b55c2
LP
89#include "list.h"
90#include "in-addr-util.h"
91#include "fw-util.h"
92#include "local-addresses.h"
f2d88580 93
e9642be2
LP
94#ifdef HAVE_SECCOMP
95#include "seccomp-util.h"
96#endif
97
6d0b55c2
LP
98typedef struct ExposePort {
99 int protocol;
100 uint16_t host_port;
101 uint16_t container_port;
102 LIST_FIELDS(struct ExposePort, ports);
103} ExposePort;
104
113cea80
DH
/* Outcome of a container run.
 * NOTE(review): the consumers of this enum are outside this part of the
 * file; presumably it tells the main loop whether to exit or start the
 * container again -- confirm against the caller. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
109
57fb9fb5
LP
/* Mode selected by --link-journal= (and -j): whether and how the guest
 * journal is linked up. The "try-" variants of the option map to
 * LINK_GUEST/LINK_HOST plus the separate arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO,
        LINK_AUTO,
        LINK_HOST,
        LINK_GUEST
} LinkJournal;
88213476 116
4d9f07b4
LP
/* Mode selected by --volatile[=MODE]: a boolean argument maps to
 * VOLATILE_YES/VOLATILE_NO, the literal "state" maps to VOLATILE_STATE,
 * and the option without an argument means VOLATILE_YES. */
typedef enum Volatile {
        VOLATILE_NO,
        VOLATILE_YES,
        VOLATILE_STATE,
} Volatile;
122
88213476 123static char *arg_directory = NULL;
ec16945e 124static char *arg_template = NULL;
687d0825 125static char *arg_user = NULL;
9444b1f2 126static sd_id128_t arg_uuid = {};
7027ff61 127static char *arg_machine = NULL;
c74e630d
LP
128static const char *arg_selinux_context = NULL;
129static const char *arg_selinux_apifs_context = NULL;
9444b1f2 130static const char *arg_slice = NULL;
ff01d048 131static bool arg_private_network = false;
bc2f673e 132static bool arg_read_only = false;
0f0dbc46 133static bool arg_boot = false;
ec16945e 134static bool arg_ephemeral = false;
57fb9fb5 135static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 136static bool arg_link_journal_try = false;
5076f0cc
LP
137static uint64_t arg_retain =
138 (1ULL << CAP_CHOWN) |
139 (1ULL << CAP_DAC_OVERRIDE) |
140 (1ULL << CAP_DAC_READ_SEARCH) |
141 (1ULL << CAP_FOWNER) |
142 (1ULL << CAP_FSETID) |
143 (1ULL << CAP_IPC_OWNER) |
144 (1ULL << CAP_KILL) |
145 (1ULL << CAP_LEASE) |
146 (1ULL << CAP_LINUX_IMMUTABLE) |
147 (1ULL << CAP_NET_BIND_SERVICE) |
148 (1ULL << CAP_NET_BROADCAST) |
149 (1ULL << CAP_NET_RAW) |
150 (1ULL << CAP_SETGID) |
151 (1ULL << CAP_SETFCAP) |
152 (1ULL << CAP_SETPCAP) |
153 (1ULL << CAP_SETUID) |
154 (1ULL << CAP_SYS_ADMIN) |
155 (1ULL << CAP_SYS_CHROOT) |
156 (1ULL << CAP_SYS_NICE) |
157 (1ULL << CAP_SYS_PTRACE) |
158 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 159 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
160 (1ULL << CAP_SYS_BOOT) |
161 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
162 (1ULL << CAP_AUDIT_CONTROL) |
163 (1ULL << CAP_MKNOD);
17fe0523
LP
164static char **arg_bind = NULL;
165static char **arg_bind_ro = NULL;
06c17c39 166static char **arg_tmpfs = NULL;
f4889f65 167static char **arg_setenv = NULL;
284c0b91 168static bool arg_quiet = false;
8a96d94e 169static bool arg_share_system = false;
eb91eb18 170static bool arg_register = true;
89f7c846 171static bool arg_keep_unit = false;
aa28aefe 172static char **arg_network_interfaces = NULL;
c74e630d 173static char **arg_network_macvlan = NULL;
4bbfe7ad 174static char **arg_network_ipvlan = NULL;
69c79d3c 175static bool arg_network_veth = false;
c74e630d 176static const char *arg_network_bridge = NULL;
6afc95b7 177static unsigned long arg_personality = 0xffffffffLU;
ec16945e 178static char *arg_image = NULL;
4d9f07b4 179static Volatile arg_volatile = VOLATILE_NO;
6d0b55c2 180static ExposePort *arg_expose_ports = NULL;
f36933fe 181static char **arg_property = NULL;
6dac160c
LP
182static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
183static bool arg_userns = false;
c6c8f6e2 184static int arg_kill_signal = 0;
88213476 185
601185b4 186static void help(void) {
88213476
LP
187 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
188 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
189 " -h --help Show this help\n"
190 " --version Print version string\n"
69c79d3c 191 " -q --quiet Do not show status information\n"
1b9e5b12 192 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
193 " --template=PATH Initialize root directory from template directory,\n"
194 " if missing\n"
195 " -x --ephemeral Run container with snapshot of root directory, and\n"
196 " remove it after exit\n"
197 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
198 " -b --boot Boot up full system (i.e. invoke init)\n"
199 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 200 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 201 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 202 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 203 " --property=NAME=VALUE Set scope unit property\n"
69c79d3c
LP
204 " --private-network Disable network in container\n"
205 " --network-interface=INTERFACE\n"
206 " Assign an existing network interface to the\n"
207 " container\n"
c74e630d
LP
208 " --network-macvlan=INTERFACE\n"
209 " Create a macvlan network interface based on an\n"
210 " existing network interface to the container\n"
4bbfe7ad
TG
211 " --network-ipvlan=INTERFACE\n"
212 " Create a ipvlan network interface based on an\n"
213 " existing network interface to the container\n"
0dfaa006 214 " -n --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 215 " and container\n"
ab046dde 216 " --network-bridge=INTERFACE\n"
32457153 217 " Add a virtual ethernet connection between host\n"
ab046dde
TG
218 " and container and add it to an existing bridge on\n"
219 " the host\n"
6dac160c
LP
220 " --private-users[=UIDBASE[:NUIDS]]\n"
221 " Run within user namespace\n"
6d0b55c2 222 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 223 " Expose a container IP port on the host\n"
82adf6af
LP
224 " -Z --selinux-context=SECLABEL\n"
225 " Set the SELinux security context to be used by\n"
226 " processes in the container\n"
227 " -L --selinux-apifs-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " API/tmpfs file systems in the container\n"
a8828ed9
DW
230 " --capability=CAP In addition to the default, retain specified\n"
231 " capability\n"
232 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 233 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
574edc90
MP
234 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
235 " try-guest, try-host\n"
236 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 237 " --read-only Mount the root directory read-only\n"
a8828ed9
DW
238 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
239 " the container\n"
240 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
06c17c39 241 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
284c0b91 242 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 243 " --share-system Share system namespaces with host\n"
eb91eb18 244 " --register=BOOLEAN Register container as machine\n"
89f7c846 245 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 246 " the service unit nspawn is running in\n"
6d0b55c2
LP
247 " --volatile[=MODE] Run the system in volatile mode\n"
248 , program_invocation_short_name);
88213476
LP
249}
250
ec16945e
LP
/* Replaces *b with a cleaned-up, newly allocated version of path.
 *
 * The path is canonicalized first. If that fails because the path does not
 * exist (ENOENT), it is instead made absolute relative to the current
 * working directory; any other canonicalization error is returned. The
 * result is passed through path_kill_slashes() and stored in *b, after the
 * previous value of *b has been freed.
 *
 * Returns 0 on success, a negative errno-style value on failure (in which
 * case *b is left untouched). */
static int set_sanitized_path(char **b, const char *path) {
        char *p;

        assert(b);
        assert(path);

        p = canonicalize_file_name(path);
        if (!p) {
                if (errno != ENOENT)
                        return -errno;

                /* Not existing yet is fine, e.g. for a directory that is
                 * still to be created; just make the name absolute. */
                p = path_make_absolute_cwd(path);
                if (!p)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(p);
        return 0;
}
271
88213476
LP
272static int parse_argv(int argc, char *argv[]) {
273
a41fe3a2 274 enum {
acbeb427
ZJS
275 ARG_VERSION = 0x100,
276 ARG_PRIVATE_NETWORK,
bc2f673e 277 ARG_UUID,
5076f0cc 278 ARG_READ_ONLY,
57fb9fb5 279 ARG_CAPABILITY,
420c7379 280 ARG_DROP_CAPABILITY,
17fe0523
LP
281 ARG_LINK_JOURNAL,
282 ARG_BIND,
f4889f65 283 ARG_BIND_RO,
06c17c39 284 ARG_TMPFS,
f4889f65 285 ARG_SETENV,
eb91eb18 286 ARG_SHARE_SYSTEM,
89f7c846 287 ARG_REGISTER,
aa28aefe 288 ARG_KEEP_UNIT,
69c79d3c 289 ARG_NETWORK_INTERFACE,
c74e630d 290 ARG_NETWORK_MACVLAN,
4bbfe7ad 291 ARG_NETWORK_IPVLAN,
ab046dde 292 ARG_NETWORK_BRIDGE,
6afc95b7 293 ARG_PERSONALITY,
4d9f07b4 294 ARG_VOLATILE,
ec16945e 295 ARG_TEMPLATE,
f36933fe 296 ARG_PROPERTY,
6dac160c 297 ARG_PRIVATE_USERS,
c6c8f6e2 298 ARG_KILL_SIGNAL,
a41fe3a2
LP
299 };
300
88213476 301 static const struct option options[] = {
aa28aefe
LP
302 { "help", no_argument, NULL, 'h' },
303 { "version", no_argument, NULL, ARG_VERSION },
304 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
305 { "template", required_argument, NULL, ARG_TEMPLATE },
306 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
307 { "user", required_argument, NULL, 'u' },
308 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
309 { "boot", no_argument, NULL, 'b' },
310 { "uuid", required_argument, NULL, ARG_UUID },
311 { "read-only", no_argument, NULL, ARG_READ_ONLY },
312 { "capability", required_argument, NULL, ARG_CAPABILITY },
313 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
314 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
315 { "bind", required_argument, NULL, ARG_BIND },
316 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 317 { "tmpfs", required_argument, NULL, ARG_TMPFS },
aa28aefe
LP
318 { "machine", required_argument, NULL, 'M' },
319 { "slice", required_argument, NULL, 'S' },
320 { "setenv", required_argument, NULL, ARG_SETENV },
321 { "selinux-context", required_argument, NULL, 'Z' },
322 { "selinux-apifs-context", required_argument, NULL, 'L' },
323 { "quiet", no_argument, NULL, 'q' },
324 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
325 { "register", required_argument, NULL, ARG_REGISTER },
326 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
327 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 328 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 329 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 330 { "network-veth", no_argument, NULL, 'n' },
ab046dde 331 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 332 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 333 { "image", required_argument, NULL, 'i' },
4d9f07b4 334 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 335 { "port", required_argument, NULL, 'p' },
f36933fe 336 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 337 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 338 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
eb9da376 339 {}
88213476
LP
340 };
341
9444b1f2 342 int c, r;
a42c8b54 343 uint64_t plus = 0, minus = 0;
88213476
LP
344
345 assert(argc >= 0);
346 assert(argv);
347
0dfaa006 348 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
349
350 switch (c) {
351
352 case 'h':
601185b4
ZJS
353 help();
354 return 0;
88213476 355
acbeb427
ZJS
356 case ARG_VERSION:
357 puts(PACKAGE_STRING);
358 puts(SYSTEMD_FEATURES);
359 return 0;
360
88213476 361 case 'D':
ec16945e
LP
362 r = set_sanitized_path(&arg_directory, optarg);
363 if (r < 0)
364 return log_error_errno(r, "Invalid root directory: %m");
365
366 break;
367
368 case ARG_TEMPLATE:
369 r = set_sanitized_path(&arg_template, optarg);
370 if (r < 0)
371 return log_error_errno(r, "Invalid template directory: %m");
88213476
LP
372
373 break;
374
1b9e5b12 375 case 'i':
ec16945e
LP
376 r = set_sanitized_path(&arg_image, optarg);
377 if (r < 0)
378 return log_error_errno(r, "Invalid image path: %m");
379
380 break;
381
382 case 'x':
383 arg_ephemeral = true;
1b9e5b12
LP
384 break;
385
687d0825
MV
386 case 'u':
387 free(arg_user);
7027ff61
LP
388 arg_user = strdup(optarg);
389 if (!arg_user)
390 return log_oom();
687d0825
MV
391
392 break;
393
ab046dde 394 case ARG_NETWORK_BRIDGE:
c74e630d 395 arg_network_bridge = optarg;
ab046dde
TG
396
397 /* fall through */
398
0dfaa006 399 case 'n':
69c79d3c
LP
400 arg_network_veth = true;
401 arg_private_network = true;
402 break;
403
aa28aefe 404 case ARG_NETWORK_INTERFACE:
c74e630d
LP
405 if (strv_extend(&arg_network_interfaces, optarg) < 0)
406 return log_oom();
407
408 arg_private_network = true;
409 break;
410
411 case ARG_NETWORK_MACVLAN:
412 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
413 return log_oom();
414
4bbfe7ad
TG
415 arg_private_network = true;
416 break;
417
418 case ARG_NETWORK_IPVLAN:
419 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
420 return log_oom();
421
aa28aefe
LP
422 /* fall through */
423
ff01d048
LP
424 case ARG_PRIVATE_NETWORK:
425 arg_private_network = true;
a41fe3a2
LP
426 break;
427
0f0dbc46
LP
428 case 'b':
429 arg_boot = true;
430 break;
431
144f0fc0 432 case ARG_UUID:
9444b1f2
LP
433 r = sd_id128_from_string(optarg, &arg_uuid);
434 if (r < 0) {
aa96c6cb 435 log_error("Invalid UUID: %s", optarg);
9444b1f2 436 return r;
aa96c6cb 437 }
9444b1f2 438 break;
aa96c6cb 439
9444b1f2 440 case 'S':
c74e630d 441 arg_slice = optarg;
144f0fc0
LP
442 break;
443
7027ff61 444 case 'M':
eb91eb18
LP
445 if (isempty(optarg)) {
446 free(arg_machine);
447 arg_machine = NULL;
448 } else {
0c3c4284 449 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
450 log_error("Invalid machine name: %s", optarg);
451 return -EINVAL;
452 }
7027ff61 453
0c3c4284
LP
454 r = free_and_strdup(&arg_machine, optarg);
455 if (r < 0)
eb91eb18
LP
456 return log_oom();
457
458 break;
459 }
7027ff61 460
82adf6af
LP
461 case 'Z':
462 arg_selinux_context = optarg;
a8828ed9
DW
463 break;
464
82adf6af
LP
465 case 'L':
466 arg_selinux_apifs_context = optarg;
a8828ed9
DW
467 break;
468
bc2f673e
LP
469 case ARG_READ_ONLY:
470 arg_read_only = true;
471 break;
472
420c7379
LP
473 case ARG_CAPABILITY:
474 case ARG_DROP_CAPABILITY: {
a2a5291b 475 const char *state, *word;
5076f0cc
LP
476 size_t length;
477
478 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 479 _cleanup_free_ char *t;
5076f0cc
LP
480
481 t = strndup(word, length);
0d0f0c50
SL
482 if (!t)
483 return log_oom();
5076f0cc 484
39ed67d1
LP
485 if (streq(t, "all")) {
486 if (c == ARG_CAPABILITY)
a42c8b54 487 plus = (uint64_t) -1;
39ed67d1 488 else
a42c8b54 489 minus = (uint64_t) -1;
39ed67d1 490 } else {
2822da4f
LP
491 int cap;
492
493 cap = capability_from_name(t);
494 if (cap < 0) {
39ed67d1
LP
495 log_error("Failed to parse capability %s.", t);
496 return -EINVAL;
497 }
498
499 if (c == ARG_CAPABILITY)
a42c8b54 500 plus |= 1ULL << (uint64_t) cap;
39ed67d1 501 else
a42c8b54 502 minus |= 1ULL << (uint64_t) cap;
5076f0cc 503 }
5076f0cc
LP
504 }
505
506 break;
507 }
508
57fb9fb5
LP
509 case 'j':
510 arg_link_journal = LINK_GUEST;
574edc90 511 arg_link_journal_try = true;
57fb9fb5
LP
512 break;
513
514 case ARG_LINK_JOURNAL:
53e438e3 515 if (streq(optarg, "auto")) {
57fb9fb5 516 arg_link_journal = LINK_AUTO;
53e438e3
LP
517 arg_link_journal_try = false;
518 } else if (streq(optarg, "no")) {
57fb9fb5 519 arg_link_journal = LINK_NO;
53e438e3
LP
520 arg_link_journal_try = false;
521 } else if (streq(optarg, "guest")) {
57fb9fb5 522 arg_link_journal = LINK_GUEST;
53e438e3
LP
523 arg_link_journal_try = false;
524 } else if (streq(optarg, "host")) {
57fb9fb5 525 arg_link_journal = LINK_HOST;
53e438e3
LP
526 arg_link_journal_try = false;
527 } else if (streq(optarg, "try-guest")) {
574edc90
MP
528 arg_link_journal = LINK_GUEST;
529 arg_link_journal_try = true;
530 } else if (streq(optarg, "try-host")) {
531 arg_link_journal = LINK_HOST;
532 arg_link_journal_try = true;
533 } else {
57fb9fb5
LP
534 log_error("Failed to parse link journal mode %s", optarg);
535 return -EINVAL;
536 }
537
538 break;
539
17fe0523
LP
540 case ARG_BIND:
541 case ARG_BIND_RO: {
542 _cleanup_free_ char *a = NULL, *b = NULL;
543 char *e;
544 char ***x;
17fe0523
LP
545
546 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
547
548 e = strchr(optarg, ':');
549 if (e) {
550 a = strndup(optarg, e - optarg);
551 b = strdup(e + 1);
552 } else {
553 a = strdup(optarg);
554 b = strdup(optarg);
555 }
556
557 if (!a || !b)
558 return log_oom();
559
560 if (!path_is_absolute(a) || !path_is_absolute(b)) {
561 log_error("Invalid bind mount specification: %s", optarg);
562 return -EINVAL;
563 }
564
565 r = strv_extend(x, a);
566 if (r < 0)
b3451bed 567 return log_oom();
17fe0523
LP
568
569 r = strv_extend(x, b);
570 if (r < 0)
b3451bed 571 return log_oom();
17fe0523
LP
572
573 break;
574 }
575
06c17c39
LP
576 case ARG_TMPFS: {
577 _cleanup_free_ char *a = NULL, *b = NULL;
578 char *e;
579
580 e = strchr(optarg, ':');
581 if (e) {
582 a = strndup(optarg, e - optarg);
583 b = strdup(e + 1);
584 } else {
585 a = strdup(optarg);
586 b = strdup("mode=0755");
587 }
588
589 if (!a || !b)
590 return log_oom();
591
592 if (!path_is_absolute(a)) {
593 log_error("Invalid tmpfs specification: %s", optarg);
594 return -EINVAL;
595 }
596
597 r = strv_push(&arg_tmpfs, a);
598 if (r < 0)
599 return log_oom();
600
601 a = NULL;
602
603 r = strv_push(&arg_tmpfs, b);
604 if (r < 0)
605 return log_oom();
606
607 b = NULL;
608
609 break;
610 }
611
f4889f65
LP
612 case ARG_SETENV: {
613 char **n;
614
615 if (!env_assignment_is_valid(optarg)) {
616 log_error("Environment variable assignment '%s' is not valid.", optarg);
617 return -EINVAL;
618 }
619
620 n = strv_env_set(arg_setenv, optarg);
621 if (!n)
622 return log_oom();
623
624 strv_free(arg_setenv);
625 arg_setenv = n;
626 break;
627 }
628
284c0b91
LP
629 case 'q':
630 arg_quiet = true;
631 break;
632
8a96d94e
LP
633 case ARG_SHARE_SYSTEM:
634 arg_share_system = true;
635 break;
636
eb91eb18
LP
637 case ARG_REGISTER:
638 r = parse_boolean(optarg);
639 if (r < 0) {
640 log_error("Failed to parse --register= argument: %s", optarg);
641 return r;
642 }
643
644 arg_register = r;
645 break;
646
89f7c846
LP
647 case ARG_KEEP_UNIT:
648 arg_keep_unit = true;
649 break;
650
6afc95b7
LP
651 case ARG_PERSONALITY:
652
ac45f971 653 arg_personality = personality_from_string(optarg);
6afc95b7
LP
654 if (arg_personality == 0xffffffffLU) {
655 log_error("Unknown or unsupported personality '%s'.", optarg);
656 return -EINVAL;
657 }
658
659 break;
660
4d9f07b4
LP
661 case ARG_VOLATILE:
662
663 if (!optarg)
664 arg_volatile = VOLATILE_YES;
665 else {
666 r = parse_boolean(optarg);
667 if (r < 0) {
668 if (streq(optarg, "state"))
669 arg_volatile = VOLATILE_STATE;
670 else {
671 log_error("Failed to parse --volatile= argument: %s", optarg);
672 return r;
673 }
674 } else
675 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
676 }
677
678 break;
679
6d0b55c2
LP
680 case 'p': {
681 const char *split, *e;
682 uint16_t container_port, host_port;
683 int protocol;
684 ExposePort *p;
685
686 if ((e = startswith(optarg, "tcp:")))
687 protocol = IPPROTO_TCP;
688 else if ((e = startswith(optarg, "udp:")))
689 protocol = IPPROTO_UDP;
690 else {
691 e = optarg;
692 protocol = IPPROTO_TCP;
693 }
694
695 split = strchr(e, ':');
696 if (split) {
697 char v[split - e + 1];
698
699 memcpy(v, e, split - e);
700 v[split - e] = 0;
701
702 r = safe_atou16(v, &host_port);
703 if (r < 0 || host_port <= 0) {
704 log_error("Failed to parse host port: %s", optarg);
705 return -EINVAL;
706 }
707
708 r = safe_atou16(split + 1, &container_port);
709 } else {
710 r = safe_atou16(e, &container_port);
711 host_port = container_port;
712 }
713
714 if (r < 0 || container_port <= 0) {
715 log_error("Failed to parse host port: %s", optarg);
716 return -EINVAL;
717 }
718
719 LIST_FOREACH(ports, p, arg_expose_ports) {
720 if (p->protocol == protocol && p->host_port == host_port) {
721 log_error("Duplicate port specification: %s", optarg);
722 return -EINVAL;
723 }
724 }
725
726 p = new(ExposePort, 1);
727 if (!p)
728 return log_oom();
729
730 p->protocol = protocol;
731 p->host_port = host_port;
732 p->container_port = container_port;
733
734 LIST_PREPEND(ports, arg_expose_ports, p);
735
736 break;
737 }
738
f36933fe
LP
739 case ARG_PROPERTY:
740 if (strv_extend(&arg_property, optarg) < 0)
741 return log_oom();
742
743 break;
744
6dac160c
LP
745 case ARG_PRIVATE_USERS:
746 if (optarg) {
747 _cleanup_free_ char *buffer = NULL;
748 const char *range, *shift;
749
750 range = strchr(optarg, ':');
751 if (range) {
752 buffer = strndup(optarg, range - optarg);
753 if (!buffer)
754 return log_oom();
755 shift = buffer;
756
757 range++;
758 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
759 log_error("Failed to parse UID range: %s", range);
760 return -EINVAL;
761 }
762 } else
763 shift = optarg;
764
765 if (parse_uid(shift, &arg_uid_shift) < 0) {
766 log_error("Failed to parse UID: %s", optarg);
767 return -EINVAL;
768 }
769 }
770
771 arg_userns = true;
772 break;
773
c6c8f6e2
LP
774 case ARG_KILL_SIGNAL:
775 arg_kill_signal = signal_from_string_try_harder(optarg);
776 if (arg_kill_signal < 0) {
777 log_error("Cannot parse signal: %s", optarg);
778 return -EINVAL;
779 }
780
781 break;
782
88213476
LP
783 case '?':
784 return -EINVAL;
785
786 default:
eb9da376 787 assert_not_reached("Unhandled option");
88213476 788 }
88213476 789
eb91eb18
LP
790 if (arg_share_system)
791 arg_register = false;
792
793 if (arg_boot && arg_share_system) {
794 log_error("--boot and --share-system may not be combined.");
795 return -EINVAL;
796 }
797
89f7c846
LP
798 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
799 log_error("--keep-unit may not be used when invoked from a user session.");
800 return -EINVAL;
801 }
802
1b9e5b12
LP
803 if (arg_directory && arg_image) {
804 log_error("--directory= and --image= may not be combined.");
805 return -EINVAL;
806 }
807
ec16945e
LP
808 if (arg_template && arg_image) {
809 log_error("--template= and --image= may not be combined.");
810 return -EINVAL;
811 }
812
813 if (arg_template && !(arg_directory || arg_machine)) {
814 log_error("--template= needs --directory= or --machine=.");
815 return -EINVAL;
816 }
817
818 if (arg_ephemeral && arg_template) {
819 log_error("--ephemeral and --template= may not be combined.");
820 return -EINVAL;
821 }
822
823 if (arg_ephemeral && arg_image) {
824 log_error("--ephemeral and --image= may not be combined.");
825 return -EINVAL;
826 }
827
df9a75e4
LP
828 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
829 log_error("--ephemeral and --link-journal= may not be combined.");
830 return -EINVAL;
831 }
832
4d9f07b4
LP
833 if (arg_volatile != VOLATILE_NO && arg_read_only) {
834 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
835 return -EINVAL;
836 }
837
6d0b55c2
LP
838 if (arg_expose_ports && !arg_private_network) {
839 log_error("Cannot use --port= without private networking.");
840 return -EINVAL;
841 }
842
a42c8b54
LP
843 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
844
c6c8f6e2
LP
845 if (arg_boot && arg_kill_signal <= 0)
846 arg_kill_signal = SIGRTMIN+3;
847
88213476
LP
848 return 1;
849}
850
851static int mount_all(const char *dest) {
852
853 typedef struct MountPoint {
854 const char *what;
855 const char *where;
856 const char *type;
857 const char *options;
858 unsigned long flags;
3bd66c05 859 bool fatal;
88213476
LP
860 } MountPoint;
861
862 static const MountPoint mount_table[] = {
06c17c39
LP
863 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
864 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
865 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
866 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
867 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 868 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
06c17c39
LP
869 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
870 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
bbb99c30 871 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
9b634ea5 872#ifdef HAVE_SELINUX
06c17c39
LP
873 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
874 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 875#endif
88213476
LP
876 };
877
878 unsigned k;
879 int r = 0;
880
881 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
d15d65a0 882 _cleanup_free_ char *where = NULL, *options = NULL;
d002827b 883 const char *o;
88213476
LP
884 int t;
885
17fe0523
LP
886 where = strjoin(dest, "/", mount_table[k].where, NULL);
887 if (!where)
888 return log_oom();
88213476 889
e65aec12 890 t = path_is_mount_point(where, true);
68fb0892 891 if (t < 0) {
da927ba9 892 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
88213476
LP
893
894 if (r == 0)
895 r = t;
896
897 continue;
898 }
899
9c1c7f71
LP
900 /* Skip this entry if it is not a remount. */
901 if (mount_table[k].what && t > 0)
014a9c77
LP
902 continue;
903
79d80fc1
TG
904 t = mkdir_p(where, 0755);
905 if (t < 0) {
906 if (mount_table[k].fatal) {
da927ba9 907 log_error_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
908
909 if (r == 0)
910 r = t;
911 } else
da927ba9 912 log_warning_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
913
914 continue;
915 }
88213476 916
a8828ed9 917#ifdef HAVE_SELINUX
82adf6af
LP
918 if (arg_selinux_apifs_context &&
919 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
920 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
921 if (!options)
922 return log_oom();
923
924 o = options;
925 } else
a8828ed9 926#endif
d002827b 927 o = mount_table[k].options;
a8828ed9 928
6dac160c
LP
929 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
930 char *uid_options = NULL;
931
932 if (o)
933 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
934 else
935 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
936 if (!uid_options)
937 return log_oom();
938
939 free(options);
940 o = options = uid_options;
941 }
a8828ed9 942
88213476
LP
943 if (mount(mount_table[k].what,
944 where,
945 mount_table[k].type,
946 mount_table[k].flags,
79d80fc1 947 o) < 0) {
88213476 948
79d80fc1 949 if (mount_table[k].fatal) {
56f64d95 950 log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 951
79d80fc1
TG
952 if (r == 0)
953 r = -errno;
954 } else
56f64d95 955 log_warning_errno(errno, "mount(%s) failed: %m", where);
88213476 956 }
88213476
LP
957 }
958
e58a1277
LP
959 return r;
960}
f8440af5 961
d6797c92 962static int mount_binds(const char *dest, char **l, bool ro) {
17fe0523
LP
963 char **x, **y;
964
965 STRV_FOREACH_PAIR(x, y, l) {
06c17c39 966 _cleanup_free_ char *where = NULL;
d2421337 967 struct stat source_st, dest_st;
2ed4e5e0 968 int r;
d2421337 969
4a62c710
MS
970 if (stat(*x, &source_st) < 0)
971 return log_error_errno(errno, "Failed to stat %s: %m", *x);
17fe0523 972
06c17c39
LP
973 where = strappend(dest, *y);
974 if (!where)
975 return log_oom();
976
2ed4e5e0
SL
977 r = stat(where, &dest_st);
978 if (r == 0) {
05e7da5a
AC
979 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
980 log_error("Cannot bind mount directory %s on file %s.", *x, where);
981 return -EINVAL;
982 }
983 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
984 log_error("Cannot bind mount file %s on directory %s.", *x, where);
d2421337
DR
985 return -EINVAL;
986 }
2ed4e5e0
SL
987 } else if (errno == ENOENT) {
988 r = mkdir_parents_label(where, 0755);
f647962d
MS
989 if (r < 0)
990 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
2ed4e5e0 991 } else {
56f64d95 992 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
2ed4e5e0
SL
993 return -errno;
994 }
06c17c39 995
05e7da5a
AC
996 /* Create the mount point. Any non-directory file can be
997 * mounted on any non-directory file (regular, fifo, socket,
998 * char, block).
999 */
79d80fc1
TG
1000 if (S_ISDIR(source_st.st_mode)) {
1001 r = mkdir_label(where, 0755);
f647962d
MS
1002 if (r < 0 && errno != EEXIST)
1003 return log_error_errno(r, "Failed to create mount point %s: %m", where);
05e7da5a 1004 } else {
79d80fc1 1005 r = touch(where);
f647962d
MS
1006 if (r < 0)
1007 return log_error_errno(r, "Failed to create mount point %s: %m", where);
d2421337 1008 }
17fe0523 1009
4a62c710
MS
1010 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
1011 return log_error_errno(errno, "mount(%s) failed: %m", where);
17fe0523 1012
d6797c92
LP
1013 if (ro) {
1014 r = bind_remount_recursive(where, true);
f647962d
MS
1015 if (r < 0)
1016 return log_error_errno(r, "Read-Only bind mount failed: %m");
17fe0523
LP
1017 }
1018 }
1019
1020 return 0;
1021}
1022
b12afc8c
LP
1023static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1024 char *to;
1025 int r;
1026
63c372cb 1027 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
b12afc8c
LP
1028
1029 r = path_is_mount_point(to, false);
1030 if (r < 0)
1031 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1032 if (r > 0)
1033 return 0;
1034
1035 mkdir_p(to, 0755);
1036
c0534580
LP
1037 /* The superblock mount options of the mount point need to be
1038 * identical to the hosts', and hence writable... */
1039 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
b12afc8c
LP
1040 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1041
c0534580
LP
1042 /* ... hence let's only make the bind mount read-only, not the
1043 * superblock. */
1044 if (read_only) {
1045 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1046 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1047 }
b12afc8c
LP
1048 return 1;
1049}
1050
1051static int mount_cgroup(const char *dest) {
1052 _cleanup_set_free_free_ Set *controllers = NULL;
1053 _cleanup_free_ char *own_cgroup_path = NULL;
1054 const char *cgroup_root, *systemd_root, *systemd_own;
1055 int r;
1056
1057 controllers = set_new(&string_hash_ops);
1058 if (!controllers)
1059 return log_oom();
1060
1061 r = cg_kernel_controllers(controllers);
1062 if (r < 0)
1063 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1064
1065 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1066 if (r < 0)
1067 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1068
63c372cb 1069 cgroup_root = strjoina(dest, "/sys/fs/cgroup");
b12afc8c
LP
1070 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1071 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1072
1073 for (;;) {
1074 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1075
1076 controller = set_steal_first(controllers);
1077 if (!controller)
1078 break;
1079
1080 origin = strappend("/sys/fs/cgroup/", controller);
1081 if (!origin)
1082 return log_oom();
1083
1084 r = readlink_malloc(origin, &combined);
1085 if (r == -EINVAL) {
1086 /* Not a symbolic link, but directly a single cgroup hierarchy */
1087
1088 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1089 if (r < 0)
1090 return r;
1091
1092 } else if (r < 0)
1093 return log_error_errno(r, "Failed to read link %s: %m", origin);
1094 else {
1095 _cleanup_free_ char *target = NULL;
1096
1097 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1098 if (!target)
1099 return log_oom();
1100
1101 /* A symbolic link, a combination of controllers in one hierarchy */
1102
1103 if (!filename_is_valid(combined)) {
1104 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1105 continue;
1106 }
1107
1108 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1109 if (r < 0)
1110 return r;
1111
1112 if (symlink(combined, target) < 0)
83521414 1113 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
b12afc8c
LP
1114 }
1115 }
1116
c0534580 1117 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
b12afc8c
LP
1118 if (r < 0)
1119 return r;
1120
1121 /* Make our own cgroup a (writable) bind mount */
63c372cb 1122 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
b12afc8c
LP
1123 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1124 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1125
1126 /* And then remount the systemd cgroup root read-only */
63c372cb 1127 systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
b12afc8c
LP
1128 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1129 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1130
1131 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1132 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1133
1134 return 0;
1135}
1136
06c17c39
LP
1137static int mount_tmpfs(const char *dest) {
1138 char **i, **o;
1139
1140 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1141 _cleanup_free_ char *where = NULL;
79d80fc1 1142 int r;
06c17c39
LP
1143
1144 where = strappend(dest, *i);
1145 if (!where)
1146 return log_oom();
1147
79d80fc1 1148 r = mkdir_label(where, 0755);
04a91939
LP
1149 if (r < 0 && r != -EEXIST)
1150 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
06c17c39 1151
4a62c710
MS
1152 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1153 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
06c17c39
LP
1154 }
1155
1156 return 0;
1157}
1158
e58a1277 1159static int setup_timezone(const char *dest) {
d4036145
LP
1160 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1161 char *z, *y;
1162 int r;
f8440af5 1163
e58a1277
LP
1164 assert(dest);
1165
1166 /* Fix the timezone, if possible */
d4036145
LP
1167 r = readlink_malloc("/etc/localtime", &p);
1168 if (r < 0) {
1169 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1170 return 0;
1171 }
1172
1173 z = path_startswith(p, "../usr/share/zoneinfo/");
1174 if (!z)
1175 z = path_startswith(p, "/usr/share/zoneinfo/");
1176 if (!z) {
1177 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1178 return 0;
1179 }
1180
04bc4a3f
LP
1181 where = strappend(dest, "/etc/localtime");
1182 if (!where)
0d0f0c50 1183 return log_oom();
715ac17a 1184
d4036145
LP
1185 r = readlink_malloc(where, &q);
1186 if (r >= 0) {
1187 y = path_startswith(q, "../usr/share/zoneinfo/");
1188 if (!y)
1189 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1190
d4036145
LP
1191 /* Already pointing to the right place? Then do nothing .. */
1192 if (y && streq(y, z))
1193 return 0;
1194 }
1195
1196 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1197 if (!check)
0d0f0c50 1198 return log_oom();
4d1c38b8 1199
d4036145
LP
1200 if (access(check, F_OK) < 0) {
1201 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1202 return 0;
1203 }
68fb0892 1204
d4036145
LP
1205 what = strappend("../usr/share/zoneinfo/", z);
1206 if (!what)
1207 return log_oom();
1208
79d80fc1
TG
1209 r = mkdir_parents(where, 0755);
1210 if (r < 0) {
da927ba9 1211 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
79d80fc1
TG
1212
1213 return 0;
1214 }
1215
1216 r = unlink(where);
1217 if (r < 0 && errno != ENOENT) {
56f64d95 1218 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1219
1220 return 0;
1221 }
4d9f07b4 1222
d4036145 1223 if (symlink(what, where) < 0) {
56f64d95 1224 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1225 return 0;
1226 }
e58a1277
LP
1227
1228 return 0;
88213476
LP
1229}
1230
2547bb41 1231static int setup_resolv_conf(const char *dest) {
c8b32e11 1232 _cleanup_free_ char *where = NULL;
79d80fc1 1233 int r;
2547bb41
LP
1234
1235 assert(dest);
1236
1237 if (arg_private_network)
1238 return 0;
1239
1240 /* Fix resolv.conf, if possible */
04bc4a3f
LP
1241 where = strappend(dest, "/etc/resolv.conf");
1242 if (!where)
0d0f0c50 1243 return log_oom();
2547bb41 1244
77e63faf
LP
1245 /* We don't really care for the results of this really. If it
1246 * fails, it fails, but meh... */
79d80fc1
TG
1247 r = mkdir_parents(where, 0755);
1248 if (r < 0) {
da927ba9 1249 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
79d80fc1
TG
1250
1251 return 0;
1252 }
1253
f2068bcc 1254 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1255 if (r < 0) {
da927ba9 1256 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1257
1258 return 0;
1259 }
2547bb41
LP
1260
1261 return 0;
1262}
1263
4d9f07b4
LP
1264static int setup_volatile_state(const char *directory) {
1265 const char *p;
1266 int r;
1267
1268 assert(directory);
1269
1270 if (arg_volatile != VOLATILE_STATE)
1271 return 0;
1272
1273 /* --volatile=state means we simply overmount /var
1274 with a tmpfs, and the rest read-only. */
1275
1276 r = bind_remount_recursive(directory, true);
f647962d
MS
1277 if (r < 0)
1278 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4 1279
63c372cb 1280 p = strjoina(directory, "/var");
79d80fc1 1281 r = mkdir(p, 0755);
4a62c710
MS
1282 if (r < 0 && errno != EEXIST)
1283 return log_error_errno(errno, "Failed to create %s: %m", directory);
4d9f07b4 1284
4a62c710
MS
1285 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1286 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
1287
1288 return 0;
1289}
1290
1291static int setup_volatile(const char *directory) {
1292 bool tmpfs_mounted = false, bind_mounted = false;
1293 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1294 const char *f, *t;
1295 int r;
1296
1297 assert(directory);
1298
1299 if (arg_volatile != VOLATILE_YES)
1300 return 0;
1301
1302 /* --volatile=yes means we mount a tmpfs to the root dir, and
1303 the original /usr to use inside it, and that read-only. */
1304
4a62c710
MS
1305 if (!mkdtemp(template))
1306 return log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4
LP
1307
1308 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
56f64d95 1309 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
1310 r = -errno;
1311 goto fail;
1312 }
1313
1314 tmpfs_mounted = true;
1315
63c372cb
LP
1316 f = strjoina(directory, "/usr");
1317 t = strjoina(template, "/usr");
4d9f07b4 1318
79d80fc1
TG
1319 r = mkdir(t, 0755);
1320 if (r < 0 && errno != EEXIST) {
56f64d95 1321 log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
1322 r = -errno;
1323 goto fail;
1324 }
1325
4d9f07b4 1326 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
56f64d95 1327 log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
1328 r = -errno;
1329 goto fail;
1330 }
1331
1332 bind_mounted = true;
1333
1334 r = bind_remount_recursive(t, true);
1335 if (r < 0) {
da927ba9 1336 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
1337 goto fail;
1338 }
1339
1340 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
56f64d95 1341 log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1342 r = -errno;
1343 goto fail;
1344 }
1345
1346 rmdir(template);
1347
1348 return 0;
1349
1350fail:
1351 if (bind_mounted)
1352 umount(t);
1353 if (tmpfs_mounted)
1354 umount(template);
1355 rmdir(template);
1356 return r;
1357}
1358
9f24adc2
LP
1359static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1360
1361 snprintf(s, 37,
1362 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1363 SD_ID128_FORMAT_VAL(id));
1364
1365 return s;
1366}
1367
04bc4a3f 1368static int setup_boot_id(const char *dest) {
7fd1b19b 1369 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 1370 sd_id128_t rnd = {};
04bc4a3f
LP
1371 char as_uuid[37];
1372 int r;
1373
1374 assert(dest);
1375
eb91eb18
LP
1376 if (arg_share_system)
1377 return 0;
1378
04bc4a3f
LP
1379 /* Generate a new randomized boot ID, so that each boot-up of
1380 * the container gets a new one */
1381
1382 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 1383 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
1384 if (!from || !to)
1385 return log_oom();
04bc4a3f
LP
1386
1387 r = sd_id128_randomize(&rnd);
f647962d
MS
1388 if (r < 0)
1389 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1390
9f24adc2 1391 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1392
574d5f2d 1393 r = write_string_file(from, as_uuid);
f647962d
MS
1394 if (r < 0)
1395 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f
LP
1396
1397 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
56f64d95 1398 log_error_errno(errno, "Failed to bind mount boot id: %m");
04bc4a3f 1399 r = -errno;
10d18763 1400 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
56f64d95 1401 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1402
1403 unlink(from);
04bc4a3f
LP
1404 return r;
1405}
1406
e58a1277 1407static int copy_devnodes(const char *dest) {
88213476
LP
1408
1409 static const char devnodes[] =
1410 "null\0"
1411 "zero\0"
1412 "full\0"
1413 "random\0"
1414 "urandom\0"
85614d66
TG
1415 "tty\0"
1416 "net/tun\0";
88213476
LP
1417
1418 const char *d;
e58a1277 1419 int r = 0;
7fd1b19b 1420 _cleanup_umask_ mode_t u;
a258bf26
LP
1421
1422 assert(dest);
124640f1
LP
1423
1424 u = umask(0000);
88213476
LP
1425
1426 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1427 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1428 struct stat st;
88213476 1429
7f112f50
LP
1430 from = strappend("/dev/", d);
1431 to = strjoin(dest, "/dev/", d, NULL);
1432 if (!from || !to)
1433 return log_oom();
88213476
LP
1434
1435 if (stat(from, &st) < 0) {
1436
4a62c710
MS
1437 if (errno != ENOENT)
1438 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1439
a258bf26 1440 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1441
ed8b7a3e 1442 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 1443 return -EIO;
a258bf26 1444
85614d66
TG
1445 } else {
1446 r = mkdir_parents(to, 0775);
1447 if (r < 0) {
da927ba9 1448 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
85614d66
TG
1449 return -r;
1450 }
a258bf26 1451
4a62c710 1452 if (mknod(to, st.st_mode, st.st_rdev) < 0)
080e7832 1453 return log_error_errno(errno, "mknod(%s) failed: %m", to);
6278cf60
LP
1454
1455 if (arg_userns && arg_uid_shift != UID_INVALID)
1456 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1457 return log_error_errno(errno, "chown() of device node %s failed: %m", to);
88213476 1458 }
88213476
LP
1459 }
1460
e58a1277
LP
1461 return r;
1462}
88213476 1463
f2d88580
LP
1464static int setup_ptmx(const char *dest) {
1465 _cleanup_free_ char *p = NULL;
1466
1467 p = strappend(dest, "/dev/ptmx");
1468 if (!p)
1469 return log_oom();
1470
4a62c710
MS
1471 if (symlink("pts/ptmx", p) < 0)
1472 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
f2d88580 1473
6278cf60
LP
1474 if (arg_userns && arg_uid_shift != UID_INVALID)
1475 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1476 return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
1477
f2d88580
LP
1478 return 0;
1479}
1480
e58a1277 1481static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1482 _cleanup_umask_ mode_t u;
1483 const char *to;
e58a1277 1484 struct stat st;
e58a1277 1485 int r;
e58a1277
LP
1486
1487 assert(dest);
1488 assert(console);
1489
1490 u = umask(0000);
1491
4a62c710
MS
1492 if (stat("/dev/null", &st) < 0)
1493 return log_error_errno(errno, "Failed to stat /dev/null: %m");
88213476 1494
e58a1277 1495 r = chmod_and_chown(console, 0600, 0, 0);
f647962d
MS
1496 if (r < 0)
1497 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1498
a258bf26
LP
1499 /* We need to bind mount the right tty to /dev/console since
1500 * ptys can only exist on pts file systems. To have something
eb0f0863
LP
1501 * to bind mount things on we create a device node first, and
1502 * use /dev/null for that since we the cgroups device policy
1503 * allows us to create that freely, while we cannot create
1504 * /dev/console. (Note that the major minor doesn't actually
1505 * matter here, since we mount it over anyway). */
a258bf26 1506
63c372cb 1507 to = strjoina(dest, "/dev/console");
4a62c710
MS
1508 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1509 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
a258bf26 1510
4a62c710
MS
1511 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1512 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1513
25ea79fe 1514 return 0;
e58a1277
LP
1515}
1516
1517static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1518 _cleanup_free_ char *from = NULL, *to = NULL;
7fd1b19b 1519 _cleanup_umask_ mode_t u;
6d0b55c2 1520 int r, fd, k;
e58a1277
LP
1521 union {
1522 struct cmsghdr cmsghdr;
1523 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1524 } control = {};
1525 struct msghdr mh = {
1526 .msg_control = &control,
1527 .msg_controllen = sizeof(control),
1528 };
e58a1277
LP
1529 struct cmsghdr *cmsg;
1530
1531 assert(dest);
1532 assert(kmsg_socket >= 0);
a258bf26 1533
e58a1277 1534 u = umask(0000);
a258bf26 1535
f1e5dfe2
LP
1536 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1537 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1538 * on the reading side behave very similar to /proc/kmsg,
1539 * their writing side behaves differently from /dev/kmsg in
1540 * that writing blocks when nothing is reading. In order to
1541 * avoid any problems with containers deadlocking due to this
1542 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1543 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1544 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1545 return log_oom();
e58a1277 1546
4a62c710
MS
1547 if (mkfifo(from, 0600) < 0)
1548 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
e58a1277
LP
1549
1550 r = chmod_and_chown(from, 0600, 0, 0);
f647962d
MS
1551 if (r < 0)
1552 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
e58a1277 1553
4a62c710
MS
1554 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1555 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1556
1557 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1558 if (fd < 0)
1559 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1560
e58a1277
LP
1561 cmsg = CMSG_FIRSTHDR(&mh);
1562 cmsg->cmsg_level = SOL_SOCKET;
1563 cmsg->cmsg_type = SCM_RIGHTS;
1564 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1565 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1566
1567 mh.msg_controllen = cmsg->cmsg_len;
1568
1569 /* Store away the fd in the socket, so that it stays open as
1570 * long as we run the child */
6d0b55c2 1571 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
03e334a1 1572 safe_close(fd);
e58a1277 1573
4a62c710
MS
1574 if (k < 0)
1575 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 1576
f1e5dfe2
LP
1577 /* And now make the FIFO unavailable as /dev/kmsg... */
1578 unlink(from);
25ea79fe 1579 return 0;
88213476
LP
1580}
1581
6d0b55c2
LP
1582static int send_rtnl(int send_fd) {
1583 union {
1584 struct cmsghdr cmsghdr;
1585 uint8_t buf[CMSG_SPACE(sizeof(int))];
1586 } control = {};
1587 struct msghdr mh = {
1588 .msg_control = &control,
1589 .msg_controllen = sizeof(control),
1590 };
1591 struct cmsghdr *cmsg;
1592 _cleanup_close_ int fd = -1;
1593 ssize_t k;
1594
1595 assert(send_fd >= 0);
1596
1597 if (!arg_expose_ports)
1598 return 0;
1599
1600 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1601 if (fd < 0)
1602 return log_error_errno(errno, "failed to allocate container netlink: %m");
1603
1604 cmsg = CMSG_FIRSTHDR(&mh);
1605 cmsg->cmsg_level = SOL_SOCKET;
1606 cmsg->cmsg_type = SCM_RIGHTS;
1607 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1608 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1609
1610 mh.msg_controllen = cmsg->cmsg_len;
1611
1612 /* Store away the fd in the socket, so that it stays open as
1613 * long as we run the child */
1614 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1615 if (k < 0)
1616 return log_error_errno(errno, "Failed to send netlink fd: %m");
1617
1618 return 0;
1619}
1620
1621static int flush_ports(union in_addr_union *exposed) {
1622 ExposePort *p;
1623 int r, af = AF_INET;
1624
1625 assert(exposed);
1626
1627 if (!arg_expose_ports)
1628 return 0;
1629
1630 if (in_addr_is_null(af, exposed))
1631 return 0;
1632
1633 log_debug("Lost IP address.");
1634
1635 LIST_FOREACH(ports, p, arg_expose_ports) {
1636 r = fw_add_local_dnat(false,
1637 af,
1638 p->protocol,
1639 NULL,
1640 NULL, 0,
1641 NULL, 0,
1642 p->host_port,
1643 exposed,
1644 p->container_port,
1645 NULL);
1646 if (r < 0)
1647 log_warning_errno(r, "Failed to modify firewall: %m");
1648 }
1649
1650 *exposed = IN_ADDR_NULL;
1651 return 0;
1652}
1653
1654static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1655 _cleanup_free_ struct local_address *addresses = NULL;
1656 _cleanup_free_ char *pretty = NULL;
1657 union in_addr_union new_exposed;
1658 ExposePort *p;
1659 bool add;
1660 int af = AF_INET, r;
1661
1662 assert(exposed);
1663
1664 /* Invoked each time an address is added or removed inside the
1665 * container */
1666
1667 if (!arg_expose_ports)
1668 return 0;
1669
1670 r = local_addresses(rtnl, 0, af, &addresses);
1671 if (r < 0)
1672 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1673
1674 add = r > 0 &&
1675 addresses[0].family == af &&
1676 addresses[0].scope < RT_SCOPE_LINK;
1677
1678 if (!add)
1679 return flush_ports(exposed);
1680
1681 new_exposed = addresses[0].address;
1682 if (in_addr_equal(af, exposed, &new_exposed))
1683 return 0;
1684
1685 in_addr_to_string(af, &new_exposed, &pretty);
1686 log_debug("New container IP is %s.", strna(pretty));
1687
1688 LIST_FOREACH(ports, p, arg_expose_ports) {
1689
1690 r = fw_add_local_dnat(true,
1691 af,
1692 p->protocol,
1693 NULL,
1694 NULL, 0,
1695 NULL, 0,
1696 p->host_port,
1697 &new_exposed,
1698 p->container_port,
1699 in_addr_is_null(af, exposed) ? NULL : exposed);
1700 if (r < 0)
1701 log_warning_errno(r, "Failed to modify firewall: %m");
1702 }
1703
1704 *exposed = new_exposed;
1705 return 0;
1706}
1707
1708static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1709 union in_addr_union *exposed = userdata;
1710
1711 assert(rtnl);
1712 assert(m);
1713 assert(exposed);
1714
1715 expose_ports(rtnl, exposed);
1716 return 0;
1717}
1718
1719static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1720 union {
1721 struct cmsghdr cmsghdr;
1722 uint8_t buf[CMSG_SPACE(sizeof(int))];
1723 } control = {};
1724 struct msghdr mh = {
1725 .msg_control = &control,
1726 .msg_controllen = sizeof(control),
1727 };
1728 struct cmsghdr *cmsg;
1729 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1730 int fd, r;
1731 ssize_t k;
1732
1733 assert(event);
1734 assert(recv_fd >= 0);
1735 assert(ret);
1736
1737 if (!arg_expose_ports)
1738 return 0;
1739
1740 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1741 if (k < 0)
1742 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1743
1744 cmsg = CMSG_FIRSTHDR(&mh);
1745 assert(cmsg->cmsg_level == SOL_SOCKET);
1746 assert(cmsg->cmsg_type == SCM_RIGHTS);
657bdca9 1747 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
6d0b55c2
LP
1748 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1749
1750 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1751 if (r < 0) {
1752 safe_close(fd);
1753 return log_error_errno(r, "Failed to create rtnl object: %m");
1754 }
1755
1756 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1757 if (r < 0)
1758 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1759
1760 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1761 if (r < 0)
1762 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1763
1764 r = sd_rtnl_attach_event(rtnl, event, 0);
1765 if (r < 0)
1766 return log_error_errno(r, "Failed to add to even loop: %m");
1767
1768 *ret = rtnl;
1769 rtnl = NULL;
1770
1771 return 0;
1772}
1773
3a74cea5 1774static int setup_hostname(void) {
3a74cea5 1775
eb91eb18
LP
1776 if (arg_share_system)
1777 return 0;
1778
605f81a8 1779 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1780 return -errno;
3a74cea5 1781
7027ff61 1782 return 0;
3a74cea5
LP
1783}
1784
57fb9fb5 1785static int setup_journal(const char *directory) {
4d680aee 1786 sd_id128_t machine_id, this_id;
7fd1b19b 1787 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1788 char *id;
57fb9fb5
LP
1789 int r;
1790
df9a75e4
LP
1791 /* Don't link journals in ephemeral mode */
1792 if (arg_ephemeral)
1793 return 0;
1794
57fb9fb5 1795 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1796 if (!p)
1797 return log_oom();
57fb9fb5
LP
1798
1799 r = read_one_line_file(p, &b);
27407a01
ZJS
1800 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1801 return 0;
f647962d
MS
1802 else if (r < 0)
1803 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
57fb9fb5 1804
27407a01
ZJS
1805 id = strstrip(b);
1806 if (isempty(id) && arg_link_journal == LINK_AUTO)
1807 return 0;
57fb9fb5 1808
27407a01
ZJS
1809 /* Verify validity */
1810 r = sd_id128_from_string(id, &machine_id);
f647962d
MS
1811 if (r < 0)
1812 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
57fb9fb5 1813
4d680aee 1814 r = sd_id128_get_machine(&this_id);
f647962d
MS
1815 if (r < 0)
1816 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1817
1818 if (sd_id128_equal(machine_id, this_id)) {
1819 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1820 "Host and machine ids are equal (%s): refusing to link journals", id);
1821 if (arg_link_journal == LINK_AUTO)
1822 return 0;
df9a75e4 1823 return -EEXIST;
4d680aee
ZJS
1824 }
1825
1826 if (arg_link_journal == LINK_NO)
1827 return 0;
1828
57fb9fb5 1829 free(p);
27407a01
ZJS
1830 p = strappend("/var/log/journal/", id);
1831 q = strjoin(directory, "/var/log/journal/", id, NULL);
1832 if (!p || !q)
1833 return log_oom();
1834
1835 if (path_is_mount_point(p, false) > 0) {
1836 if (arg_link_journal != LINK_AUTO) {
1837 log_error("%s: already a mount point, refusing to use for journal", p);
1838 return -EEXIST;
1839 }
1840
1841 return 0;
57fb9fb5
LP
1842 }
1843
27407a01 1844 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1845 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1846 log_error("%s: already a mount point, refusing to use for journal", q);
1847 return -EEXIST;
57fb9fb5
LP
1848 }
1849
27407a01 1850 return 0;
57fb9fb5
LP
1851 }
1852
1853 r = readlink_and_make_absolute(p, &d);
1854 if (r >= 0) {
1855 if ((arg_link_journal == LINK_GUEST ||
1856 arg_link_journal == LINK_AUTO) &&
1857 path_equal(d, q)) {
1858
27407a01
ZJS
1859 r = mkdir_p(q, 0755);
1860 if (r < 0)
56f64d95 1861 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1862 return 0;
57fb9fb5
LP
1863 }
1864
4a62c710
MS
1865 if (unlink(p) < 0)
1866 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1867 } else if (r == -EINVAL) {
1868
1869 if (arg_link_journal == LINK_GUEST &&
1870 rmdir(p) < 0) {
1871
27407a01
ZJS
1872 if (errno == ENOTDIR) {
1873 log_error("%s already exists and is neither a symlink nor a directory", p);
1874 return r;
1875 } else {
56f64d95 1876 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1877 return -errno;
57fb9fb5 1878 }
57fb9fb5
LP
1879 }
1880 } else if (r != -ENOENT) {
56f64d95 1881 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1882 return r;
57fb9fb5
LP
1883 }
1884
1885 if (arg_link_journal == LINK_GUEST) {
1886
1887 if (symlink(q, p) < 0) {
574edc90 1888 if (arg_link_journal_try) {
56f64d95 1889 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1890 return 0;
1891 } else {
56f64d95 1892 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1893 return -errno;
1894 }
57fb9fb5
LP
1895 }
1896
27407a01
ZJS
1897 r = mkdir_p(q, 0755);
1898 if (r < 0)
56f64d95 1899 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1900 return 0;
57fb9fb5
LP
1901 }
1902
1903 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1904 /* don't create parents here -- if the host doesn't have
1905 * permanent journal set up, don't force it here */
1906 r = mkdir(p, 0755);
57fb9fb5 1907 if (r < 0) {
574edc90 1908 if (arg_link_journal_try) {
56f64d95 1909 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1910 return 0;
1911 } else {
56f64d95 1912 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1913 return r;
1914 }
57fb9fb5
LP
1915 }
1916
27407a01
ZJS
1917 } else if (access(p, F_OK) < 0)
1918 return 0;
57fb9fb5 1919
cdb2b9d0
LP
1920 if (dir_is_empty(q) == 0)
1921 log_warning("%s is not empty, proceeding anyway.", q);
1922
57fb9fb5
LP
1923 r = mkdir_p(q, 0755);
1924 if (r < 0) {
56f64d95 1925 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1926 return r;
57fb9fb5
LP
1927 }
1928
4a62c710
MS
1929 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1930 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1931
27407a01 1932 return 0;
57fb9fb5
LP
1933}
1934
88213476 1935static int drop_capabilities(void) {
5076f0cc 1936 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1937}
1938
5aa4bb6b 1939static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 1940 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
24996861 1941 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
1942 int r;
1943
eb91eb18
LP
1944 if (!arg_register)
1945 return 0;
1946
1c03020c 1947 r = sd_bus_default_system(&bus);
f647962d
MS
1948 if (r < 0)
1949 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 1950
89f7c846
LP
1951 if (arg_keep_unit) {
1952 r = sd_bus_call_method(
1953 bus,
1954 "org.freedesktop.machine1",
1955 "/org/freedesktop/machine1",
1956 "org.freedesktop.machine1.Manager",
5aa4bb6b 1957 "RegisterMachineWithNetwork",
89f7c846
LP
1958 &error,
1959 NULL,
5aa4bb6b 1960 "sayssusai",
89f7c846
LP
1961 arg_machine,
1962 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1963 "nspawn",
1964 "container",
1965 (uint32_t) pid,
5aa4bb6b
LP
1966 strempty(arg_directory),
1967 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 1968 } else {
9457ac5b 1969 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
f36933fe 1970 char **i;
9457ac5b
LP
1971
1972 r = sd_bus_message_new_method_call(
89f7c846 1973 bus,
9457ac5b 1974 &m,
89f7c846
LP
1975 "org.freedesktop.machine1",
1976 "/org/freedesktop/machine1",
1977 "org.freedesktop.machine1.Manager",
5aa4bb6b 1978 "CreateMachineWithNetwork");
f647962d 1979 if (r < 0)
f36933fe 1980 return bus_log_create_error(r);
9457ac5b
LP
1981
1982 r = sd_bus_message_append(
1983 m,
5aa4bb6b 1984 "sayssusai",
89f7c846
LP
1985 arg_machine,
1986 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1987 "nspawn",
1988 "container",
1989 (uint32_t) pid,
5aa4bb6b
LP
1990 strempty(arg_directory),
1991 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d 1992 if (r < 0)
f36933fe 1993 return bus_log_create_error(r);
9457ac5b
LP
1994
1995 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d 1996 if (r < 0)
f36933fe 1997 return bus_log_create_error(r);
9457ac5b
LP
1998
1999 if (!isempty(arg_slice)) {
2000 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d 2001 if (r < 0)
f36933fe 2002 return bus_log_create_error(r);
9457ac5b
LP
2003 }
2004
2005 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d 2006 if (r < 0)
f36933fe 2007 return bus_log_create_error(r);
9457ac5b 2008
63cc4c31 2009 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
2010 /* Allow the container to
2011 * access and create the API
2012 * device nodes, so that
2013 * PrivateDevices= in the
2014 * container can work
2015 * fine */
2016 "/dev/null", "rwm",
2017 "/dev/zero", "rwm",
2018 "/dev/full", "rwm",
2019 "/dev/random", "rwm",
2020 "/dev/urandom", "rwm",
2021 "/dev/tty", "rwm",
864e1706 2022 "/dev/net/tun", "rwm",
9457ac5b
LP
2023 /* Allow the container
2024 * access to ptys. However,
2025 * do not permit the
2026 * container to ever create
2027 * these device nodes. */
2028 "/dev/pts/ptmx", "rw",
63cc4c31 2029 "char-pts", "rw");
f647962d
MS
2030 if (r < 0)
2031 return log_error_errno(r, "Failed to add device whitelist: %m");
9457ac5b 2032
f36933fe
LP
2033 STRV_FOREACH(i, arg_property) {
2034 r = sd_bus_message_open_container(m, 'r', "sv");
2035 if (r < 0)
2036 return bus_log_create_error(r);
2037
2038 r = bus_append_unit_property_assignment(m, *i);
2039 if (r < 0)
2040 return r;
2041
2042 r = sd_bus_message_close_container(m);
2043 if (r < 0)
2044 return bus_log_create_error(r);
2045 }
2046
9457ac5b 2047 r = sd_bus_message_close_container(m);
f647962d 2048 if (r < 0)
f36933fe 2049 return bus_log_create_error(r);
9457ac5b
LP
2050
2051 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
2052 }
2053
9444b1f2 2054 if (r < 0) {
1f0cd86b
LP
2055 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2056 return r;
2057 }
2058
2059 return 0;
2060}
2061
2062static int terminate_machine(pid_t pid) {
2063 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2064 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
24996861 2065 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
2066 const char *path;
2067 int r;
2068
eb91eb18
LP
2069 if (!arg_register)
2070 return 0;
2071
76b54375 2072 r = sd_bus_default_system(&bus);
f647962d
MS
2073 if (r < 0)
2074 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
2075
2076 r = sd_bus_call_method(
2077 bus,
2078 "org.freedesktop.machine1",
2079 "/org/freedesktop/machine1",
2080 "org.freedesktop.machine1.Manager",
2081 "GetMachineByPID",
2082 &error,
2083 &reply,
2084 "u",
2085 (uint32_t) pid);
2086 if (r < 0) {
2087 /* Note that the machine might already have been
2088 * cleaned up automatically, hence don't consider it a
2089 * failure if we cannot get the machine object. */
2090 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2091 return 0;
2092 }
2093
2094 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
2095 if (r < 0)
2096 return bus_log_parse_error(r);
9444b1f2 2097
1f0cd86b
LP
2098 r = sd_bus_call_method(
2099 bus,
2100 "org.freedesktop.machine1",
2101 path,
2102 "org.freedesktop.machine1.Machine",
2103 "Terminate",
2104 &error,
2105 NULL,
2106 NULL);
2107 if (r < 0) {
2108 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2109 return 0;
2110 }
2111
9444b1f2
LP
2112 return 0;
2113}
2114
db999e0f
LP
2115static int reset_audit_loginuid(void) {
2116 _cleanup_free_ char *p = NULL;
2117 int r;
2118
2119 if (arg_share_system)
2120 return 0;
2121
2122 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2123 if (r == -ENOENT)
db999e0f 2124 return 0;
f647962d
MS
2125 if (r < 0)
2126 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2127
2128 /* Already reset? */
2129 if (streq(p, "4294967295"))
2130 return 0;
2131
2132 r = write_string_file("/proc/self/loginuid", "4294967295");
2133 if (r < 0) {
2134 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2135 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2136 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2137 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2138 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 2139
db999e0f 2140 sleep(5);
77b6e194 2141 }
db999e0f
LP
2142
2143 return 0;
77b6e194
LP
2144}
2145
4f758c23
LP
2146#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2147#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
e867ceb6 2148#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 2149
a90e2305 2150static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
2151 uint8_t result[8];
2152 size_t l, sz;
a90e2305
LP
2153 uint8_t *v, *i;
2154 int r;
01dde061
TG
2155
2156 l = strlen(arg_machine);
2157 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
2158 if (idx > 0)
2159 sz += sizeof(idx);
a90e2305 2160
01dde061
TG
2161 v = alloca(sz);
2162
2163 /* fetch some persistent data unique to the host */
2164 r = sd_id128_get_machine((sd_id128_t*) v);
2165 if (r < 0)
2166 return r;
2167
2168 /* combine with some data unique (on this host) to this
2169 * container instance */
a90e2305
LP
2170 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2171 if (idx > 0) {
2172 idx = htole64(idx);
2173 memcpy(i, &idx, sizeof(idx));
2174 }
01dde061
TG
2175
2176 /* Let's hash the host machine ID plus the container name. We
2177 * use a fixed, but originally randomly created hash key here. */
4f758c23 2178 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
2179
2180 assert_cc(ETH_ALEN <= sizeof(result));
2181 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2182
2183 /* see eth_random_addr in the kernel */
2184 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2185 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2186
2187 return 0;
2188}
2189
5aa4bb6b 2190static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
69c79d3c 2191 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 2192 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4f758c23 2193 struct ether_addr mac_host, mac_container;
5aa4bb6b 2194 int r, i;
69c79d3c
LP
2195
2196 if (!arg_private_network)
2197 return 0;
2198
2199 if (!arg_network_veth)
2200 return 0;
2201
08af0da2
LP
2202 /* Use two different interface name prefixes depending whether
2203 * we are in bridge mode or not. */
c00524c9 2204 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 2205 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 2206
e867ceb6
LP
2207 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2208 if (r < 0)
2209 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 2210
e867ceb6
LP
2211 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2212 if (r < 0)
2213 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 2214
151b9b96 2215 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2216 if (r < 0)
2217 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 2218
151b9b96 2219 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2220 if (r < 0)
2221 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 2222
ab046dde 2223 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
2224 if (r < 0)
2225 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 2226
4f758c23 2227 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
2228 if (r < 0)
2229 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 2230
ee3a6a51 2231 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2232 if (r < 0)
2233 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2234
d8e538ec 2235 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
2236 if (r < 0)
2237 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2238
ee3a6a51 2239 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
2240 if (r < 0)
2241 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2242
ab046dde 2243 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
2244 if (r < 0)
2245 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 2246
4f758c23 2247 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
2248 if (r < 0)
2249 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 2250
ab046dde 2251 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2252 if (r < 0)
2253 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c
LP
2254
2255 r = sd_rtnl_message_close_container(m);
f647962d
MS
2256 if (r < 0)
2257 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2258
2259 r = sd_rtnl_message_close_container(m);
f647962d
MS
2260 if (r < 0)
2261 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2262
2263 r = sd_rtnl_message_close_container(m);
f647962d
MS
2264 if (r < 0)
2265 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2266
2267 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2268 if (r < 0)
2269 return log_error_errno(r, "Failed to add new veth interfaces: %m");
69c79d3c 2270
5aa4bb6b 2271 i = (int) if_nametoindex(iface_name);
4a62c710
MS
2272 if (i <= 0)
2273 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
2274
2275 *ifi = i;
2276
69c79d3c
LP
2277 return 0;
2278}
2279
5aa4bb6b 2280static int setup_bridge(const char veth_name[], int *ifi) {
ab046dde
TG
2281 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2282 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2283 int r, bridge;
2284
2285 if (!arg_private_network)
2286 return 0;
2287
2288 if (!arg_network_veth)
2289 return 0;
2290
2291 if (!arg_network_bridge)
2292 return 0;
2293
2294 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
2295 if (bridge <= 0)
2296 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 2297
5aa4bb6b
LP
2298 *ifi = bridge;
2299
151b9b96 2300 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2301 if (r < 0)
2302 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 2303
151b9b96 2304 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
2305 if (r < 0)
2306 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 2307
039dd4af 2308 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
2309 if (r < 0)
2310 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 2311
ab046dde 2312 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
2313 if (r < 0)
2314 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde
TG
2315
2316 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
2317 if (r < 0)
2318 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde
TG
2319
2320 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2321 if (r < 0)
2322 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
2323
2324 return 0;
2325}
2326
c74e630d
LP
2327static int parse_interface(struct udev *udev, const char *name) {
2328 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2329 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2330 int ifi;
2331
2332 ifi = (int) if_nametoindex(name);
4a62c710
MS
2333 if (ifi <= 0)
2334 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
2335
2336 sprintf(ifi_str, "n%i", ifi);
2337 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
2338 if (!d)
2339 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
2340
2341 if (udev_device_get_is_initialized(d) <= 0) {
2342 log_error("Network interface %s is not initialized yet.", name);
2343 return -EBUSY;
2344 }
2345
2346 return ifi;
2347}
2348
69c79d3c 2349static int move_network_interfaces(pid_t pid) {
7e227024 2350 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 2351 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
2352 char **i;
2353 int r;
2354
2355 if (!arg_private_network)
2356 return 0;
2357
2358 if (strv_isempty(arg_network_interfaces))
2359 return 0;
2360
151b9b96 2361 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2362 if (r < 0)
2363 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 2364
7e227024
LP
2365 udev = udev_new();
2366 if (!udev) {
2367 log_error("Failed to connect to udev.");
2368 return -ENOMEM;
2369 }
2370
aa28aefe 2371 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 2372 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 2373 int ifi;
aa28aefe 2374
c74e630d
LP
2375 ifi = parse_interface(udev, *i);
2376 if (ifi < 0)
2377 return ifi;
2378
3125b3ef 2379 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
2380 if (r < 0)
2381 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2382
c74e630d 2383 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2384 if (r < 0)
2385 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 2386
c74e630d 2387 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2388 if (r < 0)
2389 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 2390 }
7e227024 2391
c74e630d
LP
2392 return 0;
2393}
2394
2395static int setup_macvlan(pid_t pid) {
2396 _cleanup_udev_unref_ struct udev *udev = NULL;
2397 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
e867ceb6 2398 unsigned idx = 0;
c74e630d
LP
2399 char **i;
2400 int r;
2401
2402 if (!arg_private_network)
2403 return 0;
2404
2405 if (strv_isempty(arg_network_macvlan))
2406 return 0;
2407
2408 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2409 if (r < 0)
2410 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
2411
2412 udev = udev_new();
2413 if (!udev) {
2414 log_error("Failed to connect to udev.");
2415 return -ENOMEM;
2416 }
2417
2418 STRV_FOREACH(i, arg_network_macvlan) {
2419 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2420 _cleanup_free_ char *n = NULL;
e867ceb6 2421 struct ether_addr mac;
c74e630d
LP
2422 int ifi;
2423
2424 ifi = parse_interface(udev, *i);
2425 if (ifi < 0)
2426 return ifi;
2427
e867ceb6
LP
2428 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2429 if (r < 0)
2430 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2431
c74e630d 2432 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2433 if (r < 0)
2434 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2435
c74e630d 2436 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
2437 if (r < 0)
2438 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
2439
2440 n = strappend("mv-", *i);
2441 if (!n)
2442 return log_oom();
2443
2444 strshorten(n, IFNAMSIZ-1);
2445
2446 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
2447 if (r < 0)
2448 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 2449
e867ceb6
LP
2450 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2451 if (r < 0)
2452 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2453
aa28aefe 2454 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2455 if (r < 0)
2456 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d
LP
2457
2458 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2459 if (r < 0)
2460 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2461
d8e538ec 2462 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
2463 if (r < 0)
2464 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d
LP
2465
2466 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
2467 if (r < 0)
2468 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d
LP
2469
2470 r = sd_rtnl_message_close_container(m);
f647962d
MS
2471 if (r < 0)
2472 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d
LP
2473
2474 r = sd_rtnl_message_close_container(m);
f647962d
MS
2475 if (r < 0)
2476 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe
LP
2477
2478 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2479 if (r < 0)
2480 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
2481 }
2482
2483 return 0;
2484}
2485
4bbfe7ad
TG
2486static int setup_ipvlan(pid_t pid) {
2487 _cleanup_udev_unref_ struct udev *udev = NULL;
2488 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2489 char **i;
2490 int r;
2491
2492 if (!arg_private_network)
2493 return 0;
2494
2495 if (strv_isempty(arg_network_ipvlan))
2496 return 0;
2497
2498 r = sd_rtnl_open(&rtnl, 0);
2499 if (r < 0)
2500 return log_error_errno(r, "Failed to connect to netlink: %m");
2501
2502 udev = udev_new();
2503 if (!udev) {
2504 log_error("Failed to connect to udev.");
2505 return -ENOMEM;
2506 }
2507
2508 STRV_FOREACH(i, arg_network_ipvlan) {
2509 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2510 _cleanup_free_ char *n = NULL;
2511 int ifi;
2512
2513 ifi = parse_interface(udev, *i);
2514 if (ifi < 0)
2515 return ifi;
2516
2517 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2518 if (r < 0)
2519 return log_error_errno(r, "Failed to allocate netlink message: %m");
2520
2521 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2522 if (r < 0)
2523 return log_error_errno(r, "Failed to add netlink interface index: %m");
2524
2525 n = strappend("iv-", *i);
2526 if (!n)
2527 return log_oom();
2528
2529 strshorten(n, IFNAMSIZ-1);
2530
2531 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2532 if (r < 0)
2533 return log_error_errno(r, "Failed to add netlink interface name: %m");
2534
2535 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2536 if (r < 0)
2537 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2538
2539 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2540 if (r < 0)
2541 return log_error_errno(r, "Failed to open netlink container: %m");
2542
2543 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2544 if (r < 0)
2545 return log_error_errno(r, "Failed to open netlink container: %m");
2546
2547 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2548 if (r < 0)
2549 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2550
2551 r = sd_rtnl_message_close_container(m);
2552 if (r < 0)
2553 return log_error_errno(r, "Failed to close netlink container: %m");
2554
2555 r = sd_rtnl_message_close_container(m);
2556 if (r < 0)
2557 return log_error_errno(r, "Failed to close netlink container: %m");
2558
2559 r = sd_rtnl_call(rtnl, m, 0, NULL);
2560 if (r < 0)
2561 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2562 }
2563
2564 return 0;
2565}
2566
#ifdef HAVE_SECCOMP
/* Adds one "fail with EPERM" rule per entry of syscalls[] to the filter.
 * Entries for which seccomp_rule_add() returns -EFAULT (unknown syscall) are
 * skipped. Any other failure is logged and returned. */
static int block_syscalls(scmp_filter_ctx ctx, const int *syscalls, size_t n_syscalls) {
        size_t k;
        int r;

        for (k = 0; k < n_syscalls; k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), syscalls[k], 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0)
                        return log_error_errno(r, "Failed to block syscall: %m");
        }

        return 0;
}
#endif

/* Builds and loads the seccomp filter: everything is allowed by default,
 * except for a fixed list of syscalls that fail with EPERM, the kernel module
 * syscalls (unless CAP_SYS_MODULE is in arg_retain), and the creation of
 * NETLINK_AUDIT sockets, which fails with EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this is a no-op returning 0. Otherwise returns the
 * result of the last libseccomp call (negative on failure). */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        static const int kmod_blacklist[] = {
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
        };

        scmp_filter_ctx ctx;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        r = block_syscalls(ctx, blacklist, ELEMENTSOF(blacklist));
        if (r < 0)
                goto finish;

        /* If the CAP_SYS_MODULE capability is not requested then
         * we'll block the kmod syscalls too */
        if ((arg_retain & (1ULL << CAP_SYS_MODULE)) == 0) {
                r = block_syscalls(ctx, kmod_blacklist, ELEMENTSOF(kmod_blacklist));
                if (r < 0)
                        goto finish;
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Clear the filter's NNP control attribute before loading it. */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path, success or not. */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2663
785890ac
LP
2664static int setup_propagate(const char *root) {
2665 const char *p, *q;
2666
2667 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2668 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2669 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2670 (void) mkdir_p(p, 0600);
2671
63c372cb 2672 q = strjoina(root, "/run/systemd/nspawn/incoming");
785890ac
LP
2673 mkdir_parents(q, 0755);
2674 mkdir_p(q, 0600);
2675
2676 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2677 return log_error_errno(errno, "Failed to install propagation bind mount.");
2678
2679 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2680 return log_error_errno(errno, "Failed to make propagation mount read-only");
2681
2682 return 0;
2683}
2684
1b9e5b12
LP
2685static int setup_image(char **device_path, int *loop_nr) {
2686 struct loop_info64 info = {
2687 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2688 };
2689 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2690 _cleanup_free_ char* loopdev = NULL;
2691 struct stat st;
2692 int r, nr;
2693
2694 assert(device_path);
2695 assert(loop_nr);
ec16945e 2696 assert(arg_image);
1b9e5b12
LP
2697
2698 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2699 if (fd < 0)
2700 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 2701
4a62c710
MS
2702 if (fstat(fd, &st) < 0)
2703 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
2704
2705 if (S_ISBLK(st.st_mode)) {
2706 char *p;
2707
2708 p = strdup(arg_image);
2709 if (!p)
2710 return log_oom();
2711
2712 *device_path = p;
2713
2714 *loop_nr = -1;
2715
2716 r = fd;
2717 fd = -1;
2718
2719 return r;
2720 }
2721
2722 if (!S_ISREG(st.st_mode)) {
56f64d95 2723 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
2724 return -EINVAL;
2725 }
2726
2727 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
2728 if (control < 0)
2729 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
2730
2731 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
2732 if (nr < 0)
2733 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
2734
2735 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2736 return log_oom();
2737
2738 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2739 if (loop < 0)
2740 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 2741
4a62c710
MS
2742 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2743 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
2744
2745 if (arg_read_only)
2746 info.lo_flags |= LO_FLAGS_READ_ONLY;
2747
4a62c710
MS
2748 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2749 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
2750
2751 *device_path = loopdev;
2752 loopdev = NULL;
2753
2754 *loop_nr = nr;
2755
2756 r = loop;
2757 loop = -1;
2758
2759 return r;
2760}
2761
ada4799a
LP
/* Explanatory text that is appended to the error messages logged when no
 * usable partition table is found on the disk image. */
#define PARTITION_TABLE_BLURB                                                                   \
        "Note that the disk image needs to either contain only a single MBR partition of\n"    \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"        \
        "to be bootable with systemd-nspawn."
2768
1b9e5b12
LP
2769static int dissect_image(
2770 int fd,
727fd4fd
LP
2771 char **root_device, bool *root_device_rw,
2772 char **home_device, bool *home_device_rw,
2773 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
2774 bool *secondary) {
2775
2776#ifdef HAVE_BLKID
01dc33ce
ZJS
2777 int home_nr = -1, srv_nr = -1;
2778#ifdef GPT_ROOT_NATIVE
2779 int root_nr = -1;
2780#endif
2781#ifdef GPT_ROOT_SECONDARY
2782 int secondary_root_nr = -1;
2783#endif
f6c51a81 2784 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1b9e5b12
LP
2785 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2786 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2787 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2788 _cleanup_udev_unref_ struct udev *udev = NULL;
2789 struct udev_list_entry *first, *item;
f6c51a81 2790 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 2791 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
2792 const char *pttype = NULL;
2793 blkid_partlist pl;
2794 struct stat st;
c09ef2e4 2795 unsigned i;
1b9e5b12
LP
2796 int r;
2797
2798 assert(fd >= 0);
2799 assert(root_device);
2800 assert(home_device);
2801 assert(srv_device);
2802 assert(secondary);
ec16945e 2803 assert(arg_image);
1b9e5b12
LP
2804
2805 b = blkid_new_probe();
2806 if (!b)
2807 return log_oom();
2808
2809 errno = 0;
2810 r = blkid_probe_set_device(b, fd, 0, 0);
2811 if (r != 0) {
2812 if (errno == 0)
2813 return log_oom();
2814
56f64d95 2815 log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
2816 return -errno;
2817 }
2818
2819 blkid_probe_enable_partitions(b, 1);
2820 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2821
2822 errno = 0;
2823 r = blkid_do_safeprobe(b);
2824 if (r == -2 || r == 1) {
ada4799a
LP
2825 log_error("Failed to identify any partition table on\n"
2826 " %s\n"
2827 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
2828 return -EINVAL;
2829 } else if (r != 0) {
2830 if (errno == 0)
2831 errno = EIO;
56f64d95 2832 log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
2833 return -errno;
2834 }
2835
2836 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
2837
2838 is_gpt = streq_ptr(pttype, "gpt");
2839 is_mbr = streq_ptr(pttype, "dos");
2840
2841 if (!is_gpt && !is_mbr) {
2842 log_error("No GPT or MBR partition table discovered on\n"
2843 " %s\n"
2844 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
2845 return -EINVAL;
2846 }
2847
2848 errno = 0;
2849 pl = blkid_probe_get_partitions(b);
2850 if (!pl) {
2851 if (errno == 0)
2852 return log_oom();
2853
2854 log_error("Failed to list partitions of %s", arg_image);
2855 return -errno;
2856 }
2857
2858 udev = udev_new();
2859 if (!udev)
2860 return log_oom();
2861
4a62c710
MS
2862 if (fstat(fd, &st) < 0)
2863 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 2864
c09ef2e4
LP
2865 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2866 if (!d)
1b9e5b12
LP
2867 return log_oom();
2868
c09ef2e4
LP
2869 for (i = 0;; i++) {
2870 int n, m;
1b9e5b12 2871
c09ef2e4
LP
2872 if (i >= 10) {
2873 log_error("Kernel partitions never appeared.");
2874 return -ENXIO;
2875 }
2876
2877 e = udev_enumerate_new(udev);
2878 if (!e)
2879 return log_oom();
2880
2881 r = udev_enumerate_add_match_parent(e, d);
2882 if (r < 0)
2883 return log_oom();
2884
2885 r = udev_enumerate_scan_devices(e);
2886 if (r < 0)
2887 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2888
2889 /* Count the partitions enumerated by the kernel */
2890 n = 0;
2891 first = udev_enumerate_get_list_entry(e);
2892 udev_list_entry_foreach(item, first)
2893 n++;
2894
2895 /* Count the partitions enumerated by blkid */
2896 m = blkid_partlist_numof_partitions(pl);
2897 if (n == m + 1)
2898 break;
2899 if (n > m + 1) {
2900 log_error("blkid and kernel partition list do not match.");
2901 return -EIO;
2902 }
2903 if (n < m + 1) {
2904 unsigned j;
2905
2906 /* The kernel has probed fewer partitions than
2907 * blkid? Maybe the kernel prober is still
2908 * running or it got EBUSY because udev
2909 * already opened the device. Let's reprobe
2910 * the device, which is a synchronous call
2911 * that waits until probing is complete. */
2912
2913 for (j = 0; j < 20; j++) {
2914
2915 r = ioctl(fd, BLKRRPART, 0);
2916 if (r < 0)
2917 r = -errno;
2918 if (r >= 0 || r != -EBUSY)
2919 break;
2920
2921 /* If something else has the device
2922 * open, such as an udev rule, the
2923 * ioctl will return EBUSY. Since
2924 * there's no way to wait until it
2925 * isn't busy anymore, let's just wait
2926 * a bit, and try again.
2927 *
2928 * This is really something they
2929 * should fix in the kernel! */
2930
2931 usleep(50 * USEC_PER_MSEC);
2932 }
2933
2934 if (r < 0)
2935 return log_error_errno(r, "Failed to reread partition table: %m");
2936 }
2937
2938 e = udev_enumerate_unref(e);
2939 }
1b9e5b12
LP
2940
2941 first = udev_enumerate_get_list_entry(e);
2942 udev_list_entry_foreach(item, first) {
2943 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 2944 const char *node;
727fd4fd 2945 unsigned long long flags;
1b9e5b12
LP
2946 blkid_partition pp;
2947 dev_t qn;
2948 int nr;
2949
2950 errno = 0;
2951 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2952 if (!q) {
2953 if (!errno)
2954 errno = ENOMEM;
2955
56f64d95 2956 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
2957 return -errno;
2958 }
2959
2960 qn = udev_device_get_devnum(q);
2961 if (major(qn) == 0)
2962 continue;
2963
2964 if (st.st_rdev == qn)
2965 continue;
2966
2967 node = udev_device_get_devnode(q);
2968 if (!node)
2969 continue;
2970
2971 pp = blkid_partlist_devno_to_partition(pl, qn);
2972 if (!pp)
2973 continue;
2974
727fd4fd 2975 flags = blkid_partition_get_flags(pp);
727fd4fd 2976
1b9e5b12
LP
2977 nr = blkid_partition_get_partno(pp);
2978 if (nr < 0)
2979 continue;
2980
ada4799a
LP
2981 if (is_gpt) {
2982 sd_id128_t type_id;
2983 const char *stype;
1b9e5b12 2984
f6c51a81
LP
2985 if (flags & GPT_FLAG_NO_AUTO)
2986 continue;
2987
ada4799a
LP
2988 stype = blkid_partition_get_type_string(pp);
2989 if (!stype)
2990 continue;
1b9e5b12 2991
ada4799a 2992 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
2993 continue;
2994
ada4799a 2995 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 2996
ada4799a
LP
2997 if (home && nr >= home_nr)
2998 continue;
1b9e5b12 2999
ada4799a
LP
3000 home_nr = nr;
3001 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 3002
ada4799a
LP
3003 r = free_and_strdup(&home, node);
3004 if (r < 0)
3005 return log_oom();
727fd4fd 3006
ada4799a
LP
3007 } else if (sd_id128_equal(type_id, GPT_SRV)) {
3008
3009 if (srv && nr >= srv_nr)
3010 continue;
3011
3012 srv_nr = nr;
3013 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3014
3015 r = free_and_strdup(&srv, node);
3016 if (r < 0)
3017 return log_oom();
3018 }
1b9e5b12 3019#ifdef GPT_ROOT_NATIVE
ada4799a 3020 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 3021
ada4799a
LP
3022 if (root && nr >= root_nr)
3023 continue;
1b9e5b12 3024
ada4799a
LP
3025 root_nr = nr;
3026 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 3027
ada4799a
LP
3028 r = free_and_strdup(&root, node);
3029 if (r < 0)
3030 return log_oom();
3031 }
1b9e5b12
LP
3032#endif
3033#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
3034 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3035
3036 if (secondary_root && nr >= secondary_root_nr)
3037 continue;
3038
3039 secondary_root_nr = nr;
3040 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3041
3042 r = free_and_strdup(&secondary_root, node);
3043 if (r < 0)
3044 return log_oom();
3045 }
3046#endif
f6c51a81
LP
3047 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3048
3049 if (generic)
3050 multiple_generic = true;
3051 else {
3052 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3053
3054 r = free_and_strdup(&generic, node);
3055 if (r < 0)
3056 return log_oom();
3057 }
3058 }
ada4799a
LP
3059
3060 } else if (is_mbr) {
3061 int type;
1b9e5b12 3062
f6c51a81
LP
3063 if (flags != 0x80) /* Bootable flag */
3064 continue;
3065
ada4799a
LP
3066 type = blkid_partition_get_type(pp);
3067 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
3068 continue;
3069
f6c51a81
LP
3070 if (generic)
3071 multiple_generic = true;
3072 else {
3073 generic_rw = true;
727fd4fd 3074
f6c51a81
LP
3075 r = free_and_strdup(&root, node);
3076 if (r < 0)
3077 return log_oom();
3078 }
1b9e5b12 3079 }
1b9e5b12
LP
3080 }
3081
1b9e5b12
LP
3082 if (root) {
3083 *root_device = root;
3084 root = NULL;
727fd4fd
LP
3085
3086 *root_device_rw = root_rw;
1b9e5b12
LP
3087 *secondary = false;
3088 } else if (secondary_root) {
3089 *root_device = secondary_root;
3090 secondary_root = NULL;
727fd4fd
LP
3091
3092 *root_device_rw = secondary_root_rw;
1b9e5b12 3093 *secondary = true;
f6c51a81
LP
3094 } else if (generic) {
3095
3096 /* There were no partitions with precise meanings
3097 * around, but we found generic partitions. In this
3098 * case, if there's only one, we can go ahead and boot
3099 * it, otherwise we bail out, because we really cannot
3100 * make any sense of it. */
3101
3102 if (multiple_generic) {
3103 log_error("Identified multiple bootable Linux partitions on\n"
3104 " %s\n"
3105 PARTITION_TABLE_BLURB, arg_image);
3106 return -EINVAL;
3107 }
3108
3109 *root_device = generic;
3110 generic = NULL;
3111
3112 *root_device_rw = generic_rw;
3113 *secondary = false;
3114 } else {
3115 log_error("Failed to identify root partition in disk image\n"
3116 " %s\n"
3117 PARTITION_TABLE_BLURB, arg_image);
3118 return -EINVAL;
1b9e5b12
LP
3119 }
3120
3121 if (home) {
3122 *home_device = home;
3123 home = NULL;
727fd4fd
LP
3124
3125 *home_device_rw = home_rw;
1b9e5b12
LP
3126 }
3127
3128 if (srv) {
3129 *srv_device = srv;
3130 srv = NULL;
727fd4fd
LP
3131
3132 *srv_device_rw = srv_rw;
1b9e5b12
LP
3133 }
3134
3135 return 0;
3136#else
3137 log_error("--image= is not supported, compiled without blkid support.");
3138 return -ENOTSUP;
3139#endif
3140}
3141
/* Probes the block device "what" for its file system type via libblkid and
 * mounts it on "where" (with "directory" appended if non-NULL).
 *
 * The mount always carries MS_NODEV and is read-only unless "rw" is true and
 * the global arg_read_only is unset. LUKS volumes are refused.
 *
 * Returns 0 on success, a negative errno-style value on failure. When built
 * without HAVE_BLKID this always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        const char *fstype, *target;
        unsigned long mount_flags = MS_NODEV;
        int probed;

        assert(what);
        assert(where);

        /* A globally read-only container overrides the per-partition setting */
        if (arg_read_only || !rw)
                mount_flags |= MS_RDONLY;

        target = where;
        if (directory)
                target = strjoina(where, directory);

        /* libblkid does not reliably set errno, hence clear it first so
         * that we can tell "no errno" apart from a real one. */
        errno = 0;
        probe = blkid_new_probe_from_filename(what);
        if (!probe) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(probe, 1);
        blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

        errno = 0;
        probed = blkid_do_safeprobe(probe);
        if (probed == -1 || probed == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        }
        if (probed != 0) {
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(probe, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, target, fstype, mount_flags, NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3205
727fd4fd
LP
/* Mounts the partitions found in a disk image below "where":
 *
 *   root_device → where
 *   home_device → where/home
 *   srv_device  → where/srv
 *
 * Each device argument may be NULL, in which case it is skipped. The
 * matching *_rw flag selects a writable mount (mount_device() still forces
 * read-only when arg_read_only is set).
 *
 * Returns 0 on success, or the negative error of the first failing mount. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount below the directory we were handed rather than reaching
         * for the global arg_directory behind the caller's back. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
3235
3236static void loop_remove(int nr, int *image_fd) {
3237 _cleanup_close_ int control = -1;
e8c8ddcc 3238 int r;
1b9e5b12
LP
3239
3240 if (nr < 0)
3241 return;
3242
3243 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
3244 r = ioctl(*image_fd, LOOP_CLR_FD);
3245 if (r < 0)
5e4074aa 3246 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 3247 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
3248 }
3249
3250 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 3251 if (control < 0) {
56f64d95 3252 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 3253 return;
e8c8ddcc 3254 }
1b9e5b12 3255
e8c8ddcc
TG
3256 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3257 if (r < 0)
5e4074aa 3258 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
3259}
3260
0cb9fbcd
LP
3261static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3262 int pipe_fds[2];
3263 pid_t pid;
3264
3265 assert(database);
3266 assert(key);
3267 assert(rpid);
3268
4a62c710
MS
3269 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3270 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
3271
3272 pid = fork();
4a62c710
MS
3273 if (pid < 0)
3274 return log_error_errno(errno, "Failed to fork getent child: %m");
3275 else if (pid == 0) {
0cb9fbcd
LP
3276 int nullfd;
3277 char *empty_env = NULL;
3278
3279 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3280 _exit(EXIT_FAILURE);
3281
3282 if (pipe_fds[0] > 2)
03e334a1 3283 safe_close(pipe_fds[0]);
0cb9fbcd 3284 if (pipe_fds[1] > 2)
03e334a1 3285 safe_close(pipe_fds[1]);
0cb9fbcd
LP
3286
3287 nullfd = open("/dev/null", O_RDWR);
3288 if (nullfd < 0)
3289 _exit(EXIT_FAILURE);
3290
3291 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3292 _exit(EXIT_FAILURE);
3293
3294 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3295 _exit(EXIT_FAILURE);
3296
3297 if (nullfd > 2)
03e334a1 3298 safe_close(nullfd);
0cb9fbcd
LP
3299
3300 reset_all_signal_handlers();
3301 close_all_fds(NULL, 0);
3302
4de82926
MM
3303 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3304 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
3305 _exit(EXIT_FAILURE);
3306 }
3307
03e334a1 3308 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
3309
3310 *rpid = pid;
3311
3312 return pipe_fds[0];
3313}
3314
3315static int change_uid_gid(char **_home) {
a2a5291b
ZJS
3316 char line[LINE_MAX], *x, *u, *g, *h;
3317 const char *word, *state;
0cb9fbcd
LP
3318 _cleanup_free_ uid_t *uids = NULL;
3319 _cleanup_free_ char *home = NULL;
3320 _cleanup_fclose_ FILE *f = NULL;
3321 _cleanup_close_ int fd = -1;
3322 unsigned n_uids = 0;
70f539ca 3323 size_t sz = 0, l;
0cb9fbcd
LP
3324 uid_t uid;
3325 gid_t gid;
3326 pid_t pid;
3327 int r;
3328
3329 assert(_home);
3330
3331 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3332 /* Reset everything fully to 0, just in case */
3333
4a62c710
MS
3334 if (setgroups(0, NULL) < 0)
3335 return log_error_errno(errno, "setgroups() failed: %m");
0cb9fbcd 3336
4a62c710
MS
3337 if (setresgid(0, 0, 0) < 0)
3338 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3339
4a62c710
MS
3340 if (setresuid(0, 0, 0) < 0)
3341 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3342
3343 *_home = NULL;
3344 return 0;
3345 }
3346
3347 /* First, get user credentials */
3348 fd = spawn_getent("passwd", arg_user, &pid);
3349 if (fd < 0)
3350 return fd;
3351
3352 f = fdopen(fd, "r");
3353 if (!f)
3354 return log_oom();
3355 fd = -1;
3356
3357 if (!fgets(line, sizeof(line), f)) {
3358
3359 if (!ferror(f)) {
3360 log_error("Failed to resolve user %s.", arg_user);
3361 return -ESRCH;
3362 }
3363
56f64d95 3364 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3365 return -errno;
3366 }
3367
3368 truncate_nl(line);
3369
820d3acf 3370 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
3371
3372 x = strchr(line, ':');
3373 if (!x) {
3374 log_error("/etc/passwd entry has invalid user field.");
3375 return -EIO;
3376 }
3377
3378 u = strchr(x+1, ':');
3379 if (!u) {
3380 log_error("/etc/passwd entry has invalid password field.");
3381 return -EIO;
3382 }
3383
3384 u++;
3385 g = strchr(u, ':');
3386 if (!g) {
3387 log_error("/etc/passwd entry has invalid UID field.");
3388 return -EIO;
3389 }
3390
3391 *g = 0;
3392 g++;
3393 x = strchr(g, ':');
3394 if (!x) {
3395 log_error("/etc/passwd entry has invalid GID field.");
3396 return -EIO;
3397 }
3398
3399 *x = 0;
3400 h = strchr(x+1, ':');
3401 if (!h) {
3402 log_error("/etc/passwd entry has invalid GECOS field.");
3403 return -EIO;
3404 }
3405
3406 h++;
3407 x = strchr(h, ':');
3408 if (!x) {
3409 log_error("/etc/passwd entry has invalid home directory field.");
3410 return -EIO;
3411 }
3412
3413 *x = 0;
3414
3415 r = parse_uid(u, &uid);
3416 if (r < 0) {
3417 log_error("Failed to parse UID of user.");
3418 return -EIO;
3419 }
3420
3421 r = parse_gid(g, &gid);
3422 if (r < 0) {
3423 log_error("Failed to parse GID of user.");
3424 return -EIO;
3425 }
3426
3427 home = strdup(h);
3428 if (!home)
3429 return log_oom();
3430
3431 /* Second, get group memberships */
3432 fd = spawn_getent("initgroups", arg_user, &pid);
3433 if (fd < 0)
3434 return fd;
3435
3436 fclose(f);
3437 f = fdopen(fd, "r");
3438 if (!f)
3439 return log_oom();
3440 fd = -1;
3441
3442 if (!fgets(line, sizeof(line), f)) {
3443 if (!ferror(f)) {
3444 log_error("Failed to resolve user %s.", arg_user);
3445 return -ESRCH;
3446 }
3447
56f64d95 3448 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3449 return -errno;
3450 }
3451
3452 truncate_nl(line);
3453
820d3acf 3454 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
3455
3456 /* Skip over the username and subsequent separator whitespace */
3457 x = line;
3458 x += strcspn(x, WHITESPACE);
3459 x += strspn(x, WHITESPACE);
3460
a2a5291b 3461 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
3462 char c[l+1];
3463
a2a5291b 3464 memcpy(c, word, l);
0cb9fbcd
LP
3465 c[l] = 0;
3466
3467 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3468 return log_oom();
3469
3470 r = parse_uid(c, &uids[n_uids++]);
3471 if (r < 0) {
3472 log_error("Failed to parse group data from getent.");
3473 return -EIO;
3474 }
3475 }
3476
3477 r = mkdir_parents(home, 0775);
f647962d
MS
3478 if (r < 0)
3479 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
3480
3481 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
3482 if (r < 0 && r != -EEXIST)
3483 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd
LP
3484
3485 fchown(STDIN_FILENO, uid, gid);
3486 fchown(STDOUT_FILENO, uid, gid);
3487 fchown(STDERR_FILENO, uid, gid);
3488
4a62c710
MS
3489 if (setgroups(n_uids, uids) < 0)
3490 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 3491
4a62c710
MS
3492 if (setresgid(gid, gid, gid) < 0)
3493 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3494
4a62c710
MS
3495 if (setresuid(uid, uid, uid) < 0)
3496 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3497
3498 if (_home) {
3499 *_home = home;
3500 home = NULL;
3501 }
3502
3503 return 0;
3504}
3505
113cea80 3506/*
6d416b9c
LS
3507 * Return values:
3508 * < 0 : wait_for_terminate() failed to get the state of the
3509 * container, the container was terminated by a signal, or
3510 * failed for an unknown reason. No change is made to the
3511 * container argument.
3512 * > 0 : The program executed in the container terminated with an
3513 * error. The exit code of the program executed in the
919699ec
LP
3514 * container is returned. The container argument has been set
3515 * to CONTAINER_TERMINATED.
6d416b9c
LS
3516 * 0 : The container is being rebooted, has been shut down or exited
3517 * successfully. The container argument has been set to either
3518 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 3519 *
6d416b9c
LS
3520 * That is, success is indicated by a return value of zero, and an
3521 * error is indicated by a non-zero value.
113cea80
DH
3522 */
3523static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 3524 siginfo_t status;
919699ec 3525 int r;
113cea80
DH
3526
3527 r = wait_for_terminate(pid, &status);
f647962d
MS
3528 if (r < 0)
3529 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
3530
3531 switch (status.si_code) {
fddbb89c 3532
113cea80 3533 case CLD_EXITED:
919699ec
LP
3534 if (status.si_status == 0) {
3535 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 3536
fddbb89c 3537 } else
919699ec 3538 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 3539
919699ec
LP
3540 *container = CONTAINER_TERMINATED;
3541 return status.si_status;
113cea80
DH
3542
3543 case CLD_KILLED:
3544 if (status.si_status == SIGINT) {
113cea80 3545
919699ec 3546 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 3547 *container = CONTAINER_TERMINATED;
919699ec
LP
3548 return 0;
3549
113cea80 3550 } else if (status.si_status == SIGHUP) {
113cea80 3551
919699ec 3552 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 3553 *container = CONTAINER_REBOOTED;
919699ec 3554 return 0;
113cea80 3555 }
919699ec 3556
113cea80
DH
3557 /* CLD_KILLED fallthrough */
3558
3559 case CLD_DUMPED:
fddbb89c 3560 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 3561 return -EIO;
113cea80
DH
3562
3563 default:
fddbb89c 3564 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 3565 return -EIO;
113cea80
DH
3566 }
3567
3568 return r;
3569}
3570
e866af3a
DH
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts blocking calls instead of being
 * ignored. */
static void nop_handler(int sig) {
        /* nothing to do */
}
3572
023fb90b
LP
3573static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3574 pid_t pid;
3575
3576 pid = PTR_TO_UINT32(userdata);
3577 if (pid > 0) {
c6c8f6e2 3578 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
3579 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3580 sd_event_source_set_userdata(s, NULL);
3581 return 0;
3582 }
3583 }
3584
3585 sd_event_exit(sd_event_source_get_event(s), 0);
3586 return 0;
3587}
3588
ec16945e 3589static int determine_names(void) {
1b9cebf6 3590 int r;
ec16945e
LP
3591
3592 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3593 if (arg_machine) {
3594 _cleanup_(image_unrefp) Image *i = NULL;
3595
3596 r = image_find(arg_machine, &i);
3597 if (r < 0)
3598 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3599 else if (r == 0) {
3600 log_error("No image for machine '%s': %m", arg_machine);
3601 return -ENOENT;
3602 }
3603
aceac2f0 3604 if (i->type == IMAGE_RAW)
1b9cebf6
LP
3605 r = set_sanitized_path(&arg_image, i->path);
3606 else
3607 r = set_sanitized_path(&arg_directory, i->path);
3608 if (r < 0)
3609 return log_error_errno(r, "Invalid image directory: %m");
3610
3611 arg_read_only = arg_read_only || i->read_only;
3612 } else
ec16945e
LP
3613 arg_directory = get_current_dir_name();
3614
1b9cebf6
LP
3615 if (!arg_directory && !arg_machine) {
3616 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
3617 return -EINVAL;
3618 }
3619 }
3620
3621 if (!arg_machine) {
b9ba4dab
LP
3622 if (arg_directory && path_equal(arg_directory, "/"))
3623 arg_machine = gethostname_malloc();
3624 else
3625 arg_machine = strdup(basename(arg_image ?: arg_directory));
3626
ec16945e
LP
3627 if (!arg_machine)
3628 return log_oom();
3629
3630 hostname_cleanup(arg_machine, false);
3631 if (!machine_name_is_valid(arg_machine)) {
3632 log_error("Failed to determine machine name automatically, please use -M.");
3633 return -EINVAL;
3634 }
b9ba4dab
LP
3635
3636 if (arg_ephemeral) {
3637 char *b;
3638
3639 /* Add a random suffix when this is an
3640 * ephemeral machine, so that we can run many
3641 * instances at once without manually having
3642 * to specify -M each time. */
3643
3644 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3645 return log_oom();
3646
3647 free(arg_machine);
3648 arg_machine = b;
3649 }
ec16945e
LP
3650 }
3651
3652 return 0;
3653}
3654
6dac160c
LP
3655static int determine_uid_shift(void) {
3656 int r;
3657
3658 if (!arg_userns)
3659 return 0;
3660
3661 if (arg_uid_shift == UID_INVALID) {
3662 struct stat st;
3663
3664 r = stat(arg_directory, &st);
3665 if (r < 0)
3666 return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
3667
3668 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3669
3670 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3671 log_error("UID and GID base of %s don't match.", arg_directory);
3672 return -EINVAL;
3673 }
3674
3675 arg_uid_range = UINT32_C(0x10000);
3676 }
3677
3678 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3679 log_error("UID base too high for UID range.");
3680 return -EINVAL;
3681 }
3682
3683 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3684 return 0;
3685}
3686
88213476 3687int main(int argc, char *argv[]) {
69c79d3c 3688
611b312b 3689 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
727fd4fd 3690 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
63cc4c31 3691 _cleanup_close_ int master = -1, image_fd = -1;
69c79d3c 3692 _cleanup_fdset_free_ FDSet *fds = NULL;
ec16945e 3693 int r, n_fd_passed, loop_nr = -1;
1b9e5b12 3694 char veth_name[IFNAMSIZ];
ec16945e 3695 bool secondary = false, remove_subvol = false;
e866af3a 3696 sigset_t mask, mask_chld;
69c79d3c 3697 pid_t pid = 0;
ec16945e 3698 int ret = EXIT_SUCCESS;
6d0b55c2 3699 union in_addr_union exposed = {};
30535c16 3700 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
9c857b9d 3701 bool interactive;
88213476
LP
3702
3703 log_parse_environment();
3704 log_open();
3705
ec16945e
LP
3706 r = parse_argv(argc, argv);
3707 if (r <= 0)
88213476 3708 goto finish;
88213476 3709
ec16945e
LP
3710 r = determine_names();
3711 if (r < 0)
3712 goto finish;
7027ff61 3713
88213476
LP
3714 if (geteuid() != 0) {
3715 log_error("Need to be root.");
ec16945e 3716 r = -EPERM;
88213476
LP
3717 goto finish;
3718 }
3719
04d391da
LP
3720 if (sd_booted() <= 0) {
3721 log_error("Not running on a systemd system.");
ec16945e 3722 r = -EINVAL;
04d391da
LP
3723 goto finish;
3724 }
3725
1b9e5b12
LP
3726 log_close();
3727 n_fd_passed = sd_listen_fds(false);
3728 if (n_fd_passed > 0) {
ec16945e
LP
3729 r = fdset_new_listen_fds(&fds, false);
3730 if (r < 0) {
3731 log_error_errno(r, "Failed to collect file descriptors: %m");
1b9e5b12
LP
3732 goto finish;
3733 }
88213476 3734 }
1b9e5b12
LP
3735 fdset_close_others(fds);
3736 log_open();
88213476 3737
1b9e5b12 3738 if (arg_directory) {
ec16945e
LP
3739 assert(!arg_image);
3740
c4e34a61
LP
3741 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3742 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
ec16945e 3743 r = -EINVAL;
6b9132a9
LP
3744 goto finish;
3745 }
1b9e5b12 3746
30535c16 3747 if (arg_ephemeral) {
ec16945e
LP
3748 char *np;
3749
c4e34a61
LP
3750 /* If the specified path is a mount point we
3751 * generate the new snapshot immediately
3752 * inside it under a random name. However if
3753 * the specified is not a mount point we
3754 * create the new snapshot in the parent
3755 * directory, just next to it. */
3756 r = path_is_mount_point(arg_directory, false);
3757 if (r < 0) {
3758 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3759 goto finish;
3760 }
3761 if (r > 0)
3762 r = tempfn_random_child(arg_directory, &np);
3763 else
3764 r = tempfn_random(arg_directory, &np);
ec16945e
LP
3765 if (r < 0) {
3766 log_error_errno(r, "Failed to generate name for snapshot: %m");
3767 goto finish;
3768 }
3769
30535c16
LP
3770 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3771 if (r < 0) {
3772 log_error_errno(r, "Failed to lock %s: %m", np);
3773 goto finish;
3774 }
3775
ec16945e
LP
3776 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3777 if (r < 0) {
3778 free(np);
3779 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3780 goto finish;
3781 }
3782
3783 free(arg_directory);
3784 arg_directory = np;
3785
3786 remove_subvol = true;
30535c16
LP
3787
3788 } else {
3789 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3790 if (r == -EBUSY) {
3791 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3792 goto finish;
3793 }
3794 if (r < 0) {
3795 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3796 return r;
3797 }
3798
3799 if (arg_template) {
3800 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3801 if (r == -EEXIST) {
3802 if (!arg_quiet)
3803 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3804 } else if (r < 0) {
83521414 3805 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3806 goto finish;
3807 } else {
3808 if (!arg_quiet)
3809 log_info("Populated %s from template %s.", arg_directory, arg_template);
3810 }
3811 }
ec16945e
LP
3812 }
3813
1b9e5b12
LP
3814 if (arg_boot) {
3815 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3816 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3817 r = -EINVAL;
1b9e5b12
LP
3818 goto finish;
3819 }
3820 } else {
3821 const char *p;
3822
63c372cb 3823 p = strjoina(arg_directory,
1b9e5b12
LP
3824 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3825 if (access(p, F_OK) < 0) {
3826 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
ec16945e 3827 r = -EINVAL;
1b9e5b12 3828 goto finish;
1b9e5b12
LP
3829 }
3830 }
ec16945e 3831
6b9132a9 3832 } else {
1b9e5b12 3833 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3834
ec16945e
LP
3835 assert(arg_image);
3836 assert(!arg_template);
3837
30535c16
LP
3838 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3839 if (r == -EBUSY) {
3840 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3841 goto finish;
3842 }
3843 if (r < 0) {
3844 r = log_error_errno(r, "Failed to create image lock: %m");
3845 goto finish;
3846 }
3847
1b9e5b12 3848 if (!mkdtemp(template)) {
56f64d95 3849 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3850 r = -errno;
6b9132a9 3851 goto finish;
1b9e5b12 3852 }
6b9132a9 3853
1b9e5b12
LP
3854 arg_directory = strdup(template);
3855 if (!arg_directory) {
3856 r = log_oom();
3857 goto finish;
6b9132a9 3858 }
88213476 3859
1b9e5b12
LP
3860 image_fd = setup_image(&device_path, &loop_nr);
3861 if (image_fd < 0) {
3862 r = image_fd;
842f3b0f
LP
3863 goto finish;
3864 }
1b9e5b12 3865
4d9f07b4
LP
3866 r = dissect_image(image_fd,
3867 &root_device, &root_device_rw,
3868 &home_device, &home_device_rw,
3869 &srv_device, &srv_device_rw,
3870 &secondary);
1b9e5b12
LP
3871 if (r < 0)
3872 goto finish;
842f3b0f 3873 }
842f3b0f 3874
6dac160c
LP
3875 r = determine_uid_shift();
3876 if (r < 0)
3877 goto finish;
3878
9c857b9d
LP
3879 interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
3880
db7feb7e
LP
3881 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3882 if (master < 0) {
ec16945e 3883 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3884 goto finish;
3885 }
3886
611b312b
LP
3887 r = ptsname_malloc(master, &console);
3888 if (r < 0) {
3889 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3890 goto finish;
3891 }
3892
a258bf26 3893 if (unlockpt(master) < 0) {
ec16945e 3894 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3895 goto finish;
3896 }
3897
9c857b9d
LP
3898 if (!arg_quiet)
3899 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3900 arg_machine, arg_image ?: arg_directory);
3901
a258bf26
LP
3902 assert_se(sigemptyset(&mask) == 0);
3903 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3904 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3905
023fb90b
LP
3906 assert_se(sigemptyset(&mask_chld) == 0);
3907 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3908
d87be9b0 3909 for (;;) {
6d0b55c2 3910 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
113cea80 3911 ContainerStatus container_status;
7566e267 3912 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
e866af3a
DH
3913 struct sigaction sa = {
3914 .sa_handler = nop_handler,
3915 .sa_flags = SA_NOCLDSTOP,
3916 };
3917
7566e267 3918 r = barrier_create(&barrier);
a2da110b 3919 if (r < 0) {
da927ba9 3920 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3921 goto finish;
3922 }
3923
6d0b55c2
LP
3924 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3925 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3926 goto finish;
3927 }
3928
3929 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3930 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3931 goto finish;
3932 }
3933
e866af3a
DH
3934 /* Child can be killed before execv(), so handle SIGCHLD
3935 * in order to interrupt parent's blocking calls and
3936 * give it a chance to call wait() and terminate. */
3937 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3938 if (r < 0) {
ec16945e 3939 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3940 goto finish;
3941 }
3942
e866af3a
DH
3943 r = sigaction(SIGCHLD, &sa, NULL);
3944 if (r < 0) {
ec16945e 3945 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3946 goto finish;
3947 }
3948
60e1651a
KW
3949 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3950 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3951 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
3952 if (pid < 0) {
3953 if (errno == EINVAL)
ec16945e 3954 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3955 else
ec16945e 3956 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3957
d87be9b0
LP
3958 goto finish;
3959 }
a258bf26 3960
d87be9b0
LP
3961 if (pid == 0) {
3962 /* child */
0cb9fbcd 3963 _cleanup_free_ char *home = NULL;
5674767e 3964 unsigned n_env = 2;
d87be9b0 3965 const char *envp[] = {
e10a55fd 3966 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
3967 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3968 NULL, /* TERM */
3969 NULL, /* HOME */
3970 NULL, /* USER */
3971 NULL, /* LOGNAME */
3972 NULL, /* container_uuid */
842f3b0f
LP
3973 NULL, /* LISTEN_FDS */
3974 NULL, /* LISTEN_PID */
d87be9b0
LP
3975 NULL
3976 };
f4889f65 3977 char **env_use;
a258bf26 3978
a2da110b
DH
3979 barrier_set_role(&barrier, BARRIER_CHILD);
3980
5674767e
ZJS
3981 envp[n_env] = strv_find_prefix(environ, "TERM=");
3982 if (envp[n_env])
3983 n_env ++;
a258bf26 3984
03e334a1 3985 master = safe_close(master);
a258bf26 3986
03e334a1 3987 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3988 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
a258bf26 3989
d87be9b0 3990 reset_all_signal_handlers();
1b6d7fa7 3991 reset_signal_mask();
f5c1b9ee 3992
9c857b9d
LP
3993 if (interactive) {
3994 close_nointr(STDIN_FILENO);
3995 close_nointr(STDOUT_FILENO);
3996 close_nointr(STDERR_FILENO);
842f3b0f 3997
9c857b9d
LP
3998 r = open_terminal(console, O_RDWR);
3999 if (r != STDIN_FILENO) {
4000 if (r >= 0) {
4001 safe_close(r);
4002 r = -EINVAL;
4003 }
842f3b0f 4004
9c857b9d
LP
4005 log_error_errno(r, "Failed to open console: %m");
4006 _exit(EXIT_FAILURE);
4007 }
4008
4009 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4010 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
4011 log_error_errno(errno, "Failed to duplicate console: %m");
4012 _exit(EXIT_FAILURE);
4013 }
842f3b0f 4014 }
bc2f673e 4015
d87be9b0 4016 if (setsid() < 0) {
56f64d95 4017 log_error_errno(errno, "setsid() failed: %m");
a2da110b 4018 _exit(EXIT_FAILURE);
bc2f673e
LP
4019 }
4020
db999e0f 4021 if (reset_audit_loginuid() < 0)
a2da110b 4022 _exit(EXIT_FAILURE);
db999e0f 4023
d87be9b0 4024 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
56f64d95 4025 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
a2da110b 4026 _exit(EXIT_FAILURE);
d87be9b0 4027 }
e58a1277 4028
6dac160c
LP
4029 if (arg_private_network)
4030 loopback_setup();
4031
d87be9b0
LP
4032 /* Mark everything as slave, so that we still
4033 * receive mounts from the real root, but don't
4034 * propagate mounts to the real root. */
4035 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
56f64d95 4036 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
a2da110b 4037 _exit(EXIT_FAILURE);
d87be9b0 4038 }
04bc4a3f 4039
727fd4fd
LP
4040 if (mount_devices(arg_directory,
4041 root_device, root_device_rw,
4042 home_device, home_device_rw,
4043 srv_device, srv_device_rw) < 0)
a2da110b 4044 _exit(EXIT_FAILURE);
1b9e5b12 4045
d87be9b0
LP
4046 /* Turn directory into bind mount */
4047 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
56f64d95 4048 log_error_errno(errno, "Failed to make bind mount: %m");
a2da110b 4049 _exit(EXIT_FAILURE);
d87be9b0 4050 }
88213476 4051
4d9f07b4
LP
4052 r = setup_volatile(arg_directory);
4053 if (r < 0)
a2da110b 4054 _exit(EXIT_FAILURE);
4d9f07b4
LP
4055
4056 if (setup_volatile_state(arg_directory) < 0)
a2da110b 4057 _exit(EXIT_FAILURE);
4d9f07b4
LP
4058
4059 r = base_filesystem_create(arg_directory);
4060 if (r < 0)
a2da110b 4061 _exit(EXIT_FAILURE);
4d9f07b4 4062
d6797c92 4063 if (arg_read_only) {
ec16945e
LP
4064 r = bind_remount_recursive(arg_directory, true);
4065 if (r < 0) {
4066 log_error_errno(r, "Failed to make tree read-only: %m");
a2da110b 4067 _exit(EXIT_FAILURE);
d87be9b0 4068 }
d6797c92 4069 }
2547bb41 4070
d87be9b0 4071 if (mount_all(arg_directory) < 0)
a2da110b 4072 _exit(EXIT_FAILURE);
57fb9fb5 4073
d87be9b0 4074 if (copy_devnodes(arg_directory) < 0)
a2da110b 4075 _exit(EXIT_FAILURE);
a258bf26 4076
f2d88580 4077 if (setup_ptmx(arg_directory) < 0)
a2da110b 4078 _exit(EXIT_FAILURE);
f2d88580 4079
d87be9b0 4080 dev_setup(arg_directory);
88213476 4081
785890ac
LP
4082 if (setup_propagate(arg_directory) < 0)
4083 _exit(EXIT_FAILURE);
4084
28650077 4085 if (setup_seccomp() < 0)
a2da110b 4086 _exit(EXIT_FAILURE);
24fb1112 4087
d87be9b0 4088 if (setup_dev_console(arg_directory, console) < 0)
a2da110b 4089 _exit(EXIT_FAILURE);
88213476 4090
d87be9b0 4091 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
a2da110b 4092 _exit(EXIT_FAILURE);
03e334a1 4093 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 4094
6d0b55c2
LP
4095 if (send_rtnl(rtnl_socket_pair[1]) < 0)
4096 _exit(EXIT_FAILURE);
4097 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4098
b12afc8c
LP
4099 /* Tell the parent that we are ready, and that
4100 * it can cgroupify us so that we lack access
4101 * to certain devices and resources. */
6dac160c 4102 (void) barrier_place(&barrier); /* #1 */
b12afc8c 4103
d87be9b0 4104 if (setup_boot_id(arg_directory) < 0)
a2da110b 4105 _exit(EXIT_FAILURE);
a41fe3a2 4106
d87be9b0 4107 if (setup_timezone(arg_directory) < 0)
a2da110b 4108 _exit(EXIT_FAILURE);
88213476 4109
d87be9b0 4110 if (setup_resolv_conf(arg_directory) < 0)
a2da110b 4111 _exit(EXIT_FAILURE);
687d0825 4112
d87be9b0 4113 if (setup_journal(arg_directory) < 0)
a2da110b 4114 _exit(EXIT_FAILURE);
687d0825 4115
d6797c92 4116 if (mount_binds(arg_directory, arg_bind, false) < 0)
a2da110b 4117 _exit(EXIT_FAILURE);
17fe0523 4118
d6797c92 4119 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
a2da110b 4120 _exit(EXIT_FAILURE);
17fe0523 4121
06c17c39 4122 if (mount_tmpfs(arg_directory) < 0)
a2da110b 4123 _exit(EXIT_FAILURE);
06c17c39 4124
b12afc8c
LP
4125 /* Wait until we are cgroup-ified, so that we
4126 * can mount the right cgroup path writable */
6dac160c 4127 (void) barrier_place_and_sync(&barrier); /* #2 */
b12afc8c
LP
4128
4129 if (mount_cgroup(arg_directory) < 0)
4130 _exit(EXIT_FAILURE);
d96c1ecf 4131
d87be9b0 4132 if (chdir(arg_directory) < 0) {
56f64d95 4133 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
a2da110b 4134 _exit(EXIT_FAILURE);
687d0825
MV
4135 }
4136
d87be9b0 4137 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
56f64d95 4138 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
a2da110b 4139 _exit(EXIT_FAILURE);
687d0825
MV
4140 }
4141
d87be9b0 4142 if (chroot(".") < 0) {
56f64d95 4143 log_error_errno(errno, "chroot() failed: %m");
a2da110b 4144 _exit(EXIT_FAILURE);
687d0825
MV
4145 }
4146
d87be9b0 4147 if (chdir("/") < 0) {
56f64d95 4148 log_error_errno(errno, "chdir() failed: %m");
a2da110b 4149 _exit(EXIT_FAILURE);
687d0825
MV
4150 }
4151
6dac160c
LP
4152 if (arg_userns) {
4153 if (unshare(CLONE_NEWUSER) < 0) {
4154 log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4155 _exit(EXIT_FAILURE);
4156 }
d87be9b0 4157
6dac160c
LP
4158 /* Tell the parent, that it now can
4159 * write the UID map. */
4160 (void) barrier_place(&barrier); /* #3 */
4161
4162 /* Wait until the parent wrote the UID
4163 * map */
4164 (void) barrier_place_and_sync(&barrier); /* #4 */
4165 }
4166
4167 umask(0022);
d87be9b0
LP
4168
4169 if (drop_capabilities() < 0) {
56f64d95 4170 log_error_errno(errno, "drop_capabilities() failed: %m");
a2da110b 4171 _exit(EXIT_FAILURE);
687d0825 4172 }
687d0825 4173
6dac160c
LP
4174 setup_hostname();
4175
4176 if (arg_personality != 0xffffffffLU) {
4177 if (personality(arg_personality) < 0) {
4178 log_error_errno(errno, "personality() failed: %m");
4179 _exit(EXIT_FAILURE);
4180 }
4181 } else if (secondary) {
4182 if (personality(PER_LINUX32) < 0) {
4183 log_error_errno(errno, "personality() failed: %m");
4184 _exit(EXIT_FAILURE);
4185 }
4186 }
4187
4188#ifdef HAVE_SELINUX
4189 if (arg_selinux_context)
4190 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4191 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4192 _exit(EXIT_FAILURE);
4193 }
4194#endif
4195
0cb9fbcd
LP
4196 r = change_uid_gid(&home);
4197 if (r < 0)
a2da110b 4198 _exit(EXIT_FAILURE);
d87be9b0 4199
842f3b0f
LP
4200 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4201 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4202 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 4203 log_oom();
a2da110b 4204 _exit(EXIT_FAILURE);
144f0fc0 4205 }
687d0825 4206
9444b1f2 4207 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
4208 char as_uuid[37];
4209
4210 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f 4211 log_oom();
a2da110b 4212 _exit(EXIT_FAILURE);
842f3b0f
LP
4213 }
4214 }
4215
4216 if (fdset_size(fds) > 0) {
ec16945e
LP
4217 r = fdset_cloexec(fds, false);
4218 if (r < 0) {
4219 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
a2da110b 4220 _exit(EXIT_FAILURE);
842f3b0f
LP
4221 }
4222
4223 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 4224 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0 4225 log_oom();
a2da110b 4226 _exit(EXIT_FAILURE);
d87be9b0
LP
4227 }
4228 }
4229
f4889f65
LP
4230 if (!strv_isempty(arg_setenv)) {
4231 char **n;
4232
4233 n = strv_env_merge(2, envp, arg_setenv);
4234 if (!n) {
4235 log_oom();
a2da110b 4236 _exit(EXIT_FAILURE);
f4889f65
LP
4237 }
4238
4239 env_use = n;
4240 } else
4241 env_use = (char**) envp;
4242
6dac160c
LP
4243 /* Let the parent know that we are ready and
4244 * wait until the parent is ready with the
4245 * setup, too... */
4246 (void) barrier_place_and_sync(&barrier); /* #5 */
d96c1ecf 4247
d87be9b0
LP
4248 if (arg_boot) {
4249 char **a;
4250 size_t l;
88213476 4251
d87be9b0 4252 /* Automatically search for the init system */
0f0dbc46 4253
d87be9b0
LP
4254 l = 1 + argc - optind;
4255 a = newa(char*, l + 1);
4256 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 4257
d87be9b0 4258 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 4259 execve(a[0], a, env_use);
0f0dbc46 4260
d87be9b0 4261 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 4262 execve(a[0], a, env_use);
0f0dbc46 4263
d87be9b0 4264 a[0] = (char*) "/sbin/init";
f4889f65 4265 execve(a[0], a, env_use);
d87be9b0 4266 } else if (argc > optind)
f4889f65 4267 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
4268 else {
4269 chdir(home ? home : "/root");
f4889f65 4270 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 4271 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
4272 }
4273
56f64d95 4274 log_error_errno(errno, "execv() failed: %m");
d87be9b0 4275 _exit(EXIT_FAILURE);
da5b3bad 4276 }
88213476 4277
a2da110b 4278 barrier_set_role(&barrier, BARRIER_PARENT);
842f3b0f
LP
4279 fdset_free(fds);
4280 fds = NULL;
4281
6d0b55c2
LP
4282 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4283 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4284
6dac160c
LP
4285 (void) barrier_place(&barrier); /* #1 */
4286
b12afc8c
LP
4287 /* Wait for the most basic Child-setup to be done,
4288 * before we add hardware to it, and place it in a
4289 * cgroup. */
6dac160c 4290 if (barrier_sync(&barrier)) { /* #1 */
5aa4bb6b 4291 int ifi = 0;
354bfd2b 4292
840295fc
LP
4293 r = move_network_interfaces(pid);
4294 if (r < 0)
4295 goto finish;
aa28aefe 4296
5aa4bb6b 4297 r = setup_veth(pid, veth_name, &ifi);
840295fc
LP
4298 if (r < 0)
4299 goto finish;
ab046dde 4300
5aa4bb6b 4301 r = setup_bridge(veth_name, &ifi);
840295fc
LP
4302 if (r < 0)
4303 goto finish;
ab046dde 4304
840295fc
LP
4305 r = setup_macvlan(pid);
4306 if (r < 0)
4307 goto finish;
c74e630d 4308
4bbfe7ad
TG
4309 r = setup_ipvlan(pid);
4310 if (r < 0)
4311 goto finish;
4312
5aa4bb6b
LP
4313 r = register_machine(pid, ifi);
4314 if (r < 0)
4315 goto finish;
4316
6dac160c
LP
4317 /* Notify the child that the parent is ready with all
4318 * its setup, and that the child can now hand over
4319 * control to the code to run inside the container. */
4320 (void) barrier_place(&barrier); /* #2 */
4321
4322 if (arg_userns) {
4323 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4324
4325 (void) barrier_place_and_sync(&barrier); /* #3 */
4326
4327 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4328 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4329 r = write_string_file(uid_map, line);
4330 if (r < 0) {
4331 log_error_errno(r, "Failed to write UID map: %m");
4332 goto finish;
4333 }
4334
4335 /* We always assign the same UID and GID ranges */
4336 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4337 r = write_string_file(uid_map, line);
4338 if (r < 0) {
4339 log_error_errno(r, "Failed to write GID map: %m");
4340 goto finish;
4341 }
4342
4343 (void) barrier_place(&barrier); /* #4 */
4344 }
4345
840295fc
LP
4346 /* Block SIGCHLD here, before notifying child.
4347 * process_pty() will handle it with the other signals. */
4348 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4349 if (r < 0)
4350 goto finish;
e866af3a 4351
840295fc
LP
4352 /* Reset signal to default */
4353 r = default_signals(SIGCHLD, -1);
4354 if (r < 0)
4355 goto finish;
e866af3a 4356
6dac160c
LP
4357 /* Let the child know that we are ready, and wait until the child is completely ready. */
4358 if (barrier_place_and_sync(&barrier)) { /* #5 */
6d0b55c2
LP
4359 _cleanup_event_unref_ sd_event *event = NULL;
4360 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4361 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4362 char last_char = 0;
b12afc8c 4363
733d15ac
LP
4364 sd_notifyf(false,
4365 "READY=1\n"
4366 "STATUS=Container running.\n"
4367 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 4368
6d0b55c2
LP
4369 r = sd_event_new(&event);
4370 if (r < 0) {
4371 log_error_errno(r, "Failed to get default event source: %m");
4372 goto finish;
4373 }
88213476 4374
c6c8f6e2 4375 if (arg_kill_signal > 0) {
6d0b55c2
LP
4376 /* Try to kill the init system on SIGINT or SIGTERM */
4377 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4378 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4379 } else {
4380 /* Immediately exit */
4381 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4382 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4383 }
023fb90b 4384
6d0b55c2
LP
4385 /* simply exit on sigchld */
4386 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 4387
6d0b55c2
LP
4388 if (arg_expose_ports) {
4389 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4390 if (r < 0)
4391 goto finish;
023fb90b 4392
6d0b55c2
LP
4393 (void) expose_ports(rtnl, &exposed);
4394 }
023fb90b 4395
6d0b55c2 4396 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 4397
9c857b9d 4398 r = pty_forward_new(event, master, true, !interactive, &forward);
6d0b55c2
LP
4399 if (r < 0) {
4400 log_error_errno(r, "Failed to create PTY forwarder: %m");
4401 goto finish;
4402 }
023fb90b 4403
6d0b55c2
LP
4404 r = sd_event_loop(event);
4405 if (r < 0) {
4406 log_error_errno(r, "Failed to run event loop: %m");
4407 goto finish;
4408 }
4409
4410 pty_forward_get_last_char(forward, &last_char);
4411
4412 forward = pty_forward_free(forward);
4413
4414 if (!arg_quiet && last_char != '\n')
4415 putc('\n', stdout);
04d39279 4416
6d0b55c2
LP
4417 /* Kill if it is not dead yet anyway */
4418 terminate_machine(pid);
4419 }
840295fc 4420 }
1f0cd86b 4421
840295fc 4422 /* Normally redundant, but better safe than sorry */
04d39279 4423 kill(pid, SIGKILL);
a258bf26 4424
113cea80 4425 r = wait_for_container(pid, &container_status);
04d39279
LP
4426 pid = 0;
4427
ec16945e 4428 if (r < 0)
ce9f1527
LP
4429 /* We failed to wait for the container, or the
4430 * container exited abnormally */
ec16945e
LP
4431 goto finish;
4432 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
4433 /* The container exited with a non-zero
4434 * status, or with zero status and no reboot
4435 * was requested. */
ec16945e 4436 ret = r;
d87be9b0 4437 break;
ec16945e 4438 }
88213476 4439
113cea80 4440 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
4441
4442 if (arg_keep_unit) {
4443 /* Special handling if we are running as a
4444 * service: instead of simply restarting the
4445 * machine we want to restart the entire
4446 * service, so let's inform systemd about this
4447 * with the special exit code 133. The service
4448 * file uses RestartForceExitStatus=133 so
4449 * that this results in a full nspawn
4450 * restart. This is necessary since we might
4451 * have cgroup parameters set we want to have
4452 * flushed out. */
ec16945e
LP
4453 ret = 133;
4454 r = 0;
ce38dbc8
LP
4455 break;
4456 }
6d0b55c2
LP
4457
4458 flush_ports(&exposed);
d87be9b0 4459 }
88213476
LP
4460
4461finish:
af4ec430
LP
4462 sd_notify(false,
4463 "STOPPING=1\n"
4464 "STATUS=Terminating...");
4465
1b9e5b12
LP
4466 loop_remove(loop_nr, &image_fd);
4467
9444b1f2
LP
4468 if (pid > 0)
4469 kill(pid, SIGKILL);
88213476 4470
ec16945e
LP
4471 if (remove_subvol && arg_directory) {
4472 int k;
4473
4474 k = btrfs_subvol_remove(arg_directory);
4475 if (k < 0)
4476 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4477 }
4478
785890ac
LP
4479 if (arg_machine) {
4480 const char *p;
4481
63c372cb 4482 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
4483 (void) rm_rf(p, false, true, false);
4484 }
4485
04d391da 4486 free(arg_directory);
ec16945e
LP
4487 free(arg_template);
4488 free(arg_image);
7027ff61 4489 free(arg_machine);
c74e630d
LP
4490 free(arg_user);
4491 strv_free(arg_setenv);
4492 strv_free(arg_network_interfaces);
4493 strv_free(arg_network_macvlan);
4bbfe7ad 4494 strv_free(arg_network_ipvlan);
c74e630d
LP
4495 strv_free(arg_bind);
4496 strv_free(arg_bind_ro);
06c17c39 4497 strv_free(arg_tmpfs);
88213476 4498
6d0b55c2
LP
4499 flush_ports(&exposed);
4500
4501 while (arg_expose_ports) {
4502 ExposePort *p = arg_expose_ports;
4503 LIST_REMOVE(ports, arg_expose_ports, p);
4504 free(p);
4505 }
4506
ec16945e 4507 return r < 0 ? EXIT_FAILURE : ret;
88213476 4508}