]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
update TODO
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
88213476 26#include <sys/mount.h>
88213476
LP
27#include <stdlib.h>
28#include <string.h>
29#include <stdio.h>
30#include <errno.h>
31#include <sys/prctl.h>
88213476 32#include <getopt.h>
687d0825 33#include <grp.h>
5ed27dbd 34#include <linux/fs.h>
9537eab0 35#include <sys/socket.h>
aea38d80 36#include <linux/netlink.h>
aa28aefe 37#include <net/if.h>
69c79d3c 38#include <linux/veth.h>
6afc95b7 39#include <sys/personality.h>
1b9e5b12 40#include <linux/loop.h>
2fbe4296 41#include <sys/file.h>
aa28aefe 42
5d63309c 43#ifdef HAVE_SELINUX
a8828ed9
DW
44#include <selinux/selinux.h>
45#endif
88213476 46
24fb1112
LP
47#ifdef HAVE_SECCOMP
48#include <seccomp.h>
49#endif
50
1b9e5b12
LP
51#ifdef HAVE_BLKID
52#include <blkid/blkid.h>
53#endif
54
3df3e884 55#include "random-util.h"
1f0cd86b
LP
56#include "sd-daemon.h"
57#include "sd-bus.h"
58#include "sd-id128.h"
aa28aefe 59#include "sd-rtnl.h"
88213476
LP
60#include "log.h"
61#include "util.h"
49e942b2 62#include "mkdir.h"
c6878637 63#include "rm-rf.h"
6b2d0e85 64#include "macro.h"
94d82985 65#include "missing.h"
04d391da 66#include "cgroup-util.h"
a258bf26 67#include "strv.h"
9eb977db 68#include "path-util.h"
a41fe3a2 69#include "loopback-setup.h"
4fc9982c 70#include "dev-setup.h"
842f3b0f 71#include "fdset.h"
acbeb427 72#include "build.h"
a5c32cff 73#include "fileio.h"
40ca29a1 74#include "bus-util.h"
1f0cd86b 75#include "bus-error.h"
4ba93280 76#include "ptyfwd.h"
f4889f65 77#include "env-util.h"
aa28aefe 78#include "rtnl-util.h"
7e227024 79#include "udev-util.h"
1b9e5b12
LP
80#include "blkid-util.h"
81#include "gpt.h"
01dde061 82#include "siphash24.h"
849958d1 83#include "copy.h"
3577de7a 84#include "base-filesystem.h"
a2da110b 85#include "barrier.h"
023fb90b 86#include "event-util.h"
f01ae826 87#include "capability.h"
2822da4f 88#include "cap-list.h"
ec16945e 89#include "btrfs-util.h"
1b9cebf6 90#include "machine-image.h"
6d0b55c2
LP
91#include "list.h"
92#include "in-addr-util.h"
93#include "fw-util.h"
94#include "local-addresses.h"
6482f626 95#include "formats-util.h"
0b452006 96#include "process-util.h"
288a74cc 97#include "terminal-util.h"
f2d88580 98
e9642be2
LP
99#ifdef HAVE_SECCOMP
100#include "seccomp-util.h"
101#endif
102
6d0b55c2
LP
103typedef struct ExposePort {
104 int protocol;
105 uint16_t host_port;
106 uint16_t container_port;
107 LIST_FIELDS(struct ExposePort, ports);
108} ExposePort;
109
113cea80
DH
/* How a container run ended. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
114
57fb9fb5
LP
/* Modes selectable with --link-journal=; the "try-" variants are tracked
 * separately in arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO,
        LINK_AUTO,
        LINK_HOST,
        LINK_GUEST,
} LinkJournal;
88213476 121
4d9f07b4
LP
/* Values of --volatile[=MODE]: a boolean, or the literal "state". */
typedef enum Volatile {
        VOLATILE_NO,
        VOLATILE_YES,
        VOLATILE_STATE,
} Volatile;
127
88213476 128static char *arg_directory = NULL;
ec16945e 129static char *arg_template = NULL;
687d0825 130static char *arg_user = NULL;
9444b1f2 131static sd_id128_t arg_uuid = {};
7027ff61 132static char *arg_machine = NULL;
c74e630d
LP
133static const char *arg_selinux_context = NULL;
134static const char *arg_selinux_apifs_context = NULL;
9444b1f2 135static const char *arg_slice = NULL;
ff01d048 136static bool arg_private_network = false;
bc2f673e 137static bool arg_read_only = false;
0f0dbc46 138static bool arg_boot = false;
ec16945e 139static bool arg_ephemeral = false;
57fb9fb5 140static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 141static bool arg_link_journal_try = false;
5076f0cc
LP
142static uint64_t arg_retain =
143 (1ULL << CAP_CHOWN) |
144 (1ULL << CAP_DAC_OVERRIDE) |
145 (1ULL << CAP_DAC_READ_SEARCH) |
146 (1ULL << CAP_FOWNER) |
147 (1ULL << CAP_FSETID) |
148 (1ULL << CAP_IPC_OWNER) |
149 (1ULL << CAP_KILL) |
150 (1ULL << CAP_LEASE) |
151 (1ULL << CAP_LINUX_IMMUTABLE) |
152 (1ULL << CAP_NET_BIND_SERVICE) |
153 (1ULL << CAP_NET_BROADCAST) |
154 (1ULL << CAP_NET_RAW) |
155 (1ULL << CAP_SETGID) |
156 (1ULL << CAP_SETFCAP) |
157 (1ULL << CAP_SETPCAP) |
158 (1ULL << CAP_SETUID) |
159 (1ULL << CAP_SYS_ADMIN) |
160 (1ULL << CAP_SYS_CHROOT) |
161 (1ULL << CAP_SYS_NICE) |
162 (1ULL << CAP_SYS_PTRACE) |
163 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 164 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
165 (1ULL << CAP_SYS_BOOT) |
166 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
167 (1ULL << CAP_AUDIT_CONTROL) |
168 (1ULL << CAP_MKNOD);
17fe0523
LP
169static char **arg_bind = NULL;
170static char **arg_bind_ro = NULL;
06c17c39 171static char **arg_tmpfs = NULL;
f4889f65 172static char **arg_setenv = NULL;
284c0b91 173static bool arg_quiet = false;
8a96d94e 174static bool arg_share_system = false;
eb91eb18 175static bool arg_register = true;
89f7c846 176static bool arg_keep_unit = false;
aa28aefe 177static char **arg_network_interfaces = NULL;
c74e630d 178static char **arg_network_macvlan = NULL;
4bbfe7ad 179static char **arg_network_ipvlan = NULL;
69c79d3c 180static bool arg_network_veth = false;
c74e630d 181static const char *arg_network_bridge = NULL;
6afc95b7 182static unsigned long arg_personality = 0xffffffffLU;
ec16945e 183static char *arg_image = NULL;
4d9f07b4 184static Volatile arg_volatile = VOLATILE_NO;
6d0b55c2 185static ExposePort *arg_expose_ports = NULL;
f36933fe 186static char **arg_property = NULL;
6dac160c
LP
187static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
188static bool arg_userns = false;
c6c8f6e2 189static int arg_kill_signal = 0;
88213476 190
/* Prints the usage summary to stdout. Only the first line needs
 * formatting (the program name); the rest is constant text. */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n",
               program_invocation_short_name);

        fputs("Spawn a minimal namespace container for debugging, testing and building.\n\n"
              " -h --help Show this help\n"
              " --version Print version string\n"
              " -q --quiet Do not show status information\n"
              " -D --directory=PATH Root directory for the container\n"
              " --template=PATH Initialize root directory from template directory,\n"
              " if missing\n"
              " -x --ephemeral Run container with snapshot of root directory, and\n"
              " remove it after exit\n"
              " -i --image=PATH File system device or disk image for the container\n"
              " -b --boot Boot up full system (i.e. invoke init)\n"
              " -u --user=USER Run the command under specified user or uid\n"
              " -M --machine=NAME Set the machine name for the container\n"
              " --uuid=UUID Set a specific machine UUID for the container\n"
              " -S --slice=SLICE Place the container in the specified slice\n"
              " --property=NAME=VALUE Set scope unit property\n"
              " --private-network Disable network in container\n"
              " --network-interface=INTERFACE\n"
              " Assign an existing network interface to the\n"
              " container\n"
              " --network-macvlan=INTERFACE\n"
              " Create a macvlan network interface based on an\n"
              " existing network interface to the container\n"
              " --network-ipvlan=INTERFACE\n"
              " Create a ipvlan network interface based on an\n"
              " existing network interface to the container\n"
              " -n --network-veth Add a virtual ethernet connection between host\n"
              " and container\n"
              " --network-bridge=INTERFACE\n"
              " Add a virtual ethernet connection between host\n"
              " and container and add it to an existing bridge on\n"
              " the host\n"
              " --private-users[=UIDBASE[:NUIDS]]\n"
              " Run within user namespace\n"
              " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
              " Expose a container IP port on the host\n"
              " -Z --selinux-context=SECLABEL\n"
              " Set the SELinux security context to be used by\n"
              " processes in the container\n"
              " -L --selinux-apifs-context=SECLABEL\n"
              " Set the SELinux security context to be used by\n"
              " API/tmpfs file systems in the container\n"
              " --capability=CAP In addition to the default, retain specified\n"
              " capability\n"
              " --drop-capability=CAP Drop the specified capability from the default set\n"
              " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
              " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
              " try-guest, try-host\n"
              " -j Equivalent to --link-journal=try-guest\n"
              " --read-only Mount the root directory read-only\n"
              " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
              " the container\n"
              " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
              " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
              " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
              " --share-system Share system namespaces with host\n"
              " --register=BOOLEAN Register container as machine\n"
              " --keep-unit Do not register a scope for the machine, reuse\n"
              " the service unit nspawn is running in\n"
              " --volatile[=MODE] Run the system in volatile mode\n",
              stdout);
}
255
ec16945e
LP
/* Replaces *b with a cleaned-up, absolute version of path.
 *
 * The path is canonicalized if it exists; if it does not exist (ENOENT)
 * it is merely made absolute relative to the current working directory.
 * Either way the result is passed through path_kill_slashes() before
 * being stored. The previous value of *b is freed.
 *
 * Returns 0 on success, -errno if canonicalization failed for a reason
 * other than ENOENT, or -ENOMEM if path_make_absolute_cwd() failed. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Nothing there yet, so settle for an absolute path */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
276
88213476
LP
277static int parse_argv(int argc, char *argv[]) {
278
a41fe3a2 279 enum {
acbeb427
ZJS
280 ARG_VERSION = 0x100,
281 ARG_PRIVATE_NETWORK,
bc2f673e 282 ARG_UUID,
5076f0cc 283 ARG_READ_ONLY,
57fb9fb5 284 ARG_CAPABILITY,
420c7379 285 ARG_DROP_CAPABILITY,
17fe0523
LP
286 ARG_LINK_JOURNAL,
287 ARG_BIND,
f4889f65 288 ARG_BIND_RO,
06c17c39 289 ARG_TMPFS,
f4889f65 290 ARG_SETENV,
eb91eb18 291 ARG_SHARE_SYSTEM,
89f7c846 292 ARG_REGISTER,
aa28aefe 293 ARG_KEEP_UNIT,
69c79d3c 294 ARG_NETWORK_INTERFACE,
c74e630d 295 ARG_NETWORK_MACVLAN,
4bbfe7ad 296 ARG_NETWORK_IPVLAN,
ab046dde 297 ARG_NETWORK_BRIDGE,
6afc95b7 298 ARG_PERSONALITY,
4d9f07b4 299 ARG_VOLATILE,
ec16945e 300 ARG_TEMPLATE,
f36933fe 301 ARG_PROPERTY,
6dac160c 302 ARG_PRIVATE_USERS,
c6c8f6e2 303 ARG_KILL_SIGNAL,
a41fe3a2
LP
304 };
305
88213476 306 static const struct option options[] = {
aa28aefe
LP
307 { "help", no_argument, NULL, 'h' },
308 { "version", no_argument, NULL, ARG_VERSION },
309 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
310 { "template", required_argument, NULL, ARG_TEMPLATE },
311 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
312 { "user", required_argument, NULL, 'u' },
313 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
314 { "boot", no_argument, NULL, 'b' },
315 { "uuid", required_argument, NULL, ARG_UUID },
316 { "read-only", no_argument, NULL, ARG_READ_ONLY },
317 { "capability", required_argument, NULL, ARG_CAPABILITY },
318 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
319 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
320 { "bind", required_argument, NULL, ARG_BIND },
321 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 322 { "tmpfs", required_argument, NULL, ARG_TMPFS },
aa28aefe
LP
323 { "machine", required_argument, NULL, 'M' },
324 { "slice", required_argument, NULL, 'S' },
325 { "setenv", required_argument, NULL, ARG_SETENV },
326 { "selinux-context", required_argument, NULL, 'Z' },
327 { "selinux-apifs-context", required_argument, NULL, 'L' },
328 { "quiet", no_argument, NULL, 'q' },
329 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
330 { "register", required_argument, NULL, ARG_REGISTER },
331 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
332 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 333 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 334 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 335 { "network-veth", no_argument, NULL, 'n' },
ab046dde 336 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 337 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 338 { "image", required_argument, NULL, 'i' },
4d9f07b4 339 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 340 { "port", required_argument, NULL, 'p' },
f36933fe 341 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 342 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 343 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
eb9da376 344 {}
88213476
LP
345 };
346
9444b1f2 347 int c, r;
a42c8b54 348 uint64_t plus = 0, minus = 0;
88213476
LP
349
350 assert(argc >= 0);
351 assert(argv);
352
0dfaa006 353 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
354
355 switch (c) {
356
357 case 'h':
601185b4
ZJS
358 help();
359 return 0;
88213476 360
acbeb427
ZJS
361 case ARG_VERSION:
362 puts(PACKAGE_STRING);
363 puts(SYSTEMD_FEATURES);
364 return 0;
365
88213476 366 case 'D':
ec16945e
LP
367 r = set_sanitized_path(&arg_directory, optarg);
368 if (r < 0)
369 return log_error_errno(r, "Invalid root directory: %m");
370
371 break;
372
373 case ARG_TEMPLATE:
374 r = set_sanitized_path(&arg_template, optarg);
375 if (r < 0)
376 return log_error_errno(r, "Invalid template directory: %m");
88213476
LP
377
378 break;
379
1b9e5b12 380 case 'i':
ec16945e
LP
381 r = set_sanitized_path(&arg_image, optarg);
382 if (r < 0)
383 return log_error_errno(r, "Invalid image path: %m");
384
385 break;
386
387 case 'x':
388 arg_ephemeral = true;
1b9e5b12
LP
389 break;
390
687d0825
MV
391 case 'u':
392 free(arg_user);
7027ff61
LP
393 arg_user = strdup(optarg);
394 if (!arg_user)
395 return log_oom();
687d0825
MV
396
397 break;
398
ab046dde 399 case ARG_NETWORK_BRIDGE:
c74e630d 400 arg_network_bridge = optarg;
ab046dde
TG
401
402 /* fall through */
403
0dfaa006 404 case 'n':
69c79d3c
LP
405 arg_network_veth = true;
406 arg_private_network = true;
407 break;
408
aa28aefe 409 case ARG_NETWORK_INTERFACE:
c74e630d
LP
410 if (strv_extend(&arg_network_interfaces, optarg) < 0)
411 return log_oom();
412
413 arg_private_network = true;
414 break;
415
416 case ARG_NETWORK_MACVLAN:
417 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
418 return log_oom();
419
4bbfe7ad
TG
420 arg_private_network = true;
421 break;
422
423 case ARG_NETWORK_IPVLAN:
424 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
425 return log_oom();
426
aa28aefe
LP
427 /* fall through */
428
ff01d048
LP
429 case ARG_PRIVATE_NETWORK:
430 arg_private_network = true;
a41fe3a2
LP
431 break;
432
0f0dbc46
LP
433 case 'b':
434 arg_boot = true;
435 break;
436
144f0fc0 437 case ARG_UUID:
9444b1f2
LP
438 r = sd_id128_from_string(optarg, &arg_uuid);
439 if (r < 0) {
aa96c6cb 440 log_error("Invalid UUID: %s", optarg);
9444b1f2 441 return r;
aa96c6cb 442 }
9444b1f2 443 break;
aa96c6cb 444
9444b1f2 445 case 'S':
c74e630d 446 arg_slice = optarg;
144f0fc0
LP
447 break;
448
7027ff61 449 case 'M':
eb91eb18
LP
450 if (isempty(optarg)) {
451 free(arg_machine);
452 arg_machine = NULL;
453 } else {
0c3c4284 454 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
455 log_error("Invalid machine name: %s", optarg);
456 return -EINVAL;
457 }
7027ff61 458
0c3c4284
LP
459 r = free_and_strdup(&arg_machine, optarg);
460 if (r < 0)
eb91eb18
LP
461 return log_oom();
462
463 break;
464 }
7027ff61 465
82adf6af
LP
466 case 'Z':
467 arg_selinux_context = optarg;
a8828ed9
DW
468 break;
469
82adf6af
LP
470 case 'L':
471 arg_selinux_apifs_context = optarg;
a8828ed9
DW
472 break;
473
bc2f673e
LP
474 case ARG_READ_ONLY:
475 arg_read_only = true;
476 break;
477
420c7379
LP
478 case ARG_CAPABILITY:
479 case ARG_DROP_CAPABILITY: {
a2a5291b 480 const char *state, *word;
5076f0cc
LP
481 size_t length;
482
483 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 484 _cleanup_free_ char *t;
5076f0cc
LP
485
486 t = strndup(word, length);
0d0f0c50
SL
487 if (!t)
488 return log_oom();
5076f0cc 489
39ed67d1
LP
490 if (streq(t, "all")) {
491 if (c == ARG_CAPABILITY)
a42c8b54 492 plus = (uint64_t) -1;
39ed67d1 493 else
a42c8b54 494 minus = (uint64_t) -1;
39ed67d1 495 } else {
2822da4f
LP
496 int cap;
497
498 cap = capability_from_name(t);
499 if (cap < 0) {
39ed67d1
LP
500 log_error("Failed to parse capability %s.", t);
501 return -EINVAL;
502 }
503
504 if (c == ARG_CAPABILITY)
a42c8b54 505 plus |= 1ULL << (uint64_t) cap;
39ed67d1 506 else
a42c8b54 507 minus |= 1ULL << (uint64_t) cap;
5076f0cc 508 }
5076f0cc
LP
509 }
510
511 break;
512 }
513
57fb9fb5
LP
514 case 'j':
515 arg_link_journal = LINK_GUEST;
574edc90 516 arg_link_journal_try = true;
57fb9fb5
LP
517 break;
518
519 case ARG_LINK_JOURNAL:
53e438e3 520 if (streq(optarg, "auto")) {
57fb9fb5 521 arg_link_journal = LINK_AUTO;
53e438e3
LP
522 arg_link_journal_try = false;
523 } else if (streq(optarg, "no")) {
57fb9fb5 524 arg_link_journal = LINK_NO;
53e438e3
LP
525 arg_link_journal_try = false;
526 } else if (streq(optarg, "guest")) {
57fb9fb5 527 arg_link_journal = LINK_GUEST;
53e438e3
LP
528 arg_link_journal_try = false;
529 } else if (streq(optarg, "host")) {
57fb9fb5 530 arg_link_journal = LINK_HOST;
53e438e3
LP
531 arg_link_journal_try = false;
532 } else if (streq(optarg, "try-guest")) {
574edc90
MP
533 arg_link_journal = LINK_GUEST;
534 arg_link_journal_try = true;
535 } else if (streq(optarg, "try-host")) {
536 arg_link_journal = LINK_HOST;
537 arg_link_journal_try = true;
538 } else {
57fb9fb5
LP
539 log_error("Failed to parse link journal mode %s", optarg);
540 return -EINVAL;
541 }
542
543 break;
544
17fe0523
LP
545 case ARG_BIND:
546 case ARG_BIND_RO: {
547 _cleanup_free_ char *a = NULL, *b = NULL;
548 char *e;
549 char ***x;
17fe0523
LP
550
551 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
552
553 e = strchr(optarg, ':');
554 if (e) {
555 a = strndup(optarg, e - optarg);
556 b = strdup(e + 1);
557 } else {
558 a = strdup(optarg);
559 b = strdup(optarg);
560 }
561
562 if (!a || !b)
563 return log_oom();
564
565 if (!path_is_absolute(a) || !path_is_absolute(b)) {
566 log_error("Invalid bind mount specification: %s", optarg);
567 return -EINVAL;
568 }
569
570 r = strv_extend(x, a);
571 if (r < 0)
b3451bed 572 return log_oom();
17fe0523
LP
573
574 r = strv_extend(x, b);
575 if (r < 0)
b3451bed 576 return log_oom();
17fe0523
LP
577
578 break;
579 }
580
06c17c39
LP
581 case ARG_TMPFS: {
582 _cleanup_free_ char *a = NULL, *b = NULL;
583 char *e;
584
585 e = strchr(optarg, ':');
586 if (e) {
587 a = strndup(optarg, e - optarg);
588 b = strdup(e + 1);
589 } else {
590 a = strdup(optarg);
591 b = strdup("mode=0755");
592 }
593
594 if (!a || !b)
595 return log_oom();
596
597 if (!path_is_absolute(a)) {
598 log_error("Invalid tmpfs specification: %s", optarg);
599 return -EINVAL;
600 }
601
602 r = strv_push(&arg_tmpfs, a);
603 if (r < 0)
604 return log_oom();
605
606 a = NULL;
607
608 r = strv_push(&arg_tmpfs, b);
609 if (r < 0)
610 return log_oom();
611
612 b = NULL;
613
614 break;
615 }
616
f4889f65
LP
617 case ARG_SETENV: {
618 char **n;
619
620 if (!env_assignment_is_valid(optarg)) {
621 log_error("Environment variable assignment '%s' is not valid.", optarg);
622 return -EINVAL;
623 }
624
625 n = strv_env_set(arg_setenv, optarg);
626 if (!n)
627 return log_oom();
628
629 strv_free(arg_setenv);
630 arg_setenv = n;
631 break;
632 }
633
284c0b91
LP
634 case 'q':
635 arg_quiet = true;
636 break;
637
8a96d94e
LP
638 case ARG_SHARE_SYSTEM:
639 arg_share_system = true;
640 break;
641
eb91eb18
LP
642 case ARG_REGISTER:
643 r = parse_boolean(optarg);
644 if (r < 0) {
645 log_error("Failed to parse --register= argument: %s", optarg);
646 return r;
647 }
648
649 arg_register = r;
650 break;
651
89f7c846
LP
652 case ARG_KEEP_UNIT:
653 arg_keep_unit = true;
654 break;
655
6afc95b7
LP
656 case ARG_PERSONALITY:
657
ac45f971 658 arg_personality = personality_from_string(optarg);
6afc95b7
LP
659 if (arg_personality == 0xffffffffLU) {
660 log_error("Unknown or unsupported personality '%s'.", optarg);
661 return -EINVAL;
662 }
663
664 break;
665
4d9f07b4
LP
666 case ARG_VOLATILE:
667
668 if (!optarg)
669 arg_volatile = VOLATILE_YES;
670 else {
671 r = parse_boolean(optarg);
672 if (r < 0) {
673 if (streq(optarg, "state"))
674 arg_volatile = VOLATILE_STATE;
675 else {
676 log_error("Failed to parse --volatile= argument: %s", optarg);
677 return r;
678 }
679 } else
680 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
681 }
682
683 break;
684
6d0b55c2
LP
685 case 'p': {
686 const char *split, *e;
687 uint16_t container_port, host_port;
688 int protocol;
689 ExposePort *p;
690
691 if ((e = startswith(optarg, "tcp:")))
692 protocol = IPPROTO_TCP;
693 else if ((e = startswith(optarg, "udp:")))
694 protocol = IPPROTO_UDP;
695 else {
696 e = optarg;
697 protocol = IPPROTO_TCP;
698 }
699
700 split = strchr(e, ':');
701 if (split) {
702 char v[split - e + 1];
703
704 memcpy(v, e, split - e);
705 v[split - e] = 0;
706
707 r = safe_atou16(v, &host_port);
708 if (r < 0 || host_port <= 0) {
709 log_error("Failed to parse host port: %s", optarg);
710 return -EINVAL;
711 }
712
713 r = safe_atou16(split + 1, &container_port);
714 } else {
715 r = safe_atou16(e, &container_port);
716 host_port = container_port;
717 }
718
719 if (r < 0 || container_port <= 0) {
720 log_error("Failed to parse host port: %s", optarg);
721 return -EINVAL;
722 }
723
724 LIST_FOREACH(ports, p, arg_expose_ports) {
725 if (p->protocol == protocol && p->host_port == host_port) {
726 log_error("Duplicate port specification: %s", optarg);
727 return -EINVAL;
728 }
729 }
730
731 p = new(ExposePort, 1);
732 if (!p)
733 return log_oom();
734
735 p->protocol = protocol;
736 p->host_port = host_port;
737 p->container_port = container_port;
738
739 LIST_PREPEND(ports, arg_expose_ports, p);
740
741 break;
742 }
743
f36933fe
LP
744 case ARG_PROPERTY:
745 if (strv_extend(&arg_property, optarg) < 0)
746 return log_oom();
747
748 break;
749
6dac160c
LP
750 case ARG_PRIVATE_USERS:
751 if (optarg) {
752 _cleanup_free_ char *buffer = NULL;
753 const char *range, *shift;
754
755 range = strchr(optarg, ':');
756 if (range) {
757 buffer = strndup(optarg, range - optarg);
758 if (!buffer)
759 return log_oom();
760 shift = buffer;
761
762 range++;
763 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
764 log_error("Failed to parse UID range: %s", range);
765 return -EINVAL;
766 }
767 } else
768 shift = optarg;
769
770 if (parse_uid(shift, &arg_uid_shift) < 0) {
771 log_error("Failed to parse UID: %s", optarg);
772 return -EINVAL;
773 }
774 }
775
776 arg_userns = true;
777 break;
778
c6c8f6e2
LP
779 case ARG_KILL_SIGNAL:
780 arg_kill_signal = signal_from_string_try_harder(optarg);
781 if (arg_kill_signal < 0) {
782 log_error("Cannot parse signal: %s", optarg);
783 return -EINVAL;
784 }
785
786 break;
787
88213476
LP
788 case '?':
789 return -EINVAL;
790
791 default:
eb9da376 792 assert_not_reached("Unhandled option");
88213476 793 }
88213476 794
eb91eb18
LP
795 if (arg_share_system)
796 arg_register = false;
797
798 if (arg_boot && arg_share_system) {
799 log_error("--boot and --share-system may not be combined.");
800 return -EINVAL;
801 }
802
89f7c846
LP
803 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
804 log_error("--keep-unit may not be used when invoked from a user session.");
805 return -EINVAL;
806 }
807
1b9e5b12
LP
808 if (arg_directory && arg_image) {
809 log_error("--directory= and --image= may not be combined.");
810 return -EINVAL;
811 }
812
ec16945e
LP
813 if (arg_template && arg_image) {
814 log_error("--template= and --image= may not be combined.");
815 return -EINVAL;
816 }
817
818 if (arg_template && !(arg_directory || arg_machine)) {
819 log_error("--template= needs --directory= or --machine=.");
820 return -EINVAL;
821 }
822
823 if (arg_ephemeral && arg_template) {
824 log_error("--ephemeral and --template= may not be combined.");
825 return -EINVAL;
826 }
827
828 if (arg_ephemeral && arg_image) {
829 log_error("--ephemeral and --image= may not be combined.");
830 return -EINVAL;
831 }
832
df9a75e4
LP
833 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
834 log_error("--ephemeral and --link-journal= may not be combined.");
835 return -EINVAL;
836 }
837
4d9f07b4
LP
838 if (arg_volatile != VOLATILE_NO && arg_read_only) {
839 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
840 return -EINVAL;
841 }
842
6d0b55c2
LP
843 if (arg_expose_ports && !arg_private_network) {
844 log_error("Cannot use --port= without private networking.");
845 return -EINVAL;
846 }
847
a42c8b54
LP
848 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
849
c6c8f6e2
LP
850 if (arg_boot && arg_kill_signal <= 0)
851 arg_kill_signal = SIGRTMIN+3;
852
88213476
LP
853 return 1;
854}
855
856static int mount_all(const char *dest) {
857
858 typedef struct MountPoint {
859 const char *what;
860 const char *where;
861 const char *type;
862 const char *options;
863 unsigned long flags;
3bd66c05 864 bool fatal;
88213476
LP
865 } MountPoint;
866
867 static const MountPoint mount_table[] = {
06c17c39
LP
868 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
869 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
870 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
871 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
872 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 873 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
06c17c39
LP
874 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
875 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
bbb99c30 876 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
9b634ea5 877#ifdef HAVE_SELINUX
06c17c39
LP
878 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
879 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 880#endif
88213476
LP
881 };
882
883 unsigned k;
884 int r = 0;
885
886 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
d15d65a0 887 _cleanup_free_ char *where = NULL, *options = NULL;
d002827b 888 const char *o;
88213476
LP
889 int t;
890
17fe0523
LP
891 where = strjoin(dest, "/", mount_table[k].where, NULL);
892 if (!where)
893 return log_oom();
88213476 894
e65aec12 895 t = path_is_mount_point(where, true);
da00518b 896 if (t < 0 && t != -ENOENT) {
da927ba9 897 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
88213476
LP
898
899 if (r == 0)
900 r = t;
901
902 continue;
903 }
904
9c1c7f71
LP
905 /* Skip this entry if it is not a remount. */
906 if (mount_table[k].what && t > 0)
014a9c77
LP
907 continue;
908
79d80fc1
TG
909 t = mkdir_p(where, 0755);
910 if (t < 0) {
911 if (mount_table[k].fatal) {
da927ba9 912 log_error_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
913
914 if (r == 0)
915 r = t;
916 } else
da927ba9 917 log_warning_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
918
919 continue;
920 }
88213476 921
a8828ed9 922#ifdef HAVE_SELINUX
82adf6af
LP
923 if (arg_selinux_apifs_context &&
924 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
925 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
926 if (!options)
927 return log_oom();
928
929 o = options;
930 } else
a8828ed9 931#endif
d002827b 932 o = mount_table[k].options;
a8828ed9 933
6dac160c
LP
934 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
935 char *uid_options = NULL;
936
937 if (o)
938 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
939 else
940 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
941 if (!uid_options)
942 return log_oom();
943
944 free(options);
945 o = options = uid_options;
946 }
a8828ed9 947
88213476
LP
948 if (mount(mount_table[k].what,
949 where,
950 mount_table[k].type,
951 mount_table[k].flags,
79d80fc1 952 o) < 0) {
88213476 953
79d80fc1 954 if (mount_table[k].fatal) {
56f64d95 955 log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 956
79d80fc1
TG
957 if (r == 0)
958 r = -errno;
959 } else
56f64d95 960 log_warning_errno(errno, "mount(%s) failed: %m", where);
88213476 961 }
88213476
LP
962 }
963
e58a1277
LP
964 return r;
965}
f8440af5 966
d6797c92 967static int mount_binds(const char *dest, char **l, bool ro) {
17fe0523
LP
968 char **x, **y;
969
970 STRV_FOREACH_PAIR(x, y, l) {
06c17c39 971 _cleanup_free_ char *where = NULL;
d2421337 972 struct stat source_st, dest_st;
2ed4e5e0 973 int r;
d2421337 974
4a62c710
MS
975 if (stat(*x, &source_st) < 0)
976 return log_error_errno(errno, "Failed to stat %s: %m", *x);
17fe0523 977
06c17c39
LP
978 where = strappend(dest, *y);
979 if (!where)
980 return log_oom();
981
2ed4e5e0
SL
982 r = stat(where, &dest_st);
983 if (r == 0) {
05e7da5a
AC
984 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
985 log_error("Cannot bind mount directory %s on file %s.", *x, where);
986 return -EINVAL;
987 }
988 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
989 log_error("Cannot bind mount file %s on directory %s.", *x, where);
d2421337
DR
990 return -EINVAL;
991 }
2ed4e5e0
SL
992 } else if (errno == ENOENT) {
993 r = mkdir_parents_label(where, 0755);
f647962d
MS
994 if (r < 0)
995 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
2ed4e5e0 996 } else {
56f64d95 997 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
2ed4e5e0
SL
998 return -errno;
999 }
06c17c39 1000
05e7da5a
AC
1001 /* Create the mount point. Any non-directory file can be
1002 * mounted on any non-directory file (regular, fifo, socket,
1003 * char, block).
1004 */
79d80fc1
TG
1005 if (S_ISDIR(source_st.st_mode)) {
1006 r = mkdir_label(where, 0755);
f647962d
MS
1007 if (r < 0 && errno != EEXIST)
1008 return log_error_errno(r, "Failed to create mount point %s: %m", where);
05e7da5a 1009 } else {
79d80fc1 1010 r = touch(where);
f647962d
MS
1011 if (r < 0)
1012 return log_error_errno(r, "Failed to create mount point %s: %m", where);
d2421337 1013 }
17fe0523 1014
4543768d 1015 if (mount(*x, where, NULL, MS_BIND, NULL) < 0)
4a62c710 1016 return log_error_errno(errno, "mount(%s) failed: %m", where);
17fe0523 1017
d6797c92
LP
1018 if (ro) {
1019 r = bind_remount_recursive(where, true);
f647962d
MS
1020 if (r < 0)
1021 return log_error_errno(r, "Read-Only bind mount failed: %m");
17fe0523
LP
1022 }
1023 }
1024
1025 return 0;
1026}
1027
b12afc8c
LP
1028static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1029 char *to;
1030 int r;
1031
63c372cb 1032 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
b12afc8c
LP
1033
1034 r = path_is_mount_point(to, false);
da00518b 1035 if (r < 0 && r != -ENOENT)
b12afc8c
LP
1036 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1037 if (r > 0)
1038 return 0;
1039
1040 mkdir_p(to, 0755);
1041
c0534580
LP
1042 /* The superblock mount options of the mount point need to be
1043 * identical to the hosts', and hence writable... */
1044 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
b12afc8c
LP
1045 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1046
c0534580
LP
1047 /* ... hence let's only make the bind mount read-only, not the
1048 * superblock. */
1049 if (read_only) {
1050 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1051 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1052 }
b12afc8c
LP
1053 return 1;
1054}
1055
1056static int mount_cgroup(const char *dest) {
1057 _cleanup_set_free_free_ Set *controllers = NULL;
1058 _cleanup_free_ char *own_cgroup_path = NULL;
1059 const char *cgroup_root, *systemd_root, *systemd_own;
1060 int r;
1061
1062 controllers = set_new(&string_hash_ops);
1063 if (!controllers)
1064 return log_oom();
1065
1066 r = cg_kernel_controllers(controllers);
1067 if (r < 0)
1068 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1069
1070 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1071 if (r < 0)
1072 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1073
63c372cb 1074 cgroup_root = strjoina(dest, "/sys/fs/cgroup");
b12afc8c
LP
1075 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1076 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1077
1078 for (;;) {
1079 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1080
1081 controller = set_steal_first(controllers);
1082 if (!controller)
1083 break;
1084
1085 origin = strappend("/sys/fs/cgroup/", controller);
1086 if (!origin)
1087 return log_oom();
1088
1089 r = readlink_malloc(origin, &combined);
1090 if (r == -EINVAL) {
1091 /* Not a symbolic link, but directly a single cgroup hierarchy */
1092
1093 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1094 if (r < 0)
1095 return r;
1096
1097 } else if (r < 0)
1098 return log_error_errno(r, "Failed to read link %s: %m", origin);
1099 else {
1100 _cleanup_free_ char *target = NULL;
1101
1102 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1103 if (!target)
1104 return log_oom();
1105
1106 /* A symbolic link, a combination of controllers in one hierarchy */
1107
1108 if (!filename_is_valid(combined)) {
1109 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1110 continue;
1111 }
1112
1113 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1114 if (r < 0)
1115 return r;
1116
1117 if (symlink(combined, target) < 0)
83521414 1118 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
b12afc8c
LP
1119 }
1120 }
1121
c0534580 1122 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
b12afc8c
LP
1123 if (r < 0)
1124 return r;
1125
1126 /* Make our own cgroup a (writable) bind mount */
63c372cb 1127 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
b12afc8c
LP
1128 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1129 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1130
1131 /* And then remount the systemd cgroup root read-only */
63c372cb 1132 systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
b12afc8c
LP
1133 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1134 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1135
1136 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1137 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1138
1139 return 0;
1140}
1141
06c17c39
LP
1142static int mount_tmpfs(const char *dest) {
1143 char **i, **o;
1144
1145 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1146 _cleanup_free_ char *where = NULL;
79d80fc1 1147 int r;
06c17c39
LP
1148
1149 where = strappend(dest, *i);
1150 if (!where)
1151 return log_oom();
1152
79d80fc1 1153 r = mkdir_label(where, 0755);
04a91939
LP
1154 if (r < 0 && r != -EEXIST)
1155 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
06c17c39 1156
4a62c710
MS
1157 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1158 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
06c17c39
LP
1159 }
1160
1161 return 0;
1162}
1163
e58a1277 1164static int setup_timezone(const char *dest) {
d4036145
LP
1165 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1166 char *z, *y;
1167 int r;
f8440af5 1168
e58a1277
LP
1169 assert(dest);
1170
1171 /* Fix the timezone, if possible */
d4036145
LP
1172 r = readlink_malloc("/etc/localtime", &p);
1173 if (r < 0) {
1174 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1175 return 0;
1176 }
1177
1178 z = path_startswith(p, "../usr/share/zoneinfo/");
1179 if (!z)
1180 z = path_startswith(p, "/usr/share/zoneinfo/");
1181 if (!z) {
1182 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1183 return 0;
1184 }
1185
04bc4a3f
LP
1186 where = strappend(dest, "/etc/localtime");
1187 if (!where)
0d0f0c50 1188 return log_oom();
715ac17a 1189
d4036145
LP
1190 r = readlink_malloc(where, &q);
1191 if (r >= 0) {
1192 y = path_startswith(q, "../usr/share/zoneinfo/");
1193 if (!y)
1194 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1195
d4036145
LP
1196 /* Already pointing to the right place? Then do nothing .. */
1197 if (y && streq(y, z))
1198 return 0;
1199 }
1200
1201 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1202 if (!check)
0d0f0c50 1203 return log_oom();
4d1c38b8 1204
d4036145
LP
1205 if (access(check, F_OK) < 0) {
1206 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1207 return 0;
1208 }
68fb0892 1209
d4036145
LP
1210 what = strappend("../usr/share/zoneinfo/", z);
1211 if (!what)
1212 return log_oom();
1213
79d80fc1
TG
1214 r = mkdir_parents(where, 0755);
1215 if (r < 0) {
da927ba9 1216 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
79d80fc1
TG
1217
1218 return 0;
1219 }
1220
1221 r = unlink(where);
1222 if (r < 0 && errno != ENOENT) {
56f64d95 1223 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1224
1225 return 0;
1226 }
4d9f07b4 1227
d4036145 1228 if (symlink(what, where) < 0) {
56f64d95 1229 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1230 return 0;
1231 }
e58a1277
LP
1232
1233 return 0;
88213476
LP
1234}
1235
2547bb41 1236static int setup_resolv_conf(const char *dest) {
c8b32e11 1237 _cleanup_free_ char *where = NULL;
79d80fc1 1238 int r;
2547bb41
LP
1239
1240 assert(dest);
1241
1242 if (arg_private_network)
1243 return 0;
1244
1245 /* Fix resolv.conf, if possible */
04bc4a3f
LP
1246 where = strappend(dest, "/etc/resolv.conf");
1247 if (!where)
0d0f0c50 1248 return log_oom();
2547bb41 1249
77e63faf
LP
1250 /* We don't really care for the results of this really. If it
1251 * fails, it fails, but meh... */
79d80fc1
TG
1252 r = mkdir_parents(where, 0755);
1253 if (r < 0) {
da927ba9 1254 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
79d80fc1
TG
1255
1256 return 0;
1257 }
1258
f2068bcc 1259 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1260 if (r < 0) {
da927ba9 1261 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1262
1263 return 0;
1264 }
2547bb41
LP
1265
1266 return 0;
1267}
1268
4d9f07b4
LP
1269static int setup_volatile_state(const char *directory) {
1270 const char *p;
1271 int r;
1272
1273 assert(directory);
1274
1275 if (arg_volatile != VOLATILE_STATE)
1276 return 0;
1277
1278 /* --volatile=state means we simply overmount /var
1279 with a tmpfs, and the rest read-only. */
1280
1281 r = bind_remount_recursive(directory, true);
f647962d
MS
1282 if (r < 0)
1283 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4 1284
63c372cb 1285 p = strjoina(directory, "/var");
79d80fc1 1286 r = mkdir(p, 0755);
4a62c710
MS
1287 if (r < 0 && errno != EEXIST)
1288 return log_error_errno(errno, "Failed to create %s: %m", directory);
4d9f07b4 1289
4a62c710
MS
1290 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1291 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
1292
1293 return 0;
1294}
1295
1296static int setup_volatile(const char *directory) {
1297 bool tmpfs_mounted = false, bind_mounted = false;
1298 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1299 const char *f, *t;
1300 int r;
1301
1302 assert(directory);
1303
1304 if (arg_volatile != VOLATILE_YES)
1305 return 0;
1306
1307 /* --volatile=yes means we mount a tmpfs to the root dir, and
1308 the original /usr to use inside it, and that read-only. */
1309
4a62c710
MS
1310 if (!mkdtemp(template))
1311 return log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4
LP
1312
1313 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
56f64d95 1314 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
1315 r = -errno;
1316 goto fail;
1317 }
1318
1319 tmpfs_mounted = true;
1320
63c372cb
LP
1321 f = strjoina(directory, "/usr");
1322 t = strjoina(template, "/usr");
4d9f07b4 1323
79d80fc1
TG
1324 r = mkdir(t, 0755);
1325 if (r < 0 && errno != EEXIST) {
56f64d95 1326 log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
1327 r = -errno;
1328 goto fail;
1329 }
1330
4543768d 1331 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
56f64d95 1332 log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
1333 r = -errno;
1334 goto fail;
1335 }
1336
1337 bind_mounted = true;
1338
1339 r = bind_remount_recursive(t, true);
1340 if (r < 0) {
da927ba9 1341 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
1342 goto fail;
1343 }
1344
1345 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
56f64d95 1346 log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1347 r = -errno;
1348 goto fail;
1349 }
1350
1351 rmdir(template);
1352
1353 return 0;
1354
1355fail:
1356 if (bind_mounted)
1357 umount(t);
1358 if (tmpfs_mounted)
1359 umount(template);
1360 rmdir(template);
1361 return r;
1362}
1363
9f24adc2
LP
1364static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1365
1366 snprintf(s, 37,
1367 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1368 SD_ID128_FORMAT_VAL(id));
1369
1370 return s;
1371}
1372
04bc4a3f 1373static int setup_boot_id(const char *dest) {
7fd1b19b 1374 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 1375 sd_id128_t rnd = {};
04bc4a3f
LP
1376 char as_uuid[37];
1377 int r;
1378
1379 assert(dest);
1380
eb91eb18
LP
1381 if (arg_share_system)
1382 return 0;
1383
04bc4a3f
LP
1384 /* Generate a new randomized boot ID, so that each boot-up of
1385 * the container gets a new one */
1386
1387 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 1388 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
1389 if (!from || !to)
1390 return log_oom();
04bc4a3f
LP
1391
1392 r = sd_id128_randomize(&rnd);
f647962d
MS
1393 if (r < 0)
1394 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1395
9f24adc2 1396 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1397
574d5f2d 1398 r = write_string_file(from, as_uuid);
f647962d
MS
1399 if (r < 0)
1400 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1401
4543768d 1402 if (mount(from, to, NULL, MS_BIND, NULL) < 0) {
56f64d95 1403 log_error_errno(errno, "Failed to bind mount boot id: %m");
04bc4a3f 1404 r = -errno;
4543768d 1405 } else if (mount(from, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
56f64d95 1406 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1407
1408 unlink(from);
04bc4a3f
LP
1409 return r;
1410}
1411
e58a1277 1412static int copy_devnodes(const char *dest) {
88213476
LP
1413
1414 static const char devnodes[] =
1415 "null\0"
1416 "zero\0"
1417 "full\0"
1418 "random\0"
1419 "urandom\0"
85614d66
TG
1420 "tty\0"
1421 "net/tun\0";
88213476
LP
1422
1423 const char *d;
e58a1277 1424 int r = 0;
7fd1b19b 1425 _cleanup_umask_ mode_t u;
a258bf26
LP
1426
1427 assert(dest);
124640f1
LP
1428
1429 u = umask(0000);
88213476
LP
1430
1431 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1432 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1433 struct stat st;
88213476 1434
7f112f50
LP
1435 from = strappend("/dev/", d);
1436 to = strjoin(dest, "/dev/", d, NULL);
1437 if (!from || !to)
1438 return log_oom();
88213476
LP
1439
1440 if (stat(from, &st) < 0) {
1441
4a62c710
MS
1442 if (errno != ENOENT)
1443 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1444
a258bf26 1445 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1446
ed8b7a3e 1447 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 1448 return -EIO;
a258bf26 1449
85614d66
TG
1450 } else {
1451 r = mkdir_parents(to, 0775);
1452 if (r < 0) {
da927ba9 1453 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
85614d66
TG
1454 return -r;
1455 }
a258bf26 1456
81f5049b
AC
1457 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1458 if (errno != EPERM)
1459 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1460
1461 /* Some systems abusively restrict mknod but
1462 * allow bind mounts. */
1463 r = touch(to);
1464 if (r < 0)
1465 return log_error_errno(r, "touch (%s) failed: %m", to);
1466 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1467 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1468 }
6278cf60
LP
1469
1470 if (arg_userns && arg_uid_shift != UID_INVALID)
1471 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1472 return log_error_errno(errno, "chown() of device node %s failed: %m", to);
88213476 1473 }
88213476
LP
1474 }
1475
e58a1277
LP
1476 return r;
1477}
88213476 1478
f2d88580
LP
1479static int setup_ptmx(const char *dest) {
1480 _cleanup_free_ char *p = NULL;
1481
1482 p = strappend(dest, "/dev/ptmx");
1483 if (!p)
1484 return log_oom();
1485
4a62c710
MS
1486 if (symlink("pts/ptmx", p) < 0)
1487 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
f2d88580 1488
6278cf60
LP
1489 if (arg_userns && arg_uid_shift != UID_INVALID)
1490 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1491 return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
1492
f2d88580
LP
1493 return 0;
1494}
1495
e58a1277 1496static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1497 _cleanup_umask_ mode_t u;
1498 const char *to;
e58a1277 1499 int r;
e58a1277
LP
1500
1501 assert(dest);
1502 assert(console);
1503
1504 u = umask(0000);
1505
e58a1277 1506 r = chmod_and_chown(console, 0600, 0, 0);
f647962d
MS
1507 if (r < 0)
1508 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1509
a258bf26
LP
1510 /* We need to bind mount the right tty to /dev/console since
1511 * ptys can only exist on pts file systems. To have something
81f5049b 1512 * to bind mount things on we create a empty regular file. */
a258bf26 1513
63c372cb 1514 to = strjoina(dest, "/dev/console");
81f5049b
AC
1515 r = touch(to);
1516 if (r < 0)
1517 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1518
4543768d 1519 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1520 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1521
25ea79fe 1522 return 0;
e58a1277
LP
1523}
1524
1525static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1526 _cleanup_free_ char *from = NULL, *to = NULL;
7fd1b19b 1527 _cleanup_umask_ mode_t u;
6d0b55c2 1528 int r, fd, k;
e58a1277
LP
1529 union {
1530 struct cmsghdr cmsghdr;
1531 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1532 } control = {};
1533 struct msghdr mh = {
1534 .msg_control = &control,
1535 .msg_controllen = sizeof(control),
1536 };
e58a1277
LP
1537 struct cmsghdr *cmsg;
1538
1539 assert(dest);
1540 assert(kmsg_socket >= 0);
a258bf26 1541
e58a1277 1542 u = umask(0000);
a258bf26 1543
f1e5dfe2
LP
1544 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1545 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1546 * on the reading side behave very similar to /proc/kmsg,
1547 * their writing side behaves differently from /dev/kmsg in
1548 * that writing blocks when nothing is reading. In order to
1549 * avoid any problems with containers deadlocking due to this
1550 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1551 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1552 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1553 return log_oom();
e58a1277 1554
4a62c710
MS
1555 if (mkfifo(from, 0600) < 0)
1556 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
e58a1277
LP
1557
1558 r = chmod_and_chown(from, 0600, 0, 0);
f647962d
MS
1559 if (r < 0)
1560 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
e58a1277 1561
4543768d 1562 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1563 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1564
1565 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1566 if (fd < 0)
1567 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1568
e58a1277
LP
1569 cmsg = CMSG_FIRSTHDR(&mh);
1570 cmsg->cmsg_level = SOL_SOCKET;
1571 cmsg->cmsg_type = SCM_RIGHTS;
1572 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1573 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1574
1575 mh.msg_controllen = cmsg->cmsg_len;
1576
1577 /* Store away the fd in the socket, so that it stays open as
1578 * long as we run the child */
6d0b55c2 1579 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
03e334a1 1580 safe_close(fd);
e58a1277 1581
4a62c710
MS
1582 if (k < 0)
1583 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 1584
f1e5dfe2
LP
1585 /* And now make the FIFO unavailable as /dev/kmsg... */
1586 unlink(from);
25ea79fe 1587 return 0;
88213476
LP
1588}
1589
6d0b55c2
LP
1590static int send_rtnl(int send_fd) {
1591 union {
1592 struct cmsghdr cmsghdr;
1593 uint8_t buf[CMSG_SPACE(sizeof(int))];
1594 } control = {};
1595 struct msghdr mh = {
1596 .msg_control = &control,
1597 .msg_controllen = sizeof(control),
1598 };
1599 struct cmsghdr *cmsg;
1600 _cleanup_close_ int fd = -1;
1601 ssize_t k;
1602
1603 assert(send_fd >= 0);
1604
1605 if (!arg_expose_ports)
1606 return 0;
1607
1608 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1609 if (fd < 0)
1610 return log_error_errno(errno, "failed to allocate container netlink: %m");
1611
1612 cmsg = CMSG_FIRSTHDR(&mh);
1613 cmsg->cmsg_level = SOL_SOCKET;
1614 cmsg->cmsg_type = SCM_RIGHTS;
1615 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1616 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1617
1618 mh.msg_controllen = cmsg->cmsg_len;
1619
1620 /* Store away the fd in the socket, so that it stays open as
1621 * long as we run the child */
1622 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1623 if (k < 0)
1624 return log_error_errno(errno, "Failed to send netlink fd: %m");
1625
1626 return 0;
1627}
1628
1629static int flush_ports(union in_addr_union *exposed) {
1630 ExposePort *p;
1631 int r, af = AF_INET;
1632
1633 assert(exposed);
1634
1635 if (!arg_expose_ports)
1636 return 0;
1637
1638 if (in_addr_is_null(af, exposed))
1639 return 0;
1640
1641 log_debug("Lost IP address.");
1642
1643 LIST_FOREACH(ports, p, arg_expose_ports) {
1644 r = fw_add_local_dnat(false,
1645 af,
1646 p->protocol,
1647 NULL,
1648 NULL, 0,
1649 NULL, 0,
1650 p->host_port,
1651 exposed,
1652 p->container_port,
1653 NULL);
1654 if (r < 0)
1655 log_warning_errno(r, "Failed to modify firewall: %m");
1656 }
1657
1658 *exposed = IN_ADDR_NULL;
1659 return 0;
1660}
1661
1662static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1663 _cleanup_free_ struct local_address *addresses = NULL;
1664 _cleanup_free_ char *pretty = NULL;
1665 union in_addr_union new_exposed;
1666 ExposePort *p;
1667 bool add;
1668 int af = AF_INET, r;
1669
1670 assert(exposed);
1671
1672 /* Invoked each time an address is added or removed inside the
1673 * container */
1674
1675 if (!arg_expose_ports)
1676 return 0;
1677
1678 r = local_addresses(rtnl, 0, af, &addresses);
1679 if (r < 0)
1680 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1681
1682 add = r > 0 &&
1683 addresses[0].family == af &&
1684 addresses[0].scope < RT_SCOPE_LINK;
1685
1686 if (!add)
1687 return flush_ports(exposed);
1688
1689 new_exposed = addresses[0].address;
1690 if (in_addr_equal(af, exposed, &new_exposed))
1691 return 0;
1692
1693 in_addr_to_string(af, &new_exposed, &pretty);
1694 log_debug("New container IP is %s.", strna(pretty));
1695
1696 LIST_FOREACH(ports, p, arg_expose_ports) {
1697
1698 r = fw_add_local_dnat(true,
1699 af,
1700 p->protocol,
1701 NULL,
1702 NULL, 0,
1703 NULL, 0,
1704 p->host_port,
1705 &new_exposed,
1706 p->container_port,
1707 in_addr_is_null(af, exposed) ? NULL : exposed);
1708 if (r < 0)
1709 log_warning_errno(r, "Failed to modify firewall: %m");
1710 }
1711
1712 *exposed = new_exposed;
1713 return 0;
1714}
1715
1716static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1717 union in_addr_union *exposed = userdata;
1718
1719 assert(rtnl);
1720 assert(m);
1721 assert(exposed);
1722
1723 expose_ports(rtnl, exposed);
1724 return 0;
1725}
1726
1727static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1728 union {
1729 struct cmsghdr cmsghdr;
1730 uint8_t buf[CMSG_SPACE(sizeof(int))];
1731 } control = {};
1732 struct msghdr mh = {
1733 .msg_control = &control,
1734 .msg_controllen = sizeof(control),
1735 };
1736 struct cmsghdr *cmsg;
1737 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1738 int fd, r;
1739 ssize_t k;
1740
1741 assert(event);
1742 assert(recv_fd >= 0);
1743 assert(ret);
1744
1745 if (!arg_expose_ports)
1746 return 0;
1747
1748 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1749 if (k < 0)
1750 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1751
1752 cmsg = CMSG_FIRSTHDR(&mh);
1753 assert(cmsg->cmsg_level == SOL_SOCKET);
1754 assert(cmsg->cmsg_type == SCM_RIGHTS);
657bdca9 1755 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
6d0b55c2
LP
1756 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1757
1758 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1759 if (r < 0) {
1760 safe_close(fd);
1761 return log_error_errno(r, "Failed to create rtnl object: %m");
1762 }
1763
1764 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1765 if (r < 0)
1766 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1767
1768 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1769 if (r < 0)
1770 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1771
1772 r = sd_rtnl_attach_event(rtnl, event, 0);
1773 if (r < 0)
1774 return log_error_errno(r, "Failed to add to even loop: %m");
1775
1776 *ret = rtnl;
1777 rtnl = NULL;
1778
1779 return 0;
1780}
1781
3a74cea5 1782static int setup_hostname(void) {
3a74cea5 1783
eb91eb18
LP
1784 if (arg_share_system)
1785 return 0;
1786
605f81a8 1787 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1788 return -errno;
3a74cea5 1789
7027ff61 1790 return 0;
3a74cea5
LP
1791}
1792
57fb9fb5 1793static int setup_journal(const char *directory) {
4d680aee 1794 sd_id128_t machine_id, this_id;
7fd1b19b 1795 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1796 char *id;
57fb9fb5
LP
1797 int r;
1798
df9a75e4
LP
1799 /* Don't link journals in ephemeral mode */
1800 if (arg_ephemeral)
1801 return 0;
1802
57fb9fb5 1803 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1804 if (!p)
1805 return log_oom();
57fb9fb5
LP
1806
1807 r = read_one_line_file(p, &b);
27407a01
ZJS
1808 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1809 return 0;
f647962d
MS
1810 else if (r < 0)
1811 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
57fb9fb5 1812
27407a01
ZJS
1813 id = strstrip(b);
1814 if (isempty(id) && arg_link_journal == LINK_AUTO)
1815 return 0;
57fb9fb5 1816
27407a01
ZJS
1817 /* Verify validity */
1818 r = sd_id128_from_string(id, &machine_id);
f647962d
MS
1819 if (r < 0)
1820 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
57fb9fb5 1821
4d680aee 1822 r = sd_id128_get_machine(&this_id);
f647962d
MS
1823 if (r < 0)
1824 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1825
1826 if (sd_id128_equal(machine_id, this_id)) {
1827 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1828 "Host and machine ids are equal (%s): refusing to link journals", id);
1829 if (arg_link_journal == LINK_AUTO)
1830 return 0;
df9a75e4 1831 return -EEXIST;
4d680aee
ZJS
1832 }
1833
1834 if (arg_link_journal == LINK_NO)
1835 return 0;
1836
57fb9fb5 1837 free(p);
27407a01
ZJS
1838 p = strappend("/var/log/journal/", id);
1839 q = strjoin(directory, "/var/log/journal/", id, NULL);
1840 if (!p || !q)
1841 return log_oom();
1842
1843 if (path_is_mount_point(p, false) > 0) {
1844 if (arg_link_journal != LINK_AUTO) {
1845 log_error("%s: already a mount point, refusing to use for journal", p);
1846 return -EEXIST;
1847 }
1848
1849 return 0;
57fb9fb5
LP
1850 }
1851
27407a01 1852 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1853 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1854 log_error("%s: already a mount point, refusing to use for journal", q);
1855 return -EEXIST;
57fb9fb5
LP
1856 }
1857
27407a01 1858 return 0;
57fb9fb5
LP
1859 }
1860
1861 r = readlink_and_make_absolute(p, &d);
1862 if (r >= 0) {
1863 if ((arg_link_journal == LINK_GUEST ||
1864 arg_link_journal == LINK_AUTO) &&
1865 path_equal(d, q)) {
1866
27407a01
ZJS
1867 r = mkdir_p(q, 0755);
1868 if (r < 0)
56f64d95 1869 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1870 return 0;
57fb9fb5
LP
1871 }
1872
4a62c710
MS
1873 if (unlink(p) < 0)
1874 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1875 } else if (r == -EINVAL) {
1876
1877 if (arg_link_journal == LINK_GUEST &&
1878 rmdir(p) < 0) {
1879
27407a01
ZJS
1880 if (errno == ENOTDIR) {
1881 log_error("%s already exists and is neither a symlink nor a directory", p);
1882 return r;
1883 } else {
56f64d95 1884 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1885 return -errno;
57fb9fb5 1886 }
57fb9fb5
LP
1887 }
1888 } else if (r != -ENOENT) {
56f64d95 1889 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1890 return r;
57fb9fb5
LP
1891 }
1892
1893 if (arg_link_journal == LINK_GUEST) {
1894
1895 if (symlink(q, p) < 0) {
574edc90 1896 if (arg_link_journal_try) {
56f64d95 1897 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1898 return 0;
1899 } else {
56f64d95 1900 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1901 return -errno;
1902 }
57fb9fb5
LP
1903 }
1904
27407a01
ZJS
1905 r = mkdir_p(q, 0755);
1906 if (r < 0)
56f64d95 1907 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1908 return 0;
57fb9fb5
LP
1909 }
1910
1911 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1912 /* don't create parents here -- if the host doesn't have
1913 * permanent journal set up, don't force it here */
1914 r = mkdir(p, 0755);
57fb9fb5 1915 if (r < 0) {
574edc90 1916 if (arg_link_journal_try) {
56f64d95 1917 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1918 return 0;
1919 } else {
56f64d95 1920 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1921 return r;
1922 }
57fb9fb5
LP
1923 }
1924
27407a01
ZJS
1925 } else if (access(p, F_OK) < 0)
1926 return 0;
57fb9fb5 1927
cdb2b9d0
LP
1928 if (dir_is_empty(q) == 0)
1929 log_warning("%s is not empty, proceeding anyway.", q);
1930
57fb9fb5
LP
1931 r = mkdir_p(q, 0755);
1932 if (r < 0) {
56f64d95 1933 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1934 return r;
57fb9fb5
LP
1935 }
1936
4543768d 1937 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 1938 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1939
27407a01 1940 return 0;
57fb9fb5
LP
1941}
1942
88213476 1943static int drop_capabilities(void) {
5076f0cc 1944 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1945}
1946
5aa4bb6b 1947static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 1948 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
24996861 1949 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
1950 int r;
1951
eb91eb18
LP
1952 if (!arg_register)
1953 return 0;
1954
1c03020c 1955 r = sd_bus_default_system(&bus);
f647962d
MS
1956 if (r < 0)
1957 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 1958
89f7c846
LP
1959 if (arg_keep_unit) {
1960 r = sd_bus_call_method(
1961 bus,
1962 "org.freedesktop.machine1",
1963 "/org/freedesktop/machine1",
1964 "org.freedesktop.machine1.Manager",
5aa4bb6b 1965 "RegisterMachineWithNetwork",
89f7c846
LP
1966 &error,
1967 NULL,
5aa4bb6b 1968 "sayssusai",
89f7c846
LP
1969 arg_machine,
1970 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1971 "nspawn",
1972 "container",
1973 (uint32_t) pid,
5aa4bb6b
LP
1974 strempty(arg_directory),
1975 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 1976 } else {
9457ac5b 1977 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
f36933fe 1978 char **i;
9457ac5b
LP
1979
1980 r = sd_bus_message_new_method_call(
89f7c846 1981 bus,
9457ac5b 1982 &m,
89f7c846
LP
1983 "org.freedesktop.machine1",
1984 "/org/freedesktop/machine1",
1985 "org.freedesktop.machine1.Manager",
5aa4bb6b 1986 "CreateMachineWithNetwork");
f647962d 1987 if (r < 0)
f36933fe 1988 return bus_log_create_error(r);
9457ac5b
LP
1989
1990 r = sd_bus_message_append(
1991 m,
5aa4bb6b 1992 "sayssusai",
89f7c846
LP
1993 arg_machine,
1994 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1995 "nspawn",
1996 "container",
1997 (uint32_t) pid,
5aa4bb6b
LP
1998 strempty(arg_directory),
1999 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d 2000 if (r < 0)
f36933fe 2001 return bus_log_create_error(r);
9457ac5b
LP
2002
2003 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d 2004 if (r < 0)
f36933fe 2005 return bus_log_create_error(r);
9457ac5b
LP
2006
2007 if (!isempty(arg_slice)) {
2008 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d 2009 if (r < 0)
f36933fe 2010 return bus_log_create_error(r);
9457ac5b
LP
2011 }
2012
2013 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d 2014 if (r < 0)
f36933fe 2015 return bus_log_create_error(r);
9457ac5b 2016
773ce3d8
LP
2017 /* If you make changes here, also make sure to update
2018 * systemd-nspawn@.service, to keep the device
2019 * policies in sync regardless if we are run with or
2020 * without the --keep-unit switch. */
63cc4c31 2021 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
2022 /* Allow the container to
2023 * access and create the API
2024 * device nodes, so that
2025 * PrivateDevices= in the
2026 * container can work
2027 * fine */
2028 "/dev/null", "rwm",
2029 "/dev/zero", "rwm",
2030 "/dev/full", "rwm",
2031 "/dev/random", "rwm",
2032 "/dev/urandom", "rwm",
2033 "/dev/tty", "rwm",
864e1706 2034 "/dev/net/tun", "rwm",
9457ac5b
LP
2035 /* Allow the container
2036 * access to ptys. However,
2037 * do not permit the
2038 * container to ever create
2039 * these device nodes. */
2040 "/dev/pts/ptmx", "rw",
63cc4c31 2041 "char-pts", "rw");
f647962d
MS
2042 if (r < 0)
2043 return log_error_errno(r, "Failed to add device whitelist: %m");
9457ac5b 2044
f36933fe
LP
2045 STRV_FOREACH(i, arg_property) {
2046 r = sd_bus_message_open_container(m, 'r', "sv");
2047 if (r < 0)
2048 return bus_log_create_error(r);
2049
2050 r = bus_append_unit_property_assignment(m, *i);
2051 if (r < 0)
2052 return r;
2053
2054 r = sd_bus_message_close_container(m);
2055 if (r < 0)
2056 return bus_log_create_error(r);
2057 }
2058
9457ac5b 2059 r = sd_bus_message_close_container(m);
f647962d 2060 if (r < 0)
f36933fe 2061 return bus_log_create_error(r);
9457ac5b
LP
2062
2063 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
2064 }
2065
9444b1f2 2066 if (r < 0) {
1f0cd86b
LP
2067 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2068 return r;
2069 }
2070
2071 return 0;
2072}
2073
2074static int terminate_machine(pid_t pid) {
2075 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2076 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
24996861 2077 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
2078 const char *path;
2079 int r;
2080
eb91eb18
LP
2081 if (!arg_register)
2082 return 0;
2083
1a2399e5
LP
2084 /* If we are reusing the unit, then just exit, systemd will do
2085 * the right thing when we exit. */
2086 if (arg_keep_unit)
2087 return 0;
2088
76b54375 2089 r = sd_bus_default_system(&bus);
f647962d
MS
2090 if (r < 0)
2091 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
2092
2093 r = sd_bus_call_method(
2094 bus,
2095 "org.freedesktop.machine1",
2096 "/org/freedesktop/machine1",
2097 "org.freedesktop.machine1.Manager",
2098 "GetMachineByPID",
2099 &error,
2100 &reply,
2101 "u",
2102 (uint32_t) pid);
2103 if (r < 0) {
2104 /* Note that the machine might already have been
2105 * cleaned up automatically, hence don't consider it a
2106 * failure if we cannot get the machine object. */
2107 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2108 return 0;
2109 }
2110
2111 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
2112 if (r < 0)
2113 return bus_log_parse_error(r);
9444b1f2 2114
1f0cd86b
LP
2115 r = sd_bus_call_method(
2116 bus,
2117 "org.freedesktop.machine1",
2118 path,
2119 "org.freedesktop.machine1.Machine",
2120 "Terminate",
2121 &error,
2122 NULL,
2123 NULL);
2124 if (r < 0) {
2125 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2126 return 0;
2127 }
2128
9444b1f2
LP
2129 return 0;
2130}
2131
db999e0f
LP
2132static int reset_audit_loginuid(void) {
2133 _cleanup_free_ char *p = NULL;
2134 int r;
2135
2136 if (arg_share_system)
2137 return 0;
2138
2139 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2140 if (r == -ENOENT)
db999e0f 2141 return 0;
f647962d
MS
2142 if (r < 0)
2143 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2144
2145 /* Already reset? */
2146 if (streq(p, "4294967295"))
2147 return 0;
2148
2149 r = write_string_file("/proc/self/loginuid", "4294967295");
2150 if (r < 0) {
10a87006
LP
2151 log_error_errno(r,
2152 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2153 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2154 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2155 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2156 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2157
db999e0f 2158 sleep(5);
77b6e194 2159 }
db999e0f
LP
2160
2161 return 0;
77b6e194
LP
2162}
2163
4f758c23
LP
2164#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2165#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
e867ceb6 2166#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 2167
a90e2305 2168static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
2169 uint8_t result[8];
2170 size_t l, sz;
a90e2305
LP
2171 uint8_t *v, *i;
2172 int r;
01dde061
TG
2173
2174 l = strlen(arg_machine);
2175 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
2176 if (idx > 0)
2177 sz += sizeof(idx);
a90e2305 2178
01dde061
TG
2179 v = alloca(sz);
2180
2181 /* fetch some persistent data unique to the host */
2182 r = sd_id128_get_machine((sd_id128_t*) v);
2183 if (r < 0)
2184 return r;
2185
2186 /* combine with some data unique (on this host) to this
2187 * container instance */
a90e2305
LP
2188 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2189 if (idx > 0) {
2190 idx = htole64(idx);
2191 memcpy(i, &idx, sizeof(idx));
2192 }
01dde061
TG
2193
2194 /* Let's hash the host machine ID plus the container name. We
2195 * use a fixed, but originally randomly created hash key here. */
4f758c23 2196 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
2197
2198 assert_cc(ETH_ALEN <= sizeof(result));
2199 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2200
2201 /* see eth_random_addr in the kernel */
2202 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2203 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2204
2205 return 0;
2206}
2207
5aa4bb6b 2208static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
69c79d3c 2209 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 2210 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4f758c23 2211 struct ether_addr mac_host, mac_container;
5aa4bb6b 2212 int r, i;
69c79d3c
LP
2213
2214 if (!arg_private_network)
2215 return 0;
2216
2217 if (!arg_network_veth)
2218 return 0;
2219
08af0da2
LP
2220 /* Use two different interface name prefixes depending whether
2221 * we are in bridge mode or not. */
c00524c9 2222 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 2223 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 2224
e867ceb6
LP
2225 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2226 if (r < 0)
2227 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 2228
e867ceb6
LP
2229 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2230 if (r < 0)
2231 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 2232
151b9b96 2233 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2234 if (r < 0)
2235 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 2236
151b9b96 2237 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2238 if (r < 0)
2239 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 2240
ab046dde 2241 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
2242 if (r < 0)
2243 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 2244
4f758c23 2245 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
2246 if (r < 0)
2247 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 2248
ee3a6a51 2249 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2250 if (r < 0)
2251 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2252
d8e538ec 2253 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
2254 if (r < 0)
2255 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2256
ee3a6a51 2257 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
2258 if (r < 0)
2259 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2260
ab046dde 2261 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
2262 if (r < 0)
2263 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 2264
4f758c23 2265 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
2266 if (r < 0)
2267 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 2268
ab046dde 2269 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2270 if (r < 0)
2271 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c
LP
2272
2273 r = sd_rtnl_message_close_container(m);
f647962d
MS
2274 if (r < 0)
2275 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2276
2277 r = sd_rtnl_message_close_container(m);
f647962d
MS
2278 if (r < 0)
2279 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2280
2281 r = sd_rtnl_message_close_container(m);
f647962d
MS
2282 if (r < 0)
2283 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2284
2285 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2286 if (r < 0)
2287 return log_error_errno(r, "Failed to add new veth interfaces: %m");
69c79d3c 2288
5aa4bb6b 2289 i = (int) if_nametoindex(iface_name);
4a62c710
MS
2290 if (i <= 0)
2291 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
2292
2293 *ifi = i;
2294
69c79d3c
LP
2295 return 0;
2296}
2297
5aa4bb6b 2298static int setup_bridge(const char veth_name[], int *ifi) {
ab046dde
TG
2299 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2300 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2301 int r, bridge;
2302
2303 if (!arg_private_network)
2304 return 0;
2305
2306 if (!arg_network_veth)
2307 return 0;
2308
2309 if (!arg_network_bridge)
2310 return 0;
2311
2312 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
2313 if (bridge <= 0)
2314 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 2315
5aa4bb6b
LP
2316 *ifi = bridge;
2317
151b9b96 2318 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2319 if (r < 0)
2320 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 2321
151b9b96 2322 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
2323 if (r < 0)
2324 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 2325
039dd4af 2326 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
2327 if (r < 0)
2328 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 2329
ab046dde 2330 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
2331 if (r < 0)
2332 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde
TG
2333
2334 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
2335 if (r < 0)
2336 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde
TG
2337
2338 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2339 if (r < 0)
2340 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
2341
2342 return 0;
2343}
2344
c74e630d
LP
2345static int parse_interface(struct udev *udev, const char *name) {
2346 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2347 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2348 int ifi;
2349
2350 ifi = (int) if_nametoindex(name);
4a62c710
MS
2351 if (ifi <= 0)
2352 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
2353
2354 sprintf(ifi_str, "n%i", ifi);
2355 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
2356 if (!d)
2357 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
2358
2359 if (udev_device_get_is_initialized(d) <= 0) {
2360 log_error("Network interface %s is not initialized yet.", name);
2361 return -EBUSY;
2362 }
2363
2364 return ifi;
2365}
2366
69c79d3c 2367static int move_network_interfaces(pid_t pid) {
7e227024 2368 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 2369 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
2370 char **i;
2371 int r;
2372
2373 if (!arg_private_network)
2374 return 0;
2375
2376 if (strv_isempty(arg_network_interfaces))
2377 return 0;
2378
151b9b96 2379 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2380 if (r < 0)
2381 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 2382
7e227024
LP
2383 udev = udev_new();
2384 if (!udev) {
2385 log_error("Failed to connect to udev.");
2386 return -ENOMEM;
2387 }
2388
aa28aefe 2389 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 2390 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 2391 int ifi;
aa28aefe 2392
c74e630d
LP
2393 ifi = parse_interface(udev, *i);
2394 if (ifi < 0)
2395 return ifi;
2396
3125b3ef 2397 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
2398 if (r < 0)
2399 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2400
c74e630d 2401 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2402 if (r < 0)
2403 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 2404
c74e630d 2405 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2406 if (r < 0)
2407 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 2408 }
7e227024 2409
c74e630d
LP
2410 return 0;
2411}
2412
2413static int setup_macvlan(pid_t pid) {
2414 _cleanup_udev_unref_ struct udev *udev = NULL;
2415 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
e867ceb6 2416 unsigned idx = 0;
c74e630d
LP
2417 char **i;
2418 int r;
2419
2420 if (!arg_private_network)
2421 return 0;
2422
2423 if (strv_isempty(arg_network_macvlan))
2424 return 0;
2425
2426 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2427 if (r < 0)
2428 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
2429
2430 udev = udev_new();
2431 if (!udev) {
2432 log_error("Failed to connect to udev.");
2433 return -ENOMEM;
2434 }
2435
2436 STRV_FOREACH(i, arg_network_macvlan) {
2437 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2438 _cleanup_free_ char *n = NULL;
e867ceb6 2439 struct ether_addr mac;
c74e630d
LP
2440 int ifi;
2441
2442 ifi = parse_interface(udev, *i);
2443 if (ifi < 0)
2444 return ifi;
2445
e867ceb6
LP
2446 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2447 if (r < 0)
2448 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2449
c74e630d 2450 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2451 if (r < 0)
2452 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2453
c74e630d 2454 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
2455 if (r < 0)
2456 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
2457
2458 n = strappend("mv-", *i);
2459 if (!n)
2460 return log_oom();
2461
2462 strshorten(n, IFNAMSIZ-1);
2463
2464 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
2465 if (r < 0)
2466 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 2467
e867ceb6
LP
2468 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2469 if (r < 0)
2470 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2471
aa28aefe 2472 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2473 if (r < 0)
2474 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d
LP
2475
2476 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2477 if (r < 0)
2478 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2479
d8e538ec 2480 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
2481 if (r < 0)
2482 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d
LP
2483
2484 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
2485 if (r < 0)
2486 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d
LP
2487
2488 r = sd_rtnl_message_close_container(m);
f647962d
MS
2489 if (r < 0)
2490 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d
LP
2491
2492 r = sd_rtnl_message_close_container(m);
f647962d
MS
2493 if (r < 0)
2494 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe
LP
2495
2496 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2497 if (r < 0)
2498 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
2499 }
2500
2501 return 0;
2502}
2503
4bbfe7ad
TG
2504static int setup_ipvlan(pid_t pid) {
2505 _cleanup_udev_unref_ struct udev *udev = NULL;
2506 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2507 char **i;
2508 int r;
2509
2510 if (!arg_private_network)
2511 return 0;
2512
2513 if (strv_isempty(arg_network_ipvlan))
2514 return 0;
2515
2516 r = sd_rtnl_open(&rtnl, 0);
2517 if (r < 0)
2518 return log_error_errno(r, "Failed to connect to netlink: %m");
2519
2520 udev = udev_new();
2521 if (!udev) {
2522 log_error("Failed to connect to udev.");
2523 return -ENOMEM;
2524 }
2525
2526 STRV_FOREACH(i, arg_network_ipvlan) {
2527 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2528 _cleanup_free_ char *n = NULL;
2529 int ifi;
2530
2531 ifi = parse_interface(udev, *i);
2532 if (ifi < 0)
2533 return ifi;
2534
2535 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2536 if (r < 0)
2537 return log_error_errno(r, "Failed to allocate netlink message: %m");
2538
2539 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2540 if (r < 0)
2541 return log_error_errno(r, "Failed to add netlink interface index: %m");
2542
2543 n = strappend("iv-", *i);
2544 if (!n)
2545 return log_oom();
2546
2547 strshorten(n, IFNAMSIZ-1);
2548
2549 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2550 if (r < 0)
2551 return log_error_errno(r, "Failed to add netlink interface name: %m");
2552
2553 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2554 if (r < 0)
2555 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2556
2557 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2558 if (r < 0)
2559 return log_error_errno(r, "Failed to open netlink container: %m");
2560
2561 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2562 if (r < 0)
2563 return log_error_errno(r, "Failed to open netlink container: %m");
2564
2565 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2566 if (r < 0)
2567 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2568
2569 r = sd_rtnl_message_close_container(m);
2570 if (r < 0)
2571 return log_error_errno(r, "Failed to close netlink container: %m");
2572
2573 r = sd_rtnl_message_close_container(m);
2574 if (r < 0)
2575 return log_error_errno(r, "Failed to close netlink container: %m");
2576
2577 r = sd_rtnl_call(rtnl, m, 0, NULL);
2578 if (r < 0)
2579 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2580 }
2581
2582 return 0;
2583}
2584
/* Builds and loads the seccomp filter for the container.
 *
 * The filter allows everything by default. It makes each system call in
 * the table below fail with EPERM unless the capability paired with it is
 * set in arg_retain, and makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail
 * with EAFNOSUPPORT. When built without HAVE_SECCOMP this is a no-op that
 * returns 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)},
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)},
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)},
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)},
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)},
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at)},
                { CAP_SYS_MODULE, SCMP_SYS(init_module)},
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)},
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)},
        };

        scmp_filter_ctx seccomp;
        unsigned k;
        int r;

        seccomp = seccomp_init(SCMP_ACT_ALLOW);
        if (!seccomp)
                return log_oom();

        r = seccomp_add_secondary_archs(seccomp);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                /* A retained capability keeps its system calls
                 * available. */
                if (arg_retain & (1ULL << blacklist[k].capability))
                        continue;

                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);

                /* -EFAULT stands for an unknown syscall and is
                 * skipped silently; anything else is fatal. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        seccomp,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(seccomp);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on success and failure
         * alike. */
        seccomp_release(seccomp);
        return r;
#else
        return 0;
#endif

}
2671
785890ac
LP
2672static int setup_propagate(const char *root) {
2673 const char *p, *q;
2674
2675 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2676 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2677 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2678 (void) mkdir_p(p, 0600);
2679
63c372cb 2680 q = strjoina(root, "/run/systemd/nspawn/incoming");
785890ac
LP
2681 mkdir_parents(q, 0755);
2682 mkdir_p(q, 0600);
2683
2684 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2685 return log_error_errno(errno, "Failed to install propagation bind mount.");
2686
2687 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2688 return log_error_errno(errno, "Failed to make propagation mount read-only");
2689
2690 return 0;
2691}
2692
1b9e5b12
LP
2693static int setup_image(char **device_path, int *loop_nr) {
2694 struct loop_info64 info = {
2695 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2696 };
2697 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2698 _cleanup_free_ char* loopdev = NULL;
2699 struct stat st;
2700 int r, nr;
2701
2702 assert(device_path);
2703 assert(loop_nr);
ec16945e 2704 assert(arg_image);
1b9e5b12
LP
2705
2706 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2707 if (fd < 0)
2708 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 2709
4a62c710
MS
2710 if (fstat(fd, &st) < 0)
2711 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
2712
2713 if (S_ISBLK(st.st_mode)) {
2714 char *p;
2715
2716 p = strdup(arg_image);
2717 if (!p)
2718 return log_oom();
2719
2720 *device_path = p;
2721
2722 *loop_nr = -1;
2723
2724 r = fd;
2725 fd = -1;
2726
2727 return r;
2728 }
2729
2730 if (!S_ISREG(st.st_mode)) {
56f64d95 2731 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
2732 return -EINVAL;
2733 }
2734
2735 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
2736 if (control < 0)
2737 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
2738
2739 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
2740 if (nr < 0)
2741 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
2742
2743 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2744 return log_oom();
2745
2746 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2747 if (loop < 0)
2748 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 2749
4a62c710
MS
2750 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2751 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
2752
2753 if (arg_read_only)
2754 info.lo_flags |= LO_FLAGS_READ_ONLY;
2755
4a62c710
MS
2756 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2757 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
2758
2759 *device_path = loopdev;
2760 loopdev = NULL;
2761
2762 *loop_nr = nr;
2763
2764 r = loop;
2765 loop = -1;
2766
2767 return r;
2768}
2769
ada4799a
LP
/* Explanatory text appended to the error messages dissect_image() logs when
 * it finds no partition table, or neither a GPT nor an MBR one, on the disk
 * image. */
#define PARTITION_TABLE_BLURB                                           \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"              \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2776
1b9e5b12
LP
2777static int dissect_image(
2778 int fd,
727fd4fd
LP
2779 char **root_device, bool *root_device_rw,
2780 char **home_device, bool *home_device_rw,
2781 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
2782 bool *secondary) {
2783
2784#ifdef HAVE_BLKID
01dc33ce
ZJS
2785 int home_nr = -1, srv_nr = -1;
2786#ifdef GPT_ROOT_NATIVE
2787 int root_nr = -1;
2788#endif
2789#ifdef GPT_ROOT_SECONDARY
2790 int secondary_root_nr = -1;
2791#endif
f6c51a81 2792 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1b9e5b12
LP
2793 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2794 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2795 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2796 _cleanup_udev_unref_ struct udev *udev = NULL;
2797 struct udev_list_entry *first, *item;
f6c51a81 2798 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 2799 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
2800 const char *pttype = NULL;
2801 blkid_partlist pl;
2802 struct stat st;
c09ef2e4 2803 unsigned i;
1b9e5b12
LP
2804 int r;
2805
2806 assert(fd >= 0);
2807 assert(root_device);
2808 assert(home_device);
2809 assert(srv_device);
2810 assert(secondary);
ec16945e 2811 assert(arg_image);
1b9e5b12
LP
2812
2813 b = blkid_new_probe();
2814 if (!b)
2815 return log_oom();
2816
2817 errno = 0;
2818 r = blkid_probe_set_device(b, fd, 0, 0);
2819 if (r != 0) {
2820 if (errno == 0)
2821 return log_oom();
2822
56f64d95 2823 log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
2824 return -errno;
2825 }
2826
2827 blkid_probe_enable_partitions(b, 1);
2828 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2829
2830 errno = 0;
2831 r = blkid_do_safeprobe(b);
2832 if (r == -2 || r == 1) {
ada4799a
LP
2833 log_error("Failed to identify any partition table on\n"
2834 " %s\n"
2835 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
2836 return -EINVAL;
2837 } else if (r != 0) {
2838 if (errno == 0)
2839 errno = EIO;
56f64d95 2840 log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
2841 return -errno;
2842 }
2843
48861960 2844 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
2845
2846 is_gpt = streq_ptr(pttype, "gpt");
2847 is_mbr = streq_ptr(pttype, "dos");
2848
2849 if (!is_gpt && !is_mbr) {
2850 log_error("No GPT or MBR partition table discovered on\n"
2851 " %s\n"
2852 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
2853 return -EINVAL;
2854 }
2855
2856 errno = 0;
2857 pl = blkid_probe_get_partitions(b);
2858 if (!pl) {
2859 if (errno == 0)
2860 return log_oom();
2861
2862 log_error("Failed to list partitions of %s", arg_image);
2863 return -errno;
2864 }
2865
2866 udev = udev_new();
2867 if (!udev)
2868 return log_oom();
2869
4a62c710
MS
2870 if (fstat(fd, &st) < 0)
2871 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 2872
c09ef2e4
LP
2873 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2874 if (!d)
1b9e5b12
LP
2875 return log_oom();
2876
c09ef2e4
LP
2877 for (i = 0;; i++) {
2878 int n, m;
1b9e5b12 2879
c09ef2e4
LP
2880 if (i >= 10) {
2881 log_error("Kernel partitions never appeared.");
2882 return -ENXIO;
2883 }
2884
2885 e = udev_enumerate_new(udev);
2886 if (!e)
2887 return log_oom();
2888
2889 r = udev_enumerate_add_match_parent(e, d);
2890 if (r < 0)
2891 return log_oom();
2892
2893 r = udev_enumerate_scan_devices(e);
2894 if (r < 0)
2895 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2896
2897 /* Count the partitions enumerated by the kernel */
2898 n = 0;
2899 first = udev_enumerate_get_list_entry(e);
2900 udev_list_entry_foreach(item, first)
2901 n++;
2902
2903 /* Count the partitions enumerated by blkid */
2904 m = blkid_partlist_numof_partitions(pl);
2905 if (n == m + 1)
2906 break;
2907 if (n > m + 1) {
2908 log_error("blkid and kernel partition list do not match.");
2909 return -EIO;
2910 }
2911 if (n < m + 1) {
2912 unsigned j;
2913
2914 /* The kernel has probed fewer partitions than
2915 * blkid? Maybe the kernel prober is still
2916 * running or it got EBUSY because udev
2917 * already opened the device. Let's reprobe
2918 * the device, which is a synchronous call
2919 * that waits until probing is complete. */
2920
2921 for (j = 0; j < 20; j++) {
2922
2923 r = ioctl(fd, BLKRRPART, 0);
2924 if (r < 0)
2925 r = -errno;
2926 if (r >= 0 || r != -EBUSY)
2927 break;
2928
2929 /* If something else has the device
2930 * open, such as an udev rule, the
2931 * ioctl will return EBUSY. Since
2932 * there's no way to wait until it
2933 * isn't busy anymore, let's just wait
2934 * a bit, and try again.
2935 *
2936 * This is really something they
2937 * should fix in the kernel! */
2938
2939 usleep(50 * USEC_PER_MSEC);
2940 }
2941
2942 if (r < 0)
2943 return log_error_errno(r, "Failed to reread partition table: %m");
2944 }
2945
2946 e = udev_enumerate_unref(e);
2947 }
1b9e5b12
LP
2948
2949 first = udev_enumerate_get_list_entry(e);
2950 udev_list_entry_foreach(item, first) {
2951 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 2952 const char *node;
727fd4fd 2953 unsigned long long flags;
1b9e5b12
LP
2954 blkid_partition pp;
2955 dev_t qn;
2956 int nr;
2957
2958 errno = 0;
2959 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2960 if (!q) {
2961 if (!errno)
2962 errno = ENOMEM;
2963
56f64d95 2964 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
2965 return -errno;
2966 }
2967
2968 qn = udev_device_get_devnum(q);
2969 if (major(qn) == 0)
2970 continue;
2971
2972 if (st.st_rdev == qn)
2973 continue;
2974
2975 node = udev_device_get_devnode(q);
2976 if (!node)
2977 continue;
2978
2979 pp = blkid_partlist_devno_to_partition(pl, qn);
2980 if (!pp)
2981 continue;
2982
727fd4fd 2983 flags = blkid_partition_get_flags(pp);
727fd4fd 2984
1b9e5b12
LP
2985 nr = blkid_partition_get_partno(pp);
2986 if (nr < 0)
2987 continue;
2988
ada4799a
LP
2989 if (is_gpt) {
2990 sd_id128_t type_id;
2991 const char *stype;
1b9e5b12 2992
f6c51a81
LP
2993 if (flags & GPT_FLAG_NO_AUTO)
2994 continue;
2995
ada4799a
LP
2996 stype = blkid_partition_get_type_string(pp);
2997 if (!stype)
2998 continue;
1b9e5b12 2999
ada4799a 3000 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
3001 continue;
3002
ada4799a 3003 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 3004
ada4799a
LP
3005 if (home && nr >= home_nr)
3006 continue;
1b9e5b12 3007
ada4799a
LP
3008 home_nr = nr;
3009 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 3010
ada4799a
LP
3011 r = free_and_strdup(&home, node);
3012 if (r < 0)
3013 return log_oom();
727fd4fd 3014
ada4799a
LP
3015 } else if (sd_id128_equal(type_id, GPT_SRV)) {
3016
3017 if (srv && nr >= srv_nr)
3018 continue;
3019
3020 srv_nr = nr;
3021 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3022
3023 r = free_and_strdup(&srv, node);
3024 if (r < 0)
3025 return log_oom();
3026 }
1b9e5b12 3027#ifdef GPT_ROOT_NATIVE
ada4799a 3028 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 3029
ada4799a
LP
3030 if (root && nr >= root_nr)
3031 continue;
1b9e5b12 3032
ada4799a
LP
3033 root_nr = nr;
3034 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 3035
ada4799a
LP
3036 r = free_and_strdup(&root, node);
3037 if (r < 0)
3038 return log_oom();
3039 }
1b9e5b12
LP
3040#endif
3041#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
3042 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3043
3044 if (secondary_root && nr >= secondary_root_nr)
3045 continue;
3046
3047 secondary_root_nr = nr;
3048 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3049
3050 r = free_and_strdup(&secondary_root, node);
3051 if (r < 0)
3052 return log_oom();
3053 }
3054#endif
f6c51a81
LP
3055 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3056
3057 if (generic)
3058 multiple_generic = true;
3059 else {
3060 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3061
3062 r = free_and_strdup(&generic, node);
3063 if (r < 0)
3064 return log_oom();
3065 }
3066 }
ada4799a
LP
3067
3068 } else if (is_mbr) {
3069 int type;
1b9e5b12 3070
f6c51a81
LP
3071 if (flags != 0x80) /* Bootable flag */
3072 continue;
3073
ada4799a
LP
3074 type = blkid_partition_get_type(pp);
3075 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
3076 continue;
3077
f6c51a81
LP
3078 if (generic)
3079 multiple_generic = true;
3080 else {
3081 generic_rw = true;
727fd4fd 3082
f6c51a81
LP
3083 r = free_and_strdup(&root, node);
3084 if (r < 0)
3085 return log_oom();
3086 }
1b9e5b12 3087 }
1b9e5b12
LP
3088 }
3089
1b9e5b12
LP
3090 if (root) {
3091 *root_device = root;
3092 root = NULL;
727fd4fd
LP
3093
3094 *root_device_rw = root_rw;
1b9e5b12
LP
3095 *secondary = false;
3096 } else if (secondary_root) {
3097 *root_device = secondary_root;
3098 secondary_root = NULL;
727fd4fd
LP
3099
3100 *root_device_rw = secondary_root_rw;
1b9e5b12 3101 *secondary = true;
f6c51a81
LP
3102 } else if (generic) {
3103
3104 /* There were no partitions with precise meanings
3105 * around, but we found generic partitions. In this
3106 * case, if there's only one, we can go ahead and boot
3107 * it, otherwise we bail out, because we really cannot
3108 * make any sense of it. */
3109
3110 if (multiple_generic) {
3111 log_error("Identified multiple bootable Linux partitions on\n"
3112 " %s\n"
3113 PARTITION_TABLE_BLURB, arg_image);
3114 return -EINVAL;
3115 }
3116
3117 *root_device = generic;
3118 generic = NULL;
3119
3120 *root_device_rw = generic_rw;
3121 *secondary = false;
3122 } else {
3123 log_error("Failed to identify root partition in disk image\n"
3124 " %s\n"
3125 PARTITION_TABLE_BLURB, arg_image);
3126 return -EINVAL;
1b9e5b12
LP
3127 }
3128
3129 if (home) {
3130 *home_device = home;
3131 home = NULL;
727fd4fd
LP
3132
3133 *home_device_rw = home_rw;
1b9e5b12
LP
3134 }
3135
3136 if (srv) {
3137 *srv_device = srv;
3138 srv = NULL;
727fd4fd
LP
3139
3140 *srv_device_rw = srv_rw;
1b9e5b12
LP
3141 }
3142
3143 return 0;
3144#else
3145 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 3146 return -EOPNOTSUPP;
1b9e5b12
LP
3147#endif
3148}
3149
/* Probes the file system type of block device "what" with libblkid and
 * mounts it on "where" (with "directory" appended, if non-NULL).
 *
 * The mount is read-only when "rw" is false or when the global
 * arg_read_only is set. LUKS containers are refused.
 *
 * Returns 0 on success, a negative errno-style value on failure. Without
 * HAVE_BLKID the function always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally read-only container overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1 means nothing was detected, -2 means the result is
                 * ambivalent: in both cases there is no usable type. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Any other non-zero value (-1) is a genuine probing error */
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                r = errno > 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3213
727fd4fd
LP
3214static int mount_devices(
3215 const char *where,
3216 const char *root_device, bool root_device_rw,
3217 const char *home_device, bool home_device_rw,
3218 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
3219 int r;
3220
3221 assert(where);
3222
3223 if (root_device) {
727fd4fd 3224 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
3225 if (r < 0)
3226 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
3227 }
3228
3229 if (home_device) {
727fd4fd 3230 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
3231 if (r < 0)
3232 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
3233 }
3234
3235 if (srv_device) {
727fd4fd 3236 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
3237 if (r < 0)
3238 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
3239 }
3240
3241 return 0;
3242}
3243
3244static void loop_remove(int nr, int *image_fd) {
3245 _cleanup_close_ int control = -1;
e8c8ddcc 3246 int r;
1b9e5b12
LP
3247
3248 if (nr < 0)
3249 return;
3250
3251 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
3252 r = ioctl(*image_fd, LOOP_CLR_FD);
3253 if (r < 0)
5e4074aa 3254 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 3255 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
3256 }
3257
3258 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 3259 if (control < 0) {
56f64d95 3260 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 3261 return;
e8c8ddcc 3262 }
1b9e5b12 3263
e8c8ddcc
TG
3264 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3265 if (r < 0)
5e4074aa 3266 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
3267}
3268
0cb9fbcd
LP
3269static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3270 int pipe_fds[2];
3271 pid_t pid;
3272
3273 assert(database);
3274 assert(key);
3275 assert(rpid);
3276
4a62c710
MS
3277 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3278 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
3279
3280 pid = fork();
4a62c710
MS
3281 if (pid < 0)
3282 return log_error_errno(errno, "Failed to fork getent child: %m");
3283 else if (pid == 0) {
0cb9fbcd
LP
3284 int nullfd;
3285 char *empty_env = NULL;
3286
3287 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3288 _exit(EXIT_FAILURE);
3289
3290 if (pipe_fds[0] > 2)
03e334a1 3291 safe_close(pipe_fds[0]);
0cb9fbcd 3292 if (pipe_fds[1] > 2)
03e334a1 3293 safe_close(pipe_fds[1]);
0cb9fbcd
LP
3294
3295 nullfd = open("/dev/null", O_RDWR);
3296 if (nullfd < 0)
3297 _exit(EXIT_FAILURE);
3298
3299 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3300 _exit(EXIT_FAILURE);
3301
3302 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3303 _exit(EXIT_FAILURE);
3304
3305 if (nullfd > 2)
03e334a1 3306 safe_close(nullfd);
0cb9fbcd
LP
3307
3308 reset_all_signal_handlers();
3309 close_all_fds(NULL, 0);
3310
4de82926
MM
3311 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3312 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
3313 _exit(EXIT_FAILURE);
3314 }
3315
03e334a1 3316 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
3317
3318 *rpid = pid;
3319
3320 return pipe_fds[0];
3321}
3322
3323static int change_uid_gid(char **_home) {
a2a5291b
ZJS
3324 char line[LINE_MAX], *x, *u, *g, *h;
3325 const char *word, *state;
0cb9fbcd
LP
3326 _cleanup_free_ uid_t *uids = NULL;
3327 _cleanup_free_ char *home = NULL;
3328 _cleanup_fclose_ FILE *f = NULL;
3329 _cleanup_close_ int fd = -1;
3330 unsigned n_uids = 0;
70f539ca 3331 size_t sz = 0, l;
0cb9fbcd
LP
3332 uid_t uid;
3333 gid_t gid;
3334 pid_t pid;
3335 int r;
3336
3337 assert(_home);
3338
3339 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3340 /* Reset everything fully to 0, just in case */
3341
4a62c710
MS
3342 if (setgroups(0, NULL) < 0)
3343 return log_error_errno(errno, "setgroups() failed: %m");
0cb9fbcd 3344
4a62c710
MS
3345 if (setresgid(0, 0, 0) < 0)
3346 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3347
4a62c710
MS
3348 if (setresuid(0, 0, 0) < 0)
3349 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3350
3351 *_home = NULL;
3352 return 0;
3353 }
3354
3355 /* First, get user credentials */
3356 fd = spawn_getent("passwd", arg_user, &pid);
3357 if (fd < 0)
3358 return fd;
3359
3360 f = fdopen(fd, "r");
3361 if (!f)
3362 return log_oom();
3363 fd = -1;
3364
3365 if (!fgets(line, sizeof(line), f)) {
3366
3367 if (!ferror(f)) {
3368 log_error("Failed to resolve user %s.", arg_user);
3369 return -ESRCH;
3370 }
3371
56f64d95 3372 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3373 return -errno;
3374 }
3375
3376 truncate_nl(line);
3377
820d3acf 3378 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
3379
3380 x = strchr(line, ':');
3381 if (!x) {
3382 log_error("/etc/passwd entry has invalid user field.");
3383 return -EIO;
3384 }
3385
3386 u = strchr(x+1, ':');
3387 if (!u) {
3388 log_error("/etc/passwd entry has invalid password field.");
3389 return -EIO;
3390 }
3391
3392 u++;
3393 g = strchr(u, ':');
3394 if (!g) {
3395 log_error("/etc/passwd entry has invalid UID field.");
3396 return -EIO;
3397 }
3398
3399 *g = 0;
3400 g++;
3401 x = strchr(g, ':');
3402 if (!x) {
3403 log_error("/etc/passwd entry has invalid GID field.");
3404 return -EIO;
3405 }
3406
3407 *x = 0;
3408 h = strchr(x+1, ':');
3409 if (!h) {
3410 log_error("/etc/passwd entry has invalid GECOS field.");
3411 return -EIO;
3412 }
3413
3414 h++;
3415 x = strchr(h, ':');
3416 if (!x) {
3417 log_error("/etc/passwd entry has invalid home directory field.");
3418 return -EIO;
3419 }
3420
3421 *x = 0;
3422
3423 r = parse_uid(u, &uid);
3424 if (r < 0) {
3425 log_error("Failed to parse UID of user.");
3426 return -EIO;
3427 }
3428
3429 r = parse_gid(g, &gid);
3430 if (r < 0) {
3431 log_error("Failed to parse GID of user.");
3432 return -EIO;
3433 }
3434
3435 home = strdup(h);
3436 if (!home)
3437 return log_oom();
3438
3439 /* Second, get group memberships */
3440 fd = spawn_getent("initgroups", arg_user, &pid);
3441 if (fd < 0)
3442 return fd;
3443
3444 fclose(f);
3445 f = fdopen(fd, "r");
3446 if (!f)
3447 return log_oom();
3448 fd = -1;
3449
3450 if (!fgets(line, sizeof(line), f)) {
3451 if (!ferror(f)) {
3452 log_error("Failed to resolve user %s.", arg_user);
3453 return -ESRCH;
3454 }
3455
56f64d95 3456 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3457 return -errno;
3458 }
3459
3460 truncate_nl(line);
3461
820d3acf 3462 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
3463
3464 /* Skip over the username and subsequent separator whitespace */
3465 x = line;
3466 x += strcspn(x, WHITESPACE);
3467 x += strspn(x, WHITESPACE);
3468
a2a5291b 3469 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
3470 char c[l+1];
3471
a2a5291b 3472 memcpy(c, word, l);
0cb9fbcd
LP
3473 c[l] = 0;
3474
3475 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3476 return log_oom();
3477
3478 r = parse_uid(c, &uids[n_uids++]);
3479 if (r < 0) {
3480 log_error("Failed to parse group data from getent.");
3481 return -EIO;
3482 }
3483 }
3484
3485 r = mkdir_parents(home, 0775);
f647962d
MS
3486 if (r < 0)
3487 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
3488
3489 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
3490 if (r < 0 && r != -EEXIST)
3491 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd
LP
3492
3493 fchown(STDIN_FILENO, uid, gid);
3494 fchown(STDOUT_FILENO, uid, gid);
3495 fchown(STDERR_FILENO, uid, gid);
3496
4a62c710
MS
3497 if (setgroups(n_uids, uids) < 0)
3498 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 3499
4a62c710
MS
3500 if (setresgid(gid, gid, gid) < 0)
3501 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3502
4a62c710
MS
3503 if (setresuid(uid, uid, uid) < 0)
3504 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3505
3506 if (_home) {
3507 *_home = home;
3508 home = NULL;
3509 }
3510
3511 return 0;
3512}
3513
113cea80 3514/*
6d416b9c
LS
3515 * Return values:
3516 * < 0 : wait_for_terminate() failed to get the state of the
3517 * container, the container was terminated by a signal, or
3518 * failed for an unknown reason. No change is made to the
3519 * container argument.
3520 * > 0 : The program executed in the container terminated with an
3521 * error. The exit code of the program executed in the
919699ec
LP
3522 * container is returned. The container argument has been set
3523 * to CONTAINER_TERMINATED.
6d416b9c
LS
3524 * 0 : The container is being rebooted, has been shut down or exited
3525 * successfully. The container argument has been set to either
3526 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 3527 *
6d416b9c
LS
3528 * That is, success is indicated by a return value of zero, and an
3529 * error is indicated by a non-zero value.
113cea80
DH
3530 */
3531static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 3532 siginfo_t status;
919699ec 3533 int r;
113cea80
DH
3534
3535 r = wait_for_terminate(pid, &status);
f647962d
MS
3536 if (r < 0)
3537 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
3538
3539 switch (status.si_code) {
fddbb89c 3540
113cea80 3541 case CLD_EXITED:
919699ec
LP
3542 if (status.si_status == 0) {
3543 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 3544
fddbb89c 3545 } else
919699ec 3546 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 3547
919699ec
LP
3548 *container = CONTAINER_TERMINATED;
3549 return status.si_status;
113cea80
DH
3550
3551 case CLD_KILLED:
3552 if (status.si_status == SIGINT) {
113cea80 3553
919699ec 3554 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 3555 *container = CONTAINER_TERMINATED;
919699ec
LP
3556 return 0;
3557
113cea80 3558 } else if (status.si_status == SIGHUP) {
113cea80 3559
919699ec 3560 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 3561 *container = CONTAINER_REBOOTED;
919699ec 3562 return 0;
113cea80 3563 }
919699ec 3564
113cea80
DH
3565 /* CLD_KILLED fallthrough */
3566
3567 case CLD_DUMPED:
fddbb89c 3568 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 3569 return -EIO;
113cea80
DH
3570
3571 default:
fddbb89c 3572 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 3573 return -EIO;
113cea80
DH
3574 }
3575
3576 return r;
3577}
3578
e866af3a
DH
/* Deliberately empty signal handler. main() installs it for SIGCHLD so that
 * the signal interrupts the parent's blocking calls instead of being
 * discarded. */
static void nop_handler(int sig) {}
3580
023fb90b
LP
3581static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3582 pid_t pid;
3583
3584 pid = PTR_TO_UINT32(userdata);
3585 if (pid > 0) {
c6c8f6e2 3586 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
3587 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3588 sd_event_source_set_userdata(s, NULL);
3589 return 0;
3590 }
3591 }
3592
3593 sd_event_exit(sd_event_source_get_event(s), 0);
3594 return 0;
3595}
3596
ec16945e 3597static int determine_names(void) {
1b9cebf6 3598 int r;
ec16945e
LP
3599
3600 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3601 if (arg_machine) {
3602 _cleanup_(image_unrefp) Image *i = NULL;
3603
3604 r = image_find(arg_machine, &i);
3605 if (r < 0)
3606 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3607 else if (r == 0) {
3608 log_error("No image for machine '%s': %m", arg_machine);
3609 return -ENOENT;
3610 }
3611
aceac2f0 3612 if (i->type == IMAGE_RAW)
1b9cebf6
LP
3613 r = set_sanitized_path(&arg_image, i->path);
3614 else
3615 r = set_sanitized_path(&arg_directory, i->path);
3616 if (r < 0)
3617 return log_error_errno(r, "Invalid image directory: %m");
3618
aee327b8
LP
3619 if (!arg_ephemeral)
3620 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 3621 } else
ec16945e
LP
3622 arg_directory = get_current_dir_name();
3623
1b9cebf6
LP
3624 if (!arg_directory && !arg_machine) {
3625 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
3626 return -EINVAL;
3627 }
3628 }
3629
3630 if (!arg_machine) {
b9ba4dab
LP
3631 if (arg_directory && path_equal(arg_directory, "/"))
3632 arg_machine = gethostname_malloc();
3633 else
3634 arg_machine = strdup(basename(arg_image ?: arg_directory));
3635
ec16945e
LP
3636 if (!arg_machine)
3637 return log_oom();
3638
3639 hostname_cleanup(arg_machine, false);
3640 if (!machine_name_is_valid(arg_machine)) {
3641 log_error("Failed to determine machine name automatically, please use -M.");
3642 return -EINVAL;
3643 }
b9ba4dab
LP
3644
3645 if (arg_ephemeral) {
3646 char *b;
3647
3648 /* Add a random suffix when this is an
3649 * ephemeral machine, so that we can run many
3650 * instances at once without manually having
3651 * to specify -M each time. */
3652
3653 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3654 return log_oom();
3655
3656 free(arg_machine);
3657 arg_machine = b;
3658 }
ec16945e
LP
3659 }
3660
3661 return 0;
3662}
3663
6dac160c
LP
3664static int determine_uid_shift(void) {
3665 int r;
3666
3667 if (!arg_userns)
3668 return 0;
3669
3670 if (arg_uid_shift == UID_INVALID) {
3671 struct stat st;
3672
3673 r = stat(arg_directory, &st);
3674 if (r < 0)
3675 return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
3676
3677 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3678
3679 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3680 log_error("UID and GID base of %s don't match.", arg_directory);
3681 return -EINVAL;
3682 }
3683
3684 arg_uid_range = UINT32_C(0x10000);
3685 }
3686
3687 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3688 log_error("UID base too high for UID range.");
3689 return -EINVAL;
3690 }
3691
3692 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3693 return 0;
3694}
3695
88213476 3696int main(int argc, char *argv[]) {
69c79d3c 3697
611b312b 3698 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
727fd4fd 3699 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
63cc4c31 3700 _cleanup_close_ int master = -1, image_fd = -1;
69c79d3c 3701 _cleanup_fdset_free_ FDSet *fds = NULL;
ec16945e 3702 int r, n_fd_passed, loop_nr = -1;
1b9e5b12 3703 char veth_name[IFNAMSIZ];
ec16945e 3704 bool secondary = false, remove_subvol = false;
e866af3a 3705 sigset_t mask, mask_chld;
69c79d3c 3706 pid_t pid = 0;
ec16945e 3707 int ret = EXIT_SUCCESS;
6d0b55c2 3708 union in_addr_union exposed = {};
30535c16 3709 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
9c857b9d 3710 bool interactive;
88213476
LP
3711
3712 log_parse_environment();
3713 log_open();
3714
ec16945e
LP
3715 r = parse_argv(argc, argv);
3716 if (r <= 0)
88213476 3717 goto finish;
88213476 3718
ec16945e
LP
3719 r = determine_names();
3720 if (r < 0)
3721 goto finish;
7027ff61 3722
88213476
LP
3723 if (geteuid() != 0) {
3724 log_error("Need to be root.");
ec16945e 3725 r = -EPERM;
88213476
LP
3726 goto finish;
3727 }
3728
1b9e5b12
LP
3729 log_close();
3730 n_fd_passed = sd_listen_fds(false);
3731 if (n_fd_passed > 0) {
ec16945e
LP
3732 r = fdset_new_listen_fds(&fds, false);
3733 if (r < 0) {
3734 log_error_errno(r, "Failed to collect file descriptors: %m");
1b9e5b12
LP
3735 goto finish;
3736 }
88213476 3737 }
1b9e5b12
LP
3738 fdset_close_others(fds);
3739 log_open();
88213476 3740
1b9e5b12 3741 if (arg_directory) {
ec16945e
LP
3742 assert(!arg_image);
3743
c4e34a61
LP
3744 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3745 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
ec16945e 3746 r = -EINVAL;
6b9132a9
LP
3747 goto finish;
3748 }
1b9e5b12 3749
30535c16 3750 if (arg_ephemeral) {
8a16a7b4 3751 _cleanup_free_ char *np = NULL;
ec16945e 3752
c4e34a61
LP
3753 /* If the specified path is a mount point we
3754 * generate the new snapshot immediately
3755 * inside it under a random name. However if
3756 * the specified is not a mount point we
3757 * create the new snapshot in the parent
3758 * directory, just next to it. */
3759 r = path_is_mount_point(arg_directory, false);
3760 if (r < 0) {
3761 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3762 goto finish;
3763 }
3764 if (r > 0)
3765 r = tempfn_random_child(arg_directory, &np);
3766 else
3767 r = tempfn_random(arg_directory, &np);
ec16945e
LP
3768 if (r < 0) {
3769 log_error_errno(r, "Failed to generate name for snapshot: %m");
3770 goto finish;
3771 }
3772
30535c16
LP
3773 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3774 if (r < 0) {
3775 log_error_errno(r, "Failed to lock %s: %m", np);
3776 goto finish;
3777 }
3778
f70a17f8 3779 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
ec16945e 3780 if (r < 0) {
ec16945e
LP
3781 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3782 goto finish;
3783 }
3784
3785 free(arg_directory);
3786 arg_directory = np;
8a16a7b4 3787 np = NULL;
ec16945e
LP
3788
3789 remove_subvol = true;
30535c16
LP
3790
3791 } else {
3792 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3793 if (r == -EBUSY) {
3794 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3795 goto finish;
3796 }
3797 if (r < 0) {
3798 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3799 return r;
3800 }
3801
3802 if (arg_template) {
f70a17f8 3803 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
30535c16
LP
3804 if (r == -EEXIST) {
3805 if (!arg_quiet)
3806 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3807 } else if (r < 0) {
83521414 3808 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3809 goto finish;
3810 } else {
3811 if (!arg_quiet)
3812 log_info("Populated %s from template %s.", arg_directory, arg_template);
3813 }
3814 }
ec16945e
LP
3815 }
3816
1b9e5b12
LP
3817 if (arg_boot) {
3818 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3819 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3820 r = -EINVAL;
1b9e5b12
LP
3821 goto finish;
3822 }
3823 } else {
3824 const char *p;
3825
63c372cb 3826 p = strjoina(arg_directory,
1b9e5b12
LP
3827 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3828 if (access(p, F_OK) < 0) {
3829 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
ec16945e 3830 r = -EINVAL;
1b9e5b12 3831 goto finish;
1b9e5b12
LP
3832 }
3833 }
ec16945e 3834
6b9132a9 3835 } else {
1b9e5b12 3836 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3837
ec16945e
LP
3838 assert(arg_image);
3839 assert(!arg_template);
3840
30535c16
LP
3841 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3842 if (r == -EBUSY) {
3843 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3844 goto finish;
3845 }
3846 if (r < 0) {
3847 r = log_error_errno(r, "Failed to create image lock: %m");
3848 goto finish;
3849 }
3850
1b9e5b12 3851 if (!mkdtemp(template)) {
56f64d95 3852 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3853 r = -errno;
6b9132a9 3854 goto finish;
1b9e5b12 3855 }
6b9132a9 3856
1b9e5b12
LP
3857 arg_directory = strdup(template);
3858 if (!arg_directory) {
3859 r = log_oom();
3860 goto finish;
6b9132a9 3861 }
88213476 3862
1b9e5b12
LP
3863 image_fd = setup_image(&device_path, &loop_nr);
3864 if (image_fd < 0) {
3865 r = image_fd;
842f3b0f
LP
3866 goto finish;
3867 }
1b9e5b12 3868
4d9f07b4
LP
3869 r = dissect_image(image_fd,
3870 &root_device, &root_device_rw,
3871 &home_device, &home_device_rw,
3872 &srv_device, &srv_device_rw,
3873 &secondary);
1b9e5b12
LP
3874 if (r < 0)
3875 goto finish;
842f3b0f 3876 }
842f3b0f 3877
6dac160c
LP
3878 r = determine_uid_shift();
3879 if (r < 0)
3880 goto finish;
3881
9c857b9d
LP
3882 interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
3883
db7feb7e
LP
3884 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3885 if (master < 0) {
ec16945e 3886 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3887 goto finish;
3888 }
3889
611b312b
LP
3890 r = ptsname_malloc(master, &console);
3891 if (r < 0) {
3892 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3893 goto finish;
3894 }
3895
a258bf26 3896 if (unlockpt(master) < 0) {
ec16945e 3897 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3898 goto finish;
3899 }
3900
9c857b9d
LP
3901 if (!arg_quiet)
3902 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3903 arg_machine, arg_image ?: arg_directory);
3904
a258bf26
LP
3905 assert_se(sigemptyset(&mask) == 0);
3906 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3907 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3908
023fb90b
LP
3909 assert_se(sigemptyset(&mask_chld) == 0);
3910 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3911
d87be9b0 3912 for (;;) {
6d0b55c2 3913 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
113cea80 3914 ContainerStatus container_status;
7566e267 3915 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
e866af3a
DH
3916 struct sigaction sa = {
3917 .sa_handler = nop_handler,
3918 .sa_flags = SA_NOCLDSTOP,
3919 };
3920
7566e267 3921 r = barrier_create(&barrier);
a2da110b 3922 if (r < 0) {
da927ba9 3923 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3924 goto finish;
3925 }
3926
6d0b55c2
LP
3927 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3928 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3929 goto finish;
3930 }
3931
3932 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3933 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3934 goto finish;
3935 }
3936
e866af3a
DH
3937 /* Child can be killed before execv(), so handle SIGCHLD
3938 * in order to interrupt parent's blocking calls and
3939 * give it a chance to call wait() and terminate. */
3940 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3941 if (r < 0) {
ec16945e 3942 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3943 goto finish;
3944 }
3945
e866af3a
DH
3946 r = sigaction(SIGCHLD, &sa, NULL);
3947 if (r < 0) {
ec16945e 3948 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3949 goto finish;
3950 }
3951
60e1651a
KW
3952 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3953 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3954 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
3955 if (pid < 0) {
3956 if (errno == EINVAL)
ec16945e 3957 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3958 else
ec16945e 3959 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3960
d87be9b0
LP
3961 goto finish;
3962 }
a258bf26 3963
d87be9b0
LP
3964 if (pid == 0) {
3965 /* child */
0cb9fbcd 3966 _cleanup_free_ char *home = NULL;
5674767e 3967 unsigned n_env = 2;
d87be9b0 3968 const char *envp[] = {
e10a55fd 3969 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
3970 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3971 NULL, /* TERM */
3972 NULL, /* HOME */
3973 NULL, /* USER */
3974 NULL, /* LOGNAME */
3975 NULL, /* container_uuid */
842f3b0f
LP
3976 NULL, /* LISTEN_FDS */
3977 NULL, /* LISTEN_PID */
d87be9b0
LP
3978 NULL
3979 };
f4889f65 3980 char **env_use;
a258bf26 3981
a2da110b
DH
3982 barrier_set_role(&barrier, BARRIER_CHILD);
3983
5674767e
ZJS
3984 envp[n_env] = strv_find_prefix(environ, "TERM=");
3985 if (envp[n_env])
3986 n_env ++;
a258bf26 3987
03e334a1 3988 master = safe_close(master);
a258bf26 3989
03e334a1 3990 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3991 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
a258bf26 3992
d87be9b0 3993 reset_all_signal_handlers();
1b6d7fa7 3994 reset_signal_mask();
f5c1b9ee 3995
9c857b9d
LP
3996 if (interactive) {
3997 close_nointr(STDIN_FILENO);
3998 close_nointr(STDOUT_FILENO);
3999 close_nointr(STDERR_FILENO);
842f3b0f 4000
9c857b9d
LP
4001 r = open_terminal(console, O_RDWR);
4002 if (r != STDIN_FILENO) {
4003 if (r >= 0) {
4004 safe_close(r);
4005 r = -EINVAL;
4006 }
842f3b0f 4007
9c857b9d
LP
4008 log_error_errno(r, "Failed to open console: %m");
4009 _exit(EXIT_FAILURE);
4010 }
4011
4012 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4013 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
4014 log_error_errno(errno, "Failed to duplicate console: %m");
4015 _exit(EXIT_FAILURE);
4016 }
842f3b0f 4017 }
bc2f673e 4018
d87be9b0 4019 if (setsid() < 0) {
56f64d95 4020 log_error_errno(errno, "setsid() failed: %m");
a2da110b 4021 _exit(EXIT_FAILURE);
bc2f673e
LP
4022 }
4023
db999e0f 4024 if (reset_audit_loginuid() < 0)
a2da110b 4025 _exit(EXIT_FAILURE);
db999e0f 4026
d87be9b0 4027 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
56f64d95 4028 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
a2da110b 4029 _exit(EXIT_FAILURE);
d87be9b0 4030 }
e58a1277 4031
6dac160c
LP
4032 if (arg_private_network)
4033 loopback_setup();
4034
d87be9b0
LP
4035 /* Mark everything as slave, so that we still
4036 * receive mounts from the real root, but don't
4037 * propagate mounts to the real root. */
4038 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
56f64d95 4039 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
a2da110b 4040 _exit(EXIT_FAILURE);
d87be9b0 4041 }
04bc4a3f 4042
727fd4fd
LP
4043 if (mount_devices(arg_directory,
4044 root_device, root_device_rw,
4045 home_device, home_device_rw,
4046 srv_device, srv_device_rw) < 0)
a2da110b 4047 _exit(EXIT_FAILURE);
1b9e5b12 4048
d87be9b0 4049 /* Turn directory into bind mount */
4543768d 4050 if (mount(arg_directory, arg_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
56f64d95 4051 log_error_errno(errno, "Failed to make bind mount: %m");
a2da110b 4052 _exit(EXIT_FAILURE);
d87be9b0 4053 }
88213476 4054
4d9f07b4
LP
4055 r = setup_volatile(arg_directory);
4056 if (r < 0)
a2da110b 4057 _exit(EXIT_FAILURE);
4d9f07b4
LP
4058
4059 if (setup_volatile_state(arg_directory) < 0)
a2da110b 4060 _exit(EXIT_FAILURE);
4d9f07b4
LP
4061
4062 r = base_filesystem_create(arg_directory);
4063 if (r < 0)
a2da110b 4064 _exit(EXIT_FAILURE);
4d9f07b4 4065
d6797c92 4066 if (arg_read_only) {
ec16945e
LP
4067 r = bind_remount_recursive(arg_directory, true);
4068 if (r < 0) {
4069 log_error_errno(r, "Failed to make tree read-only: %m");
a2da110b 4070 _exit(EXIT_FAILURE);
d87be9b0 4071 }
d6797c92 4072 }
2547bb41 4073
d87be9b0 4074 if (mount_all(arg_directory) < 0)
a2da110b 4075 _exit(EXIT_FAILURE);
57fb9fb5 4076
d87be9b0 4077 if (copy_devnodes(arg_directory) < 0)
a2da110b 4078 _exit(EXIT_FAILURE);
a258bf26 4079
f2d88580 4080 if (setup_ptmx(arg_directory) < 0)
a2da110b 4081 _exit(EXIT_FAILURE);
f2d88580 4082
d87be9b0 4083 dev_setup(arg_directory);
88213476 4084
785890ac
LP
4085 if (setup_propagate(arg_directory) < 0)
4086 _exit(EXIT_FAILURE);
4087
28650077 4088 if (setup_seccomp() < 0)
a2da110b 4089 _exit(EXIT_FAILURE);
24fb1112 4090
d87be9b0 4091 if (setup_dev_console(arg_directory, console) < 0)
a2da110b 4092 _exit(EXIT_FAILURE);
88213476 4093
d87be9b0 4094 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
a2da110b 4095 _exit(EXIT_FAILURE);
03e334a1 4096 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 4097
6d0b55c2
LP
4098 if (send_rtnl(rtnl_socket_pair[1]) < 0)
4099 _exit(EXIT_FAILURE);
4100 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4101
b12afc8c
LP
4102 /* Tell the parent that we are ready, and that
4103 * it can cgroupify us so that we lack access
4104 * to certain devices and resources. */
6dac160c 4105 (void) barrier_place(&barrier); /* #1 */
b12afc8c 4106
d87be9b0 4107 if (setup_boot_id(arg_directory) < 0)
a2da110b 4108 _exit(EXIT_FAILURE);
a41fe3a2 4109
d87be9b0 4110 if (setup_timezone(arg_directory) < 0)
a2da110b 4111 _exit(EXIT_FAILURE);
88213476 4112
d87be9b0 4113 if (setup_resolv_conf(arg_directory) < 0)
a2da110b 4114 _exit(EXIT_FAILURE);
687d0825 4115
d87be9b0 4116 if (setup_journal(arg_directory) < 0)
a2da110b 4117 _exit(EXIT_FAILURE);
687d0825 4118
d6797c92 4119 if (mount_binds(arg_directory, arg_bind, false) < 0)
a2da110b 4120 _exit(EXIT_FAILURE);
17fe0523 4121
d6797c92 4122 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
a2da110b 4123 _exit(EXIT_FAILURE);
17fe0523 4124
06c17c39 4125 if (mount_tmpfs(arg_directory) < 0)
a2da110b 4126 _exit(EXIT_FAILURE);
06c17c39 4127
b12afc8c
LP
4128 /* Wait until we are cgroup-ified, so that we
4129 * can mount the right cgroup path writable */
6dac160c 4130 (void) barrier_place_and_sync(&barrier); /* #2 */
b12afc8c
LP
4131
4132 if (mount_cgroup(arg_directory) < 0)
4133 _exit(EXIT_FAILURE);
d96c1ecf 4134
d87be9b0 4135 if (chdir(arg_directory) < 0) {
56f64d95 4136 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
a2da110b 4137 _exit(EXIT_FAILURE);
687d0825
MV
4138 }
4139
d87be9b0 4140 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
56f64d95 4141 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
a2da110b 4142 _exit(EXIT_FAILURE);
687d0825
MV
4143 }
4144
d87be9b0 4145 if (chroot(".") < 0) {
56f64d95 4146 log_error_errno(errno, "chroot() failed: %m");
a2da110b 4147 _exit(EXIT_FAILURE);
687d0825
MV
4148 }
4149
d87be9b0 4150 if (chdir("/") < 0) {
56f64d95 4151 log_error_errno(errno, "chdir() failed: %m");
a2da110b 4152 _exit(EXIT_FAILURE);
687d0825
MV
4153 }
4154
6dac160c
LP
4155 if (arg_userns) {
4156 if (unshare(CLONE_NEWUSER) < 0) {
4157 log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4158 _exit(EXIT_FAILURE);
4159 }
d87be9b0 4160
6dac160c
LP
4161 /* Tell the parent, that it now can
4162 * write the UID map. */
4163 (void) barrier_place(&barrier); /* #3 */
4164
4165 /* Wait until the parent wrote the UID
4166 * map */
4167 (void) barrier_place_and_sync(&barrier); /* #4 */
4168 }
4169
4170 umask(0022);
d87be9b0
LP
4171
4172 if (drop_capabilities() < 0) {
56f64d95 4173 log_error_errno(errno, "drop_capabilities() failed: %m");
a2da110b 4174 _exit(EXIT_FAILURE);
687d0825 4175 }
687d0825 4176
6dac160c
LP
4177 setup_hostname();
4178
4179 if (arg_personality != 0xffffffffLU) {
4180 if (personality(arg_personality) < 0) {
4181 log_error_errno(errno, "personality() failed: %m");
4182 _exit(EXIT_FAILURE);
4183 }
4184 } else if (secondary) {
4185 if (personality(PER_LINUX32) < 0) {
4186 log_error_errno(errno, "personality() failed: %m");
4187 _exit(EXIT_FAILURE);
4188 }
4189 }
4190
4191#ifdef HAVE_SELINUX
4192 if (arg_selinux_context)
4193 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4194 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4195 _exit(EXIT_FAILURE);
4196 }
4197#endif
4198
0cb9fbcd
LP
4199 r = change_uid_gid(&home);
4200 if (r < 0)
a2da110b 4201 _exit(EXIT_FAILURE);
d87be9b0 4202
842f3b0f
LP
4203 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4204 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4205 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 4206 log_oom();
a2da110b 4207 _exit(EXIT_FAILURE);
144f0fc0 4208 }
687d0825 4209
9444b1f2 4210 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
4211 char as_uuid[37];
4212
4213 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f 4214 log_oom();
a2da110b 4215 _exit(EXIT_FAILURE);
842f3b0f
LP
4216 }
4217 }
4218
4219 if (fdset_size(fds) > 0) {
ec16945e
LP
4220 r = fdset_cloexec(fds, false);
4221 if (r < 0) {
4222 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
a2da110b 4223 _exit(EXIT_FAILURE);
842f3b0f
LP
4224 }
4225
4226 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 4227 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0 4228 log_oom();
a2da110b 4229 _exit(EXIT_FAILURE);
d87be9b0
LP
4230 }
4231 }
4232
f4889f65
LP
4233 if (!strv_isempty(arg_setenv)) {
4234 char **n;
4235
4236 n = strv_env_merge(2, envp, arg_setenv);
4237 if (!n) {
4238 log_oom();
a2da110b 4239 _exit(EXIT_FAILURE);
f4889f65
LP
4240 }
4241
4242 env_use = n;
4243 } else
4244 env_use = (char**) envp;
4245
6dac160c
LP
4246 /* Let the parent know that we are ready and
4247 * wait until the parent is ready with the
4248 * setup, too... */
4249 (void) barrier_place_and_sync(&barrier); /* #5 */
d96c1ecf 4250
d87be9b0
LP
4251 if (arg_boot) {
4252 char **a;
4253 size_t l;
88213476 4254
d87be9b0 4255 /* Automatically search for the init system */
0f0dbc46 4256
d87be9b0
LP
4257 l = 1 + argc - optind;
4258 a = newa(char*, l + 1);
4259 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 4260
d87be9b0 4261 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 4262 execve(a[0], a, env_use);
0f0dbc46 4263
d87be9b0 4264 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 4265 execve(a[0], a, env_use);
0f0dbc46 4266
d87be9b0 4267 a[0] = (char*) "/sbin/init";
f4889f65 4268 execve(a[0], a, env_use);
d87be9b0 4269 } else if (argc > optind)
f4889f65 4270 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
4271 else {
4272 chdir(home ? home : "/root");
f4889f65 4273 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 4274 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
4275 }
4276
56f64d95 4277 log_error_errno(errno, "execv() failed: %m");
d87be9b0 4278 _exit(EXIT_FAILURE);
da5b3bad 4279 }
88213476 4280
a2da110b 4281 barrier_set_role(&barrier, BARRIER_PARENT);
842f3b0f
LP
4282 fdset_free(fds);
4283 fds = NULL;
4284
6d0b55c2
LP
4285 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4286 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4287
6dac160c
LP
4288 (void) barrier_place(&barrier); /* #1 */
4289
b12afc8c
LP
4290 /* Wait for the most basic Child-setup to be done,
4291 * before we add hardware to it, and place it in a
4292 * cgroup. */
6dac160c 4293 if (barrier_sync(&barrier)) { /* #1 */
5aa4bb6b 4294 int ifi = 0;
354bfd2b 4295
840295fc
LP
4296 r = move_network_interfaces(pid);
4297 if (r < 0)
4298 goto finish;
aa28aefe 4299
5aa4bb6b 4300 r = setup_veth(pid, veth_name, &ifi);
840295fc
LP
4301 if (r < 0)
4302 goto finish;
ab046dde 4303
5aa4bb6b 4304 r = setup_bridge(veth_name, &ifi);
840295fc
LP
4305 if (r < 0)
4306 goto finish;
ab046dde 4307
840295fc
LP
4308 r = setup_macvlan(pid);
4309 if (r < 0)
4310 goto finish;
c74e630d 4311
4bbfe7ad
TG
4312 r = setup_ipvlan(pid);
4313 if (r < 0)
4314 goto finish;
4315
5aa4bb6b
LP
4316 r = register_machine(pid, ifi);
4317 if (r < 0)
4318 goto finish;
4319
6dac160c
LP
4320 /* Notify the child that the parent is ready with all
4321 * its setup, and that the child can now hand over
4322 * control to the code to run inside the container. */
4323 (void) barrier_place(&barrier); /* #2 */
4324
4325 if (arg_userns) {
4326 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4327
4328 (void) barrier_place_and_sync(&barrier); /* #3 */
4329
4330 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4331 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4332 r = write_string_file(uid_map, line);
4333 if (r < 0) {
4334 log_error_errno(r, "Failed to write UID map: %m");
4335 goto finish;
4336 }
4337
4338 /* We always assign the same UID and GID ranges */
4339 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4340 r = write_string_file(uid_map, line);
4341 if (r < 0) {
4342 log_error_errno(r, "Failed to write GID map: %m");
4343 goto finish;
4344 }
4345
4346 (void) barrier_place(&barrier); /* #4 */
4347 }
4348
840295fc
LP
4349 /* Block SIGCHLD here, before notifying child.
4350 * process_pty() will handle it with the other signals. */
4351 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4352 if (r < 0)
4353 goto finish;
e866af3a 4354
840295fc
LP
4355 /* Reset signal to default */
4356 r = default_signals(SIGCHLD, -1);
4357 if (r < 0)
4358 goto finish;
e866af3a 4359
6dac160c
LP
4360 /* Let the child know that we are ready and wait that the child is completely ready now. */
4361 if (barrier_place_and_sync(&barrier)) { /* #5 */
6d0b55c2
LP
4362 _cleanup_event_unref_ sd_event *event = NULL;
4363 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4364 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4365 char last_char = 0;
b12afc8c 4366
733d15ac
LP
4367 sd_notifyf(false,
4368 "READY=1\n"
4369 "STATUS=Container running.\n"
4370 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 4371
6d0b55c2
LP
4372 r = sd_event_new(&event);
4373 if (r < 0) {
4374 log_error_errno(r, "Failed to get default event source: %m");
4375 goto finish;
4376 }
88213476 4377
c6c8f6e2 4378 if (arg_kill_signal > 0) {
6d0b55c2
LP
4379 /* Try to kill the init system on SIGINT or SIGTERM */
4380 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4381 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4382 } else {
4383 /* Immediately exit */
4384 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4385 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4386 }
023fb90b 4387
6d0b55c2
LP
4388 /* simply exit on sigchld */
4389 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 4390
6d0b55c2
LP
4391 if (arg_expose_ports) {
4392 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4393 if (r < 0)
4394 goto finish;
023fb90b 4395
6d0b55c2
LP
4396 (void) expose_ports(rtnl, &exposed);
4397 }
023fb90b 4398
6d0b55c2 4399 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 4400
9c857b9d 4401 r = pty_forward_new(event, master, true, !interactive, &forward);
6d0b55c2
LP
4402 if (r < 0) {
4403 log_error_errno(r, "Failed to create PTY forwarder: %m");
4404 goto finish;
4405 }
023fb90b 4406
6d0b55c2
LP
4407 r = sd_event_loop(event);
4408 if (r < 0) {
4409 log_error_errno(r, "Failed to run event loop: %m");
4410 goto finish;
4411 }
4412
4413 pty_forward_get_last_char(forward, &last_char);
4414
4415 forward = pty_forward_free(forward);
4416
4417 if (!arg_quiet && last_char != '\n')
4418 putc('\n', stdout);
04d39279 4419
6d0b55c2
LP
4420 /* Kill if it is not dead yet anyway */
4421 terminate_machine(pid);
4422 }
840295fc 4423 }
1f0cd86b 4424
840295fc 4425 /* Normally redundant, but better safe than sorry */
04d39279 4426 kill(pid, SIGKILL);
a258bf26 4427
113cea80 4428 r = wait_for_container(pid, &container_status);
04d39279
LP
4429 pid = 0;
4430
ec16945e 4431 if (r < 0)
ce9f1527
LP
4432 /* We failed to wait for the container, or the
4433 * container exited abnormally */
ec16945e
LP
4434 goto finish;
4435 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
4436 /* The container exited with a non-zero
4437 * status, or with zero status and no reboot
4438 * was requested. */
ec16945e 4439 ret = r;
d87be9b0 4440 break;
ec16945e 4441 }
88213476 4442
113cea80 4443 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
4444
4445 if (arg_keep_unit) {
4446 /* Special handling if we are running as a
4447 * service: instead of simply restarting the
4448 * machine we want to restart the entire
4449 * service, so let's inform systemd about this
4450 * with the special exit code 133. The service
4451 * file uses RestartForceExitStatus=133 so
4452 * that this results in a full nspawn
4453 * restart. This is necessary since we might
4454 * have cgroup parameters set we want to have
4455 * flushed out. */
ec16945e
LP
4456 ret = 133;
4457 r = 0;
ce38dbc8
LP
4458 break;
4459 }
6d0b55c2
LP
4460
4461 flush_ports(&exposed);
d87be9b0 4462 }
88213476
LP
4463
4464finish:
af4ec430
LP
4465 sd_notify(false,
4466 "STOPPING=1\n"
4467 "STATUS=Terminating...");
4468
1b9e5b12
LP
4469 loop_remove(loop_nr, &image_fd);
4470
9444b1f2
LP
4471 if (pid > 0)
4472 kill(pid, SIGKILL);
88213476 4473
ec16945e
LP
4474 if (remove_subvol && arg_directory) {
4475 int k;
4476
d9e2daaf 4477 k = btrfs_subvol_remove(arg_directory, true);
ec16945e
LP
4478 if (k < 0)
4479 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4480 }
4481
785890ac
LP
4482 if (arg_machine) {
4483 const char *p;
4484
63c372cb 4485 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4486 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4487 }
4488
04d391da 4489 free(arg_directory);
ec16945e
LP
4490 free(arg_template);
4491 free(arg_image);
7027ff61 4492 free(arg_machine);
c74e630d
LP
4493 free(arg_user);
4494 strv_free(arg_setenv);
4495 strv_free(arg_network_interfaces);
4496 strv_free(arg_network_macvlan);
4bbfe7ad 4497 strv_free(arg_network_ipvlan);
c74e630d
LP
4498 strv_free(arg_bind);
4499 strv_free(arg_bind_ro);
06c17c39 4500 strv_free(arg_tmpfs);
88213476 4501
6d0b55c2
LP
4502 flush_ports(&exposed);
4503
4504 while (arg_expose_ports) {
4505 ExposePort *p = arg_expose_ports;
4506 LIST_REMOVE(ports, arg_expose_ports, p);
4507 free(p);
4508 }
4509
ec16945e 4510 return r < 0 ? EXIT_FAILURE : ret;
88213476 4511}