]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: split out seccomp call into nspawn-seccomp.[ch]
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
88213476 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
88213476
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
8fe0087e
LP
20#ifdef HAVE_BLKID
21#include <blkid/blkid.h>
22#endif
88213476 23#include <errno.h>
88213476 24#include <getopt.h>
0e7ac751 25#include <grp.h>
1b9e5b12 26#include <linux/loop.h>
0e7ac751 27#include <pwd.h>
8fe0087e 28#include <sched.h>
8fe0087e
LP
29#ifdef HAVE_SELINUX
30#include <selinux/selinux.h>
1b9e5b12 31#endif
8fe0087e
LP
32#include <signal.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <sys/file.h>
37#include <sys/mount.h>
38#include <sys/personality.h>
39#include <sys/prctl.h>
40#include <sys/types.h>
41#include <unistd.h>
1b9e5b12 42
1f0cd86b 43#include "sd-daemon.h"
1f0cd86b 44#include "sd-id128.h"
8fe0087e 45
b5efdb8a 46#include "alloc-util.h"
8fe0087e
LP
47#include "barrier.h"
48#include "base-filesystem.h"
49#include "blkid-util.h"
50#include "btrfs-util.h"
8fe0087e 51#include "cap-list.h"
430f0182 52#include "capability-util.h"
04d391da 53#include "cgroup-util.h"
8fe0087e 54#include "copy.h"
4fc9982c 55#include "dev-setup.h"
8fe0087e 56#include "env-util.h"
3ffd4af2 57#include "fd-util.h"
842f3b0f 58#include "fdset.h"
a5c32cff 59#include "fileio.h"
8fe0087e 60#include "formats-util.h"
f4f15635 61#include "fs-util.h"
1b9e5b12 62#include "gpt.h"
8fe0087e
LP
63#include "hostname-util.h"
64#include "log.h"
65#include "loopback-setup.h"
e01ff70a 66#include "machine-id-setup.h"
1b9cebf6 67#include "machine-image.h"
8fe0087e
LP
68#include "macro.h"
69#include "missing.h"
70#include "mkdir.h"
4349cd7c 71#include "mount-util.h"
8fe0087e 72#include "netlink-util.h"
07630cea
LP
73#include "nspawn-cgroup.h"
74#include "nspawn-expose-ports.h"
75#include "nspawn-mount.h"
76#include "nspawn-network.h"
7336138e 77#include "nspawn-patch-uid.h"
07630cea
LP
78#include "nspawn-register.h"
79#include "nspawn-settings.h"
80#include "nspawn-setuid.h"
7732f92b 81#include "nspawn-stub-pid1.h"
f011b0b8 82#include "nspawn-seccomp.h"
6bedfcbb 83#include "parse-util.h"
8fe0087e 84#include "path-util.h"
0b452006 85#include "process-util.h"
8fe0087e
LP
86#include "ptyfwd.h"
87#include "random-util.h"
88#include "rm-rf.h"
68b02049 89#include "selinux-util.h"
8fe0087e 90#include "signal-util.h"
2583fbea 91#include "socket-util.h"
8fcde012 92#include "stat-util.h"
15a5e950 93#include "stdio-util.h"
07630cea 94#include "string-util.h"
8fe0087e
LP
95#include "strv.h"
96#include "terminal-util.h"
97#include "udev-util.h"
affb60b1 98#include "umask-util.h"
b1d4f8e1 99#include "user-util.h"
8fe0087e 100#include "util.h"
e9642be2 101
0e7ac751
LP
102/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
103 * UID range here */
104#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
105#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
106
113cea80
DH
/* Final state of a container run.
 * NOTE(review): the code consuming this enum lies outside this excerpt; presumably it tells a container
 * that simply exited apart from one that requested a reboot — confirm against the callers. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
111
57fb9fb5
LP
/* Journal linking mode, as selected with --link-journal= (or -j) in parse_argv():
 *   "no"                      → LINK_NO
 *   "auto"                    → LINK_AUTO
 *   "host",  "try-host"       → LINK_HOST
 *   "guest", "try-guest", -j  → LINK_GUEST
 * The "try-" variants additionally set arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3,
} LinkJournal;
88213476
LP
118
119static char *arg_directory = NULL;
ec16945e 120static char *arg_template = NULL;
5f932eb9 121static char *arg_chdir = NULL;
687d0825 122static char *arg_user = NULL;
9444b1f2 123static sd_id128_t arg_uuid = {};
7027ff61 124static char *arg_machine = NULL;
c74e630d
LP
125static const char *arg_selinux_context = NULL;
126static const char *arg_selinux_apifs_context = NULL;
9444b1f2 127static const char *arg_slice = NULL;
ff01d048 128static bool arg_private_network = false;
bc2f673e 129static bool arg_read_only = false;
7732f92b 130static StartMode arg_start_mode = START_PID1;
ec16945e 131static bool arg_ephemeral = false;
57fb9fb5 132static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 133static bool arg_link_journal_try = false;
5076f0cc
LP
134static uint64_t arg_retain =
135 (1ULL << CAP_CHOWN) |
136 (1ULL << CAP_DAC_OVERRIDE) |
137 (1ULL << CAP_DAC_READ_SEARCH) |
138 (1ULL << CAP_FOWNER) |
139 (1ULL << CAP_FSETID) |
140 (1ULL << CAP_IPC_OWNER) |
141 (1ULL << CAP_KILL) |
142 (1ULL << CAP_LEASE) |
143 (1ULL << CAP_LINUX_IMMUTABLE) |
144 (1ULL << CAP_NET_BIND_SERVICE) |
145 (1ULL << CAP_NET_BROADCAST) |
146 (1ULL << CAP_NET_RAW) |
147 (1ULL << CAP_SETGID) |
148 (1ULL << CAP_SETFCAP) |
149 (1ULL << CAP_SETPCAP) |
150 (1ULL << CAP_SETUID) |
151 (1ULL << CAP_SYS_ADMIN) |
152 (1ULL << CAP_SYS_CHROOT) |
153 (1ULL << CAP_SYS_NICE) |
154 (1ULL << CAP_SYS_PTRACE) |
155 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 156 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
157 (1ULL << CAP_SYS_BOOT) |
158 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
159 (1ULL << CAP_AUDIT_CONTROL) |
160 (1ULL << CAP_MKNOD);
5a8af538
LP
161static CustomMount *arg_custom_mounts = NULL;
162static unsigned arg_n_custom_mounts = 0;
f4889f65 163static char **arg_setenv = NULL;
284c0b91 164static bool arg_quiet = false;
8a96d94e 165static bool arg_share_system = false;
eb91eb18 166static bool arg_register = true;
89f7c846 167static bool arg_keep_unit = false;
aa28aefe 168static char **arg_network_interfaces = NULL;
c74e630d 169static char **arg_network_macvlan = NULL;
4bbfe7ad 170static char **arg_network_ipvlan = NULL;
69c79d3c 171static bool arg_network_veth = false;
f6d6bad1 172static char **arg_network_veth_extra = NULL;
f757855e 173static char *arg_network_bridge = NULL;
22b28dfd 174static char *arg_network_zone = NULL;
050f7277 175static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 176static char *arg_image = NULL;
f757855e 177static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 178static ExposePort *arg_expose_ports = NULL;
f36933fe 179static char **arg_property = NULL;
0de7acce 180static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 181static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 182static bool arg_userns_chown = false;
c6c8f6e2 183static int arg_kill_signal = 0;
efdb0237 184static bool arg_unified_cgroup_hierarchy = false;
f757855e
LP
185static SettingsMask arg_settings_mask = 0;
186static int arg_settings_trusted = -1;
187static char **arg_parameters = NULL;
6aadfa4c 188static const char *arg_container_service_name = "systemd-nspawn";
88213476 189
601185b4 190static void help(void) {
88213476
LP
191 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
192 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
193 " -h --help Show this help\n"
194 " --version Print version string\n"
69c79d3c 195 " -q --quiet Do not show status information\n"
1b9e5b12 196 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
197 " --template=PATH Initialize root directory from template directory,\n"
198 " if missing\n"
199 " -x --ephemeral Run container with snapshot of root directory, and\n"
200 " remove it after exit\n"
201 " -i --image=PATH File system device or disk image for the container\n"
7732f92b 202 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 203 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 204 " --chdir=PATH Set working directory in the container\n"
a8828ed9 205 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 206 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 207 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 208 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 209 " --property=NAME=VALUE Set scope unit property\n"
19aac838 210 " -U --private-users=pick Run within user namespace, pick UID/GID range automatically\n"
03cfe0d5 211 " --private-users[=UIDBASE[:NUIDS]]\n"
19aac838
LP
212 " Run within user namespace, user configured UID/GID range\n"
213 " --private-user-chown Adjust OS tree file ownership for private UID/GID range\n"
69c79d3c
LP
214 " --private-network Disable network in container\n"
215 " --network-interface=INTERFACE\n"
216 " Assign an existing network interface to the\n"
217 " container\n"
c74e630d
LP
218 " --network-macvlan=INTERFACE\n"
219 " Create a macvlan network interface based on an\n"
220 " existing network interface to the container\n"
4bbfe7ad
TG
221 " --network-ipvlan=INTERFACE\n"
222 " Create a ipvlan network interface based on an\n"
223 " existing network interface to the container\n"
a8eaaee7 224 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 225 " and container\n"
f6d6bad1
LP
226 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
227 " Add an additional virtual Ethernet link between\n"
228 " host and container\n"
ab046dde 229 " --network-bridge=INTERFACE\n"
a8eaaee7 230 " Add a virtual Ethernet connection between host\n"
ab046dde
TG
231 " and container and add it to an existing bridge on\n"
232 " the host\n"
22b28dfd
LP
233 " --network-zone=NAME Add a virtual Ethernet connection to the container,\n"
234 " and add it to an automatically managed bridge interface\n"
6d0b55c2 235 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 236 " Expose a container IP port on the host\n"
82adf6af
LP
237 " -Z --selinux-context=SECLABEL\n"
238 " Set the SELinux security context to be used by\n"
239 " processes in the container\n"
240 " -L --selinux-apifs-context=SECLABEL\n"
241 " Set the SELinux security context to be used by\n"
242 " API/tmpfs file systems in the container\n"
a8828ed9
DW
243 " --capability=CAP In addition to the default, retain specified\n"
244 " capability\n"
245 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 246 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
247 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
248 " host, try-guest, try-host\n"
574edc90 249 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 250 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
251 " --bind=PATH[:PATH[:OPTIONS]]\n"
252 " Bind mount a file or directory from the host into\n"
a8828ed9 253 " the container\n"
5e5bfa6e
EY
254 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
255 " Similar, but creates a read-only bind mount\n"
06c17c39 256 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
257 " --overlay=PATH[:PATH...]:PATH\n"
258 " Create an overlay mount from the host to \n"
259 " the container\n"
260 " --overlay-ro=PATH[:PATH...]:PATH\n"
261 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 262 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 263 " --share-system Share system namespaces with host\n"
eb91eb18 264 " --register=BOOLEAN Register container as machine\n"
89f7c846 265 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 266 " the service unit nspawn is running in\n"
6d0b55c2 267 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 268 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
6d0b55c2 269 , program_invocation_short_name);
88213476
LP
270}
271
5a8af538
LP
272
273static int custom_mounts_prepare(void) {
274 unsigned i;
275 int r;
276
277 /* Ensure the mounts are applied prefix first. */
278 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
279
280 /* Allocate working directories for the overlay file systems that need it */
281 for (i = 0; i < arg_n_custom_mounts; i++) {
282 CustomMount *m = &arg_custom_mounts[i];
283
0de7acce 284 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
285
286 if (arg_userns_chown) {
287 log_error("--private-users-chown may not be combined with custom root mounts.");
288 return -EINVAL;
289 } else if (arg_uid_shift == UID_INVALID) {
290 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
291 return -EINVAL;
292 }
825d5287
RM
293 }
294
5a8af538
LP
295 if (m->type != CUSTOM_MOUNT_OVERLAY)
296 continue;
297
298 if (m->work_dir)
299 continue;
300
301 if (m->read_only)
302 continue;
303
14bcf25c 304 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
305 if (r < 0)
306 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
307 }
308
309 return 0;
310}
311
efdb0237
LP
312static int detect_unified_cgroup_hierarchy(void) {
313 const char *e;
314 int r;
315
316 /* Allow the user to control whether the unified hierarchy is used */
317 e = getenv("UNIFIED_CGROUP_HIERARCHY");
318 if (e) {
319 r = parse_boolean(e);
320 if (r < 0)
321 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
322
323 arg_unified_cgroup_hierarchy = r;
324 return 0;
325 }
326
327 /* Otherwise inherit the default from the host system */
328 r = cg_unified();
329 if (r < 0)
330 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
331
332 arg_unified_cgroup_hierarchy = r;
333 return 0;
334}
335
88213476
LP
336static int parse_argv(int argc, char *argv[]) {
337
a41fe3a2 338 enum {
acbeb427
ZJS
339 ARG_VERSION = 0x100,
340 ARG_PRIVATE_NETWORK,
bc2f673e 341 ARG_UUID,
5076f0cc 342 ARG_READ_ONLY,
57fb9fb5 343 ARG_CAPABILITY,
420c7379 344 ARG_DROP_CAPABILITY,
17fe0523
LP
345 ARG_LINK_JOURNAL,
346 ARG_BIND,
f4889f65 347 ARG_BIND_RO,
06c17c39 348 ARG_TMPFS,
5a8af538
LP
349 ARG_OVERLAY,
350 ARG_OVERLAY_RO,
eb91eb18 351 ARG_SHARE_SYSTEM,
89f7c846 352 ARG_REGISTER,
aa28aefe 353 ARG_KEEP_UNIT,
69c79d3c 354 ARG_NETWORK_INTERFACE,
c74e630d 355 ARG_NETWORK_MACVLAN,
4bbfe7ad 356 ARG_NETWORK_IPVLAN,
ab046dde 357 ARG_NETWORK_BRIDGE,
22b28dfd 358 ARG_NETWORK_ZONE,
f6d6bad1 359 ARG_NETWORK_VETH_EXTRA,
6afc95b7 360 ARG_PERSONALITY,
4d9f07b4 361 ARG_VOLATILE,
ec16945e 362 ARG_TEMPLATE,
f36933fe 363 ARG_PROPERTY,
6dac160c 364 ARG_PRIVATE_USERS,
c6c8f6e2 365 ARG_KILL_SIGNAL,
f757855e 366 ARG_SETTINGS,
5f932eb9 367 ARG_CHDIR,
7336138e 368 ARG_PRIVATE_USERS_CHOWN,
a41fe3a2
LP
369 };
370
88213476 371 static const struct option options[] = {
aa28aefe
LP
372 { "help", no_argument, NULL, 'h' },
373 { "version", no_argument, NULL, ARG_VERSION },
374 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
375 { "template", required_argument, NULL, ARG_TEMPLATE },
376 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
377 { "user", required_argument, NULL, 'u' },
378 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
7732f92b 379 { "as-pid2", no_argument, NULL, 'a' },
aa28aefe
LP
380 { "boot", no_argument, NULL, 'b' },
381 { "uuid", required_argument, NULL, ARG_UUID },
382 { "read-only", no_argument, NULL, ARG_READ_ONLY },
383 { "capability", required_argument, NULL, ARG_CAPABILITY },
384 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
385 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
386 { "bind", required_argument, NULL, ARG_BIND },
387 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 388 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
389 { "overlay", required_argument, NULL, ARG_OVERLAY },
390 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
391 { "machine", required_argument, NULL, 'M' },
392 { "slice", required_argument, NULL, 'S' },
a5f1cb3b 393 { "setenv", required_argument, NULL, 'E' },
aa28aefe
LP
394 { "selinux-context", required_argument, NULL, 'Z' },
395 { "selinux-apifs-context", required_argument, NULL, 'L' },
396 { "quiet", no_argument, NULL, 'q' },
397 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
398 { "register", required_argument, NULL, ARG_REGISTER },
399 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
400 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 401 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 402 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 403 { "network-veth", no_argument, NULL, 'n' },
f6d6bad1 404 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
ab046dde 405 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
22b28dfd 406 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
6afc95b7 407 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 408 { "image", required_argument, NULL, 'i' },
4d9f07b4 409 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 410 { "port", required_argument, NULL, 'p' },
f36933fe 411 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 412 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
7336138e 413 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN},
c6c8f6e2 414 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
f757855e 415 { "settings", required_argument, NULL, ARG_SETTINGS },
5f932eb9 416 { "chdir", required_argument, NULL, ARG_CHDIR },
eb9da376 417 {}
88213476
LP
418 };
419
9444b1f2 420 int c, r;
6aadfa4c 421 const char *p, *e;
a42c8b54 422 uint64_t plus = 0, minus = 0;
f757855e 423 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
424
425 assert(argc >= 0);
426 assert(argv);
427
19aac838 428 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0)
88213476
LP
429
430 switch (c) {
431
432 case 'h':
601185b4
ZJS
433 help();
434 return 0;
88213476 435
acbeb427 436 case ARG_VERSION:
3f6fd1ba 437 return version();
acbeb427 438
88213476 439 case 'D':
0f03c2a4 440 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 441 if (r < 0)
0f03c2a4 442 return r;
ec16945e
LP
443 break;
444
445 case ARG_TEMPLATE:
0f03c2a4 446 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 447 if (r < 0)
0f03c2a4 448 return r;
88213476
LP
449 break;
450
1b9e5b12 451 case 'i':
0f03c2a4 452 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 453 if (r < 0)
0f03c2a4 454 return r;
ec16945e
LP
455 break;
456
457 case 'x':
458 arg_ephemeral = true;
1b9e5b12
LP
459 break;
460
687d0825 461 case 'u':
2fc09a9c
DM
462 r = free_and_strdup(&arg_user, optarg);
463 if (r < 0)
7027ff61 464 return log_oom();
687d0825 465
f757855e 466 arg_settings_mask |= SETTING_USER;
687d0825
MV
467 break;
468
22b28dfd
LP
469 case ARG_NETWORK_ZONE: {
470 char *j;
471
472 j = strappend("vz-", optarg);
473 if (!j)
474 return log_oom();
475
476 if (!ifname_valid(j)) {
477 log_error("Network zone name not valid: %s", j);
478 free(j);
479 return -EINVAL;
480 }
481
482 free(arg_network_zone);
483 arg_network_zone = j;
484
485 arg_network_veth = true;
486 arg_private_network = true;
487 arg_settings_mask |= SETTING_NETWORK;
488 break;
489 }
490
ab046dde 491 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
492
493 if (!ifname_valid(optarg)) {
494 log_error("Bridge interface name not valid: %s", optarg);
495 return -EINVAL;
496 }
497
f757855e
LP
498 r = free_and_strdup(&arg_network_bridge, optarg);
499 if (r < 0)
500 return log_oom();
ab046dde
TG
501
502 /* fall through */
503
0dfaa006 504 case 'n':
69c79d3c
LP
505 arg_network_veth = true;
506 arg_private_network = true;
f757855e 507 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
508 break;
509
f6d6bad1
LP
510 case ARG_NETWORK_VETH_EXTRA:
511 r = veth_extra_parse(&arg_network_veth_extra, optarg);
512 if (r < 0)
513 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
514
515 arg_private_network = true;
516 arg_settings_mask |= SETTING_NETWORK;
517 break;
518
aa28aefe 519 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
520
521 if (!ifname_valid(optarg)) {
522 log_error("Network interface name not valid: %s", optarg);
523 return -EINVAL;
524 }
525
c74e630d
LP
526 if (strv_extend(&arg_network_interfaces, optarg) < 0)
527 return log_oom();
528
529 arg_private_network = true;
f757855e 530 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
531 break;
532
533 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
534
535 if (!ifname_valid(optarg)) {
536 log_error("MACVLAN network interface name not valid: %s", optarg);
537 return -EINVAL;
538 }
539
c74e630d 540 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
541 return log_oom();
542
4bbfe7ad 543 arg_private_network = true;
f757855e 544 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
545 break;
546
547 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
548
549 if (!ifname_valid(optarg)) {
550 log_error("IPVLAN network interface name not valid: %s", optarg);
551 return -EINVAL;
552 }
553
4bbfe7ad
TG
554 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
555 return log_oom();
556
aa28aefe
LP
557 /* fall through */
558
ff01d048
LP
559 case ARG_PRIVATE_NETWORK:
560 arg_private_network = true;
f757855e 561 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
562 break;
563
0f0dbc46 564 case 'b':
7732f92b
LP
565 if (arg_start_mode == START_PID2) {
566 log_error("--boot and --as-pid2 may not be combined.");
567 return -EINVAL;
568 }
569
570 arg_start_mode = START_BOOT;
571 arg_settings_mask |= SETTING_START_MODE;
572 break;
573
574 case 'a':
575 if (arg_start_mode == START_BOOT) {
576 log_error("--boot and --as-pid2 may not be combined.");
577 return -EINVAL;
578 }
579
580 arg_start_mode = START_PID2;
581 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
582 break;
583
144f0fc0 584 case ARG_UUID:
9444b1f2
LP
585 r = sd_id128_from_string(optarg, &arg_uuid);
586 if (r < 0) {
aa96c6cb 587 log_error("Invalid UUID: %s", optarg);
9444b1f2 588 return r;
aa96c6cb 589 }
f757855e
LP
590
591 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 592 break;
aa96c6cb 593
9444b1f2 594 case 'S':
c74e630d 595 arg_slice = optarg;
144f0fc0
LP
596 break;
597
7027ff61 598 case 'M':
c1521918 599 if (isempty(optarg))
97b11eed 600 arg_machine = mfree(arg_machine);
c1521918 601 else {
0c3c4284 602 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
603 log_error("Invalid machine name: %s", optarg);
604 return -EINVAL;
605 }
7027ff61 606
0c3c4284
LP
607 r = free_and_strdup(&arg_machine, optarg);
608 if (r < 0)
eb91eb18
LP
609 return log_oom();
610
611 break;
612 }
7027ff61 613
82adf6af
LP
614 case 'Z':
615 arg_selinux_context = optarg;
a8828ed9
DW
616 break;
617
82adf6af
LP
618 case 'L':
619 arg_selinux_apifs_context = optarg;
a8828ed9
DW
620 break;
621
bc2f673e
LP
622 case ARG_READ_ONLY:
623 arg_read_only = true;
f757855e 624 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
625 break;
626
420c7379
LP
627 case ARG_CAPABILITY:
628 case ARG_DROP_CAPABILITY: {
6cbe4ed1 629 p = optarg;
9ed794a3 630 for (;;) {
6cbe4ed1 631 _cleanup_free_ char *t = NULL;
5076f0cc 632
6cbe4ed1
SS
633 r = extract_first_word(&p, &t, ",", 0);
634 if (r < 0)
635 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 636
6cbe4ed1
SS
637 if (r == 0)
638 break;
5076f0cc 639
39ed67d1
LP
640 if (streq(t, "all")) {
641 if (c == ARG_CAPABILITY)
a42c8b54 642 plus = (uint64_t) -1;
39ed67d1 643 else
a42c8b54 644 minus = (uint64_t) -1;
39ed67d1 645 } else {
2822da4f
LP
646 int cap;
647
648 cap = capability_from_name(t);
649 if (cap < 0) {
39ed67d1
LP
650 log_error("Failed to parse capability %s.", t);
651 return -EINVAL;
652 }
653
654 if (c == ARG_CAPABILITY)
a42c8b54 655 plus |= 1ULL << (uint64_t) cap;
39ed67d1 656 else
a42c8b54 657 minus |= 1ULL << (uint64_t) cap;
5076f0cc 658 }
5076f0cc
LP
659 }
660
f757855e 661 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
662 break;
663 }
664
57fb9fb5
LP
665 case 'j':
666 arg_link_journal = LINK_GUEST;
574edc90 667 arg_link_journal_try = true;
57fb9fb5
LP
668 break;
669
670 case ARG_LINK_JOURNAL:
53e438e3 671 if (streq(optarg, "auto")) {
57fb9fb5 672 arg_link_journal = LINK_AUTO;
53e438e3
LP
673 arg_link_journal_try = false;
674 } else if (streq(optarg, "no")) {
57fb9fb5 675 arg_link_journal = LINK_NO;
53e438e3
LP
676 arg_link_journal_try = false;
677 } else if (streq(optarg, "guest")) {
57fb9fb5 678 arg_link_journal = LINK_GUEST;
53e438e3
LP
679 arg_link_journal_try = false;
680 } else if (streq(optarg, "host")) {
57fb9fb5 681 arg_link_journal = LINK_HOST;
53e438e3
LP
682 arg_link_journal_try = false;
683 } else if (streq(optarg, "try-guest")) {
574edc90
MP
684 arg_link_journal = LINK_GUEST;
685 arg_link_journal_try = true;
686 } else if (streq(optarg, "try-host")) {
687 arg_link_journal = LINK_HOST;
688 arg_link_journal_try = true;
689 } else {
57fb9fb5
LP
690 log_error("Failed to parse link journal mode %s", optarg);
691 return -EINVAL;
692 }
693
694 break;
695
17fe0523 696 case ARG_BIND:
f757855e
LP
697 case ARG_BIND_RO:
698 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
699 if (r < 0)
700 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 701
f757855e 702 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 703 break;
06c17c39 704
f757855e
LP
705 case ARG_TMPFS:
706 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
707 if (r < 0)
708 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 709
f757855e 710 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 711 break;
5a8af538
LP
712
713 case ARG_OVERLAY:
714 case ARG_OVERLAY_RO: {
715 _cleanup_free_ char *upper = NULL, *destination = NULL;
716 _cleanup_strv_free_ char **lower = NULL;
717 CustomMount *m;
718 unsigned n = 0;
719 char **i;
720
62f9f39a
RM
721 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
722 if (r == -ENOMEM)
06c17c39 723 return log_oom();
62f9f39a
RM
724 else if (r < 0) {
725 log_error("Invalid overlay specification: %s", optarg);
726 return r;
727 }
06c17c39 728
5a8af538
LP
729 STRV_FOREACH(i, lower) {
730 if (!path_is_absolute(*i)) {
731 log_error("Overlay path %s is not absolute.", *i);
732 return -EINVAL;
733 }
734
735 n++;
736 }
737
738 if (n < 2) {
739 log_error("--overlay= needs at least two colon-separated directories specified.");
740 return -EINVAL;
741 }
742
743 if (n == 2) {
744 /* If two parameters are specified,
745 * the first one is the lower, the
746 * second one the upper directory. And
af86c440
ZJS
747 * we'll also define the destination
748 * mount point the same as the upper. */
5a8af538
LP
749 upper = lower[1];
750 lower[1] = NULL;
751
752 destination = strdup(upper);
753 if (!destination)
754 return log_oom();
755
756 } else {
757 upper = lower[n - 2];
758 destination = lower[n - 1];
759 lower[n - 2] = NULL;
760 }
761
f757855e 762 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
763 if (!m)
764 return log_oom();
765
766 m->destination = destination;
767 m->source = upper;
768 m->lower = lower;
769 m->read_only = c == ARG_OVERLAY_RO;
770
771 upper = destination = NULL;
772 lower = NULL;
06c17c39 773
f757855e 774 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
775 break;
776 }
777
a5f1cb3b 778 case 'E': {
f4889f65
LP
779 char **n;
780
781 if (!env_assignment_is_valid(optarg)) {
782 log_error("Environment variable assignment '%s' is not valid.", optarg);
783 return -EINVAL;
784 }
785
786 n = strv_env_set(arg_setenv, optarg);
787 if (!n)
788 return log_oom();
789
790 strv_free(arg_setenv);
791 arg_setenv = n;
f757855e
LP
792
793 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
794 break;
795 }
796
284c0b91
LP
797 case 'q':
798 arg_quiet = true;
799 break;
800
8a96d94e
LP
801 case ARG_SHARE_SYSTEM:
802 arg_share_system = true;
803 break;
804
eb91eb18
LP
805 case ARG_REGISTER:
806 r = parse_boolean(optarg);
807 if (r < 0) {
808 log_error("Failed to parse --register= argument: %s", optarg);
809 return r;
810 }
811
812 arg_register = r;
813 break;
814
89f7c846
LP
815 case ARG_KEEP_UNIT:
816 arg_keep_unit = true;
817 break;
818
6afc95b7
LP
819 case ARG_PERSONALITY:
820
ac45f971 821 arg_personality = personality_from_string(optarg);
050f7277 822 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
823 log_error("Unknown or unsupported personality '%s'.", optarg);
824 return -EINVAL;
825 }
826
f757855e 827 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
828 break;
829
4d9f07b4
LP
830 case ARG_VOLATILE:
831
832 if (!optarg)
f757855e 833 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 834 else {
f757855e 835 VolatileMode m;
4d9f07b4 836
f757855e
LP
837 m = volatile_mode_from_string(optarg);
838 if (m < 0) {
839 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 840 return -EINVAL;
f757855e
LP
841 } else
842 arg_volatile_mode = m;
6d0b55c2
LP
843 }
844
f757855e
LP
845 arg_settings_mask |= SETTING_VOLATILE_MODE;
846 break;
6d0b55c2 847
f757855e
LP
848 case 'p':
849 r = expose_port_parse(&arg_expose_ports, optarg);
850 if (r == -EEXIST)
851 return log_error_errno(r, "Duplicate port specification: %s", optarg);
852 if (r < 0)
853 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 854
f757855e 855 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 856 break;
6d0b55c2 857
f36933fe
LP
858 case ARG_PROPERTY:
859 if (strv_extend(&arg_property, optarg) < 0)
860 return log_oom();
861
862 break;
863
6dac160c 864 case ARG_PRIVATE_USERS:
0de7acce
LP
865
866 r = optarg ? parse_boolean(optarg) : 1;
867 if (r == 0) {
868 /* no: User namespacing off */
869 arg_userns_mode = USER_NAMESPACE_NO;
870 arg_uid_shift = UID_INVALID;
871 arg_uid_range = UINT32_C(0x10000);
872 } else if (r > 0) {
873 /* yes: User namespacing on, UID range is read from root dir */
874 arg_userns_mode = USER_NAMESPACE_FIXED;
875 arg_uid_shift = UID_INVALID;
876 arg_uid_range = UINT32_C(0x10000);
877 } else if (streq(optarg, "pick")) {
878 /* pick: User namespacing on, UID range is picked randomly */
879 arg_userns_mode = USER_NAMESPACE_PICK;
880 arg_uid_shift = UID_INVALID;
881 arg_uid_range = UINT32_C(0x10000);
882 } else {
6dac160c
LP
883 _cleanup_free_ char *buffer = NULL;
884 const char *range, *shift;
885
0de7acce
LP
886 /* anything else: User namespacing on, UID range is explicitly configured */
887
6dac160c
LP
888 range = strchr(optarg, ':');
889 if (range) {
890 buffer = strndup(optarg, range - optarg);
891 if (!buffer)
892 return log_oom();
893 shift = buffer;
894
895 range++;
896 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
897 log_error("Failed to parse UID range: %s", range);
898 return -EINVAL;
899 }
900 } else
901 shift = optarg;
902
903 if (parse_uid(shift, &arg_uid_shift) < 0) {
904 log_error("Failed to parse UID: %s", optarg);
905 return -EINVAL;
906 }
0de7acce
LP
907
908 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
909 }
910
0de7acce 911 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
912 break;
913
0de7acce 914 case 'U':
ccabee0d
LP
915 if (userns_supported()) {
916 arg_userns_mode = USER_NAMESPACE_PICK;
917 arg_uid_shift = UID_INVALID;
918 arg_uid_range = UINT32_C(0x10000);
919
920 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
921 }
922
7336138e
LP
923 break;
924
0de7acce 925 case ARG_PRIVATE_USERS_CHOWN:
19aac838 926 arg_userns_chown = true;
0de7acce
LP
927
928 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
929 break;
930
c6c8f6e2
LP
931 case ARG_KILL_SIGNAL:
932 arg_kill_signal = signal_from_string_try_harder(optarg);
933 if (arg_kill_signal < 0) {
934 log_error("Cannot parse signal: %s", optarg);
935 return -EINVAL;
936 }
937
f757855e
LP
938 arg_settings_mask |= SETTING_KILL_SIGNAL;
939 break;
940
941 case ARG_SETTINGS:
942
943 /* no → do not read files
944 * yes → read files, do not override cmdline, trust only subset
945 * override → read files, override cmdline, trust only subset
946 * trusted → read files, do not override cmdline, trust all
947 */
948
949 r = parse_boolean(optarg);
950 if (r < 0) {
951 if (streq(optarg, "trusted")) {
952 mask_all_settings = false;
953 mask_no_settings = false;
954 arg_settings_trusted = true;
955
956 } else if (streq(optarg, "override")) {
957 mask_all_settings = false;
958 mask_no_settings = true;
959 arg_settings_trusted = -1;
960 } else
961 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
962 } else if (r > 0) {
963 /* yes */
964 mask_all_settings = false;
965 mask_no_settings = false;
966 arg_settings_trusted = -1;
967 } else {
968 /* no */
969 mask_all_settings = true;
970 mask_no_settings = false;
971 arg_settings_trusted = false;
972 }
973
c6c8f6e2
LP
974 break;
975
5f932eb9
LP
976 case ARG_CHDIR:
977 if (!path_is_absolute(optarg)) {
978 log_error("Working directory %s is not an absolute path.", optarg);
979 return -EINVAL;
980 }
981
982 r = free_and_strdup(&arg_chdir, optarg);
983 if (r < 0)
984 return log_oom();
985
986 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
987 break;
988
88213476
LP
989 case '?':
990 return -EINVAL;
991
992 default:
eb9da376 993 assert_not_reached("Unhandled option");
88213476 994 }
88213476 995
eb91eb18
LP
996 if (arg_share_system)
997 arg_register = false;
998
0de7acce 999 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1000 arg_userns_chown = true;
1001
7732f92b 1002 if (arg_start_mode != START_PID1 && arg_share_system) {
eb91eb18
LP
1003 log_error("--boot and --share-system may not be combined.");
1004 return -EINVAL;
1005 }
1006
89f7c846
LP
1007 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1008 log_error("--keep-unit may not be used when invoked from a user session.");
1009 return -EINVAL;
1010 }
1011
1b9e5b12
LP
1012 if (arg_directory && arg_image) {
1013 log_error("--directory= and --image= may not be combined.");
1014 return -EINVAL;
1015 }
1016
ec16945e
LP
1017 if (arg_template && arg_image) {
1018 log_error("--template= and --image= may not be combined.");
1019 return -EINVAL;
1020 }
1021
1022 if (arg_template && !(arg_directory || arg_machine)) {
1023 log_error("--template= needs --directory= or --machine=.");
1024 return -EINVAL;
1025 }
1026
1027 if (arg_ephemeral && arg_template) {
1028 log_error("--ephemeral and --template= may not be combined.");
1029 return -EINVAL;
1030 }
1031
1032 if (arg_ephemeral && arg_image) {
1033 log_error("--ephemeral and --image= may not be combined.");
1034 return -EINVAL;
1035 }
1036
df9a75e4
LP
1037 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1038 log_error("--ephemeral and --link-journal= may not be combined.");
1039 return -EINVAL;
1040 }
1041
ccabee0d 1042 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1043 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1044 return -EOPNOTSUPP;
1045 }
1046
1047 if (arg_userns_chown && arg_read_only) {
1048 log_error("--read-only and --private-users-chown may not be combined.");
1049 return -EINVAL;
1050 }
f757855e 1051
22b28dfd
LP
1052 if (arg_network_bridge && arg_network_zone) {
1053 log_error("--network-bridge= and --network-zone= may not be combined.");
1054 return -EINVAL;
1055 }
1056
f757855e
LP
1057 if (argc > optind) {
1058 arg_parameters = strv_copy(argv + optind);
1059 if (!arg_parameters)
1060 return log_oom();
1061
7732f92b 1062 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1063 }
1064
1065 /* Load all settings from .nspawn files */
1066 if (mask_no_settings)
1067 arg_settings_mask = 0;
1068
1069 /* Don't load any settings from .nspawn files */
1070 if (mask_all_settings)
1071 arg_settings_mask = _SETTINGS_MASK_ALL;
1072
1073 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1074
1075 r = detect_unified_cgroup_hierarchy();
1076 if (r < 0)
1077 return r;
1078
6aadfa4c
ILG
1079 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1080 if (e)
1081 arg_container_service_name = e;
1082
f757855e
LP
1083 return 1;
1084}
1085
1086static int verify_arguments(void) {
1087
1088 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1089 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1090 return -EINVAL;
1091 }
1092
6d0b55c2
LP
1093 if (arg_expose_ports && !arg_private_network) {
1094 log_error("Cannot use --port= without private networking.");
1095 return -EINVAL;
1096 }
1097
1c1ea217
EV
1098#ifndef HAVE_LIBIPTC
1099 if (arg_expose_ports) {
1100 log_error("--port= is not supported, compiled without libiptc support.");
1101 return -EOPNOTSUPP;
1102 }
1103#endif
1104
7732f92b 1105 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1106 arg_kill_signal = SIGRTMIN+3;
1107
f757855e 1108 return 0;
88213476
LP
1109}
1110
03cfe0d5
LP
1111static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1112 assert(p);
1113
0de7acce 1114 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1115 return 0;
1116
1117 if (uid == UID_INVALID && gid == GID_INVALID)
1118 return 0;
1119
1120 if (uid != UID_INVALID) {
1121 uid += arg_uid_shift;
1122
1123 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1124 return -EOVERFLOW;
1125 }
1126
1127 if (gid != GID_INVALID) {
1128 gid += (gid_t) arg_uid_shift;
1129
1130 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1131 return -EOVERFLOW;
1132 }
1133
1134 if (lchown(p, uid, gid) < 0)
1135 return -errno;
b12afc8c
LP
1136
1137 return 0;
1138}
1139
03cfe0d5
LP
/* Creates @path below @root with @mode and, if it was newly created, chowns it
 * via userns_lchown().
 *
 * A directory that already exists is left alone (neither mode nor ownership is
 * touched) and counts as success. Returns 0 on success or a negative
 * errno-style value. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1152
e58a1277 1153static int setup_timezone(const char *dest) {
03cfe0d5
LP
1154 _cleanup_free_ char *p = NULL, *q = NULL;
1155 const char *where, *check, *what;
d4036145
LP
1156 char *z, *y;
1157 int r;
f8440af5 1158
e58a1277
LP
1159 assert(dest);
1160
1161 /* Fix the timezone, if possible */
d4036145
LP
1162 r = readlink_malloc("/etc/localtime", &p);
1163 if (r < 0) {
1164 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1165 return 0;
1166 }
1167
1168 z = path_startswith(p, "../usr/share/zoneinfo/");
1169 if (!z)
1170 z = path_startswith(p, "/usr/share/zoneinfo/");
1171 if (!z) {
1172 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1173 return 0;
1174 }
1175
03cfe0d5 1176 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1177 r = readlink_malloc(where, &q);
1178 if (r >= 0) {
1179 y = path_startswith(q, "../usr/share/zoneinfo/");
1180 if (!y)
1181 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1182
d4036145
LP
1183 /* Already pointing to the right place? Then do nothing .. */
1184 if (y && streq(y, z))
1185 return 0;
1186 }
1187
03cfe0d5 1188 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1189 check = prefix_roota(dest, check);
03cfe0d5 1190 if (laccess(check, F_OK) < 0) {
d4036145
LP
1191 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1192 return 0;
1193 }
68fb0892 1194
79d80fc1
TG
1195 r = unlink(where);
1196 if (r < 0 && errno != ENOENT) {
56f64d95 1197 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1198 return 0;
1199 }
4d9f07b4 1200
03cfe0d5 1201 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1202 if (symlink(what, where) < 0) {
56f64d95 1203 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1204 return 0;
1205 }
e58a1277 1206
03cfe0d5
LP
1207 r = userns_lchown(where, 0, 0);
1208 if (r < 0)
1209 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1210
e58a1277 1211 return 0;
88213476
LP
1212}
1213
2547bb41 1214static int setup_resolv_conf(const char *dest) {
03cfe0d5 1215 const char *where = NULL;
79d80fc1 1216 int r;
2547bb41
LP
1217
1218 assert(dest);
1219
1220 if (arg_private_network)
1221 return 0;
1222
1223 /* Fix resolv.conf, if possible */
03cfe0d5 1224 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1225
f2068bcc 1226 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1227 if (r < 0) {
68a313c5
LP
1228 /* If the file already exists as symlink, let's
1229 * suppress the warning, under the assumption that
1230 * resolved or something similar runs inside and the
1231 * symlink points there.
1232 *
1233 * If the disk image is read-only, there's also no
1234 * point in complaining.
1235 */
1236 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1237 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1238 return 0;
1239 }
2547bb41 1240
03cfe0d5
LP
1241 r = userns_lchown(where, 0, 0);
1242 if (r < 0)
1243 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1244
2547bb41
LP
1245 return 0;
1246}
1247
9f24adc2 1248static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1249 assert(s);
9f24adc2
LP
1250
1251 snprintf(s, 37,
1252 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1253 SD_ID128_FORMAT_VAL(id));
1254
1255 return s;
1256}
1257
04bc4a3f 1258static int setup_boot_id(const char *dest) {
03cfe0d5 1259 const char *from, *to;
39883f62 1260 sd_id128_t rnd = {};
04bc4a3f
LP
1261 char as_uuid[37];
1262 int r;
1263
eb91eb18
LP
1264 if (arg_share_system)
1265 return 0;
1266
04bc4a3f
LP
1267 /* Generate a new randomized boot ID, so that each boot-up of
1268 * the container gets a new one */
1269
03cfe0d5
LP
1270 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1271 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1272
1273 r = sd_id128_randomize(&rnd);
f647962d
MS
1274 if (r < 0)
1275 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1276
9f24adc2 1277 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1278
4c1fc3e4 1279 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
f647962d
MS
1280 if (r < 0)
1281 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1282
03cfe0d5
LP
1283 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1284 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1285 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1286 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1287
1288 unlink(from);
04bc4a3f
LP
1289 return r;
1290}
1291
e58a1277 1292static int copy_devnodes(const char *dest) {
88213476
LP
1293
1294 static const char devnodes[] =
1295 "null\0"
1296 "zero\0"
1297 "full\0"
1298 "random\0"
1299 "urandom\0"
85614d66
TG
1300 "tty\0"
1301 "net/tun\0";
88213476
LP
1302
1303 const char *d;
e58a1277 1304 int r = 0;
7fd1b19b 1305 _cleanup_umask_ mode_t u;
a258bf26
LP
1306
1307 assert(dest);
124640f1
LP
1308
1309 u = umask(0000);
88213476 1310
03cfe0d5
LP
1311 /* Create /dev/net, so that we can create /dev/net/tun in it */
1312 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1313 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1314
88213476 1315 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1316 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1317 struct stat st;
88213476 1318
7f112f50 1319 from = strappend("/dev/", d);
03cfe0d5 1320 to = prefix_root(dest, from);
88213476
LP
1321
1322 if (stat(from, &st) < 0) {
1323
4a62c710
MS
1324 if (errno != ENOENT)
1325 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1326
a258bf26 1327 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1328
03cfe0d5 1329 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1330 return -EIO;
a258bf26 1331
85614d66 1332 } else {
81f5049b
AC
1333 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1334 if (errno != EPERM)
1335 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1336
1337 /* Some systems abusively restrict mknod but
1338 * allow bind mounts. */
1339 r = touch(to);
1340 if (r < 0)
1341 return log_error_errno(r, "touch (%s) failed: %m", to);
1342 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1343 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1344 }
6278cf60 1345
03cfe0d5
LP
1346 r = userns_lchown(to, 0, 0);
1347 if (r < 0)
1348 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1349 }
88213476
LP
1350 }
1351
e58a1277
LP
1352 return r;
1353}
88213476 1354
03cfe0d5
LP
1355static int setup_pts(const char *dest) {
1356 _cleanup_free_ char *options = NULL;
1357 const char *p;
709f6e46 1358 int r;
03cfe0d5
LP
1359
1360#ifdef HAVE_SELINUX
1361 if (arg_selinux_apifs_context)
1362 (void) asprintf(&options,
3dce8915 1363 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1364 arg_uid_shift + TTY_GID,
1365 arg_selinux_apifs_context);
1366 else
1367#endif
1368 (void) asprintf(&options,
3dce8915 1369 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1370 arg_uid_shift + TTY_GID);
f2d88580 1371
03cfe0d5 1372 if (!options)
f2d88580
LP
1373 return log_oom();
1374
03cfe0d5 1375 /* Mount /dev/pts itself */
cc9fce65 1376 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1377 if (mkdir(p, 0755) < 0)
1378 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1379 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1380 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
709f6e46
MS
1381 r = userns_lchown(p, 0, 0);
1382 if (r < 0)
1383 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1384
1385 /* Create /dev/ptmx symlink */
1386 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1387 if (symlink("pts/ptmx", p) < 0)
1388 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1389 r = userns_lchown(p, 0, 0);
1390 if (r < 0)
1391 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1392
03cfe0d5
LP
1393 /* And fix /dev/pts/ptmx ownership */
1394 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1395 r = userns_lchown(p, 0, 0);
1396 if (r < 0)
1397 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1398
f2d88580
LP
1399 return 0;
1400}
1401
e58a1277 1402static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1403 _cleanup_umask_ mode_t u;
1404 const char *to;
e58a1277 1405 int r;
e58a1277
LP
1406
1407 assert(dest);
1408 assert(console);
1409
1410 u = umask(0000);
1411
03cfe0d5 1412 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1413 if (r < 0)
1414 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1415
a258bf26
LP
1416 /* We need to bind mount the right tty to /dev/console since
1417 * ptys can only exist on pts file systems. To have something
81f5049b 1418 * to bind mount things on we create a empty regular file. */
a258bf26 1419
03cfe0d5 1420 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1421 r = touch(to);
1422 if (r < 0)
1423 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1424
4543768d 1425 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1426 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1427
25ea79fe 1428 return 0;
e58a1277
LP
1429}
1430
1431static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1432 const char *from, *to;
7fd1b19b 1433 _cleanup_umask_ mode_t u;
d9603714 1434 int fd, r;
e58a1277 1435
e58a1277 1436 assert(kmsg_socket >= 0);
a258bf26 1437
e58a1277 1438 u = umask(0000);
a258bf26 1439
03cfe0d5 1440 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1441 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1442 * on the reading side behave very similar to /proc/kmsg,
1443 * their writing side behaves differently from /dev/kmsg in
1444 * that writing blocks when nothing is reading. In order to
1445 * avoid any problems with containers deadlocking due to this
1446 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1447 from = prefix_roota(dest, "/run/kmsg");
1448 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1449
4a62c710 1450 if (mkfifo(from, 0600) < 0)
03cfe0d5 1451 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1452 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1453 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1454
1455 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1456 if (fd < 0)
1457 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1458
e58a1277
LP
1459 /* Store away the fd in the socket, so that it stays open as
1460 * long as we run the child */
3ee897d6 1461 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1462 safe_close(fd);
e58a1277 1463
d9603714
DH
1464 if (r < 0)
1465 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1466
03cfe0d5
LP
1467 /* And now make the FIFO unavailable as /run/kmsg... */
1468 (void) unlink(from);
1469
25ea79fe 1470 return 0;
88213476
LP
1471}
1472
1c4baffc 1473static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1474 union in_addr_union *exposed = userdata;
1475
1476 assert(rtnl);
1477 assert(m);
1478 assert(exposed);
1479
7a8f6325 1480 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1481 return 0;
1482}
1483
3a74cea5 1484static int setup_hostname(void) {
3a74cea5 1485
eb91eb18
LP
1486 if (arg_share_system)
1487 return 0;
1488
605f81a8 1489 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1490 return -errno;
3a74cea5 1491
7027ff61 1492 return 0;
3a74cea5
LP
1493}
1494
57fb9fb5 1495static int setup_journal(const char *directory) {
e01ff70a 1496 sd_id128_t this_id;
0f5e1382 1497 _cleanup_free_ char *d = NULL;
e01ff70a 1498 const char *p, *q;
8054d749 1499 bool try;
e01ff70a 1500 char id[33];
57fb9fb5
LP
1501 int r;
1502
df9a75e4
LP
1503 /* Don't link journals in ephemeral mode */
1504 if (arg_ephemeral)
1505 return 0;
1506
8054d749
LP
1507 if (arg_link_journal == LINK_NO)
1508 return 0;
1509
1510 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1511
4d680aee 1512 r = sd_id128_get_machine(&this_id);
f647962d
MS
1513 if (r < 0)
1514 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1515
e01ff70a 1516 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1517 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1518 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1519 if (try)
4d680aee 1520 return 0;
df9a75e4 1521 return -EEXIST;
4d680aee
ZJS
1522 }
1523
03cfe0d5
LP
1524 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1525 if (r < 0)
1526 return log_error_errno(r, "Failed to create /var: %m");
1527
1528 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1529 if (r < 0)
1530 return log_error_errno(r, "Failed to create /var/log: %m");
1531
1532 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1533 if (r < 0)
1534 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1535
e01ff70a
MS
1536 (void) sd_id128_to_string(arg_uuid, id);
1537
03cfe0d5
LP
1538 p = strjoina("/var/log/journal/", id);
1539 q = prefix_roota(directory, p);
27407a01 1540
e26d6ce5 1541 if (path_is_mount_point(p, 0) > 0) {
8054d749
LP
1542 if (try)
1543 return 0;
27407a01 1544
8054d749
LP
1545 log_error("%s: already a mount point, refusing to use for journal", p);
1546 return -EEXIST;
57fb9fb5
LP
1547 }
1548
e26d6ce5 1549 if (path_is_mount_point(q, 0) > 0) {
8054d749
LP
1550 if (try)
1551 return 0;
57fb9fb5 1552
8054d749
LP
1553 log_error("%s: already a mount point, refusing to use for journal", q);
1554 return -EEXIST;
57fb9fb5
LP
1555 }
1556
1557 r = readlink_and_make_absolute(p, &d);
1558 if (r >= 0) {
1559 if ((arg_link_journal == LINK_GUEST ||
1560 arg_link_journal == LINK_AUTO) &&
1561 path_equal(d, q)) {
1562
03cfe0d5 1563 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1564 if (r < 0)
709f6e46 1565 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1566 return 0;
57fb9fb5
LP
1567 }
1568
4a62c710
MS
1569 if (unlink(p) < 0)
1570 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1571 } else if (r == -EINVAL) {
1572
1573 if (arg_link_journal == LINK_GUEST &&
1574 rmdir(p) < 0) {
1575
27407a01
ZJS
1576 if (errno == ENOTDIR) {
1577 log_error("%s already exists and is neither a symlink nor a directory", p);
1578 return r;
4314d33f
MS
1579 } else
1580 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1581 }
4314d33f
MS
1582 } else if (r != -ENOENT)
1583 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1584
1585 if (arg_link_journal == LINK_GUEST) {
1586
1587 if (symlink(q, p) < 0) {
8054d749 1588 if (try) {
56f64d95 1589 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1590 return 0;
4314d33f
MS
1591 } else
1592 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1593 }
1594
03cfe0d5 1595 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1596 if (r < 0)
709f6e46 1597 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1598 return 0;
57fb9fb5
LP
1599 }
1600
1601 if (arg_link_journal == LINK_HOST) {
ccddd104 1602 /* don't create parents here — if the host doesn't have
574edc90 1603 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1604
1605 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1606 if (try) {
56f64d95 1607 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1608 return 0;
4314d33f
MS
1609 } else
1610 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1611 }
1612
27407a01
ZJS
1613 } else if (access(p, F_OK) < 0)
1614 return 0;
57fb9fb5 1615
cdb2b9d0
LP
1616 if (dir_is_empty(q) == 0)
1617 log_warning("%s is not empty, proceeding anyway.", q);
1618
03cfe0d5 1619 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1620 if (r < 0)
1621 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1622
4543768d 1623 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 1624 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1625
27407a01 1626 return 0;
57fb9fb5
LP
1627}
1628
88213476 1629static int drop_capabilities(void) {
a103496c 1630 return capability_bounding_set_drop(arg_retain, false);
88213476
LP
1631}
1632
db999e0f
LP
1633static int reset_audit_loginuid(void) {
1634 _cleanup_free_ char *p = NULL;
1635 int r;
1636
1637 if (arg_share_system)
1638 return 0;
1639
1640 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1641 if (r == -ENOENT)
db999e0f 1642 return 0;
f647962d
MS
1643 if (r < 0)
1644 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1645
1646 /* Already reset? */
1647 if (streq(p, "4294967295"))
1648 return 0;
1649
ad118bda 1650 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1651 if (r < 0) {
10a87006
LP
1652 log_error_errno(r,
1653 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1654 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1655 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1656 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1657 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1658
db999e0f 1659 sleep(5);
77b6e194 1660 }
db999e0f
LP
1661
1662 return 0;
77b6e194
LP
1663}
1664
24fb1112 1665
785890ac
LP
1666static int setup_propagate(const char *root) {
1667 const char *p, *q;
709f6e46 1668 int r;
785890ac
LP
1669
1670 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1671 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1672 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1673 (void) mkdir_p(p, 0600);
1674
709f6e46
MS
1675 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1676 if (r < 0)
1677 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1678
709f6e46
MS
1679 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1680 if (r < 0)
1681 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1682
709f6e46
MS
1683 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1684 if (r < 0)
1685 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1686
03cfe0d5 1687 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
1688 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1689 return log_error_errno(errno, "Failed to install propagation bind mount.");
1690
1691 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1692 return log_error_errno(errno, "Failed to make propagation mount read-only");
1693
1694 return 0;
1695}
1696
1b9e5b12
LP
1697static int setup_image(char **device_path, int *loop_nr) {
1698 struct loop_info64 info = {
1699 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1700 };
1701 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1702 _cleanup_free_ char* loopdev = NULL;
1703 struct stat st;
1704 int r, nr;
1705
1706 assert(device_path);
1707 assert(loop_nr);
ec16945e 1708 assert(arg_image);
1b9e5b12
LP
1709
1710 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1711 if (fd < 0)
1712 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1713
4a62c710
MS
1714 if (fstat(fd, &st) < 0)
1715 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1716
1717 if (S_ISBLK(st.st_mode)) {
1718 char *p;
1719
1720 p = strdup(arg_image);
1721 if (!p)
1722 return log_oom();
1723
1724 *device_path = p;
1725
1726 *loop_nr = -1;
1727
1728 r = fd;
1729 fd = -1;
1730
1731 return r;
1732 }
1733
1734 if (!S_ISREG(st.st_mode)) {
070edd97 1735 log_error("%s is not a regular file or block device.", arg_image);
1b9e5b12
LP
1736 return -EINVAL;
1737 }
1738
1739 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1740 if (control < 0)
1741 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1742
1743 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1744 if (nr < 0)
1745 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1746
1747 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1748 return log_oom();
1749
1750 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1751 if (loop < 0)
1752 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1753
4a62c710
MS
1754 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1755 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1756
1757 if (arg_read_only)
1758 info.lo_flags |= LO_FLAGS_READ_ONLY;
1759
4a62c710
MS
1760 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1761 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1762
1763 *device_path = loopdev;
1764 loopdev = NULL;
1765
1766 *loop_nr = nr;
1767
1768 r = loop;
1769 loop = -1;
1770
1771 return r;
1772}
1773
ada4799a
LP
/* Explanatory text appended to error messages about disk images whose
 * partition table cannot be used. */
#define PARTITION_TABLE_BLURB                                                                           \
        "Note that the disk image needs to either contain only a single MBR partition of\n"            \
        "type 0x83 that is marked bootable, "                                                           \
        "or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"           \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"                \
        "to be bootable with systemd-nspawn."
1780
1b9e5b12
LP
1781static int dissect_image(
1782 int fd,
727fd4fd
LP
1783 char **root_device, bool *root_device_rw,
1784 char **home_device, bool *home_device_rw,
1785 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
1786 bool *secondary) {
1787
1788#ifdef HAVE_BLKID
01dc33ce
ZJS
1789 int home_nr = -1, srv_nr = -1;
1790#ifdef GPT_ROOT_NATIVE
1791 int root_nr = -1;
1792#endif
1793#ifdef GPT_ROOT_SECONDARY
1794 int secondary_root_nr = -1;
1795#endif
f6c51a81 1796 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1b9e5b12
LP
1797 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1798 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1799 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1800 _cleanup_udev_unref_ struct udev *udev = NULL;
1801 struct udev_list_entry *first, *item;
f6c51a81 1802 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 1803 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
1804 const char *pttype = NULL;
1805 blkid_partlist pl;
1806 struct stat st;
c09ef2e4 1807 unsigned i;
1b9e5b12
LP
1808 int r;
1809
1810 assert(fd >= 0);
1811 assert(root_device);
1812 assert(home_device);
1813 assert(srv_device);
1814 assert(secondary);
ec16945e 1815 assert(arg_image);
1b9e5b12
LP
1816
1817 b = blkid_new_probe();
1818 if (!b)
1819 return log_oom();
1820
1821 errno = 0;
1822 r = blkid_probe_set_device(b, fd, 0, 0);
1823 if (r != 0) {
1824 if (errno == 0)
1825 return log_oom();
1826
e1427b13 1827 return log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
1828 }
1829
1830 blkid_probe_enable_partitions(b, 1);
1831 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
1832
1833 errno = 0;
1834 r = blkid_do_safeprobe(b);
1835 if (r == -2 || r == 1) {
ada4799a
LP
1836 log_error("Failed to identify any partition table on\n"
1837 " %s\n"
1838 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1839 return -EINVAL;
1840 } else if (r != 0) {
1841 if (errno == 0)
1842 errno = EIO;
e1427b13 1843 return log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
1844 }
1845
48861960 1846 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
1847
1848 is_gpt = streq_ptr(pttype, "gpt");
1849 is_mbr = streq_ptr(pttype, "dos");
1850
1851 if (!is_gpt && !is_mbr) {
1852 log_error("No GPT or MBR partition table discovered on\n"
1853 " %s\n"
1854 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1855 return -EINVAL;
1856 }
1857
1858 errno = 0;
1859 pl = blkid_probe_get_partitions(b);
1860 if (!pl) {
1861 if (errno == 0)
1862 return log_oom();
1863
1864 log_error("Failed to list partitions of %s", arg_image);
1865 return -errno;
1866 }
1867
1868 udev = udev_new();
1869 if (!udev)
1870 return log_oom();
1871
4a62c710
MS
1872 if (fstat(fd, &st) < 0)
1873 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 1874
c09ef2e4
LP
1875 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
1876 if (!d)
1b9e5b12
LP
1877 return log_oom();
1878
c09ef2e4
LP
1879 for (i = 0;; i++) {
1880 int n, m;
1b9e5b12 1881
c09ef2e4
LP
1882 if (i >= 10) {
1883 log_error("Kernel partitions never appeared.");
1884 return -ENXIO;
1885 }
1886
1887 e = udev_enumerate_new(udev);
1888 if (!e)
1889 return log_oom();
1890
1891 r = udev_enumerate_add_match_parent(e, d);
1892 if (r < 0)
1893 return log_oom();
1894
1895 r = udev_enumerate_scan_devices(e);
1896 if (r < 0)
1897 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
1898
1899 /* Count the partitions enumerated by the kernel */
1900 n = 0;
1901 first = udev_enumerate_get_list_entry(e);
1902 udev_list_entry_foreach(item, first)
1903 n++;
1904
1905 /* Count the partitions enumerated by blkid */
1906 m = blkid_partlist_numof_partitions(pl);
1907 if (n == m + 1)
1908 break;
1909 if (n > m + 1) {
1910 log_error("blkid and kernel partition list do not match.");
1911 return -EIO;
1912 }
1913 if (n < m + 1) {
1914 unsigned j;
1915
1916 /* The kernel has probed fewer partitions than
1917 * blkid? Maybe the kernel prober is still
1918 * running or it got EBUSY because udev
1919 * already opened the device. Let's reprobe
1920 * the device, which is a synchronous call
1921 * that waits until probing is complete. */
1922
1923 for (j = 0; j < 20; j++) {
1924
1925 r = ioctl(fd, BLKRRPART, 0);
1926 if (r < 0)
1927 r = -errno;
1928 if (r >= 0 || r != -EBUSY)
1929 break;
1930
1931 /* If something else has the device
1932 * open, such as an udev rule, the
1933 * ioctl will return EBUSY. Since
1934 * there's no way to wait until it
1935 * isn't busy anymore, let's just wait
1936 * a bit, and try again.
1937 *
1938 * This is really something they
1939 * should fix in the kernel! */
1940
1941 usleep(50 * USEC_PER_MSEC);
1942 }
1943
1944 if (r < 0)
1945 return log_error_errno(r, "Failed to reread partition table: %m");
1946 }
1947
1948 e = udev_enumerate_unref(e);
1949 }
1b9e5b12
LP
1950
1951 first = udev_enumerate_get_list_entry(e);
1952 udev_list_entry_foreach(item, first) {
1953 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 1954 const char *node;
727fd4fd 1955 unsigned long long flags;
1b9e5b12
LP
1956 blkid_partition pp;
1957 dev_t qn;
1958 int nr;
1959
1960 errno = 0;
1961 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
1962 if (!q) {
1963 if (!errno)
1964 errno = ENOMEM;
1965
e1427b13 1966 return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
1967 }
1968
1969 qn = udev_device_get_devnum(q);
1970 if (major(qn) == 0)
1971 continue;
1972
1973 if (st.st_rdev == qn)
1974 continue;
1975
1976 node = udev_device_get_devnode(q);
1977 if (!node)
1978 continue;
1979
1980 pp = blkid_partlist_devno_to_partition(pl, qn);
1981 if (!pp)
1982 continue;
1983
727fd4fd 1984 flags = blkid_partition_get_flags(pp);
727fd4fd 1985
1b9e5b12
LP
1986 nr = blkid_partition_get_partno(pp);
1987 if (nr < 0)
1988 continue;
1989
ada4799a
LP
1990 if (is_gpt) {
1991 sd_id128_t type_id;
1992 const char *stype;
1b9e5b12 1993
f6c51a81
LP
1994 if (flags & GPT_FLAG_NO_AUTO)
1995 continue;
1996
ada4799a
LP
1997 stype = blkid_partition_get_type_string(pp);
1998 if (!stype)
1999 continue;
1b9e5b12 2000
ada4799a 2001 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
2002 continue;
2003
ada4799a 2004 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 2005
ada4799a
LP
2006 if (home && nr >= home_nr)
2007 continue;
1b9e5b12 2008
ada4799a
LP
2009 home_nr = nr;
2010 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 2011
ada4799a
LP
2012 r = free_and_strdup(&home, node);
2013 if (r < 0)
2014 return log_oom();
727fd4fd 2015
ada4799a
LP
2016 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2017
2018 if (srv && nr >= srv_nr)
2019 continue;
2020
2021 srv_nr = nr;
2022 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2023
2024 r = free_and_strdup(&srv, node);
2025 if (r < 0)
2026 return log_oom();
2027 }
1b9e5b12 2028#ifdef GPT_ROOT_NATIVE
ada4799a 2029 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 2030
ada4799a
LP
2031 if (root && nr >= root_nr)
2032 continue;
1b9e5b12 2033
ada4799a
LP
2034 root_nr = nr;
2035 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 2036
ada4799a
LP
2037 r = free_and_strdup(&root, node);
2038 if (r < 0)
2039 return log_oom();
2040 }
1b9e5b12
LP
2041#endif
2042#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
2043 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2044
2045 if (secondary_root && nr >= secondary_root_nr)
2046 continue;
2047
2048 secondary_root_nr = nr;
2049 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2050
2051 r = free_and_strdup(&secondary_root, node);
2052 if (r < 0)
2053 return log_oom();
2054 }
2055#endif
f6c51a81
LP
2056 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
2057
2058 if (generic)
2059 multiple_generic = true;
2060 else {
2061 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
2062
2063 r = free_and_strdup(&generic, node);
2064 if (r < 0)
2065 return log_oom();
2066 }
2067 }
ada4799a
LP
2068
2069 } else if (is_mbr) {
2070 int type;
1b9e5b12 2071
f6c51a81
LP
2072 if (flags != 0x80) /* Bootable flag */
2073 continue;
2074
ada4799a
LP
2075 type = blkid_partition_get_type(pp);
2076 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
2077 continue;
2078
f6c51a81
LP
2079 if (generic)
2080 multiple_generic = true;
2081 else {
2082 generic_rw = true;
727fd4fd 2083
f6c51a81
LP
2084 r = free_and_strdup(&root, node);
2085 if (r < 0)
2086 return log_oom();
2087 }
1b9e5b12 2088 }
1b9e5b12
LP
2089 }
2090
1b9e5b12
LP
2091 if (root) {
2092 *root_device = root;
2093 root = NULL;
727fd4fd
LP
2094
2095 *root_device_rw = root_rw;
1b9e5b12
LP
2096 *secondary = false;
2097 } else if (secondary_root) {
2098 *root_device = secondary_root;
2099 secondary_root = NULL;
727fd4fd
LP
2100
2101 *root_device_rw = secondary_root_rw;
1b9e5b12 2102 *secondary = true;
f6c51a81
LP
2103 } else if (generic) {
2104
2105 /* There were no partitions with precise meanings
2106 * around, but we found generic partitions. In this
2107 * case, if there's only one, we can go ahead and boot
2108 * it, otherwise we bail out, because we really cannot
2109 * make any sense of it. */
2110
2111 if (multiple_generic) {
2112 log_error("Identified multiple bootable Linux partitions on\n"
2113 " %s\n"
2114 PARTITION_TABLE_BLURB, arg_image);
2115 return -EINVAL;
2116 }
2117
2118 *root_device = generic;
2119 generic = NULL;
2120
2121 *root_device_rw = generic_rw;
2122 *secondary = false;
2123 } else {
2124 log_error("Failed to identify root partition in disk image\n"
2125 " %s\n"
2126 PARTITION_TABLE_BLURB, arg_image);
2127 return -EINVAL;
1b9e5b12
LP
2128 }
2129
2130 if (home) {
2131 *home_device = home;
2132 home = NULL;
727fd4fd
LP
2133
2134 *home_device_rw = home_rw;
1b9e5b12
LP
2135 }
2136
2137 if (srv) {
2138 *srv_device = srv;
2139 srv = NULL;
727fd4fd
LP
2140
2141 *srv_device_rw = srv_rw;
1b9e5b12
LP
2142 }
2143
2144 return 0;
2145#else
2146 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 2147 return -EOPNOTSUPP;
1b9e5b12
LP
2148#endif
2149}
2150
/* Probes the file system type of the block device 'what' with libblkid and mounts it.
 *
 * what:      device node to mount (must not be NULL)
 * where:     base mount point (must not be NULL)
 * directory: optional path appended to 'where' to form the final mount point
 * rw:        mount writable if true; forced to false when arg_read_only is set
 *
 * Returns 0 on success and a negative errno-style value on failure. LUKS containers are rejected
 * with -EOPNOTSUPP, as is any call when compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        /* strjoina() allocates on the stack, hence the result stays valid until we return */
        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. Either way there is no usable file
                 * system type. This matches the check done when probing the partition table. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* A genuine probing error; libblkid does not always set errno */
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                return log_error_errno(errno, "Failed to determine file system type of %s", what);
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2212
e01ff70a
MS
2213static int setup_machine_id(const char *directory) {
2214 int r;
2215 const char *etc_machine_id, *t;
2216 _cleanup_free_ char *s = NULL;
2217
2218 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2219
2220 r = read_one_line_file(etc_machine_id, &s);
2221 if (r < 0)
2222 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2223
2224 t = strstrip(s);
2225
2226 if (!isempty(t)) {
2227 r = sd_id128_from_string(t, &arg_uuid);
2228 if (r < 0)
2229 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2230 } else {
2231 if (sd_id128_is_null(arg_uuid)) {
2232 r = sd_id128_randomize(&arg_uuid);
2233 if (r < 0)
2234 return log_error_errno(r, "Failed to generate random machine ID: %m");
2235 }
2236 }
2237
2238 r = machine_id_setup(directory, arg_uuid);
2239 if (r < 0)
2240 return log_error_errno(r, "Failed to setup machine ID: %m");
2241
2242 return 0;
2243}
2244
7336138e
LP
2245static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2246 int r;
2247
2248 assert(directory);
2249
0de7acce 2250 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2251 return 0;
2252
2253 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2254 if (r == -EOPNOTSUPP)
2255 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2256 if (r == -EBADE)
2257 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2258 if (r < 0)
2259 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2260 if (r == 0)
2261 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2262 else
2263 log_debug("Patched directory tree to match UID/GID range.");
2264
2265 return r;
2266}
2267
727fd4fd
LP
2268static int mount_devices(
2269 const char *where,
2270 const char *root_device, bool root_device_rw,
2271 const char *home_device, bool home_device_rw,
2272 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
2273 int r;
2274
2275 assert(where);
2276
2277 if (root_device) {
727fd4fd 2278 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2279 if (r < 0)
2280 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2281 }
2282
2283 if (home_device) {
727fd4fd 2284 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2285 if (r < 0)
2286 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2287 }
2288
2289 if (srv_device) {
727fd4fd 2290 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2291 if (r < 0)
2292 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2293 }
2294
2295 return 0;
2296}
2297
2298static void loop_remove(int nr, int *image_fd) {
2299 _cleanup_close_ int control = -1;
e8c8ddcc 2300 int r;
1b9e5b12
LP
2301
2302 if (nr < 0)
2303 return;
2304
2305 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2306 r = ioctl(*image_fd, LOOP_CLR_FD);
2307 if (r < 0)
5e4074aa 2308 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2309 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2310 }
2311
2312 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2313 if (control < 0) {
56f64d95 2314 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2315 return;
e8c8ddcc 2316 }
1b9e5b12 2317
e8c8ddcc
TG
2318 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2319 if (r < 0)
5e4074aa 2320 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2321}
2322
113cea80 2323/*
6d416b9c
LS
2324 * Return values:
2325 * < 0 : wait_for_terminate() failed to get the state of the
2326 * container, the container was terminated by a signal, or
2327 * failed for an unknown reason. No change is made to the
2328 * container argument.
2329 * > 0 : The program executed in the container terminated with an
2330 * error. The exit code of the program executed in the
919699ec
LP
2331 * container is returned. The container argument has been set
2332 * to CONTAINER_TERMINATED.
6d416b9c
LS
2333 * 0 : The container is being rebooted, has been shut down or exited
2334 * successfully. The container argument has been set to either
2335 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2336 *
6d416b9c
LS
2337 * That is, success is indicated by a return value of zero, and an
2338 * error is indicated by a non-zero value.
113cea80
DH
2339 */
2340static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2341 siginfo_t status;
919699ec 2342 int r;
113cea80
DH
2343
2344 r = wait_for_terminate(pid, &status);
f647962d
MS
2345 if (r < 0)
2346 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2347
2348 switch (status.si_code) {
fddbb89c 2349
113cea80 2350 case CLD_EXITED:
b5a2179b 2351 if (status.si_status == 0)
919699ec 2352 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2353 else
919699ec 2354 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2355
919699ec
LP
2356 *container = CONTAINER_TERMINATED;
2357 return status.si_status;
113cea80
DH
2358
2359 case CLD_KILLED:
2360 if (status.si_status == SIGINT) {
919699ec 2361 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2362 *container = CONTAINER_TERMINATED;
919699ec
LP
2363 return 0;
2364
113cea80 2365 } else if (status.si_status == SIGHUP) {
919699ec 2366 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2367 *container = CONTAINER_REBOOTED;
919699ec 2368 return 0;
113cea80 2369 }
919699ec 2370
113cea80
DH
2371 /* CLD_KILLED fallthrough */
2372
2373 case CLD_DUMPED:
fddbb89c 2374 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2375 return -EIO;
113cea80
DH
2376
2377 default:
fddbb89c 2378 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2379 return -EIO;
113cea80 2380 }
113cea80
DH
2381}
2382
023fb90b
LP
2383static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2384 pid_t pid;
2385
4a0b58c4 2386 pid = PTR_TO_PID(userdata);
023fb90b 2387 if (pid > 0) {
c6c8f6e2 2388 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2389 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2390 sd_event_source_set_userdata(s, NULL);
2391 return 0;
2392 }
2393 }
2394
2395 sd_event_exit(sd_event_source_get_event(s), 0);
2396 return 0;
2397}
2398
ec16945e 2399static int determine_names(void) {
1b9cebf6 2400 int r;
ec16945e 2401
c1521918
LP
2402 if (arg_template && !arg_directory && arg_machine) {
2403
2404 /* If --template= was specified then we should not
2405 * search for a machine, but instead create a new one
2406 * in /var/lib/machine. */
2407
2408 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2409 if (!arg_directory)
2410 return log_oom();
2411 }
2412
ec16945e 2413 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2414 if (arg_machine) {
2415 _cleanup_(image_unrefp) Image *i = NULL;
2416
2417 r = image_find(arg_machine, &i);
2418 if (r < 0)
2419 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2420 else if (r == 0) {
2421 log_error("No image for machine '%s': %m", arg_machine);
2422 return -ENOENT;
2423 }
2424
aceac2f0 2425 if (i->type == IMAGE_RAW)
0f03c2a4 2426 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2427 else
0f03c2a4 2428 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6
LP
2429 if (r < 0)
2430 return log_error_errno(r, "Invalid image directory: %m");
2431
aee327b8
LP
2432 if (!arg_ephemeral)
2433 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2434 } else
ec16945e
LP
2435 arg_directory = get_current_dir_name();
2436
1b9cebf6
LP
2437 if (!arg_directory && !arg_machine) {
2438 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2439 return -EINVAL;
2440 }
2441 }
2442
2443 if (!arg_machine) {
b9ba4dab
LP
2444 if (arg_directory && path_equal(arg_directory, "/"))
2445 arg_machine = gethostname_malloc();
2446 else
2447 arg_machine = strdup(basename(arg_image ?: arg_directory));
2448
ec16945e
LP
2449 if (!arg_machine)
2450 return log_oom();
2451
ae691c1d 2452 hostname_cleanup(arg_machine);
ec16945e
LP
2453 if (!machine_name_is_valid(arg_machine)) {
2454 log_error("Failed to determine machine name automatically, please use -M.");
2455 return -EINVAL;
2456 }
b9ba4dab
LP
2457
2458 if (arg_ephemeral) {
2459 char *b;
2460
2461 /* Add a random suffix when this is an
2462 * ephemeral machine, so that we can run many
2463 * instances at once without manually having
2464 * to specify -M each time. */
2465
2466 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2467 return log_oom();
2468
2469 free(arg_machine);
2470 arg_machine = b;
2471 }
ec16945e
LP
2472 }
2473
2474 return 0;
2475}
2476
03cfe0d5 2477static int determine_uid_shift(const char *directory) {
6dac160c
LP
2478 int r;
2479
0de7acce 2480 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2481 arg_uid_shift = 0;
6dac160c 2482 return 0;
03cfe0d5 2483 }
6dac160c
LP
2484
2485 if (arg_uid_shift == UID_INVALID) {
2486 struct stat st;
2487
03cfe0d5 2488 r = stat(directory, &st);
6dac160c 2489 if (r < 0)
03cfe0d5 2490 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2491
2492 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2493
2494 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2495 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2496 return -EINVAL;
2497 }
2498
2499 arg_uid_range = UINT32_C(0x10000);
2500 }
2501
2502 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2503 log_error("UID base too high for UID range.");
2504 return -EINVAL;
2505 }
2506
6dac160c
LP
2507 return 0;
2508}
2509
03cfe0d5
LP
2510static int inner_child(
2511 Barrier *barrier,
2512 const char *directory,
2513 bool secondary,
2514 int kmsg_socket,
2515 int rtnl_socket,
f757855e 2516 FDSet *fds) {
69c79d3c 2517
03cfe0d5 2518 _cleanup_free_ char *home = NULL;
e01ff70a 2519 char as_uuid[37];
6aadfa4c 2520 unsigned n_env = 1;
03cfe0d5
LP
2521 const char *envp[] = {
2522 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2523 NULL, /* container */
03cfe0d5
LP
2524 NULL, /* TERM */
2525 NULL, /* HOME */
2526 NULL, /* USER */
2527 NULL, /* LOGNAME */
2528 NULL, /* container_uuid */
2529 NULL, /* LISTEN_FDS */
2530 NULL, /* LISTEN_PID */
2531 NULL
2532 };
88213476 2533
2371271c 2534 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2535 int r;
88213476 2536
03cfe0d5
LP
2537 assert(barrier);
2538 assert(directory);
2539 assert(kmsg_socket >= 0);
88213476 2540
efdb0237
LP
2541 cg_unified_flush();
2542
0de7acce 2543 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2544 /* Tell the parent, that it now can write the UID map. */
2545 (void) barrier_place(barrier); /* #1 */
7027ff61 2546
03cfe0d5
LP
2547 /* Wait until the parent wrote the UID map */
2548 if (!barrier_place_and_sync(barrier)) { /* #2 */
2549 log_error("Parent died too early");
2550 return -ESRCH;
2551 }
88213476
LP
2552 }
2553
0de7acce
LP
2554 r = mount_all(NULL,
2555 arg_userns_mode != USER_NAMESPACE_NO,
2556 true,
2557 arg_private_network,
2558 arg_uid_shift,
2559 arg_uid_range,
2560 arg_selinux_apifs_context);
2561
03cfe0d5
LP
2562 if (r < 0)
2563 return r;
2564
d8fc6a00
LP
2565 r = mount_sysfs(NULL);
2566 if (r < 0)
2567 return r;
2568
03cfe0d5
LP
2569 /* Wait until we are cgroup-ified, so that we
2570 * can mount the right cgroup path writable */
2571 if (!barrier_place_and_sync(barrier)) { /* #3 */
2572 log_error("Parent died too early");
2573 return -ESRCH;
88213476
LP
2574 }
2575
e83bebef 2576 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
03cfe0d5
LP
2577 if (r < 0)
2578 return r;
ec16945e 2579
03cfe0d5
LP
2580 r = reset_uid_gid();
2581 if (r < 0)
2582 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 2583
03cfe0d5
LP
2584 r = setup_boot_id(NULL);
2585 if (r < 0)
2586 return r;
ec16945e 2587
03cfe0d5
LP
2588 r = setup_kmsg(NULL, kmsg_socket);
2589 if (r < 0)
2590 return r;
2591 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2592
03cfe0d5 2593 umask(0022);
30535c16 2594
03cfe0d5
LP
2595 if (setsid() < 0)
2596 return log_error_errno(errno, "setsid() failed: %m");
2597
2598 if (arg_private_network)
2599 loopback_setup();
2600
7a8f6325
LP
2601 if (arg_expose_ports) {
2602 r = expose_port_send_rtnl(rtnl_socket);
2603 if (r < 0)
2604 return r;
2605 rtnl_socket = safe_close(rtnl_socket);
2606 }
03cfe0d5 2607
709f6e46
MS
2608 r = drop_capabilities();
2609 if (r < 0)
2610 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2611
2612 setup_hostname();
2613
050f7277 2614 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2615 if (personality(arg_personality) < 0)
2616 return log_error_errno(errno, "personality() failed: %m");
2617 } else if (secondary) {
2618 if (personality(PER_LINUX32) < 0)
2619 return log_error_errno(errno, "personality() failed: %m");
2620 }
2621
2622#ifdef HAVE_SELINUX
2623 if (arg_selinux_context)
2624 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2625 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2626#endif
2627
ee645080 2628 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2629 if (r < 0)
2630 return r;
2631
6aadfa4c
ILG
2632 /* LXC sets container=lxc, so follow the scheme here */
2633 envp[n_env++] = strjoina("container=", arg_container_service_name);
2634
03cfe0d5
LP
2635 envp[n_env] = strv_find_prefix(environ, "TERM=");
2636 if (envp[n_env])
313cefa1 2637 n_env++;
03cfe0d5
LP
2638
2639 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2640 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2641 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2642 return log_oom();
2643
e01ff70a 2644 assert(!sd_id128_equal(arg_uuid, SD_ID128_NULL));
03cfe0d5 2645
e01ff70a
MS
2646 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2647 return log_oom();
03cfe0d5
LP
2648
2649 if (fdset_size(fds) > 0) {
2650 r = fdset_cloexec(fds, false);
2651 if (r < 0)
2652 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2653
2654 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2655 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2656 return log_oom();
2657 }
2658
2371271c
TG
2659 env_use = strv_env_merge(2, envp, arg_setenv);
2660 if (!env_use)
2661 return log_oom();
03cfe0d5
LP
2662
2663 /* Let the parent know that we are ready and
2664 * wait until the parent is ready with the
2665 * setup, too... */
2666 if (!barrier_place_and_sync(barrier)) { /* #4 */
2667 log_error("Parent died too early");
2668 return -ESRCH;
2669 }
2670
5f932eb9
LP
2671 if (arg_chdir)
2672 if (chdir(arg_chdir) < 0)
2673 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2674
7732f92b
LP
2675 if (arg_start_mode == START_PID2) {
2676 r = stub_pid1();
2677 if (r < 0)
2678 return r;
2679 }
2680
03cfe0d5
LP
2681 /* Now, explicitly close the log, so that we
2682 * then can close all remaining fds. Closing
2683 * the log explicitly first has the benefit
2684 * that the logging subsystem knows about it,
2685 * and is thus ready to be reopened should we
2686 * need it again. Note that the other fds
2687 * closed here are at least the locking and
2688 * barrier fds. */
2689 log_close();
2690 (void) fdset_close_others(fds);
2691
7732f92b 2692 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2693 char **a;
2694 size_t m;
2695
2696 /* Automatically search for the init system */
2697
75f32f04
ZJS
2698 m = strv_length(arg_parameters);
2699 a = newa(char*, m + 2);
2700 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2701 a[1 + m] = NULL;
03cfe0d5
LP
2702
2703 a[0] = (char*) "/usr/lib/systemd/systemd";
2704 execve(a[0], a, env_use);
2705
2706 a[0] = (char*) "/lib/systemd/systemd";
2707 execve(a[0], a, env_use);
2708
2709 a[0] = (char*) "/sbin/init";
2710 execve(a[0], a, env_use);
f757855e
LP
2711 } else if (!strv_isempty(arg_parameters))
2712 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2713 else {
5f932eb9 2714 if (!arg_chdir)
d929b0f9
ZJS
2715 /* If we cannot change the directory, we'll end up in /, that is expected. */
2716 (void) chdir(home ?: "/root");
5f932eb9 2717
03cfe0d5
LP
2718 execle("/bin/bash", "-bash", NULL, env_use);
2719 execle("/bin/sh", "-sh", NULL, env_use);
2720 }
2721
35607a8d 2722 r = -errno;
03cfe0d5 2723 (void) log_open();
35607a8d 2724 return log_error_errno(r, "execv() failed: %m");
03cfe0d5
LP
2725}
2726
2727static int outer_child(
2728 Barrier *barrier,
2729 const char *directory,
2730 const char *console,
2731 const char *root_device, bool root_device_rw,
2732 const char *home_device, bool home_device_rw,
2733 const char *srv_device, bool srv_device_rw,
2734 bool interactive,
2735 bool secondary,
2736 int pid_socket,
e01ff70a 2737 int uuid_socket,
03cfe0d5
LP
2738 int kmsg_socket,
2739 int rtnl_socket,
825d5287 2740 int uid_shift_socket,
f757855e 2741 FDSet *fds) {
03cfe0d5
LP
2742
2743 pid_t pid;
2744 ssize_t l;
2745 int r;
2746
2747 assert(barrier);
2748 assert(directory);
2749 assert(console);
2750 assert(pid_socket >= 0);
e01ff70a 2751 assert(uuid_socket >= 0);
03cfe0d5
LP
2752 assert(kmsg_socket >= 0);
2753
efdb0237
LP
2754 cg_unified_flush();
2755
03cfe0d5
LP
2756 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2757 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2758
2759 if (interactive) {
2760 close_nointr(STDIN_FILENO);
2761 close_nointr(STDOUT_FILENO);
2762 close_nointr(STDERR_FILENO);
2763
2764 r = open_terminal(console, O_RDWR);
2765 if (r != STDIN_FILENO) {
2766 if (r >= 0) {
2767 safe_close(r);
2768 r = -EINVAL;
2769 }
2770
2771 return log_error_errno(r, "Failed to open console: %m");
2772 }
2773
2774 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2775 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2776 return log_error_errno(errno, "Failed to duplicate console: %m");
2777 }
2778
2779 r = reset_audit_loginuid();
2780 if (r < 0)
2781 return r;
2782
2783 /* Mark everything as slave, so that we still
2784 * receive mounts from the real root, but don't
2785 * propagate mounts to the real root. */
2786 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2787 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2788
2789 r = mount_devices(directory,
2790 root_device, root_device_rw,
2791 home_device, home_device_rw,
2792 srv_device, srv_device_rw);
2793 if (r < 0)
2794 return r;
2795
391567f4
LP
2796 r = determine_uid_shift(directory);
2797 if (r < 0)
2798 return r;
2799
0de7acce 2800 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2801 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2802 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2803 if (l < 0)
2804 return log_error_errno(errno, "Failed to send UID shift: %m");
2805 if (l != sizeof(arg_uid_shift)) {
2806 log_error("Short write while sending UID shift.");
2807 return -EIO;
2808 }
0e7ac751 2809
0de7acce 2810 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
2811 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2812 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2813 * not it will pick a different one, and send it back to us. */
2814
2815 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2816 if (l < 0)
2817 return log_error_errno(errno, "Failed to recv UID shift: %m");
2818 if (l != sizeof(arg_uid_shift)) {
2819 log_error("Short read while recieving UID shift.");
2820 return -EIO;
2821 }
2822 }
2823
2824 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
2825 }
2826
03cfe0d5
LP
2827 /* Turn directory into bind mount */
2828 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2829 return log_error_errno(errno, "Failed to make bind mount: %m");
2830
7336138e 2831 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
03cfe0d5
LP
2832 if (r < 0)
2833 return r;
2834
0de7acce
LP
2835 r = setup_volatile(
2836 directory,
2837 arg_volatile_mode,
2838 arg_userns_mode != USER_NAMESPACE_NO,
2839 arg_uid_shift,
2840 arg_uid_range,
2841 arg_selinux_context);
03cfe0d5
LP
2842 if (r < 0)
2843 return r;
2844
0de7acce
LP
2845 r = setup_volatile_state(
2846 directory,
2847 arg_volatile_mode,
2848 arg_userns_mode != USER_NAMESPACE_NO,
2849 arg_uid_shift,
2850 arg_uid_range,
2851 arg_selinux_context);
03cfe0d5
LP
2852 if (r < 0)
2853 return r;
2854
03cfe0d5
LP
2855 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2856 if (r < 0)
2857 return r;
2858
03cfe0d5
LP
2859 if (arg_read_only) {
2860 r = bind_remount_recursive(directory, true);
2861 if (r < 0)
2862 return log_error_errno(r, "Failed to make tree read-only: %m");
2863 }
2864
0de7acce
LP
2865 r = mount_all(directory,
2866 arg_userns_mode != USER_NAMESPACE_NO,
2867 false,
2868 arg_private_network,
2869 arg_uid_shift,
2870 arg_uid_range,
2871 arg_selinux_apifs_context);
03cfe0d5
LP
2872 if (r < 0)
2873 return r;
2874
07fa00f9
LP
2875 r = copy_devnodes(directory);
2876 if (r < 0)
03cfe0d5
LP
2877 return r;
2878
2879 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2880
07fa00f9
LP
2881 r = setup_pts(directory);
2882 if (r < 0)
03cfe0d5
LP
2883 return r;
2884
2885 r = setup_propagate(directory);
2886 if (r < 0)
2887 return r;
2888
2889 r = setup_dev_console(directory, console);
2890 if (r < 0)
2891 return r;
2892
f011b0b8 2893 r = setup_seccomp(arg_retain);
03cfe0d5
LP
2894 if (r < 0)
2895 return r;
2896
2897 r = setup_timezone(directory);
2898 if (r < 0)
2899 return r;
2900
2901 r = setup_resolv_conf(directory);
2902 if (r < 0)
2903 return r;
2904
e01ff70a
MS
2905 r = setup_machine_id(directory);
2906 if (r < 0)
2907 return r;
2908
03cfe0d5
LP
2909 r = setup_journal(directory);
2910 if (r < 0)
2911 return r;
2912
0de7acce
LP
2913 r = mount_custom(
2914 directory,
2915 arg_custom_mounts,
2916 arg_n_custom_mounts,
2917 arg_userns_mode != USER_NAMESPACE_NO,
2918 arg_uid_shift,
2919 arg_uid_range,
2920 arg_selinux_apifs_context);
03cfe0d5
LP
2921 if (r < 0)
2922 return r;
2923
0de7acce
LP
2924 r = mount_cgroups(
2925 directory,
2926 arg_unified_cgroup_hierarchy,
2927 arg_userns_mode != USER_NAMESPACE_NO,
2928 arg_uid_shift,
2929 arg_uid_range,
2930 arg_selinux_apifs_context);
03cfe0d5
LP
2931 if (r < 0)
2932 return r;
2933
2934 r = mount_move_root(directory);
2935 if (r < 0)
2936 return log_error_errno(r, "Failed to move root directory: %m");
2937
2938 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2939 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2940 (arg_private_network ? CLONE_NEWNET : 0) |
0de7acce 2941 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0),
03cfe0d5
LP
2942 NULL);
2943 if (pid < 0)
2944 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2945 if (pid == 0) {
2946 pid_socket = safe_close(pid_socket);
e01ff70a 2947 uuid_socket = safe_close(uuid_socket);
825d5287 2948 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2949
2950 /* The inner child has all namespaces that are
2951 * requested, so that we all are owned by the user if
2952 * user namespaces are turned on. */
2953
f757855e 2954 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2955 if (r < 0)
2956 _exit(EXIT_FAILURE);
2957
2958 _exit(EXIT_SUCCESS);
2959 }
2960
2961 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2962 if (l < 0)
2963 return log_error_errno(errno, "Failed to send PID: %m");
2964 if (l != sizeof(pid)) {
2965 log_error("Short write while sending PID.");
2966 return -EIO;
2967 }
2968
e01ff70a
MS
2969 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
2970 if (l < 0)
2971 return log_error_errno(errno, "Failed to send machine ID: %m");
2972 if (l != sizeof(arg_uuid)) {
2973 log_error("Short write while sending machine ID.");
2974 return -EIO;
2975 }
2976
03cfe0d5 2977 pid_socket = safe_close(pid_socket);
e01ff70a 2978 uuid_socket = safe_close(uuid_socket);
327e26d6
KN
2979 kmsg_socket = safe_close(kmsg_socket);
2980 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2981
2982 return 0;
2983}
2984
0e7ac751
LP
2985static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
2986 unsigned n_tries = 100;
2987 uid_t candidate;
2988 int r;
2989
2990 assert(shift);
2991 assert(ret_lock_file);
0de7acce 2992 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
2993 assert(arg_uid_range == 0x10000U);
2994
2995 candidate = *shift;
2996
2997 (void) mkdir("/run/systemd/nspawn-uid", 0755);
2998
2999 for (;;) {
3000 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3001 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
3002
3003 if (--n_tries <= 0)
3004 return -EBUSY;
3005
3006 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
3007 goto next;
3008 if ((candidate & UINT32_C(0xFFFF)) != 0)
3009 goto next;
3010
3011 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3012 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3013 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3014 goto next;
3015 if (r < 0)
3016 return r;
3017
3018 /* Make some superficial checks whether the range is currently known in the user database */
3019 if (getpwuid(candidate))
3020 goto next;
3021 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3022 goto next;
3023 if (getgrgid(candidate))
3024 goto next;
3025 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3026 goto next;
3027
3028 *ret_lock_file = lf;
3029 lf = (struct LockFile) LOCK_FILE_INIT;
3030 *shift = candidate;
3031 return 0;
3032
3033 next:
3034 random_bytes(&candidate, sizeof(candidate));
3035 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
3036 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3037 }
3038}
3039
03cfe0d5
LP
3040static int setup_uid_map(pid_t pid) {
3041 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3042 int r;
3043
3044 assert(pid > 1);
3045
3046 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3047 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 3048 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3049 if (r < 0)
3050 return log_error_errno(r, "Failed to write UID map: %m");
3051
3052 /* We always assign the same UID and GID ranges */
3053 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 3054 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3055 if (r < 0)
3056 return log_error_errno(r, "Failed to write GID map: %m");
3057
3058 return 0;
3059}
3060
f757855e
LP
3061static int load_settings(void) {
3062 _cleanup_(settings_freep) Settings *settings = NULL;
3063 _cleanup_fclose_ FILE *f = NULL;
3064 _cleanup_free_ char *p = NULL;
3065 const char *fn, *i;
3066 int r;
3067
3068 /* If all settings are masked, there's no point in looking for
3069 * the settings file */
3070 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3071 return 0;
3072
3073 fn = strjoina(arg_machine, ".nspawn");
3074
3075 /* We first look in the admin's directories in /etc and /run */
3076 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3077 _cleanup_free_ char *j = NULL;
3078
3079 j = strjoin(i, "/", fn, NULL);
3080 if (!j)
3081 return log_oom();
3082
3083 f = fopen(j, "re");
3084 if (f) {
3085 p = j;
3086 j = NULL;
3087
b938cb90 3088 /* By default, we trust configuration from /etc and /run */
f757855e
LP
3089 if (arg_settings_trusted < 0)
3090 arg_settings_trusted = true;
3091
3092 break;
3093 }
3094
3095 if (errno != ENOENT)
3096 return log_error_errno(errno, "Failed to open %s: %m", j);
3097 }
3098
3099 if (!f) {
3100 /* After that, let's look for a file next to the
3101 * actual image we shall boot. */
3102
3103 if (arg_image) {
3104 p = file_in_same_dir(arg_image, fn);
3105 if (!p)
3106 return log_oom();
3107 } else if (arg_directory) {
3108 p = file_in_same_dir(arg_directory, fn);
3109 if (!p)
3110 return log_oom();
3111 }
3112
3113 if (p) {
3114 f = fopen(p, "re");
3115 if (!f && errno != ENOENT)
3116 return log_error_errno(errno, "Failed to open %s: %m", p);
3117
b938cb90 3118 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
3119 if (arg_settings_trusted < 0)
3120 arg_settings_trusted = false;
3121 }
3122 }
3123
3124 if (!f)
3125 return 0;
3126
3127 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3128
3129 r = settings_load(f, p, &settings);
3130 if (r < 0)
3131 return r;
3132
3133 /* Copy over bits from the settings, unless they have been
3134 * explicitly masked by command line switches. */
3135
7732f92b
LP
3136 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3137 settings->start_mode >= 0) {
3138 arg_start_mode = settings->start_mode;
f757855e
LP
3139
3140 strv_free(arg_parameters);
3141 arg_parameters = settings->parameters;
3142 settings->parameters = NULL;
3143 }
3144
5f932eb9
LP
3145 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3146 settings->working_directory) {
3147 free(arg_chdir);
3148 arg_chdir = settings->working_directory;
3149 settings->working_directory = NULL;
3150 }
3151
f757855e
LP
3152 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3153 settings->environment) {
3154 strv_free(arg_setenv);
3155 arg_setenv = settings->environment;
3156 settings->environment = NULL;
3157 }
3158
3159 if ((arg_settings_mask & SETTING_USER) == 0 &&
3160 settings->user) {
3161 free(arg_user);
3162 arg_user = settings->user;
3163 settings->user = NULL;
3164 }
3165
3166 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3167 uint64_t plus;
f757855e 3168
0e265674
LP
3169 plus = settings->capability;
3170 if (settings_private_network(settings))
3171 plus |= (1ULL << CAP_NET_ADMIN);
3172
3173 if (!arg_settings_trusted && plus != 0) {
3174 if (settings->capability != 0)
3175 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3176 } else
3177 arg_retain |= plus;
f757855e
LP
3178
3179 arg_retain &= ~settings->drop_capability;
3180 }
3181
3182 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3183 settings->kill_signal > 0)
3184 arg_kill_signal = settings->kill_signal;
3185
3186 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3187 settings->personality != PERSONALITY_INVALID)
3188 arg_personality = settings->personality;
3189
3190 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3191 !sd_id128_is_null(settings->machine_id)) {
3192
3193 if (!arg_settings_trusted)
3194 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3195 else
3196 arg_uuid = settings->machine_id;
3197 }
3198
3199 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3200 settings->read_only >= 0)
3201 arg_read_only = settings->read_only;
3202
3203 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3204 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3205 arg_volatile_mode = settings->volatile_mode;
3206
3207 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3208 settings->n_custom_mounts > 0) {
3209
3210 if (!arg_settings_trusted)
3211 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3212 else {
3213 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3214 arg_custom_mounts = settings->custom_mounts;
3215 arg_n_custom_mounts = settings->n_custom_mounts;
3216
3217 settings->custom_mounts = NULL;
3218 settings->n_custom_mounts = 0;
3219 }
3220 }
3221
3222 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3223 (settings->private_network >= 0 ||
3224 settings->network_veth >= 0 ||
3225 settings->network_bridge ||
22b28dfd 3226 settings->network_zone ||
f757855e
LP
3227 settings->network_interfaces ||
3228 settings->network_macvlan ||
f6d6bad1
LP
3229 settings->network_ipvlan ||
3230 settings->network_veth_extra)) {
f757855e
LP
3231
3232 if (!arg_settings_trusted)
3233 log_warning("Ignoring network settings, file %s is not trusted.", p);
3234 else {
f6d6bad1 3235 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3236 arg_private_network = settings_private_network(settings);
3237
f757855e
LP
3238 strv_free(arg_network_interfaces);
3239 arg_network_interfaces = settings->network_interfaces;
3240 settings->network_interfaces = NULL;
3241
3242 strv_free(arg_network_macvlan);
3243 arg_network_macvlan = settings->network_macvlan;
3244 settings->network_macvlan = NULL;
3245
3246 strv_free(arg_network_ipvlan);
3247 arg_network_ipvlan = settings->network_ipvlan;
3248 settings->network_ipvlan = NULL;
3249
f6d6bad1
LP
3250 strv_free(arg_network_veth_extra);
3251 arg_network_veth_extra = settings->network_veth_extra;
3252 settings->network_veth_extra = NULL;
3253
f757855e
LP
3254 free(arg_network_bridge);
3255 arg_network_bridge = settings->network_bridge;
3256 settings->network_bridge = NULL;
22b28dfd
LP
3257
3258 free(arg_network_zone);
3259 arg_network_zone = settings->network_zone;
3260 settings->network_zone = NULL;
f757855e
LP
3261 }
3262 }
3263
3264 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3265 settings->expose_ports) {
3266
3267 if (!arg_settings_trusted)
3268 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3269 else {
3270 expose_port_free_all(arg_expose_ports);
3271 arg_expose_ports = settings->expose_ports;
3272 settings->expose_ports = NULL;
3273 }
3274 }
3275
0de7acce
LP
3276 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3277 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3278
3279 if (!arg_settings_trusted)
3280 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3281 else {
3282 arg_userns_mode = settings->userns_mode;
3283 arg_uid_shift = settings->uid_shift;
3284 arg_uid_range = settings->uid_range;
3285 arg_userns_chown = settings->userns_chown;
3286 }
3287 }
3288
f757855e
LP
3289 return 0;
3290}
3291
03cfe0d5
LP
3292int main(int argc, char *argv[]) {
3293
3294 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3295 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3296 _cleanup_close_ int master = -1, image_fd = -1;
3297 _cleanup_fdset_free_ FDSet *fds = NULL;
3298 int r, n_fd_passed, loop_nr = -1;
5aa3eba5 3299 char veth_name[IFNAMSIZ] = "";
03cfe0d5 3300 bool secondary = false, remove_subvol = false;
72c0a2c2 3301 sigset_t mask_chld;
03cfe0d5
LP
3302 pid_t pid = 0;
3303 int ret = EXIT_SUCCESS;
3304 union in_addr_union exposed = {};
3305 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
7513c5b8 3306 bool interactive, veth_created = false;
03cfe0d5
LP
3307
3308 log_parse_environment();
3309 log_open();
3310
7732f92b
LP
3311 /* Make sure rename_process() in the stub init process can work */
3312 saved_argv = argv;
3313 saved_argc = argc;
3314
03cfe0d5
LP
3315 r = parse_argv(argc, argv);
3316 if (r <= 0)
3317 goto finish;
3318
03cfe0d5
LP
3319 if (geteuid() != 0) {
3320 log_error("Need to be root.");
3321 r = -EPERM;
3322 goto finish;
3323 }
f757855e
LP
3324 r = determine_names();
3325 if (r < 0)
3326 goto finish;
3327
3328 r = load_settings();
3329 if (r < 0)
3330 goto finish;
3331
3332 r = verify_arguments();
3333 if (r < 0)
3334 goto finish;
03cfe0d5
LP
3335
3336 n_fd_passed = sd_listen_fds(false);
3337 if (n_fd_passed > 0) {
3338 r = fdset_new_listen_fds(&fds, false);
3339 if (r < 0) {
3340 log_error_errno(r, "Failed to collect file descriptors: %m");
3341 goto finish;
3342 }
3343 }
3344
3345 if (arg_directory) {
3346 assert(!arg_image);
3347
3348 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3349 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3350 r = -EINVAL;
3351 goto finish;
3352 }
3353
3354 if (arg_ephemeral) {
3355 _cleanup_free_ char *np = NULL;
3356
3357 /* If the specified path is a mount point we
3358 * generate the new snapshot immediately
3359 * inside it under a random name. However if
3360 * the specified is not a mount point we
3361 * create the new snapshot in the parent
3362 * directory, just next to it. */
e26d6ce5 3363 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
3364 if (r < 0) {
3365 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3366 goto finish;
3367 }
3368 if (r > 0)
770b5ce4 3369 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3370 else
770b5ce4 3371 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
3372 if (r < 0) {
3373 log_error_errno(r, "Failed to generate name for snapshot: %m");
3374 goto finish;
3375 }
3376
3377 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3378 if (r < 0) {
3379 log_error_errno(r, "Failed to lock %s: %m", np);
3380 goto finish;
3381 }
3382
5bcd08db 3383 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3384 if (r < 0) {
3385 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3386 goto finish;
ec16945e
LP
3387 }
3388
3389 free(arg_directory);
3390 arg_directory = np;
8a16a7b4 3391 np = NULL;
ec16945e
LP
3392
3393 remove_subvol = true;
30535c16
LP
3394
3395 } else {
3396 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3397 if (r == -EBUSY) {
3398 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3399 goto finish;
3400 }
3401 if (r < 0) {
3402 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3403 return r;
3404 }
3405
3406 if (arg_template) {
5bcd08db 3407 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3408 if (r == -EEXIST) {
3409 if (!arg_quiet)
3410 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3411 } else if (r < 0) {
83521414 3412 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3413 goto finish;
3414 } else {
3415 if (!arg_quiet)
3416 log_info("Populated %s from template %s.", arg_directory, arg_template);
3417 }
3418 }
ec16945e
LP
3419 }
3420
7732f92b 3421 if (arg_start_mode == START_BOOT) {
1b9e5b12 3422 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3423 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3424 r = -EINVAL;
1b9e5b12
LP
3425 goto finish;
3426 }
3427 } else {
3428 const char *p;
3429
16fb773e
LP
3430 p = strjoina(arg_directory, "/usr/");
3431 if (laccess(p, F_OK) < 0) {
3432 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3433 r = -EINVAL;
1b9e5b12 3434 goto finish;
1b9e5b12
LP
3435 }
3436 }
ec16945e 3437
6b9132a9 3438 } else {
1b9e5b12 3439 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3440
ec16945e
LP
3441 assert(arg_image);
3442 assert(!arg_template);
3443
30535c16
LP
3444 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3445 if (r == -EBUSY) {
3446 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3447 goto finish;
3448 }
3449 if (r < 0) {
3450 r = log_error_errno(r, "Failed to create image lock: %m");
3451 goto finish;
3452 }
3453
1b9e5b12 3454 if (!mkdtemp(template)) {
56f64d95 3455 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3456 r = -errno;
6b9132a9 3457 goto finish;
1b9e5b12 3458 }
6b9132a9 3459
1b9e5b12
LP
3460 arg_directory = strdup(template);
3461 if (!arg_directory) {
3462 r = log_oom();
3463 goto finish;
6b9132a9 3464 }
88213476 3465
1b9e5b12
LP
3466 image_fd = setup_image(&device_path, &loop_nr);
3467 if (image_fd < 0) {
3468 r = image_fd;
842f3b0f
LP
3469 goto finish;
3470 }
1b9e5b12 3471
4d9f07b4
LP
3472 r = dissect_image(image_fd,
3473 &root_device, &root_device_rw,
3474 &home_device, &home_device_rw,
3475 &srv_device, &srv_device_rw,
3476 &secondary);
1b9e5b12
LP
3477 if (r < 0)
3478 goto finish;
842f3b0f 3479 }
842f3b0f 3480
5a8af538
LP
3481 r = custom_mounts_prepare();
3482 if (r < 0)
3483 goto finish;
3484
03cfe0d5
LP
3485 interactive =
3486 isatty(STDIN_FILENO) > 0 &&
3487 isatty(STDOUT_FILENO) > 0;
9c857b9d 3488
db7feb7e
LP
3489 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3490 if (master < 0) {
ec16945e 3491 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3492 goto finish;
3493 }
3494
611b312b
LP
3495 r = ptsname_malloc(master, &console);
3496 if (r < 0) {
3497 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 3498 goto finish;
68b02049
DW
3499 }
3500
3501 if (arg_selinux_apifs_context) {
3502 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3503 if (r < 0)
3504 goto finish;
a258bf26
LP
3505 }
3506
a258bf26 3507 if (unlockpt(master) < 0) {
ec16945e 3508 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3509 goto finish;
3510 }
3511
9c857b9d
LP
3512 if (!arg_quiet)
3513 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3514 arg_machine, arg_image ?: arg_directory);
3515
72c0a2c2 3516 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3517
023fb90b
LP
3518 assert_se(sigemptyset(&mask_chld) == 0);
3519 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3520
03cfe0d5
LP
3521 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3522 r = log_error_errno(errno, "Failed to become subreaper: %m");
3523 goto finish;
3524 }
3525
d87be9b0 3526 for (;;) {
03cfe0d5 3527 static const struct sigaction sa = {
189d5bac 3528 .sa_handler = nop_signal_handler,
e866af3a
DH
3529 .sa_flags = SA_NOCLDSTOP,
3530 };
0e7ac751
LP
3531
3532 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3533 _cleanup_close_ int etc_passwd_lock = -1;
3534 _cleanup_close_pair_ int
3535 kmsg_socket_pair[2] = { -1, -1 },
3536 rtnl_socket_pair[2] = { -1, -1 },
3537 pid_socket_pair[2] = { -1, -1 },
3538 uuid_socket_pair[2] = { -1, -1 },
3539 uid_shift_socket_pair[2] = { -1, -1 };
3540 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4afd3348 3541 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
dbb60d69 3542 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4afd3348 3543 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
0e7ac751 3544 ContainerStatus container_status;
dbb60d69 3545 char last_char = 0;
0e7ac751
LP
3546 int ifi = 0;
3547 ssize_t l;
3548
0de7acce 3549 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3550 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3551 * check with getpwuid() if the specific user already exists. Note that /etc might be
3552 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3553 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3554 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3555 * really ours. */
3556
3557 etc_passwd_lock = take_etc_passwd_lock(NULL);
3558 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) {
3559 log_error_errno(r, "Failed to take /etc/passwd lock: %m");
3560 goto finish;
3561 }
3562 }
e866af3a 3563
7566e267 3564 r = barrier_create(&barrier);
a2da110b 3565 if (r < 0) {
da927ba9 3566 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3567 goto finish;
3568 }
3569
4610de50 3570 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
6d0b55c2
LP
3571 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3572 goto finish;
3573 }
3574
4610de50 3575 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
6d0b55c2
LP
3576 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3577 goto finish;
3578 }
3579
4610de50 3580 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
03cfe0d5
LP
3581 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3582 goto finish;
3583 }
3584
e01ff70a
MS
3585 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0) {
3586 r = log_error_errno(errno, "Failed to create id socket pair: %m");
3587 goto finish;
3588 }
3589
0de7acce 3590 if (arg_userns_mode != USER_NAMESPACE_NO)
4610de50 3591 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
825d5287
RM
3592 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3593 goto finish;
3594 }
3595
e866af3a
DH
3596 /* Child can be killed before execv(), so handle SIGCHLD
3597 * in order to interrupt parent's blocking calls and
3598 * give it a chance to call wait() and terminate. */
3599 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3600 if (r < 0) {
ec16945e 3601 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3602 goto finish;
3603 }
3604
e866af3a
DH
3605 r = sigaction(SIGCHLD, &sa, NULL);
3606 if (r < 0) {
ec16945e 3607 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3608 goto finish;
3609 }
3610
03cfe0d5 3611 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
d87be9b0
LP
3612 if (pid < 0) {
3613 if (errno == EINVAL)
ec16945e 3614 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3615 else
ec16945e 3616 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3617
d87be9b0
LP
3618 goto finish;
3619 }
a258bf26 3620
d87be9b0 3621 if (pid == 0) {
03cfe0d5 3622 /* The outer child only has a file system namespace. */
a2da110b
DH
3623 barrier_set_role(&barrier, BARRIER_CHILD);
3624
03e334a1 3625 master = safe_close(master);
a258bf26 3626
03e334a1 3627 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3628 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 3629 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
e01ff70a 3630 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
825d5287 3631 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 3632
ce30c8dc
LP
3633 (void) reset_all_signal_handlers();
3634 (void) reset_signal_mask();
f5c1b9ee 3635
03cfe0d5
LP
3636 r = outer_child(&barrier,
3637 arg_directory,
3638 console,
3639 root_device, root_device_rw,
3640 home_device, home_device_rw,
3641 srv_device, srv_device_rw,
3642 interactive,
3643 secondary,
3644 pid_socket_pair[1],
e01ff70a 3645 uuid_socket_pair[1],
03cfe0d5
LP
3646 kmsg_socket_pair[1],
3647 rtnl_socket_pair[1],
825d5287 3648 uid_shift_socket_pair[1],
f757855e 3649 fds);
0cb9fbcd 3650 if (r < 0)
a2da110b 3651 _exit(EXIT_FAILURE);
d87be9b0 3652
03cfe0d5 3653 _exit(EXIT_SUCCESS);
da5b3bad 3654 }
88213476 3655
a2da110b 3656 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 3657
2feceb5e 3658 fds = fdset_free(fds);
842f3b0f 3659
6d0b55c2
LP
3660 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3661 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 3662 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
e01ff70a 3663 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
82116c43 3664 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
6d0b55c2 3665
0de7acce 3666 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
3667 /* The child just let us know the UID shift it might have read from the image. */
3668 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3669 if (l < 0) {
3670 r = log_error_errno(errno, "Failed to read UID shift: %m");
3671 goto finish;
3672 }
3673 if (l != sizeof(arg_uid_shift)) {
3674 log_error("Short read while reading UID shift.");
3675 r = EIO;
3676 goto finish;
3677 }
3678
0de7acce 3679 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3680 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3681 * image, but if that's already in use, pick a new one, and report back to the child,
3682 * which one we now picked. */
3683
3684 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3685 if (r < 0) {
3686 log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3687 goto finish;
3688 }
3689
3690 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3691 if (l < 0) {
3692 r = log_error_errno(errno, "Failed to send UID shift: %m");
3693 goto finish;
3694 }
3695 if (l != sizeof(arg_uid_shift)) {
3696 log_error("Short write while writing UID shift.");
3697 r = -EIO;
3698 goto finish;
3699 }
3700 }
3701 }
3702
03cfe0d5
LP
3703 /* Wait for the outer child. */
3704 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3705 if (r < 0)
3706 goto finish;
3707 if (r != 0) {
3708 r = -EIO;
3709 goto finish;
3710 }
3711 pid = 0;
6dac160c 3712
03cfe0d5
LP
3713 /* And now retrieve the PID of the inner child. */
3714 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3715 if (l < 0) {
3716 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3717 goto finish;
3718 }
3719 if (l != sizeof(pid)) {
76d44882 3720 log_error("Short read while reading inner child PID.");
03cfe0d5
LP
3721 r = EIO;
3722 goto finish;
3723 }
354bfd2b 3724
e01ff70a
MS
3725 /* We also retrieve container UUID in case it was generated by outer child */
3726 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof(arg_uuid), 0);
3727 if (l < 0) {
3728 r = log_error_errno(errno, "Failed to read container machine ID: %m");
3729 goto finish;
3730 }
3731 if (l != sizeof(arg_uuid)) {
3732 log_error("Short read while reading container machined ID.");
3733 r = EIO;
3734 goto finish;
3735 }
3736
03cfe0d5 3737 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 3738
0de7acce 3739 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3740 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3741 log_error("Child died too early.");
3742 r = -ESRCH;
840295fc 3743 goto finish;
03cfe0d5 3744 }
ab046dde 3745
03cfe0d5 3746 r = setup_uid_map(pid);
840295fc
LP
3747 if (r < 0)
3748 goto finish;
ab046dde 3749
03cfe0d5
LP
3750 (void) barrier_place(&barrier); /* #2 */
3751 }
c74e630d 3752
9a2a5625 3753 if (arg_private_network) {
4bbfe7ad 3754
9a2a5625
LP
3755 r = move_network_interfaces(pid, arg_network_interfaces);
3756 if (r < 0)
3757 goto finish;
5aa4bb6b 3758
9a2a5625 3759 if (arg_network_veth) {
22b28dfd
LP
3760 r = setup_veth(arg_machine, pid, veth_name,
3761 arg_network_bridge || arg_network_zone);
9a2a5625
LP
3762 if (r < 0)
3763 goto finish;
3764 else if (r > 0)
3765 ifi = r;
6dac160c 3766
9a2a5625 3767 if (arg_network_bridge) {
22b28dfd
LP
3768 /* Add the interface to a bridge */
3769 r = setup_bridge(veth_name, arg_network_bridge, false);
3770 if (r < 0)
3771 goto finish;
3772 if (r > 0)
3773 ifi = r;
3774 } else if (arg_network_zone) {
3775 /* Add the interface to a bridge, possibly creating it */
3776 r = setup_bridge(veth_name, arg_network_zone, true);
9a2a5625
LP
3777 if (r < 0)
3778 goto finish;
3779 if (r > 0)
3780 ifi = r;
3781 }
3782 }
6dac160c 3783
f6d6bad1
LP
3784 r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
3785 if (r < 0)
3786 goto finish;
3787
7513c5b8
LP
3788 /* We created the primary and extra veth links now; let's remember this, so that we know to
3789 remove them later on. Note that we don't bother with removing veth links that were created
3790 here when their setup failed half-way, because in that case the kernel should be able to
3791 remove them on its own, since they cannot be referenced by anything yet. */
3792 veth_created = true;
3793
9a2a5625
LP
3794 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3795 if (r < 0)
3796 goto finish;
3797
3798 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3799 if (r < 0)
3800 goto finish;
3801 }
6dac160c 3802
b7103bc5
LP
3803 if (arg_register) {
3804 r = register_machine(
3805 arg_machine,
3806 pid,
3807 arg_directory,
3808 arg_uuid,
3809 ifi,
3810 arg_slice,
3811 arg_custom_mounts, arg_n_custom_mounts,
3812 arg_kill_signal,
3813 arg_property,
6aadfa4c
ILG
3814 arg_keep_unit,
3815 arg_container_service_name);
b7103bc5
LP
3816 if (r < 0)
3817 goto finish;
3818 }
6dac160c 3819
34829a32 3820 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
efdb0237
LP
3821 if (r < 0)
3822 goto finish;
3823
34829a32
LP
3824 if (arg_keep_unit) {
3825 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3826 if (r < 0)
3827 goto finish;
3828 }
efdb0237 3829
34829a32 3830 r = chown_cgroup(pid, arg_uid_shift);
03cfe0d5
LP
3831 if (r < 0)
3832 goto finish;
6dac160c 3833
03cfe0d5
LP
3834 /* Notify the child that the parent is ready with all
3835 * its setup (including cgroup-ification), and that
3836 * the child can now hand over control to the code to
3837 * run inside the container. */
3838 (void) barrier_place(&barrier); /* #3 */
6dac160c 3839
03cfe0d5
LP
3840 /* Block SIGCHLD here, before notifying child.
3841 * process_pty() will handle it with the other signals. */
3842 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 3843
03cfe0d5
LP
3844 /* Reset signal to default */
3845 r = default_signals(SIGCHLD, -1);
3846 if (r < 0) {
3847 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3848 goto finish;
3849 }
e866af3a 3850
03cfe0d5 3851 /* Let the child know that we are ready and wait that the child is completely ready now. */
c0ffce2b
KN
3852 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3853 log_error("Child died too early.");
03cfe0d5
LP
3854 r = -ESRCH;
3855 goto finish;
3856 }
b12afc8c 3857
0e7ac751
LP
3858 /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear
3859 * in getpwuid(), thus we can release the /etc/passwd lock. */
3860 etc_passwd_lock = safe_close(etc_passwd_lock);
3861
03cfe0d5
LP
3862 sd_notifyf(false,
3863 "READY=1\n"
3864 "STATUS=Container running.\n"
3865 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 3866
03cfe0d5
LP
3867 r = sd_event_new(&event);
3868 if (r < 0) {
3869 log_error_errno(r, "Failed to get default event source: %m");
3870 goto finish;
3871 }
88213476 3872
03cfe0d5
LP
3873 if (arg_kill_signal > 0) {
3874 /* Try to kill the init system on SIGINT or SIGTERM */
4a0b58c4
LP
3875 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
3876 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
03cfe0d5
LP
3877 } else {
3878 /* Immediately exit */
3879 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3880 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3881 }
023fb90b 3882
03cfe0d5
LP
3883 /* simply exit on sigchld */
3884 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 3885
03cfe0d5 3886 if (arg_expose_ports) {
7a8f6325 3887 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
03cfe0d5
LP
3888 if (r < 0)
3889 goto finish;
023fb90b 3890
7a8f6325 3891 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
03cfe0d5 3892 }
023fb90b 3893
03cfe0d5 3894 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 3895
ae3dde80 3896 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
03cfe0d5
LP
3897 if (r < 0) {
3898 log_error_errno(r, "Failed to create PTY forwarder: %m");
3899 goto finish;
3900 }
023fb90b 3901
03cfe0d5
LP
3902 r = sd_event_loop(event);
3903 if (r < 0) {
3904 log_error_errno(r, "Failed to run event loop: %m");
3905 goto finish;
3906 }
6d0b55c2 3907
03cfe0d5 3908 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 3909
03cfe0d5 3910 forward = pty_forward_free(forward);
6d0b55c2 3911
03cfe0d5
LP
3912 if (!arg_quiet && last_char != '\n')
3913 putc('\n', stdout);
04d39279 3914
03cfe0d5 3915 /* Kill if it is not dead yet anyway */
b7103bc5
LP
3916 if (arg_register && !arg_keep_unit)
3917 terminate_machine(pid);
1f0cd86b 3918
840295fc 3919 /* Normally redundant, but better safe than sorry */
04d39279 3920 kill(pid, SIGKILL);
a258bf26 3921
113cea80 3922 r = wait_for_container(pid, &container_status);
04d39279
LP
3923 pid = 0;
3924
ec16945e 3925 if (r < 0)
ce9f1527
LP
3926 /* We failed to wait for the container, or the
3927 * container exited abnormally */
ec16945e 3928 goto finish;
9ed794a3 3929 else if (r > 0 || container_status == CONTAINER_TERMINATED) {
ce9f1527
LP
3930 /* The container exited with a non-zero
3931 * status, or with zero status and no reboot
3932 * was requested. */
ec16945e 3933 ret = r;
d87be9b0 3934 break;
ec16945e 3935 }
88213476 3936
113cea80 3937 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3938
3939 if (arg_keep_unit) {
3940 /* Special handling if we are running as a
3941 * service: instead of simply restarting the
3942 * machine we want to restart the entire
3943 * service, so let's inform systemd about this
3944 * with the special exit code 133. The service
3945 * file uses RestartForceExitStatus=133 so
3946 * that this results in a full nspawn
3947 * restart. This is necessary since we might
3948 * have cgroup parameters set we want to have
3949 * flushed out. */
ec16945e
LP
3950 ret = 133;
3951 r = 0;
ce38dbc8
LP
3952 break;
3953 }
6d0b55c2 3954
7a8f6325 3955 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8 3956
ef3b2aa7 3957 (void) remove_veth_links(veth_name, arg_network_veth_extra);
7513c5b8 3958 veth_created = false;
d87be9b0 3959 }
88213476
LP
3960
3961finish:
af4ec430
LP
3962 sd_notify(false,
3963 "STOPPING=1\n"
3964 "STATUS=Terminating...");
3965
9444b1f2
LP
3966 if (pid > 0)
3967 kill(pid, SIGKILL);
88213476 3968
503546da
LP
3969 /* Try to flush whatever is still queued in the pty */
3970 if (master >= 0)
59f448cf 3971 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
503546da 3972
03cfe0d5
LP
3973 loop_remove(loop_nr, &image_fd);
3974
ec16945e
LP
3975 if (remove_subvol && arg_directory) {
3976 int k;
3977
5bcd08db 3978 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
ec16945e
LP
3979 if (k < 0)
3980 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3981 }
3982
785890ac
LP
3983 if (arg_machine) {
3984 const char *p;
3985
63c372cb 3986 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 3987 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
3988 }
3989
7a8f6325 3990 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
3991
3992 if (veth_created)
3993 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 3994 (void) remove_bridge(arg_network_zone);
f757855e 3995
04d391da 3996 free(arg_directory);
ec16945e
LP
3997 free(arg_template);
3998 free(arg_image);
7027ff61 3999 free(arg_machine);
c74e630d 4000 free(arg_user);
5f932eb9 4001 free(arg_chdir);
c74e630d 4002 strv_free(arg_setenv);
f757855e 4003 free(arg_network_bridge);
c74e630d
LP
4004 strv_free(arg_network_interfaces);
4005 strv_free(arg_network_macvlan);
4bbfe7ad 4006 strv_free(arg_network_ipvlan);
f6d6bad1 4007 strv_free(arg_network_veth_extra);
f757855e
LP
4008 strv_free(arg_parameters);
4009 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4010 expose_port_free_all(arg_expose_ports);
6d0b55c2 4011
ec16945e 4012 return r < 0 ? EXIT_FAILURE : ret;
88213476 4013}