]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
Merge pull request #3428 from toanju/networkd/brvlan
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
88213476 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
88213476
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
8fe0087e
LP
20#ifdef HAVE_BLKID
21#include <blkid/blkid.h>
22#endif
88213476 23#include <errno.h>
88213476 24#include <getopt.h>
0e7ac751 25#include <grp.h>
1b9e5b12 26#include <linux/loop.h>
0e7ac751 27#include <pwd.h>
8fe0087e 28#include <sched.h>
8fe0087e
LP
29#ifdef HAVE_SELINUX
30#include <selinux/selinux.h>
1b9e5b12 31#endif
8fe0087e
LP
32#include <signal.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <sys/file.h>
37#include <sys/mount.h>
38#include <sys/personality.h>
39#include <sys/prctl.h>
40#include <sys/types.h>
41#include <unistd.h>
1b9e5b12 42
1f0cd86b 43#include "sd-daemon.h"
1f0cd86b 44#include "sd-id128.h"
8fe0087e 45
b5efdb8a 46#include "alloc-util.h"
8fe0087e
LP
47#include "barrier.h"
48#include "base-filesystem.h"
49#include "blkid-util.h"
50#include "btrfs-util.h"
8fe0087e 51#include "cap-list.h"
430f0182 52#include "capability-util.h"
04d391da 53#include "cgroup-util.h"
8fe0087e 54#include "copy.h"
4fc9982c 55#include "dev-setup.h"
8fe0087e 56#include "env-util.h"
3ffd4af2 57#include "fd-util.h"
842f3b0f 58#include "fdset.h"
a5c32cff 59#include "fileio.h"
8fe0087e 60#include "formats-util.h"
f4f15635 61#include "fs-util.h"
1b9e5b12 62#include "gpt.h"
8fe0087e
LP
63#include "hostname-util.h"
64#include "log.h"
65#include "loopback-setup.h"
e01ff70a 66#include "machine-id-setup.h"
1b9cebf6 67#include "machine-image.h"
8fe0087e
LP
68#include "macro.h"
69#include "missing.h"
70#include "mkdir.h"
4349cd7c 71#include "mount-util.h"
8fe0087e 72#include "netlink-util.h"
07630cea
LP
73#include "nspawn-cgroup.h"
74#include "nspawn-expose-ports.h"
75#include "nspawn-mount.h"
76#include "nspawn-network.h"
7336138e 77#include "nspawn-patch-uid.h"
07630cea
LP
78#include "nspawn-register.h"
79#include "nspawn-settings.h"
80#include "nspawn-setuid.h"
7732f92b 81#include "nspawn-stub-pid1.h"
f011b0b8 82#include "nspawn-seccomp.h"
6bedfcbb 83#include "parse-util.h"
8fe0087e 84#include "path-util.h"
0b452006 85#include "process-util.h"
8fe0087e
LP
86#include "ptyfwd.h"
87#include "random-util.h"
8869a0b4 88#include "raw-clone.h"
8fe0087e 89#include "rm-rf.h"
68b02049 90#include "selinux-util.h"
8fe0087e 91#include "signal-util.h"
2583fbea 92#include "socket-util.h"
8fcde012 93#include "stat-util.h"
15a5e950 94#include "stdio-util.h"
07630cea 95#include "string-util.h"
8fe0087e
LP
96#include "strv.h"
97#include "terminal-util.h"
98#include "udev-util.h"
affb60b1 99#include "umask-util.h"
b1d4f8e1 100#include "user-util.h"
8fe0087e 101#include "util.h"
e9642be2 102
0e7ac751
LP
103/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
104 * UID range here */
105#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
106#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
107
113cea80
DH
/* How a container run ended. NOTE(review): the code consuming these values is not visible in this part of
 * the file; judging by the names this separates a final exit from a reboot request -- confirm. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
112
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= (or -j, which picks LINK_GUEST in "try" mode).
 * The "try-" prefix of the command line values is not part of this enum; it is tracked separately in
 * arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, the default */
        LINK_HOST,      /* --link-journal=host and try-host */
        LINK_GUEST,     /* --link-journal=guest and try-guest, and -j */
} LinkJournal;
88213476
LP
119
120static char *arg_directory = NULL;
ec16945e 121static char *arg_template = NULL;
5f932eb9 122static char *arg_chdir = NULL;
687d0825 123static char *arg_user = NULL;
9444b1f2 124static sd_id128_t arg_uuid = {};
7027ff61 125static char *arg_machine = NULL;
c74e630d
LP
126static const char *arg_selinux_context = NULL;
127static const char *arg_selinux_apifs_context = NULL;
9444b1f2 128static const char *arg_slice = NULL;
ff01d048 129static bool arg_private_network = false;
bc2f673e 130static bool arg_read_only = false;
7732f92b 131static StartMode arg_start_mode = START_PID1;
ec16945e 132static bool arg_ephemeral = false;
57fb9fb5 133static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 134static bool arg_link_journal_try = false;
520e0d54 135static uint64_t arg_caps_retain =
5076f0cc
LP
136 (1ULL << CAP_CHOWN) |
137 (1ULL << CAP_DAC_OVERRIDE) |
138 (1ULL << CAP_DAC_READ_SEARCH) |
139 (1ULL << CAP_FOWNER) |
140 (1ULL << CAP_FSETID) |
141 (1ULL << CAP_IPC_OWNER) |
142 (1ULL << CAP_KILL) |
143 (1ULL << CAP_LEASE) |
144 (1ULL << CAP_LINUX_IMMUTABLE) |
145 (1ULL << CAP_NET_BIND_SERVICE) |
146 (1ULL << CAP_NET_BROADCAST) |
147 (1ULL << CAP_NET_RAW) |
148 (1ULL << CAP_SETGID) |
149 (1ULL << CAP_SETFCAP) |
150 (1ULL << CAP_SETPCAP) |
151 (1ULL << CAP_SETUID) |
152 (1ULL << CAP_SYS_ADMIN) |
153 (1ULL << CAP_SYS_CHROOT) |
154 (1ULL << CAP_SYS_NICE) |
155 (1ULL << CAP_SYS_PTRACE) |
156 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 157 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
158 (1ULL << CAP_SYS_BOOT) |
159 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
160 (1ULL << CAP_AUDIT_CONTROL) |
161 (1ULL << CAP_MKNOD);
5a8af538
LP
162static CustomMount *arg_custom_mounts = NULL;
163static unsigned arg_n_custom_mounts = 0;
f4889f65 164static char **arg_setenv = NULL;
284c0b91 165static bool arg_quiet = false;
8a96d94e 166static bool arg_share_system = false;
eb91eb18 167static bool arg_register = true;
89f7c846 168static bool arg_keep_unit = false;
aa28aefe 169static char **arg_network_interfaces = NULL;
c74e630d 170static char **arg_network_macvlan = NULL;
4bbfe7ad 171static char **arg_network_ipvlan = NULL;
69c79d3c 172static bool arg_network_veth = false;
f6d6bad1 173static char **arg_network_veth_extra = NULL;
f757855e 174static char *arg_network_bridge = NULL;
22b28dfd 175static char *arg_network_zone = NULL;
050f7277 176static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 177static char *arg_image = NULL;
f757855e 178static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 179static ExposePort *arg_expose_ports = NULL;
f36933fe 180static char **arg_property = NULL;
0de7acce 181static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 182static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 183static bool arg_userns_chown = false;
c6c8f6e2 184static int arg_kill_signal = 0;
efdb0237 185static bool arg_unified_cgroup_hierarchy = false;
f757855e
LP
186static SettingsMask arg_settings_mask = 0;
187static int arg_settings_trusted = -1;
188static char **arg_parameters = NULL;
6aadfa4c 189static const char *arg_container_service_name = "systemd-nspawn";
88213476 190
/* Prints the command line usage summary to stdout, prefixed with the name the program was invoked as.
 *
 * The option spellings listed here have to match the getopt table in parse_argv(): the ownership-shifting
 * switch is "--private-users-chown" (plural "users"), and the --bind-ro= syntax mirrors --bind=. */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " --version Print version string\n"
               " -q --quiet Do not show status information\n"
               " -D --directory=PATH Root directory for the container\n"
               " --template=PATH Initialize root directory from template directory,\n"
               " if missing\n"
               " -x --ephemeral Run container with snapshot of root directory, and\n"
               " remove it after exit\n"
               " -i --image=PATH File system device or disk image for the container\n"
               " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " --chdir=PATH Set working directory in the container\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " -M --machine=NAME Set the machine name for the container\n"
               " --uuid=UUID Set a specific machine UUID for the container\n"
               " -S --slice=SLICE Place the container in the specified slice\n"
               " --property=NAME=VALUE Set scope unit property\n"
               " -U --private-users=pick Run within user namespace, pick UID/GID range automatically\n"
               " --private-users[=UIDBASE[:NUIDS]]\n"
               " Run within user namespace, user configured UID/GID range\n"
               " --private-users-chown Adjust OS tree file ownership for private UID/GID range\n"
               " --private-network Disable network in container\n"
               " --network-interface=INTERFACE\n"
               " Assign an existing network interface to the\n"
               " container\n"
               " --network-macvlan=INTERFACE\n"
               " Create a macvlan network interface based on an\n"
               " existing network interface to the container\n"
               " --network-ipvlan=INTERFACE\n"
               " Create a ipvlan network interface based on an\n"
               " existing network interface to the container\n"
               " -n --network-veth Add a virtual Ethernet connection between host\n"
               " and container\n"
               " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
               " Add an additional virtual Ethernet link between\n"
               " host and container\n"
               " --network-bridge=INTERFACE\n"
               " Add a virtual Ethernet connection between host\n"
               " and container and add it to an existing bridge on\n"
               " the host\n"
               " --network-zone=NAME Add a virtual Ethernet connection to the container,\n"
               " and add it to an automatically managed bridge interface\n"
               " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               " Expose a container IP port on the host\n"
               " -Z --selinux-context=SECLABEL\n"
               " Set the SELinux security context to be used by\n"
               " processes in the container\n"
               " -L --selinux-apifs-context=SECLABEL\n"
               " Set the SELinux security context to be used by\n"
               " API/tmpfs file systems in the container\n"
               " --capability=CAP In addition to the default, retain specified\n"
               " capability\n"
               " --drop-capability=CAP Drop the specified capability from the default set\n"
               " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
               " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
               " host, try-guest, try-host\n"
               " -j Equivalent to --link-journal=try-guest\n"
               " --read-only Mount the root directory read-only\n"
               " --bind=PATH[:PATH[:OPTIONS]]\n"
               " Bind mount a file or directory from the host into\n"
               " the container\n"
               " --bind-ro=PATH[:PATH[:OPTIONS]]\n"
               " Similar, but creates a read-only bind mount\n"
               " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               " --overlay=PATH[:PATH...]:PATH\n"
               " Create an overlay mount from the host to \n"
               " the container\n"
               " --overlay-ro=PATH[:PATH...]:PATH\n"
               " Similar, but creates a read-only overlay mount\n"
               " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
               " --share-system Share system namespaces with host\n"
               " --register=BOOLEAN Register container as machine\n"
               " --keep-unit Do not register a scope for the machine, reuse\n"
               " the service unit nspawn is running in\n"
               " --volatile[=MODE] Run the system in volatile mode\n"
               " --settings=BOOLEAN Load additional settings from .nspawn file\n"
               , program_invocation_short_name);
}
272
5a8af538
LP
273
274static int custom_mounts_prepare(void) {
275 unsigned i;
276 int r;
277
278 /* Ensure the mounts are applied prefix first. */
279 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
280
281 /* Allocate working directories for the overlay file systems that need it */
282 for (i = 0; i < arg_n_custom_mounts; i++) {
283 CustomMount *m = &arg_custom_mounts[i];
284
0de7acce 285 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
286
287 if (arg_userns_chown) {
288 log_error("--private-users-chown may not be combined with custom root mounts.");
289 return -EINVAL;
290 } else if (arg_uid_shift == UID_INVALID) {
291 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
292 return -EINVAL;
293 }
825d5287
RM
294 }
295
5a8af538
LP
296 if (m->type != CUSTOM_MOUNT_OVERLAY)
297 continue;
298
299 if (m->work_dir)
300 continue;
301
302 if (m->read_only)
303 continue;
304
14bcf25c 305 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
306 if (r < 0)
307 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
308 }
309
310 return 0;
311}
312
efdb0237
LP
313static int detect_unified_cgroup_hierarchy(void) {
314 const char *e;
315 int r;
316
317 /* Allow the user to control whether the unified hierarchy is used */
318 e = getenv("UNIFIED_CGROUP_HIERARCHY");
319 if (e) {
320 r = parse_boolean(e);
321 if (r < 0)
322 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
323
324 arg_unified_cgroup_hierarchy = r;
325 return 0;
326 }
327
328 /* Otherwise inherit the default from the host system */
329 r = cg_unified();
330 if (r < 0)
331 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
332
333 arg_unified_cgroup_hierarchy = r;
334 return 0;
335}
336
88213476
LP
337static int parse_argv(int argc, char *argv[]) {
338
a41fe3a2 339 enum {
acbeb427
ZJS
340 ARG_VERSION = 0x100,
341 ARG_PRIVATE_NETWORK,
bc2f673e 342 ARG_UUID,
5076f0cc 343 ARG_READ_ONLY,
57fb9fb5 344 ARG_CAPABILITY,
420c7379 345 ARG_DROP_CAPABILITY,
17fe0523
LP
346 ARG_LINK_JOURNAL,
347 ARG_BIND,
f4889f65 348 ARG_BIND_RO,
06c17c39 349 ARG_TMPFS,
5a8af538
LP
350 ARG_OVERLAY,
351 ARG_OVERLAY_RO,
eb91eb18 352 ARG_SHARE_SYSTEM,
89f7c846 353 ARG_REGISTER,
aa28aefe 354 ARG_KEEP_UNIT,
69c79d3c 355 ARG_NETWORK_INTERFACE,
c74e630d 356 ARG_NETWORK_MACVLAN,
4bbfe7ad 357 ARG_NETWORK_IPVLAN,
ab046dde 358 ARG_NETWORK_BRIDGE,
22b28dfd 359 ARG_NETWORK_ZONE,
f6d6bad1 360 ARG_NETWORK_VETH_EXTRA,
6afc95b7 361 ARG_PERSONALITY,
4d9f07b4 362 ARG_VOLATILE,
ec16945e 363 ARG_TEMPLATE,
f36933fe 364 ARG_PROPERTY,
6dac160c 365 ARG_PRIVATE_USERS,
c6c8f6e2 366 ARG_KILL_SIGNAL,
f757855e 367 ARG_SETTINGS,
5f932eb9 368 ARG_CHDIR,
7336138e 369 ARG_PRIVATE_USERS_CHOWN,
a41fe3a2
LP
370 };
371
88213476 372 static const struct option options[] = {
aa28aefe
LP
373 { "help", no_argument, NULL, 'h' },
374 { "version", no_argument, NULL, ARG_VERSION },
375 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
376 { "template", required_argument, NULL, ARG_TEMPLATE },
377 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
378 { "user", required_argument, NULL, 'u' },
379 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
7732f92b 380 { "as-pid2", no_argument, NULL, 'a' },
aa28aefe
LP
381 { "boot", no_argument, NULL, 'b' },
382 { "uuid", required_argument, NULL, ARG_UUID },
383 { "read-only", no_argument, NULL, ARG_READ_ONLY },
384 { "capability", required_argument, NULL, ARG_CAPABILITY },
385 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
386 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
387 { "bind", required_argument, NULL, ARG_BIND },
388 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 389 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
390 { "overlay", required_argument, NULL, ARG_OVERLAY },
391 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
392 { "machine", required_argument, NULL, 'M' },
393 { "slice", required_argument, NULL, 'S' },
a5f1cb3b 394 { "setenv", required_argument, NULL, 'E' },
aa28aefe
LP
395 { "selinux-context", required_argument, NULL, 'Z' },
396 { "selinux-apifs-context", required_argument, NULL, 'L' },
397 { "quiet", no_argument, NULL, 'q' },
398 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
399 { "register", required_argument, NULL, ARG_REGISTER },
400 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
401 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 402 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 403 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 404 { "network-veth", no_argument, NULL, 'n' },
f6d6bad1 405 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
ab046dde 406 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
22b28dfd 407 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
6afc95b7 408 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 409 { "image", required_argument, NULL, 'i' },
4d9f07b4 410 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 411 { "port", required_argument, NULL, 'p' },
f36933fe 412 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 413 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
7336138e 414 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN},
c6c8f6e2 415 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
f757855e 416 { "settings", required_argument, NULL, ARG_SETTINGS },
5f932eb9 417 { "chdir", required_argument, NULL, ARG_CHDIR },
eb9da376 418 {}
88213476
LP
419 };
420
9444b1f2 421 int c, r;
6aadfa4c 422 const char *p, *e;
a42c8b54 423 uint64_t plus = 0, minus = 0;
f757855e 424 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
425
426 assert(argc >= 0);
427 assert(argv);
428
19aac838 429 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0)
88213476
LP
430
431 switch (c) {
432
433 case 'h':
601185b4
ZJS
434 help();
435 return 0;
88213476 436
acbeb427 437 case ARG_VERSION:
3f6fd1ba 438 return version();
acbeb427 439
88213476 440 case 'D':
0f03c2a4 441 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 442 if (r < 0)
0f03c2a4 443 return r;
ec16945e
LP
444 break;
445
446 case ARG_TEMPLATE:
0f03c2a4 447 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 448 if (r < 0)
0f03c2a4 449 return r;
88213476
LP
450 break;
451
1b9e5b12 452 case 'i':
0f03c2a4 453 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 454 if (r < 0)
0f03c2a4 455 return r;
ec16945e
LP
456 break;
457
458 case 'x':
459 arg_ephemeral = true;
1b9e5b12
LP
460 break;
461
687d0825 462 case 'u':
2fc09a9c
DM
463 r = free_and_strdup(&arg_user, optarg);
464 if (r < 0)
7027ff61 465 return log_oom();
687d0825 466
f757855e 467 arg_settings_mask |= SETTING_USER;
687d0825
MV
468 break;
469
22b28dfd
LP
470 case ARG_NETWORK_ZONE: {
471 char *j;
472
473 j = strappend("vz-", optarg);
474 if (!j)
475 return log_oom();
476
477 if (!ifname_valid(j)) {
478 log_error("Network zone name not valid: %s", j);
479 free(j);
480 return -EINVAL;
481 }
482
483 free(arg_network_zone);
484 arg_network_zone = j;
485
486 arg_network_veth = true;
487 arg_private_network = true;
488 arg_settings_mask |= SETTING_NETWORK;
489 break;
490 }
491
ab046dde 492 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
493
494 if (!ifname_valid(optarg)) {
495 log_error("Bridge interface name not valid: %s", optarg);
496 return -EINVAL;
497 }
498
f757855e
LP
499 r = free_and_strdup(&arg_network_bridge, optarg);
500 if (r < 0)
501 return log_oom();
ab046dde
TG
502
503 /* fall through */
504
0dfaa006 505 case 'n':
69c79d3c
LP
506 arg_network_veth = true;
507 arg_private_network = true;
f757855e 508 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
509 break;
510
f6d6bad1
LP
511 case ARG_NETWORK_VETH_EXTRA:
512 r = veth_extra_parse(&arg_network_veth_extra, optarg);
513 if (r < 0)
514 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
515
516 arg_private_network = true;
517 arg_settings_mask |= SETTING_NETWORK;
518 break;
519
aa28aefe 520 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
521
522 if (!ifname_valid(optarg)) {
523 log_error("Network interface name not valid: %s", optarg);
524 return -EINVAL;
525 }
526
c74e630d
LP
527 if (strv_extend(&arg_network_interfaces, optarg) < 0)
528 return log_oom();
529
530 arg_private_network = true;
f757855e 531 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
532 break;
533
534 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
535
536 if (!ifname_valid(optarg)) {
537 log_error("MACVLAN network interface name not valid: %s", optarg);
538 return -EINVAL;
539 }
540
c74e630d 541 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
542 return log_oom();
543
4bbfe7ad 544 arg_private_network = true;
f757855e 545 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
546 break;
547
548 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
549
550 if (!ifname_valid(optarg)) {
551 log_error("IPVLAN network interface name not valid: %s", optarg);
552 return -EINVAL;
553 }
554
4bbfe7ad
TG
555 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
556 return log_oom();
557
aa28aefe
LP
558 /* fall through */
559
ff01d048
LP
560 case ARG_PRIVATE_NETWORK:
561 arg_private_network = true;
f757855e 562 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
563 break;
564
0f0dbc46 565 case 'b':
7732f92b
LP
566 if (arg_start_mode == START_PID2) {
567 log_error("--boot and --as-pid2 may not be combined.");
568 return -EINVAL;
569 }
570
571 arg_start_mode = START_BOOT;
572 arg_settings_mask |= SETTING_START_MODE;
573 break;
574
575 case 'a':
576 if (arg_start_mode == START_BOOT) {
577 log_error("--boot and --as-pid2 may not be combined.");
578 return -EINVAL;
579 }
580
581 arg_start_mode = START_PID2;
582 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
583 break;
584
144f0fc0 585 case ARG_UUID:
9444b1f2
LP
586 r = sd_id128_from_string(optarg, &arg_uuid);
587 if (r < 0) {
aa96c6cb 588 log_error("Invalid UUID: %s", optarg);
9444b1f2 589 return r;
aa96c6cb 590 }
f757855e
LP
591
592 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 593 break;
aa96c6cb 594
9444b1f2 595 case 'S':
c74e630d 596 arg_slice = optarg;
144f0fc0
LP
597 break;
598
7027ff61 599 case 'M':
c1521918 600 if (isempty(optarg))
97b11eed 601 arg_machine = mfree(arg_machine);
c1521918 602 else {
0c3c4284 603 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
604 log_error("Invalid machine name: %s", optarg);
605 return -EINVAL;
606 }
7027ff61 607
0c3c4284
LP
608 r = free_and_strdup(&arg_machine, optarg);
609 if (r < 0)
eb91eb18
LP
610 return log_oom();
611
612 break;
613 }
7027ff61 614
82adf6af
LP
615 case 'Z':
616 arg_selinux_context = optarg;
a8828ed9
DW
617 break;
618
82adf6af
LP
619 case 'L':
620 arg_selinux_apifs_context = optarg;
a8828ed9
DW
621 break;
622
bc2f673e
LP
623 case ARG_READ_ONLY:
624 arg_read_only = true;
f757855e 625 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
626 break;
627
420c7379
LP
628 case ARG_CAPABILITY:
629 case ARG_DROP_CAPABILITY: {
6cbe4ed1 630 p = optarg;
9ed794a3 631 for (;;) {
6cbe4ed1 632 _cleanup_free_ char *t = NULL;
5076f0cc 633
6cbe4ed1
SS
634 r = extract_first_word(&p, &t, ",", 0);
635 if (r < 0)
636 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 637
6cbe4ed1
SS
638 if (r == 0)
639 break;
5076f0cc 640
39ed67d1
LP
641 if (streq(t, "all")) {
642 if (c == ARG_CAPABILITY)
a42c8b54 643 plus = (uint64_t) -1;
39ed67d1 644 else
a42c8b54 645 minus = (uint64_t) -1;
39ed67d1 646 } else {
2822da4f
LP
647 int cap;
648
649 cap = capability_from_name(t);
650 if (cap < 0) {
39ed67d1
LP
651 log_error("Failed to parse capability %s.", t);
652 return -EINVAL;
653 }
654
655 if (c == ARG_CAPABILITY)
a42c8b54 656 plus |= 1ULL << (uint64_t) cap;
39ed67d1 657 else
a42c8b54 658 minus |= 1ULL << (uint64_t) cap;
5076f0cc 659 }
5076f0cc
LP
660 }
661
f757855e 662 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
663 break;
664 }
665
57fb9fb5
LP
666 case 'j':
667 arg_link_journal = LINK_GUEST;
574edc90 668 arg_link_journal_try = true;
57fb9fb5
LP
669 break;
670
671 case ARG_LINK_JOURNAL:
53e438e3 672 if (streq(optarg, "auto")) {
57fb9fb5 673 arg_link_journal = LINK_AUTO;
53e438e3
LP
674 arg_link_journal_try = false;
675 } else if (streq(optarg, "no")) {
57fb9fb5 676 arg_link_journal = LINK_NO;
53e438e3
LP
677 arg_link_journal_try = false;
678 } else if (streq(optarg, "guest")) {
57fb9fb5 679 arg_link_journal = LINK_GUEST;
53e438e3
LP
680 arg_link_journal_try = false;
681 } else if (streq(optarg, "host")) {
57fb9fb5 682 arg_link_journal = LINK_HOST;
53e438e3
LP
683 arg_link_journal_try = false;
684 } else if (streq(optarg, "try-guest")) {
574edc90
MP
685 arg_link_journal = LINK_GUEST;
686 arg_link_journal_try = true;
687 } else if (streq(optarg, "try-host")) {
688 arg_link_journal = LINK_HOST;
689 arg_link_journal_try = true;
690 } else {
57fb9fb5
LP
691 log_error("Failed to parse link journal mode %s", optarg);
692 return -EINVAL;
693 }
694
695 break;
696
17fe0523 697 case ARG_BIND:
f757855e
LP
698 case ARG_BIND_RO:
699 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
700 if (r < 0)
701 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 702
f757855e 703 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 704 break;
06c17c39 705
f757855e
LP
706 case ARG_TMPFS:
707 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
708 if (r < 0)
709 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 710
f757855e 711 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 712 break;
5a8af538
LP
713
714 case ARG_OVERLAY:
715 case ARG_OVERLAY_RO: {
716 _cleanup_free_ char *upper = NULL, *destination = NULL;
717 _cleanup_strv_free_ char **lower = NULL;
718 CustomMount *m;
719 unsigned n = 0;
720 char **i;
721
62f9f39a
RM
722 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
723 if (r == -ENOMEM)
06c17c39 724 return log_oom();
62f9f39a
RM
725 else if (r < 0) {
726 log_error("Invalid overlay specification: %s", optarg);
727 return r;
728 }
06c17c39 729
5a8af538
LP
730 STRV_FOREACH(i, lower) {
731 if (!path_is_absolute(*i)) {
732 log_error("Overlay path %s is not absolute.", *i);
733 return -EINVAL;
734 }
735
736 n++;
737 }
738
739 if (n < 2) {
740 log_error("--overlay= needs at least two colon-separated directories specified.");
741 return -EINVAL;
742 }
743
744 if (n == 2) {
745 /* If two parameters are specified,
746 * the first one is the lower, the
747 * second one the upper directory. And
af86c440
ZJS
748 * we'll also define the destination
749 * mount point the same as the upper. */
5a8af538
LP
750 upper = lower[1];
751 lower[1] = NULL;
752
753 destination = strdup(upper);
754 if (!destination)
755 return log_oom();
756
757 } else {
758 upper = lower[n - 2];
759 destination = lower[n - 1];
760 lower[n - 2] = NULL;
761 }
762
f757855e 763 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
764 if (!m)
765 return log_oom();
766
767 m->destination = destination;
768 m->source = upper;
769 m->lower = lower;
770 m->read_only = c == ARG_OVERLAY_RO;
771
772 upper = destination = NULL;
773 lower = NULL;
06c17c39 774
f757855e 775 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
776 break;
777 }
778
a5f1cb3b 779 case 'E': {
f4889f65
LP
780 char **n;
781
782 if (!env_assignment_is_valid(optarg)) {
783 log_error("Environment variable assignment '%s' is not valid.", optarg);
784 return -EINVAL;
785 }
786
787 n = strv_env_set(arg_setenv, optarg);
788 if (!n)
789 return log_oom();
790
791 strv_free(arg_setenv);
792 arg_setenv = n;
f757855e
LP
793
794 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
795 break;
796 }
797
284c0b91
LP
798 case 'q':
799 arg_quiet = true;
800 break;
801
8a96d94e
LP
802 case ARG_SHARE_SYSTEM:
803 arg_share_system = true;
804 break;
805
eb91eb18
LP
806 case ARG_REGISTER:
807 r = parse_boolean(optarg);
808 if (r < 0) {
809 log_error("Failed to parse --register= argument: %s", optarg);
810 return r;
811 }
812
813 arg_register = r;
814 break;
815
89f7c846
LP
816 case ARG_KEEP_UNIT:
817 arg_keep_unit = true;
818 break;
819
6afc95b7
LP
820 case ARG_PERSONALITY:
821
ac45f971 822 arg_personality = personality_from_string(optarg);
050f7277 823 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
824 log_error("Unknown or unsupported personality '%s'.", optarg);
825 return -EINVAL;
826 }
827
f757855e 828 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
829 break;
830
4d9f07b4
LP
831 case ARG_VOLATILE:
832
833 if (!optarg)
f757855e 834 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 835 else {
f757855e 836 VolatileMode m;
4d9f07b4 837
f757855e
LP
838 m = volatile_mode_from_string(optarg);
839 if (m < 0) {
840 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 841 return -EINVAL;
f757855e
LP
842 } else
843 arg_volatile_mode = m;
6d0b55c2
LP
844 }
845
f757855e
LP
846 arg_settings_mask |= SETTING_VOLATILE_MODE;
847 break;
6d0b55c2 848
f757855e
LP
849 case 'p':
850 r = expose_port_parse(&arg_expose_ports, optarg);
851 if (r == -EEXIST)
852 return log_error_errno(r, "Duplicate port specification: %s", optarg);
853 if (r < 0)
854 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 855
f757855e 856 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 857 break;
6d0b55c2 858
f36933fe
LP
859 case ARG_PROPERTY:
860 if (strv_extend(&arg_property, optarg) < 0)
861 return log_oom();
862
863 break;
864
6dac160c 865 case ARG_PRIVATE_USERS:
0de7acce
LP
866
867 r = optarg ? parse_boolean(optarg) : 1;
868 if (r == 0) {
869 /* no: User namespacing off */
870 arg_userns_mode = USER_NAMESPACE_NO;
871 arg_uid_shift = UID_INVALID;
872 arg_uid_range = UINT32_C(0x10000);
873 } else if (r > 0) {
874 /* yes: User namespacing on, UID range is read from root dir */
875 arg_userns_mode = USER_NAMESPACE_FIXED;
876 arg_uid_shift = UID_INVALID;
877 arg_uid_range = UINT32_C(0x10000);
878 } else if (streq(optarg, "pick")) {
879 /* pick: User namespacing on, UID range is picked randomly */
880 arg_userns_mode = USER_NAMESPACE_PICK;
881 arg_uid_shift = UID_INVALID;
882 arg_uid_range = UINT32_C(0x10000);
883 } else {
6dac160c
LP
884 _cleanup_free_ char *buffer = NULL;
885 const char *range, *shift;
886
0de7acce
LP
887 /* anything else: User namespacing on, UID range is explicitly configured */
888
6dac160c
LP
889 range = strchr(optarg, ':');
890 if (range) {
891 buffer = strndup(optarg, range - optarg);
892 if (!buffer)
893 return log_oom();
894 shift = buffer;
895
896 range++;
897 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
898 log_error("Failed to parse UID range: %s", range);
899 return -EINVAL;
900 }
901 } else
902 shift = optarg;
903
904 if (parse_uid(shift, &arg_uid_shift) < 0) {
905 log_error("Failed to parse UID: %s", optarg);
906 return -EINVAL;
907 }
0de7acce
LP
908
909 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
910 }
911
0de7acce 912 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
913 break;
914
0de7acce 915 case 'U':
ccabee0d
LP
916 if (userns_supported()) {
917 arg_userns_mode = USER_NAMESPACE_PICK;
918 arg_uid_shift = UID_INVALID;
919 arg_uid_range = UINT32_C(0x10000);
920
921 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
922 }
923
7336138e
LP
924 break;
925
0de7acce 926 case ARG_PRIVATE_USERS_CHOWN:
19aac838 927 arg_userns_chown = true;
0de7acce
LP
928
929 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
930 break;
931
c6c8f6e2
LP
932 case ARG_KILL_SIGNAL:
933 arg_kill_signal = signal_from_string_try_harder(optarg);
934 if (arg_kill_signal < 0) {
935 log_error("Cannot parse signal: %s", optarg);
936 return -EINVAL;
937 }
938
f757855e
LP
939 arg_settings_mask |= SETTING_KILL_SIGNAL;
940 break;
941
942 case ARG_SETTINGS:
943
944 /* no → do not read files
945 * yes → read files, do not override cmdline, trust only subset
946 * override → read files, override cmdline, trust only subset
947 * trusted → read files, do not override cmdline, trust all
948 */
949
950 r = parse_boolean(optarg);
951 if (r < 0) {
952 if (streq(optarg, "trusted")) {
953 mask_all_settings = false;
954 mask_no_settings = false;
955 arg_settings_trusted = true;
956
957 } else if (streq(optarg, "override")) {
958 mask_all_settings = false;
959 mask_no_settings = true;
960 arg_settings_trusted = -1;
961 } else
962 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
963 } else if (r > 0) {
964 /* yes */
965 mask_all_settings = false;
966 mask_no_settings = false;
967 arg_settings_trusted = -1;
968 } else {
969 /* no */
970 mask_all_settings = true;
971 mask_no_settings = false;
972 arg_settings_trusted = false;
973 }
974
c6c8f6e2
LP
975 break;
976
5f932eb9
LP
977 case ARG_CHDIR:
978 if (!path_is_absolute(optarg)) {
979 log_error("Working directory %s is not an absolute path.", optarg);
980 return -EINVAL;
981 }
982
983 r = free_and_strdup(&arg_chdir, optarg);
984 if (r < 0)
985 return log_oom();
986
987 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
988 break;
989
88213476
LP
990 case '?':
991 return -EINVAL;
992
993 default:
eb9da376 994 assert_not_reached("Unhandled option");
88213476 995 }
88213476 996
eb91eb18
LP
997 if (arg_share_system)
998 arg_register = false;
999
0de7acce 1000 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1001 arg_userns_chown = true;
1002
7732f92b 1003 if (arg_start_mode != START_PID1 && arg_share_system) {
eb91eb18
LP
1004 log_error("--boot and --share-system may not be combined.");
1005 return -EINVAL;
1006 }
1007
89f7c846
LP
1008 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1009 log_error("--keep-unit may not be used when invoked from a user session.");
1010 return -EINVAL;
1011 }
1012
1b9e5b12
LP
1013 if (arg_directory && arg_image) {
1014 log_error("--directory= and --image= may not be combined.");
1015 return -EINVAL;
1016 }
1017
ec16945e
LP
1018 if (arg_template && arg_image) {
1019 log_error("--template= and --image= may not be combined.");
1020 return -EINVAL;
1021 }
1022
1023 if (arg_template && !(arg_directory || arg_machine)) {
1024 log_error("--template= needs --directory= or --machine=.");
1025 return -EINVAL;
1026 }
1027
1028 if (arg_ephemeral && arg_template) {
1029 log_error("--ephemeral and --template= may not be combined.");
1030 return -EINVAL;
1031 }
1032
1033 if (arg_ephemeral && arg_image) {
1034 log_error("--ephemeral and --image= may not be combined.");
1035 return -EINVAL;
1036 }
1037
df9a75e4
LP
1038 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1039 log_error("--ephemeral and --link-journal= may not be combined.");
1040 return -EINVAL;
1041 }
1042
ccabee0d 1043 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1044 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1045 return -EOPNOTSUPP;
1046 }
1047
1048 if (arg_userns_chown && arg_read_only) {
1049 log_error("--read-only and --private-users-chown may not be combined.");
1050 return -EINVAL;
1051 }
f757855e 1052
22b28dfd
LP
1053 if (arg_network_bridge && arg_network_zone) {
1054 log_error("--network-bridge= and --network-zone= may not be combined.");
1055 return -EINVAL;
1056 }
1057
f757855e
LP
1058 if (argc > optind) {
1059 arg_parameters = strv_copy(argv + optind);
1060 if (!arg_parameters)
1061 return log_oom();
1062
7732f92b 1063 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1064 }
1065
1066 /* Load all settings from .nspawn files */
1067 if (mask_no_settings)
1068 arg_settings_mask = 0;
1069
1070 /* Don't load any settings from .nspawn files */
1071 if (mask_all_settings)
1072 arg_settings_mask = _SETTINGS_MASK_ALL;
1073
520e0d54 1074 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e
LP
1075
1076 r = detect_unified_cgroup_hierarchy();
1077 if (r < 0)
1078 return r;
1079
6aadfa4c
ILG
1080 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1081 if (e)
1082 arg_container_service_name = e;
1083
f757855e
LP
1084 return 1;
1085}
1086
1087static int verify_arguments(void) {
1088
1089 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1090 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1091 return -EINVAL;
1092 }
1093
6d0b55c2
LP
1094 if (arg_expose_ports && !arg_private_network) {
1095 log_error("Cannot use --port= without private networking.");
1096 return -EINVAL;
1097 }
1098
1c1ea217
EV
1099#ifndef HAVE_LIBIPTC
1100 if (arg_expose_ports) {
1101 log_error("--port= is not supported, compiled without libiptc support.");
1102 return -EOPNOTSUPP;
1103 }
1104#endif
1105
7732f92b 1106 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1107 arg_kill_signal = SIGRTMIN+3;
1108
f757855e 1109 return 0;
88213476
LP
1110}
1111
03cfe0d5
LP
1112static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1113 assert(p);
1114
0de7acce 1115 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1116 return 0;
1117
1118 if (uid == UID_INVALID && gid == GID_INVALID)
1119 return 0;
1120
1121 if (uid != UID_INVALID) {
1122 uid += arg_uid_shift;
1123
1124 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1125 return -EOVERFLOW;
1126 }
1127
1128 if (gid != GID_INVALID) {
1129 gid += (gid_t) arg_uid_shift;
1130
1131 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1132 return -EOVERFLOW;
1133 }
1134
1135 if (lchown(p, uid, gid) < 0)
1136 return -errno;
b12afc8c
LP
1137
1138 return 0;
1139}
1140
03cfe0d5
LP
/* Creates 'path' below 'root' with the given mode and hands the new
 * directory to userns_lchown() for the given uid/gid.
 *
 * An already existing entry counts as success and is not chown()ed.
 * Returns 0 on success, a negative errno-style value otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1153
e58a1277 1154static int setup_timezone(const char *dest) {
03cfe0d5
LP
1155 _cleanup_free_ char *p = NULL, *q = NULL;
1156 const char *where, *check, *what;
d4036145
LP
1157 char *z, *y;
1158 int r;
f8440af5 1159
e58a1277
LP
1160 assert(dest);
1161
1162 /* Fix the timezone, if possible */
d4036145
LP
1163 r = readlink_malloc("/etc/localtime", &p);
1164 if (r < 0) {
1165 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1166 return 0;
1167 }
1168
1169 z = path_startswith(p, "../usr/share/zoneinfo/");
1170 if (!z)
1171 z = path_startswith(p, "/usr/share/zoneinfo/");
1172 if (!z) {
1173 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1174 return 0;
1175 }
1176
03cfe0d5 1177 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1178 r = readlink_malloc(where, &q);
1179 if (r >= 0) {
1180 y = path_startswith(q, "../usr/share/zoneinfo/");
1181 if (!y)
1182 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1183
d4036145
LP
1184 /* Already pointing to the right place? Then do nothing .. */
1185 if (y && streq(y, z))
1186 return 0;
1187 }
1188
03cfe0d5 1189 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1190 check = prefix_roota(dest, check);
03cfe0d5 1191 if (laccess(check, F_OK) < 0) {
d4036145
LP
1192 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1193 return 0;
1194 }
68fb0892 1195
79d80fc1
TG
1196 r = unlink(where);
1197 if (r < 0 && errno != ENOENT) {
56f64d95 1198 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1199 return 0;
1200 }
4d9f07b4 1201
03cfe0d5 1202 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1203 if (symlink(what, where) < 0) {
56f64d95 1204 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1205 return 0;
1206 }
e58a1277 1207
03cfe0d5
LP
1208 r = userns_lchown(where, 0, 0);
1209 if (r < 0)
1210 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1211
e58a1277 1212 return 0;
88213476
LP
1213}
1214
2547bb41 1215static int setup_resolv_conf(const char *dest) {
03cfe0d5 1216 const char *where = NULL;
79d80fc1 1217 int r;
2547bb41
LP
1218
1219 assert(dest);
1220
1221 if (arg_private_network)
1222 return 0;
1223
1224 /* Fix resolv.conf, if possible */
03cfe0d5 1225 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1226
f2068bcc 1227 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1228 if (r < 0) {
68a313c5
LP
1229 /* If the file already exists as symlink, let's
1230 * suppress the warning, under the assumption that
1231 * resolved or something similar runs inside and the
1232 * symlink points there.
1233 *
1234 * If the disk image is read-only, there's also no
1235 * point in complaining.
1236 */
1237 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1238 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1239 return 0;
1240 }
2547bb41 1241
03cfe0d5
LP
1242 r = userns_lchown(where, 0, 0);
1243 if (r < 0)
1244 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1245
2547bb41
LP
1246 return 0;
1247}
1248
9f24adc2 1249static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1250 assert(s);
9f24adc2
LP
1251
1252 snprintf(s, 37,
1253 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1254 SD_ID128_FORMAT_VAL(id));
1255
1256 return s;
1257}
1258
04bc4a3f 1259static int setup_boot_id(const char *dest) {
03cfe0d5 1260 const char *from, *to;
39883f62 1261 sd_id128_t rnd = {};
04bc4a3f
LP
1262 char as_uuid[37];
1263 int r;
1264
eb91eb18
LP
1265 if (arg_share_system)
1266 return 0;
1267
04bc4a3f
LP
1268 /* Generate a new randomized boot ID, so that each boot-up of
1269 * the container gets a new one */
1270
03cfe0d5
LP
1271 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1272 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1273
1274 r = sd_id128_randomize(&rnd);
f647962d
MS
1275 if (r < 0)
1276 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1277
9f24adc2 1278 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1279
4c1fc3e4 1280 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
f647962d
MS
1281 if (r < 0)
1282 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1283
03cfe0d5
LP
1284 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1285 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1286 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1287 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1288
1289 unlink(from);
04bc4a3f
LP
1290 return r;
1291}
1292
e58a1277 1293static int copy_devnodes(const char *dest) {
88213476
LP
1294
1295 static const char devnodes[] =
1296 "null\0"
1297 "zero\0"
1298 "full\0"
1299 "random\0"
1300 "urandom\0"
85614d66
TG
1301 "tty\0"
1302 "net/tun\0";
88213476
LP
1303
1304 const char *d;
e58a1277 1305 int r = 0;
7fd1b19b 1306 _cleanup_umask_ mode_t u;
a258bf26
LP
1307
1308 assert(dest);
124640f1
LP
1309
1310 u = umask(0000);
88213476 1311
03cfe0d5
LP
1312 /* Create /dev/net, so that we can create /dev/net/tun in it */
1313 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1314 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1315
88213476 1316 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1317 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1318 struct stat st;
88213476 1319
7f112f50 1320 from = strappend("/dev/", d);
03cfe0d5 1321 to = prefix_root(dest, from);
88213476
LP
1322
1323 if (stat(from, &st) < 0) {
1324
4a62c710
MS
1325 if (errno != ENOENT)
1326 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1327
a258bf26 1328 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1329
03cfe0d5 1330 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1331 return -EIO;
a258bf26 1332
85614d66 1333 } else {
81f5049b
AC
1334 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1335 if (errno != EPERM)
1336 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1337
1338 /* Some systems abusively restrict mknod but
1339 * allow bind mounts. */
1340 r = touch(to);
1341 if (r < 0)
1342 return log_error_errno(r, "touch (%s) failed: %m", to);
1343 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1344 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1345 }
6278cf60 1346
03cfe0d5
LP
1347 r = userns_lchown(to, 0, 0);
1348 if (r < 0)
1349 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1350 }
88213476
LP
1351 }
1352
e58a1277
LP
1353 return r;
1354}
88213476 1355
03cfe0d5
LP
1356static int setup_pts(const char *dest) {
1357 _cleanup_free_ char *options = NULL;
1358 const char *p;
709f6e46 1359 int r;
03cfe0d5
LP
1360
1361#ifdef HAVE_SELINUX
1362 if (arg_selinux_apifs_context)
1363 (void) asprintf(&options,
3dce8915 1364 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1365 arg_uid_shift + TTY_GID,
1366 arg_selinux_apifs_context);
1367 else
1368#endif
1369 (void) asprintf(&options,
3dce8915 1370 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1371 arg_uid_shift + TTY_GID);
f2d88580 1372
03cfe0d5 1373 if (!options)
f2d88580
LP
1374 return log_oom();
1375
03cfe0d5 1376 /* Mount /dev/pts itself */
cc9fce65 1377 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1378 if (mkdir(p, 0755) < 0)
1379 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1380 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1381 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
709f6e46
MS
1382 r = userns_lchown(p, 0, 0);
1383 if (r < 0)
1384 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1385
1386 /* Create /dev/ptmx symlink */
1387 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1388 if (symlink("pts/ptmx", p) < 0)
1389 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1390 r = userns_lchown(p, 0, 0);
1391 if (r < 0)
1392 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1393
03cfe0d5
LP
1394 /* And fix /dev/pts/ptmx ownership */
1395 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1396 r = userns_lchown(p, 0, 0);
1397 if (r < 0)
1398 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1399
f2d88580
LP
1400 return 0;
1401}
1402
e58a1277 1403static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1404 _cleanup_umask_ mode_t u;
1405 const char *to;
e58a1277 1406 int r;
e58a1277
LP
1407
1408 assert(dest);
1409 assert(console);
1410
1411 u = umask(0000);
1412
03cfe0d5 1413 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1414 if (r < 0)
1415 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1416
a258bf26
LP
1417 /* We need to bind mount the right tty to /dev/console since
1418 * ptys can only exist on pts file systems. To have something
81f5049b 1419 * to bind mount things on we create a empty regular file. */
a258bf26 1420
03cfe0d5 1421 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1422 r = touch(to);
1423 if (r < 0)
1424 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1425
4543768d 1426 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1427 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1428
25ea79fe 1429 return 0;
e58a1277
LP
1430}
1431
1432static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1433 const char *from, *to;
7fd1b19b 1434 _cleanup_umask_ mode_t u;
d9603714 1435 int fd, r;
e58a1277 1436
e58a1277 1437 assert(kmsg_socket >= 0);
a258bf26 1438
e58a1277 1439 u = umask(0000);
a258bf26 1440
03cfe0d5 1441 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1442 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1443 * on the reading side behave very similar to /proc/kmsg,
1444 * their writing side behaves differently from /dev/kmsg in
1445 * that writing blocks when nothing is reading. In order to
1446 * avoid any problems with containers deadlocking due to this
1447 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1448 from = prefix_roota(dest, "/run/kmsg");
1449 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1450
4a62c710 1451 if (mkfifo(from, 0600) < 0)
03cfe0d5 1452 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1453 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1454 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1455
1456 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1457 if (fd < 0)
1458 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1459
e58a1277
LP
1460 /* Store away the fd in the socket, so that it stays open as
1461 * long as we run the child */
3ee897d6 1462 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1463 safe_close(fd);
e58a1277 1464
d9603714
DH
1465 if (r < 0)
1466 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1467
03cfe0d5
LP
1468 /* And now make the FIFO unavailable as /run/kmsg... */
1469 (void) unlink(from);
1470
25ea79fe 1471 return 0;
88213476
LP
1472}
1473
1c4baffc 1474static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1475 union in_addr_union *exposed = userdata;
1476
1477 assert(rtnl);
1478 assert(m);
1479 assert(exposed);
1480
7a8f6325 1481 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1482 return 0;
1483}
1484
3a74cea5 1485static int setup_hostname(void) {
3a74cea5 1486
eb91eb18
LP
1487 if (arg_share_system)
1488 return 0;
1489
605f81a8 1490 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1491 return -errno;
3a74cea5 1492
7027ff61 1493 return 0;
3a74cea5
LP
1494}
1495
57fb9fb5 1496static int setup_journal(const char *directory) {
e01ff70a 1497 sd_id128_t this_id;
0f5e1382 1498 _cleanup_free_ char *d = NULL;
e01ff70a 1499 const char *p, *q;
8054d749 1500 bool try;
e01ff70a 1501 char id[33];
57fb9fb5
LP
1502 int r;
1503
df9a75e4
LP
1504 /* Don't link journals in ephemeral mode */
1505 if (arg_ephemeral)
1506 return 0;
1507
8054d749
LP
1508 if (arg_link_journal == LINK_NO)
1509 return 0;
1510
1511 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1512
4d680aee 1513 r = sd_id128_get_machine(&this_id);
f647962d
MS
1514 if (r < 0)
1515 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1516
e01ff70a 1517 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1518 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1519 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1520 if (try)
4d680aee 1521 return 0;
df9a75e4 1522 return -EEXIST;
4d680aee
ZJS
1523 }
1524
03cfe0d5
LP
1525 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1526 if (r < 0)
1527 return log_error_errno(r, "Failed to create /var: %m");
1528
1529 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1530 if (r < 0)
1531 return log_error_errno(r, "Failed to create /var/log: %m");
1532
1533 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1534 if (r < 0)
1535 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1536
e01ff70a
MS
1537 (void) sd_id128_to_string(arg_uuid, id);
1538
03cfe0d5
LP
1539 p = strjoina("/var/log/journal/", id);
1540 q = prefix_roota(directory, p);
27407a01 1541
e26d6ce5 1542 if (path_is_mount_point(p, 0) > 0) {
8054d749
LP
1543 if (try)
1544 return 0;
27407a01 1545
8054d749
LP
1546 log_error("%s: already a mount point, refusing to use for journal", p);
1547 return -EEXIST;
57fb9fb5
LP
1548 }
1549
e26d6ce5 1550 if (path_is_mount_point(q, 0) > 0) {
8054d749
LP
1551 if (try)
1552 return 0;
57fb9fb5 1553
8054d749
LP
1554 log_error("%s: already a mount point, refusing to use for journal", q);
1555 return -EEXIST;
57fb9fb5
LP
1556 }
1557
1558 r = readlink_and_make_absolute(p, &d);
1559 if (r >= 0) {
1560 if ((arg_link_journal == LINK_GUEST ||
1561 arg_link_journal == LINK_AUTO) &&
1562 path_equal(d, q)) {
1563
03cfe0d5 1564 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1565 if (r < 0)
709f6e46 1566 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1567 return 0;
57fb9fb5
LP
1568 }
1569
4a62c710
MS
1570 if (unlink(p) < 0)
1571 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1572 } else if (r == -EINVAL) {
1573
1574 if (arg_link_journal == LINK_GUEST &&
1575 rmdir(p) < 0) {
1576
27407a01
ZJS
1577 if (errno == ENOTDIR) {
1578 log_error("%s already exists and is neither a symlink nor a directory", p);
1579 return r;
4314d33f
MS
1580 } else
1581 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1582 }
4314d33f
MS
1583 } else if (r != -ENOENT)
1584 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1585
1586 if (arg_link_journal == LINK_GUEST) {
1587
1588 if (symlink(q, p) < 0) {
8054d749 1589 if (try) {
56f64d95 1590 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1591 return 0;
4314d33f
MS
1592 } else
1593 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1594 }
1595
03cfe0d5 1596 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1597 if (r < 0)
709f6e46 1598 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1599 return 0;
57fb9fb5
LP
1600 }
1601
1602 if (arg_link_journal == LINK_HOST) {
ccddd104 1603 /* don't create parents here — if the host doesn't have
574edc90 1604 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1605
1606 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1607 if (try) {
56f64d95 1608 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1609 return 0;
4314d33f
MS
1610 } else
1611 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1612 }
1613
27407a01
ZJS
1614 } else if (access(p, F_OK) < 0)
1615 return 0;
57fb9fb5 1616
cdb2b9d0
LP
1617 if (dir_is_empty(q) == 0)
1618 log_warning("%s is not empty, proceeding anyway.", q);
1619
03cfe0d5 1620 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1621 if (r < 0)
1622 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1623
4543768d 1624 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 1625 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1626
27407a01 1627 return 0;
57fb9fb5
LP
1628}
1629
88213476 1630static int drop_capabilities(void) {
520e0d54 1631 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
1632}
1633
db999e0f
LP
1634static int reset_audit_loginuid(void) {
1635 _cleanup_free_ char *p = NULL;
1636 int r;
1637
1638 if (arg_share_system)
1639 return 0;
1640
1641 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1642 if (r == -ENOENT)
db999e0f 1643 return 0;
f647962d
MS
1644 if (r < 0)
1645 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1646
1647 /* Already reset? */
1648 if (streq(p, "4294967295"))
1649 return 0;
1650
ad118bda 1651 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1652 if (r < 0) {
10a87006
LP
1653 log_error_errno(r,
1654 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1655 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1656 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1657 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1658 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1659
db999e0f 1660 sleep(5);
77b6e194 1661 }
db999e0f
LP
1662
1663 return 0;
77b6e194
LP
1664}
1665
24fb1112 1666
785890ac
LP
1667static int setup_propagate(const char *root) {
1668 const char *p, *q;
709f6e46 1669 int r;
785890ac
LP
1670
1671 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1672 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1673 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1674 (void) mkdir_p(p, 0600);
1675
709f6e46
MS
1676 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1677 if (r < 0)
1678 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1679
709f6e46
MS
1680 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1681 if (r < 0)
1682 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1683
709f6e46
MS
1684 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1685 if (r < 0)
1686 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1687
03cfe0d5 1688 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
1689 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1690 return log_error_errno(errno, "Failed to install propagation bind mount.");
1691
1692 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1693 return log_error_errno(errno, "Failed to make propagation mount read-only");
1694
1695 return 0;
1696}
1697
1b9e5b12
LP
1698static int setup_image(char **device_path, int *loop_nr) {
1699 struct loop_info64 info = {
1700 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1701 };
1702 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1703 _cleanup_free_ char* loopdev = NULL;
1704 struct stat st;
1705 int r, nr;
1706
1707 assert(device_path);
1708 assert(loop_nr);
ec16945e 1709 assert(arg_image);
1b9e5b12
LP
1710
1711 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1712 if (fd < 0)
1713 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1714
4a62c710
MS
1715 if (fstat(fd, &st) < 0)
1716 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1717
1718 if (S_ISBLK(st.st_mode)) {
1719 char *p;
1720
1721 p = strdup(arg_image);
1722 if (!p)
1723 return log_oom();
1724
1725 *device_path = p;
1726
1727 *loop_nr = -1;
1728
1729 r = fd;
1730 fd = -1;
1731
1732 return r;
1733 }
1734
1735 if (!S_ISREG(st.st_mode)) {
070edd97 1736 log_error("%s is not a regular file or block device.", arg_image);
1b9e5b12
LP
1737 return -EINVAL;
1738 }
1739
1740 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1741 if (control < 0)
1742 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1743
1744 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1745 if (nr < 0)
1746 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1747
1748 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1749 return log_oom();
1750
1751 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1752 if (loop < 0)
1753 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1754
4a62c710
MS
1755 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1756 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1757
1758 if (arg_read_only)
1759 info.lo_flags |= LO_FLAGS_READ_ONLY;
1760
4a62c710
MS
1761 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1762 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1763
1764 *device_path = loopdev;
1765 loopdev = NULL;
1766
1767 *loop_nr = nr;
1768
1769 r = loop;
1770 loop = -1;
1771
1772 return r;
1773}
1774
ada4799a
LP
/* Explanation of the supported partition table layouts; meant to be
 * appended, via string literal concatenation, to error messages about
 * disk images that cannot be used. */
#define PARTITION_TABLE_BLURB                                                                   \
        "Note that the disk image needs to either contain only a single MBR partition of\n"   \
        "type 0x83 that is marked bootable, or a single GPT partition of type "                \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"                                     \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"        \
        "to be bootable with systemd-nspawn."
1781
1b9e5b12
LP
1782static int dissect_image(
1783 int fd,
727fd4fd
LP
1784 char **root_device, bool *root_device_rw,
1785 char **home_device, bool *home_device_rw,
1786 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
1787 bool *secondary) {
1788
1789#ifdef HAVE_BLKID
01dc33ce
ZJS
1790 int home_nr = -1, srv_nr = -1;
1791#ifdef GPT_ROOT_NATIVE
1792 int root_nr = -1;
1793#endif
1794#ifdef GPT_ROOT_SECONDARY
1795 int secondary_root_nr = -1;
1796#endif
f6c51a81 1797 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1b9e5b12
LP
1798 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1799 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1800 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1801 _cleanup_udev_unref_ struct udev *udev = NULL;
1802 struct udev_list_entry *first, *item;
f6c51a81 1803 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 1804 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
1805 const char *pttype = NULL;
1806 blkid_partlist pl;
1807 struct stat st;
c09ef2e4 1808 unsigned i;
1b9e5b12
LP
1809 int r;
1810
1811 assert(fd >= 0);
1812 assert(root_device);
1813 assert(home_device);
1814 assert(srv_device);
1815 assert(secondary);
ec16945e 1816 assert(arg_image);
1b9e5b12
LP
1817
1818 b = blkid_new_probe();
1819 if (!b)
1820 return log_oom();
1821
1822 errno = 0;
1823 r = blkid_probe_set_device(b, fd, 0, 0);
1824 if (r != 0) {
1825 if (errno == 0)
1826 return log_oom();
1827
e1427b13 1828 return log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
1829 }
1830
1831 blkid_probe_enable_partitions(b, 1);
1832 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
1833
1834 errno = 0;
1835 r = blkid_do_safeprobe(b);
1836 if (r == -2 || r == 1) {
ada4799a
LP
1837 log_error("Failed to identify any partition table on\n"
1838 " %s\n"
1839 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1840 return -EINVAL;
1841 } else if (r != 0) {
1842 if (errno == 0)
1843 errno = EIO;
e1427b13 1844 return log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
1845 }
1846
48861960 1847 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
1848
1849 is_gpt = streq_ptr(pttype, "gpt");
1850 is_mbr = streq_ptr(pttype, "dos");
1851
1852 if (!is_gpt && !is_mbr) {
1853 log_error("No GPT or MBR partition table discovered on\n"
1854 " %s\n"
1855 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1856 return -EINVAL;
1857 }
1858
1859 errno = 0;
1860 pl = blkid_probe_get_partitions(b);
1861 if (!pl) {
1862 if (errno == 0)
1863 return log_oom();
1864
1865 log_error("Failed to list partitions of %s", arg_image);
1866 return -errno;
1867 }
1868
1869 udev = udev_new();
1870 if (!udev)
1871 return log_oom();
1872
4a62c710
MS
1873 if (fstat(fd, &st) < 0)
1874 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 1875
c09ef2e4
LP
1876 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
1877 if (!d)
1b9e5b12
LP
1878 return log_oom();
1879
c09ef2e4
LP
1880 for (i = 0;; i++) {
1881 int n, m;
1b9e5b12 1882
c09ef2e4
LP
1883 if (i >= 10) {
1884 log_error("Kernel partitions never appeared.");
1885 return -ENXIO;
1886 }
1887
1888 e = udev_enumerate_new(udev);
1889 if (!e)
1890 return log_oom();
1891
1892 r = udev_enumerate_add_match_parent(e, d);
1893 if (r < 0)
1894 return log_oom();
1895
1896 r = udev_enumerate_scan_devices(e);
1897 if (r < 0)
1898 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
1899
1900 /* Count the partitions enumerated by the kernel */
1901 n = 0;
1902 first = udev_enumerate_get_list_entry(e);
1903 udev_list_entry_foreach(item, first)
1904 n++;
1905
1906 /* Count the partitions enumerated by blkid */
1907 m = blkid_partlist_numof_partitions(pl);
1908 if (n == m + 1)
1909 break;
1910 if (n > m + 1) {
1911 log_error("blkid and kernel partition list do not match.");
1912 return -EIO;
1913 }
1914 if (n < m + 1) {
1915 unsigned j;
1916
1917 /* The kernel has probed fewer partitions than
1918 * blkid? Maybe the kernel prober is still
1919 * running or it got EBUSY because udev
1920 * already opened the device. Let's reprobe
1921 * the device, which is a synchronous call
1922 * that waits until probing is complete. */
1923
1924 for (j = 0; j < 20; j++) {
1925
1926 r = ioctl(fd, BLKRRPART, 0);
1927 if (r < 0)
1928 r = -errno;
1929 if (r >= 0 || r != -EBUSY)
1930 break;
1931
1932 /* If something else has the device
1933 * open, such as an udev rule, the
1934 * ioctl will return EBUSY. Since
1935 * there's no way to wait until it
1936 * isn't busy anymore, let's just wait
1937 * a bit, and try again.
1938 *
1939 * This is really something they
1940 * should fix in the kernel! */
1941
1942 usleep(50 * USEC_PER_MSEC);
1943 }
1944
1945 if (r < 0)
1946 return log_error_errno(r, "Failed to reread partition table: %m");
1947 }
1948
1949 e = udev_enumerate_unref(e);
1950 }
1b9e5b12
LP
1951
1952 first = udev_enumerate_get_list_entry(e);
1953 udev_list_entry_foreach(item, first) {
1954 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 1955 const char *node;
727fd4fd 1956 unsigned long long flags;
1b9e5b12
LP
1957 blkid_partition pp;
1958 dev_t qn;
1959 int nr;
1960
1961 errno = 0;
1962 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
1963 if (!q) {
1964 if (!errno)
1965 errno = ENOMEM;
1966
e1427b13 1967 return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
1968 }
1969
1970 qn = udev_device_get_devnum(q);
1971 if (major(qn) == 0)
1972 continue;
1973
1974 if (st.st_rdev == qn)
1975 continue;
1976
1977 node = udev_device_get_devnode(q);
1978 if (!node)
1979 continue;
1980
1981 pp = blkid_partlist_devno_to_partition(pl, qn);
1982 if (!pp)
1983 continue;
1984
727fd4fd 1985 flags = blkid_partition_get_flags(pp);
727fd4fd 1986
1b9e5b12
LP
1987 nr = blkid_partition_get_partno(pp);
1988 if (nr < 0)
1989 continue;
1990
ada4799a
LP
1991 if (is_gpt) {
1992 sd_id128_t type_id;
1993 const char *stype;
1b9e5b12 1994
f6c51a81
LP
1995 if (flags & GPT_FLAG_NO_AUTO)
1996 continue;
1997
ada4799a
LP
1998 stype = blkid_partition_get_type_string(pp);
1999 if (!stype)
2000 continue;
1b9e5b12 2001
ada4799a 2002 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
2003 continue;
2004
ada4799a 2005 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 2006
ada4799a
LP
2007 if (home && nr >= home_nr)
2008 continue;
1b9e5b12 2009
ada4799a
LP
2010 home_nr = nr;
2011 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 2012
ada4799a
LP
2013 r = free_and_strdup(&home, node);
2014 if (r < 0)
2015 return log_oom();
727fd4fd 2016
ada4799a
LP
2017 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2018
2019 if (srv && nr >= srv_nr)
2020 continue;
2021
2022 srv_nr = nr;
2023 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2024
2025 r = free_and_strdup(&srv, node);
2026 if (r < 0)
2027 return log_oom();
2028 }
1b9e5b12 2029#ifdef GPT_ROOT_NATIVE
ada4799a 2030 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 2031
ada4799a
LP
2032 if (root && nr >= root_nr)
2033 continue;
1b9e5b12 2034
ada4799a
LP
2035 root_nr = nr;
2036 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 2037
ada4799a
LP
2038 r = free_and_strdup(&root, node);
2039 if (r < 0)
2040 return log_oom();
2041 }
1b9e5b12
LP
2042#endif
2043#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
2044 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2045
2046 if (secondary_root && nr >= secondary_root_nr)
2047 continue;
2048
2049 secondary_root_nr = nr;
2050 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2051
2052 r = free_and_strdup(&secondary_root, node);
2053 if (r < 0)
2054 return log_oom();
2055 }
2056#endif
f6c51a81
LP
2057 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
2058
2059 if (generic)
2060 multiple_generic = true;
2061 else {
2062 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
2063
2064 r = free_and_strdup(&generic, node);
2065 if (r < 0)
2066 return log_oom();
2067 }
2068 }
ada4799a
LP
2069
2070 } else if (is_mbr) {
2071 int type;
1b9e5b12 2072
f6c51a81
LP
2073 if (flags != 0x80) /* Bootable flag */
2074 continue;
2075
ada4799a
LP
2076 type = blkid_partition_get_type(pp);
2077 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
2078 continue;
2079
f6c51a81
LP
2080 if (generic)
2081 multiple_generic = true;
2082 else {
2083 generic_rw = true;
727fd4fd 2084
f6c51a81
LP
2085 r = free_and_strdup(&root, node);
2086 if (r < 0)
2087 return log_oom();
2088 }
1b9e5b12 2089 }
1b9e5b12
LP
2090 }
2091
1b9e5b12
LP
2092 if (root) {
2093 *root_device = root;
2094 root = NULL;
727fd4fd
LP
2095
2096 *root_device_rw = root_rw;
1b9e5b12
LP
2097 *secondary = false;
2098 } else if (secondary_root) {
2099 *root_device = secondary_root;
2100 secondary_root = NULL;
727fd4fd
LP
2101
2102 *root_device_rw = secondary_root_rw;
1b9e5b12 2103 *secondary = true;
f6c51a81
LP
2104 } else if (generic) {
2105
2106 /* There were no partitions with precise meanings
2107 * around, but we found generic partitions. In this
2108 * case, if there's only one, we can go ahead and boot
2109 * it, otherwise we bail out, because we really cannot
2110 * make any sense of it. */
2111
2112 if (multiple_generic) {
2113 log_error("Identified multiple bootable Linux partitions on\n"
2114 " %s\n"
2115 PARTITION_TABLE_BLURB, arg_image);
2116 return -EINVAL;
2117 }
2118
2119 *root_device = generic;
2120 generic = NULL;
2121
2122 *root_device_rw = generic_rw;
2123 *secondary = false;
2124 } else {
2125 log_error("Failed to identify root partition in disk image\n"
2126 " %s\n"
2127 PARTITION_TABLE_BLURB, arg_image);
2128 return -EINVAL;
1b9e5b12
LP
2129 }
2130
2131 if (home) {
2132 *home_device = home;
2133 home = NULL;
727fd4fd
LP
2134
2135 *home_device_rw = home_rw;
1b9e5b12
LP
2136 }
2137
2138 if (srv) {
2139 *srv_device = srv;
2140 srv = NULL;
727fd4fd
LP
2141
2142 *srv_device_rw = srv_rw;
1b9e5b12
LP
2143 }
2144
2145 return 0;
2146#else
2147 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 2148 return -EOPNOTSUPP;
1b9e5b12
LP
2149#endif
2150}
2151
/*
 * Probe the file system type of the block device 'what' with libblkid and
 * mount it on 'where' (or on 'where' + 'directory' if 'directory' is set).
 *
 * The mount is always MS_NODEV, and is read-only if 'rw' is false or if
 * arg_read_only is set.
 *
 * Returns 0 on success, a negative errno-style value on failure:
 *   -EINVAL      the file system type could not be determined
 *   -EOPNOTSUPP  the device is a LUKS container, or blkid support is
 *                compiled out
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns 1 if nothing was detected and
                 * -2 if the result is ambivalent; both mean we cannot tell
                 * what file system this is. A plain error (-1) is handled
                 * below, so that the actual errno gets reported. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2213
e01ff70a
MS
2214static int setup_machine_id(const char *directory) {
2215 int r;
2216 const char *etc_machine_id, *t;
2217 _cleanup_free_ char *s = NULL;
2218
2219 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2220
2221 r = read_one_line_file(etc_machine_id, &s);
2222 if (r < 0)
2223 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2224
2225 t = strstrip(s);
2226
2227 if (!isempty(t)) {
2228 r = sd_id128_from_string(t, &arg_uuid);
2229 if (r < 0)
2230 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2231 } else {
2232 if (sd_id128_is_null(arg_uuid)) {
2233 r = sd_id128_randomize(&arg_uuid);
2234 if (r < 0)
2235 return log_error_errno(r, "Failed to generate random machine ID: %m");
2236 }
2237 }
2238
2239 r = machine_id_setup(directory, arg_uuid);
2240 if (r < 0)
2241 return log_error_errno(r, "Failed to setup machine ID: %m");
2242
2243 return 0;
2244}
2245
7336138e
LP
2246static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2247 int r;
2248
2249 assert(directory);
2250
0de7acce 2251 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2252 return 0;
2253
2254 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2255 if (r == -EOPNOTSUPP)
2256 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2257 if (r == -EBADE)
2258 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2259 if (r < 0)
2260 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2261 if (r == 0)
2262 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2263 else
2264 log_debug("Patched directory tree to match UID/GID range.");
2265
2266 return r;
2267}
2268
727fd4fd
LP
/*
 * Mount the partitions identified in a disk image below 'where':
 * the root partition on 'where' itself, the home partition on
 * 'where'/home and the server data partition on 'where'/srv.
 *
 * Any of the device arguments may be NULL, in which case that partition
 * is skipped. The *_rw flags are passed through to mount_device().
 *
 * Returns 0 on success, or the negative error of the first mount that
 * failed.
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the directory we were asked to use, rather than
         * bypassing the 'where' parameter via the arg_directory global. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
2298
2299static void loop_remove(int nr, int *image_fd) {
2300 _cleanup_close_ int control = -1;
e8c8ddcc 2301 int r;
1b9e5b12
LP
2302
2303 if (nr < 0)
2304 return;
2305
2306 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2307 r = ioctl(*image_fd, LOOP_CLR_FD);
2308 if (r < 0)
5e4074aa 2309 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2310 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2311 }
2312
2313 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2314 if (control < 0) {
56f64d95 2315 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2316 return;
e8c8ddcc 2317 }
1b9e5b12 2318
e8c8ddcc
TG
2319 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2320 if (r < 0)
5e4074aa 2321 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2322}
2323
113cea80 2324/*
6d416b9c
LS
2325 * Return values:
2326 * < 0 : wait_for_terminate() failed to get the state of the
2327 * container, the container was terminated by a signal, or
2328 * failed for an unknown reason. No change is made to the
2329 * container argument.
2330 * > 0 : The program executed in the container terminated with an
2331 * error. The exit code of the program executed in the
919699ec
LP
2332 * container is returned. The container argument has been set
2333 * to CONTAINER_TERMINATED.
6d416b9c
LS
2334 * 0 : The container is being rebooted, has been shut down or exited
2335 * successfully. The container argument has been set to either
2336 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2337 *
6d416b9c
LS
2338 * That is, success is indicated by a return value of zero, and an
2339 * error is indicated by a non-zero value.
113cea80
DH
2340 */
2341static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2342 siginfo_t status;
919699ec 2343 int r;
113cea80
DH
2344
2345 r = wait_for_terminate(pid, &status);
f647962d
MS
2346 if (r < 0)
2347 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2348
2349 switch (status.si_code) {
fddbb89c 2350
113cea80 2351 case CLD_EXITED:
b5a2179b 2352 if (status.si_status == 0)
919699ec 2353 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2354 else
919699ec 2355 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2356
919699ec
LP
2357 *container = CONTAINER_TERMINATED;
2358 return status.si_status;
113cea80
DH
2359
2360 case CLD_KILLED:
2361 if (status.si_status == SIGINT) {
919699ec 2362 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2363 *container = CONTAINER_TERMINATED;
919699ec
LP
2364 return 0;
2365
113cea80 2366 } else if (status.si_status == SIGHUP) {
919699ec 2367 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2368 *container = CONTAINER_REBOOTED;
919699ec 2369 return 0;
113cea80 2370 }
919699ec 2371
113cea80
DH
2372 /* CLD_KILLED fallthrough */
2373
2374 case CLD_DUMPED:
fddbb89c 2375 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2376 return -EIO;
113cea80
DH
2377
2378 default:
fddbb89c 2379 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2380 return -EIO;
113cea80 2381 }
113cea80
DH
2382}
2383
023fb90b
LP
2384static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2385 pid_t pid;
2386
4a0b58c4 2387 pid = PTR_TO_PID(userdata);
023fb90b 2388 if (pid > 0) {
c6c8f6e2 2389 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2390 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2391 sd_event_source_set_userdata(s, NULL);
2392 return 0;
2393 }
2394 }
2395
2396 sd_event_exit(sd_event_source_get_event(s), 0);
2397 return 0;
2398}
2399
ec16945e 2400static int determine_names(void) {
1b9cebf6 2401 int r;
ec16945e 2402
c1521918
LP
2403 if (arg_template && !arg_directory && arg_machine) {
2404
2405 /* If --template= was specified then we should not
2406 * search for a machine, but instead create a new one
2407 * in /var/lib/machine. */
2408
2409 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2410 if (!arg_directory)
2411 return log_oom();
2412 }
2413
ec16945e 2414 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2415 if (arg_machine) {
2416 _cleanup_(image_unrefp) Image *i = NULL;
2417
2418 r = image_find(arg_machine, &i);
2419 if (r < 0)
2420 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2421 else if (r == 0) {
2422 log_error("No image for machine '%s': %m", arg_machine);
2423 return -ENOENT;
2424 }
2425
aceac2f0 2426 if (i->type == IMAGE_RAW)
0f03c2a4 2427 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2428 else
0f03c2a4 2429 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6
LP
2430 if (r < 0)
2431 return log_error_errno(r, "Invalid image directory: %m");
2432
aee327b8
LP
2433 if (!arg_ephemeral)
2434 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2435 } else
ec16945e
LP
2436 arg_directory = get_current_dir_name();
2437
1b9cebf6
LP
2438 if (!arg_directory && !arg_machine) {
2439 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2440 return -EINVAL;
2441 }
2442 }
2443
2444 if (!arg_machine) {
b9ba4dab
LP
2445 if (arg_directory && path_equal(arg_directory, "/"))
2446 arg_machine = gethostname_malloc();
2447 else
2448 arg_machine = strdup(basename(arg_image ?: arg_directory));
2449
ec16945e
LP
2450 if (!arg_machine)
2451 return log_oom();
2452
ae691c1d 2453 hostname_cleanup(arg_machine);
ec16945e
LP
2454 if (!machine_name_is_valid(arg_machine)) {
2455 log_error("Failed to determine machine name automatically, please use -M.");
2456 return -EINVAL;
2457 }
b9ba4dab
LP
2458
2459 if (arg_ephemeral) {
2460 char *b;
2461
2462 /* Add a random suffix when this is an
2463 * ephemeral machine, so that we can run many
2464 * instances at once without manually having
2465 * to specify -M each time. */
2466
2467 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2468 return log_oom();
2469
2470 free(arg_machine);
2471 arg_machine = b;
2472 }
ec16945e
LP
2473 }
2474
2475 return 0;
2476}
2477
03cfe0d5 2478static int determine_uid_shift(const char *directory) {
6dac160c
LP
2479 int r;
2480
0de7acce 2481 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2482 arg_uid_shift = 0;
6dac160c 2483 return 0;
03cfe0d5 2484 }
6dac160c
LP
2485
2486 if (arg_uid_shift == UID_INVALID) {
2487 struct stat st;
2488
03cfe0d5 2489 r = stat(directory, &st);
6dac160c 2490 if (r < 0)
03cfe0d5 2491 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2492
2493 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2494
2495 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2496 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2497 return -EINVAL;
2498 }
2499
2500 arg_uid_range = UINT32_C(0x10000);
2501 }
2502
2503 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2504 log_error("UID base too high for UID range.");
2505 return -EINVAL;
2506 }
2507
6dac160c
LP
2508 return 0;
2509}
2510
03cfe0d5
LP
2511static int inner_child(
2512 Barrier *barrier,
2513 const char *directory,
2514 bool secondary,
2515 int kmsg_socket,
2516 int rtnl_socket,
f757855e 2517 FDSet *fds) {
69c79d3c 2518
03cfe0d5 2519 _cleanup_free_ char *home = NULL;
e01ff70a 2520 char as_uuid[37];
6aadfa4c 2521 unsigned n_env = 1;
03cfe0d5
LP
2522 const char *envp[] = {
2523 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2524 NULL, /* container */
03cfe0d5
LP
2525 NULL, /* TERM */
2526 NULL, /* HOME */
2527 NULL, /* USER */
2528 NULL, /* LOGNAME */
2529 NULL, /* container_uuid */
2530 NULL, /* LISTEN_FDS */
2531 NULL, /* LISTEN_PID */
2532 NULL
2533 };
88213476 2534
2371271c 2535 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2536 int r;
88213476 2537
03cfe0d5
LP
2538 assert(barrier);
2539 assert(directory);
2540 assert(kmsg_socket >= 0);
88213476 2541
efdb0237
LP
2542 cg_unified_flush();
2543
0de7acce 2544 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2545 /* Tell the parent, that it now can write the UID map. */
2546 (void) barrier_place(barrier); /* #1 */
7027ff61 2547
03cfe0d5
LP
2548 /* Wait until the parent wrote the UID map */
2549 if (!barrier_place_and_sync(barrier)) { /* #2 */
2550 log_error("Parent died too early");
2551 return -ESRCH;
2552 }
88213476
LP
2553 }
2554
0de7acce
LP
2555 r = mount_all(NULL,
2556 arg_userns_mode != USER_NAMESPACE_NO,
2557 true,
2558 arg_private_network,
2559 arg_uid_shift,
2560 arg_uid_range,
2561 arg_selinux_apifs_context);
2562
03cfe0d5
LP
2563 if (r < 0)
2564 return r;
2565
d8fc6a00
LP
2566 r = mount_sysfs(NULL);
2567 if (r < 0)
2568 return r;
2569
03cfe0d5
LP
2570 /* Wait until we are cgroup-ified, so that we
2571 * can mount the right cgroup path writable */
2572 if (!barrier_place_and_sync(barrier)) { /* #3 */
2573 log_error("Parent died too early");
2574 return -ESRCH;
88213476
LP
2575 }
2576
e83bebef 2577 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
03cfe0d5
LP
2578 if (r < 0)
2579 return r;
ec16945e 2580
03cfe0d5
LP
2581 r = reset_uid_gid();
2582 if (r < 0)
2583 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 2584
03cfe0d5
LP
2585 r = setup_boot_id(NULL);
2586 if (r < 0)
2587 return r;
ec16945e 2588
03cfe0d5
LP
2589 r = setup_kmsg(NULL, kmsg_socket);
2590 if (r < 0)
2591 return r;
2592 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2593
03cfe0d5 2594 umask(0022);
30535c16 2595
03cfe0d5
LP
2596 if (setsid() < 0)
2597 return log_error_errno(errno, "setsid() failed: %m");
2598
2599 if (arg_private_network)
2600 loopback_setup();
2601
7a8f6325
LP
2602 if (arg_expose_ports) {
2603 r = expose_port_send_rtnl(rtnl_socket);
2604 if (r < 0)
2605 return r;
2606 rtnl_socket = safe_close(rtnl_socket);
2607 }
03cfe0d5 2608
709f6e46
MS
2609 r = drop_capabilities();
2610 if (r < 0)
2611 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2612
2613 setup_hostname();
2614
050f7277 2615 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2616 if (personality(arg_personality) < 0)
2617 return log_error_errno(errno, "personality() failed: %m");
2618 } else if (secondary) {
2619 if (personality(PER_LINUX32) < 0)
2620 return log_error_errno(errno, "personality() failed: %m");
2621 }
2622
2623#ifdef HAVE_SELINUX
2624 if (arg_selinux_context)
2625 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2626 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2627#endif
2628
ee645080 2629 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2630 if (r < 0)
2631 return r;
2632
6aadfa4c
ILG
2633 /* LXC sets container=lxc, so follow the scheme here */
2634 envp[n_env++] = strjoina("container=", arg_container_service_name);
2635
03cfe0d5
LP
2636 envp[n_env] = strv_find_prefix(environ, "TERM=");
2637 if (envp[n_env])
313cefa1 2638 n_env++;
03cfe0d5
LP
2639
2640 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2641 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2642 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2643 return log_oom();
2644
e01ff70a 2645 assert(!sd_id128_equal(arg_uuid, SD_ID128_NULL));
03cfe0d5 2646
e01ff70a
MS
2647 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2648 return log_oom();
03cfe0d5
LP
2649
2650 if (fdset_size(fds) > 0) {
2651 r = fdset_cloexec(fds, false);
2652 if (r < 0)
2653 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2654
2655 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2656 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2657 return log_oom();
2658 }
2659
2371271c
TG
2660 env_use = strv_env_merge(2, envp, arg_setenv);
2661 if (!env_use)
2662 return log_oom();
03cfe0d5
LP
2663
2664 /* Let the parent know that we are ready and
2665 * wait until the parent is ready with the
2666 * setup, too... */
2667 if (!barrier_place_and_sync(barrier)) { /* #4 */
2668 log_error("Parent died too early");
2669 return -ESRCH;
2670 }
2671
5f932eb9
LP
2672 if (arg_chdir)
2673 if (chdir(arg_chdir) < 0)
2674 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2675
7732f92b
LP
2676 if (arg_start_mode == START_PID2) {
2677 r = stub_pid1();
2678 if (r < 0)
2679 return r;
2680 }
2681
03cfe0d5
LP
2682 /* Now, explicitly close the log, so that we
2683 * then can close all remaining fds. Closing
2684 * the log explicitly first has the benefit
2685 * that the logging subsystem knows about it,
2686 * and is thus ready to be reopened should we
2687 * need it again. Note that the other fds
2688 * closed here are at least the locking and
2689 * barrier fds. */
2690 log_close();
2691 (void) fdset_close_others(fds);
2692
7732f92b 2693 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2694 char **a;
2695 size_t m;
2696
2697 /* Automatically search for the init system */
2698
75f32f04
ZJS
2699 m = strv_length(arg_parameters);
2700 a = newa(char*, m + 2);
2701 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2702 a[1 + m] = NULL;
03cfe0d5
LP
2703
2704 a[0] = (char*) "/usr/lib/systemd/systemd";
2705 execve(a[0], a, env_use);
2706
2707 a[0] = (char*) "/lib/systemd/systemd";
2708 execve(a[0], a, env_use);
2709
2710 a[0] = (char*) "/sbin/init";
2711 execve(a[0], a, env_use);
f757855e
LP
2712 } else if (!strv_isempty(arg_parameters))
2713 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2714 else {
5f932eb9 2715 if (!arg_chdir)
d929b0f9
ZJS
2716 /* If we cannot change the directory, we'll end up in /, that is expected. */
2717 (void) chdir(home ?: "/root");
5f932eb9 2718
03cfe0d5
LP
2719 execle("/bin/bash", "-bash", NULL, env_use);
2720 execle("/bin/sh", "-sh", NULL, env_use);
2721 }
2722
35607a8d 2723 r = -errno;
03cfe0d5 2724 (void) log_open();
35607a8d 2725 return log_error_errno(r, "execv() failed: %m");
03cfe0d5
LP
2726}
2727
2728static int outer_child(
2729 Barrier *barrier,
2730 const char *directory,
2731 const char *console,
2732 const char *root_device, bool root_device_rw,
2733 const char *home_device, bool home_device_rw,
2734 const char *srv_device, bool srv_device_rw,
2735 bool interactive,
2736 bool secondary,
2737 int pid_socket,
e01ff70a 2738 int uuid_socket,
03cfe0d5
LP
2739 int kmsg_socket,
2740 int rtnl_socket,
825d5287 2741 int uid_shift_socket,
f757855e 2742 FDSet *fds) {
03cfe0d5
LP
2743
2744 pid_t pid;
2745 ssize_t l;
2746 int r;
2747
2748 assert(barrier);
2749 assert(directory);
2750 assert(console);
2751 assert(pid_socket >= 0);
e01ff70a 2752 assert(uuid_socket >= 0);
03cfe0d5
LP
2753 assert(kmsg_socket >= 0);
2754
efdb0237
LP
2755 cg_unified_flush();
2756
03cfe0d5
LP
2757 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2758 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2759
2760 if (interactive) {
2761 close_nointr(STDIN_FILENO);
2762 close_nointr(STDOUT_FILENO);
2763 close_nointr(STDERR_FILENO);
2764
2765 r = open_terminal(console, O_RDWR);
2766 if (r != STDIN_FILENO) {
2767 if (r >= 0) {
2768 safe_close(r);
2769 r = -EINVAL;
2770 }
2771
2772 return log_error_errno(r, "Failed to open console: %m");
2773 }
2774
2775 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2776 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2777 return log_error_errno(errno, "Failed to duplicate console: %m");
2778 }
2779
2780 r = reset_audit_loginuid();
2781 if (r < 0)
2782 return r;
2783
2784 /* Mark everything as slave, so that we still
2785 * receive mounts from the real root, but don't
2786 * propagate mounts to the real root. */
2787 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2788 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2789
2790 r = mount_devices(directory,
2791 root_device, root_device_rw,
2792 home_device, home_device_rw,
2793 srv_device, srv_device_rw);
2794 if (r < 0)
2795 return r;
2796
391567f4
LP
2797 r = determine_uid_shift(directory);
2798 if (r < 0)
2799 return r;
2800
0de7acce 2801 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2802 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2803 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2804 if (l < 0)
2805 return log_error_errno(errno, "Failed to send UID shift: %m");
2806 if (l != sizeof(arg_uid_shift)) {
2807 log_error("Short write while sending UID shift.");
2808 return -EIO;
2809 }
0e7ac751 2810
0de7acce 2811 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
2812 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2813 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2814 * not it will pick a different one, and send it back to us. */
2815
2816 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2817 if (l < 0)
2818 return log_error_errno(errno, "Failed to recv UID shift: %m");
2819 if (l != sizeof(arg_uid_shift)) {
2820 log_error("Short read while recieving UID shift.");
2821 return -EIO;
2822 }
2823 }
2824
2825 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
2826 }
2827
03cfe0d5
LP
2828 /* Turn directory into bind mount */
2829 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2830 return log_error_errno(errno, "Failed to make bind mount: %m");
2831
7336138e 2832 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
03cfe0d5
LP
2833 if (r < 0)
2834 return r;
2835
0de7acce
LP
2836 r = setup_volatile(
2837 directory,
2838 arg_volatile_mode,
2839 arg_userns_mode != USER_NAMESPACE_NO,
2840 arg_uid_shift,
2841 arg_uid_range,
2842 arg_selinux_context);
03cfe0d5
LP
2843 if (r < 0)
2844 return r;
2845
0de7acce
LP
2846 r = setup_volatile_state(
2847 directory,
2848 arg_volatile_mode,
2849 arg_userns_mode != USER_NAMESPACE_NO,
2850 arg_uid_shift,
2851 arg_uid_range,
2852 arg_selinux_context);
03cfe0d5
LP
2853 if (r < 0)
2854 return r;
2855
03cfe0d5
LP
2856 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2857 if (r < 0)
2858 return r;
2859
03cfe0d5
LP
2860 if (arg_read_only) {
2861 r = bind_remount_recursive(directory, true);
2862 if (r < 0)
2863 return log_error_errno(r, "Failed to make tree read-only: %m");
2864 }
2865
0de7acce
LP
2866 r = mount_all(directory,
2867 arg_userns_mode != USER_NAMESPACE_NO,
2868 false,
2869 arg_private_network,
2870 arg_uid_shift,
2871 arg_uid_range,
2872 arg_selinux_apifs_context);
03cfe0d5
LP
2873 if (r < 0)
2874 return r;
2875
07fa00f9
LP
2876 r = copy_devnodes(directory);
2877 if (r < 0)
03cfe0d5
LP
2878 return r;
2879
2880 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2881
07fa00f9
LP
2882 r = setup_pts(directory);
2883 if (r < 0)
03cfe0d5
LP
2884 return r;
2885
2886 r = setup_propagate(directory);
2887 if (r < 0)
2888 return r;
2889
2890 r = setup_dev_console(directory, console);
2891 if (r < 0)
2892 return r;
2893
520e0d54 2894 r = setup_seccomp(arg_caps_retain);
03cfe0d5
LP
2895 if (r < 0)
2896 return r;
2897
2898 r = setup_timezone(directory);
2899 if (r < 0)
2900 return r;
2901
2902 r = setup_resolv_conf(directory);
2903 if (r < 0)
2904 return r;
2905
e01ff70a
MS
2906 r = setup_machine_id(directory);
2907 if (r < 0)
2908 return r;
2909
03cfe0d5
LP
2910 r = setup_journal(directory);
2911 if (r < 0)
2912 return r;
2913
0de7acce
LP
2914 r = mount_custom(
2915 directory,
2916 arg_custom_mounts,
2917 arg_n_custom_mounts,
2918 arg_userns_mode != USER_NAMESPACE_NO,
2919 arg_uid_shift,
2920 arg_uid_range,
2921 arg_selinux_apifs_context);
03cfe0d5
LP
2922 if (r < 0)
2923 return r;
2924
0de7acce
LP
2925 r = mount_cgroups(
2926 directory,
2927 arg_unified_cgroup_hierarchy,
2928 arg_userns_mode != USER_NAMESPACE_NO,
2929 arg_uid_shift,
2930 arg_uid_range,
2931 arg_selinux_apifs_context);
03cfe0d5
LP
2932 if (r < 0)
2933 return r;
2934
2935 r = mount_move_root(directory);
2936 if (r < 0)
2937 return log_error_errno(r, "Failed to move root directory: %m");
2938
2939 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2940 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2941 (arg_private_network ? CLONE_NEWNET : 0) |
8869a0b4 2942 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
2943 if (pid < 0)
2944 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2945 if (pid == 0) {
2946 pid_socket = safe_close(pid_socket);
e01ff70a 2947 uuid_socket = safe_close(uuid_socket);
825d5287 2948 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2949
2950 /* The inner child has all namespaces that are
2951 * requested, so that we all are owned by the user if
2952 * user namespaces are turned on. */
2953
f757855e 2954 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2955 if (r < 0)
2956 _exit(EXIT_FAILURE);
2957
2958 _exit(EXIT_SUCCESS);
2959 }
2960
2961 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2962 if (l < 0)
2963 return log_error_errno(errno, "Failed to send PID: %m");
2964 if (l != sizeof(pid)) {
2965 log_error("Short write while sending PID.");
2966 return -EIO;
2967 }
2968
e01ff70a
MS
2969 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
2970 if (l < 0)
2971 return log_error_errno(errno, "Failed to send machine ID: %m");
2972 if (l != sizeof(arg_uuid)) {
2973 log_error("Short write while sending machine ID.");
2974 return -EIO;
2975 }
2976
03cfe0d5 2977 pid_socket = safe_close(pid_socket);
e01ff70a 2978 uuid_socket = safe_close(uuid_socket);
327e26d6
KN
2979 kmsg_socket = safe_close(kmsg_socket);
2980 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2981
2982 return 0;
2983}
2984
0e7ac751
LP
2985static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
2986 unsigned n_tries = 100;
2987 uid_t candidate;
2988 int r;
2989
2990 assert(shift);
2991 assert(ret_lock_file);
0de7acce 2992 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
2993 assert(arg_uid_range == 0x10000U);
2994
2995 candidate = *shift;
2996
2997 (void) mkdir("/run/systemd/nspawn-uid", 0755);
2998
2999 for (;;) {
3000 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3001 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
3002
3003 if (--n_tries <= 0)
3004 return -EBUSY;
3005
3006 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
3007 goto next;
3008 if ((candidate & UINT32_C(0xFFFF)) != 0)
3009 goto next;
3010
3011 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3012 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3013 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3014 goto next;
3015 if (r < 0)
3016 return r;
3017
3018 /* Make some superficial checks whether the range is currently known in the user database */
3019 if (getpwuid(candidate))
3020 goto next;
3021 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3022 goto next;
3023 if (getgrgid(candidate))
3024 goto next;
3025 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3026 goto next;
3027
3028 *ret_lock_file = lf;
3029 lf = (struct LockFile) LOCK_FILE_INIT;
3030 *shift = candidate;
3031 return 0;
3032
3033 next:
3034 random_bytes(&candidate, sizeof(candidate));
3035 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
3036 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3037 }
3038}
3039
03cfe0d5
LP
3040static int setup_uid_map(pid_t pid) {
3041 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3042 int r;
3043
3044 assert(pid > 1);
3045
3046 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3047 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 3048 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3049 if (r < 0)
3050 return log_error_errno(r, "Failed to write UID map: %m");
3051
3052 /* We always assign the same UID and GID ranges */
3053 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 3054 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3055 if (r < 0)
3056 return log_error_errno(r, "Failed to write GID map: %m");
3057
3058 return 0;
3059}
3060
f757855e
LP
3061static int load_settings(void) {
3062 _cleanup_(settings_freep) Settings *settings = NULL;
3063 _cleanup_fclose_ FILE *f = NULL;
3064 _cleanup_free_ char *p = NULL;
3065 const char *fn, *i;
3066 int r;
3067
3068 /* If all settings are masked, there's no point in looking for
3069 * the settings file */
3070 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3071 return 0;
3072
3073 fn = strjoina(arg_machine, ".nspawn");
3074
3075 /* We first look in the admin's directories in /etc and /run */
3076 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3077 _cleanup_free_ char *j = NULL;
3078
3079 j = strjoin(i, "/", fn, NULL);
3080 if (!j)
3081 return log_oom();
3082
3083 f = fopen(j, "re");
3084 if (f) {
3085 p = j;
3086 j = NULL;
3087
b938cb90 3088 /* By default, we trust configuration from /etc and /run */
f757855e
LP
3089 if (arg_settings_trusted < 0)
3090 arg_settings_trusted = true;
3091
3092 break;
3093 }
3094
3095 if (errno != ENOENT)
3096 return log_error_errno(errno, "Failed to open %s: %m", j);
3097 }
3098
3099 if (!f) {
3100 /* After that, let's look for a file next to the
3101 * actual image we shall boot. */
3102
3103 if (arg_image) {
3104 p = file_in_same_dir(arg_image, fn);
3105 if (!p)
3106 return log_oom();
3107 } else if (arg_directory) {
3108 p = file_in_same_dir(arg_directory, fn);
3109 if (!p)
3110 return log_oom();
3111 }
3112
3113 if (p) {
3114 f = fopen(p, "re");
3115 if (!f && errno != ENOENT)
3116 return log_error_errno(errno, "Failed to open %s: %m", p);
3117
b938cb90 3118 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
3119 if (arg_settings_trusted < 0)
3120 arg_settings_trusted = false;
3121 }
3122 }
3123
3124 if (!f)
3125 return 0;
3126
3127 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3128
3129 r = settings_load(f, p, &settings);
3130 if (r < 0)
3131 return r;
3132
3133 /* Copy over bits from the settings, unless they have been
3134 * explicitly masked by command line switches. */
3135
7732f92b
LP
3136 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3137 settings->start_mode >= 0) {
3138 arg_start_mode = settings->start_mode;
f757855e
LP
3139
3140 strv_free(arg_parameters);
3141 arg_parameters = settings->parameters;
3142 settings->parameters = NULL;
3143 }
3144
5f932eb9
LP
3145 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3146 settings->working_directory) {
3147 free(arg_chdir);
3148 arg_chdir = settings->working_directory;
3149 settings->working_directory = NULL;
3150 }
3151
f757855e
LP
3152 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3153 settings->environment) {
3154 strv_free(arg_setenv);
3155 arg_setenv = settings->environment;
3156 settings->environment = NULL;
3157 }
3158
3159 if ((arg_settings_mask & SETTING_USER) == 0 &&
3160 settings->user) {
3161 free(arg_user);
3162 arg_user = settings->user;
3163 settings->user = NULL;
3164 }
3165
3166 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3167 uint64_t plus;
f757855e 3168
0e265674
LP
3169 plus = settings->capability;
3170 if (settings_private_network(settings))
3171 plus |= (1ULL << CAP_NET_ADMIN);
3172
3173 if (!arg_settings_trusted && plus != 0) {
3174 if (settings->capability != 0)
3175 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3176 } else
520e0d54 3177 arg_caps_retain |= plus;
f757855e 3178
520e0d54 3179 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3180 }
3181
3182 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3183 settings->kill_signal > 0)
3184 arg_kill_signal = settings->kill_signal;
3185
3186 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3187 settings->personality != PERSONALITY_INVALID)
3188 arg_personality = settings->personality;
3189
3190 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3191 !sd_id128_is_null(settings->machine_id)) {
3192
3193 if (!arg_settings_trusted)
3194 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3195 else
3196 arg_uuid = settings->machine_id;
3197 }
3198
3199 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3200 settings->read_only >= 0)
3201 arg_read_only = settings->read_only;
3202
3203 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3204 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3205 arg_volatile_mode = settings->volatile_mode;
3206
3207 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3208 settings->n_custom_mounts > 0) {
3209
3210 if (!arg_settings_trusted)
3211 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3212 else {
3213 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3214 arg_custom_mounts = settings->custom_mounts;
3215 arg_n_custom_mounts = settings->n_custom_mounts;
3216
3217 settings->custom_mounts = NULL;
3218 settings->n_custom_mounts = 0;
3219 }
3220 }
3221
3222 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3223 (settings->private_network >= 0 ||
3224 settings->network_veth >= 0 ||
3225 settings->network_bridge ||
22b28dfd 3226 settings->network_zone ||
f757855e
LP
3227 settings->network_interfaces ||
3228 settings->network_macvlan ||
f6d6bad1
LP
3229 settings->network_ipvlan ||
3230 settings->network_veth_extra)) {
f757855e
LP
3231
3232 if (!arg_settings_trusted)
3233 log_warning("Ignoring network settings, file %s is not trusted.", p);
3234 else {
f6d6bad1 3235 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3236 arg_private_network = settings_private_network(settings);
3237
f757855e
LP
3238 strv_free(arg_network_interfaces);
3239 arg_network_interfaces = settings->network_interfaces;
3240 settings->network_interfaces = NULL;
3241
3242 strv_free(arg_network_macvlan);
3243 arg_network_macvlan = settings->network_macvlan;
3244 settings->network_macvlan = NULL;
3245
3246 strv_free(arg_network_ipvlan);
3247 arg_network_ipvlan = settings->network_ipvlan;
3248 settings->network_ipvlan = NULL;
3249
f6d6bad1
LP
3250 strv_free(arg_network_veth_extra);
3251 arg_network_veth_extra = settings->network_veth_extra;
3252 settings->network_veth_extra = NULL;
3253
f757855e
LP
3254 free(arg_network_bridge);
3255 arg_network_bridge = settings->network_bridge;
3256 settings->network_bridge = NULL;
22b28dfd
LP
3257
3258 free(arg_network_zone);
3259 arg_network_zone = settings->network_zone;
3260 settings->network_zone = NULL;
f757855e
LP
3261 }
3262 }
3263
3264 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3265 settings->expose_ports) {
3266
3267 if (!arg_settings_trusted)
3268 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3269 else {
3270 expose_port_free_all(arg_expose_ports);
3271 arg_expose_ports = settings->expose_ports;
3272 settings->expose_ports = NULL;
3273 }
3274 }
3275
0de7acce
LP
3276 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3277 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3278
3279 if (!arg_settings_trusted)
3280 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3281 else {
3282 arg_userns_mode = settings->userns_mode;
3283 arg_uid_shift = settings->uid_shift;
3284 arg_uid_range = settings->uid_range;
3285 arg_userns_chown = settings->userns_chown;
3286 }
3287 }
3288
f757855e
LP
3289 return 0;
3290}
3291
03cfe0d5
LP
3292int main(int argc, char *argv[]) {
3293
3294 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3295 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3296 _cleanup_close_ int master = -1, image_fd = -1;
3297 _cleanup_fdset_free_ FDSet *fds = NULL;
3298 int r, n_fd_passed, loop_nr = -1;
5aa3eba5 3299 char veth_name[IFNAMSIZ] = "";
03cfe0d5 3300 bool secondary = false, remove_subvol = false;
72c0a2c2 3301 sigset_t mask_chld;
03cfe0d5
LP
3302 pid_t pid = 0;
3303 int ret = EXIT_SUCCESS;
3304 union in_addr_union exposed = {};
3305 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
7513c5b8 3306 bool interactive, veth_created = false;
03cfe0d5
LP
3307
3308 log_parse_environment();
3309 log_open();
3310
7732f92b
LP
3311 /* Make sure rename_process() in the stub init process can work */
3312 saved_argv = argv;
3313 saved_argc = argc;
3314
03cfe0d5
LP
3315 r = parse_argv(argc, argv);
3316 if (r <= 0)
3317 goto finish;
3318
03cfe0d5
LP
3319 if (geteuid() != 0) {
3320 log_error("Need to be root.");
3321 r = -EPERM;
3322 goto finish;
3323 }
f757855e
LP
3324 r = determine_names();
3325 if (r < 0)
3326 goto finish;
3327
3328 r = load_settings();
3329 if (r < 0)
3330 goto finish;
3331
3332 r = verify_arguments();
3333 if (r < 0)
3334 goto finish;
03cfe0d5
LP
3335
3336 n_fd_passed = sd_listen_fds(false);
3337 if (n_fd_passed > 0) {
3338 r = fdset_new_listen_fds(&fds, false);
3339 if (r < 0) {
3340 log_error_errno(r, "Failed to collect file descriptors: %m");
3341 goto finish;
3342 }
3343 }
3344
3345 if (arg_directory) {
3346 assert(!arg_image);
3347
3348 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3349 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3350 r = -EINVAL;
3351 goto finish;
3352 }
3353
3354 if (arg_ephemeral) {
3355 _cleanup_free_ char *np = NULL;
3356
3357 /* If the specified path is a mount point we
3358 * generate the new snapshot immediately
3359 * inside it under a random name. However if
3360 * the specified is not a mount point we
3361 * create the new snapshot in the parent
3362 * directory, just next to it. */
e26d6ce5 3363 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
3364 if (r < 0) {
3365 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3366 goto finish;
3367 }
3368 if (r > 0)
770b5ce4 3369 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3370 else
770b5ce4 3371 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
3372 if (r < 0) {
3373 log_error_errno(r, "Failed to generate name for snapshot: %m");
3374 goto finish;
3375 }
3376
3377 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3378 if (r < 0) {
3379 log_error_errno(r, "Failed to lock %s: %m", np);
3380 goto finish;
3381 }
3382
5bcd08db 3383 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3384 if (r < 0) {
3385 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3386 goto finish;
ec16945e
LP
3387 }
3388
3389 free(arg_directory);
3390 arg_directory = np;
8a16a7b4 3391 np = NULL;
ec16945e
LP
3392
3393 remove_subvol = true;
30535c16
LP
3394
3395 } else {
3396 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3397 if (r == -EBUSY) {
3398 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3399 goto finish;
3400 }
3401 if (r < 0) {
3402 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3403 return r;
3404 }
3405
3406 if (arg_template) {
5bcd08db 3407 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3408 if (r == -EEXIST) {
3409 if (!arg_quiet)
3410 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3411 } else if (r < 0) {
83521414 3412 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3413 goto finish;
3414 } else {
3415 if (!arg_quiet)
3416 log_info("Populated %s from template %s.", arg_directory, arg_template);
3417 }
3418 }
ec16945e
LP
3419 }
3420
7732f92b 3421 if (arg_start_mode == START_BOOT) {
1b9e5b12 3422 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3423 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3424 r = -EINVAL;
1b9e5b12
LP
3425 goto finish;
3426 }
3427 } else {
3428 const char *p;
3429
16fb773e
LP
3430 p = strjoina(arg_directory, "/usr/");
3431 if (laccess(p, F_OK) < 0) {
3432 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3433 r = -EINVAL;
1b9e5b12 3434 goto finish;
1b9e5b12
LP
3435 }
3436 }
ec16945e 3437
6b9132a9 3438 } else {
1b9e5b12 3439 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3440
ec16945e
LP
3441 assert(arg_image);
3442 assert(!arg_template);
3443
30535c16
LP
3444 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3445 if (r == -EBUSY) {
3446 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3447 goto finish;
3448 }
3449 if (r < 0) {
3450 r = log_error_errno(r, "Failed to create image lock: %m");
3451 goto finish;
3452 }
3453
1b9e5b12 3454 if (!mkdtemp(template)) {
56f64d95 3455 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3456 r = -errno;
6b9132a9 3457 goto finish;
1b9e5b12 3458 }
6b9132a9 3459
1b9e5b12
LP
3460 arg_directory = strdup(template);
3461 if (!arg_directory) {
3462 r = log_oom();
3463 goto finish;
6b9132a9 3464 }
88213476 3465
1b9e5b12
LP
3466 image_fd = setup_image(&device_path, &loop_nr);
3467 if (image_fd < 0) {
3468 r = image_fd;
842f3b0f
LP
3469 goto finish;
3470 }
1b9e5b12 3471
4d9f07b4
LP
3472 r = dissect_image(image_fd,
3473 &root_device, &root_device_rw,
3474 &home_device, &home_device_rw,
3475 &srv_device, &srv_device_rw,
3476 &secondary);
1b9e5b12
LP
3477 if (r < 0)
3478 goto finish;
842f3b0f 3479 }
842f3b0f 3480
5a8af538
LP
3481 r = custom_mounts_prepare();
3482 if (r < 0)
3483 goto finish;
3484
03cfe0d5
LP
3485 interactive =
3486 isatty(STDIN_FILENO) > 0 &&
3487 isatty(STDOUT_FILENO) > 0;
9c857b9d 3488
db7feb7e
LP
3489 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3490 if (master < 0) {
ec16945e 3491 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3492 goto finish;
3493 }
3494
611b312b
LP
3495 r = ptsname_malloc(master, &console);
3496 if (r < 0) {
3497 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 3498 goto finish;
68b02049
DW
3499 }
3500
3501 if (arg_selinux_apifs_context) {
3502 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3503 if (r < 0)
3504 goto finish;
a258bf26
LP
3505 }
3506
a258bf26 3507 if (unlockpt(master) < 0) {
ec16945e 3508 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3509 goto finish;
3510 }
3511
9c857b9d
LP
3512 if (!arg_quiet)
3513 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3514 arg_machine, arg_image ?: arg_directory);
3515
72c0a2c2 3516 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3517
023fb90b
LP
3518 assert_se(sigemptyset(&mask_chld) == 0);
3519 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3520
03cfe0d5
LP
3521 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3522 r = log_error_errno(errno, "Failed to become subreaper: %m");
3523 goto finish;
3524 }
3525
d87be9b0 3526 for (;;) {
03cfe0d5 3527 static const struct sigaction sa = {
189d5bac 3528 .sa_handler = nop_signal_handler,
e866af3a
DH
3529 .sa_flags = SA_NOCLDSTOP,
3530 };
0e7ac751
LP
3531
3532 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3533 _cleanup_close_ int etc_passwd_lock = -1;
3534 _cleanup_close_pair_ int
3535 kmsg_socket_pair[2] = { -1, -1 },
3536 rtnl_socket_pair[2] = { -1, -1 },
3537 pid_socket_pair[2] = { -1, -1 },
3538 uuid_socket_pair[2] = { -1, -1 },
3539 uid_shift_socket_pair[2] = { -1, -1 };
3540 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4afd3348 3541 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
dbb60d69 3542 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4afd3348 3543 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
0e7ac751 3544 ContainerStatus container_status;
dbb60d69 3545 char last_char = 0;
0e7ac751
LP
3546 int ifi = 0;
3547 ssize_t l;
3548
0de7acce 3549 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3550 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3551 * check with getpwuid() if the specific user already exists. Note that /etc might be
3552 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3553 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3554 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3555 * really ours. */
3556
3557 etc_passwd_lock = take_etc_passwd_lock(NULL);
3558 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) {
3559 log_error_errno(r, "Failed to take /etc/passwd lock: %m");
3560 goto finish;
3561 }
3562 }
e866af3a 3563
7566e267 3564 r = barrier_create(&barrier);
a2da110b 3565 if (r < 0) {
da927ba9 3566 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3567 goto finish;
3568 }
3569
4610de50 3570 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
6d0b55c2
LP
3571 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3572 goto finish;
3573 }
3574
4610de50 3575 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
6d0b55c2
LP
3576 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3577 goto finish;
3578 }
3579
4610de50 3580 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
03cfe0d5
LP
3581 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3582 goto finish;
3583 }
3584
e01ff70a
MS
3585 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0) {
3586 r = log_error_errno(errno, "Failed to create id socket pair: %m");
3587 goto finish;
3588 }
3589
0de7acce 3590 if (arg_userns_mode != USER_NAMESPACE_NO)
4610de50 3591 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
825d5287
RM
3592 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3593 goto finish;
3594 }
3595
e866af3a
DH
3596 /* Child can be killed before execv(), so handle SIGCHLD
3597 * in order to interrupt parent's blocking calls and
3598 * give it a chance to call wait() and terminate. */
3599 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3600 if (r < 0) {
ec16945e 3601 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3602 goto finish;
3603 }
3604
e866af3a
DH
3605 r = sigaction(SIGCHLD, &sa, NULL);
3606 if (r < 0) {
ec16945e 3607 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3608 goto finish;
3609 }
3610
8869a0b4 3611 pid = raw_clone(SIGCHLD|CLONE_NEWNS);
d87be9b0
LP
3612 if (pid < 0) {
3613 if (errno == EINVAL)
ec16945e 3614 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3615 else
ec16945e 3616 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3617
d87be9b0
LP
3618 goto finish;
3619 }
a258bf26 3620
d87be9b0 3621 if (pid == 0) {
03cfe0d5 3622 /* The outer child only has a file system namespace. */
a2da110b
DH
3623 barrier_set_role(&barrier, BARRIER_CHILD);
3624
03e334a1 3625 master = safe_close(master);
a258bf26 3626
03e334a1 3627 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3628 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 3629 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
e01ff70a 3630 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
825d5287 3631 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 3632
ce30c8dc
LP
3633 (void) reset_all_signal_handlers();
3634 (void) reset_signal_mask();
f5c1b9ee 3635
03cfe0d5
LP
3636 r = outer_child(&barrier,
3637 arg_directory,
3638 console,
3639 root_device, root_device_rw,
3640 home_device, home_device_rw,
3641 srv_device, srv_device_rw,
3642 interactive,
3643 secondary,
3644 pid_socket_pair[1],
e01ff70a 3645 uuid_socket_pair[1],
03cfe0d5
LP
3646 kmsg_socket_pair[1],
3647 rtnl_socket_pair[1],
825d5287 3648 uid_shift_socket_pair[1],
f757855e 3649 fds);
0cb9fbcd 3650 if (r < 0)
a2da110b 3651 _exit(EXIT_FAILURE);
d87be9b0 3652
03cfe0d5 3653 _exit(EXIT_SUCCESS);
da5b3bad 3654 }
88213476 3655
a2da110b 3656 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 3657
2feceb5e 3658 fds = fdset_free(fds);
842f3b0f 3659
6d0b55c2
LP
3660 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3661 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 3662 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
e01ff70a 3663 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
82116c43 3664 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
6d0b55c2 3665
0de7acce 3666 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
3667 /* The child just let us know the UID shift it might have read from the image. */
3668 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3669 if (l < 0) {
3670 r = log_error_errno(errno, "Failed to read UID shift: %m");
3671 goto finish;
3672 }
3673 if (l != sizeof(arg_uid_shift)) {
3674 log_error("Short read while reading UID shift.");
3675 r = EIO;
3676 goto finish;
3677 }
3678
0de7acce 3679 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3680 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3681 * image, but if that's already in use, pick a new one, and report back to the child,
3682 * which one we now picked. */
3683
3684 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3685 if (r < 0) {
3686 log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3687 goto finish;
3688 }
3689
3690 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3691 if (l < 0) {
3692 r = log_error_errno(errno, "Failed to send UID shift: %m");
3693 goto finish;
3694 }
3695 if (l != sizeof(arg_uid_shift)) {
3696 log_error("Short write while writing UID shift.");
3697 r = -EIO;
3698 goto finish;
3699 }
3700 }
3701 }
3702
03cfe0d5
LP
3703 /* Wait for the outer child. */
3704 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3705 if (r < 0)
3706 goto finish;
3707 if (r != 0) {
3708 r = -EIO;
3709 goto finish;
3710 }
3711 pid = 0;
6dac160c 3712
03cfe0d5
LP
3713 /* And now retrieve the PID of the inner child. */
3714 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3715 if (l < 0) {
3716 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3717 goto finish;
3718 }
3719 if (l != sizeof(pid)) {
76d44882 3720 log_error("Short read while reading inner child PID.");
03cfe0d5
LP
3721 r = EIO;
3722 goto finish;
3723 }
354bfd2b 3724
e01ff70a
MS
3725 /* We also retrieve container UUID in case it was generated by outer child */
3726 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof(arg_uuid), 0);
3727 if (l < 0) {
3728 r = log_error_errno(errno, "Failed to read container machine ID: %m");
3729 goto finish;
3730 }
3731 if (l != sizeof(arg_uuid)) {
3732 log_error("Short read while reading container machined ID.");
3733 r = EIO;
3734 goto finish;
3735 }
3736
03cfe0d5 3737 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 3738
0de7acce 3739 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3740 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3741 log_error("Child died too early.");
3742 r = -ESRCH;
840295fc 3743 goto finish;
03cfe0d5 3744 }
ab046dde 3745
03cfe0d5 3746 r = setup_uid_map(pid);
840295fc
LP
3747 if (r < 0)
3748 goto finish;
ab046dde 3749
03cfe0d5
LP
3750 (void) barrier_place(&barrier); /* #2 */
3751 }
c74e630d 3752
9a2a5625 3753 if (arg_private_network) {
4bbfe7ad 3754
9a2a5625
LP
3755 r = move_network_interfaces(pid, arg_network_interfaces);
3756 if (r < 0)
3757 goto finish;
5aa4bb6b 3758
9a2a5625 3759 if (arg_network_veth) {
22b28dfd
LP
3760 r = setup_veth(arg_machine, pid, veth_name,
3761 arg_network_bridge || arg_network_zone);
9a2a5625
LP
3762 if (r < 0)
3763 goto finish;
3764 else if (r > 0)
3765 ifi = r;
6dac160c 3766
9a2a5625 3767 if (arg_network_bridge) {
22b28dfd
LP
3768 /* Add the interface to a bridge */
3769 r = setup_bridge(veth_name, arg_network_bridge, false);
3770 if (r < 0)
3771 goto finish;
3772 if (r > 0)
3773 ifi = r;
3774 } else if (arg_network_zone) {
3775 /* Add the interface to a bridge, possibly creating it */
3776 r = setup_bridge(veth_name, arg_network_zone, true);
9a2a5625
LP
3777 if (r < 0)
3778 goto finish;
3779 if (r > 0)
3780 ifi = r;
3781 }
3782 }
6dac160c 3783
f6d6bad1
LP
3784 r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
3785 if (r < 0)
3786 goto finish;
3787
7513c5b8
LP
3788 /* We created the primary and extra veth links now; let's remember this, so that we know to
3789 remove them later on. Note that we don't bother with removing veth links that were created
3790 here when their setup failed half-way, because in that case the kernel should be able to
3791 remove them on its own, since they cannot be referenced by anything yet. */
3792 veth_created = true;
3793
9a2a5625
LP
3794 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3795 if (r < 0)
3796 goto finish;
3797
3798 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3799 if (r < 0)
3800 goto finish;
3801 }
6dac160c 3802
b7103bc5
LP
3803 if (arg_register) {
3804 r = register_machine(
3805 arg_machine,
3806 pid,
3807 arg_directory,
3808 arg_uuid,
3809 ifi,
3810 arg_slice,
3811 arg_custom_mounts, arg_n_custom_mounts,
3812 arg_kill_signal,
3813 arg_property,
6aadfa4c
ILG
3814 arg_keep_unit,
3815 arg_container_service_name);
b7103bc5
LP
3816 if (r < 0)
3817 goto finish;
3818 }
6dac160c 3819
34829a32 3820 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
efdb0237
LP
3821 if (r < 0)
3822 goto finish;
3823
34829a32
LP
3824 if (arg_keep_unit) {
3825 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3826 if (r < 0)
3827 goto finish;
3828 }
efdb0237 3829
34829a32 3830 r = chown_cgroup(pid, arg_uid_shift);
03cfe0d5
LP
3831 if (r < 0)
3832 goto finish;
6dac160c 3833
03cfe0d5
LP
3834 /* Notify the child that the parent is ready with all
3835 * its setup (including cgroup-ification), and that
3836 * the child can now hand over control to the code to
3837 * run inside the container. */
3838 (void) barrier_place(&barrier); /* #3 */
6dac160c 3839
03cfe0d5
LP
3840 /* Block SIGCHLD here, before notifying child.
3841 * process_pty() will handle it with the other signals. */
3842 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 3843
03cfe0d5
LP
3844 /* Reset signal to default */
3845 r = default_signals(SIGCHLD, -1);
3846 if (r < 0) {
3847 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3848 goto finish;
3849 }
e866af3a 3850
03cfe0d5 3851 /* Let the child know that we are ready and wait that the child is completely ready now. */
c0ffce2b
KN
3852 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3853 log_error("Child died too early.");
03cfe0d5
LP
3854 r = -ESRCH;
3855 goto finish;
3856 }
b12afc8c 3857
0e7ac751
LP
3858 /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear
3859 * in getpwuid(), thus we can release the /etc/passwd lock. */
3860 etc_passwd_lock = safe_close(etc_passwd_lock);
3861
03cfe0d5
LP
3862 sd_notifyf(false,
3863 "READY=1\n"
3864 "STATUS=Container running.\n"
3865 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 3866
03cfe0d5
LP
3867 r = sd_event_new(&event);
3868 if (r < 0) {
3869 log_error_errno(r, "Failed to get default event source: %m");
3870 goto finish;
3871 }
88213476 3872
03cfe0d5
LP
3873 if (arg_kill_signal > 0) {
3874 /* Try to kill the init system on SIGINT or SIGTERM */
4a0b58c4
LP
3875 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
3876 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
03cfe0d5
LP
3877 } else {
3878 /* Immediately exit */
3879 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3880 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3881 }
023fb90b 3882
03cfe0d5
LP
3883 /* simply exit on sigchld */
3884 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 3885
03cfe0d5 3886 if (arg_expose_ports) {
7a8f6325 3887 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
03cfe0d5
LP
3888 if (r < 0)
3889 goto finish;
023fb90b 3890
7a8f6325 3891 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
03cfe0d5 3892 }
023fb90b 3893
03cfe0d5 3894 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 3895
ae3dde80 3896 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
03cfe0d5
LP
3897 if (r < 0) {
3898 log_error_errno(r, "Failed to create PTY forwarder: %m");
3899 goto finish;
3900 }
023fb90b 3901
03cfe0d5
LP
3902 r = sd_event_loop(event);
3903 if (r < 0) {
3904 log_error_errno(r, "Failed to run event loop: %m");
3905 goto finish;
3906 }
6d0b55c2 3907
03cfe0d5 3908 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 3909
03cfe0d5 3910 forward = pty_forward_free(forward);
6d0b55c2 3911
03cfe0d5
LP
3912 if (!arg_quiet && last_char != '\n')
3913 putc('\n', stdout);
04d39279 3914
03cfe0d5 3915 /* Kill if it is not dead yet anyway */
b7103bc5
LP
3916 if (arg_register && !arg_keep_unit)
3917 terminate_machine(pid);
1f0cd86b 3918
840295fc 3919 /* Normally redundant, but better safe than sorry */
04d39279 3920 kill(pid, SIGKILL);
a258bf26 3921
113cea80 3922 r = wait_for_container(pid, &container_status);
04d39279
LP
3923 pid = 0;
3924
ec16945e 3925 if (r < 0)
ce9f1527
LP
3926 /* We failed to wait for the container, or the
3927 * container exited abnormally */
ec16945e 3928 goto finish;
9ed794a3 3929 else if (r > 0 || container_status == CONTAINER_TERMINATED) {
ce9f1527
LP
3930 /* The container exited with a non-zero
3931 * status, or with zero status and no reboot
3932 * was requested. */
ec16945e 3933 ret = r;
d87be9b0 3934 break;
ec16945e 3935 }
88213476 3936
113cea80 3937 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3938
3939 if (arg_keep_unit) {
3940 /* Special handling if we are running as a
3941 * service: instead of simply restarting the
3942 * machine we want to restart the entire
3943 * service, so let's inform systemd about this
3944 * with the special exit code 133. The service
3945 * file uses RestartForceExitStatus=133 so
3946 * that this results in a full nspawn
3947 * restart. This is necessary since we might
3948 * have cgroup parameters set we want to have
3949 * flushed out. */
ec16945e
LP
3950 ret = 133;
3951 r = 0;
ce38dbc8
LP
3952 break;
3953 }
6d0b55c2 3954
7a8f6325 3955 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8 3956
ef3b2aa7 3957 (void) remove_veth_links(veth_name, arg_network_veth_extra);
7513c5b8 3958 veth_created = false;
d87be9b0 3959 }
88213476
LP
3960
3961finish:
af4ec430
LP
3962 sd_notify(false,
3963 "STOPPING=1\n"
3964 "STATUS=Terminating...");
3965
9444b1f2
LP
3966 if (pid > 0)
3967 kill(pid, SIGKILL);
88213476 3968
503546da
LP
3969 /* Try to flush whatever is still queued in the pty */
3970 if (master >= 0)
59f448cf 3971 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
503546da 3972
03cfe0d5
LP
3973 loop_remove(loop_nr, &image_fd);
3974
ec16945e
LP
3975 if (remove_subvol && arg_directory) {
3976 int k;
3977
5bcd08db 3978 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
ec16945e
LP
3979 if (k < 0)
3980 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3981 }
3982
785890ac
LP
3983 if (arg_machine) {
3984 const char *p;
3985
63c372cb 3986 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 3987 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
3988 }
3989
7a8f6325 3990 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
3991
3992 if (veth_created)
3993 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 3994 (void) remove_bridge(arg_network_zone);
f757855e 3995
04d391da 3996 free(arg_directory);
ec16945e
LP
3997 free(arg_template);
3998 free(arg_image);
7027ff61 3999 free(arg_machine);
c74e630d 4000 free(arg_user);
5f932eb9 4001 free(arg_chdir);
c74e630d 4002 strv_free(arg_setenv);
f757855e 4003 free(arg_network_bridge);
c74e630d
LP
4004 strv_free(arg_network_interfaces);
4005 strv_free(arg_network_macvlan);
4bbfe7ad 4006 strv_free(arg_network_ipvlan);
f6d6bad1 4007 strv_free(arg_network_veth_extra);
f757855e
LP
4008 strv_free(arg_parameters);
4009 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4010 expose_port_free_all(arg_expose_ports);
6d0b55c2 4011
ec16945e 4012 return r < 0 ? EXIT_FAILURE : ret;
88213476 4013}