]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn-mount: Remove unused parameters
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
8fe0087e
LP
14#include <sys/personality.h>
15#include <sys/prctl.h>
16#include <sys/types.h>
6916b164 17#include <sys/wait.h>
8fe0087e 18#include <unistd.h>
1b9e5b12 19
b053cd5f 20#include "sd-bus.h"
1f0cd86b 21#include "sd-daemon.h"
1f0cd86b 22#include "sd-id128.h"
8fe0087e 23
b5efdb8a 24#include "alloc-util.h"
8fe0087e
LP
25#include "barrier.h"
26#include "base-filesystem.h"
27#include "blkid-util.h"
28#include "btrfs-util.h"
b8ea7a6e 29#include "bus-error.h"
b053cd5f 30#include "bus-util.h"
8fe0087e 31#include "cap-list.h"
430f0182 32#include "capability-util.h"
04d391da 33#include "cgroup-util.h"
8fe0087e 34#include "copy.h"
d107bb7d 35#include "cpu-set-util.h"
4fc9982c 36#include "dev-setup.h"
2d845785 37#include "dissect-image.h"
8fe0087e 38#include "env-util.h"
3ffd4af2 39#include "fd-util.h"
842f3b0f 40#include "fdset.h"
a5c32cff 41#include "fileio.h"
f97b34a6 42#include "format-util.h"
f4f15635 43#include "fs-util.h"
1b9e5b12 44#include "gpt.h"
4623e8e6 45#include "hexdecoct.h"
8fe0087e 46#include "hostname-util.h"
910fd145 47#include "id128-util.h"
8fe0087e 48#include "log.h"
2d845785 49#include "loop-util.h"
8fe0087e 50#include "loopback-setup.h"
1b9cebf6 51#include "machine-image.h"
8fe0087e 52#include "macro.h"
44dbef90 53#include "main-func.h"
f5947a5e 54#include "missing_sched.h"
8fe0087e 55#include "mkdir.h"
4349cd7c 56#include "mount-util.h"
049af8ad 57#include "mountpoint-util.h"
0cb8e3d1 58#include "namespace-util.h"
8fe0087e 59#include "netlink-util.h"
07630cea 60#include "nspawn-cgroup.h"
3603efde 61#include "nspawn-def.h"
07630cea
LP
62#include "nspawn-expose-ports.h"
63#include "nspawn-mount.h"
64#include "nspawn-network.h"
de40a303 65#include "nspawn-oci.h"
7336138e 66#include "nspawn-patch-uid.h"
07630cea 67#include "nspawn-register.h"
910fd145 68#include "nspawn-seccomp.h"
07630cea
LP
69#include "nspawn-settings.h"
70#include "nspawn-setuid.h"
7732f92b 71#include "nspawn-stub-pid1.h"
d8b4d14d 72#include "nulstr-util.h"
d58ad743 73#include "os-util.h"
50ebcf6c 74#include "pager.h"
6bedfcbb 75#include "parse-util.h"
8fe0087e 76#include "path-util.h"
294bf0c3 77#include "pretty-print.h"
0b452006 78#include "process-util.h"
8fe0087e
LP
79#include "ptyfwd.h"
80#include "random-util.h"
8869a0b4 81#include "raw-clone.h"
bf428efb 82#include "rlimit-util.h"
8fe0087e 83#include "rm-rf.h"
de40a303
LP
84#if HAVE_SECCOMP
85#include "seccomp-util.h"
86#endif
68b02049 87#include "selinux-util.h"
8fe0087e 88#include "signal-util.h"
2583fbea 89#include "socket-util.h"
8fcde012 90#include "stat-util.h"
15a5e950 91#include "stdio-util.h"
5c828e66 92#include "string-table.h"
07630cea 93#include "string-util.h"
8fe0087e 94#include "strv.h"
de40a303 95#include "sysctl-util.h"
8fe0087e 96#include "terminal-util.h"
e4de7287 97#include "tmpfile-util.h"
affb60b1 98#include "umask-util.h"
43c3fb46 99#include "unit-name.h"
b1d4f8e1 100#include "user-util.h"
8fe0087e 101#include "util.h"
e9642be2 102
62b1e758
YW
103#if HAVE_SPLIT_USR
104#define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
105#else
106#define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
107#endif
108
9c1e04d0
AP
109/* nspawn listens on the socket whose path is given by the constant NSPAWN_NOTIFY_SOCKET_PATH.
110 * NSPAWN_NOTIFY_SOCKET_PATH is relative to the container.
111 * The init process in the container can send messages to nspawn on it, following the sd_notify(3) protocol. */
112#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 113
2a49b612
ZJS
114#define EXIT_FORCE_RESTART 133
115
113cea80
DH
/* Possible outcomes of a container run.
 * NOTE(review): meaning inferred from the constant names only; the code using this enum is outside
 * this section — confirm against the callers. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
120
88213476 121static char *arg_directory = NULL;
ec16945e 122static char *arg_template = NULL;
5f932eb9 123static char *arg_chdir = NULL;
b53ede69
PW
124static char *arg_pivot_root_new = NULL;
125static char *arg_pivot_root_old = NULL;
687d0825 126static char *arg_user = NULL;
de40a303
LP
127static uid_t arg_uid = UID_INVALID;
128static gid_t arg_gid = GID_INVALID;
129static gid_t* arg_supplementary_gids = NULL;
130static size_t arg_n_supplementary_gids = 0;
9444b1f2 131static sd_id128_t arg_uuid = {};
3a9530e5
LP
132static char *arg_machine = NULL; /* The name used by the host to refer to this */
133static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
134static const char *arg_selinux_context = NULL;
135static const char *arg_selinux_apifs_context = NULL;
de40a303 136static char *arg_slice = NULL;
ff01d048 137static bool arg_private_network = false;
bc2f673e 138static bool arg_read_only = false;
7732f92b 139static StartMode arg_start_mode = START_PID1;
ec16945e 140static bool arg_ephemeral = false;
57fb9fb5 141static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 142static bool arg_link_journal_try = false;
520e0d54 143static uint64_t arg_caps_retain =
50b52222
LP
144 (1ULL << CAP_AUDIT_CONTROL) |
145 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
146 (1ULL << CAP_CHOWN) |
147 (1ULL << CAP_DAC_OVERRIDE) |
148 (1ULL << CAP_DAC_READ_SEARCH) |
149 (1ULL << CAP_FOWNER) |
150 (1ULL << CAP_FSETID) |
151 (1ULL << CAP_IPC_OWNER) |
152 (1ULL << CAP_KILL) |
153 (1ULL << CAP_LEASE) |
154 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 155 (1ULL << CAP_MKNOD) |
5076f0cc
LP
156 (1ULL << CAP_NET_BIND_SERVICE) |
157 (1ULL << CAP_NET_BROADCAST) |
158 (1ULL << CAP_NET_RAW) |
5076f0cc 159 (1ULL << CAP_SETFCAP) |
50b52222 160 (1ULL << CAP_SETGID) |
5076f0cc
LP
161 (1ULL << CAP_SETPCAP) |
162 (1ULL << CAP_SETUID) |
163 (1ULL << CAP_SYS_ADMIN) |
50b52222 164 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
165 (1ULL << CAP_SYS_CHROOT) |
166 (1ULL << CAP_SYS_NICE) |
167 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 168 (1ULL << CAP_SYS_RESOURCE) |
50b52222 169 (1ULL << CAP_SYS_TTY_CONFIG);
de40a303 170static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
5a8af538 171static CustomMount *arg_custom_mounts = NULL;
88614c8a 172static size_t arg_n_custom_mounts = 0;
f4889f65 173static char **arg_setenv = NULL;
284c0b91 174static bool arg_quiet = false;
eb91eb18 175static bool arg_register = true;
89f7c846 176static bool arg_keep_unit = false;
aa28aefe 177static char **arg_network_interfaces = NULL;
c74e630d 178static char **arg_network_macvlan = NULL;
4bbfe7ad 179static char **arg_network_ipvlan = NULL;
69c79d3c 180static bool arg_network_veth = false;
f6d6bad1 181static char **arg_network_veth_extra = NULL;
f757855e 182static char *arg_network_bridge = NULL;
22b28dfd 183static char *arg_network_zone = NULL;
d7bea6b6 184static char *arg_network_namespace_path = NULL;
bb068de0 185static PagerFlags arg_pager_flags = 0;
050f7277 186static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 187static char *arg_image = NULL;
de40a303 188static char *arg_oci_bundle = NULL;
f757855e 189static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 190static ExposePort *arg_expose_ports = NULL;
f36933fe 191static char **arg_property = NULL;
de40a303 192static sd_bus_message *arg_property_message = NULL;
0de7acce 193static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 194static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 195static bool arg_userns_chown = false;
c6c8f6e2 196static int arg_kill_signal = 0;
5da38d07 197static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
198static SettingsMask arg_settings_mask = 0;
199static int arg_settings_trusted = -1;
200static char **arg_parameters = NULL;
6aadfa4c 201static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 202static bool arg_notify_ready = false;
5a8ff0e6 203static bool arg_use_cgns = true;
0c582db0 204static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 205static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
4623e8e6
LP
206static void *arg_root_hash = NULL;
207static size_t arg_root_hash_size = 0;
960e4569
LP
208static char **arg_syscall_whitelist = NULL;
209static char **arg_syscall_blacklist = NULL;
de40a303
LP
210#if HAVE_SECCOMP
211static scmp_filter_ctx arg_seccomp = NULL;
212#endif
bf428efb 213static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 214static bool arg_no_new_privileges = false;
81f345df
LP
215static int arg_oom_score_adjust = 0;
216static bool arg_oom_score_adjust_set = false;
0985c7c4 217static CPUSet arg_cpu_set = {};
09d423e9 218static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 219static TimezoneMode arg_timezone = TIMEZONE_AUTO;
de40a303
LP
220static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
221static DeviceNode* arg_extra_nodes = NULL;
222static size_t arg_n_extra_nodes = 0;
223static char **arg_sysctl = NULL;
224static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
88213476 225
6145bb4f
LP
/* Cleanup handlers for the heap-allocated arg_* state: each entry pairs a variable with the function
 * that releases it. The order of the registrations is kept exactly as it was. */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_root_hash, freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_whitelist, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_blacklist, strv_freep);
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
257
dce66ffe
ZJS
258static int handle_arg_console(const char *arg) {
259 if (streq(arg, "help")) {
260 puts("interactive\n"
261 "read-only\n"
262 "passive\n"
263 "pipe");
264 return 0;
265 }
266
267 if (streq(arg, "interactive"))
268 arg_console_mode = CONSOLE_INTERACTIVE;
269 else if (streq(arg, "read-only"))
270 arg_console_mode = CONSOLE_READ_ONLY;
271 else if (streq(arg, "passive"))
272 arg_console_mode = CONSOLE_PASSIVE;
273 else if (streq(arg, "pipe"))
274 arg_console_mode = CONSOLE_PIPE;
275 else
276 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
277
278 arg_settings_mask |= SETTING_CONSOLE_MODE;
279 return 1;
280}
281
37ec0fdd
LP
282static int help(void) {
283 _cleanup_free_ char *link = NULL;
284 int r;
285
bb068de0 286 (void) pager_open(arg_pager_flags);
50ebcf6c 287
37ec0fdd
LP
288 r = terminal_urlify_man("systemd-nspawn", "1", &link);
289 if (r < 0)
290 return log_oom();
291
25148653 292 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 293 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
294 " -h --help Show this help\n"
295 " --version Print version string\n"
69c79d3c 296 " -q --quiet Do not show status information\n"
bb068de0 297 " --no-pager Do not pipe output into a pager\n"
25148653
LP
298 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
299 "%3$sImage:%4$s\n"
1b9e5b12 300 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
301 " --template=PATH Initialize root directory from template directory,\n"
302 " if missing\n"
303 " -x --ephemeral Run container with snapshot of root directory, and\n"
304 " remove it after exit\n"
25e68fd3
LP
305 " -i --image=PATH Root file system disk image (or device node) for\n"
306 " the container\n"
de40a303 307 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
308 " --read-only Mount the root directory read-only\n"
309 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 310 " --root-hash=HASH Specify verity root hash for root disk image\n"
25148653
LP
311 " --pivot-root=PATH[:PATH]\n"
312 " Pivot root to given directory in the container\n\n"
313 "%3$sExecution:%4$s\n"
7732f92b 314 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 315 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 316 " --chdir=PATH Set working directory in the container\n"
25148653
LP
317 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
318 " -u --user=USER Run the command under specified user or UID\n"
319 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
320 " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
321 "%3$sSystem Identity:%4$s\n"
a8828ed9 322 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 323 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
324 " --uuid=UUID Set a specific machine UUID for the container\n\n"
325 "%3$sProperties:%4$s\n"
a8828ed9 326 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 327 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
328 " --register=BOOLEAN Register container as machine\n"
329 " --keep-unit Do not register a scope for the machine, reuse\n"
330 " the service unit nspawn is running in\n\n"
331 "%3$sUser Namespacing:%4$s\n"
90b4a64d 332 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 333 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 334 " Similar, but with user configured UID/GID range\n"
25148653
LP
335 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
336 "%3$sNetworking:%4$s\n"
69c79d3c
LP
337 " --private-network Disable network in container\n"
338 " --network-interface=INTERFACE\n"
339 " Assign an existing network interface to the\n"
340 " container\n"
c74e630d
LP
341 " --network-macvlan=INTERFACE\n"
342 " Create a macvlan network interface based on an\n"
343 " existing network interface to the container\n"
4bbfe7ad
TG
344 " --network-ipvlan=INTERFACE\n"
345 " Create a ipvlan network interface based on an\n"
346 " existing network interface to the container\n"
a8eaaee7 347 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 348 " and container\n"
f6d6bad1
LP
349 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
350 " Add an additional virtual Ethernet link between\n"
351 " host and container\n"
ab046dde 352 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
353 " Add a virtual Ethernet connection to the container\n"
354 " and attach it to an existing bridge on the host\n"
355 " --network-zone=NAME Similar, but attach the new interface to an\n"
356 " an automatically managed bridge interface\n"
d7bea6b6
DP
357 " --network-namespace-path=PATH\n"
358 " Set network namespace to the one represented by\n"
359 " the specified kernel namespace file node\n"
6d0b55c2 360 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
361 " Expose a container IP port on the host\n\n"
362 "%3$sSecurity:%4$s\n"
a8828ed9
DW
363 " --capability=CAP In addition to the default, retain specified\n"
364 " capability\n"
365 " --drop-capability=CAP Drop the specified capability from the default set\n"
f4e803c8 366 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
367 " --system-call-filter=LIST|~LIST\n"
368 " Permit/prohibit specific system calls\n"
25148653
LP
369 " -Z --selinux-context=SECLABEL\n"
370 " Set the SELinux security context to be used by\n"
371 " processes in the container\n"
372 " -L --selinux-apifs-context=SECLABEL\n"
373 " Set the SELinux security context to be used by\n"
374 " API/tmpfs file systems in the container\n\n"
375 "%3$sResources:%4$s\n"
bf428efb 376 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
377 " --oom-score-adjust=VALUE\n"
378 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
379 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
380 " --personality=ARCH Pick personality for this container\n\n"
25148653 381 "%3$sIntegration:%4$s\n"
09d423e9 382 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 383 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
384 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
385 " host, try-guest, try-host\n"
386 " -j Equivalent to --link-journal=try-guest\n\n"
387 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
388 " --bind=PATH[:PATH[:OPTIONS]]\n"
389 " Bind mount a file or directory from the host into\n"
a8828ed9 390 " the container\n"
5e5bfa6e
EY
391 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
392 " Similar, but creates a read-only bind mount\n"
de40a303
LP
393 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
394 " it\n"
06c17c39 395 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
396 " --overlay=PATH[:PATH...]:PATH\n"
397 " Create an overlay mount from the host to \n"
398 " the container\n"
399 " --overlay-ro=PATH[:PATH...]:PATH\n"
25148653
LP
400 " Similar, but creates a read-only overlay mount\n\n"
401 "%3$sInput/Output:%4$s\n"
de40a303
LP
402 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
403 " set up for the container.\n"
404 " -P --pipe Equivalent to --console=pipe\n"
25148653 405 "\nSee the %2$s for details.\n"
37ec0fdd
LP
406 , program_invocation_short_name
407 , link
37a92352
LP
408 , ansi_underline(), ansi_normal()
409 , ansi_highlight(), ansi_normal()
410 );
37ec0fdd
LP
411
412 return 0;
88213476
LP
413}
414
86c0dd4a 415static int custom_mount_check_all(void) {
88614c8a 416 size_t i;
5a8af538 417
5a8af538
LP
418 for (i = 0; i < arg_n_custom_mounts; i++) {
419 CustomMount *m = &arg_custom_mounts[i];
420
0de7acce 421 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
baaa35ad
ZJS
422 if (arg_userns_chown)
423 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
424 "--private-users-chown may not be combined with custom root mounts.");
425 else if (arg_uid_shift == UID_INVALID)
426 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
427 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 428 }
5a8af538
LP
429 }
430
431 return 0;
432}
433
8199d554 434static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 435 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 436 int r;
5da38d07 437
efdb0237 438 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
439
440 e = getenv(var);
441 if (!e) {
d5fc5b2f 442 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
443 var = "UNIFIED_CGROUP_HIERARCHY";
444 e = getenv(var);
c78c095b
ZJS
445 }
446
447 if (!isempty(e)) {
efdb0237
LP
448 r = parse_boolean(e);
449 if (r < 0)
c78c095b 450 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
451 if (r > 0)
452 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
453 else
454 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
455 }
456
8199d554
LP
457 return 0;
458}
459
460static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
461 int r;
462
75b0d8b8
ZJS
463 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
464 * in the image actually supports. */
b4cccbc1
LP
465 r = cg_all_unified();
466 if (r < 0)
467 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
468 if (r > 0) {
a8725a06
ZJS
469 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
470 * routine only detects 231, so we'll have a false negative here for 230. */
471 r = systemd_installation_has_version(directory, 230);
472 if (r < 0)
473 return log_error_errno(r, "Failed to determine systemd version in container: %m");
474 if (r > 0)
475 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
476 else
477 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 478 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
479 /* Mixed cgroup hierarchy support was added in 233 */
480 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
481 if (r < 0)
482 return log_error_errno(r, "Failed to determine systemd version in container: %m");
483 if (r > 0)
484 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
485 else
486 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
487 } else
5da38d07 488 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 489
8199d554
LP
490 log_debug("Using %s hierarchy for container.",
491 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
492 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
493
efdb0237
LP
494 return 0;
495}
496
8a99bd0c
ZJS
497static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
498 uint64_t mask = 0;
499 int r;
500
501 for (;;) {
502 _cleanup_free_ char *t = NULL;
503
504 r = extract_first_word(&spec, &t, ",", 0);
505 if (r < 0)
506 return log_error_errno(r, "Failed to parse capability %s.", t);
507 if (r == 0)
508 break;
509
510 if (streq(t, "help")) {
511 for (int i = 0; i < capability_list_length(); i++) {
512 const char *name;
513
514 name = capability_to_name(i);
515 if (name)
516 puts(name);
517 }
518
519 return 0; /* quit */
520 }
521
522 if (streq(t, "all"))
523 mask = (uint64_t) -1;
524 else {
525 r = capability_from_name(t);
526 if (r < 0)
527 return log_error_errno(r, "Failed to parse capability %s.", t);
528
529 mask |= 1ULL << r;
530 }
531 }
532
533 *ret_mask = mask;
534 return 1; /* continue */
535}
536
49048684 537static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
538 int r;
539
540 r = getenv_bool(name);
541 if (r == -ENXIO)
49048684 542 return 0;
0c582db0 543 if (r < 0)
49048684 544 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 545
0c582db0 546 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 547 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 548 return 0;
0c582db0
LB
549}
550
49048684 551static int parse_mount_settings_env(void) {
4f086aab 552 const char *e;
1099ceeb
LP
553 int r;
554
555 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
556 if (r < 0 && r != -ENXIO)
557 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
558 if (r >= 0)
559 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
560
561 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 562 if (streq_ptr(e, "network"))
4f086aab 563 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 564
49048684
ZJS
565 else if (e) {
566 r = parse_boolean(e);
567 if (r < 0)
568 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
569
570 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
571 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 572 }
4f086aab 573
49048684 574 return 0;
4f086aab
SU
575}
576
49048684 577static int parse_environment(void) {
d5455d2f
LP
578 const char *e;
579 int r;
580
49048684
ZJS
581 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
582 if (r < 0)
583 return r;
584 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
585 if (r < 0)
586 return r;
587 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
588 if (r < 0)
589 return r;
590 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
591 if (r < 0)
592 return r;
d5455d2f 593
49048684
ZJS
594 r = parse_mount_settings_env();
595 if (r < 0)
596 return r;
d5455d2f 597
489fae52
ZJS
598 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
599 * even if it is supported. If not supported, it has no effect. */
de40a303 600 if (!cg_ns_supported())
489fae52 601 arg_use_cgns = false;
de40a303
LP
602 else {
603 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
604 if (r < 0) {
605 if (r != -ENXIO)
49048684 606 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
607
608 arg_use_cgns = true;
609 } else {
610 arg_use_cgns = r > 0;
611 arg_settings_mask |= SETTING_USE_CGNS;
612 }
613 }
d5455d2f
LP
614
615 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
616 if (e)
617 arg_container_service_name = e;
618
49048684 619 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
620}
621
88213476 622static int parse_argv(int argc, char *argv[]) {
a41fe3a2 623 enum {
acbeb427
ZJS
624 ARG_VERSION = 0x100,
625 ARG_PRIVATE_NETWORK,
bc2f673e 626 ARG_UUID,
5076f0cc 627 ARG_READ_ONLY,
57fb9fb5 628 ARG_CAPABILITY,
420c7379 629 ARG_DROP_CAPABILITY,
17fe0523
LP
630 ARG_LINK_JOURNAL,
631 ARG_BIND,
f4889f65 632 ARG_BIND_RO,
06c17c39 633 ARG_TMPFS,
5a8af538
LP
634 ARG_OVERLAY,
635 ARG_OVERLAY_RO,
de40a303 636 ARG_INACCESSIBLE,
eb91eb18 637 ARG_SHARE_SYSTEM,
89f7c846 638 ARG_REGISTER,
aa28aefe 639 ARG_KEEP_UNIT,
69c79d3c 640 ARG_NETWORK_INTERFACE,
c74e630d 641 ARG_NETWORK_MACVLAN,
4bbfe7ad 642 ARG_NETWORK_IPVLAN,
ab046dde 643 ARG_NETWORK_BRIDGE,
22b28dfd 644 ARG_NETWORK_ZONE,
f6d6bad1 645 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 646 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 647 ARG_PERSONALITY,
4d9f07b4 648 ARG_VOLATILE,
ec16945e 649 ARG_TEMPLATE,
f36933fe 650 ARG_PROPERTY,
6dac160c 651 ARG_PRIVATE_USERS,
c6c8f6e2 652 ARG_KILL_SIGNAL,
f757855e 653 ARG_SETTINGS,
5f932eb9 654 ARG_CHDIR,
b53ede69 655 ARG_PIVOT_ROOT,
7336138e 656 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 657 ARG_NOTIFY_READY,
4623e8e6 658 ARG_ROOT_HASH,
960e4569 659 ARG_SYSTEM_CALL_FILTER,
bf428efb 660 ARG_RLIMIT,
3a9530e5 661 ARG_HOSTNAME,
66edd963 662 ARG_NO_NEW_PRIVILEGES,
81f345df 663 ARG_OOM_SCORE_ADJUST,
d107bb7d 664 ARG_CPU_AFFINITY,
09d423e9 665 ARG_RESOLV_CONF,
1688841f 666 ARG_TIMEZONE,
de40a303
LP
667 ARG_CONSOLE,
668 ARG_PIPE,
669 ARG_OCI_BUNDLE,
bb068de0 670 ARG_NO_PAGER,
a41fe3a2
LP
671 };
672
88213476 673 static const struct option options[] = {
d7bea6b6
DP
674 { "help", no_argument, NULL, 'h' },
675 { "version", no_argument, NULL, ARG_VERSION },
676 { "directory", required_argument, NULL, 'D' },
677 { "template", required_argument, NULL, ARG_TEMPLATE },
678 { "ephemeral", no_argument, NULL, 'x' },
679 { "user", required_argument, NULL, 'u' },
680 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
681 { "as-pid2", no_argument, NULL, 'a' },
682 { "boot", no_argument, NULL, 'b' },
683 { "uuid", required_argument, NULL, ARG_UUID },
684 { "read-only", no_argument, NULL, ARG_READ_ONLY },
685 { "capability", required_argument, NULL, ARG_CAPABILITY },
686 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 687 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
688 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
689 { "bind", required_argument, NULL, ARG_BIND },
690 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
691 { "tmpfs", required_argument, NULL, ARG_TMPFS },
692 { "overlay", required_argument, NULL, ARG_OVERLAY },
693 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 694 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 695 { "machine", required_argument, NULL, 'M' },
3a9530e5 696 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
697 { "slice", required_argument, NULL, 'S' },
698 { "setenv", required_argument, NULL, 'E' },
699 { "selinux-context", required_argument, NULL, 'Z' },
700 { "selinux-apifs-context", required_argument, NULL, 'L' },
701 { "quiet", no_argument, NULL, 'q' },
702 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
703 { "register", required_argument, NULL, ARG_REGISTER },
704 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
705 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
706 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
707 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
708 { "network-veth", no_argument, NULL, 'n' },
709 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
710 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
711 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
712 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
713 { "personality", required_argument, NULL, ARG_PERSONALITY },
714 { "image", required_argument, NULL, 'i' },
715 { "volatile", optional_argument, NULL, ARG_VOLATILE },
716 { "port", required_argument, NULL, 'p' },
717 { "property", required_argument, NULL, ARG_PROPERTY },
718 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
719 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
720 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
721 { "settings", required_argument, NULL, ARG_SETTINGS },
722 { "chdir", required_argument, NULL, ARG_CHDIR },
723 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
724 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
725 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
726 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 727 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 728 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 729 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 730 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 731 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
732 { "console", required_argument, NULL, ARG_CONSOLE },
733 { "pipe", no_argument, NULL, ARG_PIPE },
734 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 735 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
eb9da376 736 {}
88213476
LP
737 };
738
9444b1f2 739 int c, r;
a42c8b54 740 uint64_t plus = 0, minus = 0;
f757855e 741 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
742
743 assert(argc >= 0);
744 assert(argv);
745
de40a303 746 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
747 switch (c) {
748
749 case 'h':
37ec0fdd 750 return help();
88213476 751
acbeb427 752 case ARG_VERSION:
3f6fd1ba 753 return version();
acbeb427 754
88213476 755 case 'D':
0f03c2a4 756 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 757 if (r < 0)
0f03c2a4 758 return r;
de40a303
LP
759
760 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
761 break;
762
763 case ARG_TEMPLATE:
0f03c2a4 764 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 765 if (r < 0)
0f03c2a4 766 return r;
de40a303
LP
767
768 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
769 break;
770
1b9e5b12 771 case 'i':
0f03c2a4 772 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 773 if (r < 0)
0f03c2a4 774 return r;
de40a303
LP
775
776 arg_settings_mask |= SETTING_DIRECTORY;
777 break;
778
779 case ARG_OCI_BUNDLE:
780 r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
781 if (r < 0)
782 return r;
783
ec16945e
LP
784 break;
785
786 case 'x':
787 arg_ephemeral = true;
a2f577fc 788 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
789 break;
790
687d0825 791 case 'u':
2fc09a9c
DM
792 r = free_and_strdup(&arg_user, optarg);
793 if (r < 0)
7027ff61 794 return log_oom();
687d0825 795
f757855e 796 arg_settings_mask |= SETTING_USER;
687d0825
MV
797 break;
798
22b28dfd
LP
799 case ARG_NETWORK_ZONE: {
800 char *j;
801
b910cc72 802 j = strjoin("vz-", optarg);
22b28dfd
LP
803 if (!j)
804 return log_oom();
805
806 if (!ifname_valid(j)) {
807 log_error("Network zone name not valid: %s", j);
808 free(j);
809 return -EINVAL;
810 }
811
df1fac6d 812 free_and_replace(arg_network_zone, j);
22b28dfd
LP
813
814 arg_network_veth = true;
815 arg_private_network = true;
816 arg_settings_mask |= SETTING_NETWORK;
817 break;
818 }
819
ab046dde 820 case ARG_NETWORK_BRIDGE:
ef76dff2 821
baaa35ad
ZJS
822 if (!ifname_valid(optarg))
823 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
824 "Bridge interface name not valid: %s", optarg);
ef76dff2 825
f757855e
LP
826 r = free_and_strdup(&arg_network_bridge, optarg);
827 if (r < 0)
828 return log_oom();
ab046dde 829
4831981d 830 _fallthrough_;
0dfaa006 831 case 'n':
69c79d3c
LP
832 arg_network_veth = true;
833 arg_private_network = true;
f757855e 834 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
835 break;
836
f6d6bad1
LP
837 case ARG_NETWORK_VETH_EXTRA:
838 r = veth_extra_parse(&arg_network_veth_extra, optarg);
839 if (r < 0)
840 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
841
842 arg_private_network = true;
843 arg_settings_mask |= SETTING_NETWORK;
844 break;
845
aa28aefe 846 case ARG_NETWORK_INTERFACE:
baaa35ad
ZJS
847 if (!ifname_valid(optarg))
848 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
849 "Network interface name not valid: %s", optarg);
ef76dff2 850
c74e630d
LP
851 if (strv_extend(&arg_network_interfaces, optarg) < 0)
852 return log_oom();
853
854 arg_private_network = true;
f757855e 855 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
856 break;
857
858 case ARG_NETWORK_MACVLAN:
ef76dff2 859
baaa35ad
ZJS
860 if (!ifname_valid(optarg))
861 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
862 "MACVLAN network interface name not valid: %s", optarg);
ef76dff2 863
c74e630d 864 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
865 return log_oom();
866
4bbfe7ad 867 arg_private_network = true;
f757855e 868 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
869 break;
870
871 case ARG_NETWORK_IPVLAN:
ef76dff2 872
baaa35ad
ZJS
873 if (!ifname_valid(optarg))
874 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
875 "IPVLAN network interface name not valid: %s", optarg);
ef76dff2 876
4bbfe7ad
TG
877 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
878 return log_oom();
879
4831981d 880 _fallthrough_;
ff01d048
LP
881 case ARG_PRIVATE_NETWORK:
882 arg_private_network = true;
f757855e 883 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
884 break;
885
d7bea6b6
DP
886 case ARG_NETWORK_NAMESPACE_PATH:
887 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
888 if (r < 0)
889 return r;
890
de40a303 891 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
892 break;
893
0f0dbc46 894 case 'b':
baaa35ad
ZJS
895 if (arg_start_mode == START_PID2)
896 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
897 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
898
899 arg_start_mode = START_BOOT;
900 arg_settings_mask |= SETTING_START_MODE;
901 break;
902
903 case 'a':
baaa35ad
ZJS
904 if (arg_start_mode == START_BOOT)
905 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
906 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
907
908 arg_start_mode = START_PID2;
909 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
910 break;
911
144f0fc0 912 case ARG_UUID:
9444b1f2 913 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
914 if (r < 0)
915 return log_error_errno(r, "Invalid UUID: %s", optarg);
916
baaa35ad
ZJS
917 if (sd_id128_is_null(arg_uuid))
918 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
919 "Machine UUID may not be all zeroes.");
f757855e
LP
920
921 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 922 break;
aa96c6cb 923
43c3fb46
LP
924 case 'S': {
925 _cleanup_free_ char *mangled = NULL;
926
927 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
928 if (r < 0)
929 return log_oom();
930
43c3fb46 931 free_and_replace(arg_slice, mangled);
de40a303 932 arg_settings_mask |= SETTING_SLICE;
144f0fc0 933 break;
43c3fb46 934 }
144f0fc0 935
7027ff61 936 case 'M':
c1521918 937 if (isempty(optarg))
97b11eed 938 arg_machine = mfree(arg_machine);
c1521918 939 else {
baaa35ad
ZJS
940 if (!machine_name_is_valid(optarg))
941 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
942 "Invalid machine name: %s", optarg);
7027ff61 943
0c3c4284
LP
944 r = free_and_strdup(&arg_machine, optarg);
945 if (r < 0)
eb91eb18 946 return log_oom();
eb91eb18 947 }
9ce6d1b3 948 break;
7027ff61 949
3a9530e5
LP
950 case ARG_HOSTNAME:
951 if (isempty(optarg))
952 arg_hostname = mfree(arg_hostname);
953 else {
baaa35ad
ZJS
954 if (!hostname_is_valid(optarg, false))
955 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
956 "Invalid hostname: %s", optarg);
3a9530e5
LP
957
958 r = free_and_strdup(&arg_hostname, optarg);
959 if (r < 0)
960 return log_oom();
961 }
962
963 arg_settings_mask |= SETTING_HOSTNAME;
964 break;
965
82adf6af
LP
966 case 'Z':
967 arg_selinux_context = optarg;
a8828ed9
DW
968 break;
969
82adf6af
LP
970 case 'L':
971 arg_selinux_apifs_context = optarg;
a8828ed9
DW
972 break;
973
bc2f673e
LP
974 case ARG_READ_ONLY:
975 arg_read_only = true;
f757855e 976 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
977 break;
978
420c7379
LP
979 case ARG_CAPABILITY:
980 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
981 uint64_t m;
982 r = parse_capability_spec(optarg, &m);
983 if (r <= 0)
984 return r;
5076f0cc 985
8a99bd0c
ZJS
986 if (c == ARG_CAPABILITY)
987 plus |= m;
988 else
989 minus |= m;
f757855e 990 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
991 break;
992 }
66edd963
LP
993 case ARG_NO_NEW_PRIVILEGES:
994 r = parse_boolean(optarg);
995 if (r < 0)
996 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
997
998 arg_no_new_privileges = r;
999 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1000 break;
1001
57fb9fb5
LP
1002 case 'j':
1003 arg_link_journal = LINK_GUEST;
574edc90 1004 arg_link_journal_try = true;
4e1d6aa9 1005 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1006 break;
1007
1008 case ARG_LINK_JOURNAL:
4e1d6aa9 1009 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1010 if (r < 0)
1011 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1012
4e1d6aa9 1013 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1014 break;
1015
17fe0523 1016 case ARG_BIND:
f757855e
LP
1017 case ARG_BIND_RO:
1018 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1019 if (r < 0)
1020 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1021
f757855e 1022 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1023 break;
06c17c39 1024
f757855e
LP
1025 case ARG_TMPFS:
1026 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1027 if (r < 0)
1028 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1029
f757855e 1030 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1031 break;
5a8af538
LP
1032
1033 case ARG_OVERLAY:
ad85779a
LP
1034 case ARG_OVERLAY_RO:
1035 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1036 if (r == -EADDRNOTAVAIL)
1037 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1038 if (r < 0)
1039 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1040
f757855e 1041 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1042 break;
06c17c39 1043
de40a303
LP
1044 case ARG_INACCESSIBLE:
1045 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1046 if (r < 0)
1047 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1048
1049 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1050 break;
1051
a5f1cb3b 1052 case 'E': {
f4889f65
LP
1053 char **n;
1054
baaa35ad
ZJS
1055 if (!env_assignment_is_valid(optarg))
1056 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1057 "Environment variable assignment '%s' is not valid.", optarg);
f4889f65
LP
1058
1059 n = strv_env_set(arg_setenv, optarg);
1060 if (!n)
1061 return log_oom();
1062
130d3d22 1063 strv_free_and_replace(arg_setenv, n);
f757855e 1064 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
1065 break;
1066 }
1067
284c0b91
LP
1068 case 'q':
1069 arg_quiet = true;
1070 break;
1071
8a96d94e 1072 case ARG_SHARE_SYSTEM:
a6b5216c 1073 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1074 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1075 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1076 arg_clone_ns_flags = 0;
8a96d94e
LP
1077 break;
1078
eb91eb18
LP
1079 case ARG_REGISTER:
1080 r = parse_boolean(optarg);
1081 if (r < 0) {
1082 log_error("Failed to parse --register= argument: %s", optarg);
1083 return r;
1084 }
1085
1086 arg_register = r;
1087 break;
1088
89f7c846
LP
1089 case ARG_KEEP_UNIT:
1090 arg_keep_unit = true;
1091 break;
1092
6afc95b7
LP
1093 case ARG_PERSONALITY:
1094
ac45f971 1095 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1096 if (arg_personality == PERSONALITY_INVALID)
1097 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1098 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1099
f757855e 1100 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1101 break;
1102
4d9f07b4
LP
1103 case ARG_VOLATILE:
1104
1105 if (!optarg)
f757855e 1106 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1107 else if (streq(optarg, "help")) {
1108 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1109 return 0;
1110 } else {
f757855e 1111 VolatileMode m;
4d9f07b4 1112
f757855e 1113 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1114 if (m < 0)
1115 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1116 "Failed to parse --volatile= argument: %s", optarg);
1117 else
f757855e 1118 arg_volatile_mode = m;
6d0b55c2
LP
1119 }
1120
f757855e
LP
1121 arg_settings_mask |= SETTING_VOLATILE_MODE;
1122 break;
6d0b55c2 1123
f757855e
LP
1124 case 'p':
1125 r = expose_port_parse(&arg_expose_ports, optarg);
1126 if (r == -EEXIST)
1127 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1128 if (r < 0)
1129 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1130
f757855e 1131 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1132 break;
6d0b55c2 1133
f36933fe
LP
1134 case ARG_PROPERTY:
1135 if (strv_extend(&arg_property, optarg) < 0)
1136 return log_oom();
1137
1138 break;
1139
ae209204
ZJS
1140 case ARG_PRIVATE_USERS: {
1141 int boolean = -1;
0de7acce 1142
ae209204
ZJS
1143 if (!optarg)
1144 boolean = true;
1145 else if (!in_charset(optarg, DIGITS))
1146 /* do *not* parse numbers as booleans */
1147 boolean = parse_boolean(optarg);
1148
1149 if (boolean == false) {
0de7acce
LP
1150 /* no: User namespacing off */
1151 arg_userns_mode = USER_NAMESPACE_NO;
1152 arg_uid_shift = UID_INVALID;
1153 arg_uid_range = UINT32_C(0x10000);
ae209204 1154 } else if (boolean == true) {
0de7acce
LP
1155 /* yes: User namespacing on, UID range is read from root dir */
1156 arg_userns_mode = USER_NAMESPACE_FIXED;
1157 arg_uid_shift = UID_INVALID;
1158 arg_uid_range = UINT32_C(0x10000);
1159 } else if (streq(optarg, "pick")) {
1160 /* pick: User namespacing on, UID range is picked randomly */
1161 arg_userns_mode = USER_NAMESPACE_PICK;
1162 arg_uid_shift = UID_INVALID;
1163 arg_uid_range = UINT32_C(0x10000);
1164 } else {
6c2058b3 1165 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1166 const char *range, *shift;
1167
0de7acce
LP
1168 /* anything else: User namespacing on, UID range is explicitly configured */
1169
6dac160c
LP
1170 range = strchr(optarg, ':');
1171 if (range) {
6c2058b3
ZJS
1172 buffer = strndup(optarg, range - optarg);
1173 if (!buffer)
1174 return log_oom();
1175 shift = buffer;
6dac160c
LP
1176
1177 range++;
bfd292ec
ZJS
1178 r = safe_atou32(range, &arg_uid_range);
1179 if (r < 0)
be715731 1180 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1181 } else
1182 shift = optarg;
1183
be715731
ZJS
1184 r = parse_uid(shift, &arg_uid_shift);
1185 if (r < 0)
1186 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1187
1188 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
1189 }
1190
baaa35ad
ZJS
1191 if (arg_uid_range <= 0)
1192 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1193 "UID range cannot be 0.");
be715731 1194
0de7acce 1195 arg_settings_mask |= SETTING_USERNS;
6dac160c 1196 break;
ae209204 1197 }
6dac160c 1198
0de7acce 1199 case 'U':
ccabee0d
LP
1200 if (userns_supported()) {
1201 arg_userns_mode = USER_NAMESPACE_PICK;
1202 arg_uid_shift = UID_INVALID;
1203 arg_uid_range = UINT32_C(0x10000);
1204
1205 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1206 }
1207
7336138e
LP
1208 break;
1209
0de7acce 1210 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1211 arg_userns_chown = true;
0de7acce
LP
1212
1213 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1214 break;
1215
c6c8f6e2 1216 case ARG_KILL_SIGNAL:
5c828e66
LP
1217 if (streq(optarg, "help")) {
1218 DUMP_STRING_TABLE(signal, int, _NSIG);
1219 return 0;
1220 }
1221
29a3db75 1222 arg_kill_signal = signal_from_string(optarg);
baaa35ad
ZJS
1223 if (arg_kill_signal < 0)
1224 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1225 "Cannot parse signal: %s", optarg);
c6c8f6e2 1226
f757855e
LP
1227 arg_settings_mask |= SETTING_KILL_SIGNAL;
1228 break;
1229
1230 case ARG_SETTINGS:
1231
1232 /* no → do not read files
1233 * yes → read files, do not override cmdline, trust only subset
1234 * override → read files, override cmdline, trust only subset
1235 * trusted → read files, do not override cmdline, trust all
1236 */
1237
1238 r = parse_boolean(optarg);
1239 if (r < 0) {
1240 if (streq(optarg, "trusted")) {
1241 mask_all_settings = false;
1242 mask_no_settings = false;
1243 arg_settings_trusted = true;
1244
1245 } else if (streq(optarg, "override")) {
1246 mask_all_settings = false;
1247 mask_no_settings = true;
1248 arg_settings_trusted = -1;
1249 } else
1250 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1251 } else if (r > 0) {
1252 /* yes */
1253 mask_all_settings = false;
1254 mask_no_settings = false;
1255 arg_settings_trusted = -1;
1256 } else {
1257 /* no */
1258 mask_all_settings = true;
1259 mask_no_settings = false;
1260 arg_settings_trusted = false;
1261 }
1262
c6c8f6e2
LP
1263 break;
1264
5f932eb9 1265 case ARG_CHDIR:
baaa35ad
ZJS
1266 if (!path_is_absolute(optarg))
1267 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1268 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1269
1270 r = free_and_strdup(&arg_chdir, optarg);
1271 if (r < 0)
1272 return log_oom();
1273
1274 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1275 break;
1276
b53ede69
PW
1277 case ARG_PIVOT_ROOT:
1278 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1279 if (r < 0)
1280 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1281
1282 arg_settings_mask |= SETTING_PIVOT_ROOT;
1283 break;
1284
9c1e04d0
AP
1285 case ARG_NOTIFY_READY:
1286 r = parse_boolean(optarg);
baaa35ad
ZJS
1287 if (r < 0)
1288 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1289 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1290 arg_notify_ready = r;
1291 arg_settings_mask |= SETTING_NOTIFY_READY;
1292 break;
1293
4623e8e6
LP
1294 case ARG_ROOT_HASH: {
1295 void *k;
1296 size_t l;
1297
1298 r = unhexmem(optarg, strlen(optarg), &k, &l);
1299 if (r < 0)
1300 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1301 if (l < sizeof(sd_id128_t)) {
4623e8e6 1302 free(k);
c6147113 1303 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
4623e8e6
LP
1304 }
1305
1306 free(arg_root_hash);
1307 arg_root_hash = k;
1308 arg_root_hash_size = l;
1309 break;
1310 }
1311
960e4569
LP
1312 case ARG_SYSTEM_CALL_FILTER: {
1313 bool negative;
1314 const char *items;
1315
1316 negative = optarg[0] == '~';
1317 items = negative ? optarg + 1 : optarg;
1318
1319 for (;;) {
1320 _cleanup_free_ char *word = NULL;
1321
1322 r = extract_first_word(&items, &word, NULL, 0);
1323 if (r == 0)
1324 break;
1325 if (r == -ENOMEM)
1326 return log_oom();
1327 if (r < 0)
1328 return log_error_errno(r, "Failed to parse system call filter: %m");
1329
1330 if (negative)
1331 r = strv_extend(&arg_syscall_blacklist, word);
1332 else
1333 r = strv_extend(&arg_syscall_whitelist, word);
1334 if (r < 0)
1335 return log_oom();
1336 }
1337
1338 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1339 break;
1340 }
1341
bf428efb
LP
1342 case ARG_RLIMIT: {
1343 const char *eq;
622ecfa8 1344 _cleanup_free_ char *name = NULL;
bf428efb
LP
1345 int rl;
1346
5c828e66
LP
1347 if (streq(optarg, "help")) {
1348 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1349 return 0;
1350 }
1351
bf428efb 1352 eq = strchr(optarg, '=');
baaa35ad
ZJS
1353 if (!eq)
1354 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1355 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1356
1357 name = strndup(optarg, eq - optarg);
1358 if (!name)
1359 return log_oom();
1360
1361 rl = rlimit_from_string_harder(name);
baaa35ad
ZJS
1362 if (rl < 0)
1363 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1364 "Unknown resource limit: %s", name);
bf428efb
LP
1365
1366 if (!arg_rlimit[rl]) {
1367 arg_rlimit[rl] = new0(struct rlimit, 1);
1368 if (!arg_rlimit[rl])
1369 return log_oom();
1370 }
1371
1372 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1373 if (r < 0)
1374 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1375
1376 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1377 break;
1378 }
1379
81f345df
LP
1380 case ARG_OOM_SCORE_ADJUST:
1381 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1382 if (r < 0)
1383 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1384
1385 arg_oom_score_adjust_set = true;
1386 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1387 break;
1388
d107bb7d 1389 case ARG_CPU_AFFINITY: {
0985c7c4 1390 CPUSet cpuset;
d107bb7d
LP
1391
1392 r = parse_cpu_set(optarg, &cpuset);
1393 if (r < 0)
0985c7c4 1394 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1395
0985c7c4
ZJS
1396 cpu_set_reset(&arg_cpu_set);
1397 arg_cpu_set = cpuset;
d107bb7d
LP
1398 arg_settings_mask |= SETTING_CPU_AFFINITY;
1399 break;
1400 }
1401
09d423e9
LP
1402 case ARG_RESOLV_CONF:
1403 if (streq(optarg, "help")) {
1404 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1405 return 0;
1406 }
1407
1408 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad
ZJS
1409 if (arg_resolv_conf < 0)
1410 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1411 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1412
1413 arg_settings_mask |= SETTING_RESOLV_CONF;
1414 break;
1415
1688841f
LP
1416 case ARG_TIMEZONE:
1417 if (streq(optarg, "help")) {
1418 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1419 return 0;
1420 }
1421
1422 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad
ZJS
1423 if (arg_timezone < 0)
1424 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1425 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1426
1427 arg_settings_mask |= SETTING_TIMEZONE;
1428 break;
1429
de40a303 1430 case ARG_CONSOLE:
dce66ffe
ZJS
1431 r = handle_arg_console(optarg);
1432 if (r <= 0)
1433 return r;
de40a303
LP
1434 break;
1435
1436 case 'P':
1437 case ARG_PIPE:
dce66ffe
ZJS
1438 r = handle_arg_console("pipe");
1439 if (r <= 0)
1440 return r;
de40a303
LP
1441 break;
1442
bb068de0
ZJS
1443 case ARG_NO_PAGER:
1444 arg_pager_flags |= PAGER_DISABLE;
1445 break;
1446
88213476
LP
1447 case '?':
1448 return -EINVAL;
1449
1450 default:
eb9da376 1451 assert_not_reached("Unhandled option");
88213476 1452 }
88213476 1453
60f1ec13
LP
1454 if (argc > optind) {
1455 strv_free(arg_parameters);
1456 arg_parameters = strv_copy(argv + optind);
1457 if (!arg_parameters)
1458 return log_oom();
d7bea6b6 1459
60f1ec13
LP
1460 arg_settings_mask |= SETTING_START_MODE;
1461 }
1462
1463 if (arg_ephemeral && arg_template && !arg_directory)
1464 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1465 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1466 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1467 * --directory=". */
1468 arg_directory = TAKE_PTR(arg_template);
1469
bd4b15f2 1470 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
60f1ec13 1471
de40a303 1472 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1473 r = parse_environment();
1474 if (r < 0)
1475 return r;
de40a303 1476
60f1ec13
LP
1477 /* Load all settings from .nspawn files */
1478 if (mask_no_settings)
1479 arg_settings_mask = 0;
1480
1481 /* Don't load any settings from .nspawn files */
1482 if (mask_all_settings)
1483 arg_settings_mask = _SETTINGS_MASK_ALL;
1484
1485 return 1;
1486}
1487
1488static int verify_arguments(void) {
1489 int r;
a6b5216c 1490
75b0d8b8
ZJS
1491 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1492 /* If we are running the stub init in the container, we don't need to look at what the init
1493 * in the container supports, because we are not using it. Let's immediately pick the right
1494 * setting based on the host system configuration.
1495 *
1496 * We only do this, if the user didn't use an environment variable to override the detection.
1497 */
1498
1499 r = cg_all_unified();
1500 if (r < 0)
1501 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1502 if (r > 0)
1503 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1504 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1505 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1506 else
1507 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1508 }
1509
4f086aab
SU
1510 if (arg_userns_mode != USER_NAMESPACE_NO)
1511 arg_mount_settings |= MOUNT_USE_USERNS;
1512
1513 if (arg_private_network)
1514 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1515
48a8d337
LB
1516 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1517 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1518 arg_register = false;
baaa35ad 1519 if (arg_start_mode != START_PID1)
60f1ec13 1520 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1521 }
eb91eb18 1522
0de7acce 1523 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1524 arg_userns_chown = true;
1525
60f1ec13
LP
1526 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1527 arg_kill_signal = SIGRTMIN+3;
1528
e5a4bb0d
LP
1529 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1530 arg_read_only = true;
1531
baaa35ad 1532 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1533 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1534 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1535 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1536
baaa35ad 1537 if (arg_directory && arg_image)
60f1ec13 1538 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1539
baaa35ad 1540 if (arg_template && arg_image)
60f1ec13 1541 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1542
baaa35ad 1543 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1544 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1545
baaa35ad 1546 if (arg_ephemeral && arg_template)
60f1ec13 1547 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1548
baaa35ad 1549 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1550 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1551
baaa35ad 1552 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1553 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1554
baaa35ad 1555 if (arg_userns_chown && arg_read_only)
de40a303
LP
1556 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1557 "--read-only and --private-users-chown may not be combined.");
f757855e 1558
e5a4bb0d
LP
1559 /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1560 * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
5238e957 1561 * copy-up (in case of overlay) making the entire exercise pointless. */
e5a4bb0d
LP
1562 if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1563 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1564
de40a303
LP
1565 /* If --network-namespace-path is given with any other network-related option, we need to error out,
1566 * to avoid conflicts between different network options. */
60f1ec13
LP
1567 if (arg_network_namespace_path &&
1568 (arg_network_interfaces || arg_network_macvlan ||
1569 arg_network_ipvlan || arg_network_veth_extra ||
1570 arg_network_bridge || arg_network_zone ||
1571 arg_network_veth || arg_private_network))
de40a303 1572 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1573
60f1ec13 1574 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1575 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1576 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1577
baaa35ad 1578 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1579 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1580
baaa35ad 1581 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1582 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1583
baaa35ad 1584 if (arg_expose_ports && !arg_private_network)
60f1ec13 1585 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1586
349cc4a5 1587#if ! HAVE_LIBIPTC
baaa35ad 1588 if (arg_expose_ports)
60f1ec13 1589 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1c1ea217
EV
1590#endif
1591
60f1ec13
LP
1592 r = custom_mount_check_all();
1593 if (r < 0)
1594 return r;
c6c8f6e2 1595
f757855e 1596 return 0;
88213476
LP
1597}
1598
03cfe0d5
LP
1599static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1600 assert(p);
1601
0de7acce 1602 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1603 return 0;
1604
1605 if (uid == UID_INVALID && gid == GID_INVALID)
1606 return 0;
1607
1608 if (uid != UID_INVALID) {
1609 uid += arg_uid_shift;
1610
1611 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1612 return -EOVERFLOW;
1613 }
1614
1615 if (gid != GID_INVALID) {
1616 gid += (gid_t) arg_uid_shift;
1617
1618 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1619 return -EOVERFLOW;
1620 }
1621
1622 if (lchown(p, uid, gid) < 0)
1623 return -errno;
b12afc8c
LP
1624
1625 return 0;
1626}
1627
03cfe0d5
LP
/* Creates the directory 'path' below 'root' with the given mode and, if it was newly created, hands
 * it to userns_lchown() with the given UID/GID. A directory that already exists is left alone and
 * counts as success (0). Any other mkdir failure is returned as-is. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int k;

        full = prefix_roota(root, path);

        k = mkdir_errno_wrapper(full, mode);
        if (k >= 0)
                /* Freshly created, fix up ownership */
                return userns_lchown(full, uid, gid);

        return k == -EEXIST ? 0 : k;
}
1641
/* Matches 'path' against the two zoneinfo prefixes "../usr/share/zoneinfo/" and
 * "/usr/share/zoneinfo/" and returns whatever PATH_STARTSWITH_SET() yields for them. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path, "../usr/share/zoneinfo/", "/usr/share/zoneinfo/");

        return zone;
}
1648
83205269
LP
1649static bool etc_writable(void) {
1650 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1651}
1652
e58a1277 1653static int setup_timezone(const char *dest) {
1688841f
LP
1654 _cleanup_free_ char *p = NULL, *etc = NULL;
1655 const char *where, *check;
1656 TimezoneMode m;
d4036145 1657 int r;
f8440af5 1658
e58a1277
LP
1659 assert(dest);
1660
1688841f 1661 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1662 r = readlink_malloc("/etc/localtime", &p);
1663 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1664 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1665 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1666 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1667 else if (r < 0) {
1668 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1669 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1670 * file.
1671 *
1672 * Example:
1673 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1674 */
1675 return 0;
1676 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1677 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1678 else
1679 m = arg_timezone;
1680 } else
1681 m = arg_timezone;
1682
1683 if (m == TIMEZONE_OFF)
1684 return 0;
1685
a5648b80 1686 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1687 if (r < 0) {
1688841f 1688 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1689 return 0;
1690 }
1691
1688841f
LP
1692 where = strjoina(etc, "/localtime");
1693
1694 switch (m) {
1695
1696 case TIMEZONE_DELETE:
1697 if (unlink(where) < 0)
1698 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1699
d4036145 1700 return 0;
d4036145 1701
1688841f
LP
1702 case TIMEZONE_SYMLINK: {
1703 _cleanup_free_ char *q = NULL;
1704 const char *z, *what;
4d1c38b8 1705
1688841f
LP
1706 z = timezone_from_path(p);
1707 if (!z) {
1708 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1709 return 0;
1688841f 1710 }
d4036145 1711
1688841f
LP
1712 r = readlink_malloc(where, &q);
1713 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1714 return 0; /* Already pointing to the right place? Then do nothing .. */
1715
1716 check = strjoina(dest, "/usr/share/zoneinfo/", z);
a5648b80 1717 r = chase_symlinks(check, dest, 0, NULL, NULL);
1688841f
LP
1718 if (r < 0)
1719 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1720 else {
1721 if (unlink(where) < 0 && errno != ENOENT) {
1722 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1723 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1724 return 0;
1725 }
1726
1727 what = strjoina("../usr/share/zoneinfo/", z);
1728 if (symlink(what, where) < 0) {
1729 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1730 errno, "Failed to correct timezone of container, ignoring: %m");
1731 return 0;
1732 }
1733
1734 break;
1735 }
1736
1737 _fallthrough_;
d4036145 1738 }
68fb0892 1739
1688841f
LP
1740 case TIMEZONE_BIND: {
1741 _cleanup_free_ char *resolved = NULL;
1742 int found;
1743
a5648b80 1744 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
1745 if (found < 0) {
1746 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1747 return 0;
1748 }
1749
1750 if (found == 0) /* missing? */
1751 (void) touch(resolved);
1752
1753 r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1754 if (r >= 0)
1755 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1756
1757 _fallthrough_;
79d80fc1 1758 }
4d9f07b4 1759
1688841f
LP
1760 case TIMEZONE_COPY:
1761 /* If mounting failed, try to copy */
8a016c74 1762 r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
1763 if (r < 0) {
1764 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1765 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1766 return 0;
1767 }
1768
1769 break;
1770
1771 default:
1772 assert_not_reached("unexpected mode");
d4036145 1773 }
e58a1277 1774
1688841f 1775 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
1776 r = userns_lchown(where, 0, 0);
1777 if (r < 0)
1688841f 1778 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 1779
e58a1277 1780 return 0;
88213476
LP
1781}
1782
09d423e9
LP
/* Tests whether 'path' exists on the host.
 *
 * Returns 1 if access(F_OK) succeeds and 0 if it fails with ENOENT. Any other failure is logged at debug
 * level and the result of log_debug_errno() for that errno is returned. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1795
7357272e 1796static int resolved_listening(void) {
b8ea7a6e 1797 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 1798 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1799 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1800 int r;
1801
7357272e 1802 /* Check if resolved is listening */
b053cd5f
LP
1803
1804 r = sd_bus_open_system(&bus);
1805 if (r < 0)
b8ea7a6e 1806 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 1807
7357272e 1808 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
1809 if (r < 0)
1810 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1811 if (r == 0)
1812 return 0;
7357272e
DM
1813
1814 r = sd_bus_get_property_string(bus,
1815 "org.freedesktop.resolve1",
1816 "/org/freedesktop/resolve1",
1817 "org.freedesktop.resolve1.Manager",
1818 "DNSStubListener",
b8ea7a6e 1819 &error,
7357272e
DM
1820 &dns_stub_listener_mode);
1821 if (r < 0)
b8ea7a6e 1822 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
1823
1824 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
1825}
1826
2547bb41 1827static int setup_resolv_conf(const char *dest) {
09d423e9
LP
1828 _cleanup_free_ char *etc = NULL;
1829 const char *where, *what;
1830 ResolvConfMode m;
1831 int r;
2547bb41
LP
1832
1833 assert(dest);
1834
09d423e9
LP
1835 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1836 if (arg_private_network)
1837 m = RESOLV_CONF_OFF;
1838 else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
83205269 1839 m = etc_writable() ? RESOLV_CONF_COPY_STATIC : RESOLV_CONF_BIND_STATIC;
09d423e9 1840 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 1841 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 1842 else
83205269 1843 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
09d423e9
LP
1844 } else
1845 m = arg_resolv_conf;
1846
1847 if (m == RESOLV_CONF_OFF)
2547bb41
LP
1848 return 0;
1849
a5648b80 1850 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
1851 if (r < 0) {
1852 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1853 return 0;
1854 }
1855
1856 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
1857
1858 if (m == RESOLV_CONF_DELETE) {
1859 if (unlink(where) < 0)
1860 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1861
87447ae4
LP
1862 return 0;
1863 }
79d80fc1 1864
09d423e9
LP
1865 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_COPY_STATIC))
1866 what = STATIC_RESOLV_CONF;
1867 else
1868 what = "/etc/resolv.conf";
87447ae4 1869
09d423e9
LP
1870 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC)) {
1871 _cleanup_free_ char *resolved = NULL;
1872 int found;
1873
a5648b80 1874 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
09d423e9
LP
1875 if (found < 0) {
1876 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1877 return 0;
1878 }
3539724c 1879
87447ae4
LP
1880 if (found == 0) /* missing? */
1881 (void) touch(resolved);
5367354d 1882
09d423e9 1883 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 1884 if (r >= 0)
87447ae4 1885 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1886 }
1887
1888 /* If that didn't work, let's copy the file */
8a016c74 1889 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
79d80fc1 1890 if (r < 0) {
3539724c
LP
1891 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1892 * resolved or something similar runs inside and the symlink points there.
68a313c5 1893 *
3539724c 1894 * If the disk image is read-only, there's also no point in complaining.
68a313c5 1895 */
09d423e9 1896 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC) && IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1897 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1898 return 0;
1899 }
2547bb41 1900
03cfe0d5
LP
1901 r = userns_lchown(where, 0, 0);
1902 if (r < 0)
3539724c 1903 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1904
2547bb41
LP
1905 return 0;
1906}
1907
1e4f1671 1908static int setup_boot_id(void) {
cdde6ba6
LP
1909 _cleanup_(unlink_and_freep) char *from = NULL;
1910 _cleanup_free_ char *path = NULL;
3bbaff3e 1911 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 1912 const char *to;
04bc4a3f
LP
1913 int r;
1914
1eacc470 1915 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 1916
1eacc470 1917 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
1918 if (r < 0)
1919 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
1920
1921 r = sd_id128_randomize(&rnd);
f647962d
MS
1922 if (r < 0)
1923 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1924
cdde6ba6 1925 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
1926 if (r < 0)
1927 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1928
cdde6ba6
LP
1929 from = TAKE_PTR(path);
1930 to = "/proc/sys/kernel/random/boot_id";
1931
60e76d48 1932 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
1933 if (r < 0)
1934 return r;
04bc4a3f 1935
cdde6ba6 1936 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
1937}
1938
e58a1277 1939static int copy_devnodes(const char *dest) {
88213476
LP
1940 static const char devnodes[] =
1941 "null\0"
1942 "zero\0"
1943 "full\0"
1944 "random\0"
1945 "urandom\0"
85614d66
TG
1946 "tty\0"
1947 "net/tun\0";
88213476 1948
de40a303 1949 _cleanup_umask_ mode_t u;
88213476 1950 const char *d;
e58a1277 1951 int r = 0;
a258bf26
LP
1952
1953 assert(dest);
124640f1
LP
1954
1955 u = umask(0000);
88213476 1956
03cfe0d5
LP
1957 /* Create /dev/net, so that we can create /dev/net/tun in it */
1958 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1959 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1960
88213476 1961 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1962 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1963 struct stat st;
88213476 1964
c6134d3e 1965 from = path_join("/dev/", d);
8967f291
LP
1966 if (!from)
1967 return log_oom();
1968
c6134d3e 1969 to = path_join(dest, from);
8967f291
LP
1970 if (!to)
1971 return log_oom();
88213476
LP
1972
1973 if (stat(from, &st) < 0) {
1974
4a62c710
MS
1975 if (errno != ENOENT)
1976 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1977
baaa35ad
ZJS
1978 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
1979 return log_error_errno(SYNTHETIC_ERRNO(EIO),
1980 "%s is not a char or block device, cannot copy.", from);
1981 else {
8dfce114
LP
1982 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
1983
81f5049b 1984 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 1985 /* Explicitly warn the user when /dev is already populated. */
41eb4362 1986 if (errno == EEXIST)
8dbf71ec 1987 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
1988 if (errno != EPERM)
1989 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1990
8dfce114 1991 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
1992 r = touch(to);
1993 if (r < 0)
1994 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1995 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1996 if (r < 0)
1997 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1998 }
6278cf60 1999
03cfe0d5
LP
2000 r = userns_lchown(to, 0, 0);
2001 if (r < 0)
2002 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2003
657ee2d8 2004 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2005 if (!dn)
2006 return log_oom();
2007
2008 r = userns_mkdir(dest, dn, 0755, 0, 0);
2009 if (r < 0)
2010 return log_error_errno(r, "Failed to create '%s': %m", dn);
2011
2012 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2013 return log_oom();
2014
c6134d3e 2015 prefixed = path_join(dest, sl);
8dfce114
LP
2016 if (!prefixed)
2017 return log_oom();
2018
2d9b74ba 2019 t = path_join("..", d);
8dfce114
LP
2020 if (!t)
2021 return log_oom();
2022
2023 if (symlink(t, prefixed) < 0)
2024 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2025 }
88213476
LP
2026 }
2027
e58a1277
LP
2028 return r;
2029}
88213476 2030
de40a303
LP
2031static int make_extra_nodes(const char *dest) {
2032 _cleanup_umask_ mode_t u;
2033 size_t i;
2034 int r;
2035
2036 u = umask(0000);
2037
2038 for (i = 0; i < arg_n_extra_nodes; i++) {
2039 _cleanup_free_ char *path = NULL;
2040 DeviceNode *n = arg_extra_nodes + i;
2041
c6134d3e 2042 path = path_join(dest, n->path);
de40a303
LP
2043 if (!path)
2044 return log_oom();
2045
2046 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2047 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2048
2049 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2050 if (r < 0)
2051 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2052 }
2053
2054 return 0;
2055}
2056
03cfe0d5
LP
2057static int setup_pts(const char *dest) {
2058 _cleanup_free_ char *options = NULL;
2059 const char *p;
709f6e46 2060 int r;
03cfe0d5 2061
349cc4a5 2062#if HAVE_SELINUX
03cfe0d5
LP
2063 if (arg_selinux_apifs_context)
2064 (void) asprintf(&options,
3dce8915 2065 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2066 arg_uid_shift + TTY_GID,
2067 arg_selinux_apifs_context);
2068 else
2069#endif
2070 (void) asprintf(&options,
3dce8915 2071 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2072 arg_uid_shift + TTY_GID);
f2d88580 2073
03cfe0d5 2074 if (!options)
f2d88580
LP
2075 return log_oom();
2076
03cfe0d5 2077 /* Mount /dev/pts itself */
cc9fce65 2078 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
2079 r = mkdir_errno_wrapper(p, 0755);
2080 if (r < 0)
2081 return log_error_errno(r, "Failed to create /dev/pts: %m");
2082
60e76d48
ZJS
2083 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2084 if (r < 0)
2085 return r;
709f6e46
MS
2086 r = userns_lchown(p, 0, 0);
2087 if (r < 0)
2088 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2089
2090 /* Create /dev/ptmx symlink */
2091 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2092 if (symlink("pts/ptmx", p) < 0)
2093 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2094 r = userns_lchown(p, 0, 0);
2095 if (r < 0)
2096 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2097
03cfe0d5
LP
2098 /* And fix /dev/pts/ptmx ownership */
2099 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2100 r = userns_lchown(p, 0, 0);
2101 if (r < 0)
2102 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2103
f2d88580
LP
2104 return 0;
2105}
2106
3acc84eb
FB
/* Opens /dev/console and makes it stdin, stdout and stderr of the calling process. Before doing so the
 * logging subsystem is asked to duplicate the console it currently logs to, so that log output keeps going
 * to the original stderr.
 *
 * Returns 0 on success, a negative value on failure. */
static int setup_stdio_as_dev_console(void) {
        int terminal;
        int r;

        terminal = open_terminal("/dev/console", O_RDWR);
        if (terminal < 0)
                return log_error_errno(terminal, "Failed to open console: %m");

        /* Make sure we can continue logging to the original stderr, even if
         * stderr points elsewhere now */
        r = log_dup_console();
        if (r < 0) {
                /* We still own the fd at this point, hence don't leak it */
                terminal = safe_close(terminal);
                return log_error_errno(r, "Failed to duplicate stderr: %m");
        }

        /* invalidates 'terminal' on success and failure */
        r = rearrange_stdio(terminal, terminal, terminal);
        if (r < 0)
                return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");

        return 0;
}
88213476 2128
3acc84eb
FB
2129static int setup_dev_console(const char *console) {
2130 _cleanup_free_ char *p = NULL;
2131 int r;
a258bf26 2132
3acc84eb
FB
2133 /* Create /dev/console symlink */
2134 r = path_make_relative("/dev", console, &p);
81f5049b 2135 if (r < 0)
3acc84eb
FB
2136 return log_error_errno(r, "Failed to create relative path: %m");
2137
2138 if (symlink(p, "/dev/console") < 0)
2139 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2140
3acc84eb 2141 return 0;
e58a1277
LP
2142}
2143
8e5430c4
LP
2144static int setup_keyring(void) {
2145 key_serial_t keyring;
2146
2147 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
2148 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
2149 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
2150 * these system calls let's make sure we don't leak anything into the container. */
2151
2152 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2153 if (keyring == -1) {
2154 if (errno == ENOSYS)
2155 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2156 else if (IN_SET(errno, EACCES, EPERM))
2157 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2158 else
2159 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2160 }
2161
2162 return 0;
2163}
2164
1e4f1671 2165static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
2166 _cleanup_(unlink_and_freep) char *from = NULL;
2167 _cleanup_free_ char *fifo = NULL;
2168 _cleanup_close_ int fd = -1;
7fd1b19b 2169 _cleanup_umask_ mode_t u;
9ec5a93c 2170 int r;
e58a1277 2171
e58a1277 2172 assert(kmsg_socket >= 0);
a258bf26 2173
e58a1277 2174 u = umask(0000);
a258bf26 2175
1eacc470 2176 /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2177 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2178 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2179 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2180
1eacc470 2181 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2182 if (r < 0)
2183 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2184
9ec5a93c 2185 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2186 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2187
2188 from = TAKE_PTR(fifo);
9ec5a93c 2189
1eacc470 2190 r = mount_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2191 if (r < 0)
2192 return r;
e58a1277 2193
669fc4e5 2194 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2195 if (fd < 0)
2196 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2197
9ec5a93c 2198 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 2199 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
2200 if (r < 0)
2201 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2202
25ea79fe 2203 return 0;
88213476
LP
2204}
2205
1c4baffc 2206static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
2207 union in_addr_union *exposed = userdata;
2208
2209 assert(rtnl);
2210 assert(m);
2211 assert(exposed);
2212
7a8f6325 2213 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
2214 return 0;
2215}
2216
3a74cea5 2217static int setup_hostname(void) {
c818eef1 2218 int r;
3a74cea5 2219
0c582db0 2220 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2221 return 0;
2222
c818eef1
LP
2223 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2224 if (r < 0)
2225 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2226
7027ff61 2227 return 0;
3a74cea5
LP
2228}
2229
57fb9fb5 2230static int setup_journal(const char *directory) {
0f5e1382 2231 _cleanup_free_ char *d = NULL;
b2238e38
LP
2232 const char *dirname, *p, *q;
2233 sd_id128_t this_id;
2234 char id[33];
8054d749 2235 bool try;
57fb9fb5
LP
2236 int r;
2237
df9a75e4
LP
2238 /* Don't link journals in ephemeral mode */
2239 if (arg_ephemeral)
2240 return 0;
2241
8054d749
LP
2242 if (arg_link_journal == LINK_NO)
2243 return 0;
2244
2245 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2246
4d680aee 2247 r = sd_id128_get_machine(&this_id);
f647962d
MS
2248 if (r < 0)
2249 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2250
e01ff70a 2251 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2252 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 2253 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 2254 if (try)
4d680aee 2255 return 0;
df9a75e4 2256 return -EEXIST;
4d680aee
ZJS
2257 }
2258
369ca6da
ZJS
2259 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2260 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2261 if (r < 0) {
2262 bool ignore = r == -EROFS && try;
2263 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2264 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2265 return ignore ? 0 : r;
2266 }
2267 }
03cfe0d5 2268
e01ff70a
MS
2269 (void) sd_id128_to_string(arg_uuid, id);
2270
03cfe0d5
LP
2271 p = strjoina("/var/log/journal/", id);
2272 q = prefix_roota(directory, p);
27407a01 2273
e1873695 2274 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2275 if (try)
2276 return 0;
27407a01 2277
baaa35ad
ZJS
2278 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2279 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2280 }
2281
e1873695 2282 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2283 if (try)
2284 return 0;
57fb9fb5 2285
baaa35ad
ZJS
2286 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2287 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2288 }
2289
2290 r = readlink_and_make_absolute(p, &d);
2291 if (r >= 0) {
3742095b 2292 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2293 path_equal(d, q)) {
2294
03cfe0d5 2295 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2296 if (r < 0)
709f6e46 2297 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2298 return 0;
57fb9fb5
LP
2299 }
2300
4a62c710
MS
2301 if (unlink(p) < 0)
2302 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2303 } else if (r == -EINVAL) {
2304
2305 if (arg_link_journal == LINK_GUEST &&
2306 rmdir(p) < 0) {
2307
27407a01
ZJS
2308 if (errno == ENOTDIR) {
2309 log_error("%s already exists and is neither a symlink nor a directory", p);
2310 return r;
4314d33f
MS
2311 } else
2312 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2313 }
4314d33f
MS
2314 } else if (r != -ENOENT)
2315 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2316
2317 if (arg_link_journal == LINK_GUEST) {
2318
2319 if (symlink(q, p) < 0) {
8054d749 2320 if (try) {
56f64d95 2321 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2322 return 0;
4314d33f
MS
2323 } else
2324 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2325 }
2326
03cfe0d5 2327 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2328 if (r < 0)
709f6e46 2329 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2330 return 0;
57fb9fb5
LP
2331 }
2332
2333 if (arg_link_journal == LINK_HOST) {
ccddd104 2334 /* don't create parents here — if the host doesn't have
574edc90 2335 * permanent journal set up, don't force it here */
ba8e6c4d 2336
dae8b82e
ZJS
2337 r = mkdir_errno_wrapper(p, 0755);
2338 if (r < 0 && r != -EEXIST) {
8054d749 2339 if (try) {
dae8b82e 2340 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2341 return 0;
4314d33f 2342 } else
dae8b82e 2343 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2344 }
2345
27407a01
ZJS
2346 } else if (access(p, F_OK) < 0)
2347 return 0;
57fb9fb5 2348
cdb2b9d0
LP
2349 if (dir_is_empty(q) == 0)
2350 log_warning("%s is not empty, proceeding anyway.", q);
2351
03cfe0d5 2352 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2353 if (r < 0)
2354 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2355
60e76d48
ZJS
2356 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2357 if (r < 0)
4a62c710 2358 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2359
27407a01 2360 return 0;
57fb9fb5
LP
2361}
2362
de40a303
LP
2363static int drop_capabilities(uid_t uid) {
2364 CapabilityQuintet q;
2365
2366 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2367 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2368 * arg_caps_retain. */
2369
2370 if (capability_quintet_is_set(&arg_full_capabilities)) {
2371 q = arg_full_capabilities;
2372
2373 if (q.bounding == (uint64_t) -1)
2374 q.bounding = uid == 0 ? arg_caps_retain : 0;
2375
2376 if (q.effective == (uint64_t) -1)
2377 q.effective = uid == 0 ? q.bounding : 0;
2378
2379 if (q.inheritable == (uint64_t) -1)
2380 q.inheritable = uid == 0 ? q.bounding : 0;
2381
2382 if (q.permitted == (uint64_t) -1)
2383 q.permitted = uid == 0 ? q.bounding : 0;
2384
2385 if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
2386 q.ambient = 0;
f66ad460
AZ
2387
2388 if (capability_quintet_mangle(&q))
2389 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2390
2391 } else {
de40a303
LP
2392 q = (CapabilityQuintet) {
2393 .bounding = arg_caps_retain,
2394 .effective = uid == 0 ? arg_caps_retain : 0,
2395 .inheritable = uid == 0 ? arg_caps_retain : 0,
2396 .permitted = uid == 0 ? arg_caps_retain : 0,
2397 .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
2398 };
2399
f66ad460
AZ
2400 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2401 * in order to maintain the same behavior as systemd < 242. */
2402 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2403 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2404 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2405
2406 }
2407
de40a303 2408 return capability_quintet_enforce(&q);
88213476
LP
2409}
2410
db999e0f
LP
2411static int reset_audit_loginuid(void) {
2412 _cleanup_free_ char *p = NULL;
2413 int r;
2414
0c582db0 2415 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2416 return 0;
2417
2418 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2419 if (r == -ENOENT)
db999e0f 2420 return 0;
f647962d
MS
2421 if (r < 0)
2422 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2423
2424 /* Already reset? */
2425 if (streq(p, "4294967295"))
2426 return 0;
2427
57512c89 2428 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2429 if (r < 0) {
10a87006
LP
2430 log_error_errno(r,
2431 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2432 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2433 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2434 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2435 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2436
db999e0f 2437 sleep(5);
77b6e194 2438 }
db999e0f
LP
2439
2440 return 0;
77b6e194
LP
2441}
2442
785890ac
LP
2443static int setup_propagate(const char *root) {
2444 const char *p, *q;
709f6e46 2445 int r;
785890ac
LP
2446
2447 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2448 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2449 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2450 (void) mkdir_p(p, 0600);
2451
709f6e46
MS
2452 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2453 if (r < 0)
2454 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 2455
709f6e46
MS
2456 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2457 if (r < 0)
2458 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 2459
709f6e46
MS
2460 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2461 if (r < 0)
2462 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 2463
03cfe0d5 2464 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
2465 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2466 if (r < 0)
2467 return r;
785890ac 2468
60e76d48
ZJS
2469 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2470 if (r < 0)
2471 return r;
785890ac 2472
19caffac
AC
2473 /* machined will MS_MOVE into that directory, and that's only
2474 * supported for non-shared mounts. */
60e76d48 2475 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2476}
2477
317feb4d 2478static int setup_machine_id(const char *directory) {
691675ba
LP
2479 const char *etc_machine_id;
2480 sd_id128_t id;
3bbaff3e 2481 int r;
e01ff70a 2482
317feb4d
LP
2483 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2484 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2485 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2486 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2487 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2488 * container behaves nicely). */
2489
e01ff70a
MS
2490 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2491
691675ba 2492 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2493 if (r < 0) {
2494 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2495 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2496
317feb4d
LP
2497 if (sd_id128_is_null(arg_uuid)) {
2498 r = sd_id128_randomize(&arg_uuid);
2499 if (r < 0)
2500 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2501 }
2502 } else {
baaa35ad
ZJS
2503 if (sd_id128_is_null(id))
2504 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2505 "Machine ID in container image is zero, refusing.");
e01ff70a 2506
317feb4d
LP
2507 arg_uuid = id;
2508 }
691675ba 2509
e01ff70a
MS
2510 return 0;
2511}
2512
7336138e
LP
2513static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2514 int r;
2515
2516 assert(directory);
2517
0de7acce 2518 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2519 return 0;
2520
2521 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2522 if (r == -EOPNOTSUPP)
2523 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2524 if (r == -EBADE)
2525 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2526 if (r < 0)
2527 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2528 if (r == 0)
2529 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2530 else
2531 log_debug("Patched directory tree to match UID/GID range.");
2532
2533 return r;
2534}
2535
113cea80 2536/*
6d416b9c
LS
2537 * Return values:
2538 * < 0 : wait_for_terminate() failed to get the state of the
2539 * container, the container was terminated by a signal, or
2540 * failed for an unknown reason. No change is made to the
2541 * container argument.
2542 * > 0 : The program executed in the container terminated with an
2543 * error. The exit code of the program executed in the
919699ec
LP
2544 * container is returned. The container argument has been set
2545 * to CONTAINER_TERMINATED.
6d416b9c
LS
2546 * 0 : The container is being rebooted, has been shut down or exited
2547 * successfully. The container argument has been set to either
2548 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2549 *
6d416b9c
LS
2550 * That is, success is indicated by a return value of zero, and an
2551 * error is indicated by a non-zero value.
113cea80
DH
2552 */
2553static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2554 siginfo_t status;
919699ec 2555 int r;
113cea80
DH
2556
2557 r = wait_for_terminate(pid, &status);
f647962d
MS
2558 if (r < 0)
2559 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2560
2561 switch (status.si_code) {
fddbb89c 2562
113cea80 2563 case CLD_EXITED:
b5a2179b 2564 if (status.si_status == 0)
919699ec 2565 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2566 else
919699ec 2567 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2568
919699ec
LP
2569 *container = CONTAINER_TERMINATED;
2570 return status.si_status;
113cea80
DH
2571
2572 case CLD_KILLED:
2573 if (status.si_status == SIGINT) {
919699ec 2574 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2575 *container = CONTAINER_TERMINATED;
919699ec
LP
2576 return 0;
2577
113cea80 2578 } else if (status.si_status == SIGHUP) {
919699ec 2579 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2580 *container = CONTAINER_REBOOTED;
919699ec 2581 return 0;
113cea80 2582 }
919699ec 2583
4831981d 2584 _fallthrough_;
113cea80 2585 case CLD_DUMPED:
baaa35ad
ZJS
2586 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2587 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2588
2589 default:
baaa35ad
ZJS
2590 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2591 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2592 }
113cea80
DH
2593}
2594
023fb90b
LP
2595static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2596 pid_t pid;
2597
4a0b58c4 2598 pid = PTR_TO_PID(userdata);
023fb90b 2599 if (pid > 0) {
c6c8f6e2 2600 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2601 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2602 sd_event_source_set_userdata(s, NULL);
2603 return 0;
2604 }
2605 }
2606
2607 sd_event_exit(sd_event_source_get_event(s), 0);
2608 return 0;
2609}
2610
6916b164 2611static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2612 pid_t pid;
2613
2614 assert(s);
2615 assert(ssi);
2616
2617 pid = PTR_TO_PID(userdata);
2618
6916b164
AU
2619 for (;;) {
2620 siginfo_t si = {};
abdb9b08 2621
6916b164
AU
2622 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2623 return log_error_errno(errno, "Failed to waitid(): %m");
2624 if (si.si_pid == 0) /* No pending children. */
2625 break;
abdb9b08 2626 if (si.si_pid == pid) {
6916b164
AU
2627 /* The main process we care for has exited. Return from
2628 * signal handler but leave the zombie. */
2629 sd_event_exit(sd_event_source_get_event(s), 0);
2630 break;
2631 }
abdb9b08 2632
6916b164
AU
2633 /* Reap all other children. */
2634 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2635 }
2636
2637 return 0;
2638}
2639
abdb9b08
LP
2640static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2641 pid_t pid;
2642
2643 assert(m);
2644
2645 pid = PTR_TO_PID(userdata);
2646
2647 if (arg_kill_signal > 0) {
2648 log_info("Container termination requested. Attempting to halt container.");
2649 (void) kill(pid, arg_kill_signal);
2650 } else {
2651 log_info("Container termination requested. Exiting.");
2652 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2653 }
2654
2655 return 0;
2656}
2657
ec16945e 2658static int determine_names(void) {
1b9cebf6 2659 int r;
ec16945e 2660
c1521918
LP
2661 if (arg_template && !arg_directory && arg_machine) {
2662
2663 /* If --template= was specified then we should not
2664 * search for a machine, but instead create a new one
2665 * in /var/lib/machine. */
2666
657ee2d8 2667 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
2668 if (!arg_directory)
2669 return log_oom();
2670 }
2671
ec16945e 2672 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2673 if (arg_machine) {
2674 _cleanup_(image_unrefp) Image *i = NULL;
2675
5ef46e5f 2676 r = image_find(IMAGE_MACHINE, arg_machine, &i);
3a6ce860
LP
2677 if (r == -ENOENT)
2678 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2679 if (r < 0)
2680 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 2681
eb38edce 2682 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2683 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2684 else
0f03c2a4 2685 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2686 if (r < 0)
0f3be6ca 2687 return log_oom();
1b9cebf6 2688
aee327b8
LP
2689 if (!arg_ephemeral)
2690 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2691 } else {
2692 r = safe_getcwd(&arg_directory);
2693 if (r < 0)
2694 return log_error_errno(r, "Failed to determine current directory: %m");
2695 }
ec16945e 2696
c6147113
LP
2697 if (!arg_directory && !arg_image)
2698 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
2699 }
2700
2701 if (!arg_machine) {
b9ba4dab
LP
2702 if (arg_directory && path_equal(arg_directory, "/"))
2703 arg_machine = gethostname_malloc();
4827ab48
LP
2704 else {
2705 if (arg_image) {
2706 char *e;
2707
2708 arg_machine = strdup(basename(arg_image));
2709
2710 /* Truncate suffix if there is one */
2711 e = endswith(arg_machine, ".raw");
2712 if (e)
2713 *e = 0;
2714 } else
2715 arg_machine = strdup(basename(arg_directory));
2716 }
ec16945e
LP
2717 if (!arg_machine)
2718 return log_oom();
2719
ae691c1d 2720 hostname_cleanup(arg_machine);
c6147113
LP
2721 if (!machine_name_is_valid(arg_machine))
2722 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab
LP
2723
2724 if (arg_ephemeral) {
2725 char *b;
2726
2727 /* Add a random suffix when this is an
2728 * ephemeral machine, so that we can run many
2729 * instances at once without manually having
2730 * to specify -M each time. */
2731
2732 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2733 return log_oom();
2734
2735 free(arg_machine);
2736 arg_machine = b;
2737 }
ec16945e
LP
2738 }
2739
2740 return 0;
2741}
2742
/* Resolves the path stored in *p via chase_symlinks() with the given flags, and replaces *p by the result.
 * A NULL *p is left alone. Returns a negative errno-style value if the path cannot be resolved. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase_symlinks(*p, NULL, flags, &resolved, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                /* Drop the old string and take ownership of the resolved one. */
                return free_and_replace(*p, resolved);
        }

        return 0;
}
2758
03cfe0d5 2759static int determine_uid_shift(const char *directory) {
6dac160c
LP
2760 int r;
2761
0de7acce 2762 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2763 arg_uid_shift = 0;
6dac160c 2764 return 0;
03cfe0d5 2765 }
6dac160c
LP
2766
2767 if (arg_uid_shift == UID_INVALID) {
2768 struct stat st;
2769
03cfe0d5 2770 r = stat(directory, &st);
6dac160c 2771 if (r < 0)
03cfe0d5 2772 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2773
2774 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2775
baaa35ad
ZJS
2776 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
2777 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2778 "UID and GID base of %s don't match.", directory);
6dac160c
LP
2779
2780 arg_uid_range = UINT32_C(0x10000);
2781 }
2782
baaa35ad
ZJS
2783 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
2784 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2785 "UID base too high for UID range.");
6dac160c 2786
6dac160c
LP
2787 return 0;
2788}
2789
de40a303
LP
2790static unsigned long effective_clone_ns_flags(void) {
2791 unsigned long flags = arg_clone_ns_flags;
2792
2793 if (arg_private_network)
2794 flags |= CLONE_NEWNET;
2795 if (arg_use_cgns)
2796 flags |= CLONE_NEWCGROUP;
2797 if (arg_userns_mode != USER_NAMESPACE_NO)
2798 flags |= CLONE_NEWUSER;
2799
2800 return flags;
2801}
2802
2803static int patch_sysctl(void) {
2804
2805 /* This table is inspired by runc's sysctl() function */
2806 static const struct {
2807 const char *key;
2808 bool prefix;
2809 unsigned long clone_flags;
2810 } safe_sysctl[] = {
2811 { "kernel.hostname", false, CLONE_NEWUTS },
2812 { "kernel.domainname", false, CLONE_NEWUTS },
2813 { "kernel.msgmax", false, CLONE_NEWIPC },
2814 { "kernel.msgmnb", false, CLONE_NEWIPC },
2815 { "kernel.msgmni", false, CLONE_NEWIPC },
2816 { "kernel.sem", false, CLONE_NEWIPC },
2817 { "kernel.shmall", false, CLONE_NEWIPC },
2818 { "kernel.shmmax", false, CLONE_NEWIPC },
2819 { "kernel.shmmni", false, CLONE_NEWIPC },
2820 { "fs.mqueue.", true, CLONE_NEWIPC },
2821 { "net.", true, CLONE_NEWNET },
2822 };
2823
2824 unsigned long flags;
2825 char **k, **v;
2826 int r;
2827
2828 flags = effective_clone_ns_flags();
2829
2830 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
2831 bool good = false;
2832 size_t i;
2833
2834 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
2835
2836 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
2837 continue;
2838
2839 if (safe_sysctl[i].prefix)
2840 good = startswith(*k, safe_sysctl[i].key);
2841 else
2842 good = streq(*k, safe_sysctl[i].key);
2843
2844 if (good)
2845 break;
2846 }
2847
c6147113
LP
2848 if (!good)
2849 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
2850
2851 r = sysctl_write(*k, *v);
2852 if (r < 0)
2853 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
2854 }
2855
2856 return 0;
2857}
2858
03cfe0d5
LP
2859static int inner_child(
2860 Barrier *barrier,
2861 const char *directory,
2862 bool secondary,
2863 int kmsg_socket,
2864 int rtnl_socket,
3acc84eb 2865 int master_pty_socket,
f757855e 2866 FDSet *fds) {
69c79d3c 2867
03cfe0d5 2868 _cleanup_free_ char *home = NULL;
e01ff70a 2869 char as_uuid[37];
88614c8a 2870 size_t n_env = 1;
03cfe0d5 2871 const char *envp[] = {
0c300adf 2872 "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 2873 NULL, /* container */
03cfe0d5
LP
2874 NULL, /* TERM */
2875 NULL, /* HOME */
2876 NULL, /* USER */
2877 NULL, /* LOGNAME */
2878 NULL, /* container_uuid */
2879 NULL, /* LISTEN_FDS */
2880 NULL, /* LISTEN_PID */
9c1e04d0 2881 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2882 NULL
2883 };
1a68e1e5 2884 const char *exec_target;
2371271c 2885 _cleanup_strv_free_ char **env_use = NULL;
de40a303 2886 int r, which_failed;
88213476 2887
b37469d7
LP
2888 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
2889 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
2890 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
2891 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
2892 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
2893 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
2894 * namespace.
2895 *
2896 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
2897 * unshare(). See below. */
2898
03cfe0d5
LP
2899 assert(barrier);
2900 assert(directory);
2901 assert(kmsg_socket >= 0);
88213476 2902
de40a303
LP
2903 log_debug("Inner child is initializing.");
2904
0de7acce 2905 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2906 /* Tell the parent, that it now can write the UID map. */
2907 (void) barrier_place(barrier); /* #1 */
7027ff61 2908
03cfe0d5 2909 /* Wait until the parent wrote the UID map */
baaa35ad
ZJS
2910 if (!barrier_place_and_sync(barrier)) /* #2 */
2911 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2912 "Parent died too early");
88213476
LP
2913 }
2914
6d66bd3b
EV
2915 r = reset_uid_gid();
2916 if (r < 0)
2917 return log_error_errno(r, "Couldn't become new root: %m");
2918
0de7acce 2919 r = mount_all(NULL,
4f086aab 2920 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 2921 arg_uid_shift,
0de7acce 2922 arg_selinux_apifs_context);
03cfe0d5
LP
2923 if (r < 0)
2924 return r;
2925
04413780
ZJS
2926 if (!arg_network_namespace_path && arg_private_network) {
2927 r = unshare(CLONE_NEWNET);
2928 if (r < 0)
2929 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
2930
2931 /* Tell the parent that it can setup network interfaces. */
2932 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
2933 }
2934
4f086aab 2935 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2936 if (r < 0)
2937 return r;
2938
03cfe0d5
LP
2939 /* Wait until we are cgroup-ified, so that we
2940 * can mount the right cgroup path writable */
baaa35ad
ZJS
2941 if (!barrier_place_and_sync(barrier)) /* #4 */
2942 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2943 "Parent died too early");
88213476 2944
489fae52 2945 if (arg_use_cgns) {
0996ef00
CB
2946 r = unshare(CLONE_NEWCGROUP);
2947 if (r < 0)
04413780 2948 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
2949 r = mount_cgroups(
2950 "",
2951 arg_unified_cgroup_hierarchy,
2952 arg_userns_mode != USER_NAMESPACE_NO,
2953 arg_uid_shift,
2954 arg_uid_range,
5a8ff0e6 2955 arg_selinux_apifs_context,
ada54120 2956 true);
0996ef00
CB
2957 if (r < 0)
2958 return r;
2959 } else {
2960 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2961 if (r < 0)
2962 return r;
2963 }
ec16945e 2964
1e4f1671 2965 r = setup_boot_id();
03cfe0d5
LP
2966 if (r < 0)
2967 return r;
ec16945e 2968
1e4f1671 2969 r = setup_kmsg(kmsg_socket);
03cfe0d5
LP
2970 if (r < 0)
2971 return r;
2972 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2973
de40a303
LP
2974 r = mount_custom(
2975 "/",
2976 arg_custom_mounts,
2977 arg_n_custom_mounts,
de40a303
LP
2978 0,
2979 arg_selinux_apifs_context,
5f0a6347 2980 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
de40a303
LP
2981 if (r < 0)
2982 return r;
2983
03cfe0d5
LP
2984 if (setsid() < 0)
2985 return log_error_errno(errno, "setsid() failed: %m");
2986
2987 if (arg_private_network)
2988 loopback_setup();
2989
7a8f6325
LP
2990 if (arg_expose_ports) {
2991 r = expose_port_send_rtnl(rtnl_socket);
2992 if (r < 0)
2993 return r;
2994 rtnl_socket = safe_close(rtnl_socket);
2995 }
03cfe0d5 2996
3acc84eb 2997 if (arg_console_mode != CONSOLE_PIPE) {
cd132992 2998 _cleanup_close_ int master = -1;
3acc84eb
FB
2999 _cleanup_free_ char *console = NULL;
3000
3001 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3002 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3003 if (master < 0)
dc98caea 3004 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3005
3006 r = setup_dev_console(console);
3007 if (r < 0)
3008 return log_error_errno(r, "Failed to setup /dev/console: %m");
3009
3010 r = send_one_fd(master_pty_socket, master, 0);
3011 if (r < 0)
3012 return log_error_errno(r, "Failed to send master fd: %m");
3013 master_pty_socket = safe_close(master_pty_socket);
3014
3015 r = setup_stdio_as_dev_console();
3016 if (r < 0)
3017 return r;
3018 }
3019
de40a303
LP
3020 r = patch_sysctl();
3021 if (r < 0)
3022 return r;
3023
81f345df
LP
3024 if (arg_oom_score_adjust_set) {
3025 r = set_oom_score_adjust(arg_oom_score_adjust);
3026 if (r < 0)
3027 return log_error_errno(r, "Failed to adjust OOM score: %m");
3028 }
3029
0985c7c4
ZJS
3030 if (arg_cpu_set.set)
3031 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3032 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3033
c818eef1 3034 (void) setup_hostname();
03cfe0d5 3035
050f7277 3036 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3037 r = safe_personality(arg_personality);
3038 if (r < 0)
3039 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 3040 } else if (secondary) {
21022b9d
LP
3041 r = safe_personality(PER_LINUX32);
3042 if (r < 0)
3043 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
3044 }
3045
de40a303
LP
3046 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3047 if (r < 0)
3048 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3049
3050#if HAVE_SECCOMP
3051 if (arg_seccomp) {
3052
3053 if (is_seccomp_available()) {
3054
3055 r = seccomp_load(arg_seccomp);
7bc5e0b1 3056 if (ERRNO_IS_SECCOMP_FATAL(r))
de40a303
LP
3057 return log_error_errno(r, "Failed to install seccomp filter: %m");
3058 if (r < 0)
3059 log_debug_errno(r, "Failed to install seccomp filter: %m");
3060 }
3061 } else
3062#endif
3063 {
3064 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
3065 if (r < 0)
3066 return r;
3067 }
3068
349cc4a5 3069#if HAVE_SELINUX
03cfe0d5 3070 if (arg_selinux_context)
2ed96880 3071 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3072 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3073#endif
3074
de40a303
LP
3075 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3076 * if we need to later on. */
3077 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3078 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3079
3080 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3081 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids);
3082 else
3083 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
3084 if (r < 0)
3085 return r;
3086
de40a303
LP
3087 r = drop_capabilities(getuid());
3088 if (r < 0)
3089 return log_error_errno(r, "Dropping capabilities failed: %m");
3090
66edd963
LP
3091 if (arg_no_new_privileges)
3092 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3093 return log_error_errno(errno, "Failed to disable new privileges: %m");
3094
6aadfa4c
ILG
3095 /* LXC sets container=lxc, so follow the scheme here */
3096 envp[n_env++] = strjoina("container=", arg_container_service_name);
3097
03cfe0d5
LP
3098 envp[n_env] = strv_find_prefix(environ, "TERM=");
3099 if (envp[n_env])
313cefa1 3100 n_env++;
03cfe0d5 3101
de40a303
LP
3102 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3103 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3104 return log_oom();
3105
3106 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3107 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
3108 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3109 return log_oom();
03cfe0d5 3110
3bbaff3e 3111 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3112
691675ba 3113 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 3114 return log_oom();
03cfe0d5
LP
3115
3116 if (fdset_size(fds) > 0) {
3117 r = fdset_cloexec(fds, false);
3118 if (r < 0)
3119 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3120
3121 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3122 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3123 return log_oom();
3124 }
9c1e04d0
AP
3125 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3126 return log_oom();
03cfe0d5 3127
2371271c
TG
3128 env_use = strv_env_merge(2, envp, arg_setenv);
3129 if (!env_use)
3130 return log_oom();
03cfe0d5
LP
3131
3132 /* Let the parent know that we are ready and
3133 * wait until the parent is ready with the
3134 * setup, too... */
baaa35ad
ZJS
3135 if (!barrier_place_and_sync(barrier)) /* #5 */
3136 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3137 "Parent died too early");
03cfe0d5 3138
5f932eb9
LP
3139 if (arg_chdir)
3140 if (chdir(arg_chdir) < 0)
3141 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3142
7732f92b 3143 if (arg_start_mode == START_PID2) {
75bf701f 3144 r = stub_pid1(arg_uuid);
7732f92b
LP
3145 if (r < 0)
3146 return r;
3147 }
3148
de40a303
LP
3149 log_debug("Inner child completed, invoking payload.");
3150
8ca082b4
LP
3151 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3152 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3153 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3154 log_close();
8ca082b4
LP
3155 log_set_open_when_needed(true);
3156
03cfe0d5
LP
3157 (void) fdset_close_others(fds);
3158
7732f92b 3159 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3160 char **a;
3161 size_t m;
3162
3163 /* Automatically search for the init system */
3164
75f32f04
ZJS
3165 m = strv_length(arg_parameters);
3166 a = newa(char*, m + 2);
3167 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3168 a[1 + m] = NULL;
03cfe0d5 3169
ced58da7 3170 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
3171 execve(a[0], a, env_use);
3172
ced58da7 3173 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
3174 execve(a[0], a, env_use);
3175
ced58da7 3176 a[0] = (char*) "/sbin/init";
03cfe0d5 3177 execve(a[0], a, env_use);
ced58da7
LP
3178
3179 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3180 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3181 const char *dollar_path;
3182
1a68e1e5 3183 exec_target = arg_parameters[0];
b6b180b7
LP
3184
3185 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3186 * binary. */
3187 dollar_path = strv_env_get(env_use, "PATH");
3188 if (dollar_path) {
3189 if (putenv((char*) dollar_path) != 0)
3190 return log_error_errno(errno, "Failed to update $PATH: %m");
3191 }
3192
f757855e 3193 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3194 } else {
5f932eb9 3195 if (!arg_chdir)
d929b0f9
ZJS
3196 /* If we cannot change the directory, we'll end up in /, that is expected. */
3197 (void) chdir(home ?: "/root");
5f932eb9 3198
03cfe0d5
LP
3199 execle("/bin/bash", "-bash", NULL, env_use);
3200 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
3201
3202 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
3203 }
3204
8ca082b4 3205 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3206}
3207
9c1e04d0 3208static int setup_sd_notify_child(void) {
271f518f 3209 _cleanup_close_ int fd = -1;
9c1e04d0 3210 union sockaddr_union sa = {
44ed5214
LP
3211 .un.sun_family = AF_UNIX,
3212 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3213 };
3214 int r;
3215
3216 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3217 if (fd < 0)
3218 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3219
3220 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3221 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3222
9c1e04d0 3223 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3224 if (r < 0)
44ed5214 3225 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3226
adc7d9f0 3227 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3228 if (r < 0)
adc7d9f0 3229 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3230
2ff48e98 3231 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3232 if (r < 0)
2ff48e98 3233 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3234
271f518f 3235 return TAKE_FD(fd);
9c1e04d0
AP
3236}
3237
03cfe0d5
LP
3238static int outer_child(
3239 Barrier *barrier,
3240 const char *directory,
2d845785 3241 DissectedImage *dissected_image,
03cfe0d5
LP
3242 bool secondary,
3243 int pid_socket,
e01ff70a 3244 int uuid_socket,
9c1e04d0 3245 int notify_socket,
03cfe0d5
LP
3246 int kmsg_socket,
3247 int rtnl_socket,
825d5287 3248 int uid_shift_socket,
3acc84eb 3249 int master_pty_socket,
8199d554 3250 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
3251 FDSet *fds,
3252 int netns_fd) {
03cfe0d5 3253
bf428efb 3254 _cleanup_close_ int fd = -1;
03cfe0d5
LP
3255 pid_t pid;
3256 ssize_t l;
de40a303 3257 int r;
03cfe0d5 3258
b37469d7
LP
3259 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
3260 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3261 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3262 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3263
03cfe0d5
LP
3264 assert(barrier);
3265 assert(directory);
03cfe0d5 3266 assert(pid_socket >= 0);
e01ff70a 3267 assert(uuid_socket >= 0);
9c1e04d0 3268 assert(notify_socket >= 0);
3acc84eb 3269 assert(master_pty_socket >= 0);
03cfe0d5
LP
3270 assert(kmsg_socket >= 0);
3271
de40a303
LP
3272 log_debug("Outer child is initializing.");
3273
03cfe0d5
LP
3274 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3275 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3276
03cfe0d5
LP
3277 r = reset_audit_loginuid();
3278 if (r < 0)
3279 return r;
3280
3281 /* Mark everything as slave, so that we still
3282 * receive mounts from the real root, but don't
3283 * propagate mounts to the real root. */
60e76d48
ZJS
3284 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3285 if (r < 0)
3286 return r;
03cfe0d5 3287
2d845785 3288 if (dissected_image) {
2d3a5a73
LP
3289 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3290 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3291 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3292 * makes sure ESP partitions and userns are compatible. */
3293
3294 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
03bcb6d4
LP
3295 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
3296 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)|
3297 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785
LP
3298 if (r < 0)
3299 return r;
3300 }
03cfe0d5 3301
391567f4
LP
3302 r = determine_uid_shift(directory);
3303 if (r < 0)
3304 return r;
3305
0de7acce 3306 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 3307 /* Let the parent know which UID shift we read from the image */
825d5287
RM
3308 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3309 if (l < 0)
3310 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3311 if (l != sizeof(arg_uid_shift))
3312 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3313 "Short write while sending UID shift.");
0e7ac751 3314
0de7acce 3315 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3316 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3317 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3318 * not it will pick a different one, and send it back to us. */
3319
3320 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3321 if (l < 0)
3322 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3323 if (l != sizeof(arg_uid_shift))
3324 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3325 "Short read while receiving UID shift.");
0e7ac751
LP
3326 }
3327
ff6c6cc1
LP
3328 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3329 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3330 }
3331
6f83d3d1
LP
3332 if (path_equal(directory, "/")) {
3333 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3334 * place, so that we can make changes to its mount structure (for example, to implement
3335 * --volatile=) without this interfering with our ability to access files such as
3336 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3337 * (instead of a temporary directory, since we are living in our own mount namespace here
3338 * already, and thus don't need to be afraid of colliding with anyone else's mounts).*/
3339 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3340
3341 r = mount_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3342 if (r < 0)
3343 return r;
3344
3345 directory = "/run/systemd/nspawn-root";
3346
3347 } else if (!dissected_image) {
3348 /* Turn directory into bind mount (we need that so that we can move the bind mount to root
3349 * later on). */
e50cd82f
LP
3350 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3351 if (r < 0)
3352 return r;
3353 }
7d0ecdd6
LP
3354
3355 r = setup_pivot_root(
3356 directory,
3357 arg_pivot_root_new,
3358 arg_pivot_root_old);
3359 if (r < 0)
3360 return r;
3361
3362 r = setup_volatile_mode(
3363 directory,
3364 arg_volatile_mode,
7d0ecdd6 3365 arg_uid_shift,
8f1ed04a 3366 arg_selinux_apifs_context);
7d0ecdd6
LP
3367 if (r < 0)
3368 return r;
3369
5f0a6347
DDM
3370 r = mount_custom(
3371 directory,
3372 arg_custom_mounts,
3373 arg_n_custom_mounts,
5f0a6347 3374 arg_uid_shift,
5f0a6347
DDM
3375 arg_selinux_apifs_context,
3376 MOUNT_ROOT_ONLY);
3377 if (r < 0)
3378 return r;
3379
2d3a5a73
LP
3380 if (dissected_image) {
3381 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3382 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
3383 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
3384 if (r < 0)
3385 return r;
3386 }
3387
8199d554
LP
3388 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3389 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3390
3391 r = detect_unified_cgroup_hierarchy_from_image(directory);
3392 if (r < 0)
3393 return r;
3394
3395 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3396 if (l < 0)
3397 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3398 if (l != sizeof(arg_unified_cgroup_hierarchy))
3399 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3400 "Short write while sending cgroup mode.");
8199d554
LP
3401
3402 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3403 }
3404
4ad14eff
LP
3405 /* Mark everything as shared so our mounts get propagated down. This is
3406 * required to make new bind mounts available in systemd services
5238e957 3407 * inside the container that create a new mount namespace.
4ad14eff
LP
3408 * See https://github.com/systemd/systemd/issues/3860
3409 * Further submounts (such as /dev) done after this will inherit the
5f0a6347
DDM
3410 * shared propagation mode.
3411 *
3412 * IMPORTANT: Do not overmount the root directory anymore from now on to
3413 * enable moving the root directory mount to root later on.
3414 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3415 */
4ad14eff
LP
3416 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3417 if (r < 0)
3418 return r;
3419
3420 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3421 if (r < 0)
3422 return r;
3423
03cfe0d5
LP
3424 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3425 if (r < 0)
3426 return r;
3427
e5a4bb0d 3428 if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
64e82c19 3429 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3430 if (r < 0)
3431 return log_error_errno(r, "Failed to make tree read-only: %m");
3432 }
3433
0de7acce 3434 r = mount_all(directory,
4f086aab 3435 arg_mount_settings,
0de7acce 3436 arg_uid_shift,
0de7acce 3437 arg_selinux_apifs_context);
03cfe0d5
LP
3438 if (r < 0)
3439 return r;
3440
07fa00f9
LP
3441 r = copy_devnodes(directory);
3442 if (r < 0)
03cfe0d5
LP
3443 return r;
3444
de40a303
LP
3445 r = make_extra_nodes(directory);
3446 if (r < 0)
3447 return r;
3448
3449 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3450 (void) make_inaccessible_nodes(directory, arg_uid_shift, arg_uid_shift);
03cfe0d5 3451
07fa00f9
LP
3452 r = setup_pts(directory);
3453 if (r < 0)
03cfe0d5
LP
3454 return r;
3455
3456 r = setup_propagate(directory);
3457 if (r < 0)
3458 return r;
3459
8e5430c4
LP
3460 r = setup_keyring();
3461 if (r < 0)
3462 return r;
3463
03cfe0d5
LP
3464 r = setup_timezone(directory);
3465 if (r < 0)
3466 return r;
3467
3468 r = setup_resolv_conf(directory);
3469 if (r < 0)
3470 return r;
3471
e01ff70a
MS
3472 r = setup_machine_id(directory);
3473 if (r < 0)
3474 return r;
3475
03cfe0d5
LP
3476 r = setup_journal(directory);
3477 if (r < 0)
3478 return r;
3479
0de7acce
LP
3480 r = mount_custom(
3481 directory,
3482 arg_custom_mounts,
3483 arg_n_custom_mounts,
0de7acce 3484 arg_uid_shift,
de40a303 3485 arg_selinux_apifs_context,
5f0a6347 3486 MOUNT_NON_ROOT_ONLY);
03cfe0d5
LP
3487 if (r < 0)
3488 return r;
3489
489fae52 3490 if (!arg_use_cgns) {
0996ef00
CB
3491 r = mount_cgroups(
3492 directory,
3493 arg_unified_cgroup_hierarchy,
3494 arg_userns_mode != USER_NAMESPACE_NO,
3495 arg_uid_shift,
3496 arg_uid_range,
5a8ff0e6 3497 arg_selinux_apifs_context,
ada54120 3498 false);
0996ef00
CB
3499 if (r < 0)
3500 return r;
3501 }
03cfe0d5
LP
3502
3503 r = mount_move_root(directory);
3504 if (r < 0)
3505 return log_error_errno(r, "Failed to move root directory: %m");
3506
9c1e04d0
AP
3507 fd = setup_sd_notify_child();
3508 if (fd < 0)
3509 return fd;
3510
03cfe0d5 3511 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3512 arg_clone_ns_flags |
8869a0b4 3513 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3514 if (pid < 0)
3515 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3516 if (pid == 0) {
3517 pid_socket = safe_close(pid_socket);
e01ff70a 3518 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3519 notify_socket = safe_close(notify_socket);
825d5287 3520 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3521
3522 /* The inner child has all namespaces that are
3523 * requested, so that we all are owned by the user if
3524 * user namespaces are turned on. */
3525
d7bea6b6
DP
3526 if (arg_network_namespace_path) {
3527 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3528 if (r < 0)
e2d39e54 3529 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
3530 }
3531
3acc84eb 3532 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds);
03cfe0d5
LP
3533 if (r < 0)
3534 _exit(EXIT_FAILURE);
3535
3536 _exit(EXIT_SUCCESS);
3537 }
3538
3539 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3540 if (l < 0)
3541 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
3542 if (l != sizeof(pid))
3543 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3544 "Short write while sending PID.");
03cfe0d5 3545
e01ff70a
MS
3546 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3547 if (l < 0)
3548 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
3549 if (l != sizeof(arg_uuid))
3550 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3551 "Short write while sending machine ID.");
e01ff70a 3552
9c1e04d0
AP
3553 l = send_one_fd(notify_socket, fd, 0);
3554 if (l < 0)
ba72801d 3555 return log_error_errno(l, "Failed to send notify fd: %m");
9c1e04d0 3556
03cfe0d5 3557 pid_socket = safe_close(pid_socket);
e01ff70a 3558 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3559 notify_socket = safe_close(notify_socket);
3acc84eb 3560 master_pty_socket = safe_close(master_pty_socket);
327e26d6
KN
3561 kmsg_socket = safe_close(kmsg_socket);
3562 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3563 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3564
3565 return 0;
3566}
3567
0e7ac751 3568static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 3569 bool tried_hashed = false;
0e7ac751
LP
3570 unsigned n_tries = 100;
3571 uid_t candidate;
3572 int r;
3573
3574 assert(shift);
3575 assert(ret_lock_file);
0de7acce 3576 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3577 assert(arg_uid_range == 0x10000U);
3578
3579 candidate = *shift;
3580
3581 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3582
3583 for (;;) {
fbd0b64f 3584 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 3585 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
3586
3587 if (--n_tries <= 0)
3588 return -EBUSY;
3589
87d5e4f2 3590 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
3591 goto next;
3592 if ((candidate & UINT32_C(0xFFFF)) != 0)
3593 goto next;
3594
3595 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3596 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3597 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3598 goto next;
3599 if (r < 0)
3600 return r;
3601
3602 /* Make some superficial checks whether the range is currently known in the user database */
3603 if (getpwuid(candidate))
3604 goto next;
3605 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3606 goto next;
3607 if (getgrgid(candidate))
3608 goto next;
3609 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3610 goto next;
3611
3612 *ret_lock_file = lf;
3613 lf = (struct LockFile) LOCK_FILE_INIT;
3614 *shift = candidate;
3615 return 0;
3616
3617 next:
d381c8a6
LP
3618 if (arg_machine && !tried_hashed) {
3619 /* Try to hash the base from the container name */
3620
3621 static const uint8_t hash_key[] = {
3622 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3623 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3624 };
3625
3626 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3627
3628 tried_hashed = true;
3629 } else
3630 random_bytes(&candidate, sizeof(candidate));
3631
87d5e4f2 3632 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
3633 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3634 }
3635}
3636
03cfe0d5 3637static int setup_uid_map(pid_t pid) {
fbd0b64f 3638 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
03cfe0d5
LP
3639 int r;
3640
3641 assert(pid > 1);
3642
3643 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3644 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
57512c89 3645 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3646 if (r < 0)
3647 return log_error_errno(r, "Failed to write UID map: %m");
3648
3649 /* We always assign the same UID and GID ranges */
3650 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
57512c89 3651 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3652 if (r < 0)
3653 return log_error_errno(r, "Failed to write GID map: %m");
3654
3655 return 0;
3656}
3657
9c1e04d0 3658static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3659 char buf[NOTIFY_BUFFER_MAX+1];
3660 char *p = NULL;
3661 struct iovec iovec = {
3662 .iov_base = buf,
3663 .iov_len = sizeof(buf)-1,
3664 };
3665 union {
3666 struct cmsghdr cmsghdr;
3667 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3668 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3669 } control = {};
3670 struct msghdr msghdr = {
3671 .msg_iov = &iovec,
3672 .msg_iovlen = 1,
3673 .msg_control = &control,
3674 .msg_controllen = sizeof(control),
3675 };
3676 struct cmsghdr *cmsg;
3677 struct ucred *ucred = NULL;
3678 ssize_t n;
3679 pid_t inner_child_pid;
3680 _cleanup_strv_free_ char **tags = NULL;
3681
3682 assert(userdata);
3683
3684 inner_child_pid = PTR_TO_PID(userdata);
3685
3686 if (revents != EPOLLIN) {
3687 log_warning("Got unexpected poll event for notify fd.");
3688 return 0;
3689 }
3690
3691 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3692 if (n < 0) {
3742095b 3693 if (IN_SET(errno, EAGAIN, EINTR))
9c1e04d0
AP
3694 return 0;
3695
3696 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3697 }
3698 cmsg_close_all(&msghdr);
3699
3700 CMSG_FOREACH(cmsg, &msghdr) {
3701 if (cmsg->cmsg_level == SOL_SOCKET &&
3702 cmsg->cmsg_type == SCM_CREDENTIALS &&
3703 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3704
3705 ucred = (struct ucred*) CMSG_DATA(cmsg);
3706 }
3707 }
3708
3709 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 3710 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
3711 return 0;
3712 }
3713
3714 if ((size_t) n >= sizeof(buf)) {
3715 log_warning("Received notify message exceeded maximum size. Ignoring.");
3716 return 0;
3717 }
3718
3719 buf[n] = 0;
3720 tags = strv_split(buf, "\n\r");
3721 if (!tags)
3722 return log_oom();
3723
3724 if (strv_find(tags, "READY=1"))
04f590a4 3725 (void) sd_notifyf(false, "READY=1\n");
9c1e04d0
AP
3726
3727 p = strv_find_startswith(tags, "STATUS=");
3728 if (p)
04f590a4 3729 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
3730
3731 return 0;
3732}
3733
5773024d 3734static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 3735 int r;
9c1e04d0 3736
5773024d 3737 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
3738 if (r < 0)
3739 return log_error_errno(r, "Failed to allocate notify event source: %m");
3740
5773024d 3741 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
3742
3743 return 0;
3744}
3745
5d961407
LP
3746static int merge_settings(Settings *settings, const char *path) {
3747 int rl;
f757855e 3748
5d961407
LP
3749 assert(settings);
3750 assert(path);
f757855e 3751
5d961407
LP
3752 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3753 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 3754
7732f92b
LP
3755 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3756 settings->start_mode >= 0) {
3757 arg_start_mode = settings->start_mode;
130d3d22 3758 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
3759 }
3760
a2f577fc
JL
3761 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
3762 arg_ephemeral = settings->ephemeral;
3763
de40a303
LP
3764 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
3765 settings->root) {
3766
3767 if (!arg_settings_trusted)
3768 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
3769 else
3770 free_and_replace(arg_directory, settings->root);
3771 }
3772
b53ede69
PW
3773 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3774 settings->pivot_root_new) {
3775 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3776 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3777 }
3778
5f932eb9 3779 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
3780 settings->working_directory)
3781 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 3782
f757855e 3783 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
3784 settings->environment)
3785 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 3786
de40a303
LP
3787 if ((arg_settings_mask & SETTING_USER) == 0) {
3788
3789 if (settings->user)
3790 free_and_replace(arg_user, settings->user);
3791
3792 if (uid_is_valid(settings->uid))
3793 arg_uid = settings->uid;
3794 if (gid_is_valid(settings->gid))
3795 arg_gid = settings->gid;
3796 if (settings->n_supplementary_gids > 0) {
3797 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
3798 arg_n_supplementary_gids = settings->n_supplementary_gids;
3799 }
3800 }
f757855e
LP
3801
3802 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 3803 uint64_t plus, minus;
7be830c6 3804 uint64_t network_minus = 0;
f757855e 3805
de40a303
LP
3806 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
3807 * Settings structure */
3808
0e265674 3809 plus = settings->capability;
a3fc6b55
LP
3810 minus = settings->drop_capability;
3811
3812 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
3813 if (settings_private_network(settings))
3814 plus |= UINT64_C(1) << CAP_NET_ADMIN;
3815 else
7be830c6 3816 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 3817 }
0e265674
LP
3818
3819 if (!arg_settings_trusted && plus != 0) {
3820 if (settings->capability != 0)
5d961407 3821 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
3822 } else {
3823 arg_caps_retain &= ~network_minus;
520e0d54 3824 arg_caps_retain |= plus;
7be830c6 3825 }
f757855e 3826
a3fc6b55 3827 arg_caps_retain &= ~minus;
de40a303
LP
3828
3829 /* Copy the full capabilities over too */
3830 if (capability_quintet_is_set(&settings->full_capabilities)) {
3831 if (!arg_settings_trusted)
5238e957 3832 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
3833 else
3834 arg_full_capabilities = settings->full_capabilities;
3835 }
f757855e
LP
3836 }
3837
3838 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3839 settings->kill_signal > 0)
3840 arg_kill_signal = settings->kill_signal;
3841
3842 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3843 settings->personality != PERSONALITY_INVALID)
3844 arg_personality = settings->personality;
3845
3846 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3847 !sd_id128_is_null(settings->machine_id)) {
3848
3849 if (!arg_settings_trusted)
5d961407 3850 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
3851 else
3852 arg_uuid = settings->machine_id;
3853 }
3854
3855 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3856 settings->read_only >= 0)
3857 arg_read_only = settings->read_only;
3858
3859 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3860 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3861 arg_volatile_mode = settings->volatile_mode;
3862
3863 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3864 settings->n_custom_mounts > 0) {
3865
3866 if (!arg_settings_trusted)
5d961407 3867 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
3868 else {
3869 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 3870 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 3871 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
3872 settings->n_custom_mounts = 0;
3873 }
3874 }
3875
3876 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3877 (settings->private_network >= 0 ||
3878 settings->network_veth >= 0 ||
3879 settings->network_bridge ||
22b28dfd 3880 settings->network_zone ||
f757855e
LP
3881 settings->network_interfaces ||
3882 settings->network_macvlan ||
f6d6bad1 3883 settings->network_ipvlan ||
de40a303
LP
3884 settings->network_veth_extra ||
3885 settings->network_namespace_path)) {
f757855e
LP
3886
3887 if (!arg_settings_trusted)
5d961407 3888 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 3889 else {
f6d6bad1 3890 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3891 arg_private_network = settings_private_network(settings);
3892
130d3d22
YW
3893 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3894 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3895 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3896 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 3897
1cc6c93a
YW
3898 free_and_replace(arg_network_bridge, settings->network_bridge);
3899 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
3900
3901 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
3902 }
3903 }
3904
3905 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3906 settings->expose_ports) {
3907
3908 if (!arg_settings_trusted)
5d961407 3909 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
3910 else {
3911 expose_port_free_all(arg_expose_ports);
1cc6c93a 3912 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
3913 }
3914 }
3915
0de7acce
LP
3916 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3917 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3918
3919 if (!arg_settings_trusted)
5d961407 3920 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
3921 else {
3922 arg_userns_mode = settings->userns_mode;
3923 arg_uid_shift = settings->uid_shift;
3924 arg_uid_range = settings->uid_range;
3925 arg_userns_chown = settings->userns_chown;
3926 }
3927 }
3928
9c1e04d0
AP
3929 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3930 arg_notify_ready = settings->notify_ready;
3931
960e4569
LP
3932 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3933
de40a303 3934 if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist))
5d961407 3935 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 3936 else {
130d3d22
YW
3937 strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3938 strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
960e4569 3939 }
de40a303
LP
3940
3941#if HAVE_SECCOMP
3942 if (!arg_settings_trusted && settings->seccomp)
3943 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
3944 else {
3945 seccomp_release(arg_seccomp);
3946 arg_seccomp = TAKE_PTR(settings->seccomp);
3947 }
3948#endif
960e4569
LP
3949 }
3950
bf428efb
LP
3951 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3952 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3953 continue;
3954
3955 if (!settings->rlimit[rl])
3956 continue;
3957
3958 if (!arg_settings_trusted) {
5d961407 3959 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
3960 continue;
3961 }
3962
3963 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3964 }
3965
3a9530e5
LP
3966 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3967 settings->hostname)
3968 free_and_replace(arg_hostname, settings->hostname);
3969
66edd963
LP
3970 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3971 settings->no_new_privileges >= 0)
3972 arg_no_new_privileges = settings->no_new_privileges;
3973
81f345df
LP
3974 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3975 settings->oom_score_adjust_set) {
3976
3977 if (!arg_settings_trusted)
5d961407 3978 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
3979 else {
3980 arg_oom_score_adjust = settings->oom_score_adjust;
3981 arg_oom_score_adjust_set = true;
3982 }
3983 }
3984
d107bb7d 3985 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 3986 settings->cpu_set.set) {
d107bb7d
LP
3987
3988 if (!arg_settings_trusted)
5d961407 3989 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 3990 else {
0985c7c4
ZJS
3991 cpu_set_reset(&arg_cpu_set);
3992 arg_cpu_set = settings->cpu_set;
3993 settings->cpu_set = (CPUSet) {};
d107bb7d
LP
3994 }
3995 }
3996
09d423e9
LP
3997 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
3998 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
3999 arg_resolv_conf = settings->resolv_conf;
4000
4e1d6aa9
LP
4001 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4002 settings->link_journal != _LINK_JOURNAL_INVALID) {
4003
4004 if (!arg_settings_trusted)
4005 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4006 else {
4007 arg_link_journal = settings->link_journal;
4008 arg_link_journal_try = settings->link_journal_try;
4009 }
4010 }
4011
1688841f
LP
4012 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4013 settings->timezone != _TIMEZONE_MODE_INVALID)
4014 arg_timezone = settings->timezone;
4015
de40a303
LP
4016 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4017 settings->slice) {
4018
4019 if (!arg_settings_trusted)
4020 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4021 else
4022 free_and_replace(arg_slice, settings->slice);
4023 }
4024
4025 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4026 settings->use_cgns >= 0) {
4027
4028 if (!arg_settings_trusted)
4029 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4030 else
4031 arg_use_cgns = settings->use_cgns;
4032 }
4033
4034 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4035 settings->clone_ns_flags != (unsigned long) -1) {
4036
4037 if (!arg_settings_trusted)
4038 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4039 else
4040 arg_clone_ns_flags = settings->clone_ns_flags;
4041 }
4042
4043 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4044 settings->console_mode >= 0) {
4045
4046 if (!arg_settings_trusted)
4047 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4048 else
4049 arg_console_mode = settings->console_mode;
4050 }
4051
4052 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4053 * don't consult arg_settings_mask for them. */
4054
4055 sd_bus_message_unref(arg_property_message);
4056 arg_property_message = TAKE_PTR(settings->properties);
4057
4058 arg_console_width = settings->console_width;
4059 arg_console_height = settings->console_height;
4060
b2645747 4061 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4062 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4063 arg_n_extra_nodes = settings->n_extra_nodes;
4064
f757855e
LP
4065 return 0;
4066}
4067
5d961407
LP
4068static int load_settings(void) {
4069 _cleanup_(settings_freep) Settings *settings = NULL;
4070 _cleanup_fclose_ FILE *f = NULL;
4071 _cleanup_free_ char *p = NULL;
4072 const char *fn, *i;
4073 int r;
4074
de40a303
LP
4075 if (arg_oci_bundle)
4076 return 0;
4077
5d961407
LP
4078 /* If all settings are masked, there's no point in looking for
4079 * the settings file */
4080 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
4081 return 0;
4082
4083 fn = strjoina(arg_machine, ".nspawn");
4084
4085 /* We first look in the admin's directories in /etc and /run */
4086 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4087 _cleanup_free_ char *j = NULL;
4088
657ee2d8 4089 j = path_join(i, fn);
5d961407
LP
4090 if (!j)
4091 return log_oom();
4092
4093 f = fopen(j, "re");
4094 if (f) {
4095 p = TAKE_PTR(j);
4096
4097 /* By default, we trust configuration from /etc and /run */
4098 if (arg_settings_trusted < 0)
4099 arg_settings_trusted = true;
4100
4101 break;
4102 }
4103
4104 if (errno != ENOENT)
4105 return log_error_errno(errno, "Failed to open %s: %m", j);
4106 }
4107
4108 if (!f) {
4109 /* After that, let's look for a file next to the
4110 * actual image we shall boot. */
4111
4112 if (arg_image) {
4113 p = file_in_same_dir(arg_image, fn);
4114 if (!p)
4115 return log_oom();
cd6e3914 4116 } else if (arg_directory && !path_equal(arg_directory, "/")) {
5d961407
LP
4117 p = file_in_same_dir(arg_directory, fn);
4118 if (!p)
4119 return log_oom();
4120 }
4121
4122 if (p) {
4123 f = fopen(p, "re");
4124 if (!f && errno != ENOENT)
4125 return log_error_errno(errno, "Failed to open %s: %m", p);
4126
4127 /* By default, we do not trust configuration from /var/lib/machines */
4128 if (arg_settings_trusted < 0)
4129 arg_settings_trusted = false;
4130 }
4131 }
4132
4133 if (!f)
4134 return 0;
4135
4136 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4137
4138 r = settings_load(f, p, &settings);
4139 if (r < 0)
4140 return r;
4141
4142 return merge_settings(settings, p);
4143}
4144
de40a303
LP
4145static int load_oci_bundle(void) {
4146 _cleanup_(settings_freep) Settings *settings = NULL;
4147 int r;
4148
4149 if (!arg_oci_bundle)
4150 return 0;
4151
4152 /* By default let's trust OCI bundles */
4153 if (arg_settings_trusted < 0)
4154 arg_settings_trusted = true;
4155
4156 r = oci_load(NULL, arg_oci_bundle, &settings);
4157 if (r < 0)
4158 return r;
4159
4160 return merge_settings(settings, arg_oci_bundle);
4161}
4162
3acc84eb 4163static int run_container(
2d845785 4164 DissectedImage *dissected_image,
b0067625
ZJS
4165 bool secondary,
4166 FDSet *fds,
4167 char veth_name[IFNAMSIZ], bool *veth_created,
4168 union in_addr_union *exposed,
3acc84eb 4169 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4170
4171 static const struct sigaction sa = {
4172 .sa_handler = nop_signal_handler,
e28c7cd0 4173 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4174 };
4175
8e766630 4176 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
4177 _cleanup_close_ int etc_passwd_lock = -1;
4178 _cleanup_close_pair_ int
4179 kmsg_socket_pair[2] = { -1, -1 },
4180 rtnl_socket_pair[2] = { -1, -1 },
4181 pid_socket_pair[2] = { -1, -1 },
4182 uuid_socket_pair[2] = { -1, -1 },
4183 notify_socket_pair[2] = { -1, -1 },
8199d554 4184 uid_shift_socket_pair[2] = { -1, -1 },
3acc84eb 4185 master_pty_socket_pair[2] = { -1, -1 },
8199d554
LP
4186 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4187
3acc84eb 4188 _cleanup_close_ int notify_socket = -1;
b0067625 4189 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4190 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4191 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4192 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4193 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4194 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
b0067625 4195 ContainerStatus container_status = 0;
b0067625
ZJS
4196 int ifi = 0, r;
4197 ssize_t l;
4198 sigset_t mask_chld;
d7bea6b6 4199 _cleanup_close_ int netns_fd = -1;
b0067625
ZJS
4200
4201 assert_se(sigemptyset(&mask_chld) == 0);
4202 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4203
4204 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4205 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4206 * check with getpwuid() if the specific user already exists. Note that /etc might be
4207 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4208 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4209 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4210 * really ours. */
4211
4212 etc_passwd_lock = take_etc_passwd_lock(NULL);
4213 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4214 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4215 }
4216
4217 r = barrier_create(&barrier);
4218 if (r < 0)
4219 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4220
4221 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4222 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4223
4224 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4225 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4226
4227 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4228 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4229
4230 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4231 return log_error_errno(errno, "Failed to create id socket pair: %m");
4232
4233 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4234 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4235
3acc84eb
FB
4236 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4237 return log_error_errno(errno, "Failed to create console socket pair: %m");
4238
b0067625
ZJS
4239 if (arg_userns_mode != USER_NAMESPACE_NO)
4240 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4241 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4242
8199d554
LP
4243 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4244 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4245 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4246
b0067625
ZJS
4247 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4248 * parent's blocking calls and give it a chance to call wait() and terminate. */
4249 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4250 if (r < 0)
4251 return log_error_errno(errno, "Failed to change the signal mask: %m");
4252
4253 r = sigaction(SIGCHLD, &sa, NULL);
4254 if (r < 0)
4255 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4256
d7bea6b6
DP
4257 if (arg_network_namespace_path) {
4258 netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4259 if (netns_fd < 0)
4260 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4261
4262 r = fd_is_network_ns(netns_fd);
6619ad88
LP
4263 if (r == -EUCLEAN)
4264 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4265 else if (r < 0)
d7bea6b6 4266 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4267 else if (r == 0)
4268 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4269 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4270 }
4271
b0067625
ZJS
4272 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4273 if (*pid < 0)
4274 return log_error_errno(errno, "clone() failed%s: %m",
4275 errno == EINVAL ?
4276 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4277
4278 if (*pid == 0) {
4279 /* The outer child only has a file system namespace. */
4280 barrier_set_role(&barrier, BARRIER_CHILD);
4281
b0067625
ZJS
4282 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4283 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4284 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4285 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4286 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3acc84eb 4287 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
b0067625 4288 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 4289 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
4290
4291 (void) reset_all_signal_handlers();
4292 (void) reset_signal_mask();
4293
4294 r = outer_child(&barrier,
4295 arg_directory,
2d845785 4296 dissected_image,
b0067625
ZJS
4297 secondary,
4298 pid_socket_pair[1],
4299 uuid_socket_pair[1],
4300 notify_socket_pair[1],
4301 kmsg_socket_pair[1],
4302 rtnl_socket_pair[1],
4303 uid_shift_socket_pair[1],
3acc84eb 4304 master_pty_socket_pair[1],
8199d554 4305 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6
DP
4306 fds,
4307 netns_fd);
b0067625
ZJS
4308 if (r < 0)
4309 _exit(EXIT_FAILURE);
4310
4311 _exit(EXIT_SUCCESS);
4312 }
4313
4314 barrier_set_role(&barrier, BARRIER_PARENT);
4315
e4077ff6 4316 fdset_close(fds);
b0067625
ZJS
4317
4318 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4319 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4320 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4321 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4322 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3acc84eb 4323 master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
b0067625 4324 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 4325 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
4326
4327 if (arg_userns_mode != USER_NAMESPACE_NO) {
4328 /* The child just let us know the UID shift it might have read from the image. */
4329 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4330 if (l < 0)
4331 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4332 if (l != sizeof arg_uid_shift)
4333 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4334
4335 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4336 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4337 * image, but if that's already in use, pick a new one, and report back to the child,
4338 * which one we now picked. */
4339
4340 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4341 if (r < 0)
4342 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4343
4344 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4345 if (l < 0)
4346 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4347 if (l != sizeof arg_uid_shift)
4348 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625
ZJS
4349 }
4350 }
4351
8199d554
LP
4352 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4353 /* The child let us know the support cgroup mode it might have read from the image. */
4354 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4355 if (l < 0)
4356 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113
LP
4357 if (l != sizeof(arg_unified_cgroup_hierarchy))
4358 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4359 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4360 }
4361
b0067625 4362 /* Wait for the outer child. */
d2e0ac3d
LP
4363 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4364 if (r < 0)
4365 return r;
4366 if (r != EXIT_SUCCESS)
4367 return -EIO;
b0067625
ZJS
4368
4369 /* And now retrieve the PID of the inner child. */
4370 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4371 if (l < 0)
4372 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4373 if (l != sizeof *pid)
4374 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4375
4376 /* We also retrieve container UUID in case it was generated by outer child */
4377 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4378 if (l < 0)
4379 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4380 if (l != sizeof(arg_uuid))
4381 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4382
4383 /* We also retrieve the socket used for notifications generated by outer child */
4384 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4385 if (notify_socket < 0)
4386 return log_error_errno(notify_socket,
4387 "Failed to receive notification socket from the outer child: %m");
4388
4389 log_debug("Init process invoked as PID "PID_FMT, *pid);
4390
4391 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4392 if (!barrier_place_and_sync(&barrier)) /* #1 */
4393 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625
ZJS
4394
4395 r = setup_uid_map(*pid);
4396 if (r < 0)
4397 return r;
4398
4399 (void) barrier_place(&barrier); /* #2 */
4400 }
4401
4402 if (arg_private_network) {
75116558
PS
4403 if (!arg_network_namespace_path) {
4404 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4405 if (!barrier_place_and_sync(&barrier)) /* #3 */
4406 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4407 }
4408
b0067625
ZJS
4409 r = move_network_interfaces(*pid, arg_network_interfaces);
4410 if (r < 0)
4411 return r;
4412
4413 if (arg_network_veth) {
4414 r = setup_veth(arg_machine, *pid, veth_name,
4415 arg_network_bridge || arg_network_zone);
4416 if (r < 0)
4417 return r;
4418 else if (r > 0)
4419 ifi = r;
4420
4421 if (arg_network_bridge) {
4422 /* Add the interface to a bridge */
4423 r = setup_bridge(veth_name, arg_network_bridge, false);
4424 if (r < 0)
4425 return r;
4426 if (r > 0)
4427 ifi = r;
4428 } else if (arg_network_zone) {
4429 /* Add the interface to a bridge, possibly creating it */
4430 r = setup_bridge(veth_name, arg_network_zone, true);
4431 if (r < 0)
4432 return r;
4433 if (r > 0)
4434 ifi = r;
4435 }
4436 }
4437
4438 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4439 if (r < 0)
4440 return r;
4441
4442 /* We created the primary and extra veth links now; let's remember this, so that we know to
4443 remove them later on. Note that we don't bother with removing veth links that were created
4444 here when their setup failed half-way, because in that case the kernel should be able to
4445 remove them on its own, since they cannot be referenced by anything yet. */
4446 *veth_created = true;
4447
4448 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4449 if (r < 0)
4450 return r;
4451
4452 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4453 if (r < 0)
4454 return r;
4455 }
4456
abdb9b08
LP
4457 if (arg_register || !arg_keep_unit) {
4458 r = sd_bus_default_system(&bus);
4459 if (r < 0)
4460 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
4461
4462 r = sd_bus_set_close_on_exit(bus, false);
4463 if (r < 0)
4464 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
4465 }
4466
4467 if (!arg_keep_unit) {
4468 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4469 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4470 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4471
75152a4d
LP
4472 r = sd_bus_match_signal_async(
4473 bus,
4474 NULL,
4475 "org.freedesktop.systemd1",
4476 NULL,
4477 "org.freedesktop.systemd1.Scope",
4478 "RequestStop",
4479 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 4480 if (r < 0)
75152a4d 4481 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
4482 }
4483
b0067625
ZJS
4484 if (arg_register) {
4485 r = register_machine(
abdb9b08 4486 bus,
b0067625
ZJS
4487 arg_machine,
4488 *pid,
4489 arg_directory,
4490 arg_uuid,
4491 ifi,
4492 arg_slice,
4493 arg_custom_mounts, arg_n_custom_mounts,
4494 arg_kill_signal,
4495 arg_property,
de40a303 4496 arg_property_message,
b0067625
ZJS
4497 arg_keep_unit,
4498 arg_container_service_name);
4499 if (r < 0)
4500 return r;
abdb9b08 4501
cd2dfc6f
LP
4502 } else if (!arg_keep_unit) {
4503 r = allocate_scope(
abdb9b08 4504 bus,
cd2dfc6f
LP
4505 arg_machine,
4506 *pid,
4507 arg_slice,
4508 arg_custom_mounts, arg_n_custom_mounts,
4509 arg_kill_signal,
de40a303
LP
4510 arg_property,
4511 arg_property_message);
cd2dfc6f
LP
4512 if (r < 0)
4513 return r;
4514
4515 } else if (arg_slice || arg_property)
4516 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 4517
27da7ef0 4518 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
4519 if (r < 0)
4520 return r;
4521
27da7ef0 4522 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
4523 if (r < 0)
4524 return r;
b0067625 4525
de54e02d 4526 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
4527 if (r < 0)
4528 return r;
4529
4530 /* Notify the child that the parent is ready with all
4531 * its setup (including cgroup-ification), and that
4532 * the child can now hand over control to the code to
4533 * run inside the container. */
75116558 4534 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
4535
4536 /* Block SIGCHLD here, before notifying child.
4537 * process_pty() will handle it with the other signals. */
4538 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4539
4540 /* Reset signal to default */
4541 r = default_signals(SIGCHLD, -1);
4542 if (r < 0)
4543 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4544
4545 r = sd_event_new(&event);
4546 if (r < 0)
4547 return log_error_errno(r, "Failed to get default event source: %m");
4548
8fd010bb
LP
4549 (void) sd_event_set_watchdog(event, true);
4550
abdb9b08
LP
4551 if (bus) {
4552 r = sd_bus_attach_event(bus, event, 0);
4553 if (r < 0)
4554 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4555 }
4556
5773024d 4557 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
4558 if (r < 0)
4559 return r;
4560
4561 /* Let the child know that we are ready and wait that the child is completely ready now. */
c6147113
LP
4562 if (!barrier_place_and_sync(&barrier)) /* #5 */
4563 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625
ZJS
4564
4565 /* At this point we have made use of the UID we picked, and thus nss-mymachines
4566 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4567 etc_passwd_lock = safe_close(etc_passwd_lock);
4568
04f590a4
LP
4569 (void) sd_notifyf(false,
4570 "STATUS=Container running.\n"
4571 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
b0067625 4572 if (!arg_notify_ready)
919f5ae0 4573 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
4574
4575 if (arg_kill_signal > 0) {
4576 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
4577 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4578 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
4579 } else {
4580 /* Immediately exit */
919f5ae0
LP
4581 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4582 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
4583 }
4584
6916b164 4585 /* Exit when the child exits */
919f5ae0 4586 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
4587
4588 if (arg_expose_ports) {
4589 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4590 if (r < 0)
4591 return r;
4592
4593 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4594 }
4595
4596 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4597
3acc84eb
FB
4598 if (arg_console_mode != CONSOLE_PIPE) {
4599 _cleanup_close_ int fd = -1;
4600 PTYForwardFlags flags = 0;
de40a303 4601
3acc84eb
FB
4602 /* Retrieve the master pty allocated by inner child */
4603 fd = receive_one_fd(master_pty_socket_pair[0], 0);
4604 if (fd < 0)
4605 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
4606
4607 switch (arg_console_mode) {
de40a303 4608
3acc84eb
FB
4609 case CONSOLE_READ_ONLY:
4610 flags |= PTY_FORWARD_READ_ONLY;
4611
4612 _fallthrough_;
4613
4614 case CONSOLE_INTERACTIVE:
4615 flags |= PTY_FORWARD_IGNORE_VHANGUP;
4616
4617 r = pty_forward_new(event, fd, flags, &forward);
4618 if (r < 0)
4619 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4620
4621 if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
4622 (void) pty_forward_set_width_height(forward,
4623 arg_console_width,
4624 arg_console_height);
4625 break;
4626
4627 default:
4628 assert(arg_console_mode == CONSOLE_PASSIVE);
4629 }
4630
4631 *master = TAKE_FD(fd);
de40a303 4632 }
b0067625
ZJS
4633
4634 r = sd_event_loop(event);
4635 if (r < 0)
4636 return log_error_errno(r, "Failed to run event loop: %m");
4637
de40a303
LP
4638 if (forward) {
4639 char last_char = 0;
b0067625 4640
de40a303
LP
4641 (void) pty_forward_get_last_char(forward, &last_char);
4642 forward = pty_forward_free(forward);
b0067625 4643
de40a303
LP
4644 if (!arg_quiet && last_char != '\n')
4645 putc('\n', stdout);
4646 }
b0067625
ZJS
4647
4648 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
4649 if (!arg_register && !arg_keep_unit && bus)
4650 terminate_scope(bus, arg_machine);
b0067625
ZJS
4651
4652 /* Normally redundant, but better safe than sorry */
c67b0082 4653 (void) kill(*pid, SIGKILL);
b0067625
ZJS
4654
4655 r = wait_for_container(*pid, &container_status);
4656 *pid = 0;
4657
0bb0a9fa
ZJS
4658 /* Tell machined that we are gone. */
4659 if (bus)
4660 (void) unregister_machine(bus, arg_machine);
4661
b0067625
ZJS
4662 if (r < 0)
4663 /* We failed to wait for the container, or the container exited abnormally. */
4664 return r;
4665 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
4666 /* r > 0 → The container exited with a non-zero status.
4667 * As a special case, we need to replace 133 with a different value,
4668 * because 133 is special-cased in the service file to reboot the container.
4669 * otherwise → The container exited with zero status and a reboot was not requested.
4670 */
2a49b612 4671 if (r == EXIT_FORCE_RESTART)
27e29a1e 4672 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 4673 *ret = r;
b0067625
ZJS
4674 return 0; /* finito */
4675 }
4676
4677 /* CONTAINER_REBOOTED, loop again */
4678
4679 if (arg_keep_unit) {
4680 /* Special handling if we are running as a service: instead of simply
4681 * restarting the machine we want to restart the entire service, so let's
4682 * inform systemd about this with the special exit code 133. The service
4683 * file uses RestartForceExitStatus=133 so that this results in a full
4684 * nspawn restart. This is necessary since we might have cgroup parameters
4685 * set we want to have flushed out. */
2a49b612
ZJS
4686 *ret = EXIT_FORCE_RESTART;
4687 return 0; /* finito */
b0067625
ZJS
4688 }
4689
4690 expose_port_flush(arg_expose_ports, exposed);
4691
4692 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4693 *veth_created = false;
4694 return 1; /* loop again */
4695}
4696
bf428efb 4697static int initialize_rlimits(void) {
bf428efb
LP
4698 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4699 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4700 * container execution environments. */
4701
4702 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4703 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4704 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4705 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4706 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4707 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4708 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4709 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4710 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4711 [RLIMIT_NICE] = { 0, 0 },
4712 [RLIMIT_NOFILE] = { 1024, 4096 },
4713 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4714 [RLIMIT_RTPRIO] = { 0, 0 },
4715 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4716 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4717
4718 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4719 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
4720 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
4721 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
4722 * that PID 1 changes a number of other resource limits during early initialization which is why we
4723 * don't read the other limits from PID 1 but prefer the static table above. */
4724 };
4725
4726 int rl;
4727
4728 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
4729 /* Let's only fill in what the user hasn't explicitly configured anyway */
4730 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4731 const struct rlimit *v;
4732 struct rlimit buffer;
4733
4734 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4735 /* For these two let's read the limits off PID 1. See above for an explanation. */
4736
4737 if (prlimit(1, rl, NULL, &buffer) < 0)
4738 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4739
4740 v = &buffer;
4741 } else
4742 v = kernel_defaults + rl;
4743
4744 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4745 if (!arg_rlimit[rl])
4746 return log_oom();
4747 }
4748
4749 if (DEBUG_LOGGING) {
4750 _cleanup_free_ char *k = NULL;
4751
4752 (void) rlimit_format(arg_rlimit[rl], &k);
4753 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4754 }
4755 }
4756
4757 return 0;
4758}
4759
44dbef90 4760static int run(int argc, char *argv[]) {
7bf011e3
LP
4761 bool secondary = false, remove_directory = false, remove_image = false,
4762 veth_created = false, remove_tmprootdir = false;
2d845785 4763 _cleanup_close_ int master = -1;
03cfe0d5 4764 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 4765 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 4766 char veth_name[IFNAMSIZ] = "";
03cfe0d5 4767 union in_addr_union exposed = {};
8e766630 4768 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082 4769 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 4770 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
4771 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4772 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
7bf011e3 4773 pid_t pid = 0;
03cfe0d5
LP
4774
4775 log_parse_environment();
4776 log_open();
415fc41c 4777
03cfe0d5
LP
4778 r = parse_argv(argc, argv);
4779 if (r <= 0)
4780 goto finish;
4781
fba868fa
LP
4782 r = must_be_root();
4783 if (r < 0)
03cfe0d5 4784 goto finish;
fba868fa 4785
bf428efb
LP
4786 r = initialize_rlimits();
4787 if (r < 0)
4788 goto finish;
4789
de40a303
LP
4790 r = load_oci_bundle();
4791 if (r < 0)
4792 goto finish;
4793
f757855e
LP
4794 r = determine_names();
4795 if (r < 0)
4796 goto finish;
4797
4798 r = load_settings();
4799 if (r < 0)
4800 goto finish;
4801
d4d99bc6 4802 r = cg_unified();
5eee8290
LP
4803 if (r < 0) {
4804 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
4805 goto finish;
4806 }
4807
f757855e
LP
4808 r = verify_arguments();
4809 if (r < 0)
4810 goto finish;
03cfe0d5 4811
49048684
ZJS
4812 /* Reapply environment settings. */
4813 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 4814
2949ff26
LP
4815 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
4816 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
4817 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
4818 (void) ignore_signals(SIGPIPE, -1);
4819
03cfe0d5
LP
4820 n_fd_passed = sd_listen_fds(false);
4821 if (n_fd_passed > 0) {
4822 r = fdset_new_listen_fds(&fds, false);
4823 if (r < 0) {
4824 log_error_errno(r, "Failed to collect file descriptors: %m");
4825 goto finish;
4826 }
4827 }
4828
83e803a9
ZJS
4829 /* The "default" umask. This is appropriate for most file and directory
4830 * operations performed by nspawn, and is the umask that will be used for
4831 * the child. Functions like copy_devnodes() change the umask temporarily. */
4832 umask(0022);
4833
03cfe0d5
LP
4834 if (arg_directory) {
4835 assert(!arg_image);
4836
b35ca61a
LP
4837 /* Safety precaution: let's not allow running images from the live host OS image, as long as
4838 * /var from the host will propagate into container dynamically (because bad things happen if
4839 * two systems write to the same /var). Let's allow it for the special cases where /var is
4840 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
4841 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
4842 log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
4843 r = -EINVAL;
4844 goto finish;
4845 }
4846
4847 if (arg_ephemeral) {
4848 _cleanup_free_ char *np = NULL;
4849
8d4aa2bb 4850 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
4851 if (r < 0)
4852 goto finish;
4853
7bf011e3
LP
4854 /* If the specified path is a mount point we generate the new snapshot immediately
4855 * inside it under a random name. However if the specified is not a mount point we
4856 * create the new snapshot in the parent directory, just next to it. */
e1873695 4857 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
4858 if (r < 0) {
4859 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4860 goto finish;
4861 }
4862 if (r > 0)
770b5ce4 4863 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4864 else
770b5ce4 4865 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 4866 if (r < 0) {
0f3be6ca 4867 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
4868 goto finish;
4869 }
4870
6992459c
LP
4871 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
4872 * only owned by us and noone else. */
4873 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
4874 if (r < 0) {
4875 log_error_errno(r, "Failed to lock %s: %m", np);
4876 goto finish;
4877 }
4878
7bf011e3
LP
4879 {
4880 BLOCK_SIGNALS(SIGINT);
4881 r = btrfs_subvol_snapshot(arg_directory, np,
4882 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4883 BTRFS_SNAPSHOT_FALLBACK_COPY |
4884 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4885 BTRFS_SNAPSHOT_RECURSIVE |
4886 BTRFS_SNAPSHOT_QUOTA |
4887 BTRFS_SNAPSHOT_SIGINT);
4888 }
4889 if (r == -EINTR) {
4890 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
4891 goto finish;
4892 }
03cfe0d5
LP
4893 if (r < 0) {
4894 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4895 goto finish;
ec16945e
LP
4896 }
4897
1cc6c93a 4898 free_and_replace(arg_directory, np);
17cbb288 4899 remove_directory = true;
30535c16 4900 } else {
cb638b5e 4901 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
4902 if (r < 0)
4903 goto finish;
4904
30535c16
LP
4905 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4906 if (r == -EBUSY) {
4907 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4908 goto finish;
4909 }
4910 if (r < 0) {
4911 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 4912 goto finish;
30535c16
LP
4913 }
4914
4915 if (arg_template) {
8d4aa2bb 4916 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
4917 if (r < 0)
4918 goto finish;
4919
7bf011e3
LP
4920 {
4921 BLOCK_SIGNALS(SIGINT);
4922 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4923 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4924 BTRFS_SNAPSHOT_FALLBACK_COPY |
4925 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4926 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4927 BTRFS_SNAPSHOT_RECURSIVE |
4928 BTRFS_SNAPSHOT_QUOTA |
4929 BTRFS_SNAPSHOT_SIGINT);
4930 }
ff6c6cc1
LP
4931 if (r == -EEXIST)
4932 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4933 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
4934 else if (r == -EINTR) {
4935 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
4936 goto finish;
4937 } else if (r < 0) {
83521414 4938 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 4939 goto finish;
ff6c6cc1
LP
4940 } else
4941 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4942 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 4943 }
ec16945e
LP
4944 }
4945
7732f92b 4946 if (arg_start_mode == START_BOOT) {
a5201ed6 4947 const char *p;
c9fe05e0 4948
a5201ed6
LP
4949 if (arg_pivot_root_new)
4950 p = prefix_roota(arg_directory, arg_pivot_root_new);
4951 else
4952 p = arg_directory;
c9fe05e0
AR
4953
4954 if (path_is_os_tree(p) <= 0) {
4955 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 4956 r = -EINVAL;
1b9e5b12
LP
4957 goto finish;
4958 }
4959 } else {
c9fe05e0
AR
4960 const char *p, *q;
4961
a5201ed6
LP
4962 if (arg_pivot_root_new)
4963 p = prefix_roota(arg_directory, arg_pivot_root_new);
4964 else
4965 p = arg_directory;
c9fe05e0
AR
4966
4967 q = strjoina(p, "/usr/");
1b9e5b12 4968
c9fe05e0
AR
4969 if (laccess(q, F_OK) < 0) {
4970 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 4971 r = -EINVAL;
1b9e5b12 4972 goto finish;
1b9e5b12
LP
4973 }
4974 }
ec16945e 4975
6b9132a9 4976 } else {
ec16945e
LP
4977 assert(arg_image);
4978 assert(!arg_template);
4979
8d4aa2bb 4980 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
4981 if (r < 0)
4982 goto finish;
4983
0f3be6ca
LP
4984 if (arg_ephemeral) {
4985 _cleanup_free_ char *np = NULL;
4986
4987 r = tempfn_random(arg_image, "machine.", &np);
4988 if (r < 0) {
4989 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4990 goto finish;
4991 }
4992
6992459c
LP
4993 /* Always take an exclusive lock on our own ephemeral copy. */
4994 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
4995 if (r < 0) {
4996 r = log_error_errno(r, "Failed to create image lock: %m");
4997 goto finish;
4998 }
4999
7bf011e3
LP
5000 {
5001 BLOCK_SIGNALS(SIGINT);
5002 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
5003 }
5004 if (r == -EINTR) {
5005 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5006 goto finish;
5007 }
0f3be6ca
LP
5008 if (r < 0) {
5009 r = log_error_errno(r, "Failed to copy image file: %m");
5010 goto finish;
5011 }
5012
1cc6c93a 5013 free_and_replace(arg_image, np);
0f3be6ca
LP
5014 remove_image = true;
5015 } else {
5016 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5017 if (r == -EBUSY) {
5018 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5019 goto finish;
5020 }
5021 if (r < 0) {
5022 r = log_error_errno(r, "Failed to create image lock: %m");
5023 goto finish;
5024 }
4623e8e6 5025
78ebe980
LP
5026 if (!arg_root_hash) {
5027 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
5028 if (r < 0) {
5029 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
5030 goto finish;
5031 }
5032 }
30535c16
LP
5033 }
5034
c67b0082 5035 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5036 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5037 goto finish;
1b9e5b12 5038 }
6b9132a9 5039
c67b0082
LP
5040 remove_tmprootdir = true;
5041
5042 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5043 if (!arg_directory) {
5044 r = log_oom();
5045 goto finish;
6b9132a9 5046 }
88213476 5047
e08f94ac 5048 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, LO_FLAGS_PARTSCAN, &loop);
2d845785
LP
5049 if (r < 0) {
5050 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5051 goto finish;
5052 }
1b9e5b12 5053
4526113f 5054 r = dissect_image_and_warn(
e0f9e7bd 5055 loop->fd,
4526113f 5056 arg_image,
e0f9e7bd
LP
5057 arg_root_hash, arg_root_hash_size,
5058 DISSECT_IMAGE_REQUIRE_ROOT,
5059 &dissected_image);
2d845785 5060 if (r == -ENOPKG) {
4526113f 5061 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5062 log_notice("Note that the disk image needs to\n"
5063 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5064 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
5065 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
5066 " d) or contain a file system without a partition table\n"
5067 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5068 goto finish;
2d845785 5069 }
4526113f 5070 if (r < 0)
842f3b0f 5071 goto finish;
1b9e5b12 5072
4623e8e6
LP
5073 if (!arg_root_hash && dissected_image->can_verity)
5074 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
5075
5076 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
1b9e5b12
LP
5077 if (r < 0)
5078 goto finish;
0f3be6ca
LP
5079
5080 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5081 if (remove_image && unlink(arg_image) >= 0)
5082 remove_image = false;
842f3b0f 5083 }
842f3b0f 5084
86c0dd4a 5085 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5086 if (r < 0)
5087 goto finish;
5088
de40a303
LP
5089 if (arg_console_mode < 0)
5090 arg_console_mode =
5091 isatty(STDIN_FILENO) > 0 &&
5092 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5093
de40a303
LP
5094 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5095 arg_quiet = true;
a258bf26 5096
9c857b9d
LP
5097 if (!arg_quiet)
5098 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5099 arg_machine, arg_image ?: arg_directory);
5100
72c0a2c2 5101 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 5102
66edd963 5103 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
5104 r = log_error_errno(errno, "Failed to become subreaper: %m");
5105 goto finish;
5106 }
5107
d87be9b0 5108 for (;;) {
3acc84eb 5109 r = run_container(dissected_image,
44dbef90
LP
5110 secondary,
5111 fds,
5112 veth_name, &veth_created,
3acc84eb 5113 &exposed, &master,
44dbef90 5114 &pid, &ret);
b0067625 5115 if (r <= 0)
d87be9b0 5116 break;
d87be9b0 5117 }
88213476
LP
5118
5119finish:
04f590a4
LP
5120 (void) sd_notify(false,
5121 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5122 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5123
9444b1f2 5124 if (pid > 0)
c67b0082 5125 (void) kill(pid, SIGKILL);
88213476 5126
503546da 5127 /* Try to flush whatever is still queued in the pty */
6a0f896b 5128 if (master >= 0) {
1c876927 5129 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
5130 master = safe_close(master);
5131 }
5132
5133 if (pid > 0)
5134 (void) wait_for_terminate(pid, NULL);
503546da 5135
50ebcf6c
LP
5136 pager_close();
5137
17cbb288 5138 if (remove_directory && arg_directory) {
ec16945e
LP
5139 int k;
5140
17cbb288 5141 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5142 if (k < 0)
17cbb288 5143 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5144 }
5145
0f3be6ca
LP
5146 if (remove_image && arg_image) {
5147 if (unlink(arg_image) < 0)
5148 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5149 }
5150
c67b0082
LP
5151 if (remove_tmprootdir) {
5152 if (rmdir(tmprootdir) < 0)
5153 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5154 }
5155
785890ac
LP
5156 if (arg_machine) {
5157 const char *p;
5158
63c372cb 5159 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5160 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5161 }
5162
7a8f6325 5163 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
5164
5165 if (veth_created)
5166 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5167 (void) remove_bridge(arg_network_zone);
f757855e 5168
f757855e
LP
5169 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5170 expose_port_free_all(arg_expose_ports);
bf428efb 5171 rlimit_free_all(arg_rlimit);
b2645747 5172 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
6d0b55c2 5173
44dbef90
LP
5174 if (r < 0)
5175 return r;
5176
5177 return ret;
88213476 5178}
44dbef90
LP
5179
/* Instantiates the program's entry point around run() using the helper macro from main-func.h.
 * NOTE(review): going by the macro's name, a positive return value of run() is presumably used as the
 * process exit status, while a negative one is turned into a generic failure — confirm in main-func.h. */
DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);