]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
tree-wide: use SD_ID128_STRING_MAX where appropriate
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
8fe0087e
LP
14#include <sys/personality.h>
15#include <sys/prctl.h>
16#include <sys/types.h>
6916b164 17#include <sys/wait.h>
8fe0087e 18#include <unistd.h>
1b9e5b12 19
b053cd5f 20#include "sd-bus.h"
1f0cd86b 21#include "sd-daemon.h"
1f0cd86b 22#include "sd-id128.h"
8fe0087e 23
b5efdb8a 24#include "alloc-util.h"
8fe0087e
LP
25#include "barrier.h"
26#include "base-filesystem.h"
27#include "blkid-util.h"
28#include "btrfs-util.h"
b8ea7a6e 29#include "bus-error.h"
b053cd5f 30#include "bus-util.h"
8fe0087e 31#include "cap-list.h"
430f0182 32#include "capability-util.h"
04d391da 33#include "cgroup-util.h"
8fe0087e 34#include "copy.h"
d107bb7d 35#include "cpu-set-util.h"
4fc9982c 36#include "dev-setup.h"
2d845785 37#include "dissect-image.h"
8fe0087e 38#include "env-util.h"
3ffd4af2 39#include "fd-util.h"
842f3b0f 40#include "fdset.h"
a5c32cff 41#include "fileio.h"
f97b34a6 42#include "format-util.h"
f4f15635 43#include "fs-util.h"
1b9e5b12 44#include "gpt.h"
4623e8e6 45#include "hexdecoct.h"
8fe0087e 46#include "hostname-util.h"
910fd145 47#include "id128-util.h"
8fe0087e 48#include "log.h"
2d845785 49#include "loop-util.h"
8fe0087e 50#include "loopback-setup.h"
1b9cebf6 51#include "machine-image.h"
8fe0087e 52#include "macro.h"
44dbef90 53#include "main-func.h"
f5947a5e 54#include "missing_sched.h"
8fe0087e 55#include "mkdir.h"
4349cd7c 56#include "mount-util.h"
049af8ad 57#include "mountpoint-util.h"
0cb8e3d1 58#include "namespace-util.h"
8fe0087e 59#include "netlink-util.h"
07630cea 60#include "nspawn-cgroup.h"
3603efde 61#include "nspawn-def.h"
07630cea
LP
62#include "nspawn-expose-ports.h"
63#include "nspawn-mount.h"
64#include "nspawn-network.h"
de40a303 65#include "nspawn-oci.h"
7336138e 66#include "nspawn-patch-uid.h"
07630cea 67#include "nspawn-register.h"
910fd145 68#include "nspawn-seccomp.h"
07630cea
LP
69#include "nspawn-settings.h"
70#include "nspawn-setuid.h"
7732f92b 71#include "nspawn-stub-pid1.h"
d8b4d14d 72#include "nulstr-util.h"
d58ad743 73#include "os-util.h"
50ebcf6c 74#include "pager.h"
6bedfcbb 75#include "parse-util.h"
8fe0087e 76#include "path-util.h"
294bf0c3 77#include "pretty-print.h"
0b452006 78#include "process-util.h"
8fe0087e
LP
79#include "ptyfwd.h"
80#include "random-util.h"
8869a0b4 81#include "raw-clone.h"
bf428efb 82#include "rlimit-util.h"
8fe0087e 83#include "rm-rf.h"
de40a303
LP
84#if HAVE_SECCOMP
85#include "seccomp-util.h"
86#endif
68b02049 87#include "selinux-util.h"
8fe0087e 88#include "signal-util.h"
2583fbea 89#include "socket-util.h"
8fcde012 90#include "stat-util.h"
15a5e950 91#include "stdio-util.h"
5c828e66 92#include "string-table.h"
07630cea 93#include "string-util.h"
8fe0087e 94#include "strv.h"
de40a303 95#include "sysctl-util.h"
8fe0087e 96#include "terminal-util.h"
e4de7287 97#include "tmpfile-util.h"
affb60b1 98#include "umask-util.h"
43c3fb46 99#include "unit-name.h"
b1d4f8e1 100#include "user-util.h"
8fe0087e 101#include "util.h"
e9642be2 102
62b1e758
YW
/* Path of the static resolv.conf file; it is looked up below /lib instead of /usr/lib when
 * HAVE_SPLIT_USR is set. */
#if !HAVE_SPLIT_USR
#define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
#else
#define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
#endif

/* nspawn listens on the socket at NSPAWN_NOTIFY_SOCKET_PATH. The path is relative to the container.
 * The init process in the container can send messages to nspawn through it, following the
 * sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"

#define EXIT_FORCE_RESTART 133
115
113cea80
DH
/* Status values for a container; judging by the names, they tell a plain termination apart
 * from a reboot request (NOTE(review): the consumers are outside this chunk, confirm there). */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
120
88213476 121static char *arg_directory = NULL;
ec16945e 122static char *arg_template = NULL;
5f932eb9 123static char *arg_chdir = NULL;
b53ede69
PW
124static char *arg_pivot_root_new = NULL;
125static char *arg_pivot_root_old = NULL;
687d0825 126static char *arg_user = NULL;
de40a303
LP
127static uid_t arg_uid = UID_INVALID;
128static gid_t arg_gid = GID_INVALID;
129static gid_t* arg_supplementary_gids = NULL;
130static size_t arg_n_supplementary_gids = 0;
9444b1f2 131static sd_id128_t arg_uuid = {};
3a9530e5
LP
132static char *arg_machine = NULL; /* The name used by the host to refer to this */
133static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
134static const char *arg_selinux_context = NULL;
135static const char *arg_selinux_apifs_context = NULL;
de40a303 136static char *arg_slice = NULL;
ff01d048 137static bool arg_private_network = false;
bc2f673e 138static bool arg_read_only = false;
7732f92b 139static StartMode arg_start_mode = START_PID1;
ec16945e 140static bool arg_ephemeral = false;
57fb9fb5 141static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 142static bool arg_link_journal_try = false;
520e0d54 143static uint64_t arg_caps_retain =
50b52222
LP
144 (1ULL << CAP_AUDIT_CONTROL) |
145 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
146 (1ULL << CAP_CHOWN) |
147 (1ULL << CAP_DAC_OVERRIDE) |
148 (1ULL << CAP_DAC_READ_SEARCH) |
149 (1ULL << CAP_FOWNER) |
150 (1ULL << CAP_FSETID) |
151 (1ULL << CAP_IPC_OWNER) |
152 (1ULL << CAP_KILL) |
153 (1ULL << CAP_LEASE) |
154 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 155 (1ULL << CAP_MKNOD) |
5076f0cc
LP
156 (1ULL << CAP_NET_BIND_SERVICE) |
157 (1ULL << CAP_NET_BROADCAST) |
158 (1ULL << CAP_NET_RAW) |
5076f0cc 159 (1ULL << CAP_SETFCAP) |
50b52222 160 (1ULL << CAP_SETGID) |
5076f0cc
LP
161 (1ULL << CAP_SETPCAP) |
162 (1ULL << CAP_SETUID) |
163 (1ULL << CAP_SYS_ADMIN) |
50b52222 164 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
165 (1ULL << CAP_SYS_CHROOT) |
166 (1ULL << CAP_SYS_NICE) |
167 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 168 (1ULL << CAP_SYS_RESOURCE) |
50b52222 169 (1ULL << CAP_SYS_TTY_CONFIG);
de40a303 170static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
5a8af538 171static CustomMount *arg_custom_mounts = NULL;
88614c8a 172static size_t arg_n_custom_mounts = 0;
f4889f65 173static char **arg_setenv = NULL;
284c0b91 174static bool arg_quiet = false;
eb91eb18 175static bool arg_register = true;
89f7c846 176static bool arg_keep_unit = false;
aa28aefe 177static char **arg_network_interfaces = NULL;
c74e630d 178static char **arg_network_macvlan = NULL;
4bbfe7ad 179static char **arg_network_ipvlan = NULL;
69c79d3c 180static bool arg_network_veth = false;
f6d6bad1 181static char **arg_network_veth_extra = NULL;
f757855e 182static char *arg_network_bridge = NULL;
22b28dfd 183static char *arg_network_zone = NULL;
d7bea6b6 184static char *arg_network_namespace_path = NULL;
bb068de0 185static PagerFlags arg_pager_flags = 0;
050f7277 186static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 187static char *arg_image = NULL;
de40a303 188static char *arg_oci_bundle = NULL;
f757855e 189static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 190static ExposePort *arg_expose_ports = NULL;
f36933fe 191static char **arg_property = NULL;
de40a303 192static sd_bus_message *arg_property_message = NULL;
0de7acce 193static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 194static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 195static bool arg_userns_chown = false;
c6c8f6e2 196static int arg_kill_signal = 0;
5da38d07 197static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
198static SettingsMask arg_settings_mask = 0;
199static int arg_settings_trusted = -1;
200static char **arg_parameters = NULL;
6aadfa4c 201static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 202static bool arg_notify_ready = false;
5a8ff0e6 203static bool arg_use_cgns = true;
0c582db0 204static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 205static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
4623e8e6
LP
206static void *arg_root_hash = NULL;
207static size_t arg_root_hash_size = 0;
960e4569
LP
208static char **arg_syscall_whitelist = NULL;
209static char **arg_syscall_blacklist = NULL;
de40a303
LP
210#if HAVE_SECCOMP
211static scmp_filter_ctx arg_seccomp = NULL;
212#endif
bf428efb 213static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 214static bool arg_no_new_privileges = false;
81f345df
LP
215static int arg_oom_score_adjust = 0;
216static bool arg_oom_score_adjust_set = false;
0985c7c4 217static CPUSet arg_cpu_set = {};
09d423e9 218static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 219static TimezoneMode arg_timezone = TIMEZONE_AUTO;
de40a303
LP
220static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
221static DeviceNode* arg_extra_nodes = NULL;
222static size_t arg_n_extra_nodes = 0;
223static char **arg_sysctl = NULL;
224static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
88213476 225
6145bb4f
LP
226STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
227STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
228STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
229STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
230STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
231STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
232STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
233STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
234STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
235STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
236STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
237STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
238STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
239STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
240STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
241STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
242STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
243STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
244STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
245STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
246STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
247STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
248STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
249STATIC_DESTRUCTOR_REGISTER(arg_root_hash, freep);
250STATIC_DESTRUCTOR_REGISTER(arg_syscall_whitelist, strv_freep);
251STATIC_DESTRUCTOR_REGISTER(arg_syscall_blacklist, strv_freep);
252#if HAVE_SECCOMP
253STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
254#endif
0985c7c4 255STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
6145bb4f
LP
256STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
257
dce66ffe
ZJS
258static int handle_arg_console(const char *arg) {
259 if (streq(arg, "help")) {
260 puts("interactive\n"
261 "read-only\n"
262 "passive\n"
263 "pipe");
264 return 0;
265 }
266
267 if (streq(arg, "interactive"))
268 arg_console_mode = CONSOLE_INTERACTIVE;
269 else if (streq(arg, "read-only"))
270 arg_console_mode = CONSOLE_READ_ONLY;
271 else if (streq(arg, "passive"))
272 arg_console_mode = CONSOLE_PASSIVE;
273 else if (streq(arg, "pipe"))
274 arg_console_mode = CONSOLE_PIPE;
275 else
276 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
277
278 arg_settings_mask |= SETTING_CONSOLE_MODE;
279 return 1;
280}
281
37ec0fdd
LP
282static int help(void) {
283 _cleanup_free_ char *link = NULL;
284 int r;
285
bb068de0 286 (void) pager_open(arg_pager_flags);
50ebcf6c 287
37ec0fdd
LP
288 r = terminal_urlify_man("systemd-nspawn", "1", &link);
289 if (r < 0)
290 return log_oom();
291
25148653 292 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 293 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
294 " -h --help Show this help\n"
295 " --version Print version string\n"
69c79d3c 296 " -q --quiet Do not show status information\n"
bb068de0 297 " --no-pager Do not pipe output into a pager\n"
25148653
LP
298 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
299 "%3$sImage:%4$s\n"
1b9e5b12 300 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
301 " --template=PATH Initialize root directory from template directory,\n"
302 " if missing\n"
303 " -x --ephemeral Run container with snapshot of root directory, and\n"
304 " remove it after exit\n"
25e68fd3
LP
305 " -i --image=PATH Root file system disk image (or device node) for\n"
306 " the container\n"
de40a303 307 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
308 " --read-only Mount the root directory read-only\n"
309 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 310 " --root-hash=HASH Specify verity root hash for root disk image\n"
25148653
LP
311 " --pivot-root=PATH[:PATH]\n"
312 " Pivot root to given directory in the container\n\n"
313 "%3$sExecution:%4$s\n"
7732f92b 314 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 315 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 316 " --chdir=PATH Set working directory in the container\n"
25148653
LP
317 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
318 " -u --user=USER Run the command under specified user or UID\n"
319 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
320 " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
321 "%3$sSystem Identity:%4$s\n"
a8828ed9 322 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 323 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
324 " --uuid=UUID Set a specific machine UUID for the container\n\n"
325 "%3$sProperties:%4$s\n"
a8828ed9 326 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 327 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
328 " --register=BOOLEAN Register container as machine\n"
329 " --keep-unit Do not register a scope for the machine, reuse\n"
330 " the service unit nspawn is running in\n\n"
331 "%3$sUser Namespacing:%4$s\n"
90b4a64d 332 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 333 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 334 " Similar, but with user configured UID/GID range\n"
25148653
LP
335 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
336 "%3$sNetworking:%4$s\n"
69c79d3c
LP
337 " --private-network Disable network in container\n"
338 " --network-interface=INTERFACE\n"
339 " Assign an existing network interface to the\n"
340 " container\n"
c74e630d
LP
341 " --network-macvlan=INTERFACE\n"
342 " Create a macvlan network interface based on an\n"
343 " existing network interface to the container\n"
4bbfe7ad
TG
344 " --network-ipvlan=INTERFACE\n"
345 " Create a ipvlan network interface based on an\n"
346 " existing network interface to the container\n"
a8eaaee7 347 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 348 " and container\n"
f6d6bad1
LP
349 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
350 " Add an additional virtual Ethernet link between\n"
351 " host and container\n"
ab046dde 352 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
353 " Add a virtual Ethernet connection to the container\n"
354 " and attach it to an existing bridge on the host\n"
355 " --network-zone=NAME Similar, but attach the new interface to an\n"
356 " an automatically managed bridge interface\n"
d7bea6b6
DP
357 " --network-namespace-path=PATH\n"
358 " Set network namespace to the one represented by\n"
359 " the specified kernel namespace file node\n"
6d0b55c2 360 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
361 " Expose a container IP port on the host\n\n"
362 "%3$sSecurity:%4$s\n"
a8828ed9
DW
363 " --capability=CAP In addition to the default, retain specified\n"
364 " capability\n"
365 " --drop-capability=CAP Drop the specified capability from the default set\n"
f4e803c8 366 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
367 " --system-call-filter=LIST|~LIST\n"
368 " Permit/prohibit specific system calls\n"
25148653
LP
369 " -Z --selinux-context=SECLABEL\n"
370 " Set the SELinux security context to be used by\n"
371 " processes in the container\n"
372 " -L --selinux-apifs-context=SECLABEL\n"
373 " Set the SELinux security context to be used by\n"
374 " API/tmpfs file systems in the container\n\n"
375 "%3$sResources:%4$s\n"
bf428efb 376 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
377 " --oom-score-adjust=VALUE\n"
378 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
379 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
380 " --personality=ARCH Pick personality for this container\n\n"
25148653 381 "%3$sIntegration:%4$s\n"
09d423e9 382 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 383 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
384 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
385 " host, try-guest, try-host\n"
386 " -j Equivalent to --link-journal=try-guest\n\n"
387 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
388 " --bind=PATH[:PATH[:OPTIONS]]\n"
389 " Bind mount a file or directory from the host into\n"
a8828ed9 390 " the container\n"
5e5bfa6e
EY
391 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
392 " Similar, but creates a read-only bind mount\n"
de40a303
LP
393 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
394 " it\n"
06c17c39 395 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
396 " --overlay=PATH[:PATH...]:PATH\n"
397 " Create an overlay mount from the host to \n"
398 " the container\n"
399 " --overlay-ro=PATH[:PATH...]:PATH\n"
25148653
LP
400 " Similar, but creates a read-only overlay mount\n\n"
401 "%3$sInput/Output:%4$s\n"
de40a303
LP
402 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
403 " set up for the container.\n"
404 " -P --pipe Equivalent to --console=pipe\n"
25148653 405 "\nSee the %2$s for details.\n"
37ec0fdd
LP
406 , program_invocation_short_name
407 , link
37a92352
LP
408 , ansi_underline(), ansi_normal()
409 , ansi_highlight(), ansi_normal()
410 );
37ec0fdd
LP
411
412 return 0;
88213476
LP
413}
414
86c0dd4a 415static int custom_mount_check_all(void) {
88614c8a 416 size_t i;
5a8af538 417
5a8af538
LP
418 for (i = 0; i < arg_n_custom_mounts; i++) {
419 CustomMount *m = &arg_custom_mounts[i];
420
0de7acce 421 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
baaa35ad
ZJS
422 if (arg_userns_chown)
423 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
424 "--private-users-chown may not be combined with custom root mounts.");
425 else if (arg_uid_shift == UID_INVALID)
426 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
427 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 428 }
5a8af538
LP
429 }
430
431 return 0;
432}
433
8199d554 434static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 435 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 436 int r;
5da38d07 437
efdb0237 438 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
439
440 e = getenv(var);
441 if (!e) {
d5fc5b2f 442 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
443 var = "UNIFIED_CGROUP_HIERARCHY";
444 e = getenv(var);
c78c095b
ZJS
445 }
446
447 if (!isempty(e)) {
efdb0237
LP
448 r = parse_boolean(e);
449 if (r < 0)
c78c095b 450 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
451 if (r > 0)
452 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
453 else
454 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
455 }
456
8199d554
LP
457 return 0;
458}
459
460static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
461 int r;
462
75b0d8b8
ZJS
463 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
464 * in the image actually supports. */
b4cccbc1
LP
465 r = cg_all_unified();
466 if (r < 0)
467 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
468 if (r > 0) {
a8725a06
ZJS
469 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
470 * routine only detects 231, so we'll have a false negative here for 230. */
471 r = systemd_installation_has_version(directory, 230);
472 if (r < 0)
473 return log_error_errno(r, "Failed to determine systemd version in container: %m");
474 if (r > 0)
475 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
476 else
477 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 478 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
479 /* Mixed cgroup hierarchy support was added in 233 */
480 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
481 if (r < 0)
482 return log_error_errno(r, "Failed to determine systemd version in container: %m");
483 if (r > 0)
484 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
485 else
486 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
487 } else
5da38d07 488 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 489
8199d554
LP
490 log_debug("Using %s hierarchy for container.",
491 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
492 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
493
efdb0237
LP
494 return 0;
495}
496
8a99bd0c
ZJS
497static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
498 uint64_t mask = 0;
499 int r;
500
501 for (;;) {
502 _cleanup_free_ char *t = NULL;
503
504 r = extract_first_word(&spec, &t, ",", 0);
505 if (r < 0)
506 return log_error_errno(r, "Failed to parse capability %s.", t);
507 if (r == 0)
508 break;
509
510 if (streq(t, "help")) {
511 for (int i = 0; i < capability_list_length(); i++) {
512 const char *name;
513
514 name = capability_to_name(i);
515 if (name)
516 puts(name);
517 }
518
519 return 0; /* quit */
520 }
521
522 if (streq(t, "all"))
523 mask = (uint64_t) -1;
524 else {
525 r = capability_from_name(t);
526 if (r < 0)
527 return log_error_errno(r, "Failed to parse capability %s.", t);
528
529 mask |= 1ULL << r;
530 }
531 }
532
533 *ret_mask = mask;
534 return 1; /* continue */
535}
536
49048684 537static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
538 int r;
539
540 r = getenv_bool(name);
541 if (r == -ENXIO)
49048684 542 return 0;
0c582db0 543 if (r < 0)
49048684 544 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 545
0c582db0 546 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 547 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 548 return 0;
0c582db0
LB
549}
550
49048684 551static int parse_mount_settings_env(void) {
4f086aab 552 const char *e;
1099ceeb
LP
553 int r;
554
555 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
556 if (r < 0 && r != -ENXIO)
557 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
558 if (r >= 0)
559 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
560
561 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 562 if (streq_ptr(e, "network"))
4f086aab 563 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 564
49048684
ZJS
565 else if (e) {
566 r = parse_boolean(e);
567 if (r < 0)
568 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
569
570 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
571 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 572 }
4f086aab 573
49048684 574 return 0;
4f086aab
SU
575}
576
49048684 577static int parse_environment(void) {
d5455d2f
LP
578 const char *e;
579 int r;
580
49048684
ZJS
581 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
582 if (r < 0)
583 return r;
584 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
585 if (r < 0)
586 return r;
587 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
588 if (r < 0)
589 return r;
590 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
591 if (r < 0)
592 return r;
d5455d2f 593
49048684
ZJS
594 r = parse_mount_settings_env();
595 if (r < 0)
596 return r;
d5455d2f 597
489fae52
ZJS
598 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
599 * even if it is supported. If not supported, it has no effect. */
de40a303 600 if (!cg_ns_supported())
489fae52 601 arg_use_cgns = false;
de40a303
LP
602 else {
603 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
604 if (r < 0) {
605 if (r != -ENXIO)
49048684 606 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
607
608 arg_use_cgns = true;
609 } else {
610 arg_use_cgns = r > 0;
611 arg_settings_mask |= SETTING_USE_CGNS;
612 }
613 }
d5455d2f
LP
614
615 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
616 if (e)
617 arg_container_service_name = e;
618
49048684 619 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
620}
621
88213476 622static int parse_argv(int argc, char *argv[]) {
a41fe3a2 623 enum {
acbeb427
ZJS
624 ARG_VERSION = 0x100,
625 ARG_PRIVATE_NETWORK,
bc2f673e 626 ARG_UUID,
5076f0cc 627 ARG_READ_ONLY,
57fb9fb5 628 ARG_CAPABILITY,
420c7379 629 ARG_DROP_CAPABILITY,
17fe0523
LP
630 ARG_LINK_JOURNAL,
631 ARG_BIND,
f4889f65 632 ARG_BIND_RO,
06c17c39 633 ARG_TMPFS,
5a8af538
LP
634 ARG_OVERLAY,
635 ARG_OVERLAY_RO,
de40a303 636 ARG_INACCESSIBLE,
eb91eb18 637 ARG_SHARE_SYSTEM,
89f7c846 638 ARG_REGISTER,
aa28aefe 639 ARG_KEEP_UNIT,
69c79d3c 640 ARG_NETWORK_INTERFACE,
c74e630d 641 ARG_NETWORK_MACVLAN,
4bbfe7ad 642 ARG_NETWORK_IPVLAN,
ab046dde 643 ARG_NETWORK_BRIDGE,
22b28dfd 644 ARG_NETWORK_ZONE,
f6d6bad1 645 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 646 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 647 ARG_PERSONALITY,
4d9f07b4 648 ARG_VOLATILE,
ec16945e 649 ARG_TEMPLATE,
f36933fe 650 ARG_PROPERTY,
6dac160c 651 ARG_PRIVATE_USERS,
c6c8f6e2 652 ARG_KILL_SIGNAL,
f757855e 653 ARG_SETTINGS,
5f932eb9 654 ARG_CHDIR,
b53ede69 655 ARG_PIVOT_ROOT,
7336138e 656 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 657 ARG_NOTIFY_READY,
4623e8e6 658 ARG_ROOT_HASH,
960e4569 659 ARG_SYSTEM_CALL_FILTER,
bf428efb 660 ARG_RLIMIT,
3a9530e5 661 ARG_HOSTNAME,
66edd963 662 ARG_NO_NEW_PRIVILEGES,
81f345df 663 ARG_OOM_SCORE_ADJUST,
d107bb7d 664 ARG_CPU_AFFINITY,
09d423e9 665 ARG_RESOLV_CONF,
1688841f 666 ARG_TIMEZONE,
de40a303
LP
667 ARG_CONSOLE,
668 ARG_PIPE,
669 ARG_OCI_BUNDLE,
bb068de0 670 ARG_NO_PAGER,
a41fe3a2
LP
671 };
672
88213476 673 static const struct option options[] = {
d7bea6b6
DP
674 { "help", no_argument, NULL, 'h' },
675 { "version", no_argument, NULL, ARG_VERSION },
676 { "directory", required_argument, NULL, 'D' },
677 { "template", required_argument, NULL, ARG_TEMPLATE },
678 { "ephemeral", no_argument, NULL, 'x' },
679 { "user", required_argument, NULL, 'u' },
680 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
681 { "as-pid2", no_argument, NULL, 'a' },
682 { "boot", no_argument, NULL, 'b' },
683 { "uuid", required_argument, NULL, ARG_UUID },
684 { "read-only", no_argument, NULL, ARG_READ_ONLY },
685 { "capability", required_argument, NULL, ARG_CAPABILITY },
686 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 687 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
688 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
689 { "bind", required_argument, NULL, ARG_BIND },
690 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
691 { "tmpfs", required_argument, NULL, ARG_TMPFS },
692 { "overlay", required_argument, NULL, ARG_OVERLAY },
693 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 694 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 695 { "machine", required_argument, NULL, 'M' },
3a9530e5 696 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
697 { "slice", required_argument, NULL, 'S' },
698 { "setenv", required_argument, NULL, 'E' },
699 { "selinux-context", required_argument, NULL, 'Z' },
700 { "selinux-apifs-context", required_argument, NULL, 'L' },
701 { "quiet", no_argument, NULL, 'q' },
702 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
703 { "register", required_argument, NULL, ARG_REGISTER },
704 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
705 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
706 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
707 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
708 { "network-veth", no_argument, NULL, 'n' },
709 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
710 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
711 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
712 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
713 { "personality", required_argument, NULL, ARG_PERSONALITY },
714 { "image", required_argument, NULL, 'i' },
715 { "volatile", optional_argument, NULL, ARG_VOLATILE },
716 { "port", required_argument, NULL, 'p' },
717 { "property", required_argument, NULL, ARG_PROPERTY },
718 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
719 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
720 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
721 { "settings", required_argument, NULL, ARG_SETTINGS },
722 { "chdir", required_argument, NULL, ARG_CHDIR },
723 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
724 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
725 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
726 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 727 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 728 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 729 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 730 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 731 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
732 { "console", required_argument, NULL, ARG_CONSOLE },
733 { "pipe", no_argument, NULL, ARG_PIPE },
734 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 735 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
eb9da376 736 {}
88213476
LP
737 };
738
9444b1f2 739 int c, r;
a42c8b54 740 uint64_t plus = 0, minus = 0;
f757855e 741 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
742
743 assert(argc >= 0);
744 assert(argv);
745
de40a303 746 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
747 switch (c) {
748
749 case 'h':
37ec0fdd 750 return help();
88213476 751
acbeb427 752 case ARG_VERSION:
3f6fd1ba 753 return version();
acbeb427 754
88213476 755 case 'D':
0f03c2a4 756 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 757 if (r < 0)
0f03c2a4 758 return r;
de40a303
LP
759
760 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
761 break;
762
763 case ARG_TEMPLATE:
0f03c2a4 764 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 765 if (r < 0)
0f03c2a4 766 return r;
de40a303
LP
767
768 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
769 break;
770
1b9e5b12 771 case 'i':
0f03c2a4 772 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 773 if (r < 0)
0f03c2a4 774 return r;
de40a303
LP
775
776 arg_settings_mask |= SETTING_DIRECTORY;
777 break;
778
779 case ARG_OCI_BUNDLE:
780 r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
781 if (r < 0)
782 return r;
783
ec16945e
LP
784 break;
785
786 case 'x':
787 arg_ephemeral = true;
a2f577fc 788 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
789 break;
790
687d0825 791 case 'u':
2fc09a9c
DM
792 r = free_and_strdup(&arg_user, optarg);
793 if (r < 0)
7027ff61 794 return log_oom();
687d0825 795
f757855e 796 arg_settings_mask |= SETTING_USER;
687d0825
MV
797 break;
798
22b28dfd
LP
799 case ARG_NETWORK_ZONE: {
800 char *j;
801
b910cc72 802 j = strjoin("vz-", optarg);
22b28dfd
LP
803 if (!j)
804 return log_oom();
805
806 if (!ifname_valid(j)) {
807 log_error("Network zone name not valid: %s", j);
808 free(j);
809 return -EINVAL;
810 }
811
df1fac6d 812 free_and_replace(arg_network_zone, j);
22b28dfd
LP
813
814 arg_network_veth = true;
815 arg_private_network = true;
816 arg_settings_mask |= SETTING_NETWORK;
817 break;
818 }
819
ab046dde 820 case ARG_NETWORK_BRIDGE:
ef76dff2 821
baaa35ad
ZJS
822 if (!ifname_valid(optarg))
823 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
824 "Bridge interface name not valid: %s", optarg);
ef76dff2 825
f757855e
LP
826 r = free_and_strdup(&arg_network_bridge, optarg);
827 if (r < 0)
828 return log_oom();
ab046dde 829
4831981d 830 _fallthrough_;
0dfaa006 831 case 'n':
69c79d3c
LP
832 arg_network_veth = true;
833 arg_private_network = true;
f757855e 834 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
835 break;
836
f6d6bad1
LP
837 case ARG_NETWORK_VETH_EXTRA:
838 r = veth_extra_parse(&arg_network_veth_extra, optarg);
839 if (r < 0)
840 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
841
842 arg_private_network = true;
843 arg_settings_mask |= SETTING_NETWORK;
844 break;
845
aa28aefe 846 case ARG_NETWORK_INTERFACE:
baaa35ad
ZJS
847 if (!ifname_valid(optarg))
848 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
849 "Network interface name not valid: %s", optarg);
ef76dff2 850
c74e630d
LP
851 if (strv_extend(&arg_network_interfaces, optarg) < 0)
852 return log_oom();
853
854 arg_private_network = true;
f757855e 855 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
856 break;
857
858 case ARG_NETWORK_MACVLAN:
ef76dff2 859
baaa35ad
ZJS
860 if (!ifname_valid(optarg))
861 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
862 "MACVLAN network interface name not valid: %s", optarg);
ef76dff2 863
c74e630d 864 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
865 return log_oom();
866
4bbfe7ad 867 arg_private_network = true;
f757855e 868 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
869 break;
870
871 case ARG_NETWORK_IPVLAN:
ef76dff2 872
baaa35ad
ZJS
873 if (!ifname_valid(optarg))
874 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
875 "IPVLAN network interface name not valid: %s", optarg);
ef76dff2 876
4bbfe7ad
TG
877 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
878 return log_oom();
879
4831981d 880 _fallthrough_;
ff01d048
LP
881 case ARG_PRIVATE_NETWORK:
882 arg_private_network = true;
f757855e 883 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
884 break;
885
d7bea6b6
DP
886 case ARG_NETWORK_NAMESPACE_PATH:
887 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
888 if (r < 0)
889 return r;
890
de40a303 891 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
892 break;
893
0f0dbc46 894 case 'b':
baaa35ad
ZJS
895 if (arg_start_mode == START_PID2)
896 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
897 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
898
899 arg_start_mode = START_BOOT;
900 arg_settings_mask |= SETTING_START_MODE;
901 break;
902
903 case 'a':
baaa35ad
ZJS
904 if (arg_start_mode == START_BOOT)
905 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
906 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
907
908 arg_start_mode = START_PID2;
909 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
910 break;
911
144f0fc0 912 case ARG_UUID:
9444b1f2 913 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
914 if (r < 0)
915 return log_error_errno(r, "Invalid UUID: %s", optarg);
916
baaa35ad
ZJS
917 if (sd_id128_is_null(arg_uuid))
918 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
919 "Machine UUID may not be all zeroes.");
f757855e
LP
920
921 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 922 break;
aa96c6cb 923
43c3fb46
LP
924 case 'S': {
925 _cleanup_free_ char *mangled = NULL;
926
927 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
928 if (r < 0)
929 return log_oom();
930
43c3fb46 931 free_and_replace(arg_slice, mangled);
de40a303 932 arg_settings_mask |= SETTING_SLICE;
144f0fc0 933 break;
43c3fb46 934 }
144f0fc0 935
7027ff61 936 case 'M':
c1521918 937 if (isempty(optarg))
97b11eed 938 arg_machine = mfree(arg_machine);
c1521918 939 else {
baaa35ad
ZJS
940 if (!machine_name_is_valid(optarg))
941 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
942 "Invalid machine name: %s", optarg);
7027ff61 943
0c3c4284
LP
944 r = free_and_strdup(&arg_machine, optarg);
945 if (r < 0)
eb91eb18 946 return log_oom();
eb91eb18 947 }
9ce6d1b3 948 break;
7027ff61 949
3a9530e5
LP
950 case ARG_HOSTNAME:
951 if (isempty(optarg))
952 arg_hostname = mfree(arg_hostname);
953 else {
baaa35ad
ZJS
954 if (!hostname_is_valid(optarg, false))
955 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
956 "Invalid hostname: %s", optarg);
3a9530e5
LP
957
958 r = free_and_strdup(&arg_hostname, optarg);
959 if (r < 0)
960 return log_oom();
961 }
962
963 arg_settings_mask |= SETTING_HOSTNAME;
964 break;
965
82adf6af
LP
966 case 'Z':
967 arg_selinux_context = optarg;
a8828ed9
DW
968 break;
969
82adf6af
LP
970 case 'L':
971 arg_selinux_apifs_context = optarg;
a8828ed9
DW
972 break;
973
bc2f673e
LP
974 case ARG_READ_ONLY:
975 arg_read_only = true;
f757855e 976 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
977 break;
978
420c7379
LP
979 case ARG_CAPABILITY:
980 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
981 uint64_t m;
982 r = parse_capability_spec(optarg, &m);
983 if (r <= 0)
984 return r;
5076f0cc 985
8a99bd0c
ZJS
986 if (c == ARG_CAPABILITY)
987 plus |= m;
988 else
989 minus |= m;
f757855e 990 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
991 break;
992 }
66edd963
LP
993 case ARG_NO_NEW_PRIVILEGES:
994 r = parse_boolean(optarg);
995 if (r < 0)
996 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
997
998 arg_no_new_privileges = r;
999 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1000 break;
1001
57fb9fb5
LP
1002 case 'j':
1003 arg_link_journal = LINK_GUEST;
574edc90 1004 arg_link_journal_try = true;
4e1d6aa9 1005 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1006 break;
1007
1008 case ARG_LINK_JOURNAL:
4e1d6aa9 1009 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1010 if (r < 0)
1011 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1012
4e1d6aa9 1013 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1014 break;
1015
17fe0523 1016 case ARG_BIND:
f757855e
LP
1017 case ARG_BIND_RO:
1018 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1019 if (r < 0)
1020 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1021
f757855e 1022 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1023 break;
06c17c39 1024
f757855e
LP
1025 case ARG_TMPFS:
1026 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1027 if (r < 0)
1028 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1029
f757855e 1030 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1031 break;
5a8af538
LP
1032
1033 case ARG_OVERLAY:
ad85779a
LP
1034 case ARG_OVERLAY_RO:
1035 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1036 if (r == -EADDRNOTAVAIL)
1037 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1038 if (r < 0)
1039 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1040
f757855e 1041 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1042 break;
06c17c39 1043
de40a303
LP
1044 case ARG_INACCESSIBLE:
1045 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1046 if (r < 0)
1047 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1048
1049 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1050 break;
1051
a5f1cb3b 1052 case 'E': {
f4889f65
LP
1053 char **n;
1054
baaa35ad
ZJS
1055 if (!env_assignment_is_valid(optarg))
1056 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1057 "Environment variable assignment '%s' is not valid.", optarg);
f4889f65
LP
1058
1059 n = strv_env_set(arg_setenv, optarg);
1060 if (!n)
1061 return log_oom();
1062
130d3d22 1063 strv_free_and_replace(arg_setenv, n);
f757855e 1064 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
1065 break;
1066 }
1067
284c0b91
LP
1068 case 'q':
1069 arg_quiet = true;
1070 break;
1071
8a96d94e 1072 case ARG_SHARE_SYSTEM:
a6b5216c 1073 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1074 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1075 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1076 arg_clone_ns_flags = 0;
8a96d94e
LP
1077 break;
1078
eb91eb18
LP
1079 case ARG_REGISTER:
1080 r = parse_boolean(optarg);
1081 if (r < 0) {
1082 log_error("Failed to parse --register= argument: %s", optarg);
1083 return r;
1084 }
1085
1086 arg_register = r;
1087 break;
1088
89f7c846
LP
1089 case ARG_KEEP_UNIT:
1090 arg_keep_unit = true;
1091 break;
1092
6afc95b7
LP
1093 case ARG_PERSONALITY:
1094
ac45f971 1095 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1096 if (arg_personality == PERSONALITY_INVALID)
1097 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1098 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1099
f757855e 1100 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1101 break;
1102
4d9f07b4
LP
1103 case ARG_VOLATILE:
1104
1105 if (!optarg)
f757855e 1106 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1107 else if (streq(optarg, "help")) {
1108 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1109 return 0;
1110 } else {
f757855e 1111 VolatileMode m;
4d9f07b4 1112
f757855e 1113 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1114 if (m < 0)
1115 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1116 "Failed to parse --volatile= argument: %s", optarg);
1117 else
f757855e 1118 arg_volatile_mode = m;
6d0b55c2
LP
1119 }
1120
f757855e
LP
1121 arg_settings_mask |= SETTING_VOLATILE_MODE;
1122 break;
6d0b55c2 1123
f757855e
LP
1124 case 'p':
1125 r = expose_port_parse(&arg_expose_ports, optarg);
1126 if (r == -EEXIST)
1127 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1128 if (r < 0)
1129 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1130
f757855e 1131 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1132 break;
6d0b55c2 1133
f36933fe
LP
1134 case ARG_PROPERTY:
1135 if (strv_extend(&arg_property, optarg) < 0)
1136 return log_oom();
1137
1138 break;
1139
ae209204
ZJS
1140 case ARG_PRIVATE_USERS: {
1141 int boolean = -1;
0de7acce 1142
ae209204
ZJS
1143 if (!optarg)
1144 boolean = true;
1145 else if (!in_charset(optarg, DIGITS))
1146 /* do *not* parse numbers as booleans */
1147 boolean = parse_boolean(optarg);
1148
1149 if (boolean == false) {
0de7acce
LP
1150 /* no: User namespacing off */
1151 arg_userns_mode = USER_NAMESPACE_NO;
1152 arg_uid_shift = UID_INVALID;
1153 arg_uid_range = UINT32_C(0x10000);
ae209204 1154 } else if (boolean == true) {
0de7acce
LP
1155 /* yes: User namespacing on, UID range is read from root dir */
1156 arg_userns_mode = USER_NAMESPACE_FIXED;
1157 arg_uid_shift = UID_INVALID;
1158 arg_uid_range = UINT32_C(0x10000);
1159 } else if (streq(optarg, "pick")) {
1160 /* pick: User namespacing on, UID range is picked randomly */
1161 arg_userns_mode = USER_NAMESPACE_PICK;
1162 arg_uid_shift = UID_INVALID;
1163 arg_uid_range = UINT32_C(0x10000);
1164 } else {
6c2058b3 1165 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1166 const char *range, *shift;
1167
0de7acce
LP
1168 /* anything else: User namespacing on, UID range is explicitly configured */
1169
6dac160c
LP
1170 range = strchr(optarg, ':');
1171 if (range) {
6c2058b3
ZJS
1172 buffer = strndup(optarg, range - optarg);
1173 if (!buffer)
1174 return log_oom();
1175 shift = buffer;
6dac160c
LP
1176
1177 range++;
bfd292ec
ZJS
1178 r = safe_atou32(range, &arg_uid_range);
1179 if (r < 0)
be715731 1180 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1181 } else
1182 shift = optarg;
1183
be715731
ZJS
1184 r = parse_uid(shift, &arg_uid_shift);
1185 if (r < 0)
1186 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1187
1188 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
1189 }
1190
baaa35ad
ZJS
1191 if (arg_uid_range <= 0)
1192 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1193 "UID range cannot be 0.");
be715731 1194
0de7acce 1195 arg_settings_mask |= SETTING_USERNS;
6dac160c 1196 break;
ae209204 1197 }
6dac160c 1198
0de7acce 1199 case 'U':
ccabee0d
LP
1200 if (userns_supported()) {
1201 arg_userns_mode = USER_NAMESPACE_PICK;
1202 arg_uid_shift = UID_INVALID;
1203 arg_uid_range = UINT32_C(0x10000);
1204
1205 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1206 }
1207
7336138e
LP
1208 break;
1209
0de7acce 1210 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1211 arg_userns_chown = true;
0de7acce
LP
1212
1213 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1214 break;
1215
c6c8f6e2 1216 case ARG_KILL_SIGNAL:
5c828e66
LP
1217 if (streq(optarg, "help")) {
1218 DUMP_STRING_TABLE(signal, int, _NSIG);
1219 return 0;
1220 }
1221
29a3db75 1222 arg_kill_signal = signal_from_string(optarg);
baaa35ad
ZJS
1223 if (arg_kill_signal < 0)
1224 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1225 "Cannot parse signal: %s", optarg);
c6c8f6e2 1226
f757855e
LP
1227 arg_settings_mask |= SETTING_KILL_SIGNAL;
1228 break;
1229
1230 case ARG_SETTINGS:
1231
1232 /* no → do not read files
1233 * yes → read files, do not override cmdline, trust only subset
1234 * override → read files, override cmdline, trust only subset
1235 * trusted → read files, do not override cmdline, trust all
1236 */
1237
1238 r = parse_boolean(optarg);
1239 if (r < 0) {
1240 if (streq(optarg, "trusted")) {
1241 mask_all_settings = false;
1242 mask_no_settings = false;
1243 arg_settings_trusted = true;
1244
1245 } else if (streq(optarg, "override")) {
1246 mask_all_settings = false;
1247 mask_no_settings = true;
1248 arg_settings_trusted = -1;
1249 } else
1250 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1251 } else if (r > 0) {
1252 /* yes */
1253 mask_all_settings = false;
1254 mask_no_settings = false;
1255 arg_settings_trusted = -1;
1256 } else {
1257 /* no */
1258 mask_all_settings = true;
1259 mask_no_settings = false;
1260 arg_settings_trusted = false;
1261 }
1262
c6c8f6e2
LP
1263 break;
1264
5f932eb9 1265 case ARG_CHDIR:
baaa35ad
ZJS
1266 if (!path_is_absolute(optarg))
1267 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1268 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1269
1270 r = free_and_strdup(&arg_chdir, optarg);
1271 if (r < 0)
1272 return log_oom();
1273
1274 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1275 break;
1276
b53ede69
PW
1277 case ARG_PIVOT_ROOT:
1278 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1279 if (r < 0)
1280 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1281
1282 arg_settings_mask |= SETTING_PIVOT_ROOT;
1283 break;
1284
9c1e04d0
AP
1285 case ARG_NOTIFY_READY:
1286 r = parse_boolean(optarg);
baaa35ad
ZJS
1287 if (r < 0)
1288 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1289 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1290 arg_notify_ready = r;
1291 arg_settings_mask |= SETTING_NOTIFY_READY;
1292 break;
1293
4623e8e6
LP
1294 case ARG_ROOT_HASH: {
1295 void *k;
1296 size_t l;
1297
1298 r = unhexmem(optarg, strlen(optarg), &k, &l);
1299 if (r < 0)
1300 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1301 if (l < sizeof(sd_id128_t)) {
4623e8e6 1302 free(k);
c6147113 1303 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
4623e8e6
LP
1304 }
1305
1306 free(arg_root_hash);
1307 arg_root_hash = k;
1308 arg_root_hash_size = l;
1309 break;
1310 }
1311
960e4569
LP
1312 case ARG_SYSTEM_CALL_FILTER: {
1313 bool negative;
1314 const char *items;
1315
1316 negative = optarg[0] == '~';
1317 items = negative ? optarg + 1 : optarg;
1318
1319 for (;;) {
1320 _cleanup_free_ char *word = NULL;
1321
1322 r = extract_first_word(&items, &word, NULL, 0);
1323 if (r == 0)
1324 break;
1325 if (r == -ENOMEM)
1326 return log_oom();
1327 if (r < 0)
1328 return log_error_errno(r, "Failed to parse system call filter: %m");
1329
1330 if (negative)
1331 r = strv_extend(&arg_syscall_blacklist, word);
1332 else
1333 r = strv_extend(&arg_syscall_whitelist, word);
1334 if (r < 0)
1335 return log_oom();
1336 }
1337
1338 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1339 break;
1340 }
1341
bf428efb
LP
1342 case ARG_RLIMIT: {
1343 const char *eq;
622ecfa8 1344 _cleanup_free_ char *name = NULL;
bf428efb
LP
1345 int rl;
1346
5c828e66
LP
1347 if (streq(optarg, "help")) {
1348 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1349 return 0;
1350 }
1351
bf428efb 1352 eq = strchr(optarg, '=');
baaa35ad
ZJS
1353 if (!eq)
1354 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1355 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1356
1357 name = strndup(optarg, eq - optarg);
1358 if (!name)
1359 return log_oom();
1360
1361 rl = rlimit_from_string_harder(name);
baaa35ad
ZJS
1362 if (rl < 0)
1363 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1364 "Unknown resource limit: %s", name);
bf428efb
LP
1365
1366 if (!arg_rlimit[rl]) {
1367 arg_rlimit[rl] = new0(struct rlimit, 1);
1368 if (!arg_rlimit[rl])
1369 return log_oom();
1370 }
1371
1372 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1373 if (r < 0)
1374 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1375
1376 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1377 break;
1378 }
1379
81f345df
LP
1380 case ARG_OOM_SCORE_ADJUST:
1381 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1382 if (r < 0)
1383 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1384
1385 arg_oom_score_adjust_set = true;
1386 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1387 break;
1388
d107bb7d 1389 case ARG_CPU_AFFINITY: {
0985c7c4 1390 CPUSet cpuset;
d107bb7d
LP
1391
1392 r = parse_cpu_set(optarg, &cpuset);
1393 if (r < 0)
0985c7c4 1394 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1395
0985c7c4
ZJS
1396 cpu_set_reset(&arg_cpu_set);
1397 arg_cpu_set = cpuset;
d107bb7d
LP
1398 arg_settings_mask |= SETTING_CPU_AFFINITY;
1399 break;
1400 }
1401
09d423e9
LP
1402 case ARG_RESOLV_CONF:
1403 if (streq(optarg, "help")) {
1404 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1405 return 0;
1406 }
1407
1408 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad
ZJS
1409 if (arg_resolv_conf < 0)
1410 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1411 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1412
1413 arg_settings_mask |= SETTING_RESOLV_CONF;
1414 break;
1415
1688841f
LP
1416 case ARG_TIMEZONE:
1417 if (streq(optarg, "help")) {
1418 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1419 return 0;
1420 }
1421
1422 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad
ZJS
1423 if (arg_timezone < 0)
1424 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1425 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1426
1427 arg_settings_mask |= SETTING_TIMEZONE;
1428 break;
1429
de40a303 1430 case ARG_CONSOLE:
dce66ffe
ZJS
1431 r = handle_arg_console(optarg);
1432 if (r <= 0)
1433 return r;
de40a303
LP
1434 break;
1435
1436 case 'P':
1437 case ARG_PIPE:
dce66ffe
ZJS
1438 r = handle_arg_console("pipe");
1439 if (r <= 0)
1440 return r;
de40a303
LP
1441 break;
1442
bb068de0
ZJS
1443 case ARG_NO_PAGER:
1444 arg_pager_flags |= PAGER_DISABLE;
1445 break;
1446
88213476
LP
1447 case '?':
1448 return -EINVAL;
1449
1450 default:
eb9da376 1451 assert_not_reached("Unhandled option");
88213476 1452 }
88213476 1453
60f1ec13
LP
1454 if (argc > optind) {
1455 strv_free(arg_parameters);
1456 arg_parameters = strv_copy(argv + optind);
1457 if (!arg_parameters)
1458 return log_oom();
d7bea6b6 1459
60f1ec13
LP
1460 arg_settings_mask |= SETTING_START_MODE;
1461 }
1462
1463 if (arg_ephemeral && arg_template && !arg_directory)
1464 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1465 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1466 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1467 * --directory=". */
1468 arg_directory = TAKE_PTR(arg_template);
1469
bd4b15f2 1470 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
60f1ec13 1471
de40a303 1472 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1473 r = parse_environment();
1474 if (r < 0)
1475 return r;
de40a303 1476
60f1ec13
LP
1477 /* Load all settings from .nspawn files */
1478 if (mask_no_settings)
1479 arg_settings_mask = 0;
1480
1481 /* Don't load any settings from .nspawn files */
1482 if (mask_all_settings)
1483 arg_settings_mask = _SETTINGS_MASK_ALL;
1484
1485 return 1;
1486}
1487
1488static int verify_arguments(void) {
1489 int r;
a6b5216c 1490
75b0d8b8
ZJS
1491 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1492 /* If we are running the stub init in the container, we don't need to look at what the init
1493 * in the container supports, because we are not using it. Let's immediately pick the right
1494 * setting based on the host system configuration.
1495 *
1496 * We only do this, if the user didn't use an environment variable to override the detection.
1497 */
1498
1499 r = cg_all_unified();
1500 if (r < 0)
1501 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1502 if (r > 0)
1503 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1504 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1505 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1506 else
1507 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1508 }
1509
4f086aab
SU
1510 if (arg_userns_mode != USER_NAMESPACE_NO)
1511 arg_mount_settings |= MOUNT_USE_USERNS;
1512
1513 if (arg_private_network)
1514 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1515
48a8d337
LB
1516 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1517 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1518 arg_register = false;
baaa35ad 1519 if (arg_start_mode != START_PID1)
60f1ec13 1520 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1521 }
eb91eb18 1522
0de7acce 1523 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1524 arg_userns_chown = true;
1525
60f1ec13
LP
1526 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1527 arg_kill_signal = SIGRTMIN+3;
1528
e5a4bb0d
LP
1529 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1530 arg_read_only = true;
1531
baaa35ad 1532 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1533 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1534 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1535 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1536
baaa35ad 1537 if (arg_directory && arg_image)
60f1ec13 1538 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1539
baaa35ad 1540 if (arg_template && arg_image)
60f1ec13 1541 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1542
baaa35ad 1543 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1544 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1545
baaa35ad 1546 if (arg_ephemeral && arg_template)
60f1ec13 1547 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1548
baaa35ad 1549 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1550 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1551
baaa35ad 1552 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1553 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1554
baaa35ad 1555 if (arg_userns_chown && arg_read_only)
de40a303
LP
1556 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1557 "--read-only and --private-users-chown may not be combined.");
f757855e 1558
e5a4bb0d
LP
1559 /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1560 * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
5238e957 1561 * copy-up (in case of overlay) making the entire exercise pointless. */
e5a4bb0d
LP
1562 if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1563 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1564
de40a303
LP
1565 /* If --network-namespace-path is given with any other network-related option, we need to error out,
1566 * to avoid conflicts between different network options. */
60f1ec13
LP
1567 if (arg_network_namespace_path &&
1568 (arg_network_interfaces || arg_network_macvlan ||
1569 arg_network_ipvlan || arg_network_veth_extra ||
1570 arg_network_bridge || arg_network_zone ||
1571 arg_network_veth || arg_private_network))
de40a303 1572 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1573
60f1ec13 1574 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1575 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1576 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1577
baaa35ad 1578 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1579 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1580
baaa35ad 1581 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1582 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1583
baaa35ad 1584 if (arg_expose_ports && !arg_private_network)
60f1ec13 1585 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1586
349cc4a5 1587#if ! HAVE_LIBIPTC
baaa35ad 1588 if (arg_expose_ports)
60f1ec13 1589 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1c1ea217
EV
1590#endif
1591
60f1ec13
LP
1592 r = custom_mount_check_all();
1593 if (r < 0)
1594 return r;
c6c8f6e2 1595
f757855e 1596 return 0;
88213476
LP
1597}
1598
03cfe0d5
LP
1599static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1600 assert(p);
1601
0de7acce 1602 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1603 return 0;
1604
1605 if (uid == UID_INVALID && gid == GID_INVALID)
1606 return 0;
1607
1608 if (uid != UID_INVALID) {
1609 uid += arg_uid_shift;
1610
1611 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1612 return -EOVERFLOW;
1613 }
1614
1615 if (gid != GID_INVALID) {
1616 gid += (gid_t) arg_uid_shift;
1617
1618 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1619 return -EOVERFLOW;
1620 }
1621
1622 if (lchown(p, uid, gid) < 0)
1623 return -errno;
b12afc8c
LP
1624
1625 return 0;
1626}
1627
03cfe0d5
LP
/* Creates the directory "path" below "root" with the given mode and then passes it to userns_lchown() with
 * uid/gid. If the directory already exists (-EEXIST) it returns 0 without touching ownership; any other
 * negative result of mkdir_errno_wrapper() is returned unchanged. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int k;

        full = prefix_roota(root, path);

        k = mkdir_errno_wrapper(full, mode);
        if (k < 0)
                return k == -EEXIST ? 0 : k;

        return userns_lchown(full, uid, gid);
}
1641
/* Matches path against the two zoneinfo directory prefixes ("../usr/share/zoneinfo/" and
 * "/usr/share/zoneinfo/") using PATH_STARTSWITH_SET() and returns that macro's result.
 * NOTE(review): presumably this is the remainder of path after the matched prefix, or NULL if neither
 * prefix matches — confirm against the macro definition. */
static const char *timezone_from_path(const char *path) {
        const char *tail;

        tail = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return tail;
}
1648
83205269
LP
1649static bool etc_writable(void) {
1650 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1651}
1652
e58a1277 1653static int setup_timezone(const char *dest) {
1688841f
LP
1654 _cleanup_free_ char *p = NULL, *etc = NULL;
1655 const char *where, *check;
1656 TimezoneMode m;
d4036145 1657 int r;
f8440af5 1658
e58a1277
LP
1659 assert(dest);
1660
1688841f 1661 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1662 r = readlink_malloc("/etc/localtime", &p);
1663 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1664 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1665 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1666 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1667 else if (r < 0) {
1668 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1669 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1670 * file.
1671 *
1672 * Example:
1673 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1674 */
1675 return 0;
1676 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1677 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1678 else
1679 m = arg_timezone;
1680 } else
1681 m = arg_timezone;
1682
1683 if (m == TIMEZONE_OFF)
1684 return 0;
1685
a5648b80 1686 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1687 if (r < 0) {
1688841f 1688 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1689 return 0;
1690 }
1691
1688841f
LP
1692 where = strjoina(etc, "/localtime");
1693
1694 switch (m) {
1695
1696 case TIMEZONE_DELETE:
1697 if (unlink(where) < 0)
1698 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1699
d4036145 1700 return 0;
d4036145 1701
1688841f
LP
1702 case TIMEZONE_SYMLINK: {
1703 _cleanup_free_ char *q = NULL;
1704 const char *z, *what;
4d1c38b8 1705
1688841f
LP
1706 z = timezone_from_path(p);
1707 if (!z) {
1708 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1709 return 0;
1688841f 1710 }
d4036145 1711
1688841f
LP
1712 r = readlink_malloc(where, &q);
1713 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1714 return 0; /* Already pointing to the right place? Then do nothing .. */
1715
1716 check = strjoina(dest, "/usr/share/zoneinfo/", z);
a5648b80 1717 r = chase_symlinks(check, dest, 0, NULL, NULL);
1688841f
LP
1718 if (r < 0)
1719 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1720 else {
1721 if (unlink(where) < 0 && errno != ENOENT) {
1722 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1723 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1724 return 0;
1725 }
1726
1727 what = strjoina("../usr/share/zoneinfo/", z);
1728 if (symlink(what, where) < 0) {
1729 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1730 errno, "Failed to correct timezone of container, ignoring: %m");
1731 return 0;
1732 }
1733
1734 break;
1735 }
1736
1737 _fallthrough_;
d4036145 1738 }
68fb0892 1739
1688841f
LP
1740 case TIMEZONE_BIND: {
1741 _cleanup_free_ char *resolved = NULL;
1742 int found;
1743
a5648b80 1744 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
1745 if (found < 0) {
1746 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1747 return 0;
1748 }
1749
1750 if (found == 0) /* missing? */
1751 (void) touch(resolved);
1752
1753 r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1754 if (r >= 0)
1755 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1756
1757 _fallthrough_;
79d80fc1 1758 }
4d9f07b4 1759
1688841f
LP
1760 case TIMEZONE_COPY:
1761 /* If mounting failed, try to copy */
8a016c74 1762 r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
1763 if (r < 0) {
1764 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1765 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1766 return 0;
1767 }
1768
1769 break;
1770
1771 default:
1772 assert_not_reached("unexpected mode");
d4036145 1773 }
e58a1277 1774
1688841f 1775 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
1776 r = userns_lchown(where, 0, 0);
1777 if (r < 0)
1688841f 1778 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 1779
e58a1277 1780 return 0;
88213476
LP
1781}
1782
09d423e9
LP
/* Probes whether a file exists at 'path' using access(F_OK).
 *
 * Returns 1 if access() succeeds, 0 if it fails with ENOENT, and otherwise whatever log_debug_errno()
 * returns for the errno that access() left behind. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        /* A missing file is an expected outcome and not worth a log message. */
        if (errno != ENOENT)
                return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);

        return 0;
}
1795
7357272e 1796static int resolved_listening(void) {
b8ea7a6e 1797 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 1798 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1799 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1800 int r;
1801
7357272e 1802 /* Check if resolved is listening */
b053cd5f
LP
1803
1804 r = sd_bus_open_system(&bus);
1805 if (r < 0)
b8ea7a6e 1806 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 1807
7357272e 1808 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
1809 if (r < 0)
1810 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1811 if (r == 0)
1812 return 0;
7357272e
DM
1813
1814 r = sd_bus_get_property_string(bus,
1815 "org.freedesktop.resolve1",
1816 "/org/freedesktop/resolve1",
1817 "org.freedesktop.resolve1.Manager",
1818 "DNSStubListener",
b8ea7a6e 1819 &error,
7357272e
DM
1820 &dns_stub_listener_mode);
1821 if (r < 0)
b8ea7a6e 1822 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
1823
1824 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
1825}
1826
2547bb41 1827static int setup_resolv_conf(const char *dest) {
09d423e9
LP
1828 _cleanup_free_ char *etc = NULL;
1829 const char *where, *what;
1830 ResolvConfMode m;
1831 int r;
2547bb41
LP
1832
1833 assert(dest);
1834
09d423e9
LP
1835 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1836 if (arg_private_network)
1837 m = RESOLV_CONF_OFF;
1838 else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
83205269 1839 m = etc_writable() ? RESOLV_CONF_COPY_STATIC : RESOLV_CONF_BIND_STATIC;
09d423e9 1840 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 1841 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 1842 else
83205269 1843 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
09d423e9
LP
1844 } else
1845 m = arg_resolv_conf;
1846
1847 if (m == RESOLV_CONF_OFF)
2547bb41
LP
1848 return 0;
1849
a5648b80 1850 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
1851 if (r < 0) {
1852 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1853 return 0;
1854 }
1855
1856 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
1857
1858 if (m == RESOLV_CONF_DELETE) {
1859 if (unlink(where) < 0)
1860 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1861
87447ae4
LP
1862 return 0;
1863 }
79d80fc1 1864
09d423e9
LP
1865 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_COPY_STATIC))
1866 what = STATIC_RESOLV_CONF;
1867 else
1868 what = "/etc/resolv.conf";
87447ae4 1869
09d423e9
LP
1870 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC)) {
1871 _cleanup_free_ char *resolved = NULL;
1872 int found;
1873
a5648b80 1874 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
09d423e9
LP
1875 if (found < 0) {
1876 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1877 return 0;
1878 }
3539724c 1879
87447ae4
LP
1880 if (found == 0) /* missing? */
1881 (void) touch(resolved);
5367354d 1882
09d423e9 1883 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 1884 if (r >= 0)
87447ae4 1885 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1886 }
1887
1888 /* If that didn't work, let's copy the file */
8a016c74 1889 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
79d80fc1 1890 if (r < 0) {
3539724c
LP
1891 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1892 * resolved or something similar runs inside and the symlink points there.
68a313c5 1893 *
3539724c 1894 * If the disk image is read-only, there's also no point in complaining.
68a313c5 1895 */
09d423e9 1896 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC) && IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1897 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1898 return 0;
1899 }
2547bb41 1900
03cfe0d5
LP
1901 r = userns_lchown(where, 0, 0);
1902 if (r < 0)
3539724c 1903 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1904
2547bb41
LP
1905 return 0;
1906}
1907
1e4f1671 1908static int setup_boot_id(void) {
cdde6ba6
LP
1909 _cleanup_(unlink_and_freep) char *from = NULL;
1910 _cleanup_free_ char *path = NULL;
3bbaff3e 1911 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 1912 const char *to;
04bc4a3f
LP
1913 int r;
1914
1eacc470 1915 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 1916
1eacc470 1917 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
1918 if (r < 0)
1919 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
1920
1921 r = sd_id128_randomize(&rnd);
f647962d
MS
1922 if (r < 0)
1923 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1924
cdde6ba6 1925 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
1926 if (r < 0)
1927 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1928
cdde6ba6
LP
1929 from = TAKE_PTR(path);
1930 to = "/proc/sys/kernel/random/boot_id";
1931
60e76d48 1932 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
1933 if (r < 0)
1934 return r;
04bc4a3f 1935
cdde6ba6 1936 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
1937}
1938
e58a1277 1939static int copy_devnodes(const char *dest) {
88213476
LP
1940 static const char devnodes[] =
1941 "null\0"
1942 "zero\0"
1943 "full\0"
1944 "random\0"
1945 "urandom\0"
85614d66
TG
1946 "tty\0"
1947 "net/tun\0";
88213476 1948
de40a303 1949 _cleanup_umask_ mode_t u;
88213476 1950 const char *d;
e58a1277 1951 int r = 0;
a258bf26
LP
1952
1953 assert(dest);
124640f1
LP
1954
1955 u = umask(0000);
88213476 1956
03cfe0d5
LP
1957 /* Create /dev/net, so that we can create /dev/net/tun in it */
1958 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1959 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1960
88213476 1961 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1962 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1963 struct stat st;
88213476 1964
c6134d3e 1965 from = path_join("/dev/", d);
8967f291
LP
1966 if (!from)
1967 return log_oom();
1968
c6134d3e 1969 to = path_join(dest, from);
8967f291
LP
1970 if (!to)
1971 return log_oom();
88213476
LP
1972
1973 if (stat(from, &st) < 0) {
1974
4a62c710
MS
1975 if (errno != ENOENT)
1976 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1977
baaa35ad
ZJS
1978 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
1979 return log_error_errno(SYNTHETIC_ERRNO(EIO),
1980 "%s is not a char or block device, cannot copy.", from);
1981 else {
8dfce114
LP
1982 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
1983
81f5049b 1984 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 1985 /* Explicitly warn the user when /dev is already populated. */
41eb4362 1986 if (errno == EEXIST)
8dbf71ec 1987 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
1988 if (errno != EPERM)
1989 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1990
8dfce114 1991 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
1992 r = touch(to);
1993 if (r < 0)
1994 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1995 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1996 if (r < 0)
1997 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1998 }
6278cf60 1999
03cfe0d5
LP
2000 r = userns_lchown(to, 0, 0);
2001 if (r < 0)
2002 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2003
657ee2d8 2004 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2005 if (!dn)
2006 return log_oom();
2007
2008 r = userns_mkdir(dest, dn, 0755, 0, 0);
2009 if (r < 0)
2010 return log_error_errno(r, "Failed to create '%s': %m", dn);
2011
2012 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2013 return log_oom();
2014
c6134d3e 2015 prefixed = path_join(dest, sl);
8dfce114
LP
2016 if (!prefixed)
2017 return log_oom();
2018
2d9b74ba 2019 t = path_join("..", d);
8dfce114
LP
2020 if (!t)
2021 return log_oom();
2022
2023 if (symlink(t, prefixed) < 0)
2024 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2025 }
88213476
LP
2026 }
2027
e58a1277
LP
2028 return r;
2029}
88213476 2030
de40a303
LP
2031static int make_extra_nodes(const char *dest) {
2032 _cleanup_umask_ mode_t u;
2033 size_t i;
2034 int r;
2035
2036 u = umask(0000);
2037
2038 for (i = 0; i < arg_n_extra_nodes; i++) {
2039 _cleanup_free_ char *path = NULL;
2040 DeviceNode *n = arg_extra_nodes + i;
2041
c6134d3e 2042 path = path_join(dest, n->path);
de40a303
LP
2043 if (!path)
2044 return log_oom();
2045
2046 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2047 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2048
2049 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2050 if (r < 0)
2051 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2052 }
2053
2054 return 0;
2055}
2056
03cfe0d5
LP
2057static int setup_pts(const char *dest) {
2058 _cleanup_free_ char *options = NULL;
2059 const char *p;
709f6e46 2060 int r;
03cfe0d5 2061
349cc4a5 2062#if HAVE_SELINUX
03cfe0d5
LP
2063 if (arg_selinux_apifs_context)
2064 (void) asprintf(&options,
3dce8915 2065 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2066 arg_uid_shift + TTY_GID,
2067 arg_selinux_apifs_context);
2068 else
2069#endif
2070 (void) asprintf(&options,
3dce8915 2071 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2072 arg_uid_shift + TTY_GID);
f2d88580 2073
03cfe0d5 2074 if (!options)
f2d88580
LP
2075 return log_oom();
2076
03cfe0d5 2077 /* Mount /dev/pts itself */
cc9fce65 2078 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
2079 r = mkdir_errno_wrapper(p, 0755);
2080 if (r < 0)
2081 return log_error_errno(r, "Failed to create /dev/pts: %m");
2082
60e76d48
ZJS
2083 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2084 if (r < 0)
2085 return r;
709f6e46
MS
2086 r = userns_lchown(p, 0, 0);
2087 if (r < 0)
2088 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2089
2090 /* Create /dev/ptmx symlink */
2091 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2092 if (symlink("pts/ptmx", p) < 0)
2093 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2094 r = userns_lchown(p, 0, 0);
2095 if (r < 0)
2096 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2097
03cfe0d5
LP
2098 /* And fix /dev/pts/ptmx ownership */
2099 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2100 r = userns_lchown(p, 0, 0);
2101 if (r < 0)
2102 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2103
f2d88580
LP
2104 return 0;
2105}
2106
3acc84eb
FB
/* Opens /dev/console and installs it as stdin, stdout and stderr of the calling process.
 *
 * Returns 0 on success, a negative value if opening the console, duplicating the log console or
 * rearranging the standard file descriptors fails. */
static int setup_stdio_as_dev_console(void) {
        int console_fd, r;

        console_fd = open_terminal("/dev/console", O_RDWR);
        if (console_fd < 0)
                return log_error_errno(console_fd, "Failed to open console: %m");

        /* Keep logging to the original stderr working, even after stderr has been redirected below */
        r = log_dup_console();
        if (r < 0)
                return log_error_errno(r, "Failed to duplicate stderr: %m");

        /* rearrange_stdio() invalidates 'console_fd' on success and failure alike, hence there is nothing
         * to close here afterwards */
        r = rearrange_stdio(console_fd, console_fd, console_fd);
        if (r < 0)
                return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");

        return 0;
}
88213476 2128
3acc84eb
FB
2129static int setup_dev_console(const char *console) {
2130 _cleanup_free_ char *p = NULL;
2131 int r;
a258bf26 2132
3acc84eb
FB
2133 /* Create /dev/console symlink */
2134 r = path_make_relative("/dev", console, &p);
81f5049b 2135 if (r < 0)
3acc84eb
FB
2136 return log_error_errno(r, "Failed to create relative path: %m");
2137
2138 if (symlink(p, "/dev/console") < 0)
2139 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2140
3acc84eb 2141 return 0;
e58a1277
LP
2142}
2143
8e5430c4
LP
2144static int setup_keyring(void) {
2145 key_serial_t keyring;
2146
2147 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
2148 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
2149 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
2150 * these system calls let's make sure we don't leak anything into the container. */
2151
2152 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2153 if (keyring == -1) {
2154 if (errno == ENOSYS)
2155 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2156 else if (IN_SET(errno, EACCES, EPERM))
2157 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2158 else
2159 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2160 }
2161
2162 return 0;
2163}
2164
1e4f1671 2165static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
2166 _cleanup_(unlink_and_freep) char *from = NULL;
2167 _cleanup_free_ char *fifo = NULL;
2168 _cleanup_close_ int fd = -1;
7fd1b19b 2169 _cleanup_umask_ mode_t u;
9ec5a93c 2170 int r;
e58a1277 2171
e58a1277 2172 assert(kmsg_socket >= 0);
a258bf26 2173
e58a1277 2174 u = umask(0000);
a258bf26 2175
1eacc470 2176 /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2177 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2178 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2179 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2180
1eacc470 2181 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2182 if (r < 0)
2183 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2184
9ec5a93c 2185 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2186 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2187
2188 from = TAKE_PTR(fifo);
9ec5a93c 2189
1eacc470 2190 r = mount_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2191 if (r < 0)
2192 return r;
e58a1277 2193
669fc4e5 2194 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2195 if (fd < 0)
2196 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2197
9ec5a93c 2198 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 2199 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
2200 if (r < 0)
2201 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2202
25ea79fe 2203 return 0;
88213476
LP
2204}
2205
1c4baffc 2206static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
2207 union in_addr_union *exposed = userdata;
2208
2209 assert(rtnl);
2210 assert(m);
2211 assert(exposed);
2212
7a8f6325 2213 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
2214 return 0;
2215}
2216
3a74cea5 2217static int setup_hostname(void) {
c818eef1 2218 int r;
3a74cea5 2219
0c582db0 2220 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2221 return 0;
2222
c818eef1
LP
2223 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2224 if (r < 0)
2225 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2226
7027ff61 2227 return 0;
3a74cea5
LP
2228}
2229
57fb9fb5 2230static int setup_journal(const char *directory) {
0f5e1382 2231 _cleanup_free_ char *d = NULL;
5905d7cf 2232 char id[SD_ID128_STRING_MAX];
b2238e38
LP
2233 const char *dirname, *p, *q;
2234 sd_id128_t this_id;
8054d749 2235 bool try;
57fb9fb5
LP
2236 int r;
2237
df9a75e4
LP
2238 /* Don't link journals in ephemeral mode */
2239 if (arg_ephemeral)
2240 return 0;
2241
8054d749
LP
2242 if (arg_link_journal == LINK_NO)
2243 return 0;
2244
2245 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2246
4d680aee 2247 r = sd_id128_get_machine(&this_id);
f647962d
MS
2248 if (r < 0)
2249 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2250
e01ff70a 2251 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2252 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 2253 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 2254 if (try)
4d680aee 2255 return 0;
df9a75e4 2256 return -EEXIST;
4d680aee
ZJS
2257 }
2258
369ca6da
ZJS
2259 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2260 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2261 if (r < 0) {
2262 bool ignore = r == -EROFS && try;
2263 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2264 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2265 return ignore ? 0 : r;
2266 }
2267 }
03cfe0d5 2268
e01ff70a
MS
2269 (void) sd_id128_to_string(arg_uuid, id);
2270
03cfe0d5
LP
2271 p = strjoina("/var/log/journal/", id);
2272 q = prefix_roota(directory, p);
27407a01 2273
e1873695 2274 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2275 if (try)
2276 return 0;
27407a01 2277
baaa35ad
ZJS
2278 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2279 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2280 }
2281
e1873695 2282 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2283 if (try)
2284 return 0;
57fb9fb5 2285
baaa35ad
ZJS
2286 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2287 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2288 }
2289
2290 r = readlink_and_make_absolute(p, &d);
2291 if (r >= 0) {
3742095b 2292 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2293 path_equal(d, q)) {
2294
03cfe0d5 2295 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2296 if (r < 0)
709f6e46 2297 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2298 return 0;
57fb9fb5
LP
2299 }
2300
4a62c710
MS
2301 if (unlink(p) < 0)
2302 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2303 } else if (r == -EINVAL) {
2304
2305 if (arg_link_journal == LINK_GUEST &&
2306 rmdir(p) < 0) {
2307
27407a01
ZJS
2308 if (errno == ENOTDIR) {
2309 log_error("%s already exists and is neither a symlink nor a directory", p);
2310 return r;
4314d33f
MS
2311 } else
2312 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2313 }
4314d33f
MS
2314 } else if (r != -ENOENT)
2315 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2316
2317 if (arg_link_journal == LINK_GUEST) {
2318
2319 if (symlink(q, p) < 0) {
8054d749 2320 if (try) {
56f64d95 2321 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2322 return 0;
4314d33f
MS
2323 } else
2324 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2325 }
2326
03cfe0d5 2327 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2328 if (r < 0)
709f6e46 2329 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2330 return 0;
57fb9fb5
LP
2331 }
2332
2333 if (arg_link_journal == LINK_HOST) {
ccddd104 2334 /* don't create parents here — if the host doesn't have
574edc90 2335 * permanent journal set up, don't force it here */
ba8e6c4d 2336
dae8b82e
ZJS
2337 r = mkdir_errno_wrapper(p, 0755);
2338 if (r < 0 && r != -EEXIST) {
8054d749 2339 if (try) {
dae8b82e 2340 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2341 return 0;
4314d33f 2342 } else
dae8b82e 2343 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2344 }
2345
27407a01
ZJS
2346 } else if (access(p, F_OK) < 0)
2347 return 0;
57fb9fb5 2348
cdb2b9d0
LP
2349 if (dir_is_empty(q) == 0)
2350 log_warning("%s is not empty, proceeding anyway.", q);
2351
03cfe0d5 2352 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2353 if (r < 0)
2354 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2355
60e76d48
ZJS
2356 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2357 if (r < 0)
4a62c710 2358 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2359
27407a01 2360 return 0;
57fb9fb5
LP
2361}
2362
de40a303
LP
2363static int drop_capabilities(uid_t uid) {
2364 CapabilityQuintet q;
2365
2366 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2367 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2368 * arg_caps_retain. */
2369
2370 if (capability_quintet_is_set(&arg_full_capabilities)) {
2371 q = arg_full_capabilities;
2372
2373 if (q.bounding == (uint64_t) -1)
2374 q.bounding = uid == 0 ? arg_caps_retain : 0;
2375
2376 if (q.effective == (uint64_t) -1)
2377 q.effective = uid == 0 ? q.bounding : 0;
2378
2379 if (q.inheritable == (uint64_t) -1)
2380 q.inheritable = uid == 0 ? q.bounding : 0;
2381
2382 if (q.permitted == (uint64_t) -1)
2383 q.permitted = uid == 0 ? q.bounding : 0;
2384
2385 if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
2386 q.ambient = 0;
f66ad460
AZ
2387
2388 if (capability_quintet_mangle(&q))
2389 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2390
2391 } else {
de40a303
LP
2392 q = (CapabilityQuintet) {
2393 .bounding = arg_caps_retain,
2394 .effective = uid == 0 ? arg_caps_retain : 0,
2395 .inheritable = uid == 0 ? arg_caps_retain : 0,
2396 .permitted = uid == 0 ? arg_caps_retain : 0,
2397 .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
2398 };
2399
f66ad460
AZ
2400 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2401 * in order to maintain the same behavior as systemd < 242. */
2402 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2403 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2404 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2405
2406 }
2407
de40a303 2408 return capability_quintet_enforce(&q);
88213476
LP
2409}
2410
db999e0f
LP
2411static int reset_audit_loginuid(void) {
2412 _cleanup_free_ char *p = NULL;
2413 int r;
2414
0c582db0 2415 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2416 return 0;
2417
2418 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2419 if (r == -ENOENT)
db999e0f 2420 return 0;
f647962d
MS
2421 if (r < 0)
2422 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2423
2424 /* Already reset? */
2425 if (streq(p, "4294967295"))
2426 return 0;
2427
57512c89 2428 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2429 if (r < 0) {
10a87006
LP
2430 log_error_errno(r,
2431 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2432 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2433 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2434 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2435 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2436
db999e0f 2437 sleep(5);
77b6e194 2438 }
db999e0f
LP
2439
2440 return 0;
77b6e194
LP
2441}
2442
785890ac
LP
2443static int setup_propagate(const char *root) {
2444 const char *p, *q;
709f6e46 2445 int r;
785890ac
LP
2446
2447 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2448 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2449 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2450 (void) mkdir_p(p, 0600);
2451
709f6e46
MS
2452 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2453 if (r < 0)
2454 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 2455
709f6e46
MS
2456 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2457 if (r < 0)
2458 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 2459
709f6e46
MS
2460 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2461 if (r < 0)
2462 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 2463
03cfe0d5 2464 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
2465 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2466 if (r < 0)
2467 return r;
785890ac 2468
60e76d48
ZJS
2469 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2470 if (r < 0)
2471 return r;
785890ac 2472
19caffac
AC
2473 /* machined will MS_MOVE into that directory, and that's only
2474 * supported for non-shared mounts. */
60e76d48 2475 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2476}
2477
317feb4d 2478static int setup_machine_id(const char *directory) {
691675ba
LP
2479 const char *etc_machine_id;
2480 sd_id128_t id;
3bbaff3e 2481 int r;
e01ff70a 2482
317feb4d
LP
2483 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2484 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2485 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2486 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2487 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2488 * container behaves nicely). */
2489
e01ff70a
MS
2490 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2491
691675ba 2492 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2493 if (r < 0) {
2494 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2495 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2496
317feb4d
LP
2497 if (sd_id128_is_null(arg_uuid)) {
2498 r = sd_id128_randomize(&arg_uuid);
2499 if (r < 0)
2500 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2501 }
2502 } else {
baaa35ad
ZJS
2503 if (sd_id128_is_null(id))
2504 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2505 "Machine ID in container image is zero, refusing.");
e01ff70a 2506
317feb4d
LP
2507 arg_uuid = id;
2508 }
691675ba 2509
e01ff70a
MS
2510 return 0;
2511}
2512
7336138e
LP
2513static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2514 int r;
2515
2516 assert(directory);
2517
0de7acce 2518 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2519 return 0;
2520
2521 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2522 if (r == -EOPNOTSUPP)
2523 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2524 if (r == -EBADE)
2525 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2526 if (r < 0)
2527 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2528 if (r == 0)
2529 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2530 else
2531 log_debug("Patched directory tree to match UID/GID range.");
2532
2533 return r;
2534}
2535
113cea80 2536/*
6d416b9c
LS
2537 * Return values:
2538 * < 0 : wait_for_terminate() failed to get the state of the
2539 * container, the container was terminated by a signal, or
2540 * failed for an unknown reason. No change is made to the
2541 * container argument.
2542 * > 0 : The program executed in the container terminated with an
2543 * error. The exit code of the program executed in the
919699ec
LP
2544 * container is returned. The container argument has been set
2545 * to CONTAINER_TERMINATED.
6d416b9c
LS
2546 * 0 : The container is being rebooted, has been shut down or exited
2547 * successfully. The container argument has been set to either
2548 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2549 *
6d416b9c
LS
2550 * That is, success is indicated by a return value of zero, and an
2551 * error is indicated by a non-zero value.
113cea80
DH
2552 */
2553static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2554 siginfo_t status;
919699ec 2555 int r;
113cea80
DH
2556
2557 r = wait_for_terminate(pid, &status);
f647962d
MS
2558 if (r < 0)
2559 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2560
2561 switch (status.si_code) {
fddbb89c 2562
113cea80 2563 case CLD_EXITED:
b5a2179b 2564 if (status.si_status == 0)
919699ec 2565 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2566 else
919699ec 2567 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2568
919699ec
LP
2569 *container = CONTAINER_TERMINATED;
2570 return status.si_status;
113cea80
DH
2571
2572 case CLD_KILLED:
2573 if (status.si_status == SIGINT) {
919699ec 2574 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2575 *container = CONTAINER_TERMINATED;
919699ec
LP
2576 return 0;
2577
113cea80 2578 } else if (status.si_status == SIGHUP) {
919699ec 2579 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2580 *container = CONTAINER_REBOOTED;
919699ec 2581 return 0;
113cea80 2582 }
919699ec 2583
4831981d 2584 _fallthrough_;
113cea80 2585 case CLD_DUMPED:
baaa35ad
ZJS
2586 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2587 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2588
2589 default:
baaa35ad
ZJS
2590 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2591 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2592 }
113cea80
DH
2593}
2594
023fb90b
LP
2595static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2596 pid_t pid;
2597
4a0b58c4 2598 pid = PTR_TO_PID(userdata);
023fb90b 2599 if (pid > 0) {
c6c8f6e2 2600 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2601 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2602 sd_event_source_set_userdata(s, NULL);
2603 return 0;
2604 }
2605 }
2606
2607 sd_event_exit(sd_event_source_get_event(s), 0);
2608 return 0;
2609}
2610
6916b164 2611static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2612 pid_t pid;
2613
2614 assert(s);
2615 assert(ssi);
2616
2617 pid = PTR_TO_PID(userdata);
2618
6916b164
AU
2619 for (;;) {
2620 siginfo_t si = {};
abdb9b08 2621
6916b164
AU
2622 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2623 return log_error_errno(errno, "Failed to waitid(): %m");
2624 if (si.si_pid == 0) /* No pending children. */
2625 break;
abdb9b08 2626 if (si.si_pid == pid) {
6916b164
AU
2627 /* The main process we care for has exited. Return from
2628 * signal handler but leave the zombie. */
2629 sd_event_exit(sd_event_source_get_event(s), 0);
2630 break;
2631 }
abdb9b08 2632
6916b164
AU
2633 /* Reap all other children. */
2634 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2635 }
2636
2637 return 0;
2638}
2639
abdb9b08
LP
2640static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2641 pid_t pid;
2642
2643 assert(m);
2644
2645 pid = PTR_TO_PID(userdata);
2646
2647 if (arg_kill_signal > 0) {
2648 log_info("Container termination requested. Attempting to halt container.");
2649 (void) kill(pid, arg_kill_signal);
2650 } else {
2651 log_info("Container termination requested. Exiting.");
2652 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2653 }
2654
2655 return 0;
2656}
2657
ec16945e 2658static int determine_names(void) {
1b9cebf6 2659 int r;
ec16945e 2660
c1521918
LP
2661 if (arg_template && !arg_directory && arg_machine) {
2662
2663 /* If --template= was specified then we should not
2664 * search for a machine, but instead create a new one
2665 * in /var/lib/machine. */
2666
657ee2d8 2667 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
2668 if (!arg_directory)
2669 return log_oom();
2670 }
2671
ec16945e 2672 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2673 if (arg_machine) {
2674 _cleanup_(image_unrefp) Image *i = NULL;
2675
5ef46e5f 2676 r = image_find(IMAGE_MACHINE, arg_machine, &i);
3a6ce860
LP
2677 if (r == -ENOENT)
2678 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2679 if (r < 0)
2680 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 2681
eb38edce 2682 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2683 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2684 else
0f03c2a4 2685 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2686 if (r < 0)
0f3be6ca 2687 return log_oom();
1b9cebf6 2688
aee327b8
LP
2689 if (!arg_ephemeral)
2690 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2691 } else {
2692 r = safe_getcwd(&arg_directory);
2693 if (r < 0)
2694 return log_error_errno(r, "Failed to determine current directory: %m");
2695 }
ec16945e 2696
c6147113
LP
2697 if (!arg_directory && !arg_image)
2698 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
2699 }
2700
2701 if (!arg_machine) {
b9ba4dab
LP
2702 if (arg_directory && path_equal(arg_directory, "/"))
2703 arg_machine = gethostname_malloc();
4827ab48
LP
2704 else {
2705 if (arg_image) {
2706 char *e;
2707
2708 arg_machine = strdup(basename(arg_image));
2709
2710 /* Truncate suffix if there is one */
2711 e = endswith(arg_machine, ".raw");
2712 if (e)
2713 *e = 0;
2714 } else
2715 arg_machine = strdup(basename(arg_directory));
2716 }
ec16945e
LP
2717 if (!arg_machine)
2718 return log_oom();
2719
ae691c1d 2720 hostname_cleanup(arg_machine);
c6147113
LP
2721 if (!machine_name_is_valid(arg_machine))
2722 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab
LP
2723
2724 if (arg_ephemeral) {
2725 char *b;
2726
2727 /* Add a random suffix when this is an
2728 * ephemeral machine, so that we can run many
2729 * instances at once without manually having
2730 * to specify -M each time. */
2731
2732 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2733 return log_oom();
2734
2735 free(arg_machine);
2736 arg_machine = b;
2737 }
ec16945e
LP
2738 }
2739
2740 return 0;
2741}
2742
/* Resolves the path stored in *p via chase_symlinks() (with the specified flags) and replaces the
 * string in place with the result. A NULL *p is left alone. Returns 0 on success, or a negative
 * errno-style value (already logged) if the path could not be resolved. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase_symlinks(*p, NULL, flags, &resolved, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                return free_and_replace(*p, resolved);
        }

        return 0;
}
2758
03cfe0d5 2759static int determine_uid_shift(const char *directory) {
6dac160c
LP
2760 int r;
2761
0de7acce 2762 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2763 arg_uid_shift = 0;
6dac160c 2764 return 0;
03cfe0d5 2765 }
6dac160c
LP
2766
2767 if (arg_uid_shift == UID_INVALID) {
2768 struct stat st;
2769
03cfe0d5 2770 r = stat(directory, &st);
6dac160c 2771 if (r < 0)
03cfe0d5 2772 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2773
2774 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2775
baaa35ad
ZJS
2776 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
2777 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2778 "UID and GID base of %s don't match.", directory);
6dac160c
LP
2779
2780 arg_uid_range = UINT32_C(0x10000);
2781 }
2782
baaa35ad
ZJS
2783 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
2784 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2785 "UID base too high for UID range.");
6dac160c 2786
6dac160c
LP
2787 return 0;
2788}
2789
de40a303
LP
2790static unsigned long effective_clone_ns_flags(void) {
2791 unsigned long flags = arg_clone_ns_flags;
2792
2793 if (arg_private_network)
2794 flags |= CLONE_NEWNET;
2795 if (arg_use_cgns)
2796 flags |= CLONE_NEWCGROUP;
2797 if (arg_userns_mode != USER_NAMESPACE_NO)
2798 flags |= CLONE_NEWUSER;
2799
2800 return flags;
2801}
2802
2803static int patch_sysctl(void) {
2804
2805 /* This table is inspired by runc's sysctl() function */
2806 static const struct {
2807 const char *key;
2808 bool prefix;
2809 unsigned long clone_flags;
2810 } safe_sysctl[] = {
2811 { "kernel.hostname", false, CLONE_NEWUTS },
2812 { "kernel.domainname", false, CLONE_NEWUTS },
2813 { "kernel.msgmax", false, CLONE_NEWIPC },
2814 { "kernel.msgmnb", false, CLONE_NEWIPC },
2815 { "kernel.msgmni", false, CLONE_NEWIPC },
2816 { "kernel.sem", false, CLONE_NEWIPC },
2817 { "kernel.shmall", false, CLONE_NEWIPC },
2818 { "kernel.shmmax", false, CLONE_NEWIPC },
2819 { "kernel.shmmni", false, CLONE_NEWIPC },
2820 { "fs.mqueue.", true, CLONE_NEWIPC },
2821 { "net.", true, CLONE_NEWNET },
2822 };
2823
2824 unsigned long flags;
2825 char **k, **v;
2826 int r;
2827
2828 flags = effective_clone_ns_flags();
2829
2830 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
2831 bool good = false;
2832 size_t i;
2833
2834 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
2835
2836 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
2837 continue;
2838
2839 if (safe_sysctl[i].prefix)
2840 good = startswith(*k, safe_sysctl[i].key);
2841 else
2842 good = streq(*k, safe_sysctl[i].key);
2843
2844 if (good)
2845 break;
2846 }
2847
c6147113
LP
2848 if (!good)
2849 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
2850
2851 r = sysctl_write(*k, *v);
2852 if (r < 0)
2853 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
2854 }
2855
2856 return 0;
2857}
2858
03cfe0d5
LP
2859static int inner_child(
2860 Barrier *barrier,
2861 const char *directory,
2862 bool secondary,
2863 int kmsg_socket,
2864 int rtnl_socket,
3acc84eb 2865 int master_pty_socket,
f757855e 2866 FDSet *fds) {
69c79d3c 2867
03cfe0d5 2868 _cleanup_free_ char *home = NULL;
b5ea030d 2869 char as_uuid[ID128_UUID_STRING_MAX];
88614c8a 2870 size_t n_env = 1;
03cfe0d5 2871 const char *envp[] = {
0c300adf 2872 "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 2873 NULL, /* container */
03cfe0d5
LP
2874 NULL, /* TERM */
2875 NULL, /* HOME */
2876 NULL, /* USER */
2877 NULL, /* LOGNAME */
2878 NULL, /* container_uuid */
2879 NULL, /* LISTEN_FDS */
2880 NULL, /* LISTEN_PID */
9c1e04d0 2881 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2882 NULL
2883 };
1a68e1e5 2884 const char *exec_target;
2371271c 2885 _cleanup_strv_free_ char **env_use = NULL;
de40a303 2886 int r, which_failed;
88213476 2887
b37469d7
LP
2888 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
2889 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
2890 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
2891 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
2892 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
2893 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
2894 * namespace.
2895 *
2896 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
2897 * unshare(). See below. */
2898
03cfe0d5
LP
2899 assert(barrier);
2900 assert(directory);
2901 assert(kmsg_socket >= 0);
88213476 2902
de40a303
LP
2903 log_debug("Inner child is initializing.");
2904
0de7acce 2905 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2906 /* Tell the parent, that it now can write the UID map. */
2907 (void) barrier_place(barrier); /* #1 */
7027ff61 2908
03cfe0d5 2909 /* Wait until the parent wrote the UID map */
baaa35ad
ZJS
2910 if (!barrier_place_and_sync(barrier)) /* #2 */
2911 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2912 "Parent died too early");
88213476
LP
2913 }
2914
6d66bd3b
EV
2915 r = reset_uid_gid();
2916 if (r < 0)
2917 return log_error_errno(r, "Couldn't become new root: %m");
2918
0de7acce 2919 r = mount_all(NULL,
4f086aab 2920 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 2921 arg_uid_shift,
0de7acce 2922 arg_selinux_apifs_context);
03cfe0d5
LP
2923 if (r < 0)
2924 return r;
2925
04413780
ZJS
2926 if (!arg_network_namespace_path && arg_private_network) {
2927 r = unshare(CLONE_NEWNET);
2928 if (r < 0)
2929 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
2930
2931 /* Tell the parent that it can setup network interfaces. */
2932 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
2933 }
2934
4f086aab 2935 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2936 if (r < 0)
2937 return r;
2938
03cfe0d5
LP
2939 /* Wait until we are cgroup-ified, so that we
2940 * can mount the right cgroup path writable */
baaa35ad
ZJS
2941 if (!barrier_place_and_sync(barrier)) /* #4 */
2942 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2943 "Parent died too early");
88213476 2944
489fae52 2945 if (arg_use_cgns) {
0996ef00
CB
2946 r = unshare(CLONE_NEWCGROUP);
2947 if (r < 0)
04413780 2948 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
2949 r = mount_cgroups(
2950 "",
2951 arg_unified_cgroup_hierarchy,
2952 arg_userns_mode != USER_NAMESPACE_NO,
2953 arg_uid_shift,
2954 arg_uid_range,
5a8ff0e6 2955 arg_selinux_apifs_context,
ada54120 2956 true);
0996ef00
CB
2957 if (r < 0)
2958 return r;
2959 } else {
2960 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2961 if (r < 0)
2962 return r;
2963 }
ec16945e 2964
1e4f1671 2965 r = setup_boot_id();
03cfe0d5
LP
2966 if (r < 0)
2967 return r;
ec16945e 2968
1e4f1671 2969 r = setup_kmsg(kmsg_socket);
03cfe0d5
LP
2970 if (r < 0)
2971 return r;
2972 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2973
de40a303
LP
2974 r = mount_custom(
2975 "/",
2976 arg_custom_mounts,
2977 arg_n_custom_mounts,
2978 false,
2979 0,
2980 0,
2981 arg_selinux_apifs_context,
2982 true);
2983 if (r < 0)
2984 return r;
2985
03cfe0d5
LP
2986 if (setsid() < 0)
2987 return log_error_errno(errno, "setsid() failed: %m");
2988
2989 if (arg_private_network)
2990 loopback_setup();
2991
7a8f6325
LP
2992 if (arg_expose_ports) {
2993 r = expose_port_send_rtnl(rtnl_socket);
2994 if (r < 0)
2995 return r;
2996 rtnl_socket = safe_close(rtnl_socket);
2997 }
03cfe0d5 2998
3acc84eb 2999 if (arg_console_mode != CONSOLE_PIPE) {
cd132992 3000 _cleanup_close_ int master = -1;
3acc84eb
FB
3001 _cleanup_free_ char *console = NULL;
3002
3003 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3004 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3005 if (master < 0)
dc98caea 3006 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3007
3008 r = setup_dev_console(console);
3009 if (r < 0)
3010 return log_error_errno(r, "Failed to setup /dev/console: %m");
3011
3012 r = send_one_fd(master_pty_socket, master, 0);
3013 if (r < 0)
3014 return log_error_errno(r, "Failed to send master fd: %m");
3015 master_pty_socket = safe_close(master_pty_socket);
3016
3017 r = setup_stdio_as_dev_console();
3018 if (r < 0)
3019 return r;
3020 }
3021
de40a303
LP
3022 r = patch_sysctl();
3023 if (r < 0)
3024 return r;
3025
81f345df
LP
3026 if (arg_oom_score_adjust_set) {
3027 r = set_oom_score_adjust(arg_oom_score_adjust);
3028 if (r < 0)
3029 return log_error_errno(r, "Failed to adjust OOM score: %m");
3030 }
3031
0985c7c4
ZJS
3032 if (arg_cpu_set.set)
3033 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3034 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3035
c818eef1 3036 (void) setup_hostname();
03cfe0d5 3037
050f7277 3038 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3039 r = safe_personality(arg_personality);
3040 if (r < 0)
3041 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 3042 } else if (secondary) {
21022b9d
LP
3043 r = safe_personality(PER_LINUX32);
3044 if (r < 0)
3045 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
3046 }
3047
de40a303
LP
3048 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3049 if (r < 0)
3050 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3051
3052#if HAVE_SECCOMP
3053 if (arg_seccomp) {
3054
3055 if (is_seccomp_available()) {
3056
3057 r = seccomp_load(arg_seccomp);
7bc5e0b1 3058 if (ERRNO_IS_SECCOMP_FATAL(r))
de40a303
LP
3059 return log_error_errno(r, "Failed to install seccomp filter: %m");
3060 if (r < 0)
3061 log_debug_errno(r, "Failed to install seccomp filter: %m");
3062 }
3063 } else
3064#endif
3065 {
3066 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
3067 if (r < 0)
3068 return r;
3069 }
3070
349cc4a5 3071#if HAVE_SELINUX
03cfe0d5 3072 if (arg_selinux_context)
2ed96880 3073 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3074 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3075#endif
3076
de40a303
LP
3077 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3078 * if we need to later on. */
3079 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3080 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3081
3082 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3083 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids);
3084 else
3085 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
3086 if (r < 0)
3087 return r;
3088
de40a303
LP
3089 r = drop_capabilities(getuid());
3090 if (r < 0)
3091 return log_error_errno(r, "Dropping capabilities failed: %m");
3092
66edd963
LP
3093 if (arg_no_new_privileges)
3094 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3095 return log_error_errno(errno, "Failed to disable new privileges: %m");
3096
6aadfa4c
ILG
3097 /* LXC sets container=lxc, so follow the scheme here */
3098 envp[n_env++] = strjoina("container=", arg_container_service_name);
3099
03cfe0d5
LP
3100 envp[n_env] = strv_find_prefix(environ, "TERM=");
3101 if (envp[n_env])
313cefa1 3102 n_env++;
03cfe0d5 3103
de40a303
LP
3104 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3105 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3106 return log_oom();
3107
3108 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3109 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
3110 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3111 return log_oom();
03cfe0d5 3112
3bbaff3e 3113 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3114
691675ba 3115 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 3116 return log_oom();
03cfe0d5
LP
3117
3118 if (fdset_size(fds) > 0) {
3119 r = fdset_cloexec(fds, false);
3120 if (r < 0)
3121 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3122
3123 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3124 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3125 return log_oom();
3126 }
9c1e04d0
AP
3127 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3128 return log_oom();
03cfe0d5 3129
2371271c
TG
3130 env_use = strv_env_merge(2, envp, arg_setenv);
3131 if (!env_use)
3132 return log_oom();
03cfe0d5
LP
3133
3134 /* Let the parent know that we are ready and
3135 * wait until the parent is ready with the
3136 * setup, too... */
baaa35ad
ZJS
3137 if (!barrier_place_and_sync(barrier)) /* #5 */
3138 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3139 "Parent died too early");
03cfe0d5 3140
5f932eb9
LP
3141 if (arg_chdir)
3142 if (chdir(arg_chdir) < 0)
3143 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3144
7732f92b 3145 if (arg_start_mode == START_PID2) {
75bf701f 3146 r = stub_pid1(arg_uuid);
7732f92b
LP
3147 if (r < 0)
3148 return r;
3149 }
3150
de40a303
LP
3151 log_debug("Inner child completed, invoking payload.");
3152
8ca082b4
LP
3153 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3154 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3155 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3156 log_close();
8ca082b4
LP
3157 log_set_open_when_needed(true);
3158
03cfe0d5
LP
3159 (void) fdset_close_others(fds);
3160
7732f92b 3161 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3162 char **a;
3163 size_t m;
3164
3165 /* Automatically search for the init system */
3166
75f32f04
ZJS
3167 m = strv_length(arg_parameters);
3168 a = newa(char*, m + 2);
3169 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3170 a[1 + m] = NULL;
03cfe0d5 3171
ced58da7 3172 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
3173 execve(a[0], a, env_use);
3174
ced58da7 3175 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
3176 execve(a[0], a, env_use);
3177
ced58da7 3178 a[0] = (char*) "/sbin/init";
03cfe0d5 3179 execve(a[0], a, env_use);
ced58da7
LP
3180
3181 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3182 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3183 const char *dollar_path;
3184
1a68e1e5 3185 exec_target = arg_parameters[0];
b6b180b7
LP
3186
3187 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3188 * binary. */
3189 dollar_path = strv_env_get(env_use, "PATH");
3190 if (dollar_path) {
3191 if (putenv((char*) dollar_path) != 0)
3192 return log_error_errno(errno, "Failed to update $PATH: %m");
3193 }
3194
f757855e 3195 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3196 } else {
5f932eb9 3197 if (!arg_chdir)
d929b0f9
ZJS
3198 /* If we cannot change the directory, we'll end up in /, that is expected. */
3199 (void) chdir(home ?: "/root");
5f932eb9 3200
03cfe0d5
LP
3201 execle("/bin/bash", "-bash", NULL, env_use);
3202 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
3203
3204 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
3205 }
3206
8ca082b4 3207 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3208}
3209
9c1e04d0 3210static int setup_sd_notify_child(void) {
271f518f 3211 _cleanup_close_ int fd = -1;
9c1e04d0 3212 union sockaddr_union sa = {
44ed5214
LP
3213 .un.sun_family = AF_UNIX,
3214 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3215 };
3216 int r;
3217
3218 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3219 if (fd < 0)
3220 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3221
3222 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3223 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3224
9c1e04d0 3225 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3226 if (r < 0)
44ed5214 3227 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3228
adc7d9f0 3229 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3230 if (r < 0)
adc7d9f0 3231 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3232
2ff48e98 3233 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3234 if (r < 0)
2ff48e98 3235 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3236
271f518f 3237 return TAKE_FD(fd);
9c1e04d0
AP
3238}
3239
03cfe0d5
LP
3240static int outer_child(
3241 Barrier *barrier,
3242 const char *directory,
2d845785 3243 DissectedImage *dissected_image,
03cfe0d5
LP
3244 bool secondary,
3245 int pid_socket,
e01ff70a 3246 int uuid_socket,
9c1e04d0 3247 int notify_socket,
03cfe0d5
LP
3248 int kmsg_socket,
3249 int rtnl_socket,
825d5287 3250 int uid_shift_socket,
3acc84eb 3251 int master_pty_socket,
8199d554 3252 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
3253 FDSet *fds,
3254 int netns_fd) {
03cfe0d5 3255
bf428efb 3256 _cleanup_close_ int fd = -1;
03cfe0d5
LP
3257 pid_t pid;
3258 ssize_t l;
de40a303 3259 int r;
03cfe0d5 3260
b37469d7
LP
3261 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
3262 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3263 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3264 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3265
03cfe0d5
LP
3266 assert(barrier);
3267 assert(directory);
03cfe0d5 3268 assert(pid_socket >= 0);
e01ff70a 3269 assert(uuid_socket >= 0);
9c1e04d0 3270 assert(notify_socket >= 0);
3acc84eb 3271 assert(master_pty_socket >= 0);
03cfe0d5
LP
3272 assert(kmsg_socket >= 0);
3273
de40a303
LP
3274 log_debug("Outer child is initializing.");
3275
03cfe0d5
LP
3276 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3277 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3278
03cfe0d5
LP
3279 r = reset_audit_loginuid();
3280 if (r < 0)
3281 return r;
3282
3283 /* Mark everything as slave, so that we still
3284 * receive mounts from the real root, but don't
3285 * propagate mounts to the real root. */
60e76d48
ZJS
3286 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3287 if (r < 0)
3288 return r;
03cfe0d5 3289
2d845785 3290 if (dissected_image) {
2d3a5a73
LP
3291 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3292 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3293 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3294 * makes sure ESP partitions and userns are compatible. */
3295
3296 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
03bcb6d4
LP
3297 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
3298 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)|
3299 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785
LP
3300 if (r < 0)
3301 return r;
3302 }
03cfe0d5 3303
391567f4
LP
3304 r = determine_uid_shift(directory);
3305 if (r < 0)
3306 return r;
3307
0de7acce 3308 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 3309 /* Let the parent know which UID shift we read from the image */
825d5287
RM
3310 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3311 if (l < 0)
3312 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3313 if (l != sizeof(arg_uid_shift))
3314 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3315 "Short write while sending UID shift.");
0e7ac751 3316
0de7acce 3317 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3318 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3319 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3320 * not it will pick a different one, and send it back to us. */
3321
3322 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3323 if (l < 0)
3324 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3325 if (l != sizeof(arg_uid_shift))
3326 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3327 "Short read while receiving UID shift.");
0e7ac751
LP
3328 }
3329
ff6c6cc1
LP
3330 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3331 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3332 }
3333
6f83d3d1
LP
3334 if (path_equal(directory, "/")) {
3335 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3336 * place, so that we can make changes to its mount structure (for example, to implement
3337 * --volatile=) without this interfering with our ability to access files such as
3338 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3339 * (instead of a temporary directory, since we are living in our own mount namespace here
3340 * already, and thus don't need to be afraid of colliding with anyone else's mounts).*/
3341 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3342
3343 r = mount_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3344 if (r < 0)
3345 return r;
3346
3347 directory = "/run/systemd/nspawn-root";
3348
3349 } else if (!dissected_image) {
3350 /* Turn directory into bind mount (we need that so that we can move the bind mount to root
3351 * later on). */
e50cd82f
LP
3352 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3353 if (r < 0)
3354 return r;
3355 }
7d0ecdd6
LP
3356
3357 r = setup_pivot_root(
3358 directory,
3359 arg_pivot_root_new,
3360 arg_pivot_root_old);
3361 if (r < 0)
3362 return r;
3363
3364 r = setup_volatile_mode(
3365 directory,
3366 arg_volatile_mode,
3367 arg_userns_mode != USER_NAMESPACE_NO,
3368 arg_uid_shift,
3369 arg_uid_range,
8f1ed04a 3370 arg_selinux_apifs_context);
7d0ecdd6
LP
3371 if (r < 0)
3372 return r;
3373
2d3a5a73
LP
3374 if (dissected_image) {
3375 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3376 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
3377 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
3378 if (r < 0)
3379 return r;
3380 }
3381
8199d554
LP
3382 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3383 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3384
3385 r = detect_unified_cgroup_hierarchy_from_image(directory);
3386 if (r < 0)
3387 return r;
3388
3389 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3390 if (l < 0)
3391 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3392 if (l != sizeof(arg_unified_cgroup_hierarchy))
3393 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3394 "Short write while sending cgroup mode.");
8199d554
LP
3395
3396 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3397 }
3398
4ad14eff
LP
3399 /* Mark everything as shared so our mounts get propagated down. This is
3400 * required to make new bind mounts available in systemd services
5238e957 3401 * inside the container that create a new mount namespace.
4ad14eff
LP
3402 * See https://github.com/systemd/systemd/issues/3860
3403 * Further submounts (such as /dev) done after this will inherit the
13e785f7 3404 * shared propagation mode. */
4ad14eff
LP
3405 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3406 if (r < 0)
3407 return r;
3408
3409 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3410 if (r < 0)
3411 return r;
3412
03cfe0d5
LP
3413 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3414 if (r < 0)
3415 return r;
3416
e5a4bb0d 3417 if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
64e82c19 3418 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3419 if (r < 0)
3420 return log_error_errno(r, "Failed to make tree read-only: %m");
3421 }
3422
0de7acce 3423 r = mount_all(directory,
4f086aab 3424 arg_mount_settings,
0de7acce 3425 arg_uid_shift,
0de7acce 3426 arg_selinux_apifs_context);
03cfe0d5
LP
3427 if (r < 0)
3428 return r;
3429
07fa00f9
LP
3430 r = copy_devnodes(directory);
3431 if (r < 0)
03cfe0d5
LP
3432 return r;
3433
de40a303
LP
3434 r = make_extra_nodes(directory);
3435 if (r < 0)
3436 return r;
3437
3438 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3439 (void) make_inaccessible_nodes(directory, arg_uid_shift, arg_uid_shift);
03cfe0d5 3440
07fa00f9
LP
3441 r = setup_pts(directory);
3442 if (r < 0)
03cfe0d5
LP
3443 return r;
3444
3445 r = setup_propagate(directory);
3446 if (r < 0)
3447 return r;
3448
8e5430c4
LP
3449 r = setup_keyring();
3450 if (r < 0)
3451 return r;
3452
03cfe0d5
LP
3453 r = setup_timezone(directory);
3454 if (r < 0)
3455 return r;
3456
3457 r = setup_resolv_conf(directory);
3458 if (r < 0)
3459 return r;
3460
e01ff70a
MS
3461 r = setup_machine_id(directory);
3462 if (r < 0)
3463 return r;
3464
03cfe0d5
LP
3465 r = setup_journal(directory);
3466 if (r < 0)
3467 return r;
3468
0de7acce
LP
3469 r = mount_custom(
3470 directory,
3471 arg_custom_mounts,
3472 arg_n_custom_mounts,
3473 arg_userns_mode != USER_NAMESPACE_NO,
3474 arg_uid_shift,
3475 arg_uid_range,
de40a303
LP
3476 arg_selinux_apifs_context,
3477 false);
03cfe0d5
LP
3478 if (r < 0)
3479 return r;
3480
489fae52 3481 if (!arg_use_cgns) {
0996ef00
CB
3482 r = mount_cgroups(
3483 directory,
3484 arg_unified_cgroup_hierarchy,
3485 arg_userns_mode != USER_NAMESPACE_NO,
3486 arg_uid_shift,
3487 arg_uid_range,
5a8ff0e6 3488 arg_selinux_apifs_context,
ada54120 3489 false);
0996ef00
CB
3490 if (r < 0)
3491 return r;
3492 }
03cfe0d5
LP
3493
3494 r = mount_move_root(directory);
3495 if (r < 0)
3496 return log_error_errno(r, "Failed to move root directory: %m");
3497
9c1e04d0
AP
3498 fd = setup_sd_notify_child();
3499 if (fd < 0)
3500 return fd;
3501
03cfe0d5 3502 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3503 arg_clone_ns_flags |
8869a0b4 3504 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3505 if (pid < 0)
3506 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3507 if (pid == 0) {
3508 pid_socket = safe_close(pid_socket);
e01ff70a 3509 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3510 notify_socket = safe_close(notify_socket);
825d5287 3511 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3512
3513 /* The inner child has all namespaces that are
3514 * requested, so that we all are owned by the user if
3515 * user namespaces are turned on. */
3516
d7bea6b6
DP
3517 if (arg_network_namespace_path) {
3518 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3519 if (r < 0)
e2d39e54 3520 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
3521 }
3522
3acc84eb 3523 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds);
03cfe0d5
LP
3524 if (r < 0)
3525 _exit(EXIT_FAILURE);
3526
3527 _exit(EXIT_SUCCESS);
3528 }
3529
3530 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3531 if (l < 0)
3532 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
3533 if (l != sizeof(pid))
3534 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3535 "Short write while sending PID.");
03cfe0d5 3536
e01ff70a
MS
3537 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3538 if (l < 0)
3539 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
3540 if (l != sizeof(arg_uuid))
3541 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3542 "Short write while sending machine ID.");
e01ff70a 3543
9c1e04d0
AP
3544 l = send_one_fd(notify_socket, fd, 0);
3545 if (l < 0)
ba72801d 3546 return log_error_errno(l, "Failed to send notify fd: %m");
9c1e04d0 3547
03cfe0d5 3548 pid_socket = safe_close(pid_socket);
e01ff70a 3549 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3550 notify_socket = safe_close(notify_socket);
3acc84eb 3551 master_pty_socket = safe_close(master_pty_socket);
327e26d6
KN
3552 kmsg_socket = safe_close(kmsg_socket);
3553 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3554 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3555
3556 return 0;
3557}
3558
0e7ac751 3559static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 3560 bool tried_hashed = false;
0e7ac751
LP
3561 unsigned n_tries = 100;
3562 uid_t candidate;
3563 int r;
3564
3565 assert(shift);
3566 assert(ret_lock_file);
0de7acce 3567 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3568 assert(arg_uid_range == 0x10000U);
3569
3570 candidate = *shift;
3571
3572 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3573
3574 for (;;) {
fbd0b64f 3575 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 3576 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
3577
3578 if (--n_tries <= 0)
3579 return -EBUSY;
3580
87d5e4f2 3581 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
3582 goto next;
3583 if ((candidate & UINT32_C(0xFFFF)) != 0)
3584 goto next;
3585
3586 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3587 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3588 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3589 goto next;
3590 if (r < 0)
3591 return r;
3592
3593 /* Make some superficial checks whether the range is currently known in the user database */
3594 if (getpwuid(candidate))
3595 goto next;
3596 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3597 goto next;
3598 if (getgrgid(candidate))
3599 goto next;
3600 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3601 goto next;
3602
3603 *ret_lock_file = lf;
3604 lf = (struct LockFile) LOCK_FILE_INIT;
3605 *shift = candidate;
3606 return 0;
3607
3608 next:
d381c8a6
LP
3609 if (arg_machine && !tried_hashed) {
3610 /* Try to hash the base from the container name */
3611
3612 static const uint8_t hash_key[] = {
3613 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3614 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3615 };
3616
3617 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3618
3619 tried_hashed = true;
3620 } else
3621 random_bytes(&candidate, sizeof(candidate));
3622
87d5e4f2 3623 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
3624 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3625 }
3626}
3627
03cfe0d5 3628static int setup_uid_map(pid_t pid) {
fbd0b64f 3629 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
03cfe0d5
LP
3630 int r;
3631
3632 assert(pid > 1);
3633
3634 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3635 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
57512c89 3636 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3637 if (r < 0)
3638 return log_error_errno(r, "Failed to write UID map: %m");
3639
3640 /* We always assign the same UID and GID ranges */
3641 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
57512c89 3642 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3643 if (r < 0)
3644 return log_error_errno(r, "Failed to write GID map: %m");
3645
3646 return 0;
3647}
3648
9c1e04d0 3649static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3650 char buf[NOTIFY_BUFFER_MAX+1];
3651 char *p = NULL;
3652 struct iovec iovec = {
3653 .iov_base = buf,
3654 .iov_len = sizeof(buf)-1,
3655 };
3656 union {
3657 struct cmsghdr cmsghdr;
3658 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3659 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3660 } control = {};
3661 struct msghdr msghdr = {
3662 .msg_iov = &iovec,
3663 .msg_iovlen = 1,
3664 .msg_control = &control,
3665 .msg_controllen = sizeof(control),
3666 };
3667 struct cmsghdr *cmsg;
3668 struct ucred *ucred = NULL;
3669 ssize_t n;
3670 pid_t inner_child_pid;
3671 _cleanup_strv_free_ char **tags = NULL;
3672
3673 assert(userdata);
3674
3675 inner_child_pid = PTR_TO_PID(userdata);
3676
3677 if (revents != EPOLLIN) {
3678 log_warning("Got unexpected poll event for notify fd.");
3679 return 0;
3680 }
3681
3682 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3683 if (n < 0) {
3742095b 3684 if (IN_SET(errno, EAGAIN, EINTR))
9c1e04d0
AP
3685 return 0;
3686
3687 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3688 }
3689 cmsg_close_all(&msghdr);
3690
3691 CMSG_FOREACH(cmsg, &msghdr) {
3692 if (cmsg->cmsg_level == SOL_SOCKET &&
3693 cmsg->cmsg_type == SCM_CREDENTIALS &&
3694 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3695
3696 ucred = (struct ucred*) CMSG_DATA(cmsg);
3697 }
3698 }
3699
3700 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 3701 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
3702 return 0;
3703 }
3704
3705 if ((size_t) n >= sizeof(buf)) {
3706 log_warning("Received notify message exceeded maximum size. Ignoring.");
3707 return 0;
3708 }
3709
3710 buf[n] = 0;
3711 tags = strv_split(buf, "\n\r");
3712 if (!tags)
3713 return log_oom();
3714
3715 if (strv_find(tags, "READY=1"))
04f590a4 3716 (void) sd_notifyf(false, "READY=1\n");
9c1e04d0
AP
3717
3718 p = strv_find_startswith(tags, "STATUS=");
3719 if (p)
04f590a4 3720 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
3721
3722 return 0;
3723}
3724
5773024d 3725static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 3726 int r;
9c1e04d0 3727
5773024d 3728 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
3729 if (r < 0)
3730 return log_error_errno(r, "Failed to allocate notify event source: %m");
3731
5773024d 3732 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
3733
3734 return 0;
3735}
3736
5d961407
LP
3737static int merge_settings(Settings *settings, const char *path) {
3738 int rl;
f757855e 3739
5d961407
LP
3740 assert(settings);
3741 assert(path);
f757855e 3742
5d961407
LP
3743 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3744 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 3745
7732f92b
LP
3746 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3747 settings->start_mode >= 0) {
3748 arg_start_mode = settings->start_mode;
130d3d22 3749 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
3750 }
3751
a2f577fc
JL
3752 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
3753 arg_ephemeral = settings->ephemeral;
3754
de40a303
LP
3755 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
3756 settings->root) {
3757
3758 if (!arg_settings_trusted)
3759 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
3760 else
3761 free_and_replace(arg_directory, settings->root);
3762 }
3763
b53ede69
PW
3764 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3765 settings->pivot_root_new) {
3766 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3767 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3768 }
3769
5f932eb9 3770 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
3771 settings->working_directory)
3772 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 3773
f757855e 3774 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
3775 settings->environment)
3776 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 3777
de40a303
LP
3778 if ((arg_settings_mask & SETTING_USER) == 0) {
3779
3780 if (settings->user)
3781 free_and_replace(arg_user, settings->user);
3782
3783 if (uid_is_valid(settings->uid))
3784 arg_uid = settings->uid;
3785 if (gid_is_valid(settings->gid))
3786 arg_gid = settings->gid;
3787 if (settings->n_supplementary_gids > 0) {
3788 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
3789 arg_n_supplementary_gids = settings->n_supplementary_gids;
3790 }
3791 }
f757855e
LP
3792
3793 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 3794 uint64_t plus, minus;
7be830c6 3795 uint64_t network_minus = 0;
f757855e 3796
de40a303
LP
3797 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
3798 * Settings structure */
3799
0e265674 3800 plus = settings->capability;
a3fc6b55
LP
3801 minus = settings->drop_capability;
3802
3803 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
3804 if (settings_private_network(settings))
3805 plus |= UINT64_C(1) << CAP_NET_ADMIN;
3806 else
7be830c6 3807 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 3808 }
0e265674
LP
3809
3810 if (!arg_settings_trusted && plus != 0) {
3811 if (settings->capability != 0)
5d961407 3812 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
3813 } else {
3814 arg_caps_retain &= ~network_minus;
520e0d54 3815 arg_caps_retain |= plus;
7be830c6 3816 }
f757855e 3817
a3fc6b55 3818 arg_caps_retain &= ~minus;
de40a303
LP
3819
3820 /* Copy the full capabilities over too */
3821 if (capability_quintet_is_set(&settings->full_capabilities)) {
3822 if (!arg_settings_trusted)
5238e957 3823 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
3824 else
3825 arg_full_capabilities = settings->full_capabilities;
3826 }
f757855e
LP
3827 }
3828
3829 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3830 settings->kill_signal > 0)
3831 arg_kill_signal = settings->kill_signal;
3832
3833 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3834 settings->personality != PERSONALITY_INVALID)
3835 arg_personality = settings->personality;
3836
3837 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3838 !sd_id128_is_null(settings->machine_id)) {
3839
3840 if (!arg_settings_trusted)
5d961407 3841 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
3842 else
3843 arg_uuid = settings->machine_id;
3844 }
3845
3846 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3847 settings->read_only >= 0)
3848 arg_read_only = settings->read_only;
3849
3850 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3851 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3852 arg_volatile_mode = settings->volatile_mode;
3853
3854 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3855 settings->n_custom_mounts > 0) {
3856
3857 if (!arg_settings_trusted)
5d961407 3858 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
3859 else {
3860 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 3861 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 3862 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
3863 settings->n_custom_mounts = 0;
3864 }
3865 }
3866
3867 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3868 (settings->private_network >= 0 ||
3869 settings->network_veth >= 0 ||
3870 settings->network_bridge ||
22b28dfd 3871 settings->network_zone ||
f757855e
LP
3872 settings->network_interfaces ||
3873 settings->network_macvlan ||
f6d6bad1 3874 settings->network_ipvlan ||
de40a303
LP
3875 settings->network_veth_extra ||
3876 settings->network_namespace_path)) {
f757855e
LP
3877
3878 if (!arg_settings_trusted)
5d961407 3879 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 3880 else {
f6d6bad1 3881 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3882 arg_private_network = settings_private_network(settings);
3883
130d3d22
YW
3884 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3885 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3886 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3887 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 3888
1cc6c93a
YW
3889 free_and_replace(arg_network_bridge, settings->network_bridge);
3890 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
3891
3892 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
3893 }
3894 }
3895
3896 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3897 settings->expose_ports) {
3898
3899 if (!arg_settings_trusted)
5d961407 3900 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
3901 else {
3902 expose_port_free_all(arg_expose_ports);
1cc6c93a 3903 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
3904 }
3905 }
3906
0de7acce
LP
3907 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3908 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3909
3910 if (!arg_settings_trusted)
5d961407 3911 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
3912 else {
3913 arg_userns_mode = settings->userns_mode;
3914 arg_uid_shift = settings->uid_shift;
3915 arg_uid_range = settings->uid_range;
3916 arg_userns_chown = settings->userns_chown;
3917 }
3918 }
3919
9c1e04d0
AP
3920 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3921 arg_notify_ready = settings->notify_ready;
3922
960e4569
LP
3923 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3924
de40a303 3925 if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist))
5d961407 3926 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 3927 else {
130d3d22
YW
3928 strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3929 strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
960e4569 3930 }
de40a303
LP
3931
3932#if HAVE_SECCOMP
3933 if (!arg_settings_trusted && settings->seccomp)
3934 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
3935 else {
3936 seccomp_release(arg_seccomp);
3937 arg_seccomp = TAKE_PTR(settings->seccomp);
3938 }
3939#endif
960e4569
LP
3940 }
3941
bf428efb
LP
3942 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3943 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3944 continue;
3945
3946 if (!settings->rlimit[rl])
3947 continue;
3948
3949 if (!arg_settings_trusted) {
5d961407 3950 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
3951 continue;
3952 }
3953
3954 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3955 }
3956
3a9530e5
LP
3957 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3958 settings->hostname)
3959 free_and_replace(arg_hostname, settings->hostname);
3960
66edd963
LP
3961 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3962 settings->no_new_privileges >= 0)
3963 arg_no_new_privileges = settings->no_new_privileges;
3964
81f345df
LP
3965 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3966 settings->oom_score_adjust_set) {
3967
3968 if (!arg_settings_trusted)
5d961407 3969 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
3970 else {
3971 arg_oom_score_adjust = settings->oom_score_adjust;
3972 arg_oom_score_adjust_set = true;
3973 }
3974 }
3975
d107bb7d 3976 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 3977 settings->cpu_set.set) {
d107bb7d
LP
3978
3979 if (!arg_settings_trusted)
5d961407 3980 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 3981 else {
0985c7c4
ZJS
3982 cpu_set_reset(&arg_cpu_set);
3983 arg_cpu_set = settings->cpu_set;
3984 settings->cpu_set = (CPUSet) {};
d107bb7d
LP
3985 }
3986 }
3987
09d423e9
LP
3988 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
3989 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
3990 arg_resolv_conf = settings->resolv_conf;
3991
4e1d6aa9
LP
3992 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
3993 settings->link_journal != _LINK_JOURNAL_INVALID) {
3994
3995 if (!arg_settings_trusted)
3996 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
3997 else {
3998 arg_link_journal = settings->link_journal;
3999 arg_link_journal_try = settings->link_journal_try;
4000 }
4001 }
4002
1688841f
LP
4003 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4004 settings->timezone != _TIMEZONE_MODE_INVALID)
4005 arg_timezone = settings->timezone;
4006
de40a303
LP
4007 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4008 settings->slice) {
4009
4010 if (!arg_settings_trusted)
4011 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4012 else
4013 free_and_replace(arg_slice, settings->slice);
4014 }
4015
4016 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4017 settings->use_cgns >= 0) {
4018
4019 if (!arg_settings_trusted)
4020 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4021 else
4022 arg_use_cgns = settings->use_cgns;
4023 }
4024
4025 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4026 settings->clone_ns_flags != (unsigned long) -1) {
4027
4028 if (!arg_settings_trusted)
4029 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4030 else
4031 arg_clone_ns_flags = settings->clone_ns_flags;
4032 }
4033
4034 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4035 settings->console_mode >= 0) {
4036
4037 if (!arg_settings_trusted)
4038 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4039 else
4040 arg_console_mode = settings->console_mode;
4041 }
4042
4043 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4044 * don't consult arg_settings_mask for them. */
4045
4046 sd_bus_message_unref(arg_property_message);
4047 arg_property_message = TAKE_PTR(settings->properties);
4048
4049 arg_console_width = settings->console_width;
4050 arg_console_height = settings->console_height;
4051
b2645747 4052 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4053 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4054 arg_n_extra_nodes = settings->n_extra_nodes;
4055
f757855e
LP
4056 return 0;
4057}
4058
5d961407
LP
4059static int load_settings(void) {
4060 _cleanup_(settings_freep) Settings *settings = NULL;
4061 _cleanup_fclose_ FILE *f = NULL;
4062 _cleanup_free_ char *p = NULL;
4063 const char *fn, *i;
4064 int r;
4065
de40a303
LP
4066 if (arg_oci_bundle)
4067 return 0;
4068
5d961407
LP
4069 /* If all settings are masked, there's no point in looking for
4070 * the settings file */
4071 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
4072 return 0;
4073
4074 fn = strjoina(arg_machine, ".nspawn");
4075
4076 /* We first look in the admin's directories in /etc and /run */
4077 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4078 _cleanup_free_ char *j = NULL;
4079
657ee2d8 4080 j = path_join(i, fn);
5d961407
LP
4081 if (!j)
4082 return log_oom();
4083
4084 f = fopen(j, "re");
4085 if (f) {
4086 p = TAKE_PTR(j);
4087
4088 /* By default, we trust configuration from /etc and /run */
4089 if (arg_settings_trusted < 0)
4090 arg_settings_trusted = true;
4091
4092 break;
4093 }
4094
4095 if (errno != ENOENT)
4096 return log_error_errno(errno, "Failed to open %s: %m", j);
4097 }
4098
4099 if (!f) {
4100 /* After that, let's look for a file next to the
4101 * actual image we shall boot. */
4102
4103 if (arg_image) {
4104 p = file_in_same_dir(arg_image, fn);
4105 if (!p)
4106 return log_oom();
cd6e3914 4107 } else if (arg_directory && !path_equal(arg_directory, "/")) {
5d961407
LP
4108 p = file_in_same_dir(arg_directory, fn);
4109 if (!p)
4110 return log_oom();
4111 }
4112
4113 if (p) {
4114 f = fopen(p, "re");
4115 if (!f && errno != ENOENT)
4116 return log_error_errno(errno, "Failed to open %s: %m", p);
4117
4118 /* By default, we do not trust configuration from /var/lib/machines */
4119 if (arg_settings_trusted < 0)
4120 arg_settings_trusted = false;
4121 }
4122 }
4123
4124 if (!f)
4125 return 0;
4126
4127 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4128
4129 r = settings_load(f, p, &settings);
4130 if (r < 0)
4131 return r;
4132
4133 return merge_settings(settings, p);
4134}
4135
de40a303
LP
4136static int load_oci_bundle(void) {
4137 _cleanup_(settings_freep) Settings *settings = NULL;
4138 int r;
4139
4140 if (!arg_oci_bundle)
4141 return 0;
4142
4143 /* By default let's trust OCI bundles */
4144 if (arg_settings_trusted < 0)
4145 arg_settings_trusted = true;
4146
4147 r = oci_load(NULL, arg_oci_bundle, &settings);
4148 if (r < 0)
4149 return r;
4150
4151 return merge_settings(settings, arg_oci_bundle);
4152}
4153
3acc84eb 4154static int run_container(
2d845785 4155 DissectedImage *dissected_image,
b0067625
ZJS
4156 bool secondary,
4157 FDSet *fds,
4158 char veth_name[IFNAMSIZ], bool *veth_created,
4159 union in_addr_union *exposed,
3acc84eb 4160 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4161
4162 static const struct sigaction sa = {
4163 .sa_handler = nop_signal_handler,
e28c7cd0 4164 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4165 };
4166
8e766630 4167 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
4168 _cleanup_close_ int etc_passwd_lock = -1;
4169 _cleanup_close_pair_ int
4170 kmsg_socket_pair[2] = { -1, -1 },
4171 rtnl_socket_pair[2] = { -1, -1 },
4172 pid_socket_pair[2] = { -1, -1 },
4173 uuid_socket_pair[2] = { -1, -1 },
4174 notify_socket_pair[2] = { -1, -1 },
8199d554 4175 uid_shift_socket_pair[2] = { -1, -1 },
3acc84eb 4176 master_pty_socket_pair[2] = { -1, -1 },
8199d554
LP
4177 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4178
3acc84eb 4179 _cleanup_close_ int notify_socket = -1;
b0067625 4180 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4181 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4182 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4183 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4184 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4185 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
b0067625 4186 ContainerStatus container_status = 0;
b0067625
ZJS
4187 int ifi = 0, r;
4188 ssize_t l;
4189 sigset_t mask_chld;
d7bea6b6 4190 _cleanup_close_ int netns_fd = -1;
b0067625
ZJS
4191
4192 assert_se(sigemptyset(&mask_chld) == 0);
4193 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4194
4195 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4196 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4197 * check with getpwuid() if the specific user already exists. Note that /etc might be
4198 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4199 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4200 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4201 * really ours. */
4202
4203 etc_passwd_lock = take_etc_passwd_lock(NULL);
4204 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4205 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4206 }
4207
4208 r = barrier_create(&barrier);
4209 if (r < 0)
4210 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4211
4212 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4213 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4214
4215 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4216 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4217
4218 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4219 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4220
4221 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4222 return log_error_errno(errno, "Failed to create id socket pair: %m");
4223
4224 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4225 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4226
3acc84eb
FB
4227 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4228 return log_error_errno(errno, "Failed to create console socket pair: %m");
4229
b0067625
ZJS
4230 if (arg_userns_mode != USER_NAMESPACE_NO)
4231 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4232 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4233
8199d554
LP
4234 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4235 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4236 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4237
b0067625
ZJS
4238 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4239 * parent's blocking calls and give it a chance to call wait() and terminate. */
4240 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4241 if (r < 0)
4242 return log_error_errno(errno, "Failed to change the signal mask: %m");
4243
4244 r = sigaction(SIGCHLD, &sa, NULL);
4245 if (r < 0)
4246 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4247
d7bea6b6
DP
4248 if (arg_network_namespace_path) {
4249 netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4250 if (netns_fd < 0)
4251 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4252
4253 r = fd_is_network_ns(netns_fd);
6619ad88
LP
4254 if (r == -EUCLEAN)
4255 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4256 else if (r < 0)
d7bea6b6 4257 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4258 else if (r == 0)
4259 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4260 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4261 }
4262
b0067625
ZJS
4263 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4264 if (*pid < 0)
4265 return log_error_errno(errno, "clone() failed%s: %m",
4266 errno == EINVAL ?
4267 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4268
4269 if (*pid == 0) {
4270 /* The outer child only has a file system namespace. */
4271 barrier_set_role(&barrier, BARRIER_CHILD);
4272
b0067625
ZJS
4273 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4274 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4275 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4276 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4277 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3acc84eb 4278 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
b0067625 4279 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 4280 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
4281
4282 (void) reset_all_signal_handlers();
4283 (void) reset_signal_mask();
4284
4285 r = outer_child(&barrier,
4286 arg_directory,
2d845785 4287 dissected_image,
b0067625
ZJS
4288 secondary,
4289 pid_socket_pair[1],
4290 uuid_socket_pair[1],
4291 notify_socket_pair[1],
4292 kmsg_socket_pair[1],
4293 rtnl_socket_pair[1],
4294 uid_shift_socket_pair[1],
3acc84eb 4295 master_pty_socket_pair[1],
8199d554 4296 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6
DP
4297 fds,
4298 netns_fd);
b0067625
ZJS
4299 if (r < 0)
4300 _exit(EXIT_FAILURE);
4301
4302 _exit(EXIT_SUCCESS);
4303 }
4304
4305 barrier_set_role(&barrier, BARRIER_PARENT);
4306
e4077ff6 4307 fdset_close(fds);
b0067625
ZJS
4308
4309 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4310 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4311 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4312 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4313 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3acc84eb 4314 master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
b0067625 4315 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 4316 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
4317
4318 if (arg_userns_mode != USER_NAMESPACE_NO) {
4319 /* The child just let us know the UID shift it might have read from the image. */
4320 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4321 if (l < 0)
4322 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4323 if (l != sizeof arg_uid_shift)
4324 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4325
4326 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4327 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4328 * image, but if that's already in use, pick a new one, and report back to the child,
4329 * which one we now picked. */
4330
4331 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4332 if (r < 0)
4333 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4334
4335 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4336 if (l < 0)
4337 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4338 if (l != sizeof arg_uid_shift)
4339 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625
ZJS
4340 }
4341 }
4342
8199d554
LP
4343 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4344 /* The child let us know the support cgroup mode it might have read from the image. */
4345 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4346 if (l < 0)
4347 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113
LP
4348 if (l != sizeof(arg_unified_cgroup_hierarchy))
4349 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4350 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4351 }
4352
b0067625 4353 /* Wait for the outer child. */
d2e0ac3d
LP
4354 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4355 if (r < 0)
4356 return r;
4357 if (r != EXIT_SUCCESS)
4358 return -EIO;
b0067625
ZJS
4359
4360 /* And now retrieve the PID of the inner child. */
4361 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4362 if (l < 0)
4363 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4364 if (l != sizeof *pid)
4365 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4366
4367 /* We also retrieve container UUID in case it was generated by outer child */
4368 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4369 if (l < 0)
4370 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4371 if (l != sizeof(arg_uuid))
4372 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4373
4374 /* We also retrieve the socket used for notifications generated by outer child */
4375 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4376 if (notify_socket < 0)
4377 return log_error_errno(notify_socket,
4378 "Failed to receive notification socket from the outer child: %m");
4379
4380 log_debug("Init process invoked as PID "PID_FMT, *pid);
4381
4382 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4383 if (!barrier_place_and_sync(&barrier)) /* #1 */
4384 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625
ZJS
4385
4386 r = setup_uid_map(*pid);
4387 if (r < 0)
4388 return r;
4389
4390 (void) barrier_place(&barrier); /* #2 */
4391 }
4392
4393 if (arg_private_network) {
75116558
PS
4394 if (!arg_network_namespace_path) {
4395 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4396 if (!barrier_place_and_sync(&barrier)) /* #3 */
4397 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4398 }
4399
b0067625
ZJS
4400 r = move_network_interfaces(*pid, arg_network_interfaces);
4401 if (r < 0)
4402 return r;
4403
4404 if (arg_network_veth) {
4405 r = setup_veth(arg_machine, *pid, veth_name,
4406 arg_network_bridge || arg_network_zone);
4407 if (r < 0)
4408 return r;
4409 else if (r > 0)
4410 ifi = r;
4411
4412 if (arg_network_bridge) {
4413 /* Add the interface to a bridge */
4414 r = setup_bridge(veth_name, arg_network_bridge, false);
4415 if (r < 0)
4416 return r;
4417 if (r > 0)
4418 ifi = r;
4419 } else if (arg_network_zone) {
4420 /* Add the interface to a bridge, possibly creating it */
4421 r = setup_bridge(veth_name, arg_network_zone, true);
4422 if (r < 0)
4423 return r;
4424 if (r > 0)
4425 ifi = r;
4426 }
4427 }
4428
4429 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4430 if (r < 0)
4431 return r;
4432
4433 /* We created the primary and extra veth links now; let's remember this, so that we know to
4434 remove them later on. Note that we don't bother with removing veth links that were created
4435 here when their setup failed half-way, because in that case the kernel should be able to
4436 remove them on its own, since they cannot be referenced by anything yet. */
4437 *veth_created = true;
4438
4439 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4440 if (r < 0)
4441 return r;
4442
4443 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4444 if (r < 0)
4445 return r;
4446 }
4447
abdb9b08
LP
4448 if (arg_register || !arg_keep_unit) {
4449 r = sd_bus_default_system(&bus);
4450 if (r < 0)
4451 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
4452
4453 r = sd_bus_set_close_on_exit(bus, false);
4454 if (r < 0)
4455 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
4456 }
4457
4458 if (!arg_keep_unit) {
4459 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4460 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4461 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4462
75152a4d
LP
4463 r = sd_bus_match_signal_async(
4464 bus,
4465 NULL,
4466 "org.freedesktop.systemd1",
4467 NULL,
4468 "org.freedesktop.systemd1.Scope",
4469 "RequestStop",
4470 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 4471 if (r < 0)
75152a4d 4472 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
4473 }
4474
b0067625
ZJS
4475 if (arg_register) {
4476 r = register_machine(
abdb9b08 4477 bus,
b0067625
ZJS
4478 arg_machine,
4479 *pid,
4480 arg_directory,
4481 arg_uuid,
4482 ifi,
4483 arg_slice,
4484 arg_custom_mounts, arg_n_custom_mounts,
4485 arg_kill_signal,
4486 arg_property,
de40a303 4487 arg_property_message,
b0067625
ZJS
4488 arg_keep_unit,
4489 arg_container_service_name);
4490 if (r < 0)
4491 return r;
abdb9b08 4492
cd2dfc6f
LP
4493 } else if (!arg_keep_unit) {
4494 r = allocate_scope(
abdb9b08 4495 bus,
cd2dfc6f
LP
4496 arg_machine,
4497 *pid,
4498 arg_slice,
4499 arg_custom_mounts, arg_n_custom_mounts,
4500 arg_kill_signal,
de40a303
LP
4501 arg_property,
4502 arg_property_message);
cd2dfc6f
LP
4503 if (r < 0)
4504 return r;
4505
4506 } else if (arg_slice || arg_property)
4507 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 4508
27da7ef0 4509 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
4510 if (r < 0)
4511 return r;
4512
27da7ef0 4513 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
4514 if (r < 0)
4515 return r;
b0067625 4516
de54e02d 4517 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
4518 if (r < 0)
4519 return r;
4520
4521 /* Notify the child that the parent is ready with all
4522 * its setup (including cgroup-ification), and that
4523 * the child can now hand over control to the code to
4524 * run inside the container. */
75116558 4525 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
4526
4527 /* Block SIGCHLD here, before notifying child.
4528 * process_pty() will handle it with the other signals. */
4529 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4530
4531 /* Reset signal to default */
4532 r = default_signals(SIGCHLD, -1);
4533 if (r < 0)
4534 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4535
4536 r = sd_event_new(&event);
4537 if (r < 0)
4538 return log_error_errno(r, "Failed to get default event source: %m");
4539
8fd010bb
LP
4540 (void) sd_event_set_watchdog(event, true);
4541
abdb9b08
LP
4542 if (bus) {
4543 r = sd_bus_attach_event(bus, event, 0);
4544 if (r < 0)
4545 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4546 }
4547
5773024d 4548 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
4549 if (r < 0)
4550 return r;
4551
4552 /* Let the child know that we are ready and wait that the child is completely ready now. */
c6147113
LP
4553 if (!barrier_place_and_sync(&barrier)) /* #5 */
4554 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625
ZJS
4555
4556 /* At this point we have made use of the UID we picked, and thus nss-mymachines
4557 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4558 etc_passwd_lock = safe_close(etc_passwd_lock);
4559
04f590a4
LP
4560 (void) sd_notifyf(false,
4561 "STATUS=Container running.\n"
4562 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
b0067625 4563 if (!arg_notify_ready)
919f5ae0 4564 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
4565
4566 if (arg_kill_signal > 0) {
4567 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
4568 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4569 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
4570 } else {
4571 /* Immediately exit */
919f5ae0
LP
4572 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4573 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
4574 }
4575
6916b164 4576 /* Exit when the child exits */
919f5ae0 4577 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
4578
4579 if (arg_expose_ports) {
4580 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4581 if (r < 0)
4582 return r;
4583
4584 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4585 }
4586
4587 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4588
3acc84eb
FB
4589 if (arg_console_mode != CONSOLE_PIPE) {
4590 _cleanup_close_ int fd = -1;
4591 PTYForwardFlags flags = 0;
de40a303 4592
3acc84eb
FB
4593 /* Retrieve the master pty allocated by inner child */
4594 fd = receive_one_fd(master_pty_socket_pair[0], 0);
4595 if (fd < 0)
4596 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
4597
4598 switch (arg_console_mode) {
de40a303 4599
3acc84eb
FB
4600 case CONSOLE_READ_ONLY:
4601 flags |= PTY_FORWARD_READ_ONLY;
4602
4603 _fallthrough_;
4604
4605 case CONSOLE_INTERACTIVE:
4606 flags |= PTY_FORWARD_IGNORE_VHANGUP;
4607
4608 r = pty_forward_new(event, fd, flags, &forward);
4609 if (r < 0)
4610 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4611
4612 if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
4613 (void) pty_forward_set_width_height(forward,
4614 arg_console_width,
4615 arg_console_height);
4616 break;
4617
4618 default:
4619 assert(arg_console_mode == CONSOLE_PASSIVE);
4620 }
4621
4622 *master = TAKE_FD(fd);
de40a303 4623 }
b0067625
ZJS
4624
4625 r = sd_event_loop(event);
4626 if (r < 0)
4627 return log_error_errno(r, "Failed to run event loop: %m");
4628
de40a303
LP
4629 if (forward) {
4630 char last_char = 0;
b0067625 4631
de40a303
LP
4632 (void) pty_forward_get_last_char(forward, &last_char);
4633 forward = pty_forward_free(forward);
b0067625 4634
de40a303
LP
4635 if (!arg_quiet && last_char != '\n')
4636 putc('\n', stdout);
4637 }
b0067625
ZJS
4638
4639 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
4640 if (!arg_register && !arg_keep_unit && bus)
4641 terminate_scope(bus, arg_machine);
b0067625
ZJS
4642
4643 /* Normally redundant, but better safe than sorry */
c67b0082 4644 (void) kill(*pid, SIGKILL);
b0067625
ZJS
4645
4646 r = wait_for_container(*pid, &container_status);
4647 *pid = 0;
4648
0bb0a9fa
ZJS
4649 /* Tell machined that we are gone. */
4650 if (bus)
4651 (void) unregister_machine(bus, arg_machine);
4652
b0067625
ZJS
4653 if (r < 0)
4654 /* We failed to wait for the container, or the container exited abnormally. */
4655 return r;
4656 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
4657 /* r > 0 → The container exited with a non-zero status.
4658 * As a special case, we need to replace 133 with a different value,
4659 * because 133 is special-cased in the service file to reboot the container.
4660 * otherwise → The container exited with zero status and a reboot was not requested.
4661 */
2a49b612 4662 if (r == EXIT_FORCE_RESTART)
27e29a1e 4663 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 4664 *ret = r;
b0067625
ZJS
4665 return 0; /* finito */
4666 }
4667
4668 /* CONTAINER_REBOOTED, loop again */
4669
4670 if (arg_keep_unit) {
4671 /* Special handling if we are running as a service: instead of simply
4672 * restarting the machine we want to restart the entire service, so let's
4673 * inform systemd about this with the special exit code 133. The service
4674 * file uses RestartForceExitStatus=133 so that this results in a full
4675 * nspawn restart. This is necessary since we might have cgroup parameters
4676 * set we want to have flushed out. */
2a49b612
ZJS
4677 *ret = EXIT_FORCE_RESTART;
4678 return 0; /* finito */
b0067625
ZJS
4679 }
4680
4681 expose_port_flush(arg_expose_ports, exposed);
4682
4683 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4684 *veth_created = false;
4685 return 1; /* loop again */
4686}
4687
bf428efb 4688static int initialize_rlimits(void) {
bf428efb
LP
4689 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4690 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4691 * container execution environments. */
4692
4693 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4694 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4695 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4696 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4697 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4698 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4699 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4700 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4701 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4702 [RLIMIT_NICE] = { 0, 0 },
4703 [RLIMIT_NOFILE] = { 1024, 4096 },
4704 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4705 [RLIMIT_RTPRIO] = { 0, 0 },
4706 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4707 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4708
4709 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4710 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
4711 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
4712 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
4713 * that PID 1 changes a number of other resource limits during early initialization which is why we
4714 * don't read the other limits from PID 1 but prefer the static table above. */
4715 };
4716
4717 int rl;
4718
4719 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
4720 /* Let's only fill in what the user hasn't explicitly configured anyway */
4721 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4722 const struct rlimit *v;
4723 struct rlimit buffer;
4724
4725 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4726 /* For these two let's read the limits off PID 1. See above for an explanation. */
4727
4728 if (prlimit(1, rl, NULL, &buffer) < 0)
4729 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4730
4731 v = &buffer;
4732 } else
4733 v = kernel_defaults + rl;
4734
4735 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4736 if (!arg_rlimit[rl])
4737 return log_oom();
4738 }
4739
4740 if (DEBUG_LOGGING) {
4741 _cleanup_free_ char *k = NULL;
4742
4743 (void) rlimit_format(arg_rlimit[rl], &k);
4744 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4745 }
4746 }
4747
4748 return 0;
4749}
4750
/* Main program logic; the DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE() invocation at the end of the file names
 * this function as its argument.
 *
 * Steps, in order:
 *   1. Parse the command line, require root, fill in default resource limits, load the OCI bundle, determine
 *      names, load settings, query the cgroup hierarchy and verify the arguments. Any failure jumps to "finish".
 *   2. Prepare the container root. With arg_directory set: either snapshot it (arg_ephemeral) or lock it and
 *      optionally populate it from arg_template, then check that it looks like an OS tree. Otherwise arg_image
 *      is used: either copy it (arg_ephemeral) or lock it, create a temporary directory that becomes
 *      arg_directory, set up a loop device for the image, then dissect and decrypt it.
 *   3. Call run_container() in a loop for as long as it returns a positive value.
 *   4. At "finish", kill and reap a still-known child, flush the pty master to stdout, and remove whatever was
 *      created above (ephemeral tree/image, temporary root directory, propagate directory, veth links, ...).
 *
 * Returns r if it is negative, otherwise ret (EXIT_SUCCESS unless run_container() stored something else). */
44dbef90 4751static int run(int argc, char *argv[]) {
7bf011e3
LP
4752 bool secondary = false, remove_directory = false, remove_image = false,
4753 veth_created = false, remove_tmprootdir = false;
2d845785 4754 _cleanup_close_ int master = -1;
03cfe0d5 4755 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 4756 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 4757 char veth_name[IFNAMSIZ] = "";
03cfe0d5 4758 union in_addr_union exposed = {};
8e766630 4759 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082 4760 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 4761 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
4762 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4763 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
7bf011e3 4764 pid_t pid = 0;
03cfe0d5
LP
4765
4766 log_parse_environment();
4767 log_open();
415fc41c 4768
/* A zero return from parse_argv() also leaves via "finish": r stays 0 and ret stays EXIT_SUCCESS, so the
 * program ends successfully without starting a container. NOTE(review): presumably the --help/--version
 * style paths; confirm in parse_argv(). */
03cfe0d5
LP
4769 r = parse_argv(argc, argv);
4770 if (r <= 0)
4771 goto finish;
4772
fba868fa
LP
4773 r = must_be_root();
4774 if (r < 0)
03cfe0d5 4775 goto finish;
fba868fa 4776
bf428efb
LP
4777 r = initialize_rlimits();
4778 if (r < 0)
4779 goto finish;
4780
de40a303
LP
4781 r = load_oci_bundle();
4782 if (r < 0)
4783 goto finish;
4784
f757855e
LP
4785 r = determine_names();
4786 if (r < 0)
4787 goto finish;
4788
4789 r = load_settings();
4790 if (r < 0)
4791 goto finish;
4792
d4d99bc6 4793 r = cg_unified();
5eee8290
LP
4794 if (r < 0) {
4795 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
4796 goto finish;
4797 }
4798
f757855e
LP
4799 r = verify_arguments();
4800 if (r < 0)
4801 goto finish;
03cfe0d5 4802
49048684
ZJS
4803 /* Reapply environment settings. */
4804 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 4805
2949ff26
LP
4806 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
4807 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
4808 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
4809 (void) ignore_signals(SIGPIPE, -1);
4810
/* Collect file descriptors handed to us via sd_listen_fds() into an FDSet; "fds" stays NULL if there are none. */
03cfe0d5
LP
4811 n_fd_passed = sd_listen_fds(false);
4812 if (n_fd_passed > 0) {
4813 r = fdset_new_listen_fds(&fds, false);
4814 if (r < 0) {
4815 log_error_errno(r, "Failed to collect file descriptors: %m");
4816 goto finish;
4817 }
4818 }
4819
83e803a9
ZJS
4820 /* The "default" umask. This is appropriate for most file and directory
4821 * operations performed by nspawn, and is the umask that will be used for
4822 * the child. Functions like copy_devnodes() change the umask temporarily. */
4823 umask(0022);
4824
03cfe0d5
LP
4825 if (arg_directory) {
4826 assert(!arg_image);
4827
b35ca61a
LP
4828 /* Safety precaution: let's not allow running images from the live host OS image, as long as
4829 * /var from the host will propagate into container dynamically (because bad things happen if
4830 * two systems write to the same /var). Let's allow it for the special cases where /var is
4831 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
4832 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
4833 log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
4834 r = -EINVAL;
4835 goto finish;
4836 }
4837
4838 if (arg_ephemeral) {
4839 _cleanup_free_ char *np = NULL;
4840
8d4aa2bb 4841 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
4842 if (r < 0)
4843 goto finish;
4844
7bf011e3
LP
4845 /* If the specified path is a mount point we generate the new snapshot immediately
4846 * inside it under a random name. However if the specified is not a mount point we
4847 * create the new snapshot in the parent directory, just next to it. */
e1873695 4848 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
4849 if (r < 0) {
4850 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4851 goto finish;
4852 }
4853 if (r > 0)
770b5ce4 4854 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4855 else
770b5ce4 4856 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 4857 if (r < 0) {
0f3be6ca 4858 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
4859 goto finish;
4860 }
4861
6992459c
LP
4862 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
4863 * only owned by us and no one else. */
4864 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
4865 if (r < 0) {
4866 log_error_errno(r, "Failed to lock %s: %m", np);
4867 goto finish;
4868 }
4869
7bf011e3
LP
4870 {
4871 BLOCK_SIGNALS(SIGINT);
4872 r = btrfs_subvol_snapshot(arg_directory, np,
4873 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4874 BTRFS_SNAPSHOT_FALLBACK_COPY |
4875 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4876 BTRFS_SNAPSHOT_RECURSIVE |
4877 BTRFS_SNAPSHOT_QUOTA |
4878 BTRFS_SNAPSHOT_SIGINT);
4879 }
4880 if (r == -EINTR) {
4881 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
4882 goto finish;
4883 }
03cfe0d5
LP
4884 if (r < 0) {
4885 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4886 goto finish;
ec16945e
LP
4887 }
4888
/* From here on arg_directory refers to the snapshot; remember to delete it again at "finish". */
1cc6c93a 4889 free_and_replace(arg_directory, np);
17cbb288 4890 remove_directory = true;
30535c16 4891 } else {
cb638b5e 4892 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
4893 if (r < 0)
4894 goto finish;
4895
30535c16
LP
4896 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4897 if (r == -EBUSY) {
4898 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4899 goto finish;
4900 }
4901 if (r < 0) {
4902 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 4903 goto finish;
30535c16
LP
4904 }
4905
4906 if (arg_template) {
8d4aa2bb 4907 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
4908 if (r < 0)
4909 goto finish;
4910
7bf011e3
LP
4911 {
4912 BLOCK_SIGNALS(SIGINT);
4913 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4914 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4915 BTRFS_SNAPSHOT_FALLBACK_COPY |
4916 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4917 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4918 BTRFS_SNAPSHOT_RECURSIVE |
4919 BTRFS_SNAPSHOT_QUOTA |
4920 BTRFS_SNAPSHOT_SIGINT);
4921 }
/* An already existing target directory is not an error: it is used as is and only logged. */
ff6c6cc1
LP
4922 if (r == -EEXIST)
4923 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4924 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
4925 else if (r == -EINTR) {
4926 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
4927 goto finish;
4928 } else if (r < 0) {
83521414 4929 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 4930 goto finish;
ff6c6cc1
LP
4931 } else
4932 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4933 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 4934 }
ec16945e
LP
4935 }
4936
/* Sanity check of the tree: in boot mode path_is_os_tree() must succeed, otherwise a /usr/ directory must exist.
 * If arg_pivot_root_new is set, the check is applied to that path below arg_directory instead. */
7732f92b 4937 if (arg_start_mode == START_BOOT) {
a5201ed6 4938 const char *p;
c9fe05e0 4939
a5201ed6
LP
4940 if (arg_pivot_root_new)
4941 p = prefix_roota(arg_directory, arg_pivot_root_new);
4942 else
4943 p = arg_directory;
c9fe05e0
AR
4944
4945 if (path_is_os_tree(p) <= 0) {
4946 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 4947 r = -EINVAL;
1b9e5b12
LP
4948 goto finish;
4949 }
4950 } else {
c9fe05e0
AR
4951 const char *p, *q;
4952
a5201ed6
LP
4953 if (arg_pivot_root_new)
4954 p = prefix_roota(arg_directory, arg_pivot_root_new);
4955 else
4956 p = arg_directory;
c9fe05e0
AR
4957
4958 q = strjoina(p, "/usr/");
1b9e5b12 4959
c9fe05e0
AR
4960 if (laccess(q, F_OK) < 0) {
4961 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 4962 r = -EINVAL;
1b9e5b12 4963 goto finish;
1b9e5b12
LP
4964 }
4965 }
ec16945e 4966
6b9132a9 4967 } else {
ec16945e
LP
4968 assert(arg_image);
4969 assert(!arg_template);
4970
8d4aa2bb 4971 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
4972 if (r < 0)
4973 goto finish;
4974
0f3be6ca
LP
4975 if (arg_ephemeral) {
4976 _cleanup_free_ char *np = NULL;
4977
4978 r = tempfn_random(arg_image, "machine.", &np);
4979 if (r < 0) {
4980 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4981 goto finish;
4982 }
4983
6992459c
LP
4984 /* Always take an exclusive lock on our own ephemeral copy. */
4985 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
4986 if (r < 0) {
4987 r = log_error_errno(r, "Failed to create image lock: %m");
4988 goto finish;
4989 }
4990
7bf011e3
LP
4991 {
4992 BLOCK_SIGNALS(SIGINT);
4993 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
4994 }
4995 if (r == -EINTR) {
4996 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
4997 goto finish;
4998 }
0f3be6ca
LP
4999 if (r < 0) {
5000 r = log_error_errno(r, "Failed to copy image file: %m");
5001 goto finish;
5002 }
5003
/* From here on arg_image refers to the private copy; remember to unlink it again. */
1cc6c93a 5004 free_and_replace(arg_image, np);
0f3be6ca
LP
5005 remove_image = true;
5006 } else {
5007 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5008 if (r == -EBUSY) {
5009 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5010 goto finish;
5011 }
5012 if (r < 0) {
5013 r = log_error_errno(r, "Failed to create image lock: %m");
5014 goto finish;
5015 }
4623e8e6 5016
78ebe980
LP
5017 if (!arg_root_hash) {
5018 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
5019 if (r < 0) {
5020 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
5021 goto finish;
5022 }
5023 }
30535c16
LP
5024 }
5025
/* mkdtemp() fills in the XXXXXX part of tmprootdir in place; the resulting directory becomes arg_directory. */
c67b0082 5026 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5027 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5028 goto finish;
1b9e5b12 5029 }
6b9132a9 5030
c67b0082
LP
5031 remove_tmprootdir = true;
5032
5033 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5034 if (!arg_directory) {
5035 r = log_oom();
5036 goto finish;
6b9132a9 5037 }
88213476 5038
e08f94ac 5039 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, LO_FLAGS_PARTSCAN, &loop);
2d845785
LP
5040 if (r < 0) {
5041 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5042 goto finish;
5043 }
1b9e5b12 5044
4526113f 5045 r = dissect_image_and_warn(
e0f9e7bd 5046 loop->fd,
4526113f 5047 arg_image,
e0f9e7bd
LP
5048 arg_root_hash, arg_root_hash_size,
5049 DISSECT_IMAGE_REQUIRE_ROOT,
5050 &dissected_image);
2d845785 5051 if (r == -ENOPKG) {
4526113f 5052 /* dissect_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5053 log_notice("Note that the disk image needs to\n"
5054 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5055 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
5056 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
5057 " d) or contain a file system without a partition table\n"
5058 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5059 goto finish;
2d845785 5060 }
4526113f 5061 if (r < 0)
842f3b0f 5062 goto finish;
1b9e5b12 5063
4623e8e6
LP
5064 if (!arg_root_hash && dissected_image->can_verity)
5065 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
5066
5067 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
1b9e5b12
LP
5068 if (r < 0)
5069 goto finish;
0f3be6ca
LP
5070
5071 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5072 if (remove_image && unlink(arg_image) >= 0)
5073 remove_image = false;
842f3b0f 5074 }
842f3b0f 5075
86c0dd4a 5076 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5077 if (r < 0)
5078 goto finish;
5079
/* No console mode chosen (negative value): interactive if both stdin and stdout are TTYs, read-only otherwise. */
de40a303
LP
5080 if (arg_console_mode < 0)
5081 arg_console_mode =
5082 isatty(STDIN_FILENO) > 0 &&
5083 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5084
de40a303
LP
5085 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5086 arg_quiet = true;
a258bf26 5087
9c857b9d
LP
5088 if (!arg_quiet)
5089 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5090 arg_machine, arg_image ?: arg_directory);
5091
72c0a2c2 5092 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 5093
66edd963 5094 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
5095 r = log_error_errno(errno, "Failed to become subreaper: %m");
5096 goto finish;
5097 }
5098
/* run_container() returns a positive value to request another iteration (its "loop again" path after a
 * container reboot); zero or a negative value ends the loop. */
d87be9b0 5099 for (;;) {
3acc84eb 5100 r = run_container(dissected_image,
44dbef90
LP
5101 secondary,
5102 fds,
5103 veth_name, &veth_created,
3acc84eb 5104 &exposed, &master,
44dbef90 5105 &pid, &ret);
b0067625 5106 if (r <= 0)
d87be9b0 5107 break;
d87be9b0 5108 }
88213476
LP
5109
/* Common exit path. It is also reached from every early failure above, hence the guards on pid, master and
 * the remove_* flags below. */
5110finish:
04f590a4
LP
5111 (void) sd_notify(false,
5112 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5113 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5114
9444b1f2 5115 if (pid > 0)
c67b0082 5116 (void) kill(pid, SIGKILL);
88213476 5117
503546da 5118 /* Try to flush whatever is still queued in the pty */
6a0f896b 5119 if (master >= 0) {
1c876927 5120 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
5121 master = safe_close(master);
5122 }
5123
5124 if (pid > 0)
5125 (void) wait_for_terminate(pid, NULL);
503546da 5126
50ebcf6c
LP
5127 pager_close();
5128
17cbb288 5129 if (remove_directory && arg_directory) {
ec16945e
LP
5130 int k;
5131
17cbb288 5132 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5133 if (k < 0)
17cbb288 5134 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5135 }
5136
/* Only still set if the early unlink() right after decrypting the image did not happen or failed. */
0f3be6ca
LP
5137 if (remove_image && arg_image) {
5138 if (unlink(arg_image) < 0)
5139 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5140 }
5141
c67b0082
LP
5142 if (remove_tmprootdir) {
5143 if (rmdir(tmprootdir) < 0)
5144 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5145 }
5146
785890ac
LP
5147 if (arg_machine) {
5148 const char *p;
5149
63c372cb 5150 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5151 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5152 }
5153
7a8f6325 5154 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
5155
5156 if (veth_created)
5157 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5158 (void) remove_bridge(arg_network_zone);
f757855e 5159
f757855e
LP
5160 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5161 expose_port_free_all(arg_expose_ports);
bf428efb 5162 rlimit_free_all(arg_rlimit);
b2645747 5163 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
6d0b55c2 5164
/* A negative r takes precedence; otherwise hand back the value kept in ret. */
44dbef90
LP
5165 if (r < 0)
5166 return r;
5167
5168 return ret;
88213476 5169}
44dbef90
LP
5170
/* Passes run() to the main-function helper macro. NOTE(review): presumably this expands to main() and treats
 * both negative and positive return values of run() as failure exit statuses — confirm against main-func.h. */
5171DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);