]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: pass userdata pointer, not inet_addr union
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
335d2ead 14#include <sys/ioctl.h>
8fe0087e
LP
15#include <sys/personality.h>
16#include <sys/prctl.h>
17#include <sys/types.h>
6916b164 18#include <sys/wait.h>
335d2ead 19#include <termios.h>
8fe0087e 20#include <unistd.h>
1b9e5b12 21
b053cd5f 22#include "sd-bus.h"
1f0cd86b 23#include "sd-daemon.h"
1f0cd86b 24#include "sd-id128.h"
8fe0087e 25
b5efdb8a 26#include "alloc-util.h"
8fe0087e
LP
27#include "barrier.h"
28#include "base-filesystem.h"
29#include "blkid-util.h"
30#include "btrfs-util.h"
b8ea7a6e 31#include "bus-error.h"
b053cd5f 32#include "bus-util.h"
8fe0087e 33#include "cap-list.h"
430f0182 34#include "capability-util.h"
04d391da 35#include "cgroup-util.h"
8fe0087e 36#include "copy.h"
d107bb7d 37#include "cpu-set-util.h"
4fc9982c 38#include "dev-setup.h"
2d845785 39#include "dissect-image.h"
8fe0087e 40#include "env-util.h"
3652872a 41#include "escape.h"
3ffd4af2 42#include "fd-util.h"
842f3b0f 43#include "fdset.h"
a5c32cff 44#include "fileio.h"
f97b34a6 45#include "format-util.h"
f4f15635 46#include "fs-util.h"
1b9e5b12 47#include "gpt.h"
4623e8e6 48#include "hexdecoct.h"
8fe0087e 49#include "hostname-util.h"
910fd145 50#include "id128-util.h"
3652872a 51#include "io-util.h"
8fe0087e 52#include "log.h"
2d845785 53#include "loop-util.h"
8fe0087e 54#include "loopback-setup.h"
1b9cebf6 55#include "machine-image.h"
8fe0087e 56#include "macro.h"
44dbef90 57#include "main-func.h"
f5947a5e 58#include "missing_sched.h"
8fe0087e 59#include "mkdir.h"
4349cd7c 60#include "mount-util.h"
049af8ad 61#include "mountpoint-util.h"
0cb8e3d1 62#include "namespace-util.h"
8fe0087e 63#include "netlink-util.h"
07630cea 64#include "nspawn-cgroup.h"
3652872a 65#include "nspawn-creds.h"
3603efde 66#include "nspawn-def.h"
07630cea
LP
67#include "nspawn-expose-ports.h"
68#include "nspawn-mount.h"
69#include "nspawn-network.h"
de40a303 70#include "nspawn-oci.h"
7336138e 71#include "nspawn-patch-uid.h"
07630cea 72#include "nspawn-register.h"
910fd145 73#include "nspawn-seccomp.h"
07630cea
LP
74#include "nspawn-settings.h"
75#include "nspawn-setuid.h"
7732f92b 76#include "nspawn-stub-pid1.h"
d8b4d14d 77#include "nulstr-util.h"
d58ad743 78#include "os-util.h"
50ebcf6c 79#include "pager.h"
6bedfcbb 80#include "parse-util.h"
8fe0087e 81#include "path-util.h"
294bf0c3 82#include "pretty-print.h"
0b452006 83#include "process-util.h"
8fe0087e
LP
84#include "ptyfwd.h"
85#include "random-util.h"
8869a0b4 86#include "raw-clone.h"
86775e35 87#include "resolve-util.h"
bf428efb 88#include "rlimit-util.h"
8fe0087e 89#include "rm-rf.h"
de40a303
LP
90#if HAVE_SECCOMP
91#include "seccomp-util.h"
92#endif
68b02049 93#include "selinux-util.h"
8fe0087e 94#include "signal-util.h"
2583fbea 95#include "socket-util.h"
8fcde012 96#include "stat-util.h"
15a5e950 97#include "stdio-util.h"
5c828e66 98#include "string-table.h"
07630cea 99#include "string-util.h"
8fe0087e 100#include "strv.h"
de40a303 101#include "sysctl-util.h"
8fe0087e 102#include "terminal-util.h"
e4de7287 103#include "tmpfile-util.h"
affb60b1 104#include "umask-util.h"
43c3fb46 105#include "unit-name.h"
b1d4f8e1 106#include "user-util.h"
8fe0087e 107#include "util.h"
e9642be2 108
e96ceaba
LP
/* Path of the notify socket inside the container, which the container can use to talk to nspawn via the sd_notify(3) protocol */
110#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
0e7ac751 111
2a49b612
ZJS
112#define EXIT_FORCE_RESTART 133
113
113cea80
DH
/* Outcome of a container run.
 * NOTE(review): the meaning is inferred from the enumerator names only; the code that produces and consumes
 * these values is not part of this chunk. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
118
88213476 119static char *arg_directory = NULL;
ec16945e 120static char *arg_template = NULL;
5f932eb9 121static char *arg_chdir = NULL;
b53ede69
PW
122static char *arg_pivot_root_new = NULL;
123static char *arg_pivot_root_old = NULL;
687d0825 124static char *arg_user = NULL;
de40a303
LP
125static uid_t arg_uid = UID_INVALID;
126static gid_t arg_gid = GID_INVALID;
127static gid_t* arg_supplementary_gids = NULL;
128static size_t arg_n_supplementary_gids = 0;
9444b1f2 129static sd_id128_t arg_uuid = {};
3a9530e5
LP
130static char *arg_machine = NULL; /* The name used by the host to refer to this */
131static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
132static const char *arg_selinux_context = NULL;
133static const char *arg_selinux_apifs_context = NULL;
de40a303 134static char *arg_slice = NULL;
ff01d048 135static bool arg_private_network = false;
bc2f673e 136static bool arg_read_only = false;
7732f92b 137static StartMode arg_start_mode = START_PID1;
ec16945e 138static bool arg_ephemeral = false;
57fb9fb5 139static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 140static bool arg_link_journal_try = false;
520e0d54 141static uint64_t arg_caps_retain =
50b52222
LP
142 (1ULL << CAP_AUDIT_CONTROL) |
143 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
144 (1ULL << CAP_CHOWN) |
145 (1ULL << CAP_DAC_OVERRIDE) |
146 (1ULL << CAP_DAC_READ_SEARCH) |
147 (1ULL << CAP_FOWNER) |
148 (1ULL << CAP_FSETID) |
149 (1ULL << CAP_IPC_OWNER) |
150 (1ULL << CAP_KILL) |
151 (1ULL << CAP_LEASE) |
152 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 153 (1ULL << CAP_MKNOD) |
5076f0cc
LP
154 (1ULL << CAP_NET_BIND_SERVICE) |
155 (1ULL << CAP_NET_BROADCAST) |
156 (1ULL << CAP_NET_RAW) |
5076f0cc 157 (1ULL << CAP_SETFCAP) |
50b52222 158 (1ULL << CAP_SETGID) |
5076f0cc
LP
159 (1ULL << CAP_SETPCAP) |
160 (1ULL << CAP_SETUID) |
161 (1ULL << CAP_SYS_ADMIN) |
50b52222 162 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
163 (1ULL << CAP_SYS_CHROOT) |
164 (1ULL << CAP_SYS_NICE) |
165 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 166 (1ULL << CAP_SYS_RESOURCE) |
50b52222 167 (1ULL << CAP_SYS_TTY_CONFIG);
88fc9c9b 168static uint64_t arg_caps_ambient = 0;
de40a303 169static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
5a8af538 170static CustomMount *arg_custom_mounts = NULL;
88614c8a 171static size_t arg_n_custom_mounts = 0;
f4889f65 172static char **arg_setenv = NULL;
284c0b91 173static bool arg_quiet = false;
eb91eb18 174static bool arg_register = true;
89f7c846 175static bool arg_keep_unit = false;
aa28aefe 176static char **arg_network_interfaces = NULL;
c74e630d 177static char **arg_network_macvlan = NULL;
4bbfe7ad 178static char **arg_network_ipvlan = NULL;
69c79d3c 179static bool arg_network_veth = false;
f6d6bad1 180static char **arg_network_veth_extra = NULL;
f757855e 181static char *arg_network_bridge = NULL;
22b28dfd 182static char *arg_network_zone = NULL;
d7bea6b6 183static char *arg_network_namespace_path = NULL;
bb068de0 184static PagerFlags arg_pager_flags = 0;
050f7277 185static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 186static char *arg_image = NULL;
de40a303 187static char *arg_oci_bundle = NULL;
f757855e 188static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 189static ExposePort *arg_expose_ports = NULL;
f36933fe 190static char **arg_property = NULL;
de40a303 191static sd_bus_message *arg_property_message = NULL;
0de7acce 192static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 193static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 194static bool arg_userns_chown = false;
c6c8f6e2 195static int arg_kill_signal = 0;
5da38d07 196static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
197static SettingsMask arg_settings_mask = 0;
198static int arg_settings_trusted = -1;
199static char **arg_parameters = NULL;
6aadfa4c 200static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 201static bool arg_notify_ready = false;
5a8ff0e6 202static bool arg_use_cgns = true;
0c582db0 203static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 204static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
aee36b4e 205static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
6b000af4
LP
206static char **arg_syscall_allow_list = NULL;
207static char **arg_syscall_deny_list = NULL;
de40a303
LP
208#if HAVE_SECCOMP
209static scmp_filter_ctx arg_seccomp = NULL;
210#endif
bf428efb 211static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 212static bool arg_no_new_privileges = false;
81f345df
LP
213static int arg_oom_score_adjust = 0;
214static bool arg_oom_score_adjust_set = false;
0985c7c4 215static CPUSet arg_cpu_set = {};
09d423e9 216static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 217static TimezoneMode arg_timezone = TIMEZONE_AUTO;
de40a303
LP
218static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
219static DeviceNode* arg_extra_nodes = NULL;
220static size_t arg_n_extra_nodes = 0;
221static char **arg_sysctl = NULL;
222static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
3652872a
LP
223static Credential *arg_credentials = NULL;
224static size_t arg_n_credentials = 0;
88213476 225
6145bb4f
LP
/* Cleanup registrations for the heap-backed settings above, grouped by the cleanup function that is used. */

/* Plain allocations */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);

/* String vectors */
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);

/* Objects with dedicated cleanup helpers */
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
257
dce66ffe
ZJS
258static int handle_arg_console(const char *arg) {
259 if (streq(arg, "help")) {
10e8a60b
LP
260 puts("autopipe\n"
261 "interactive\n"
dce66ffe 262 "passive\n"
10e8a60b
LP
263 "pipe\n"
264 "read-only");
dce66ffe
ZJS
265 return 0;
266 }
267
268 if (streq(arg, "interactive"))
269 arg_console_mode = CONSOLE_INTERACTIVE;
270 else if (streq(arg, "read-only"))
271 arg_console_mode = CONSOLE_READ_ONLY;
272 else if (streq(arg, "passive"))
273 arg_console_mode = CONSOLE_PASSIVE;
554c4beb
LP
274 else if (streq(arg, "pipe")) {
275 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
276 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
277 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
278 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
279 "Proceeding anyway.");
280
dce66ffe 281 arg_console_mode = CONSOLE_PIPE;
10e8a60b
LP
282 } else if (streq(arg, "autopipe")) {
283 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
284 arg_console_mode = CONSOLE_INTERACTIVE;
285 else
286 arg_console_mode = CONSOLE_PIPE;
554c4beb 287 } else
dce66ffe
ZJS
288 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
289
290 arg_settings_mask |= SETTING_CONSOLE_MODE;
291 return 1;
292}
293
37ec0fdd
LP
294static int help(void) {
295 _cleanup_free_ char *link = NULL;
296 int r;
297
bb068de0 298 (void) pager_open(arg_pager_flags);
50ebcf6c 299
37ec0fdd
LP
300 r = terminal_urlify_man("systemd-nspawn", "1", &link);
301 if (r < 0)
302 return log_oom();
303
25148653 304 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 305 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
306 " -h --help Show this help\n"
307 " --version Print version string\n"
69c79d3c 308 " -q --quiet Do not show status information\n"
bb068de0 309 " --no-pager Do not pipe output into a pager\n"
25148653
LP
310 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
311 "%3$sImage:%4$s\n"
1b9e5b12 312 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
313 " --template=PATH Initialize root directory from template directory,\n"
314 " if missing\n"
315 " -x --ephemeral Run container with snapshot of root directory, and\n"
316 " remove it after exit\n"
25e68fd3
LP
317 " -i --image=PATH Root file system disk image (or device node) for\n"
318 " the container\n"
de40a303 319 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
320 " --read-only Mount the root directory read-only\n"
321 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 322 " --root-hash=HASH Specify verity root hash for root disk image\n"
c2923fdc
LB
323 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
324 " as a DER encoded PKCS7, either as a path to a file\n"
325 " or as an ASCII base64 encoded string prefixed by\n"
326 " 'base64:'\n"
e7cbe5cb 327 " --verity-data=PATH Specify hash device for verity\n"
25148653
LP
328 " --pivot-root=PATH[:PATH]\n"
329 " Pivot root to given directory in the container\n\n"
330 "%3$sExecution:%4$s\n"
7732f92b 331 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 332 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 333 " --chdir=PATH Set working directory in the container\n"
25148653
LP
334 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
335 " -u --user=USER Run the command under specified user or UID\n"
336 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
337 " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
338 "%3$sSystem Identity:%4$s\n"
a8828ed9 339 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 340 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
341 " --uuid=UUID Set a specific machine UUID for the container\n\n"
342 "%3$sProperties:%4$s\n"
a8828ed9 343 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 344 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
345 " --register=BOOLEAN Register container as machine\n"
346 " --keep-unit Do not register a scope for the machine, reuse\n"
347 " the service unit nspawn is running in\n\n"
348 "%3$sUser Namespacing:%4$s\n"
90b4a64d 349 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 350 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 351 " Similar, but with user configured UID/GID range\n"
25148653
LP
352 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
353 "%3$sNetworking:%4$s\n"
69c79d3c
LP
354 " --private-network Disable network in container\n"
355 " --network-interface=INTERFACE\n"
356 " Assign an existing network interface to the\n"
357 " container\n"
c74e630d
LP
358 " --network-macvlan=INTERFACE\n"
359 " Create a macvlan network interface based on an\n"
360 " existing network interface to the container\n"
4bbfe7ad
TG
361 " --network-ipvlan=INTERFACE\n"
362 " Create a ipvlan network interface based on an\n"
363 " existing network interface to the container\n"
a8eaaee7 364 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 365 " and container\n"
f6d6bad1
LP
366 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
367 " Add an additional virtual Ethernet link between\n"
368 " host and container\n"
ab046dde 369 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
370 " Add a virtual Ethernet connection to the container\n"
371 " and attach it to an existing bridge on the host\n"
372 " --network-zone=NAME Similar, but attach the new interface to an\n"
373 " an automatically managed bridge interface\n"
d7bea6b6
DP
374 " --network-namespace-path=PATH\n"
375 " Set network namespace to the one represented by\n"
376 " the specified kernel namespace file node\n"
6d0b55c2 377 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
378 " Expose a container IP port on the host\n\n"
379 "%3$sSecurity:%4$s\n"
a8828ed9
DW
380 " --capability=CAP In addition to the default, retain specified\n"
381 " capability\n"
382 " --drop-capability=CAP Drop the specified capability from the default set\n"
88fc9c9b
TH
383 " --ambient-capability=CAP\n"
384 " Sets the specified capability for the started\n"
385 " process. Not useful if booting a machine.\n"
f4e803c8 386 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
387 " --system-call-filter=LIST|~LIST\n"
388 " Permit/prohibit specific system calls\n"
25148653
LP
389 " -Z --selinux-context=SECLABEL\n"
390 " Set the SELinux security context to be used by\n"
391 " processes in the container\n"
392 " -L --selinux-apifs-context=SECLABEL\n"
393 " Set the SELinux security context to be used by\n"
394 " API/tmpfs file systems in the container\n\n"
395 "%3$sResources:%4$s\n"
bf428efb 396 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
397 " --oom-score-adjust=VALUE\n"
398 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
399 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
400 " --personality=ARCH Pick personality for this container\n\n"
25148653 401 "%3$sIntegration:%4$s\n"
09d423e9 402 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 403 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
404 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
405 " host, try-guest, try-host\n"
406 " -j Equivalent to --link-journal=try-guest\n\n"
407 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
408 " --bind=PATH[:PATH[:OPTIONS]]\n"
409 " Bind mount a file or directory from the host into\n"
a8828ed9 410 " the container\n"
5e5bfa6e
EY
411 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
412 " Similar, but creates a read-only bind mount\n"
de40a303
LP
413 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
414 " it\n"
06c17c39 415 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
416 " --overlay=PATH[:PATH...]:PATH\n"
417 " Create an overlay mount from the host to \n"
418 " the container\n"
419 " --overlay-ro=PATH[:PATH...]:PATH\n"
25148653
LP
420 " Similar, but creates a read-only overlay mount\n\n"
421 "%3$sInput/Output:%4$s\n"
de40a303
LP
422 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
423 " set up for the container.\n"
3652872a
LP
424 " -P --pipe Equivalent to --console=pipe\n\n"
425 "%3$sCredentials:%4$s\n"
426 " --set-credential=ID:VALUE\n"
427 " Pass a credential with literal value to container.\n"
428 " --load-credential=ID:PATH\n"
429 " Load credential to pass to container from file or\n"
430 " AF_UNIX stream socket.\n"
25148653 431 "\nSee the %2$s for details.\n"
37ec0fdd
LP
432 , program_invocation_short_name
433 , link
37a92352
LP
434 , ansi_underline(), ansi_normal()
435 , ansi_highlight(), ansi_normal()
436 );
37ec0fdd
LP
437
438 return 0;
88213476
LP
439}
440
86c0dd4a 441static int custom_mount_check_all(void) {
88614c8a 442 size_t i;
5a8af538 443
5a8af538
LP
444 for (i = 0; i < arg_n_custom_mounts; i++) {
445 CustomMount *m = &arg_custom_mounts[i];
446
0de7acce 447 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
baaa35ad
ZJS
448 if (arg_userns_chown)
449 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
450 "--private-users-chown may not be combined with custom root mounts.");
451 else if (arg_uid_shift == UID_INVALID)
452 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
453 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 454 }
5a8af538
LP
455 }
456
457 return 0;
458}
459
8199d554 460static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 461 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 462 int r;
5da38d07 463
efdb0237 464 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
465
466 e = getenv(var);
467 if (!e) {
d5fc5b2f 468 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
469 var = "UNIFIED_CGROUP_HIERARCHY";
470 e = getenv(var);
c78c095b
ZJS
471 }
472
473 if (!isempty(e)) {
efdb0237
LP
474 r = parse_boolean(e);
475 if (r < 0)
c78c095b 476 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
477 if (r > 0)
478 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
479 else
480 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
481 }
482
8199d554
LP
483 return 0;
484}
485
486static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
487 int r;
488
75b0d8b8
ZJS
489 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
490 * in the image actually supports. */
b4cccbc1
LP
491 r = cg_all_unified();
492 if (r < 0)
493 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
494 if (r > 0) {
a8725a06
ZJS
495 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
496 * routine only detects 231, so we'll have a false negative here for 230. */
497 r = systemd_installation_has_version(directory, 230);
498 if (r < 0)
499 return log_error_errno(r, "Failed to determine systemd version in container: %m");
500 if (r > 0)
501 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
502 else
503 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 504 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
505 /* Mixed cgroup hierarchy support was added in 233 */
506 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
507 if (r < 0)
508 return log_error_errno(r, "Failed to determine systemd version in container: %m");
509 if (r > 0)
510 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
511 else
512 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
513 } else
5da38d07 514 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 515
8199d554
LP
516 log_debug("Using %s hierarchy for container.",
517 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
518 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
519
efdb0237
LP
520 return 0;
521}
522
8a99bd0c
ZJS
523static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
524 uint64_t mask = 0;
525 int r;
526
527 for (;;) {
528 _cleanup_free_ char *t = NULL;
529
530 r = extract_first_word(&spec, &t, ",", 0);
531 if (r < 0)
532 return log_error_errno(r, "Failed to parse capability %s.", t);
533 if (r == 0)
534 break;
535
536 if (streq(t, "help")) {
537 for (int i = 0; i < capability_list_length(); i++) {
538 const char *name;
539
540 name = capability_to_name(i);
541 if (name)
542 puts(name);
543 }
544
545 return 0; /* quit */
546 }
547
548 if (streq(t, "all"))
549 mask = (uint64_t) -1;
550 else {
551 r = capability_from_name(t);
552 if (r < 0)
553 return log_error_errno(r, "Failed to parse capability %s.", t);
554
555 mask |= 1ULL << r;
556 }
557 }
558
559 *ret_mask = mask;
560 return 1; /* continue */
561}
562
49048684 563static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
564 int r;
565
566 r = getenv_bool(name);
567 if (r == -ENXIO)
49048684 568 return 0;
0c582db0 569 if (r < 0)
49048684 570 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 571
0c582db0 572 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 573 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 574 return 0;
0c582db0
LB
575}
576
49048684 577static int parse_mount_settings_env(void) {
4f086aab 578 const char *e;
1099ceeb
LP
579 int r;
580
581 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
582 if (r < 0 && r != -ENXIO)
583 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
584 if (r >= 0)
585 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
586
587 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 588 if (streq_ptr(e, "network"))
4f086aab 589 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 590
49048684
ZJS
591 else if (e) {
592 r = parse_boolean(e);
593 if (r < 0)
594 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
595
596 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
597 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 598 }
4f086aab 599
49048684 600 return 0;
4f086aab
SU
601}
602
49048684 603static int parse_environment(void) {
d5455d2f
LP
604 const char *e;
605 int r;
606
49048684
ZJS
607 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
608 if (r < 0)
609 return r;
610 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
611 if (r < 0)
612 return r;
613 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
614 if (r < 0)
615 return r;
616 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
617 if (r < 0)
618 return r;
d5455d2f 619
49048684
ZJS
620 r = parse_mount_settings_env();
621 if (r < 0)
622 return r;
d5455d2f 623
489fae52
ZJS
624 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
625 * even if it is supported. If not supported, it has no effect. */
de40a303 626 if (!cg_ns_supported())
489fae52 627 arg_use_cgns = false;
de40a303
LP
628 else {
629 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
630 if (r < 0) {
631 if (r != -ENXIO)
49048684 632 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
633
634 arg_use_cgns = true;
635 } else {
636 arg_use_cgns = r > 0;
637 arg_settings_mask |= SETTING_USE_CGNS;
638 }
639 }
d5455d2f
LP
640
641 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
642 if (e)
643 arg_container_service_name = e;
644
49048684 645 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
646}
647
88213476 648static int parse_argv(int argc, char *argv[]) {
a41fe3a2 649 enum {
acbeb427
ZJS
650 ARG_VERSION = 0x100,
651 ARG_PRIVATE_NETWORK,
bc2f673e 652 ARG_UUID,
5076f0cc 653 ARG_READ_ONLY,
57fb9fb5 654 ARG_CAPABILITY,
88fc9c9b 655 ARG_AMBIENT_CAPABILITY,
420c7379 656 ARG_DROP_CAPABILITY,
17fe0523
LP
657 ARG_LINK_JOURNAL,
658 ARG_BIND,
f4889f65 659 ARG_BIND_RO,
06c17c39 660 ARG_TMPFS,
5a8af538
LP
661 ARG_OVERLAY,
662 ARG_OVERLAY_RO,
de40a303 663 ARG_INACCESSIBLE,
eb91eb18 664 ARG_SHARE_SYSTEM,
89f7c846 665 ARG_REGISTER,
aa28aefe 666 ARG_KEEP_UNIT,
69c79d3c 667 ARG_NETWORK_INTERFACE,
c74e630d 668 ARG_NETWORK_MACVLAN,
4bbfe7ad 669 ARG_NETWORK_IPVLAN,
ab046dde 670 ARG_NETWORK_BRIDGE,
22b28dfd 671 ARG_NETWORK_ZONE,
f6d6bad1 672 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 673 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 674 ARG_PERSONALITY,
4d9f07b4 675 ARG_VOLATILE,
ec16945e 676 ARG_TEMPLATE,
f36933fe 677 ARG_PROPERTY,
6dac160c 678 ARG_PRIVATE_USERS,
c6c8f6e2 679 ARG_KILL_SIGNAL,
f757855e 680 ARG_SETTINGS,
5f932eb9 681 ARG_CHDIR,
b53ede69 682 ARG_PIVOT_ROOT,
7336138e 683 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 684 ARG_NOTIFY_READY,
4623e8e6 685 ARG_ROOT_HASH,
89e62e0b
LP
686 ARG_ROOT_HASH_SIG,
687 ARG_VERITY_DATA,
960e4569 688 ARG_SYSTEM_CALL_FILTER,
bf428efb 689 ARG_RLIMIT,
3a9530e5 690 ARG_HOSTNAME,
66edd963 691 ARG_NO_NEW_PRIVILEGES,
81f345df 692 ARG_OOM_SCORE_ADJUST,
d107bb7d 693 ARG_CPU_AFFINITY,
09d423e9 694 ARG_RESOLV_CONF,
1688841f 695 ARG_TIMEZONE,
de40a303
LP
696 ARG_CONSOLE,
697 ARG_PIPE,
698 ARG_OCI_BUNDLE,
bb068de0 699 ARG_NO_PAGER,
3652872a
LP
700 ARG_SET_CREDENTIAL,
701 ARG_LOAD_CREDENTIAL,
a41fe3a2
LP
702 };
703
88213476 704 static const struct option options[] = {
d7bea6b6
DP
705 { "help", no_argument, NULL, 'h' },
706 { "version", no_argument, NULL, ARG_VERSION },
707 { "directory", required_argument, NULL, 'D' },
708 { "template", required_argument, NULL, ARG_TEMPLATE },
709 { "ephemeral", no_argument, NULL, 'x' },
710 { "user", required_argument, NULL, 'u' },
711 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
712 { "as-pid2", no_argument, NULL, 'a' },
713 { "boot", no_argument, NULL, 'b' },
714 { "uuid", required_argument, NULL, ARG_UUID },
715 { "read-only", no_argument, NULL, ARG_READ_ONLY },
716 { "capability", required_argument, NULL, ARG_CAPABILITY },
88fc9c9b 717 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
d7bea6b6 718 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 719 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
720 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
721 { "bind", required_argument, NULL, ARG_BIND },
722 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
723 { "tmpfs", required_argument, NULL, ARG_TMPFS },
724 { "overlay", required_argument, NULL, ARG_OVERLAY },
725 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 726 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 727 { "machine", required_argument, NULL, 'M' },
3a9530e5 728 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
729 { "slice", required_argument, NULL, 'S' },
730 { "setenv", required_argument, NULL, 'E' },
731 { "selinux-context", required_argument, NULL, 'Z' },
732 { "selinux-apifs-context", required_argument, NULL, 'L' },
733 { "quiet", no_argument, NULL, 'q' },
734 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
735 { "register", required_argument, NULL, ARG_REGISTER },
736 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
737 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
738 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
739 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
740 { "network-veth", no_argument, NULL, 'n' },
741 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
742 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
743 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
744 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
745 { "personality", required_argument, NULL, ARG_PERSONALITY },
746 { "image", required_argument, NULL, 'i' },
747 { "volatile", optional_argument, NULL, ARG_VOLATILE },
748 { "port", required_argument, NULL, 'p' },
749 { "property", required_argument, NULL, ARG_PROPERTY },
750 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
751 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
752 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
753 { "settings", required_argument, NULL, ARG_SETTINGS },
754 { "chdir", required_argument, NULL, ARG_CHDIR },
755 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
756 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
757 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
89e62e0b
LP
758 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
759 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
d7bea6b6 760 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 761 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 762 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 763 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 764 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 765 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
766 { "console", required_argument, NULL, ARG_CONSOLE },
767 { "pipe", no_argument, NULL, ARG_PIPE },
768 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 769 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
3652872a
LP
770 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
771 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
eb9da376 772 {}
88213476
LP
773 };
774
9444b1f2 775 int c, r;
a42c8b54 776 uint64_t plus = 0, minus = 0;
f757855e 777 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
778
779 assert(argc >= 0);
780 assert(argv);
781
de40a303 782 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
783 switch (c) {
784
785 case 'h':
37ec0fdd 786 return help();
88213476 787
acbeb427 788 case ARG_VERSION:
3f6fd1ba 789 return version();
acbeb427 790
88213476 791 case 'D':
0f03c2a4 792 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 793 if (r < 0)
0f03c2a4 794 return r;
de40a303
LP
795
796 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
797 break;
798
799 case ARG_TEMPLATE:
0f03c2a4 800 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 801 if (r < 0)
0f03c2a4 802 return r;
de40a303
LP
803
804 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
805 break;
806
1b9e5b12 807 case 'i':
0f03c2a4 808 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 809 if (r < 0)
0f03c2a4 810 return r;
de40a303
LP
811
812 arg_settings_mask |= SETTING_DIRECTORY;
813 break;
814
815 case ARG_OCI_BUNDLE:
816 r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
817 if (r < 0)
818 return r;
819
ec16945e
LP
820 break;
821
822 case 'x':
823 arg_ephemeral = true;
a2f577fc 824 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
825 break;
826
687d0825 827 case 'u':
2fc09a9c
DM
828 r = free_and_strdup(&arg_user, optarg);
829 if (r < 0)
7027ff61 830 return log_oom();
687d0825 831
f757855e 832 arg_settings_mask |= SETTING_USER;
687d0825
MV
833 break;
834
22b28dfd
LP
835 case ARG_NETWORK_ZONE: {
836 char *j;
837
b910cc72 838 j = strjoin("vz-", optarg);
22b28dfd
LP
839 if (!j)
840 return log_oom();
841
842 if (!ifname_valid(j)) {
843 log_error("Network zone name not valid: %s", j);
844 free(j);
845 return -EINVAL;
846 }
847
df1fac6d 848 free_and_replace(arg_network_zone, j);
22b28dfd
LP
849
850 arg_network_veth = true;
851 arg_private_network = true;
852 arg_settings_mask |= SETTING_NETWORK;
853 break;
854 }
855
ab046dde 856 case ARG_NETWORK_BRIDGE:
ef76dff2 857
baaa35ad
ZJS
858 if (!ifname_valid(optarg))
859 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
860 "Bridge interface name not valid: %s", optarg);
ef76dff2 861
f757855e
LP
862 r = free_and_strdup(&arg_network_bridge, optarg);
863 if (r < 0)
864 return log_oom();
ab046dde 865
4831981d 866 _fallthrough_;
0dfaa006 867 case 'n':
69c79d3c
LP
868 arg_network_veth = true;
869 arg_private_network = true;
f757855e 870 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
871 break;
872
f6d6bad1
LP
873 case ARG_NETWORK_VETH_EXTRA:
874 r = veth_extra_parse(&arg_network_veth_extra, optarg);
875 if (r < 0)
876 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
877
878 arg_private_network = true;
879 arg_settings_mask |= SETTING_NETWORK;
880 break;
881
aa28aefe 882 case ARG_NETWORK_INTERFACE:
baaa35ad
ZJS
883 if (!ifname_valid(optarg))
884 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
885 "Network interface name not valid: %s", optarg);
ef76dff2 886
b390f178
DDM
887 r = test_network_interface_initialized(optarg);
888 if (r < 0)
889 return r;
890
c74e630d
LP
891 if (strv_extend(&arg_network_interfaces, optarg) < 0)
892 return log_oom();
893
894 arg_private_network = true;
f757855e 895 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
896 break;
897
898 case ARG_NETWORK_MACVLAN:
ef76dff2 899
baaa35ad
ZJS
900 if (!ifname_valid(optarg))
901 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
902 "MACVLAN network interface name not valid: %s", optarg);
ef76dff2 903
b390f178
DDM
904 r = test_network_interface_initialized(optarg);
905 if (r < 0)
906 return r;
907
c74e630d 908 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
909 return log_oom();
910
4bbfe7ad 911 arg_private_network = true;
f757855e 912 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
913 break;
914
915 case ARG_NETWORK_IPVLAN:
ef76dff2 916
baaa35ad
ZJS
917 if (!ifname_valid(optarg))
918 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
919 "IPVLAN network interface name not valid: %s", optarg);
ef76dff2 920
b390f178
DDM
921 r = test_network_interface_initialized(optarg);
922 if (r < 0)
923 return r;
924
4bbfe7ad
TG
925 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
926 return log_oom();
927
4831981d 928 _fallthrough_;
ff01d048
LP
929 case ARG_PRIVATE_NETWORK:
930 arg_private_network = true;
f757855e 931 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
932 break;
933
d7bea6b6
DP
934 case ARG_NETWORK_NAMESPACE_PATH:
935 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
936 if (r < 0)
937 return r;
938
de40a303 939 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
940 break;
941
0f0dbc46 942 case 'b':
baaa35ad
ZJS
943 if (arg_start_mode == START_PID2)
944 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
945 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
946
947 arg_start_mode = START_BOOT;
948 arg_settings_mask |= SETTING_START_MODE;
949 break;
950
951 case 'a':
baaa35ad
ZJS
952 if (arg_start_mode == START_BOOT)
953 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
954 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
955
956 arg_start_mode = START_PID2;
957 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
958 break;
959
144f0fc0 960 case ARG_UUID:
9444b1f2 961 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
962 if (r < 0)
963 return log_error_errno(r, "Invalid UUID: %s", optarg);
964
baaa35ad
ZJS
965 if (sd_id128_is_null(arg_uuid))
966 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
967 "Machine UUID may not be all zeroes.");
f757855e
LP
968
969 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 970 break;
aa96c6cb 971
43c3fb46
LP
972 case 'S': {
973 _cleanup_free_ char *mangled = NULL;
974
975 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
976 if (r < 0)
977 return log_oom();
978
43c3fb46 979 free_and_replace(arg_slice, mangled);
de40a303 980 arg_settings_mask |= SETTING_SLICE;
144f0fc0 981 break;
43c3fb46 982 }
144f0fc0 983
7027ff61 984 case 'M':
c1521918 985 if (isempty(optarg))
97b11eed 986 arg_machine = mfree(arg_machine);
c1521918 987 else {
52ef5dd7 988 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
989 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
990 "Invalid machine name: %s", optarg);
7027ff61 991
0c3c4284
LP
992 r = free_and_strdup(&arg_machine, optarg);
993 if (r < 0)
eb91eb18 994 return log_oom();
eb91eb18 995 }
9ce6d1b3 996 break;
7027ff61 997
3a9530e5
LP
998 case ARG_HOSTNAME:
999 if (isempty(optarg))
1000 arg_hostname = mfree(arg_hostname);
1001 else {
52ef5dd7 1002 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1003 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1004 "Invalid hostname: %s", optarg);
3a9530e5
LP
1005
1006 r = free_and_strdup(&arg_hostname, optarg);
1007 if (r < 0)
1008 return log_oom();
1009 }
1010
1011 arg_settings_mask |= SETTING_HOSTNAME;
1012 break;
1013
82adf6af
LP
1014 case 'Z':
1015 arg_selinux_context = optarg;
a8828ed9
DW
1016 break;
1017
82adf6af
LP
1018 case 'L':
1019 arg_selinux_apifs_context = optarg;
a8828ed9
DW
1020 break;
1021
bc2f673e
LP
1022 case ARG_READ_ONLY:
1023 arg_read_only = true;
f757855e 1024 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
1025 break;
1026
88fc9c9b
TH
1027 case ARG_AMBIENT_CAPABILITY: {
1028 uint64_t m;
1029 r = parse_capability_spec(optarg, &m);
1030 if (r <= 0)
1031 return r;
1032 arg_caps_ambient |= m;
1033 arg_settings_mask |= SETTING_CAPABILITY;
1034 break;
1035 }
420c7379
LP
1036 case ARG_CAPABILITY:
1037 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
1038 uint64_t m;
1039 r = parse_capability_spec(optarg, &m);
1040 if (r <= 0)
1041 return r;
5076f0cc 1042
8a99bd0c
ZJS
1043 if (c == ARG_CAPABILITY)
1044 plus |= m;
1045 else
1046 minus |= m;
f757855e 1047 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
1048 break;
1049 }
66edd963
LP
1050 case ARG_NO_NEW_PRIVILEGES:
1051 r = parse_boolean(optarg);
1052 if (r < 0)
1053 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1054
1055 arg_no_new_privileges = r;
1056 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1057 break;
1058
57fb9fb5
LP
1059 case 'j':
1060 arg_link_journal = LINK_GUEST;
574edc90 1061 arg_link_journal_try = true;
4e1d6aa9 1062 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1063 break;
1064
1065 case ARG_LINK_JOURNAL:
4e1d6aa9 1066 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1067 if (r < 0)
1068 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1069
4e1d6aa9 1070 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1071 break;
1072
17fe0523 1073 case ARG_BIND:
f757855e
LP
1074 case ARG_BIND_RO:
1075 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1076 if (r < 0)
1077 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1078
f757855e 1079 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1080 break;
06c17c39 1081
f757855e
LP
1082 case ARG_TMPFS:
1083 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1084 if (r < 0)
1085 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1086
f757855e 1087 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1088 break;
5a8af538
LP
1089
1090 case ARG_OVERLAY:
ad85779a
LP
1091 case ARG_OVERLAY_RO:
1092 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1093 if (r == -EADDRNOTAVAIL)
1094 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1095 if (r < 0)
1096 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1097
f757855e 1098 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1099 break;
06c17c39 1100
de40a303
LP
1101 case ARG_INACCESSIBLE:
1102 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1103 if (r < 0)
1104 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1105
1106 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1107 break;
1108
a5f1cb3b 1109 case 'E': {
f4889f65
LP
1110 char **n;
1111
baaa35ad
ZJS
1112 if (!env_assignment_is_valid(optarg))
1113 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1114 "Environment variable assignment '%s' is not valid.", optarg);
f4889f65
LP
1115
1116 n = strv_env_set(arg_setenv, optarg);
1117 if (!n)
1118 return log_oom();
1119
130d3d22 1120 strv_free_and_replace(arg_setenv, n);
f757855e 1121 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
1122 break;
1123 }
1124
284c0b91
LP
1125 case 'q':
1126 arg_quiet = true;
1127 break;
1128
8a96d94e 1129 case ARG_SHARE_SYSTEM:
a6b5216c 1130 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1131 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1132 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1133 arg_clone_ns_flags = 0;
8a96d94e
LP
1134 break;
1135
eb91eb18
LP
1136 case ARG_REGISTER:
1137 r = parse_boolean(optarg);
1138 if (r < 0) {
1139 log_error("Failed to parse --register= argument: %s", optarg);
1140 return r;
1141 }
1142
1143 arg_register = r;
1144 break;
1145
89f7c846
LP
1146 case ARG_KEEP_UNIT:
1147 arg_keep_unit = true;
1148 break;
1149
6afc95b7
LP
1150 case ARG_PERSONALITY:
1151
ac45f971 1152 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1153 if (arg_personality == PERSONALITY_INVALID)
1154 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1155 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1156
f757855e 1157 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1158 break;
1159
4d9f07b4
LP
1160 case ARG_VOLATILE:
1161
1162 if (!optarg)
f757855e 1163 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1164 else if (streq(optarg, "help")) {
1165 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1166 return 0;
1167 } else {
f757855e 1168 VolatileMode m;
4d9f07b4 1169
f757855e 1170 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1171 if (m < 0)
1172 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1173 "Failed to parse --volatile= argument: %s", optarg);
1174 else
f757855e 1175 arg_volatile_mode = m;
6d0b55c2
LP
1176 }
1177
f757855e
LP
1178 arg_settings_mask |= SETTING_VOLATILE_MODE;
1179 break;
6d0b55c2 1180
f757855e
LP
1181 case 'p':
1182 r = expose_port_parse(&arg_expose_ports, optarg);
1183 if (r == -EEXIST)
1184 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1185 if (r < 0)
1186 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1187
f757855e 1188 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1189 break;
6d0b55c2 1190
f36933fe
LP
1191 case ARG_PROPERTY:
1192 if (strv_extend(&arg_property, optarg) < 0)
1193 return log_oom();
1194
1195 break;
1196
ae209204
ZJS
1197 case ARG_PRIVATE_USERS: {
1198 int boolean = -1;
0de7acce 1199
ae209204
ZJS
1200 if (!optarg)
1201 boolean = true;
1202 else if (!in_charset(optarg, DIGITS))
1203 /* do *not* parse numbers as booleans */
1204 boolean = parse_boolean(optarg);
1205
1206 if (boolean == false) {
0de7acce
LP
1207 /* no: User namespacing off */
1208 arg_userns_mode = USER_NAMESPACE_NO;
1209 arg_uid_shift = UID_INVALID;
1210 arg_uid_range = UINT32_C(0x10000);
ae209204 1211 } else if (boolean == true) {
0de7acce
LP
1212 /* yes: User namespacing on, UID range is read from root dir */
1213 arg_userns_mode = USER_NAMESPACE_FIXED;
1214 arg_uid_shift = UID_INVALID;
1215 arg_uid_range = UINT32_C(0x10000);
1216 } else if (streq(optarg, "pick")) {
1217 /* pick: User namespacing on, UID range is picked randomly */
1218 arg_userns_mode = USER_NAMESPACE_PICK;
1219 arg_uid_shift = UID_INVALID;
1220 arg_uid_range = UINT32_C(0x10000);
1221 } else {
6c2058b3 1222 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1223 const char *range, *shift;
1224
0de7acce
LP
1225 /* anything else: User namespacing on, UID range is explicitly configured */
1226
6dac160c
LP
1227 range = strchr(optarg, ':');
1228 if (range) {
6c2058b3
ZJS
1229 buffer = strndup(optarg, range - optarg);
1230 if (!buffer)
1231 return log_oom();
1232 shift = buffer;
6dac160c
LP
1233
1234 range++;
bfd292ec
ZJS
1235 r = safe_atou32(range, &arg_uid_range);
1236 if (r < 0)
be715731 1237 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1238 } else
1239 shift = optarg;
1240
be715731
ZJS
1241 r = parse_uid(shift, &arg_uid_shift);
1242 if (r < 0)
1243 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1244
1245 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
1246 }
1247
baaa35ad
ZJS
1248 if (arg_uid_range <= 0)
1249 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1250 "UID range cannot be 0.");
be715731 1251
0de7acce 1252 arg_settings_mask |= SETTING_USERNS;
6dac160c 1253 break;
ae209204 1254 }
6dac160c 1255
0de7acce 1256 case 'U':
ccabee0d
LP
1257 if (userns_supported()) {
1258 arg_userns_mode = USER_NAMESPACE_PICK;
1259 arg_uid_shift = UID_INVALID;
1260 arg_uid_range = UINT32_C(0x10000);
1261
1262 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1263 }
1264
7336138e
LP
1265 break;
1266
0de7acce 1267 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1268 arg_userns_chown = true;
0de7acce
LP
1269
1270 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1271 break;
1272
c6c8f6e2 1273 case ARG_KILL_SIGNAL:
5c828e66
LP
1274 if (streq(optarg, "help")) {
1275 DUMP_STRING_TABLE(signal, int, _NSIG);
1276 return 0;
1277 }
1278
29a3db75 1279 arg_kill_signal = signal_from_string(optarg);
baaa35ad
ZJS
1280 if (arg_kill_signal < 0)
1281 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1282 "Cannot parse signal: %s", optarg);
c6c8f6e2 1283
f757855e
LP
1284 arg_settings_mask |= SETTING_KILL_SIGNAL;
1285 break;
1286
1287 case ARG_SETTINGS:
1288
1289 /* no → do not read files
1290 * yes → read files, do not override cmdline, trust only subset
1291 * override → read files, override cmdline, trust only subset
1292 * trusted → read files, do not override cmdline, trust all
1293 */
1294
1295 r = parse_boolean(optarg);
1296 if (r < 0) {
1297 if (streq(optarg, "trusted")) {
1298 mask_all_settings = false;
1299 mask_no_settings = false;
1300 arg_settings_trusted = true;
1301
1302 } else if (streq(optarg, "override")) {
1303 mask_all_settings = false;
1304 mask_no_settings = true;
1305 arg_settings_trusted = -1;
1306 } else
1307 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1308 } else if (r > 0) {
1309 /* yes */
1310 mask_all_settings = false;
1311 mask_no_settings = false;
1312 arg_settings_trusted = -1;
1313 } else {
1314 /* no */
1315 mask_all_settings = true;
1316 mask_no_settings = false;
1317 arg_settings_trusted = false;
1318 }
1319
c6c8f6e2
LP
1320 break;
1321
5f932eb9 1322 case ARG_CHDIR:
baaa35ad
ZJS
1323 if (!path_is_absolute(optarg))
1324 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1325 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1326
1327 r = free_and_strdup(&arg_chdir, optarg);
1328 if (r < 0)
1329 return log_oom();
1330
1331 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1332 break;
1333
b53ede69
PW
1334 case ARG_PIVOT_ROOT:
1335 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1336 if (r < 0)
1337 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1338
1339 arg_settings_mask |= SETTING_PIVOT_ROOT;
1340 break;
1341
9c1e04d0
AP
1342 case ARG_NOTIFY_READY:
1343 r = parse_boolean(optarg);
baaa35ad
ZJS
1344 if (r < 0)
1345 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1346 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1347 arg_notify_ready = r;
1348 arg_settings_mask |= SETTING_NOTIFY_READY;
1349 break;
1350
4623e8e6 1351 case ARG_ROOT_HASH: {
89e62e0b 1352 _cleanup_free_ void *k = NULL;
4623e8e6
LP
1353 size_t l;
1354
1355 r = unhexmem(optarg, strlen(optarg), &k, &l);
1356 if (r < 0)
1357 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
89e62e0b 1358 if (l < sizeof(sd_id128_t))
c6147113 1359 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
4623e8e6 1360
89e62e0b
LP
1361 free_and_replace(arg_verity_settings.root_hash, k);
1362 arg_verity_settings.root_hash_size = l;
4623e8e6
LP
1363 break;
1364 }
1365
c2923fdc
LB
1366 case ARG_ROOT_HASH_SIG: {
1367 char *value;
89e62e0b
LP
1368 size_t l;
1369 void *p;
c2923fdc
LB
1370
1371 if ((value = startswith(optarg, "base64:"))) {
c2923fdc
LB
1372 r = unbase64mem(value, strlen(value), &p, &l);
1373 if (r < 0)
1374 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1375
c2923fdc 1376 } else {
89e62e0b 1377 r = read_full_file(optarg, (char**) &p, &l);
c2923fdc 1378 if (r < 0)
89e62e0b 1379 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
c2923fdc
LB
1380 }
1381
89e62e0b
LP
1382 free_and_replace(arg_verity_settings.root_hash_sig, p);
1383 arg_verity_settings.root_hash_sig_size = l;
c2923fdc
LB
1384 break;
1385 }
1386
89e62e0b
LP
1387 case ARG_VERITY_DATA:
1388 r = parse_path_argument_and_warn(optarg, false, &arg_verity_settings.data_path);
1389 if (r < 0)
1390 return r;
1391 break;
1392
960e4569
LP
1393 case ARG_SYSTEM_CALL_FILTER: {
1394 bool negative;
1395 const char *items;
1396
1397 negative = optarg[0] == '~';
1398 items = negative ? optarg + 1 : optarg;
1399
1400 for (;;) {
1401 _cleanup_free_ char *word = NULL;
1402
1403 r = extract_first_word(&items, &word, NULL, 0);
1404 if (r == 0)
1405 break;
1406 if (r == -ENOMEM)
1407 return log_oom();
1408 if (r < 0)
1409 return log_error_errno(r, "Failed to parse system call filter: %m");
1410
1411 if (negative)
6b000af4 1412 r = strv_extend(&arg_syscall_deny_list, word);
960e4569 1413 else
6b000af4 1414 r = strv_extend(&arg_syscall_allow_list, word);
960e4569
LP
1415 if (r < 0)
1416 return log_oom();
1417 }
1418
1419 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1420 break;
1421 }
1422
bf428efb
LP
1423 case ARG_RLIMIT: {
1424 const char *eq;
622ecfa8 1425 _cleanup_free_ char *name = NULL;
bf428efb
LP
1426 int rl;
1427
5c828e66
LP
1428 if (streq(optarg, "help")) {
1429 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1430 return 0;
1431 }
1432
bf428efb 1433 eq = strchr(optarg, '=');
baaa35ad
ZJS
1434 if (!eq)
1435 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1436 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1437
1438 name = strndup(optarg, eq - optarg);
1439 if (!name)
1440 return log_oom();
1441
1442 rl = rlimit_from_string_harder(name);
baaa35ad
ZJS
1443 if (rl < 0)
1444 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1445 "Unknown resource limit: %s", name);
bf428efb
LP
1446
1447 if (!arg_rlimit[rl]) {
1448 arg_rlimit[rl] = new0(struct rlimit, 1);
1449 if (!arg_rlimit[rl])
1450 return log_oom();
1451 }
1452
1453 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1454 if (r < 0)
1455 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1456
1457 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1458 break;
1459 }
1460
81f345df
LP
1461 case ARG_OOM_SCORE_ADJUST:
1462 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1463 if (r < 0)
1464 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1465
1466 arg_oom_score_adjust_set = true;
1467 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1468 break;
1469
d107bb7d 1470 case ARG_CPU_AFFINITY: {
0985c7c4 1471 CPUSet cpuset;
d107bb7d
LP
1472
1473 r = parse_cpu_set(optarg, &cpuset);
1474 if (r < 0)
0985c7c4 1475 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1476
0985c7c4
ZJS
1477 cpu_set_reset(&arg_cpu_set);
1478 arg_cpu_set = cpuset;
d107bb7d
LP
1479 arg_settings_mask |= SETTING_CPU_AFFINITY;
1480 break;
1481 }
1482
09d423e9
LP
1483 case ARG_RESOLV_CONF:
1484 if (streq(optarg, "help")) {
1485 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1486 return 0;
1487 }
1488
1489 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad
ZJS
1490 if (arg_resolv_conf < 0)
1491 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1492 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1493
1494 arg_settings_mask |= SETTING_RESOLV_CONF;
1495 break;
1496
1688841f
LP
1497 case ARG_TIMEZONE:
1498 if (streq(optarg, "help")) {
1499 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1500 return 0;
1501 }
1502
1503 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad
ZJS
1504 if (arg_timezone < 0)
1505 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1506 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1507
1508 arg_settings_mask |= SETTING_TIMEZONE;
1509 break;
1510
de40a303 1511 case ARG_CONSOLE:
dce66ffe
ZJS
1512 r = handle_arg_console(optarg);
1513 if (r <= 0)
1514 return r;
de40a303
LP
1515 break;
1516
1517 case 'P':
1518 case ARG_PIPE:
dce66ffe
ZJS
1519 r = handle_arg_console("pipe");
1520 if (r <= 0)
1521 return r;
de40a303
LP
1522 break;
1523
bb068de0
ZJS
1524 case ARG_NO_PAGER:
1525 arg_pager_flags |= PAGER_DISABLE;
1526 break;
1527
3652872a
LP
1528 case ARG_SET_CREDENTIAL: {
1529 _cleanup_free_ char *word = NULL, *data = NULL;
1530 const char *p = optarg;
1531 Credential *a;
1532 size_t i;
1533 int l;
1534
1535 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1536 if (r == -ENOMEM)
1537 return log_oom();
1538 if (r < 0)
1539 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1540 if (r == 0 || !p)
1541 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1542
1543 if (!credential_name_valid(word))
1544 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1545
1546 for (i = 0; i < arg_n_credentials; i++)
1547 if (streq(arg_credentials[i].id, word))
1548 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1549
1550 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1551 if (l < 0)
1552 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1553
1554 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1555 if (!a)
1556 return log_oom();
1557
1558 a[arg_n_credentials++] = (Credential) {
1559 .id = TAKE_PTR(word),
1560 .data = TAKE_PTR(data),
1561 .size = l,
1562 };
1563
1564 arg_credentials = a;
1565
1566 arg_settings_mask |= SETTING_CREDENTIALS;
1567 break;
1568 }
1569
1570 case ARG_LOAD_CREDENTIAL: {
1571 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1572 _cleanup_(erase_and_freep) char *data = NULL;
1573 _cleanup_free_ char *word = NULL, *j = NULL;
1574 const char *p = optarg;
1575 Credential *a;
1576 size_t size, i;
1577
1578 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1579 if (r == -ENOMEM)
1580 return log_oom();
1581 if (r < 0)
1582 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1583 if (r == 0 || !p)
1584 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1585
1586 if (!credential_name_valid(word))
1587 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1588
1589 for (i = 0; i < arg_n_credentials; i++)
1590 if (streq(arg_credentials[i].id, word))
1591 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1592
1593 if (path_is_absolute(p))
1594 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1595 else {
1596 const char *e;
1597
1598 e = getenv("CREDENTIALS_DIRECTORY");
1599 if (!e)
1600 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential not available (no credentials passed at all): %s", word);
1601
1602 j = path_join(e, p);
1603 if (!j)
1604 return log_oom();
1605 }
1606
986311c2
LP
1607 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1608 flags,
1609 NULL,
1610 &data, &size);
3652872a
LP
1611 if (r < 0)
1612 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1613
1614 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1615 if (!a)
1616 return log_oom();
1617
1618 a[arg_n_credentials++] = (Credential) {
1619 .id = TAKE_PTR(word),
1620 .data = TAKE_PTR(data),
1621 .size = size,
1622 };
1623
1624 arg_credentials = a;
1625
1626 arg_settings_mask |= SETTING_CREDENTIALS;
1627 break;
1628 }
1629
88213476
LP
1630 case '?':
1631 return -EINVAL;
1632
1633 default:
eb9da376 1634 assert_not_reached("Unhandled option");
88213476 1635 }
88213476 1636
60f1ec13
LP
1637 if (argc > optind) {
1638 strv_free(arg_parameters);
1639 arg_parameters = strv_copy(argv + optind);
1640 if (!arg_parameters)
1641 return log_oom();
d7bea6b6 1642
60f1ec13
LP
1643 arg_settings_mask |= SETTING_START_MODE;
1644 }
1645
1646 if (arg_ephemeral && arg_template && !arg_directory)
1647 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1648 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1649 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1650 * --directory=". */
1651 arg_directory = TAKE_PTR(arg_template);
1652
bd4b15f2 1653 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
60f1ec13 1654
de40a303 1655 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1656 r = parse_environment();
1657 if (r < 0)
1658 return r;
de40a303 1659
60f1ec13
LP
1660 /* Load all settings from .nspawn files */
1661 if (mask_no_settings)
1662 arg_settings_mask = 0;
1663
1664 /* Don't load any settings from .nspawn files */
1665 if (mask_all_settings)
1666 arg_settings_mask = _SETTINGS_MASK_ALL;
1667
1668 return 1;
1669}
1670
1671static int verify_arguments(void) {
1672 int r;
a6b5216c 1673
75b0d8b8
ZJS
1674 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1675 /* If we are running the stub init in the container, we don't need to look at what the init
1676 * in the container supports, because we are not using it. Let's immediately pick the right
1677 * setting based on the host system configuration.
1678 *
1679 * We only do this, if the user didn't use an environment variable to override the detection.
1680 */
1681
1682 r = cg_all_unified();
1683 if (r < 0)
1684 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1685 if (r > 0)
1686 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1687 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1688 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1689 else
1690 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1691 }
1692
4f086aab
SU
1693 if (arg_userns_mode != USER_NAMESPACE_NO)
1694 arg_mount_settings |= MOUNT_USE_USERNS;
1695
1696 if (arg_private_network)
1697 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1698
48a8d337
LB
1699 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1700 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1701 arg_register = false;
baaa35ad 1702 if (arg_start_mode != START_PID1)
60f1ec13 1703 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1704 }
eb91eb18 1705
0de7acce 1706 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1707 arg_userns_chown = true;
1708
60f1ec13
LP
1709 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1710 arg_kill_signal = SIGRTMIN+3;
1711
e5a4bb0d
LP
1712 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1713 arg_read_only = true;
1714
2436ea76
DDM
1715 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1716 arg_read_only = true;
1717
baaa35ad 1718 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1719 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1720 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1721 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1722
baaa35ad 1723 if (arg_directory && arg_image)
60f1ec13 1724 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1725
baaa35ad 1726 if (arg_template && arg_image)
60f1ec13 1727 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1728
baaa35ad 1729 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1730 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1731
baaa35ad 1732 if (arg_ephemeral && arg_template)
60f1ec13 1733 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1734
baaa35ad 1735 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1736 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1737
baaa35ad 1738 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1739 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1740
baaa35ad 1741 if (arg_userns_chown && arg_read_only)
de40a303
LP
1742 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1743 "--read-only and --private-users-chown may not be combined.");
f757855e 1744
e5a4bb0d
LP
1745 /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1746 * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
5238e957 1747 * copy-up (in case of overlay) making the entire exercise pointless. */
e5a4bb0d
LP
1748 if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1749 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1750
679ecd36
SZ
1751 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1752 * we need to error out, to avoid conflicts between different network options. */
60f1ec13
LP
1753 if (arg_network_namespace_path &&
1754 (arg_network_interfaces || arg_network_macvlan ||
1755 arg_network_ipvlan || arg_network_veth_extra ||
1756 arg_network_bridge || arg_network_zone ||
679ecd36 1757 arg_network_veth))
de40a303 1758 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1759
60f1ec13 1760 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1761 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1762 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1763
baaa35ad 1764 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1765 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1766
baaa35ad 1767 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1768 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1769
baaa35ad 1770 if (arg_expose_ports && !arg_private_network)
60f1ec13 1771 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1772
349cc4a5 1773#if ! HAVE_LIBIPTC
baaa35ad 1774 if (arg_expose_ports)
60f1ec13 1775 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1c1ea217
EV
1776#endif
1777
88fc9c9b
TH
1778 if (arg_caps_ambient) {
1779 if (arg_caps_ambient == (uint64_t)-1)
1780 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1781
1782 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1783 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1784
1785 if (arg_start_mode == START_BOOT)
1786 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1787 }
1788
60f1ec13
LP
1789 r = custom_mount_check_all();
1790 if (r < 0)
1791 return r;
c6c8f6e2 1792
f757855e 1793 return 0;
88213476
LP
1794}
1795
03cfe0d5
LP
1796static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1797 assert(p);
1798
0de7acce 1799 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1800 return 0;
1801
1802 if (uid == UID_INVALID && gid == GID_INVALID)
1803 return 0;
1804
1805 if (uid != UID_INVALID) {
1806 uid += arg_uid_shift;
1807
1808 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1809 return -EOVERFLOW;
1810 }
1811
1812 if (gid != GID_INVALID) {
1813 gid += (gid_t) arg_uid_shift;
1814
1815 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1816 return -EOVERFLOW;
1817 }
1818
1819 if (lchown(p, uid, gid) < 0)
1820 return -errno;
b12afc8c
LP
1821
1822 return 0;
1823}
1824
03cfe0d5
LP
/* Creates directory 'path' below 'root' with the given mode and hands it to the (shifted) uid/gid via
 * userns_lchown(). An already existing directory is left untouched and treated as success. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = mkdir_errno_wrapper(full, mode);
        if (r < 0)
                return r == -EEXIST ? 0 : r;

        /* Only freshly created directories get their ownership adjusted */
        return userns_lchown(full, uid, gid);
}
1838
/* Matches 'path' against the two zoneinfo prefixes below and returns whatever PATH_STARTSWITH_SET()
 * yields for it. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path, "../usr/share/zoneinfo/", "/usr/share/zoneinfo/");

        return zone;
}
1845
83205269
LP
1846static bool etc_writable(void) {
1847 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1848}
1849
e58a1277 1850static int setup_timezone(const char *dest) {
1688841f
LP
1851 _cleanup_free_ char *p = NULL, *etc = NULL;
1852 const char *where, *check;
1853 TimezoneMode m;
d4036145 1854 int r;
f8440af5 1855
e58a1277
LP
1856 assert(dest);
1857
1688841f 1858 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1859 r = readlink_malloc("/etc/localtime", &p);
1860 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1861 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1862 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1863 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1864 else if (r < 0) {
1865 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1866 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1867 * file.
1868 *
1869 * Example:
1870 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1871 */
1872 return 0;
1873 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1874 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1875 else
1876 m = arg_timezone;
1877 } else
1878 m = arg_timezone;
1879
1880 if (m == TIMEZONE_OFF)
1881 return 0;
1882
a5648b80 1883 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1884 if (r < 0) {
1688841f 1885 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1886 return 0;
1887 }
1888
1688841f
LP
1889 where = strjoina(etc, "/localtime");
1890
1891 switch (m) {
1892
1893 case TIMEZONE_DELETE:
1894 if (unlink(where) < 0)
1895 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1896
d4036145 1897 return 0;
d4036145 1898
1688841f
LP
1899 case TIMEZONE_SYMLINK: {
1900 _cleanup_free_ char *q = NULL;
1901 const char *z, *what;
4d1c38b8 1902
1688841f
LP
1903 z = timezone_from_path(p);
1904 if (!z) {
1905 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1906 return 0;
1688841f 1907 }
d4036145 1908
1688841f
LP
1909 r = readlink_malloc(where, &q);
1910 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1911 return 0; /* Already pointing to the right place? Then do nothing .. */
1912
1913 check = strjoina(dest, "/usr/share/zoneinfo/", z);
a5648b80 1914 r = chase_symlinks(check, dest, 0, NULL, NULL);
1688841f
LP
1915 if (r < 0)
1916 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1917 else {
1918 if (unlink(where) < 0 && errno != ENOENT) {
1919 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1920 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1921 return 0;
1922 }
1923
1924 what = strjoina("../usr/share/zoneinfo/", z);
1925 if (symlink(what, where) < 0) {
1926 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1927 errno, "Failed to correct timezone of container, ignoring: %m");
1928 return 0;
1929 }
1930
1931 break;
1932 }
1933
1934 _fallthrough_;
d4036145 1935 }
68fb0892 1936
1688841f
LP
1937 case TIMEZONE_BIND: {
1938 _cleanup_free_ char *resolved = NULL;
1939 int found;
1940
a5648b80 1941 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
1942 if (found < 0) {
1943 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1944 return 0;
1945 }
1946
1947 if (found == 0) /* missing? */
1948 (void) touch(resolved);
1949
511a8cfe 1950 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1688841f 1951 if (r >= 0)
511a8cfe 1952 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1688841f
LP
1953
1954 _fallthrough_;
79d80fc1 1955 }
4d9f07b4 1956
1688841f
LP
1957 case TIMEZONE_COPY:
1958 /* If mounting failed, try to copy */
8a016c74 1959 r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
1960 if (r < 0) {
1961 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1962 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1963 return 0;
1964 }
1965
1966 break;
1967
1968 default:
1969 assert_not_reached("unexpected mode");
d4036145 1970 }
e58a1277 1971
1688841f 1972 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
1973 r = userns_lchown(where, 0, 0);
1974 if (r < 0)
1688841f 1975 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 1976
e58a1277 1977 return 0;
88213476
LP
1978}
1979
09d423e9
LP
/* Checks whether 'path' exists. Returns 1 if it does, 0 if it is missing (ENOENT), and the value of
 * log_debug_errno() for any other access() failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1992
7357272e 1993static int resolved_listening(void) {
b8ea7a6e 1994 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 1995 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1996 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1997 int r;
1998
7357272e 1999 /* Check if resolved is listening */
b053cd5f
LP
2000
2001 r = sd_bus_open_system(&bus);
2002 if (r < 0)
b8ea7a6e 2003 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 2004
7357272e 2005 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
2006 if (r < 0)
2007 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2008 if (r == 0)
2009 return 0;
7357272e
DM
2010
2011 r = sd_bus_get_property_string(bus,
2012 "org.freedesktop.resolve1",
2013 "/org/freedesktop/resolve1",
2014 "org.freedesktop.resolve1.Manager",
2015 "DNSStubListener",
b8ea7a6e 2016 &error,
7357272e
DM
2017 &dns_stub_listener_mode);
2018 if (r < 0)
b8ea7a6e 2019 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
2020
2021 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
2022}
2023
2547bb41 2024static int setup_resolv_conf(const char *dest) {
09d423e9
LP
2025 _cleanup_free_ char *etc = NULL;
2026 const char *where, *what;
2027 ResolvConfMode m;
2028 int r;
2547bb41
LP
2029
2030 assert(dest);
2031
09d423e9
LP
2032 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2033 if (arg_private_network)
2034 m = RESOLV_CONF_OFF;
86775e35
LP
2035 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2036 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
09d423e9 2037 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 2038 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 2039 else
83205269 2040 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
86775e35 2041
09d423e9
LP
2042 } else
2043 m = arg_resolv_conf;
2044
2045 if (m == RESOLV_CONF_OFF)
2547bb41
LP
2046 return 0;
2047
a5648b80 2048 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
2049 if (r < 0) {
2050 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2051 return 0;
2052 }
2053
2054 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
2055
2056 if (m == RESOLV_CONF_DELETE) {
2057 if (unlink(where) < 0)
2058 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2059
87447ae4
LP
2060 return 0;
2061 }
79d80fc1 2062
86775e35
LP
2063 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2064 what = PRIVATE_STATIC_RESOLV_CONF;
2065 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2066 what = PRIVATE_UPLINK_RESOLV_CONF;
2067 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2068 what = PRIVATE_STUB_RESOLV_CONF;
09d423e9
LP
2069 else
2070 what = "/etc/resolv.conf";
87447ae4 2071
86775e35 2072 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
09d423e9
LP
2073 _cleanup_free_ char *resolved = NULL;
2074 int found;
2075
a5648b80 2076 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
09d423e9
LP
2077 if (found < 0) {
2078 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2079 return 0;
2080 }
3539724c 2081
87447ae4
LP
2082 if (found == 0) /* missing? */
2083 (void) touch(resolved);
5367354d 2084
511a8cfe 2085 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 2086 if (r >= 0)
511a8cfe 2087 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
86775e35
LP
2088
2089 /* If that didn't work, let's copy the file */
3539724c
LP
2090 }
2091
86775e35
LP
2092 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2093 r = copy_file_atomic(what, where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
2094 else
2095 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
79d80fc1 2096 if (r < 0) {
3539724c
LP
2097 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2098 * resolved or something similar runs inside and the symlink points there.
68a313c5 2099 *
3539724c 2100 * If the disk image is read-only, there's also no point in complaining.
68a313c5 2101 */
86775e35
LP
2102 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2103 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 2104 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
2105 return 0;
2106 }
2547bb41 2107
03cfe0d5
LP
2108 r = userns_lchown(where, 0, 0);
2109 if (r < 0)
3539724c 2110 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 2111
2547bb41
LP
2112 return 0;
2113}
2114
1e4f1671 2115static int setup_boot_id(void) {
cdde6ba6
LP
2116 _cleanup_(unlink_and_freep) char *from = NULL;
2117 _cleanup_free_ char *path = NULL;
3bbaff3e 2118 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 2119 const char *to;
04bc4a3f
LP
2120 int r;
2121
1eacc470 2122 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 2123
1eacc470 2124 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
2125 if (r < 0)
2126 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
2127
2128 r = sd_id128_randomize(&rnd);
f647962d
MS
2129 if (r < 0)
2130 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 2131
cdde6ba6 2132 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
2133 if (r < 0)
2134 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 2135
cdde6ba6
LP
2136 from = TAKE_PTR(path);
2137 to = "/proc/sys/kernel/random/boot_id";
2138
511a8cfe 2139 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
2140 if (r < 0)
2141 return r;
04bc4a3f 2142
511a8cfe 2143 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
2144}
2145
e58a1277 2146static int copy_devnodes(const char *dest) {
88213476
LP
2147 static const char devnodes[] =
2148 "null\0"
2149 "zero\0"
2150 "full\0"
2151 "random\0"
2152 "urandom\0"
85614d66
TG
2153 "tty\0"
2154 "net/tun\0";
88213476 2155
de40a303 2156 _cleanup_umask_ mode_t u;
88213476 2157 const char *d;
e58a1277 2158 int r = 0;
a258bf26
LP
2159
2160 assert(dest);
124640f1
LP
2161
2162 u = umask(0000);
88213476 2163
03cfe0d5
LP
2164 /* Create /dev/net, so that we can create /dev/net/tun in it */
2165 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2166 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2167
88213476 2168 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 2169 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 2170 struct stat st;
88213476 2171
c6134d3e 2172 from = path_join("/dev/", d);
8967f291
LP
2173 if (!from)
2174 return log_oom();
2175
c6134d3e 2176 to = path_join(dest, from);
8967f291
LP
2177 if (!to)
2178 return log_oom();
88213476
LP
2179
2180 if (stat(from, &st) < 0) {
2181
4a62c710
MS
2182 if (errno != ENOENT)
2183 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 2184
baaa35ad
ZJS
2185 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2186 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2187 "%s is not a char or block device, cannot copy.", from);
2188 else {
8dfce114
LP
2189 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2190
81f5049b 2191 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 2192 /* Explicitly warn the user when /dev is already populated. */
41eb4362 2193 if (errno == EEXIST)
8dbf71ec 2194 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
2195 if (errno != EPERM)
2196 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2197
8dfce114 2198 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
2199 r = touch(to);
2200 if (r < 0)
2201 return log_error_errno(r, "touch (%s) failed: %m", to);
511a8cfe 2202 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
60e76d48
ZJS
2203 if (r < 0)
2204 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 2205 }
6278cf60 2206
03cfe0d5
LP
2207 r = userns_lchown(to, 0, 0);
2208 if (r < 0)
2209 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2210
657ee2d8 2211 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2212 if (!dn)
2213 return log_oom();
2214
2215 r = userns_mkdir(dest, dn, 0755, 0, 0);
2216 if (r < 0)
2217 return log_error_errno(r, "Failed to create '%s': %m", dn);
2218
2219 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2220 return log_oom();
2221
c6134d3e 2222 prefixed = path_join(dest, sl);
8dfce114
LP
2223 if (!prefixed)
2224 return log_oom();
2225
2d9b74ba 2226 t = path_join("..", d);
8dfce114
LP
2227 if (!t)
2228 return log_oom();
2229
2230 if (symlink(t, prefixed) < 0)
2231 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2232 }
88213476
LP
2233 }
2234
e58a1277
LP
2235 return r;
2236}
88213476 2237
de40a303
LP
2238static int make_extra_nodes(const char *dest) {
2239 _cleanup_umask_ mode_t u;
2240 size_t i;
2241 int r;
2242
2243 u = umask(0000);
2244
2245 for (i = 0; i < arg_n_extra_nodes; i++) {
2246 _cleanup_free_ char *path = NULL;
2247 DeviceNode *n = arg_extra_nodes + i;
2248
c6134d3e 2249 path = path_join(dest, n->path);
de40a303
LP
2250 if (!path)
2251 return log_oom();
2252
2253 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2254 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2255
2256 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2257 if (r < 0)
2258 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2259 }
2260
2261 return 0;
2262}
2263
03cfe0d5
LP
2264static int setup_pts(const char *dest) {
2265 _cleanup_free_ char *options = NULL;
2266 const char *p;
709f6e46 2267 int r;
03cfe0d5 2268
349cc4a5 2269#if HAVE_SELINUX
03cfe0d5
LP
2270 if (arg_selinux_apifs_context)
2271 (void) asprintf(&options,
3dce8915 2272 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2273 arg_uid_shift + TTY_GID,
2274 arg_selinux_apifs_context);
2275 else
2276#endif
2277 (void) asprintf(&options,
3dce8915 2278 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2279 arg_uid_shift + TTY_GID);
f2d88580 2280
03cfe0d5 2281 if (!options)
f2d88580
LP
2282 return log_oom();
2283
03cfe0d5 2284 /* Mount /dev/pts itself */
cc9fce65 2285 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
2286 r = mkdir_errno_wrapper(p, 0755);
2287 if (r < 0)
2288 return log_error_errno(r, "Failed to create /dev/pts: %m");
2289
511a8cfe 2290 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
60e76d48
ZJS
2291 if (r < 0)
2292 return r;
709f6e46
MS
2293 r = userns_lchown(p, 0, 0);
2294 if (r < 0)
2295 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2296
2297 /* Create /dev/ptmx symlink */
2298 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2299 if (symlink("pts/ptmx", p) < 0)
2300 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2301 r = userns_lchown(p, 0, 0);
2302 if (r < 0)
2303 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2304
03cfe0d5
LP
2305 /* And fix /dev/pts/ptmx ownership */
2306 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2307 r = userns_lchown(p, 0, 0);
2308 if (r < 0)
2309 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2310
f2d88580
LP
2311 return 0;
2312}
2313
3acc84eb 2314static int setup_stdio_as_dev_console(void) {
2fef50cd 2315 _cleanup_close_ int terminal = -1;
e58a1277 2316 int r;
e58a1277 2317
335d2ead
LP
2318 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2319 * explicitly, if we are configured to. */
2320 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
3acc84eb
FB
2321 if (terminal < 0)
2322 return log_error_errno(terminal, "Failed to open console: %m");
e58a1277 2323
3acc84eb
FB
2324 /* Make sure we can continue logging to the original stderr, even if
2325 * stderr points elsewhere now */
2326 r = log_dup_console();
2327 if (r < 0)
2328 return log_error_errno(r, "Failed to duplicate stderr: %m");
de40a303 2329
3acc84eb
FB
2330 /* invalidates 'terminal' on success and failure */
2331 r = rearrange_stdio(terminal, terminal, terminal);
2fef50cd 2332 TAKE_FD(terminal);
f647962d 2333 if (r < 0)
3acc84eb
FB
2334 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2335
2336 return 0;
2337}
88213476 2338
3acc84eb
FB
2339static int setup_dev_console(const char *console) {
2340 _cleanup_free_ char *p = NULL;
2341 int r;
a258bf26 2342
3acc84eb
FB
2343 /* Create /dev/console symlink */
2344 r = path_make_relative("/dev", console, &p);
81f5049b 2345 if (r < 0)
3acc84eb
FB
2346 return log_error_errno(r, "Failed to create relative path: %m");
2347
2348 if (symlink(p, "/dev/console") < 0)
2349 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2350
3acc84eb 2351 return 0;
e58a1277
LP
2352}
2353
8e5430c4
LP
2354static int setup_keyring(void) {
2355 key_serial_t keyring;
2356
6b000af4
LP
2357 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2358 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2359 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2360 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2361 * into the container. */
8e5430c4
LP
2362
2363 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2364 if (keyring == -1) {
2365 if (errno == ENOSYS)
2366 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
065b4774 2367 else if (ERRNO_IS_PRIVILEGE(errno))
8e5430c4
LP
2368 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2369 else
2370 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2371 }
2372
2373 return 0;
2374}
2375
3652872a
LP
2376static int setup_credentials(const char *root) {
2377 const char *q;
2378 int r;
2379
2380 if (arg_n_credentials <= 0)
2381 return 0;
2382
2383 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2384 if (r < 0)
2385 return log_error_errno(r, "Failed to create /run/host: %m");
2386
2387 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2388 if (r < 0)
2389 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2390
2391 q = prefix_roota(root, "/run/host/credentials");
511a8cfe 2392 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
3652872a
LP
2393 if (r < 0)
2394 return r;
2395
2396 for (size_t i = 0; i < arg_n_credentials; i++) {
2397 _cleanup_free_ char *j = NULL;
2398 _cleanup_close_ int fd = -1;
2399
2400 j = path_join(q, arg_credentials[i].id);
2401 if (!j)
2402 return log_oom();
2403
2404 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2405 if (fd < 0)
2406 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2407
2408 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size, /* do_poll= */ false);
2409 if (r < 0)
2410 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2411
2412 if (fchmod(fd, 0400) < 0)
2413 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2414
2415 if (arg_userns_mode != USER_NAMESPACE_NO) {
2416 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2417 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2418 }
2419 }
2420
2421 if (chmod(q, 0500) < 0)
2422 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2423
2424 r = userns_lchown(q, 0, 0);
2425 if (r < 0)
2426 return r;
2427
2428 /* Make both mount and superblock read-only now */
511a8cfe 2429 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
3652872a
LP
2430 if (r < 0)
2431 return r;
2432
511a8cfe 2433 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
3652872a
LP
2434}
2435
1e4f1671 2436static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
2437 _cleanup_(unlink_and_freep) char *from = NULL;
2438 _cleanup_free_ char *fifo = NULL;
2439 _cleanup_close_ int fd = -1;
7fd1b19b 2440 _cleanup_umask_ mode_t u;
9ec5a93c 2441 int r;
e58a1277 2442
e58a1277 2443 assert(kmsg_socket >= 0);
a258bf26 2444
e58a1277 2445 u = umask(0000);
a258bf26 2446
1eacc470 2447 /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2448 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2449 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2450 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2451
1eacc470 2452 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2453 if (r < 0)
2454 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2455
9ec5a93c 2456 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2457 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2458
2459 from = TAKE_PTR(fifo);
9ec5a93c 2460
511a8cfe 2461 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2462 if (r < 0)
2463 return r;
e58a1277 2464
669fc4e5 2465 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2466 if (fd < 0)
2467 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2468
9ec5a93c 2469 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 2470 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
2471 if (r < 0)
2472 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2473
25ea79fe 2474 return 0;
88213476
LP
2475}
2476
1c4baffc 2477static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
2478 union in_addr_union *exposed = userdata;
2479
2480 assert(rtnl);
2481 assert(m);
2482 assert(exposed);
2483
7a8f6325 2484 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
2485 return 0;
2486}
2487
3a74cea5 2488static int setup_hostname(void) {
c818eef1 2489 int r;
3a74cea5 2490
0c582db0 2491 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2492 return 0;
2493
c818eef1
LP
2494 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2495 if (r < 0)
2496 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2497
7027ff61 2498 return 0;
3a74cea5
LP
2499}
2500
57fb9fb5 2501static int setup_journal(const char *directory) {
0f5e1382 2502 _cleanup_free_ char *d = NULL;
5905d7cf 2503 char id[SD_ID128_STRING_MAX];
b2238e38
LP
2504 const char *dirname, *p, *q;
2505 sd_id128_t this_id;
8054d749 2506 bool try;
57fb9fb5
LP
2507 int r;
2508
df9a75e4
LP
2509 /* Don't link journals in ephemeral mode */
2510 if (arg_ephemeral)
2511 return 0;
2512
8054d749
LP
2513 if (arg_link_journal == LINK_NO)
2514 return 0;
2515
2516 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2517
4d680aee 2518 r = sd_id128_get_machine(&this_id);
f647962d
MS
2519 if (r < 0)
2520 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2521
e01ff70a 2522 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2523 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 2524 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 2525 if (try)
4d680aee 2526 return 0;
df9a75e4 2527 return -EEXIST;
4d680aee
ZJS
2528 }
2529
369ca6da
ZJS
2530 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2531 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2532 if (r < 0) {
2533 bool ignore = r == -EROFS && try;
2534 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2535 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2536 return ignore ? 0 : r;
2537 }
2538 }
03cfe0d5 2539
e01ff70a
MS
2540 (void) sd_id128_to_string(arg_uuid, id);
2541
03cfe0d5
LP
2542 p = strjoina("/var/log/journal/", id);
2543 q = prefix_roota(directory, p);
27407a01 2544
e1873695 2545 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2546 if (try)
2547 return 0;
27407a01 2548
baaa35ad
ZJS
2549 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2550 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2551 }
2552
e1873695 2553 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2554 if (try)
2555 return 0;
57fb9fb5 2556
baaa35ad
ZJS
2557 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2558 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2559 }
2560
2561 r = readlink_and_make_absolute(p, &d);
2562 if (r >= 0) {
3742095b 2563 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2564 path_equal(d, q)) {
2565
03cfe0d5 2566 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2567 if (r < 0)
709f6e46 2568 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2569 return 0;
57fb9fb5
LP
2570 }
2571
4a62c710
MS
2572 if (unlink(p) < 0)
2573 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2574 } else if (r == -EINVAL) {
2575
2576 if (arg_link_journal == LINK_GUEST &&
2577 rmdir(p) < 0) {
2578
27407a01
ZJS
2579 if (errno == ENOTDIR) {
2580 log_error("%s already exists and is neither a symlink nor a directory", p);
2581 return r;
4314d33f
MS
2582 } else
2583 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2584 }
4314d33f
MS
2585 } else if (r != -ENOENT)
2586 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2587
2588 if (arg_link_journal == LINK_GUEST) {
2589
2590 if (symlink(q, p) < 0) {
8054d749 2591 if (try) {
56f64d95 2592 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2593 return 0;
4314d33f
MS
2594 } else
2595 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2596 }
2597
03cfe0d5 2598 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2599 if (r < 0)
709f6e46 2600 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2601 return 0;
57fb9fb5
LP
2602 }
2603
2604 if (arg_link_journal == LINK_HOST) {
ccddd104 2605 /* don't create parents here — if the host doesn't have
574edc90 2606 * permanent journal set up, don't force it here */
ba8e6c4d 2607
dae8b82e
ZJS
2608 r = mkdir_errno_wrapper(p, 0755);
2609 if (r < 0 && r != -EEXIST) {
8054d749 2610 if (try) {
dae8b82e 2611 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2612 return 0;
4314d33f 2613 } else
dae8b82e 2614 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2615 }
2616
27407a01
ZJS
2617 } else if (access(p, F_OK) < 0)
2618 return 0;
57fb9fb5 2619
cdb2b9d0
LP
2620 if (dir_is_empty(q) == 0)
2621 log_warning("%s is not empty, proceeding anyway.", q);
2622
03cfe0d5 2623 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2624 if (r < 0)
2625 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2626
511a8cfe 2627 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
60e76d48 2628 if (r < 0)
4a62c710 2629 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2630
27407a01 2631 return 0;
57fb9fb5
LP
2632}
2633
de40a303
LP
2634static int drop_capabilities(uid_t uid) {
2635 CapabilityQuintet q;
2636
2637 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2638 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2639 * arg_caps_retain. */
2640
2641 if (capability_quintet_is_set(&arg_full_capabilities)) {
2642 q = arg_full_capabilities;
2643
2644 if (q.bounding == (uint64_t) -1)
2645 q.bounding = uid == 0 ? arg_caps_retain : 0;
2646
2647 if (q.effective == (uint64_t) -1)
2648 q.effective = uid == 0 ? q.bounding : 0;
2649
2650 if (q.inheritable == (uint64_t) -1)
88fc9c9b 2651 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303
LP
2652
2653 if (q.permitted == (uint64_t) -1)
88fc9c9b 2654 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303
LP
2655
2656 if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
88fc9c9b 2657 q.ambient = arg_caps_ambient;
f66ad460
AZ
2658
2659 if (capability_quintet_mangle(&q))
2660 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2661
2662 } else {
de40a303
LP
2663 q = (CapabilityQuintet) {
2664 .bounding = arg_caps_retain,
2665 .effective = uid == 0 ? arg_caps_retain : 0,
88fc9c9b
TH
2666 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2667 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2668 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : (uint64_t) -1,
de40a303
LP
2669 };
2670
f66ad460
AZ
2671 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2672 * in order to maintain the same behavior as systemd < 242. */
2673 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2674 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2675 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2676
2677 }
2678
de40a303 2679 return capability_quintet_enforce(&q);
88213476
LP
2680}
2681
db999e0f
LP
2682static int reset_audit_loginuid(void) {
2683 _cleanup_free_ char *p = NULL;
2684 int r;
2685
0c582db0 2686 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2687 return 0;
2688
2689 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2690 if (r == -ENOENT)
db999e0f 2691 return 0;
f647962d
MS
2692 if (r < 0)
2693 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2694
2695 /* Already reset? */
2696 if (streq(p, "4294967295"))
2697 return 0;
2698
57512c89 2699 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2700 if (r < 0) {
10a87006
LP
2701 log_error_errno(r,
2702 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2703 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2704 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2705 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2706 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2707
db999e0f 2708 sleep(5);
77b6e194 2709 }
db999e0f
LP
2710
2711 return 0;
77b6e194
LP
2712}
2713
785890ac
LP
2714static int setup_propagate(const char *root) {
2715 const char *p, *q;
709f6e46 2716 int r;
785890ac
LP
2717
2718 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2719 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2720 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2721 (void) mkdir_p(p, 0600);
2722
5a27b395 2723 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
709f6e46 2724 if (r < 0)
5a27b395 2725 return log_error_errno(r, "Failed to create /run/host: %m");
03cfe0d5 2726
5a27b395 2727 r = userns_mkdir(root, "/run/host/incoming", 0600, 0, 0);
709f6e46 2728 if (r < 0)
5a27b395 2729 return log_error_errno(r, "Failed to create /run/host/incoming: %m");
03cfe0d5 2730
5a27b395 2731 q = prefix_roota(root, "/run/host/incoming");
511a8cfe 2732 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
60e76d48
ZJS
2733 if (r < 0)
2734 return r;
785890ac 2735
511a8cfe 2736 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
60e76d48
ZJS
2737 if (r < 0)
2738 return r;
785890ac 2739
5a27b395 2740 /* machined will MS_MOVE into that directory, and that's only supported for non-shared mounts. */
511a8cfe 2741 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2742}
2743
317feb4d 2744static int setup_machine_id(const char *directory) {
691675ba
LP
2745 const char *etc_machine_id;
2746 sd_id128_t id;
3bbaff3e 2747 int r;
e01ff70a 2748
317feb4d
LP
2749 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2750 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2751 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2752 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2753 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2754 * container behaves nicely). */
2755
e01ff70a
MS
2756 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2757
c5fbeedb 2758 r = id128_read(etc_machine_id, ID128_PLAIN_OR_UNINIT, &id);
317feb4d
LP
2759 if (r < 0) {
2760 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2761 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2762
317feb4d
LP
2763 if (sd_id128_is_null(arg_uuid)) {
2764 r = sd_id128_randomize(&arg_uuid);
2765 if (r < 0)
2766 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2767 }
2768 } else {
baaa35ad
ZJS
2769 if (sd_id128_is_null(id))
2770 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2771 "Machine ID in container image is zero, refusing.");
e01ff70a 2772
317feb4d
LP
2773 arg_uuid = id;
2774 }
691675ba 2775
e01ff70a
MS
2776 return 0;
2777}
2778
7336138e
LP
2779static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2780 int r;
2781
2782 assert(directory);
2783
0de7acce 2784 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2785 return 0;
2786
2787 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2788 if (r == -EOPNOTSUPP)
2789 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2790 if (r == -EBADE)
2791 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2792 if (r < 0)
2793 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2794 if (r == 0)
2795 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2796 else
2797 log_debug("Patched directory tree to match UID/GID range.");
2798
2799 return r;
2800}
2801
113cea80 2802/*
6d416b9c
LS
2803 * Return values:
2804 * < 0 : wait_for_terminate() failed to get the state of the
2805 * container, the container was terminated by a signal, or
2806 * failed for an unknown reason. No change is made to the
2807 * container argument.
2808 * > 0 : The program executed in the container terminated with an
2809 * error. The exit code of the program executed in the
919699ec
LP
2810 * container is returned. The container argument has been set
2811 * to CONTAINER_TERMINATED.
6d416b9c
LS
2812 * 0 : The container is being rebooted, has been shut down or exited
2813 * successfully. The container argument has been set to either
2814 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2815 *
6d416b9c
LS
2816 * That is, success is indicated by a return value of zero, and an
2817 * error is indicated by a non-zero value.
113cea80
DH
2818 */
2819static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2820 siginfo_t status;
919699ec 2821 int r;
113cea80
DH
2822
2823 r = wait_for_terminate(pid, &status);
f647962d
MS
2824 if (r < 0)
2825 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2826
2827 switch (status.si_code) {
fddbb89c 2828
113cea80 2829 case CLD_EXITED:
b5a2179b 2830 if (status.si_status == 0)
919699ec 2831 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2832 else
919699ec 2833 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2834
919699ec
LP
2835 *container = CONTAINER_TERMINATED;
2836 return status.si_status;
113cea80
DH
2837
2838 case CLD_KILLED:
2839 if (status.si_status == SIGINT) {
919699ec 2840 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2841 *container = CONTAINER_TERMINATED;
919699ec
LP
2842 return 0;
2843
113cea80 2844 } else if (status.si_status == SIGHUP) {
919699ec 2845 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2846 *container = CONTAINER_REBOOTED;
919699ec 2847 return 0;
113cea80 2848 }
919699ec 2849
4831981d 2850 _fallthrough_;
113cea80 2851 case CLD_DUMPED:
baaa35ad
ZJS
2852 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2853 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2854
2855 default:
baaa35ad
ZJS
2856 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2857 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2858 }
113cea80
DH
2859}
2860
023fb90b
LP
2861static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2862 pid_t pid;
2863
4a0b58c4 2864 pid = PTR_TO_PID(userdata);
023fb90b 2865 if (pid > 0) {
c6c8f6e2 2866 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2867 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2868 sd_event_source_set_userdata(s, NULL);
2869 return 0;
2870 }
2871 }
2872
2873 sd_event_exit(sd_event_source_get_event(s), 0);
2874 return 0;
2875}
2876
6916b164 2877static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2878 pid_t pid;
2879
2880 assert(s);
2881 assert(ssi);
2882
2883 pid = PTR_TO_PID(userdata);
2884
6916b164
AU
2885 for (;;) {
2886 siginfo_t si = {};
abdb9b08 2887
6916b164
AU
2888 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2889 return log_error_errno(errno, "Failed to waitid(): %m");
2890 if (si.si_pid == 0) /* No pending children. */
2891 break;
abdb9b08 2892 if (si.si_pid == pid) {
6916b164
AU
2893 /* The main process we care for has exited. Return from
2894 * signal handler but leave the zombie. */
2895 sd_event_exit(sd_event_source_get_event(s), 0);
2896 break;
2897 }
abdb9b08 2898
6916b164
AU
2899 /* Reap all other children. */
2900 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2901 }
2902
2903 return 0;
2904}
2905
abdb9b08
LP
2906static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2907 pid_t pid;
2908
2909 assert(m);
2910
2911 pid = PTR_TO_PID(userdata);
2912
2913 if (arg_kill_signal > 0) {
2914 log_info("Container termination requested. Attempting to halt container.");
2915 (void) kill(pid, arg_kill_signal);
2916 } else {
2917 log_info("Container termination requested. Exiting.");
2918 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2919 }
2920
2921 return 0;
2922}
2923
ec16945e 2924static int determine_names(void) {
1b9cebf6 2925 int r;
ec16945e 2926
c1521918
LP
2927 if (arg_template && !arg_directory && arg_machine) {
2928
2929 /* If --template= was specified then we should not
2930 * search for a machine, but instead create a new one
2931 * in /var/lib/machine. */
2932
657ee2d8 2933 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
2934 if (!arg_directory)
2935 return log_oom();
2936 }
2937
ec16945e 2938 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2939 if (arg_machine) {
2940 _cleanup_(image_unrefp) Image *i = NULL;
2941
5ef46e5f 2942 r = image_find(IMAGE_MACHINE, arg_machine, &i);
3a6ce860
LP
2943 if (r == -ENOENT)
2944 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2945 if (r < 0)
2946 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 2947
eb38edce 2948 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2949 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2950 else
0f03c2a4 2951 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2952 if (r < 0)
0f3be6ca 2953 return log_oom();
1b9cebf6 2954
aee327b8
LP
2955 if (!arg_ephemeral)
2956 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2957 } else {
2958 r = safe_getcwd(&arg_directory);
2959 if (r < 0)
2960 return log_error_errno(r, "Failed to determine current directory: %m");
2961 }
ec16945e 2962
c6147113
LP
2963 if (!arg_directory && !arg_image)
2964 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
2965 }
2966
2967 if (!arg_machine) {
b9ba4dab
LP
2968 if (arg_directory && path_equal(arg_directory, "/"))
2969 arg_machine = gethostname_malloc();
4827ab48
LP
2970 else {
2971 if (arg_image) {
2972 char *e;
2973
2974 arg_machine = strdup(basename(arg_image));
2975
2976 /* Truncate suffix if there is one */
2977 e = endswith(arg_machine, ".raw");
2978 if (e)
2979 *e = 0;
2980 } else
2981 arg_machine = strdup(basename(arg_directory));
2982 }
ec16945e
LP
2983 if (!arg_machine)
2984 return log_oom();
2985
ae691c1d 2986 hostname_cleanup(arg_machine);
52ef5dd7 2987 if (!hostname_is_valid(arg_machine, 0))
c6147113 2988 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab
LP
2989
2990 if (arg_ephemeral) {
2991 char *b;
2992
2993 /* Add a random suffix when this is an
2994 * ephemeral machine, so that we can run many
2995 * instances at once without manually having
2996 * to specify -M each time. */
2997
2998 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2999 return log_oom();
3000
3001 free(arg_machine);
3002 arg_machine = b;
3003 }
ec16945e
LP
3004 }
3005
3006 return 0;
3007}
3008
/* Resolves the path stored in *p via chase_symlinks() with the given flags and replaces *p by the
 * result. A NULL *p is left alone. Returns a negative errno-style value if resolution fails. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase_symlinks(*p, NULL, flags, &resolved, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                /* Drops the old string and takes over the resolved one. */
                return free_and_replace(*p, resolved);
        }

        return 0;
}
3024
03cfe0d5 3025static int determine_uid_shift(const char *directory) {
6dac160c
LP
3026 int r;
3027
0de7acce 3028 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 3029 arg_uid_shift = 0;
6dac160c 3030 return 0;
03cfe0d5 3031 }
6dac160c
LP
3032
3033 if (arg_uid_shift == UID_INVALID) {
3034 struct stat st;
3035
03cfe0d5 3036 r = stat(directory, &st);
6dac160c 3037 if (r < 0)
03cfe0d5 3038 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
3039
3040 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3041
baaa35ad
ZJS
3042 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3043 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3044 "UID and GID base of %s don't match.", directory);
6dac160c
LP
3045
3046 arg_uid_range = UINT32_C(0x10000);
3047 }
3048
baaa35ad
ZJS
3049 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
3050 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3051 "UID base too high for UID range.");
6dac160c 3052
6dac160c
LP
3053 return 0;
3054}
3055
de40a303
LP
3056static unsigned long effective_clone_ns_flags(void) {
3057 unsigned long flags = arg_clone_ns_flags;
3058
3059 if (arg_private_network)
3060 flags |= CLONE_NEWNET;
3061 if (arg_use_cgns)
3062 flags |= CLONE_NEWCGROUP;
3063 if (arg_userns_mode != USER_NAMESPACE_NO)
3064 flags |= CLONE_NEWUSER;
3065
3066 return flags;
3067}
3068
3069static int patch_sysctl(void) {
3070
3071 /* This table is inspired by runc's sysctl() function */
3072 static const struct {
3073 const char *key;
3074 bool prefix;
3075 unsigned long clone_flags;
3076 } safe_sysctl[] = {
3077 { "kernel.hostname", false, CLONE_NEWUTS },
3078 { "kernel.domainname", false, CLONE_NEWUTS },
3079 { "kernel.msgmax", false, CLONE_NEWIPC },
3080 { "kernel.msgmnb", false, CLONE_NEWIPC },
3081 { "kernel.msgmni", false, CLONE_NEWIPC },
3082 { "kernel.sem", false, CLONE_NEWIPC },
3083 { "kernel.shmall", false, CLONE_NEWIPC },
3084 { "kernel.shmmax", false, CLONE_NEWIPC },
3085 { "kernel.shmmni", false, CLONE_NEWIPC },
3086 { "fs.mqueue.", true, CLONE_NEWIPC },
3087 { "net.", true, CLONE_NEWNET },
3088 };
3089
3090 unsigned long flags;
3091 char **k, **v;
3092 int r;
3093
3094 flags = effective_clone_ns_flags();
3095
3096 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3097 bool good = false;
3098 size_t i;
3099
3100 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3101
3102 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3103 continue;
3104
3105 if (safe_sysctl[i].prefix)
3106 good = startswith(*k, safe_sysctl[i].key);
3107 else
3108 good = streq(*k, safe_sysctl[i].key);
3109
3110 if (good)
3111 break;
3112 }
3113
c6147113
LP
3114 if (!good)
3115 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
3116
3117 r = sysctl_write(*k, *v);
3118 if (r < 0)
3119 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3120 }
3121
3122 return 0;
3123}
3124
03cfe0d5
LP
3125static int inner_child(
3126 Barrier *barrier,
3127 const char *directory,
3128 bool secondary,
3129 int kmsg_socket,
3130 int rtnl_socket,
3acc84eb 3131 int master_pty_socket,
e1bb4b0d
LB
3132 FDSet *fds,
3133 char **os_release_pairs) {
69c79d3c 3134
03cfe0d5 3135 _cleanup_free_ char *home = NULL;
b5ea030d 3136 char as_uuid[ID128_UUID_STRING_MAX];
88614c8a 3137 size_t n_env = 1;
03cfe0d5 3138 const char *envp[] = {
0c300adf 3139 "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 3140 NULL, /* container */
03cfe0d5
LP
3141 NULL, /* TERM */
3142 NULL, /* HOME */
3143 NULL, /* USER */
3144 NULL, /* LOGNAME */
3145 NULL, /* container_uuid */
3146 NULL, /* LISTEN_FDS */
3147 NULL, /* LISTEN_PID */
9c1e04d0 3148 NULL, /* NOTIFY_SOCKET */
3652872a 3149 NULL, /* CREDENTIALS_DIRECTORY */
03cfe0d5
LP
3150 NULL
3151 };
1a68e1e5 3152 const char *exec_target;
2371271c 3153 _cleanup_strv_free_ char **env_use = NULL;
de40a303 3154 int r, which_failed;
88213476 3155
b37469d7
LP
3156 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3157 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3158 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3159 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3160 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3161 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3162 * namespace.
3163 *
3164 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3165 * unshare(). See below. */
3166
03cfe0d5
LP
3167 assert(barrier);
3168 assert(directory);
3169 assert(kmsg_socket >= 0);
88213476 3170
de40a303
LP
3171 log_debug("Inner child is initializing.");
3172
0de7acce 3173 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3174 /* Tell the parent, that it now can write the UID map. */
3175 (void) barrier_place(barrier); /* #1 */
7027ff61 3176
03cfe0d5 3177 /* Wait until the parent wrote the UID map */
baaa35ad 3178 if (!barrier_place_and_sync(barrier)) /* #2 */
2a2e78e9 3179 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
88213476 3180
2a2e78e9
LP
3181 /* Become the new root user inside our namespace */
3182 r = reset_uid_gid();
3183 if (r < 0)
3184 return log_error_errno(r, "Couldn't become new root: %m");
3185
3186 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3187 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3188 * propagation, but simply create new peer groups for all our mounts). */
511a8cfe 3189 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
2a2e78e9
LP
3190 if (r < 0)
3191 return r;
3192 }
6d66bd3b 3193
0de7acce 3194 r = mount_all(NULL,
4f086aab 3195 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 3196 arg_uid_shift,
0de7acce 3197 arg_selinux_apifs_context);
03cfe0d5
LP
3198 if (r < 0)
3199 return r;
3200
04413780
ZJS
3201 if (!arg_network_namespace_path && arg_private_network) {
3202 r = unshare(CLONE_NEWNET);
3203 if (r < 0)
3204 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
3205
3206 /* Tell the parent that it can setup network interfaces. */
3207 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
3208 }
3209
4f086aab 3210 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
3211 if (r < 0)
3212 return r;
3213
03cfe0d5
LP
3214 /* Wait until we are cgroup-ified, so that we
3215 * can mount the right cgroup path writable */
baaa35ad
ZJS
3216 if (!barrier_place_and_sync(barrier)) /* #4 */
3217 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3218 "Parent died too early");
88213476 3219
489fae52 3220 if (arg_use_cgns) {
0996ef00
CB
3221 r = unshare(CLONE_NEWCGROUP);
3222 if (r < 0)
04413780 3223 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
3224 r = mount_cgroups(
3225 "",
3226 arg_unified_cgroup_hierarchy,
3227 arg_userns_mode != USER_NAMESPACE_NO,
3228 arg_uid_shift,
3229 arg_uid_range,
5a8ff0e6 3230 arg_selinux_apifs_context,
ada54120 3231 true);
1433e0f2 3232 } else
0996ef00 3233 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
1433e0f2
LP
3234 if (r < 0)
3235 return r;
ec16945e 3236
1e4f1671 3237 r = setup_boot_id();
03cfe0d5
LP
3238 if (r < 0)
3239 return r;
ec16945e 3240
1e4f1671 3241 r = setup_kmsg(kmsg_socket);
03cfe0d5
LP
3242 if (r < 0)
3243 return r;
3244 kmsg_socket = safe_close(kmsg_socket);
ec16945e 3245
de40a303
LP
3246 r = mount_custom(
3247 "/",
3248 arg_custom_mounts,
3249 arg_n_custom_mounts,
de40a303
LP
3250 0,
3251 arg_selinux_apifs_context,
5f0a6347 3252 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
de40a303
LP
3253 if (r < 0)
3254 return r;
3255
03cfe0d5
LP
3256 if (setsid() < 0)
3257 return log_error_errno(errno, "setsid() failed: %m");
3258
3259 if (arg_private_network)
df883de9 3260 (void) loopback_setup();
03cfe0d5 3261
7a8f6325
LP
3262 if (arg_expose_ports) {
3263 r = expose_port_send_rtnl(rtnl_socket);
3264 if (r < 0)
3265 return r;
3266 rtnl_socket = safe_close(rtnl_socket);
3267 }
03cfe0d5 3268
3acc84eb 3269 if (arg_console_mode != CONSOLE_PIPE) {
cd132992 3270 _cleanup_close_ int master = -1;
3acc84eb
FB
3271 _cleanup_free_ char *console = NULL;
3272
3273 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3274 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3275 if (master < 0)
dc98caea 3276 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3277
3278 r = setup_dev_console(console);
3279 if (r < 0)
105a1a36 3280 return log_error_errno(r, "Failed to set up /dev/console: %m");
3acc84eb
FB
3281
3282 r = send_one_fd(master_pty_socket, master, 0);
3283 if (r < 0)
3284 return log_error_errno(r, "Failed to send master fd: %m");
3285 master_pty_socket = safe_close(master_pty_socket);
3286
3287 r = setup_stdio_as_dev_console();
3288 if (r < 0)
3289 return r;
3290 }
3291
de40a303
LP
3292 r = patch_sysctl();
3293 if (r < 0)
3294 return r;
3295
81f345df
LP
3296 if (arg_oom_score_adjust_set) {
3297 r = set_oom_score_adjust(arg_oom_score_adjust);
3298 if (r < 0)
3299 return log_error_errno(r, "Failed to adjust OOM score: %m");
3300 }
3301
0985c7c4
ZJS
3302 if (arg_cpu_set.set)
3303 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3304 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3305
c818eef1 3306 (void) setup_hostname();
03cfe0d5 3307
050f7277 3308 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3309 r = safe_personality(arg_personality);
3310 if (r < 0)
3311 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 3312 } else if (secondary) {
21022b9d
LP
3313 r = safe_personality(PER_LINUX32);
3314 if (r < 0)
3315 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
3316 }
3317
de40a303
LP
3318 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3319 if (r < 0)
3320 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3321
3322#if HAVE_SECCOMP
3323 if (arg_seccomp) {
3324
3325 if (is_seccomp_available()) {
3326
3327 r = seccomp_load(arg_seccomp);
7bc5e0b1 3328 if (ERRNO_IS_SECCOMP_FATAL(r))
de40a303
LP
3329 return log_error_errno(r, "Failed to install seccomp filter: %m");
3330 if (r < 0)
3331 log_debug_errno(r, "Failed to install seccomp filter: %m");
3332 }
3333 } else
3334#endif
3335 {
6b000af4 3336 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
de40a303
LP
3337 if (r < 0)
3338 return r;
3339 }
3340
349cc4a5 3341#if HAVE_SELINUX
03cfe0d5 3342 if (arg_selinux_context)
2ed96880 3343 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3344 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3345#endif
3346
de40a303
LP
3347 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3348 * if we need to later on. */
3349 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3350 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3351
3352 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3462d773 3353 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
de40a303 3354 else
3462d773 3355 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
03cfe0d5
LP
3356 if (r < 0)
3357 return r;
3358
de40a303
LP
3359 r = drop_capabilities(getuid());
3360 if (r < 0)
3361 return log_error_errno(r, "Dropping capabilities failed: %m");
3362
66edd963
LP
3363 if (arg_no_new_privileges)
3364 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3365 return log_error_errno(errno, "Failed to disable new privileges: %m");
3366
6aadfa4c
ILG
3367 /* LXC sets container=lxc, so follow the scheme here */
3368 envp[n_env++] = strjoina("container=", arg_container_service_name);
3369
03cfe0d5
LP
3370 envp[n_env] = strv_find_prefix(environ, "TERM=");
3371 if (envp[n_env])
313cefa1 3372 n_env++;
03cfe0d5 3373
de40a303
LP
3374 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3375 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3376 return log_oom();
3377
3378 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3379 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
3380 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3381 return log_oom();
03cfe0d5 3382
3bbaff3e 3383 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3384
691675ba 3385 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 3386 return log_oom();
03cfe0d5
LP
3387
3388 if (fdset_size(fds) > 0) {
3389 r = fdset_cloexec(fds, false);
3390 if (r < 0)
3391 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3392
3393 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3394 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3395 return log_oom();
3396 }
9c1e04d0
AP
3397 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3398 return log_oom();
03cfe0d5 3399
3652872a
LP
3400 if (arg_n_credentials > 0) {
3401 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3402 if (!envp[n_env])
3403 return log_oom();
3404 n_env++;
3405 }
3406
ed4512d0 3407 env_use = strv_env_merge(3, envp, os_release_pairs, arg_setenv);
2371271c
TG
3408 if (!env_use)
3409 return log_oom();
03cfe0d5
LP
3410
3411 /* Let the parent know that we are ready and
3412 * wait until the parent is ready with the
3413 * setup, too... */
baaa35ad 3414 if (!barrier_place_and_sync(barrier)) /* #5 */
335d2ead 3415 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
03cfe0d5 3416
5f932eb9
LP
3417 if (arg_chdir)
3418 if (chdir(arg_chdir) < 0)
3419 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3420
7732f92b 3421 if (arg_start_mode == START_PID2) {
75bf701f 3422 r = stub_pid1(arg_uuid);
7732f92b
LP
3423 if (r < 0)
3424 return r;
3425 }
3426
335d2ead
LP
3427 if (arg_console_mode != CONSOLE_PIPE) {
3428 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3429 * are configured for that. Acquire it as controlling tty. */
3430 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3431 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3432 }
3433
de40a303
LP
3434 log_debug("Inner child completed, invoking payload.");
3435
8ca082b4
LP
3436 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3437 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3438 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3439 log_close();
8ca082b4
LP
3440 log_set_open_when_needed(true);
3441
03cfe0d5
LP
3442 (void) fdset_close_others(fds);
3443
7732f92b 3444 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3445 char **a;
3446 size_t m;
3447
3448 /* Automatically search for the init system */
3449
75f32f04
ZJS
3450 m = strv_length(arg_parameters);
3451 a = newa(char*, m + 2);
3452 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3453 a[1 + m] = NULL;
03cfe0d5 3454
ced58da7 3455 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
3456 execve(a[0], a, env_use);
3457
ced58da7 3458 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
3459 execve(a[0], a, env_use);
3460
ced58da7 3461 a[0] = (char*) "/sbin/init";
03cfe0d5 3462 execve(a[0], a, env_use);
ced58da7
LP
3463
3464 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3465 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3466 const char *dollar_path;
3467
1a68e1e5 3468 exec_target = arg_parameters[0];
b6b180b7
LP
3469
3470 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3471 * binary. */
3472 dollar_path = strv_env_get(env_use, "PATH");
3473 if (dollar_path) {
6f646e01 3474 if (setenv("PATH", dollar_path, 1) < 0)
b6b180b7
LP
3475 return log_error_errno(errno, "Failed to update $PATH: %m");
3476 }
3477
f757855e 3478 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3479 } else {
5f932eb9 3480 if (!arg_chdir)
d929b0f9
ZJS
3481 /* If we cannot change the directory, we'll end up in /, that is expected. */
3482 (void) chdir(home ?: "/root");
5f932eb9 3483
03cfe0d5
LP
3484 execle("/bin/bash", "-bash", NULL, env_use);
3485 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
3486
3487 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
3488 }
3489
8ca082b4 3490 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3491}
3492
e96ceaba 3493static int setup_notify_child(void) {
271f518f 3494 _cleanup_close_ int fd = -1;
9c1e04d0 3495 union sockaddr_union sa = {
44ed5214
LP
3496 .un.sun_family = AF_UNIX,
3497 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3498 };
3499 int r;
3500
3501 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3502 if (fd < 0)
3503 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3504
3505 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3506 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3507
9c1e04d0 3508 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3509 if (r < 0)
44ed5214 3510 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3511
adc7d9f0 3512 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3513 if (r < 0)
adc7d9f0 3514 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3515
2ff48e98 3516 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3517 if (r < 0)
2ff48e98 3518 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3519
271f518f 3520 return TAKE_FD(fd);
9c1e04d0
AP
3521}
3522
03cfe0d5
LP
3523static int outer_child(
3524 Barrier *barrier,
3525 const char *directory,
2d845785 3526 DissectedImage *dissected_image,
03cfe0d5
LP
3527 bool secondary,
3528 int pid_socket,
e01ff70a 3529 int uuid_socket,
9c1e04d0 3530 int notify_socket,
03cfe0d5
LP
3531 int kmsg_socket,
3532 int rtnl_socket,
825d5287 3533 int uid_shift_socket,
3acc84eb 3534 int master_pty_socket,
8199d554 3535 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
3536 FDSet *fds,
3537 int netns_fd) {
03cfe0d5 3538
e1bb4b0d 3539 _cleanup_strv_free_ char **os_release_pairs = NULL;
bf428efb 3540 _cleanup_close_ int fd = -1;
e5f10caf 3541 const char *p;
03cfe0d5
LP
3542 pid_t pid;
3543 ssize_t l;
de40a303 3544 int r;
03cfe0d5 3545
b37469d7
LP
3546 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
3547 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3548 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3549 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3550
03cfe0d5
LP
3551 assert(barrier);
3552 assert(directory);
03cfe0d5 3553 assert(pid_socket >= 0);
e01ff70a 3554 assert(uuid_socket >= 0);
9c1e04d0 3555 assert(notify_socket >= 0);
3acc84eb 3556 assert(master_pty_socket >= 0);
03cfe0d5
LP
3557 assert(kmsg_socket >= 0);
3558
de40a303
LP
3559 log_debug("Outer child is initializing.");
3560
e1bb4b0d
LB
3561 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3562 if (r < 0)
3563 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3564
03cfe0d5
LP
3565 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3566 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3567
03cfe0d5
LP
3568 r = reset_audit_loginuid();
3569 if (r < 0)
3570 return r;
3571
2a2e78e9
LP
3572 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3573 * mounts to the real root. */
511a8cfe 3574 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
60e76d48
ZJS
3575 if (r < 0)
3576 return r;
03cfe0d5 3577
2d845785 3578 if (dissected_image) {
2d3a5a73
LP
3579 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3580 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3581 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3582 * makes sure ESP partitions and userns are compatible. */
3583
af187ab2
LP
3584 r = dissected_image_mount_and_warn(
3585 dissected_image, directory, arg_uid_shift,
3586 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
3587 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK)|
3588 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785 3589 if (r < 0)
af187ab2 3590 return r;
2d845785 3591 }
03cfe0d5 3592
391567f4
LP
3593 r = determine_uid_shift(directory);
3594 if (r < 0)
3595 return r;
3596
0de7acce 3597 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 3598 /* Let the parent know which UID shift we read from the image */
825d5287
RM
3599 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3600 if (l < 0)
3601 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3602 if (l != sizeof(arg_uid_shift))
3603 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3604 "Short write while sending UID shift.");
0e7ac751 3605
0de7acce 3606 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3607 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3608 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3609 * not it will pick a different one, and send it back to us. */
3610
3611 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3612 if (l < 0)
3613 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3614 if (l != sizeof(arg_uid_shift))
3615 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3616 "Short read while receiving UID shift.");
0e7ac751
LP
3617 }
3618
ff6c6cc1
LP
3619 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3620 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3621 }
3622
6f83d3d1
LP
3623 if (path_equal(directory, "/")) {
3624 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3625 * place, so that we can make changes to its mount structure (for example, to implement
3626 * --volatile=) without this interfering with our ability to access files such as
3627 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3628 * (instead of a temporary directory, since we are living in our own mount namspace here
3629 * already, and thus don't need to be afraid of colliding with anyone else's mounts).*/
3630 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3631
511a8cfe 3632 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
6f83d3d1
LP
3633 if (r < 0)
3634 return r;
3635
3636 directory = "/run/systemd/nspawn-root";
e50cd82f 3637 }
7d0ecdd6
LP
3638
3639 r = setup_pivot_root(
3640 directory,
3641 arg_pivot_root_new,
3642 arg_pivot_root_old);
3643 if (r < 0)
3644 return r;
3645
3646 r = setup_volatile_mode(
3647 directory,
3648 arg_volatile_mode,
7d0ecdd6 3649 arg_uid_shift,
8f1ed04a 3650 arg_selinux_apifs_context);
7d0ecdd6
LP
3651 if (r < 0)
3652 return r;
3653
5f0a6347
DDM
3654 r = mount_custom(
3655 directory,
3656 arg_custom_mounts,
3657 arg_n_custom_mounts,
5f0a6347 3658 arg_uid_shift,
5f0a6347
DDM
3659 arg_selinux_apifs_context,
3660 MOUNT_ROOT_ONLY);
3661 if (r < 0)
3662 return r;
3663
5530dc87
DDM
3664 /* Make sure we always have a mount that we can move to root later on. */
3665 if (!path_is_mount_point(directory, NULL, 0)) {
511a8cfe 3666 r = mount_nofollow_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
5530dc87
DDM
3667 if (r < 0)
3668 return r;
3669 }
3670
2d3a5a73
LP
3671 if (dissected_image) {
3672 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3673 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
4fcb96ce
LP
3674 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK));
3675 if (r == -EUCLEAN)
3676 return log_error_errno(r, "File system check for image failed: %m");
2d3a5a73 3677 if (r < 0)
4fcb96ce 3678 return log_error_errno(r, "Failed to mount image file system: %m");
2d3a5a73
LP
3679 }
3680
8199d554
LP
3681 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3682 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3683
3684 r = detect_unified_cgroup_hierarchy_from_image(directory);
3685 if (r < 0)
3686 return r;
3687
3688 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3689 if (l < 0)
3690 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3691 if (l != sizeof(arg_unified_cgroup_hierarchy))
3692 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3693 "Short write while sending cgroup mode.");
8199d554
LP
3694
3695 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3696 }
3697
4ad14eff
LP
3698 /* Mark everything as shared so our mounts get propagated down. This is
3699 * required to make new bind mounts available in systemd services
5238e957 3700 * inside the container that create a new mount namespace.
4ad14eff
LP
3701 * See https://github.com/systemd/systemd/issues/3860
3702 * Further submounts (such as /dev) done after this will inherit the
5f0a6347
DDM
3703 * shared propagation mode.
3704 *
3705 * IMPORTANT: Do not overmount the root directory anymore from now on to
3706 * enable moving the root directory mount to root later on.
3707 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3708 */
511a8cfe 3709 r = mount_nofollow_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
4ad14eff
LP
3710 if (r < 0)
3711 return r;
3712
3713 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3714 if (r < 0)
3715 return r;
3716
03cfe0d5
LP
3717 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3718 if (r < 0)
3719 return r;
3720
bbd407ea
DDM
3721 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3722 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
64e82c19 3723 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3724 if (r < 0)
3725 return log_error_errno(r, "Failed to make tree read-only: %m");
3726 }
3727
0de7acce 3728 r = mount_all(directory,
4f086aab 3729 arg_mount_settings,
0de7acce 3730 arg_uid_shift,
0de7acce 3731 arg_selinux_apifs_context);
03cfe0d5
LP
3732 if (r < 0)
3733 return r;
3734
07fa00f9
LP
3735 r = copy_devnodes(directory);
3736 if (r < 0)
03cfe0d5
LP
3737 return r;
3738
de40a303
LP
3739 r = make_extra_nodes(directory);
3740 if (r < 0)
3741 return r;
3742
3743 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
e5f10caf 3744
9fac5029 3745 p = prefix_roota(directory, "/run/host");
e5f10caf 3746 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
03cfe0d5 3747
07fa00f9
LP
3748 r = setup_pts(directory);
3749 if (r < 0)
03cfe0d5
LP
3750 return r;
3751
3752 r = setup_propagate(directory);
3753 if (r < 0)
3754 return r;
3755
8e5430c4
LP
3756 r = setup_keyring();
3757 if (r < 0)
3758 return r;
3759
3652872a
LP
3760 r = setup_credentials(directory);
3761 if (r < 0)
3762 return r;
3763
5c4deb9a
MJ
3764 r = mount_custom(
3765 directory,
3766 arg_custom_mounts,
3767 arg_n_custom_mounts,
3768 arg_uid_shift,
3769 arg_selinux_apifs_context,
3770 MOUNT_NON_ROOT_ONLY);
3771 if (r < 0)
3772 return r;
3773
03cfe0d5
LP
3774 r = setup_timezone(directory);
3775 if (r < 0)
3776 return r;
3777
3778 r = setup_resolv_conf(directory);
3779 if (r < 0)
3780 return r;
3781
e01ff70a
MS
3782 r = setup_machine_id(directory);
3783 if (r < 0)
3784 return r;
3785
03cfe0d5
LP
3786 r = setup_journal(directory);
3787 if (r < 0)
3788 return r;
3789
0f48ba7b
LP
3790 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3791 p = prefix_roota(directory, "/run/host/container-manager");
3792 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3793
3794 /* The same stuff as the $container_uuid env var */
3795 p = prefix_roota(directory, "/run/host/container-uuid");
3796 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3797
489fae52 3798 if (!arg_use_cgns) {
0996ef00
CB
3799 r = mount_cgroups(
3800 directory,
3801 arg_unified_cgroup_hierarchy,
3802 arg_userns_mode != USER_NAMESPACE_NO,
3803 arg_uid_shift,
3804 arg_uid_range,
5a8ff0e6 3805 arg_selinux_apifs_context,
ada54120 3806 false);
0996ef00
CB
3807 if (r < 0)
3808 return r;
3809 }
03cfe0d5
LP
3810
3811 r = mount_move_root(directory);
3812 if (r < 0)
3813 return log_error_errno(r, "Failed to move root directory: %m");
3814
e96ceaba 3815 fd = setup_notify_child();
9c1e04d0
AP
3816 if (fd < 0)
3817 return fd;
3818
03cfe0d5 3819 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3820 arg_clone_ns_flags |
8869a0b4 3821 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3822 if (pid < 0)
3823 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3824 if (pid == 0) {
3825 pid_socket = safe_close(pid_socket);
e01ff70a 3826 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3827 notify_socket = safe_close(notify_socket);
825d5287 3828 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5 3829
2a2e78e9
LP
3830 /* The inner child has all namespaces that are requested, so that we all are owned by the
3831 * user if user namespaces are turned on. */
03cfe0d5 3832
d7bea6b6
DP
3833 if (arg_network_namespace_path) {
3834 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3835 if (r < 0)
e2d39e54 3836 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
3837 }
3838
e1bb4b0d 3839 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds, os_release_pairs);
03cfe0d5
LP
3840 if (r < 0)
3841 _exit(EXIT_FAILURE);
3842
3843 _exit(EXIT_SUCCESS);
3844 }
3845
3846 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3847 if (l < 0)
3848 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
3849 if (l != sizeof(pid))
3850 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3851 "Short write while sending PID.");
03cfe0d5 3852
e01ff70a
MS
3853 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3854 if (l < 0)
3855 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
3856 if (l != sizeof(arg_uuid))
3857 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3858 "Short write while sending machine ID.");
e01ff70a 3859
9c1e04d0
AP
3860 l = send_one_fd(notify_socket, fd, 0);
3861 if (l < 0)
ba72801d 3862 return log_error_errno(l, "Failed to send notify fd: %m");
9c1e04d0 3863
03cfe0d5 3864 pid_socket = safe_close(pid_socket);
e01ff70a 3865 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3866 notify_socket = safe_close(notify_socket);
3acc84eb 3867 master_pty_socket = safe_close(master_pty_socket);
327e26d6
KN
3868 kmsg_socket = safe_close(kmsg_socket);
3869 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3870 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3871
3872 return 0;
3873}
3874
0e7ac751 3875static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 3876 bool tried_hashed = false;
0e7ac751
LP
3877 unsigned n_tries = 100;
3878 uid_t candidate;
3879 int r;
3880
3881 assert(shift);
3882 assert(ret_lock_file);
0de7acce 3883 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3884 assert(arg_uid_range == 0x10000U);
3885
3886 candidate = *shift;
3887
3888 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3889
3890 for (;;) {
fbd0b64f 3891 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 3892 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
3893
3894 if (--n_tries <= 0)
3895 return -EBUSY;
3896
87d5e4f2 3897 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
3898 goto next;
3899 if ((candidate & UINT32_C(0xFFFF)) != 0)
3900 goto next;
3901
3902 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3903 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3904 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3905 goto next;
3906 if (r < 0)
3907 return r;
3908
3909 /* Make some superficial checks whether the range is currently known in the user database */
3910 if (getpwuid(candidate))
3911 goto next;
3912 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3913 goto next;
3914 if (getgrgid(candidate))
3915 goto next;
3916 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3917 goto next;
3918
3919 *ret_lock_file = lf;
3920 lf = (struct LockFile) LOCK_FILE_INIT;
3921 *shift = candidate;
3922 return 0;
3923
3924 next:
d381c8a6
LP
3925 if (arg_machine && !tried_hashed) {
3926 /* Try to hash the base from the container name */
3927
3928 static const uint8_t hash_key[] = {
3929 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3930 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3931 };
3932
3933 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3934
3935 tried_hashed = true;
3936 } else
3937 random_bytes(&candidate, sizeof(candidate));
3938
87d5e4f2 3939 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
3940 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3941 }
3942}
3943
03cfe0d5 3944static int setup_uid_map(pid_t pid) {
fbd0b64f 3945 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
03cfe0d5
LP
3946 int r;
3947
3948 assert(pid > 1);
3949
3950 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3951 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
57512c89 3952 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3953 if (r < 0)
3954 return log_error_errno(r, "Failed to write UID map: %m");
3955
3956 /* We always assign the same UID and GID ranges */
3957 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
57512c89 3958 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3959 if (r < 0)
3960 return log_error_errno(r, "Failed to write GID map: %m");
3961
3962 return 0;
3963}
3964
9c1e04d0 3965static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3966 char buf[NOTIFY_BUFFER_MAX+1];
3967 char *p = NULL;
3968 struct iovec iovec = {
3969 .iov_base = buf,
3970 .iov_len = sizeof(buf)-1,
3971 };
fb29cdbe
LP
3972 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
3973 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
9c1e04d0
AP
3974 struct msghdr msghdr = {
3975 .msg_iov = &iovec,
3976 .msg_iovlen = 1,
3977 .msg_control = &control,
3978 .msg_controllen = sizeof(control),
3979 };
371d72e0 3980 struct ucred *ucred;
9c1e04d0
AP
3981 ssize_t n;
3982 pid_t inner_child_pid;
3983 _cleanup_strv_free_ char **tags = NULL;
3984
3985 assert(userdata);
3986
3987 inner_child_pid = PTR_TO_PID(userdata);
3988
3989 if (revents != EPOLLIN) {
3990 log_warning("Got unexpected poll event for notify fd.");
3991 return 0;
3992 }
3993
3691bcf3
LP
3994 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3995 if (IN_SET(n, -EAGAIN, -EINTR))
3996 return 0;
3997 if (n < 0)
3998 return log_warning_errno(n, "Couldn't read notification socket: %m");
9c1e04d0 3999
9c1e04d0
AP
4000 cmsg_close_all(&msghdr);
4001
371d72e0 4002 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
9c1e04d0 4003 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 4004 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
4005 return 0;
4006 }
4007
4008 if ((size_t) n >= sizeof(buf)) {
4009 log_warning("Received notify message exceeded maximum size. Ignoring.");
4010 return 0;
4011 }
4012
4013 buf[n] = 0;
4014 tags = strv_split(buf, "\n\r");
4015 if (!tags)
4016 return log_oom();
4017
4018 if (strv_find(tags, "READY=1"))
04f590a4 4019 (void) sd_notifyf(false, "READY=1\n");
9c1e04d0
AP
4020
4021 p = strv_find_startswith(tags, "STATUS=");
4022 if (p)
04f590a4 4023 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
4024
4025 return 0;
4026}
4027
e96ceaba 4028static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 4029 int r;
9c1e04d0 4030
5773024d 4031 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
4032 if (r < 0)
4033 return log_error_errno(r, "Failed to allocate notify event source: %m");
4034
5773024d 4035 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
4036
4037 return 0;
4038}
4039
5d961407
LP
4040static int merge_settings(Settings *settings, const char *path) {
4041 int rl;
f757855e 4042
5d961407
LP
4043 assert(settings);
4044 assert(path);
f757855e 4045
5d961407
LP
4046 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4047 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 4048
7732f92b
LP
4049 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4050 settings->start_mode >= 0) {
4051 arg_start_mode = settings->start_mode;
130d3d22 4052 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
4053 }
4054
a2f577fc
JL
4055 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
4056 arg_ephemeral = settings->ephemeral;
4057
de40a303
LP
4058 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4059 settings->root) {
4060
4061 if (!arg_settings_trusted)
4062 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4063 else
4064 free_and_replace(arg_directory, settings->root);
4065 }
4066
b53ede69
PW
4067 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4068 settings->pivot_root_new) {
4069 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4070 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4071 }
4072
5f932eb9 4073 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
4074 settings->working_directory)
4075 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 4076
f757855e 4077 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
4078 settings->environment)
4079 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 4080
de40a303
LP
4081 if ((arg_settings_mask & SETTING_USER) == 0) {
4082
4083 if (settings->user)
4084 free_and_replace(arg_user, settings->user);
4085
4086 if (uid_is_valid(settings->uid))
4087 arg_uid = settings->uid;
4088 if (gid_is_valid(settings->gid))
4089 arg_gid = settings->gid;
4090 if (settings->n_supplementary_gids > 0) {
4091 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4092 arg_n_supplementary_gids = settings->n_supplementary_gids;
4093 }
4094 }
f757855e
LP
4095
4096 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 4097 uint64_t plus, minus;
7be830c6 4098 uint64_t network_minus = 0;
88fc9c9b 4099 uint64_t ambient;
f757855e 4100
de40a303
LP
4101 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4102 * Settings structure */
4103
0e265674 4104 plus = settings->capability;
a3fc6b55
LP
4105 minus = settings->drop_capability;
4106
4107 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
4108 if (settings_private_network(settings))
4109 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4110 else
7be830c6 4111 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 4112 }
0e265674
LP
4113
4114 if (!arg_settings_trusted && plus != 0) {
4115 if (settings->capability != 0)
5d961407 4116 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
4117 } else {
4118 arg_caps_retain &= ~network_minus;
520e0d54 4119 arg_caps_retain |= plus;
7be830c6 4120 }
f757855e 4121
a3fc6b55 4122 arg_caps_retain &= ~minus;
de40a303
LP
4123
4124 /* Copy the full capabilities over too */
4125 if (capability_quintet_is_set(&settings->full_capabilities)) {
4126 if (!arg_settings_trusted)
5238e957 4127 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
4128 else
4129 arg_full_capabilities = settings->full_capabilities;
4130 }
88fc9c9b
TH
4131
4132 ambient = settings->ambient_capability;
4133 if (!arg_settings_trusted && ambient != 0)
4134 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4135 else
4136 arg_caps_ambient |= ambient;
f757855e
LP
4137 }
4138
4139 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4140 settings->kill_signal > 0)
4141 arg_kill_signal = settings->kill_signal;
4142
4143 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4144 settings->personality != PERSONALITY_INVALID)
4145 arg_personality = settings->personality;
4146
4147 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4148 !sd_id128_is_null(settings->machine_id)) {
4149
4150 if (!arg_settings_trusted)
5d961407 4151 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
4152 else
4153 arg_uuid = settings->machine_id;
4154 }
4155
4156 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4157 settings->read_only >= 0)
4158 arg_read_only = settings->read_only;
4159
4160 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4161 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4162 arg_volatile_mode = settings->volatile_mode;
4163
4164 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4165 settings->n_custom_mounts > 0) {
4166
4167 if (!arg_settings_trusted)
5d961407 4168 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
4169 else {
4170 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 4171 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 4172 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
4173 settings->n_custom_mounts = 0;
4174 }
4175 }
4176
4177 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4178 (settings->private_network >= 0 ||
4179 settings->network_veth >= 0 ||
4180 settings->network_bridge ||
22b28dfd 4181 settings->network_zone ||
f757855e
LP
4182 settings->network_interfaces ||
4183 settings->network_macvlan ||
f6d6bad1 4184 settings->network_ipvlan ||
de40a303
LP
4185 settings->network_veth_extra ||
4186 settings->network_namespace_path)) {
f757855e
LP
4187
4188 if (!arg_settings_trusted)
5d961407 4189 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 4190 else {
f6d6bad1 4191 arg_network_veth = settings_network_veth(settings);
0e265674
LP
4192 arg_private_network = settings_private_network(settings);
4193
130d3d22
YW
4194 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4195 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4196 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4197 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 4198
1cc6c93a
YW
4199 free_and_replace(arg_network_bridge, settings->network_bridge);
4200 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
4201
4202 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
4203 }
4204 }
4205
4206 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4207 settings->expose_ports) {
4208
4209 if (!arg_settings_trusted)
5d961407 4210 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
4211 else {
4212 expose_port_free_all(arg_expose_ports);
1cc6c93a 4213 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
4214 }
4215 }
4216
0de7acce
LP
4217 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4218 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4219
4220 if (!arg_settings_trusted)
5d961407 4221 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
4222 else {
4223 arg_userns_mode = settings->userns_mode;
4224 arg_uid_shift = settings->uid_shift;
4225 arg_uid_range = settings->uid_range;
4226 arg_userns_chown = settings->userns_chown;
4227 }
4228 }
4229
9c1e04d0
AP
4230 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
4231 arg_notify_ready = settings->notify_ready;
4232
960e4569
LP
4233 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4234
6b000af4 4235 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
5d961407 4236 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 4237 else {
6b000af4
LP
4238 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4239 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
960e4569 4240 }
de40a303
LP
4241
4242#if HAVE_SECCOMP
4243 if (!arg_settings_trusted && settings->seccomp)
4244 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4245 else {
4246 seccomp_release(arg_seccomp);
4247 arg_seccomp = TAKE_PTR(settings->seccomp);
4248 }
4249#endif
960e4569
LP
4250 }
4251
bf428efb
LP
4252 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4253 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4254 continue;
4255
4256 if (!settings->rlimit[rl])
4257 continue;
4258
4259 if (!arg_settings_trusted) {
5d961407 4260 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
4261 continue;
4262 }
4263
4264 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4265 }
4266
3a9530e5
LP
4267 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4268 settings->hostname)
4269 free_and_replace(arg_hostname, settings->hostname);
4270
66edd963
LP
4271 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4272 settings->no_new_privileges >= 0)
4273 arg_no_new_privileges = settings->no_new_privileges;
4274
81f345df
LP
4275 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4276 settings->oom_score_adjust_set) {
4277
4278 if (!arg_settings_trusted)
5d961407 4279 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
4280 else {
4281 arg_oom_score_adjust = settings->oom_score_adjust;
4282 arg_oom_score_adjust_set = true;
4283 }
4284 }
4285
d107bb7d 4286 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 4287 settings->cpu_set.set) {
d107bb7d
LP
4288
4289 if (!arg_settings_trusted)
5d961407 4290 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 4291 else {
0985c7c4
ZJS
4292 cpu_set_reset(&arg_cpu_set);
4293 arg_cpu_set = settings->cpu_set;
4294 settings->cpu_set = (CPUSet) {};
d107bb7d
LP
4295 }
4296 }
4297
09d423e9
LP
4298 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4299 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4300 arg_resolv_conf = settings->resolv_conf;
4301
4e1d6aa9
LP
4302 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4303 settings->link_journal != _LINK_JOURNAL_INVALID) {
4304
4305 if (!arg_settings_trusted)
4306 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4307 else {
4308 arg_link_journal = settings->link_journal;
4309 arg_link_journal_try = settings->link_journal_try;
4310 }
4311 }
4312
1688841f
LP
4313 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4314 settings->timezone != _TIMEZONE_MODE_INVALID)
4315 arg_timezone = settings->timezone;
4316
de40a303
LP
4317 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4318 settings->slice) {
4319
4320 if (!arg_settings_trusted)
4321 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4322 else
4323 free_and_replace(arg_slice, settings->slice);
4324 }
4325
4326 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4327 settings->use_cgns >= 0) {
4328
4329 if (!arg_settings_trusted)
4330 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4331 else
4332 arg_use_cgns = settings->use_cgns;
4333 }
4334
4335 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4336 settings->clone_ns_flags != (unsigned long) -1) {
4337
4338 if (!arg_settings_trusted)
4339 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4340 else
4341 arg_clone_ns_flags = settings->clone_ns_flags;
4342 }
4343
4344 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4345 settings->console_mode >= 0) {
4346
4347 if (!arg_settings_trusted)
4348 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4349 else
4350 arg_console_mode = settings->console_mode;
4351 }
4352
4353 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4354 * don't consult arg_settings_mask for them. */
4355
4356 sd_bus_message_unref(arg_property_message);
4357 arg_property_message = TAKE_PTR(settings->properties);
4358
4359 arg_console_width = settings->console_width;
4360 arg_console_height = settings->console_height;
4361
b2645747 4362 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4363 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4364 arg_n_extra_nodes = settings->n_extra_nodes;
4365
f757855e
LP
4366 return 0;
4367}
4368
5d961407
LP
4369static int load_settings(void) {
4370 _cleanup_(settings_freep) Settings *settings = NULL;
4371 _cleanup_fclose_ FILE *f = NULL;
4372 _cleanup_free_ char *p = NULL;
4373 const char *fn, *i;
4374 int r;
4375
de40a303
LP
4376 if (arg_oci_bundle)
4377 return 0;
4378
5d961407
LP
4379 /* If all settings are masked, there's no point in looking for
4380 * the settings file */
d7a0f1f4 4381 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
5d961407
LP
4382 return 0;
4383
4384 fn = strjoina(arg_machine, ".nspawn");
4385
4386 /* We first look in the admin's directories in /etc and /run */
4387 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4388 _cleanup_free_ char *j = NULL;
4389
657ee2d8 4390 j = path_join(i, fn);
5d961407
LP
4391 if (!j)
4392 return log_oom();
4393
4394 f = fopen(j, "re");
4395 if (f) {
4396 p = TAKE_PTR(j);
4397
4398 /* By default, we trust configuration from /etc and /run */
4399 if (arg_settings_trusted < 0)
4400 arg_settings_trusted = true;
4401
4402 break;
4403 }
4404
4405 if (errno != ENOENT)
4406 return log_error_errno(errno, "Failed to open %s: %m", j);
4407 }
4408
4409 if (!f) {
4410 /* After that, let's look for a file next to the
4411 * actual image we shall boot. */
4412
4413 if (arg_image) {
4414 p = file_in_same_dir(arg_image, fn);
4415 if (!p)
4416 return log_oom();
cd6e3914 4417 } else if (arg_directory && !path_equal(arg_directory, "/")) {
5d961407
LP
4418 p = file_in_same_dir(arg_directory, fn);
4419 if (!p)
4420 return log_oom();
4421 }
4422
4423 if (p) {
4424 f = fopen(p, "re");
4425 if (!f && errno != ENOENT)
4426 return log_error_errno(errno, "Failed to open %s: %m", p);
4427
4428 /* By default, we do not trust configuration from /var/lib/machines */
4429 if (arg_settings_trusted < 0)
4430 arg_settings_trusted = false;
4431 }
4432 }
4433
4434 if (!f)
4435 return 0;
4436
4437 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4438
4439 r = settings_load(f, p, &settings);
4440 if (r < 0)
4441 return r;
4442
4443 return merge_settings(settings, p);
4444}
4445
de40a303
LP
4446static int load_oci_bundle(void) {
4447 _cleanup_(settings_freep) Settings *settings = NULL;
4448 int r;
4449
4450 if (!arg_oci_bundle)
4451 return 0;
4452
4453 /* By default let's trust OCI bundles */
4454 if (arg_settings_trusted < 0)
4455 arg_settings_trusted = true;
4456
4457 r = oci_load(NULL, arg_oci_bundle, &settings);
4458 if (r < 0)
4459 return r;
4460
4461 return merge_settings(settings, arg_oci_bundle);
4462}
4463
3acc84eb 4464static int run_container(
2d845785 4465 DissectedImage *dissected_image,
b0067625
ZJS
4466 bool secondary,
4467 FDSet *fds,
4468 char veth_name[IFNAMSIZ], bool *veth_created,
4469 union in_addr_union *exposed,
3acc84eb 4470 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4471
4472 static const struct sigaction sa = {
4473 .sa_handler = nop_signal_handler,
e28c7cd0 4474 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4475 };
4476
8e766630 4477 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
4478 _cleanup_close_ int etc_passwd_lock = -1;
4479 _cleanup_close_pair_ int
4480 kmsg_socket_pair[2] = { -1, -1 },
4481 rtnl_socket_pair[2] = { -1, -1 },
4482 pid_socket_pair[2] = { -1, -1 },
4483 uuid_socket_pair[2] = { -1, -1 },
4484 notify_socket_pair[2] = { -1, -1 },
8199d554 4485 uid_shift_socket_pair[2] = { -1, -1 },
3acc84eb 4486 master_pty_socket_pair[2] = { -1, -1 },
8199d554
LP
4487 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4488
3acc84eb 4489 _cleanup_close_ int notify_socket = -1;
b0067625 4490 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4491 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4492 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4493 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4494 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4495 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
b0067625 4496 ContainerStatus container_status = 0;
b0067625
ZJS
4497 int ifi = 0, r;
4498 ssize_t l;
4499 sigset_t mask_chld;
5b4855ab 4500 _cleanup_close_ int child_netns_fd = -1;
b0067625
ZJS
4501
4502 assert_se(sigemptyset(&mask_chld) == 0);
4503 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4504
4505 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4506 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4507 * check with getpwuid() if the specific user already exists. Note that /etc might be
4508 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4509 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4510 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4511 * really ours. */
4512
4513 etc_passwd_lock = take_etc_passwd_lock(NULL);
4514 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4515 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4516 }
4517
4518 r = barrier_create(&barrier);
4519 if (r < 0)
4520 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4521
4522 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4523 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4524
4525 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4526 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4527
4528 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4529 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4530
4531 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4532 return log_error_errno(errno, "Failed to create id socket pair: %m");
4533
4534 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4535 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4536
3acc84eb
FB
4537 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4538 return log_error_errno(errno, "Failed to create console socket pair: %m");
4539
b0067625
ZJS
4540 if (arg_userns_mode != USER_NAMESPACE_NO)
4541 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4542 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4543
8199d554
LP
4544 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4545 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4546 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4547
b0067625
ZJS
4548 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4549 * parent's blocking calls and give it a chance to call wait() and terminate. */
4550 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4551 if (r < 0)
4552 return log_error_errno(errno, "Failed to change the signal mask: %m");
4553
4554 r = sigaction(SIGCHLD, &sa, NULL);
4555 if (r < 0)
4556 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4557
d7bea6b6 4558 if (arg_network_namespace_path) {
5b4855ab
DDM
4559 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4560 if (child_netns_fd < 0)
d7bea6b6
DP
4561 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4562
5b4855ab 4563 r = fd_is_network_ns(child_netns_fd);
6619ad88
LP
4564 if (r == -EUCLEAN)
4565 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4566 else if (r < 0)
d7bea6b6 4567 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4568 else if (r == 0)
4569 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4570 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4571 }
4572
b0067625
ZJS
4573 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4574 if (*pid < 0)
4575 return log_error_errno(errno, "clone() failed%s: %m",
4576 errno == EINVAL ?
4577 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4578
4579 if (*pid == 0) {
4580 /* The outer child only has a file system namespace. */
4581 barrier_set_role(&barrier, BARRIER_CHILD);
4582
b0067625
ZJS
4583 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4584 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4585 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4586 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4587 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3acc84eb 4588 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
b0067625 4589 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 4590 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
4591
4592 (void) reset_all_signal_handlers();
4593 (void) reset_signal_mask();
4594
4595 r = outer_child(&barrier,
4596 arg_directory,
2d845785 4597 dissected_image,
b0067625
ZJS
4598 secondary,
4599 pid_socket_pair[1],
4600 uuid_socket_pair[1],
4601 notify_socket_pair[1],
4602 kmsg_socket_pair[1],
4603 rtnl_socket_pair[1],
4604 uid_shift_socket_pair[1],
3acc84eb 4605 master_pty_socket_pair[1],
8199d554 4606 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6 4607 fds,
5b4855ab 4608 child_netns_fd);
b0067625
ZJS
4609 if (r < 0)
4610 _exit(EXIT_FAILURE);
4611
4612 _exit(EXIT_SUCCESS);
4613 }
4614
4615 barrier_set_role(&barrier, BARRIER_PARENT);
4616
e4077ff6 4617 fdset_close(fds);
b0067625
ZJS
4618
4619 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4620 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4621 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4622 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4623 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3acc84eb 4624 master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
b0067625 4625 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 4626 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
4627
4628 if (arg_userns_mode != USER_NAMESPACE_NO) {
4629 /* The child just let us know the UID shift it might have read from the image. */
4630 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4631 if (l < 0)
4632 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4633 if (l != sizeof arg_uid_shift)
4634 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4635
4636 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4637 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4638 * image, but if that's already in use, pick a new one, and report back to the child,
4639 * which one we now picked. */
4640
4641 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4642 if (r < 0)
4643 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4644
4645 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4646 if (l < 0)
4647 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4648 if (l != sizeof arg_uid_shift)
4649 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625
ZJS
4650 }
4651 }
4652
8199d554
LP
4653 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4654 /* The child let us know the support cgroup mode it might have read from the image. */
4655 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4656 if (l < 0)
4657 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113
LP
4658 if (l != sizeof(arg_unified_cgroup_hierarchy))
4659 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4660 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4661 }
4662
b0067625 4663 /* Wait for the outer child. */
d2e0ac3d
LP
4664 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4665 if (r < 0)
4666 return r;
4667 if (r != EXIT_SUCCESS)
4668 return -EIO;
b0067625
ZJS
4669
4670 /* And now retrieve the PID of the inner child. */
4671 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4672 if (l < 0)
4673 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4674 if (l != sizeof *pid)
4675 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4676
4677 /* We also retrieve container UUID in case it was generated by outer child */
4678 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4679 if (l < 0)
4680 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4681 if (l != sizeof(arg_uuid))
4682 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4683
4684 /* We also retrieve the socket used for notifications generated by outer child */
4685 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4686 if (notify_socket < 0)
4687 return log_error_errno(notify_socket,
4688 "Failed to receive notification socket from the outer child: %m");
4689
4690 log_debug("Init process invoked as PID "PID_FMT, *pid);
4691
4692 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4693 if (!barrier_place_and_sync(&barrier)) /* #1 */
4694 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625
ZJS
4695
4696 r = setup_uid_map(*pid);
4697 if (r < 0)
4698 return r;
4699
4700 (void) barrier_place(&barrier); /* #2 */
4701 }
4702
4703 if (arg_private_network) {
75116558
PS
4704 if (!arg_network_namespace_path) {
4705 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4706 if (!barrier_place_and_sync(&barrier)) /* #3 */
4707 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4708 }
4709
5b4855ab
DDM
4710 if (child_netns_fd < 0) {
4711 /* Make sure we have an open file descriptor to the child's network
4712 * namespace so it stays alive even if the child exits. */
4713 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4714 if (r < 0)
4715 return log_error_errno(r, "Failed to open child network namespace: %m");
4716 }
4717
4718 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
b0067625
ZJS
4719 if (r < 0)
4720 return r;
4721
4722 if (arg_network_veth) {
4723 r = setup_veth(arg_machine, *pid, veth_name,
4724 arg_network_bridge || arg_network_zone);
4725 if (r < 0)
4726 return r;
4727 else if (r > 0)
4728 ifi = r;
4729
4730 if (arg_network_bridge) {
4731 /* Add the interface to a bridge */
4732 r = setup_bridge(veth_name, arg_network_bridge, false);
4733 if (r < 0)
4734 return r;
4735 if (r > 0)
4736 ifi = r;
4737 } else if (arg_network_zone) {
4738 /* Add the interface to a bridge, possibly creating it */
4739 r = setup_bridge(veth_name, arg_network_zone, true);
4740 if (r < 0)
4741 return r;
4742 if (r > 0)
4743 ifi = r;
4744 }
4745 }
4746
4747 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4748 if (r < 0)
4749 return r;
4750
4751 /* We created the primary and extra veth links now; let's remember this, so that we know to
4752 remove them later on. Note that we don't bother with removing veth links that were created
4753 here when their setup failed half-way, because in that case the kernel should be able to
4754 remove them on its own, since they cannot be referenced by anything yet. */
4755 *veth_created = true;
4756
4757 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4758 if (r < 0)
4759 return r;
4760
4761 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4762 if (r < 0)
4763 return r;
4764 }
4765
abdb9b08
LP
4766 if (arg_register || !arg_keep_unit) {
4767 r = sd_bus_default_system(&bus);
4768 if (r < 0)
4769 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
4770
4771 r = sd_bus_set_close_on_exit(bus, false);
4772 if (r < 0)
4773 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
4774 }
4775
4776 if (!arg_keep_unit) {
4777 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4778 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4779 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4780
75152a4d
LP
4781 r = sd_bus_match_signal_async(
4782 bus,
4783 NULL,
4784 "org.freedesktop.systemd1",
4785 NULL,
4786 "org.freedesktop.systemd1.Scope",
4787 "RequestStop",
4788 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 4789 if (r < 0)
75152a4d 4790 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
4791 }
4792
b0067625
ZJS
4793 if (arg_register) {
4794 r = register_machine(
abdb9b08 4795 bus,
b0067625
ZJS
4796 arg_machine,
4797 *pid,
4798 arg_directory,
4799 arg_uuid,
4800 ifi,
4801 arg_slice,
4802 arg_custom_mounts, arg_n_custom_mounts,
4803 arg_kill_signal,
4804 arg_property,
de40a303 4805 arg_property_message,
b0067625
ZJS
4806 arg_keep_unit,
4807 arg_container_service_name);
4808 if (r < 0)
4809 return r;
abdb9b08 4810
cd2dfc6f
LP
4811 } else if (!arg_keep_unit) {
4812 r = allocate_scope(
abdb9b08 4813 bus,
cd2dfc6f
LP
4814 arg_machine,
4815 *pid,
4816 arg_slice,
4817 arg_custom_mounts, arg_n_custom_mounts,
4818 arg_kill_signal,
de40a303
LP
4819 arg_property,
4820 arg_property_message);
cd2dfc6f
LP
4821 if (r < 0)
4822 return r;
4823
4824 } else if (arg_slice || arg_property)
4825 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 4826
27da7ef0 4827 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
4828 if (r < 0)
4829 return r;
4830
27da7ef0 4831 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
4832 if (r < 0)
4833 return r;
b0067625 4834
de54e02d 4835 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
4836 if (r < 0)
4837 return r;
4838
4839 /* Notify the child that the parent is ready with all
4840 * its setup (including cgroup-ification), and that
4841 * the child can now hand over control to the code to
4842 * run inside the container. */
75116558 4843 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
4844
4845 /* Block SIGCHLD here, before notifying child.
4846 * process_pty() will handle it with the other signals. */
4847 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4848
4849 /* Reset signal to default */
4850 r = default_signals(SIGCHLD, -1);
4851 if (r < 0)
4852 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4853
4854 r = sd_event_new(&event);
4855 if (r < 0)
4856 return log_error_errno(r, "Failed to get default event source: %m");
4857
8fd010bb
LP
4858 (void) sd_event_set_watchdog(event, true);
4859
abdb9b08
LP
4860 if (bus) {
4861 r = sd_bus_attach_event(bus, event, 0);
4862 if (r < 0)
4863 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4864 }
4865
e96ceaba 4866 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
4867 if (r < 0)
4868 return r;
4869
4870 /* Let the child know that we are ready and wait that the child is completely ready now. */
c6147113
LP
4871 if (!barrier_place_and_sync(&barrier)) /* #5 */
4872 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 4873
38ccb557 4874 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
b0067625
ZJS
4875 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4876 etc_passwd_lock = safe_close(etc_passwd_lock);
4877
04f590a4
LP
4878 (void) sd_notifyf(false,
4879 "STATUS=Container running.\n"
4880 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
b0067625 4881 if (!arg_notify_ready)
919f5ae0 4882 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
4883
4884 if (arg_kill_signal > 0) {
4885 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
4886 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4887 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
4888 } else {
4889 /* Immediately exit */
919f5ae0
LP
4890 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4891 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
4892 }
4893
6916b164 4894 /* Exit when the child exits */
919f5ae0 4895 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
4896
4897 if (arg_expose_ports) {
4898 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4899 if (r < 0)
4900 return r;
4901
4902 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4903 }
4904
4905 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4906
3acc84eb
FB
4907 if (arg_console_mode != CONSOLE_PIPE) {
4908 _cleanup_close_ int fd = -1;
4909 PTYForwardFlags flags = 0;
de40a303 4910
3acc84eb
FB
4911 /* Retrieve the master pty allocated by inner child */
4912 fd = receive_one_fd(master_pty_socket_pair[0], 0);
4913 if (fd < 0)
4914 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
4915
4916 switch (arg_console_mode) {
de40a303 4917
3acc84eb
FB
4918 case CONSOLE_READ_ONLY:
4919 flags |= PTY_FORWARD_READ_ONLY;
4920
4921 _fallthrough_;
4922
4923 case CONSOLE_INTERACTIVE:
4924 flags |= PTY_FORWARD_IGNORE_VHANGUP;
4925
4926 r = pty_forward_new(event, fd, flags, &forward);
4927 if (r < 0)
4928 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4929
4930 if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
4931 (void) pty_forward_set_width_height(forward,
4932 arg_console_width,
4933 arg_console_height);
4934 break;
4935
4936 default:
4937 assert(arg_console_mode == CONSOLE_PASSIVE);
4938 }
4939
4940 *master = TAKE_FD(fd);
de40a303 4941 }
b0067625
ZJS
4942
4943 r = sd_event_loop(event);
4944 if (r < 0)
4945 return log_error_errno(r, "Failed to run event loop: %m");
4946
de40a303
LP
4947 if (forward) {
4948 char last_char = 0;
b0067625 4949
de40a303
LP
4950 (void) pty_forward_get_last_char(forward, &last_char);
4951 forward = pty_forward_free(forward);
b0067625 4952
de40a303
LP
4953 if (!arg_quiet && last_char != '\n')
4954 putc('\n', stdout);
4955 }
b0067625
ZJS
4956
4957 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
4958 if (!arg_register && !arg_keep_unit && bus)
4959 terminate_scope(bus, arg_machine);
b0067625
ZJS
4960
4961 /* Normally redundant, but better safe than sorry */
c67b0082 4962 (void) kill(*pid, SIGKILL);
b0067625 4963
5b4855ab
DDM
4964 if (arg_private_network) {
4965 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
4966 * to avoid having to move the parent to the child network namespace. */
4967 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
4968 if (r < 0)
4969 return r;
4970
4971 if (r == 0) {
4972 _cleanup_close_ int parent_netns_fd = -1;
4973
4974 r = namespace_open(getpid(), NULL, NULL, &parent_netns_fd, NULL, NULL);
4975 if (r < 0) {
4976 log_error_errno(r, "Failed to open parent network namespace: %m");
4977 _exit(EXIT_FAILURE);
4978 }
4979
4980 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
4981 if (r < 0) {
4982 log_error_errno(r, "Failed to enter child network namespace: %m");
4983 _exit(EXIT_FAILURE);
4984 }
4985
4986 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
4987 if (r < 0)
4988 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
4989
4990 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
4991 }
4992 }
4993
b0067625
ZJS
4994 r = wait_for_container(*pid, &container_status);
4995 *pid = 0;
4996
0bb0a9fa
ZJS
4997 /* Tell machined that we are gone. */
4998 if (bus)
4999 (void) unregister_machine(bus, arg_machine);
5000
b0067625
ZJS
5001 if (r < 0)
5002 /* We failed to wait for the container, or the container exited abnormally. */
5003 return r;
5004 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
5005 /* r > 0 → The container exited with a non-zero status.
5006 * As a special case, we need to replace 133 with a different value,
5007 * because 133 is special-cased in the service file to reboot the container.
5008 * otherwise → The container exited with zero status and a reboot was not requested.
5009 */
2a49b612 5010 if (r == EXIT_FORCE_RESTART)
27e29a1e 5011 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 5012 *ret = r;
b0067625
ZJS
5013 return 0; /* finito */
5014 }
5015
5016 /* CONTAINER_REBOOTED, loop again */
5017
5018 if (arg_keep_unit) {
5019 /* Special handling if we are running as a service: instead of simply
5020 * restarting the machine we want to restart the entire service, so let's
5021 * inform systemd about this with the special exit code 133. The service
5022 * file uses RestartForceExitStatus=133 so that this results in a full
5023 * nspawn restart. This is necessary since we might have cgroup parameters
5024 * set we want to have flushed out. */
2a49b612
ZJS
5025 *ret = EXIT_FORCE_RESTART;
5026 return 0; /* finito */
b0067625
ZJS
5027 }
5028
5029 expose_port_flush(arg_expose_ports, exposed);
5030
5031 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5032 *veth_created = false;
5033 return 1; /* loop again */
5034}
5035
bf428efb 5036static int initialize_rlimits(void) {
bf428efb
LP
5037 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
5038 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5039 * container execution environments. */
5040
5041 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
5042 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5043 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5044 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5045 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5046 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5047 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5048 [RLIMIT_MEMLOCK] = { 65536, 65536 },
5049 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5050 [RLIMIT_NICE] = { 0, 0 },
5051 [RLIMIT_NOFILE] = { 1024, 4096 },
5052 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5053 [RLIMIT_RTPRIO] = { 0, 0 },
5054 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5055 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
5056
5057 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5058 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5059 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5060 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5061 * that PID 1 changes a number of other resource limits during early initialization which is why we
5062 * don't read the other limits from PID 1 but prefer the static table above. */
5063 };
5064
5065 int rl;
5066
5067 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
5068 /* Let's only fill in what the user hasn't explicitly configured anyway */
5069 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5070 const struct rlimit *v;
5071 struct rlimit buffer;
5072
5073 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5074 /* For these two let's read the limits off PID 1. See above for an explanation. */
5075
5076 if (prlimit(1, rl, NULL, &buffer) < 0)
5077 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5078
5079 v = &buffer;
5080 } else
5081 v = kernel_defaults + rl;
5082
5083 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5084 if (!arg_rlimit[rl])
5085 return log_oom();
5086 }
5087
5088 if (DEBUG_LOGGING) {
5089 _cleanup_free_ char *k = NULL;
5090
5091 (void) rlimit_format(arg_rlimit[rl], &k);
5092 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5093 }
5094 }
5095
5096 return 0;
5097}
5098
287b7376
LP
5099static int cant_be_in_netns(void) {
5100 union sockaddr_union sa = {
5101 .un = {
5102 .sun_family = AF_UNIX,
5103 .sun_path = "/run/udev/control",
5104 },
5105 };
5106 char udev_path[STRLEN("/proc//ns/net") + DECIMAL_STR_MAX(pid_t)];
5107 _cleanup_free_ char *udev_ns = NULL, *our_ns = NULL;
5108 _cleanup_close_ int fd = -1;
5109 struct ucred ucred;
5110 int r;
5111
5112 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5113 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5114 * nice message. */
5115
5116 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5117 return 0;
5118
5119 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5120 if (fd < 0)
5121 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5122
5123 if (connect(fd, &sa.un, SOCKADDR_UN_LEN(sa.un)) < 0) {
5124
5125 if (errno == ENOENT || ERRNO_IS_DISCONNECT(errno))
5126 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5127 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5128
5129 return log_error_errno(errno, "Failed to connect socket to udev control socket: %m");
5130 }
5131
5132 r = getpeercred(fd, &ucred);
5133 if (r < 0)
5134 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5135
5136 xsprintf(udev_path, "/proc/" PID_FMT "/ns/net", ucred.pid);
5137 r = readlink_malloc(udev_path, &udev_ns);
5138 if (r < 0)
5139 return log_error_errno(r, "Failed to read network namespace of udev: %m");
5140
5141 r = readlink_malloc("/proc/self/ns/net", &our_ns);
5142 if (r < 0)
5143 return log_error_errno(r, "Failed to read our own network namespace: %m");
5144
5145 if (!streq(our_ns, udev_ns))
5146 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5147 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5148 return 0;
5149}
5150
/* Program logic behind main(): parses the command line, runs the early checks, prepares the container's root
 * (either a directory tree or a disk image attached to a loopback device), calls run_container() in a loop
 * and finally cleans up everything that was set up here. Returns r if it is negative, and otherwise the exit
 * status collected in 'ret' (initialized to EXIT_SUCCESS). */
44dbef90 5151static int run(int argc, char *argv[]) {
7bf011e3
LP
5152 bool secondary = false, remove_directory = false, remove_image = false,
5153 veth_created = false, remove_tmprootdir = false;
2d845785 5154 _cleanup_close_ int master = -1;
03cfe0d5 5155 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 5156 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 5157 char veth_name[IFNAMSIZ] = "";
03cfe0d5 5158 union in_addr_union exposed = {};
8e766630 5159 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082 5160 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 5161 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
5162 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
5163 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
7bf011e3 5164 pid_t pid = 0;
03cfe0d5
LP
5165
5166 log_parse_environment();
5167 log_open();
415fc41c 5168
03cfe0d5
LP
/* Both an error (r < 0) and r == 0 skip straight to the cleanup section; in the latter case the function
 * ends up returning 'ret', which still is EXIT_SUCCESS at this point. */
5169 r = parse_argv(argc, argv);
5170 if (r <= 0)
5171 goto finish;
5172
38ee19c0
ZJS
5173 if (geteuid() != 0) {
5174 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5175 argc >= 2 ? "Need to be root." :
5176 "Need to be root (and some arguments are usually required).\nHint: try --help");
03cfe0d5 5177 goto finish;
38ee19c0 5178 }
fba868fa 5179
287b7376
LP
/* Bail out early if --image= cannot work in the network namespace we are running in. */
5180 r = cant_be_in_netns();
5181 if (r < 0)
5182 goto finish;
5183
bf428efb
LP
5184 r = initialize_rlimits();
5185 if (r < 0)
5186 goto finish;
5187
de40a303
LP
5188 r = load_oci_bundle();
5189 if (r < 0)
5190 goto finish;
5191
f757855e
LP
5192 r = determine_names();
5193 if (r < 0)
5194 goto finish;
5195
5196 r = load_settings();
5197 if (r < 0)
5198 goto finish;
5199
d4d99bc6 5200 r = cg_unified();
5eee8290
LP
5201 if (r < 0) {
5202 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5203 goto finish;
5204 }
5205
f757855e
LP
5206 r = verify_arguments();
5207 if (r < 0)
5208 goto finish;
03cfe0d5 5209
49048684
ZJS
5210 /* Reapply environment settings. */
5211 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 5212
2949ff26
LP
5213 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5214 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5215 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5216 (void) ignore_signals(SIGPIPE, -1);
5217
03cfe0d5
LP
/* Collect the file descriptors passed to us into 'fds', which is later handed to run_container(). */
5218 n_fd_passed = sd_listen_fds(false);
5219 if (n_fd_passed > 0) {
5220 r = fdset_new_listen_fds(&fds, false);
5221 if (r < 0) {
5222 log_error_errno(r, "Failed to collect file descriptors: %m");
5223 goto finish;
5224 }
5225 }
5226
83e803a9
ZJS
5227 /* The "default" umask. This is appropriate for most file and directory
5228 * operations performed by nspawn, and is the umask that will be used for
5229 * the child. Functions like copy_devnodes() change the umask temporarily. */
5230 umask(0022);
5231
03cfe0d5
LP
/* Case 1: the container's root is a directory tree (arg_directory). */
5232 if (arg_directory) {
5233 assert(!arg_image);
5234
b35ca61a
LP
5235 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5236 * /var from the host will propagate into container dynamically (because bad things happen if
5237 * two systems write to the same /var). Let's allow it for the special cases where /var is
5238 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5239 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5240 log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
5241 r = -EINVAL;
5242 goto finish;
5243 }
5244
5245 if (arg_ephemeral) {
5246 _cleanup_free_ char *np = NULL;
5247
8d4aa2bb 5248 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
5249 if (r < 0)
5250 goto finish;
5251
7bf011e3
LP
5252 /* If the specified path is a mount point we generate the new snapshot immediately
5253 * inside it under a random name. However if the specified is not a mount point we
5254 * create the new snapshot in the parent directory, just next to it. */
e1873695 5255 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
5256 if (r < 0) {
5257 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5258 goto finish;
5259 }
5260 if (r > 0)
770b5ce4 5261 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 5262 else
770b5ce4 5263 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 5264 if (r < 0) {
0f3be6ca 5265 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
5266 goto finish;
5267 }
5268
6992459c 5269 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
162392b7 5270 * only owned by us and no one else. */
6992459c 5271 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
5272 if (r < 0) {
5273 log_error_errno(r, "Failed to lock %s: %m", np);
5274 goto finish;
5275 }
5276
7bf011e3
LP
/* The extra braces limit BLOCK_SIGNALS(SIGINT) to the snapshot operation itself. */
5277 {
5278 BLOCK_SIGNALS(SIGINT);
5279 r = btrfs_subvol_snapshot(arg_directory, np,
5280 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5281 BTRFS_SNAPSHOT_FALLBACK_COPY |
5282 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5283 BTRFS_SNAPSHOT_RECURSIVE |
5284 BTRFS_SNAPSHOT_QUOTA |
5285 BTRFS_SNAPSHOT_SIGINT);
5286 }
5287 if (r == -EINTR) {
5288 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5289 goto finish;
5290 }
03cfe0d5
LP
5291 if (r < 0) {
5292 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5293 goto finish;
ec16945e
LP
5294 }
5295
/* From here on we operate on the snapshot, and remember to remove it again at 'finish'. */
1cc6c93a 5296 free_and_replace(arg_directory, np);
17cbb288 5297 remove_directory = true;
30535c16 5298 } else {
cb638b5e 5299 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
5300 if (r < 0)
5301 goto finish;
5302
30535c16
LP
/* A shared lock is requested for read-only operation, an exclusive one otherwise. */
5303 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5304 if (r == -EBUSY) {
5305 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5306 goto finish;
5307 }
5308 if (r < 0) {
5309 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 5310 goto finish;
30535c16
LP
5311 }
5312
5313 if (arg_template) {
8d4aa2bb 5314 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
5315 if (r < 0)
5316 goto finish;
5317
7bf011e3
LP
5318 {
5319 BLOCK_SIGNALS(SIGINT);
5320 r = btrfs_subvol_snapshot(arg_template, arg_directory,
5321 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5322 BTRFS_SNAPSHOT_FALLBACK_COPY |
5323 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5324 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5325 BTRFS_SNAPSHOT_RECURSIVE |
5326 BTRFS_SNAPSHOT_QUOTA |
5327 BTRFS_SNAPSHOT_SIGINT);
5328 }
ff6c6cc1
LP
/* An already existing target directory is not an error: it is used as it is. */
5329 if (r == -EEXIST)
5330 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5331 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
5332 else if (r == -EINTR) {
5333 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5334 goto finish;
5335 } else if (r < 0) {
83521414 5336 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 5337 goto finish;
ff6c6cc1
LP
5338 } else
5339 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5340 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 5341 }
ec16945e
LP
5342 }
5343
/* Sanity-check the tree: when booting it has to pass path_is_os_tree(), otherwise it at least has to
 * contain /usr/. If arg_pivot_root_new is set the check is applied to that path below arg_directory. */
7732f92b 5344 if (arg_start_mode == START_BOOT) {
a5201ed6 5345 const char *p;
c9fe05e0 5346
a5201ed6
LP
5347 if (arg_pivot_root_new)
5348 p = prefix_roota(arg_directory, arg_pivot_root_new);
5349 else
5350 p = arg_directory;
c9fe05e0
AR
5351
5352 if (path_is_os_tree(p) <= 0) {
5353 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 5354 r = -EINVAL;
1b9e5b12
LP
5355 goto finish;
5356 }
5357 } else {
c9fe05e0
AR
5358 const char *p, *q;
5359
a5201ed6
LP
5360 if (arg_pivot_root_new)
5361 p = prefix_roota(arg_directory, arg_pivot_root_new);
5362 else
5363 p = arg_directory;
c9fe05e0
AR
5364
5365 q = strjoina(p, "/usr/");
1b9e5b12 5366
c9fe05e0
AR
5367 if (laccess(q, F_OK) < 0) {
5368 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 5369 r = -EINVAL;
1b9e5b12 5370 goto finish;
1b9e5b12
LP
5371 }
5372 }
ec16945e 5373
6b9132a9 5374 } else {
/* Case 2: the container's root comes from a disk image (arg_image), which is attached to a loopback
 * block device and dissected below. */
e7cbe5cb 5375 DissectImageFlags dissect_image_flags = DISSECT_IMAGE_REQUIRE_ROOT | DISSECT_IMAGE_RELAX_VAR_CHECK;
ec16945e
LP
5376 assert(arg_image);
5377 assert(!arg_template);
5378
8d4aa2bb 5379 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
5380 if (r < 0)
5381 goto finish;
5382
0f3be6ca
LP
5383 if (arg_ephemeral) {
5384 _cleanup_free_ char *np = NULL;
5385
5386 r = tempfn_random(arg_image, "machine.", &np);
5387 if (r < 0) {
5388 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5389 goto finish;
5390 }
5391
6992459c
LP
5392 /* Always take an exclusive lock on our own ephemeral copy. */
5393 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
5394 if (r < 0) {
5395 r = log_error_errno(r, "Failed to create image lock: %m");
5396 goto finish;
5397 }
5398
7bf011e3
LP
5399 {
5400 BLOCK_SIGNALS(SIGINT);
5401 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
5402 }
5403 if (r == -EINTR) {
5404 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5405 goto finish;
5406 }
0f3be6ca
LP
5407 if (r < 0) {
5408 r = log_error_errno(r, "Failed to copy image file: %m");
5409 goto finish;
5410 }
5411
/* From here on we operate on the copy, and remember to remove it again. */
1cc6c93a 5412 free_and_replace(arg_image, np);
0f3be6ca
LP
5413 remove_image = true;
5414 } else {
5415 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5416 if (r == -EBUSY) {
5417 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5418 goto finish;
5419 }
5420 if (r < 0) {
5421 r = log_error_errno(r, "Failed to create image lock: %m");
5422 goto finish;
5423 }
4623e8e6 5424
89e62e0b
LP
5425 r = verity_settings_load(
5426 &arg_verity_settings,
5427 arg_image, NULL, NULL);
e7cbe5cb
LB
5428 if (r < 0) {
5429 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5430 goto finish;
78ebe980 5431 }
89e62e0b
LP
5432
/* If the verity settings carry a data path, dissect the image without expecting a partition table. */
5433 if (arg_verity_settings.data_path)
5434 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
30535c16
LP
5435 }
5436
/* Create a temporary directory and use it as arg_directory in the image case; it is removed again at
 * 'finish' if remove_tmprootdir is set. */
c67b0082 5437 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5438 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5439 goto finish;
1b9e5b12 5440 }
6b9132a9 5441
c67b0082
LP
5442 remove_tmprootdir = true;
5443
5444 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5445 if (!arg_directory) {
5446 r = log_oom();
5447 goto finish;
6b9132a9 5448 }
88213476 5449
89e62e0b
LP
/* Attach the image to a loopback block device, opened read-only or read-write depending on
 * arg_read_only. Partition scanning is requested unless DISSECT_IMAGE_NO_PARTITION_TABLE is set. */
5450 r = loop_device_make_by_path(
5451 arg_image,
5452 arg_read_only ? O_RDONLY : O_RDWR,
5453 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
5454 &loop);
2d845785
LP
5455 if (r < 0) {
5456 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5457 goto finish;
5458 }
1b9e5b12 5459
4526113f 5460 r = dissect_image_and_warn(
e0f9e7bd 5461 loop->fd,
4526113f 5462 arg_image,
89e62e0b 5463 &arg_verity_settings,
18d73705 5464 NULL,
e7cbe5cb 5465 dissect_image_flags,
e0f9e7bd 5466 &dissected_image);
2d845785 5467 if (r == -ENOPKG) {
4526113f 5468 /* dissect_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5469 log_notice("Note that the disk image needs to\n"
5470 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5471 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
19ac32cd 5472 " c) or follow https://systemd.io/DISCOVERABLE_PARTITIONS\n"
2d845785
LP
5473 " d) or contain a file system without a partition table\n"
5474 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5475 goto finish;
2d845785 5476 }
4526113f 5477 if (r < 0)
842f3b0f 5478 goto finish;
1b9e5b12 5479
89e62e0b 5480 if (!arg_verity_settings.root_hash && dissected_image->can_verity)
4623e8e6
LP
5481 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
5482
89e62e0b
LP
5483 r = dissected_image_decrypt_interactively(
5484 dissected_image,
5485 NULL,
5486 &arg_verity_settings,
5487 0,
5488 &decrypted_image);
1b9e5b12
LP
5489 if (r < 0)
5490 goto finish;
0f3be6ca
LP
5491
5492 /* Now that the image is attached to the loopback device and dissected, let's try to remove it again, if it is ephemeral */
5493 if (remove_image && unlink(arg_image) >= 0)
5494 remove_image = false;
842f3b0f 5495 }
842f3b0f 5496
86c0dd4a 5497 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5498 if (r < 0)
5499 goto finish;
5500
de40a303
LP
/* If no console mode was configured (negative value), pick the interactive mode if both stdin and stdout
 * are TTYs, and the read-only mode otherwise. */
5501 if (arg_console_mode < 0)
5502 arg_console_mode =
5503 isatty(STDIN_FILENO) > 0 &&
5504 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5505
de40a303
LP
5506 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5507 arg_quiet = true;
a258bf26 5508
9c857b9d
LP
5509 if (!arg_quiet)
5510 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5511 arg_machine, arg_image ?: arg_directory);
5512
72c0a2c2 5513 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 5514
/* NOTE(review): presumably we become a subreaper so that orphaned container processes are reparented to
 * us rather than to the host's PID 1 — confirm. */
66edd963 5515 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
5516 r = log_error_errno(errno, "Failed to become subreaper: %m");
5517 goto finish;
5518 }
5519
/* Call run_container() again and again for as long as it returns a positive value; zero or an error ends
 * the loop. */
d87be9b0 5520 for (;;) {
3acc84eb 5521 r = run_container(dissected_image,
44dbef90
LP
5522 secondary,
5523 fds,
5524 veth_name, &veth_created,
3acc84eb 5525 &exposed, &master,
44dbef90 5526 &pid, &ret);
b0067625 5527 if (r <= 0)
d87be9b0 5528 break;
d87be9b0 5529 }
88213476
LP
5530
5531finish:
04f590a4
LP
5532 (void) sd_notify(false,
5533 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5534 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5535
/* If a container process is still recorded in 'pid', kill it now; it is waited for further down, after
 * the pty has been flushed. */
9444b1f2 5536 if (pid > 0)
c67b0082 5537 (void) kill(pid, SIGKILL);
88213476 5538
503546da 5539 /* Try to flush whatever is still queued in the pty */
6a0f896b 5540 if (master >= 0) {
1c876927 5541 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
5542 master = safe_close(master);
5543 }
5544
5545 if (pid > 0)
5546 (void) wait_for_terminate(pid, NULL);
503546da 5547
50ebcf6c
LP
5548 pager_close();
5549
/* Remove the ephemeral snapshot of the directory tree, if we created one. Failures are only warned about. */
17cbb288 5550 if (remove_directory && arg_directory) {
ec16945e
LP
5551 int k;
5552
17cbb288 5553 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5554 if (k < 0)
17cbb288 5555 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5556 }
5557
0f3be6ca
LP
/* Remove the ephemeral copy of the image, unless it was already unlinked right after it was set up. */
5558 if (remove_image && arg_image) {
5559 if (unlink(arg_image) < 0)
5560 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5561 }
5562
c67b0082
LP
5563 if (remove_tmprootdir) {
5564 if (rmdir(tmprootdir) < 0)
5565 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5566 }
5567
785890ac
LP
5568 if (arg_machine) {
5569 const char *p;
5570
63c372cb 5571 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5572 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5573 }
5574
7a8f6325 5575 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
5576
5577 if (veth_created)
5578 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5579 (void) remove_bridge(arg_network_zone);
f757855e 5580
f757855e
LP
/* Release the data structures that were built up from the command line and the settings. */
5581 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5582 expose_port_free_all(arg_expose_ports);
bf428efb 5583 rlimit_free_all(arg_rlimit);
b2645747 5584 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3652872a 5585 credential_free_all(arg_credentials, arg_n_credentials);
6d0b55c2 5586
44dbef90
LP
/* Errors take precedence; otherwise report the exit status stored in 'ret'. */
5587 if (r < 0)
5588 return r;
5589
5590 return ret;
88213476 5591}
44dbef90
LP
5592
/* Hands run() to the main-function helper macro. Judging by the macro's name, positive return values of run()
 * are treated as failure exit codes — TODO confirm against main-func.h. */
5593DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);