]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
pidref: add trivial helper pidref_set_self() to set pidref to our handle to our own...
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
335d2ead 14#include <sys/ioctl.h>
8fe0087e
LP
15#include <sys/personality.h>
16#include <sys/prctl.h>
17#include <sys/types.h>
6916b164 18#include <sys/wait.h>
335d2ead 19#include <termios.h>
8fe0087e 20#include <unistd.h>
1b9e5b12 21
b053cd5f 22#include "sd-bus.h"
1f0cd86b 23#include "sd-daemon.h"
1f0cd86b 24#include "sd-id128.h"
8fe0087e 25
b5efdb8a 26#include "alloc-util.h"
8fe0087e
LP
27#include "barrier.h"
28#include "base-filesystem.h"
29#include "blkid-util.h"
30#include "btrfs-util.h"
d6b4d1c7 31#include "build.h"
b8ea7a6e 32#include "bus-error.h"
7f8a85e6 33#include "bus-locator.h"
b053cd5f 34#include "bus-util.h"
8fe0087e 35#include "cap-list.h"
430f0182 36#include "capability-util.h"
04d391da 37#include "cgroup-util.h"
f461a28d 38#include "chase.h"
988851b6 39#include "common-signal.h"
8fe0087e 40#include "copy.h"
d107bb7d 41#include "cpu-set-util.h"
786d19fd 42#include "creds-util.h"
4fc9982c 43#include "dev-setup.h"
57f1b61b 44#include "discover-image.h"
2d845785 45#include "dissect-image.h"
8fe0087e 46#include "env-util.h"
3652872a 47#include "escape.h"
3ffd4af2 48#include "fd-util.h"
842f3b0f 49#include "fdset.h"
a5c32cff 50#include "fileio.h"
f97b34a6 51#include "format-util.h"
f4f15635 52#include "fs-util.h"
1b9e5b12 53#include "gpt.h"
4623e8e6 54#include "hexdecoct.h"
e2054217 55#include "hostname-setup.h"
8fe0087e 56#include "hostname-util.h"
910fd145 57#include "id128-util.h"
3652872a 58#include "io-util.h"
8fe0087e 59#include "log.h"
2d845785 60#include "loop-util.h"
8fe0087e 61#include "loopback-setup.h"
8fe0087e 62#include "macro.h"
44dbef90 63#include "main-func.h"
f5947a5e 64#include "missing_sched.h"
8fe0087e 65#include "mkdir.h"
4349cd7c 66#include "mount-util.h"
049af8ad 67#include "mountpoint-util.h"
0cb8e3d1 68#include "namespace-util.h"
8fe0087e 69#include "netlink-util.h"
2f893044 70#include "nspawn-bind-user.h"
07630cea 71#include "nspawn-cgroup.h"
3652872a 72#include "nspawn-creds.h"
3603efde 73#include "nspawn-def.h"
07630cea
LP
74#include "nspawn-expose-ports.h"
75#include "nspawn-mount.h"
76#include "nspawn-network.h"
de40a303 77#include "nspawn-oci.h"
7336138e 78#include "nspawn-patch-uid.h"
07630cea 79#include "nspawn-register.h"
910fd145 80#include "nspawn-seccomp.h"
07630cea
LP
81#include "nspawn-settings.h"
82#include "nspawn-setuid.h"
7732f92b 83#include "nspawn-stub-pid1.h"
c9394f4f 84#include "nspawn-util.h"
91181e07 85#include "nspawn.h"
d8b4d14d 86#include "nulstr-util.h"
d58ad743 87#include "os-util.h"
50ebcf6c 88#include "pager.h"
614b022c 89#include "parse-argument.h"
6bedfcbb 90#include "parse-util.h"
294bf0c3 91#include "pretty-print.h"
0b452006 92#include "process-util.h"
8fe0087e
LP
93#include "ptyfwd.h"
94#include "random-util.h"
8869a0b4 95#include "raw-clone.h"
86775e35 96#include "resolve-util.h"
bf428efb 97#include "rlimit-util.h"
8fe0087e 98#include "rm-rf.h"
de40a303 99#include "seccomp-util.h"
68b02049 100#include "selinux-util.h"
8fe0087e 101#include "signal-util.h"
2583fbea 102#include "socket-util.h"
8fcde012 103#include "stat-util.h"
15a5e950 104#include "stdio-util.h"
5c828e66 105#include "string-table.h"
07630cea 106#include "string-util.h"
8fe0087e 107#include "strv.h"
de40a303 108#include "sysctl-util.h"
8fe0087e 109#include "terminal-util.h"
e4de7287 110#include "tmpfile-util.h"
affb60b1 111#include "umask-util.h"
43c3fb46 112#include "unit-name.h"
b1d4f8e1 113#include "user-util.h"
e9642be2 114
e96ceaba
LP
115/* The notify socket inside the container that the payload can use to talk to nspawn via the sd_notify(3) protocol */
116#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
/* NOTE(review): the users of this path are outside this chunk; presumably a directory below /run/host used
 * to hand mounts into a running container — confirm against the callers. */
e79581dd 117#define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"
0e7ac751 118
2a49b612
ZJS
/* NOTE(review): presumably a process exit code that asks for the container to be restarted — confirm
 * against the code that checks for it. */
119#define EXIT_FORCE_RESTART 133
120
113cea80
DH
/* How a container run ended. NOTE(review): the code consuming these values is outside this chunk; going by
 * the names, "terminated" means the payload exited for good and "rebooted" means it asked to be started
 * again — confirm against the callers. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
125
88213476 126static char *arg_directory = NULL;
ec16945e 127static char *arg_template = NULL;
5f932eb9 128static char *arg_chdir = NULL;
b53ede69
PW
129static char *arg_pivot_root_new = NULL;
130static char *arg_pivot_root_old = NULL;
687d0825 131static char *arg_user = NULL;
de40a303
LP
132static uid_t arg_uid = UID_INVALID;
133static gid_t arg_gid = GID_INVALID;
134static gid_t* arg_supplementary_gids = NULL;
135static size_t arg_n_supplementary_gids = 0;
9444b1f2 136static sd_id128_t arg_uuid = {};
3a9530e5
LP
137static char *arg_machine = NULL; /* The name used by the host to refer to this */
138static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
139static const char *arg_selinux_context = NULL;
140static const char *arg_selinux_apifs_context = NULL;
de40a303 141static char *arg_slice = NULL;
ff01d048 142static bool arg_private_network = false;
bc2f673e 143static bool arg_read_only = false;
7732f92b 144static StartMode arg_start_mode = START_PID1;
ec16945e 145static bool arg_ephemeral = false;
57fb9fb5 146static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 147static bool arg_link_journal_try = false;
520e0d54 148static uint64_t arg_caps_retain =
50b52222
LP
149 (1ULL << CAP_AUDIT_CONTROL) |
150 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
151 (1ULL << CAP_CHOWN) |
152 (1ULL << CAP_DAC_OVERRIDE) |
153 (1ULL << CAP_DAC_READ_SEARCH) |
154 (1ULL << CAP_FOWNER) |
155 (1ULL << CAP_FSETID) |
156 (1ULL << CAP_IPC_OWNER) |
157 (1ULL << CAP_KILL) |
158 (1ULL << CAP_LEASE) |
159 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 160 (1ULL << CAP_MKNOD) |
5076f0cc
LP
161 (1ULL << CAP_NET_BIND_SERVICE) |
162 (1ULL << CAP_NET_BROADCAST) |
163 (1ULL << CAP_NET_RAW) |
5076f0cc 164 (1ULL << CAP_SETFCAP) |
50b52222 165 (1ULL << CAP_SETGID) |
5076f0cc
LP
166 (1ULL << CAP_SETPCAP) |
167 (1ULL << CAP_SETUID) |
168 (1ULL << CAP_SYS_ADMIN) |
50b52222 169 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
170 (1ULL << CAP_SYS_CHROOT) |
171 (1ULL << CAP_SYS_NICE) |
172 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 173 (1ULL << CAP_SYS_RESOURCE) |
50b52222 174 (1ULL << CAP_SYS_TTY_CONFIG);
88fc9c9b 175static uint64_t arg_caps_ambient = 0;
de40a303 176static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
5a8af538 177static CustomMount *arg_custom_mounts = NULL;
88614c8a 178static size_t arg_n_custom_mounts = 0;
f4889f65 179static char **arg_setenv = NULL;
284c0b91 180static bool arg_quiet = false;
eb91eb18 181static bool arg_register = true;
89f7c846 182static bool arg_keep_unit = false;
aa28aefe 183static char **arg_network_interfaces = NULL;
c74e630d 184static char **arg_network_macvlan = NULL;
4bbfe7ad 185static char **arg_network_ipvlan = NULL;
69c79d3c 186static bool arg_network_veth = false;
f6d6bad1 187static char **arg_network_veth_extra = NULL;
f757855e 188static char *arg_network_bridge = NULL;
22b28dfd 189static char *arg_network_zone = NULL;
d7bea6b6 190static char *arg_network_namespace_path = NULL;
bb068de0 191static PagerFlags arg_pager_flags = 0;
050f7277 192static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 193static char *arg_image = NULL;
de40a303 194static char *arg_oci_bundle = NULL;
f757855e 195static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 196static ExposePort *arg_expose_ports = NULL;
f36933fe 197static char **arg_property = NULL;
de40a303 198static sd_bus_message *arg_property_message = NULL;
0de7acce 199static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 200static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
6c045a99 201static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
c6c8f6e2 202static int arg_kill_signal = 0;
5da38d07 203static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
204static SettingsMask arg_settings_mask = 0;
205static int arg_settings_trusted = -1;
206static char **arg_parameters = NULL;
6aadfa4c 207static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 208static bool arg_notify_ready = false;
5a8ff0e6 209static bool arg_use_cgns = true;
0c582db0 210static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 211static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
aee36b4e 212static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
6b000af4
LP
213static char **arg_syscall_allow_list = NULL;
214static char **arg_syscall_deny_list = NULL;
de40a303
LP
215#if HAVE_SECCOMP
216static scmp_filter_ctx arg_seccomp = NULL;
217#endif
bf428efb 218static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 219static bool arg_no_new_privileges = false;
81f345df
LP
220static int arg_oom_score_adjust = 0;
221static bool arg_oom_score_adjust_set = false;
0985c7c4 222static CPUSet arg_cpu_set = {};
09d423e9 223static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 224static TimezoneMode arg_timezone = TIMEZONE_AUTO;
f5fbe71d 225static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
de40a303
LP
226static DeviceNode* arg_extra_nodes = NULL;
227static size_t arg_n_extra_nodes = 0;
228static char **arg_sysctl = NULL;
229static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
3652872a
LP
230static Credential *arg_credentials = NULL;
231static size_t arg_n_credentials = 0;
2f893044 232static char **arg_bind_user = NULL;
4a4654e0 233static bool arg_suppress_sync = false;
3603f151 234static char *arg_settings_filename = NULL;
4c27749b 235static Architecture arg_architecture = _ARCHITECTURE_INVALID;
84be0c71 236static ImagePolicy *arg_image_policy = NULL;
88213476 237
6145bb4f
LP
/* Pair each owning global with the function that releases it. NOTE(review): the macro is defined outside
 * this file; presumably the registered function is invoked on the variable when the program exits — confirm
 * against its definition. The order of the registrations is kept as it was. */

/* Plain heap strings and arrays */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);

/* Environment and networking */
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);

/* Image and unit properties */
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);

/* System call filtering */
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif

/* Everything else */
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
6145bb4f 272
dce66ffe
ZJS
273static int handle_arg_console(const char *arg) {
274 if (streq(arg, "help")) {
10e8a60b
LP
275 puts("autopipe\n"
276 "interactive\n"
dce66ffe 277 "passive\n"
10e8a60b
LP
278 "pipe\n"
279 "read-only");
dce66ffe
ZJS
280 return 0;
281 }
282
283 if (streq(arg, "interactive"))
284 arg_console_mode = CONSOLE_INTERACTIVE;
285 else if (streq(arg, "read-only"))
286 arg_console_mode = CONSOLE_READ_ONLY;
287 else if (streq(arg, "passive"))
288 arg_console_mode = CONSOLE_PASSIVE;
554c4beb
LP
289 else if (streq(arg, "pipe")) {
290 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
291 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
292 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
293 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
294 "Proceeding anyway.");
295
dce66ffe 296 arg_console_mode = CONSOLE_PIPE;
10e8a60b
LP
297 } else if (streq(arg, "autopipe")) {
298 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
299 arg_console_mode = CONSOLE_INTERACTIVE;
300 else
301 arg_console_mode = CONSOLE_PIPE;
554c4beb 302 } else
dce66ffe
ZJS
303 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
304
305 arg_settings_mask |= SETTING_CONSOLE_MODE;
306 return 1;
307}
308
37ec0fdd
LP
309static int help(void) {
310 _cleanup_free_ char *link = NULL;
311 int r;
312
384c2c32 313 pager_open(arg_pager_flags);
50ebcf6c 314
37ec0fdd
LP
315 r = terminal_urlify_man("systemd-nspawn", "1", &link);
316 if (r < 0)
317 return log_oom();
318
25148653 319 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 320 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
321 " -h --help Show this help\n"
322 " --version Print version string\n"
69c79d3c 323 " -q --quiet Do not show status information\n"
bb068de0 324 " --no-pager Do not pipe output into a pager\n"
25148653
LP
325 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
326 "%3$sImage:%4$s\n"
1b9e5b12 327 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
328 " --template=PATH Initialize root directory from template directory,\n"
329 " if missing\n"
330 " -x --ephemeral Run container with snapshot of root directory, and\n"
331 " remove it after exit\n"
25e68fd3
LP
332 " -i --image=PATH Root file system disk image (or device node) for\n"
333 " the container\n"
84be0c71 334 " --image-policy=POLICY Specify disk image dissection policy\n"
de40a303 335 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
336 " --read-only Mount the root directory read-only\n"
337 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 338 " --root-hash=HASH Specify verity root hash for root disk image\n"
c2923fdc
LB
339 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
340 " as a DER encoded PKCS7, either as a path to a file\n"
341 " or as an ASCII base64 encoded string prefixed by\n"
342 " 'base64:'\n"
e7cbe5cb 343 " --verity-data=PATH Specify hash device for verity\n"
25148653
LP
344 " --pivot-root=PATH[:PATH]\n"
345 " Pivot root to given directory in the container\n\n"
346 "%3$sExecution:%4$s\n"
7732f92b 347 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 348 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 349 " --chdir=PATH Set working directory in the container\n"
0d2a0179 350 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
25148653
LP
351 " -u --user=USER Run the command under specified user or UID\n"
352 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
4a4654e0
LP
353 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
354 " --suppress-sync=BOOLEAN\n"
355 " Suppress any form of disk data synchronization\n\n"
25148653 356 "%3$sSystem Identity:%4$s\n"
a8828ed9 357 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 358 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
359 " --uuid=UUID Set a specific machine UUID for the container\n\n"
360 "%3$sProperties:%4$s\n"
a8828ed9 361 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 362 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
363 " --register=BOOLEAN Register container as machine\n"
364 " --keep-unit Do not register a scope for the machine, reuse\n"
365 " the service unit nspawn is running in\n\n"
366 "%3$sUser Namespacing:%4$s\n"
b917743d
YW
367 " --private-users=no Run without user namespacing\n"
368 " --private-users=yes|pick|identity\n"
369 " Run within user namespace, autoselect UID/GID range\n"
370 " --private-users=UIDBASE[:NUIDS]\n"
90b4a64d 371 " Similar, but with user configured UID/GID range\n"
6c045a99
LP
372 " --private-users-ownership=MODE\n"
373 " Adjust ('chown') or map ('map') OS tree ownership\n"
b917743d
YW
374 " to private UID/GID range\n"
375 " -U Equivalent to --private-users=pick and\n"
376 " --private-users-ownership=auto\n\n"
25148653 377 "%3$sNetworking:%4$s\n"
69c79d3c 378 " --private-network Disable network in container\n"
2f091b1b 379 " --network-interface=HOSTIF[:CONTAINERIF]\n"
69c79d3c
LP
380 " Assign an existing network interface to the\n"
381 " container\n"
2f091b1b 382 " --network-macvlan=HOSTIF[:CONTAINERIF]\n"
c74e630d
LP
383 " Create a macvlan network interface based on an\n"
384 " existing network interface to the container\n"
2f091b1b 385 " --network-ipvlan=HOSTIF[:CONTAINERIF]\n"
387f6955 386 " Create an ipvlan network interface based on an\n"
4bbfe7ad 387 " existing network interface to the container\n"
a8eaaee7 388 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 389 " and container\n"
f6d6bad1
LP
390 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
391 " Add an additional virtual Ethernet link between\n"
392 " host and container\n"
ab046dde 393 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
394 " Add a virtual Ethernet connection to the container\n"
395 " and attach it to an existing bridge on the host\n"
396 " --network-zone=NAME Similar, but attach the new interface to an\n"
397 " an automatically managed bridge interface\n"
d7bea6b6
DP
398 " --network-namespace-path=PATH\n"
399 " Set network namespace to the one represented by\n"
400 " the specified kernel namespace file node\n"
6d0b55c2 401 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
402 " Expose a container IP port on the host\n\n"
403 "%3$sSecurity:%4$s\n"
a8828ed9
DW
404 " --capability=CAP In addition to the default, retain specified\n"
405 " capability\n"
406 " --drop-capability=CAP Drop the specified capability from the default set\n"
88fc9c9b
TH
407 " --ambient-capability=CAP\n"
408 " Sets the specified capability for the started\n"
409 " process. Not useful if booting a machine.\n"
f4e803c8 410 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
411 " --system-call-filter=LIST|~LIST\n"
412 " Permit/prohibit specific system calls\n"
25148653
LP
413 " -Z --selinux-context=SECLABEL\n"
414 " Set the SELinux security context to be used by\n"
415 " processes in the container\n"
416 " -L --selinux-apifs-context=SECLABEL\n"
417 " Set the SELinux security context to be used by\n"
418 " API/tmpfs file systems in the container\n\n"
419 "%3$sResources:%4$s\n"
bf428efb 420 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
421 " --oom-score-adjust=VALUE\n"
422 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
423 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
424 " --personality=ARCH Pick personality for this container\n\n"
25148653 425 "%3$sIntegration:%4$s\n"
09d423e9 426 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 427 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
428 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
429 " host, try-guest, try-host\n"
430 " -j Equivalent to --link-journal=try-guest\n\n"
431 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
432 " --bind=PATH[:PATH[:OPTIONS]]\n"
433 " Bind mount a file or directory from the host into\n"
a8828ed9 434 " the container\n"
5e5bfa6e
EY
435 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
436 " Similar, but creates a read-only bind mount\n"
de40a303
LP
437 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
438 " it\n"
06c17c39 439 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
440 " --overlay=PATH[:PATH...]:PATH\n"
441 " Create an overlay mount from the host to \n"
442 " the container\n"
443 " --overlay-ro=PATH[:PATH...]:PATH\n"
2f893044
LP
444 " Similar, but creates a read-only overlay mount\n"
445 " --bind-user=NAME Bind user from host to container\n\n"
25148653 446 "%3$sInput/Output:%4$s\n"
de40a303
LP
447 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
448 " set up for the container.\n"
3652872a
LP
449 " -P --pipe Equivalent to --console=pipe\n\n"
450 "%3$sCredentials:%4$s\n"
451 " --set-credential=ID:VALUE\n"
452 " Pass a credential with literal value to container.\n"
453 " --load-credential=ID:PATH\n"
454 " Load credential to pass to container from file or\n"
455 " AF_UNIX stream socket.\n"
bc556335
DDM
456 "\nSee the %2$s for details.\n",
457 program_invocation_short_name,
458 link,
459 ansi_underline(),
460 ansi_normal(),
461 ansi_highlight(),
462 ansi_normal());
37ec0fdd
LP
463
464 return 0;
88213476
LP
465}
466
86c0dd4a 467static int custom_mount_check_all(void) {
88614c8a 468 size_t i;
5a8af538 469
5a8af538
LP
470 for (i = 0; i < arg_n_custom_mounts; i++) {
471 CustomMount *m = &arg_custom_mounts[i];
472
0de7acce 473 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
6c045a99 474 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
baaa35ad 475 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
c920b863 476 "--private-users-ownership=own may not be combined with custom root mounts.");
6c045a99 477 if (arg_uid_shift == UID_INVALID)
baaa35ad
ZJS
478 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
479 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 480 }
5a8af538
LP
481 }
482
483 return 0;
484}
485
8199d554 486static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 487 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 488 int r;
5da38d07 489
efdb0237 490 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
491
492 e = getenv(var);
493 if (!e) {
d5fc5b2f 494 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
495 var = "UNIFIED_CGROUP_HIERARCHY";
496 e = getenv(var);
c78c095b
ZJS
497 }
498
499 if (!isempty(e)) {
efdb0237
LP
500 r = parse_boolean(e);
501 if (r < 0)
c78c095b 502 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
503 if (r > 0)
504 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
505 else
506 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
507 }
508
8199d554
LP
509 return 0;
510}
511
512static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
513 int r;
514
75b0d8b8
ZJS
515 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
516 * in the image actually supports. */
b4cccbc1
LP
517 r = cg_all_unified();
518 if (r < 0)
519 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
520 if (r > 0) {
a8725a06
ZJS
521 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
522 * routine only detects 231, so we'll have a false negative here for 230. */
7e6821ed 523 r = systemd_installation_has_version(directory, "230");
a8725a06
ZJS
524 if (r < 0)
525 return log_error_errno(r, "Failed to determine systemd version in container: %m");
526 if (r > 0)
527 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
528 else
529 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 530 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b 531 /* Mixed cgroup hierarchy support was added in 233 */
7e6821ed 532 r = systemd_installation_has_version(directory, "233");
0fd9563f
ZJS
533 if (r < 0)
534 return log_error_errno(r, "Failed to determine systemd version in container: %m");
535 if (r > 0)
536 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
537 else
538 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
539 } else
5da38d07 540 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 541
8199d554
LP
542 log_debug("Using %s hierarchy for container.",
543 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
544 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
545
efdb0237
LP
546 return 0;
547}
548
8a99bd0c
ZJS
549static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
550 uint64_t mask = 0;
551 int r;
552
553 for (;;) {
554 _cleanup_free_ char *t = NULL;
555
556 r = extract_first_word(&spec, &t, ",", 0);
557 if (r < 0)
558 return log_error_errno(r, "Failed to parse capability %s.", t);
559 if (r == 0)
560 break;
561
562 if (streq(t, "help")) {
563 for (int i = 0; i < capability_list_length(); i++) {
564 const char *name;
565
566 name = capability_to_name(i);
567 if (name)
568 puts(name);
569 }
570
571 return 0; /* quit */
572 }
573
574 if (streq(t, "all"))
f5fbe71d 575 mask = UINT64_MAX;
8a99bd0c
ZJS
576 else {
577 r = capability_from_name(t);
578 if (r < 0)
579 return log_error_errno(r, "Failed to parse capability %s.", t);
580
581 mask |= 1ULL << r;
582 }
583 }
584
585 *ret_mask = mask;
586 return 1; /* continue */
587}
588
49048684 589static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
590 int r;
591
592 r = getenv_bool(name);
593 if (r == -ENXIO)
49048684 594 return 0;
0c582db0 595 if (r < 0)
49048684 596 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 597
0c582db0 598 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 599 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 600 return 0;
0c582db0
LB
601}
602
49048684 603static int parse_mount_settings_env(void) {
4f086aab 604 const char *e;
1099ceeb
LP
605 int r;
606
607 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
608 if (r < 0 && r != -ENXIO)
609 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
610 if (r >= 0)
611 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
612
613 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 614 if (streq_ptr(e, "network"))
4f086aab 615 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 616
49048684
ZJS
617 else if (e) {
618 r = parse_boolean(e);
619 if (r < 0)
620 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
621
622 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
623 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 624 }
4f086aab 625
49048684 626 return 0;
4f086aab
SU
627}
628
49048684 629static int parse_environment(void) {
d5455d2f
LP
630 const char *e;
631 int r;
632
49048684
ZJS
633 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
634 if (r < 0)
635 return r;
636 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
637 if (r < 0)
638 return r;
639 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
640 if (r < 0)
641 return r;
642 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
643 if (r < 0)
644 return r;
d5455d2f 645
49048684
ZJS
646 r = parse_mount_settings_env();
647 if (r < 0)
648 return r;
d5455d2f 649
489fae52
ZJS
650 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
651 * even if it is supported. If not supported, it has no effect. */
de40a303 652 if (!cg_ns_supported())
489fae52 653 arg_use_cgns = false;
de40a303
LP
654 else {
655 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
656 if (r < 0) {
657 if (r != -ENXIO)
49048684 658 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
659
660 arg_use_cgns = true;
661 } else {
662 arg_use_cgns = r > 0;
663 arg_settings_mask |= SETTING_USE_CGNS;
664 }
665 }
d5455d2f
LP
666
667 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
668 if (e)
669 arg_container_service_name = e;
670
4a4654e0
LP
671 r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
672 if (r >= 0)
673 arg_suppress_sync = r;
674 else if (r != -ENXIO)
675 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
676
49048684 677 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
678}
679
88213476 680static int parse_argv(int argc, char *argv[]) {
a41fe3a2 681 enum {
acbeb427
ZJS
682 ARG_VERSION = 0x100,
683 ARG_PRIVATE_NETWORK,
bc2f673e 684 ARG_UUID,
5076f0cc 685 ARG_READ_ONLY,
57fb9fb5 686 ARG_CAPABILITY,
88fc9c9b 687 ARG_AMBIENT_CAPABILITY,
420c7379 688 ARG_DROP_CAPABILITY,
17fe0523
LP
689 ARG_LINK_JOURNAL,
690 ARG_BIND,
f4889f65 691 ARG_BIND_RO,
06c17c39 692 ARG_TMPFS,
5a8af538
LP
693 ARG_OVERLAY,
694 ARG_OVERLAY_RO,
de40a303 695 ARG_INACCESSIBLE,
eb91eb18 696 ARG_SHARE_SYSTEM,
89f7c846 697 ARG_REGISTER,
aa28aefe 698 ARG_KEEP_UNIT,
69c79d3c 699 ARG_NETWORK_INTERFACE,
c74e630d 700 ARG_NETWORK_MACVLAN,
4bbfe7ad 701 ARG_NETWORK_IPVLAN,
ab046dde 702 ARG_NETWORK_BRIDGE,
22b28dfd 703 ARG_NETWORK_ZONE,
f6d6bad1 704 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 705 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 706 ARG_PERSONALITY,
4d9f07b4 707 ARG_VOLATILE,
ec16945e 708 ARG_TEMPLATE,
f36933fe 709 ARG_PROPERTY,
6dac160c 710 ARG_PRIVATE_USERS,
c6c8f6e2 711 ARG_KILL_SIGNAL,
f757855e 712 ARG_SETTINGS,
5f932eb9 713 ARG_CHDIR,
b53ede69 714 ARG_PIVOT_ROOT,
7336138e 715 ARG_PRIVATE_USERS_CHOWN,
6c045a99 716 ARG_PRIVATE_USERS_OWNERSHIP,
9c1e04d0 717 ARG_NOTIFY_READY,
4623e8e6 718 ARG_ROOT_HASH,
89e62e0b
LP
719 ARG_ROOT_HASH_SIG,
720 ARG_VERITY_DATA,
960e4569 721 ARG_SYSTEM_CALL_FILTER,
bf428efb 722 ARG_RLIMIT,
3a9530e5 723 ARG_HOSTNAME,
66edd963 724 ARG_NO_NEW_PRIVILEGES,
81f345df 725 ARG_OOM_SCORE_ADJUST,
d107bb7d 726 ARG_CPU_AFFINITY,
09d423e9 727 ARG_RESOLV_CONF,
1688841f 728 ARG_TIMEZONE,
de40a303
LP
729 ARG_CONSOLE,
730 ARG_PIPE,
731 ARG_OCI_BUNDLE,
bb068de0 732 ARG_NO_PAGER,
3652872a
LP
733 ARG_SET_CREDENTIAL,
734 ARG_LOAD_CREDENTIAL,
2f893044 735 ARG_BIND_USER,
4a4654e0 736 ARG_SUPPRESS_SYNC,
84be0c71 737 ARG_IMAGE_POLICY,
a41fe3a2
LP
738 };
739
88213476 740 static const struct option options[] = {
d7bea6b6
DP
741 { "help", no_argument, NULL, 'h' },
742 { "version", no_argument, NULL, ARG_VERSION },
743 { "directory", required_argument, NULL, 'D' },
744 { "template", required_argument, NULL, ARG_TEMPLATE },
745 { "ephemeral", no_argument, NULL, 'x' },
746 { "user", required_argument, NULL, 'u' },
747 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
748 { "as-pid2", no_argument, NULL, 'a' },
749 { "boot", no_argument, NULL, 'b' },
750 { "uuid", required_argument, NULL, ARG_UUID },
751 { "read-only", no_argument, NULL, ARG_READ_ONLY },
752 { "capability", required_argument, NULL, ARG_CAPABILITY },
88fc9c9b 753 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
d7bea6b6 754 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 755 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
756 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
757 { "bind", required_argument, NULL, ARG_BIND },
758 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
759 { "tmpfs", required_argument, NULL, ARG_TMPFS },
760 { "overlay", required_argument, NULL, ARG_OVERLAY },
761 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 762 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 763 { "machine", required_argument, NULL, 'M' },
3a9530e5 764 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
765 { "slice", required_argument, NULL, 'S' },
766 { "setenv", required_argument, NULL, 'E' },
767 { "selinux-context", required_argument, NULL, 'Z' },
768 { "selinux-apifs-context", required_argument, NULL, 'L' },
769 { "quiet", no_argument, NULL, 'q' },
770 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
771 { "register", required_argument, NULL, ARG_REGISTER },
772 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
773 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
774 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
775 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
776 { "network-veth", no_argument, NULL, 'n' },
777 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
778 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
779 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
780 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
781 { "personality", required_argument, NULL, ARG_PERSONALITY },
782 { "image", required_argument, NULL, 'i' },
783 { "volatile", optional_argument, NULL, ARG_VOLATILE },
784 { "port", required_argument, NULL, 'p' },
785 { "property", required_argument, NULL, ARG_PROPERTY },
786 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
6c045a99
LP
787 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
788 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
d7bea6b6
DP
789 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
790 { "settings", required_argument, NULL, ARG_SETTINGS },
791 { "chdir", required_argument, NULL, ARG_CHDIR },
792 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
793 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
794 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
89e62e0b
LP
795 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
796 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
d7bea6b6 797 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 798 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 799 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 800 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 801 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 802 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
803 { "console", required_argument, NULL, ARG_CONSOLE },
804 { "pipe", no_argument, NULL, ARG_PIPE },
805 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 806 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
3652872a
LP
807 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
808 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
2f893044 809 { "bind-user", required_argument, NULL, ARG_BIND_USER },
4a4654e0 810 { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
84be0c71 811 { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY },
eb9da376 812 {}
88213476
LP
813 };
814
9444b1f2 815 int c, r;
a42c8b54 816 uint64_t plus = 0, minus = 0;
f757855e 817 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
818
819 assert(argc >= 0);
820 assert(argv);
821
ef9c12b1
YW
822 /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long()
823 * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */
824 optind = 0;
de40a303 825 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
826 switch (c) {
827
828 case 'h':
37ec0fdd 829 return help();
88213476 830
acbeb427 831 case ARG_VERSION:
3f6fd1ba 832 return version();
acbeb427 833
88213476 834 case 'D':
614b022c 835 r = parse_path_argument(optarg, false, &arg_directory);
ec16945e 836 if (r < 0)
0f03c2a4 837 return r;
de40a303
LP
838
839 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
840 break;
841
842 case ARG_TEMPLATE:
614b022c 843 r = parse_path_argument(optarg, false, &arg_template);
ec16945e 844 if (r < 0)
0f03c2a4 845 return r;
de40a303
LP
846
847 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
848 break;
849
1b9e5b12 850 case 'i':
614b022c 851 r = parse_path_argument(optarg, false, &arg_image);
ec16945e 852 if (r < 0)
0f03c2a4 853 return r;
de40a303
LP
854
855 arg_settings_mask |= SETTING_DIRECTORY;
856 break;
857
858 case ARG_OCI_BUNDLE:
614b022c 859 r = parse_path_argument(optarg, false, &arg_oci_bundle);
de40a303
LP
860 if (r < 0)
861 return r;
862
ec16945e
LP
863 break;
864
865 case 'x':
866 arg_ephemeral = true;
a2f577fc 867 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
868 break;
869
687d0825 870 case 'u':
2fc09a9c
DM
871 r = free_and_strdup(&arg_user, optarg);
872 if (r < 0)
7027ff61 873 return log_oom();
687d0825 874
f757855e 875 arg_settings_mask |= SETTING_USER;
687d0825
MV
876 break;
877
22b28dfd 878 case ARG_NETWORK_ZONE: {
fee9f7b5 879 _cleanup_free_ char *j = NULL;
22b28dfd 880
b910cc72 881 j = strjoin("vz-", optarg);
22b28dfd
LP
882 if (!j)
883 return log_oom();
884
fee9f7b5
FS
885 if (!ifname_valid(j))
886 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
887 "Network zone name not valid: %s", j);
22b28dfd 888
df1fac6d 889 free_and_replace(arg_network_zone, j);
22b28dfd
LP
890
891 arg_network_veth = true;
892 arg_private_network = true;
893 arg_settings_mask |= SETTING_NETWORK;
894 break;
895 }
896
ab046dde 897 case ARG_NETWORK_BRIDGE:
ef76dff2 898
baaa35ad
ZJS
899 if (!ifname_valid(optarg))
900 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
901 "Bridge interface name not valid: %s", optarg);
ef76dff2 902
f757855e
LP
903 r = free_and_strdup(&arg_network_bridge, optarg);
904 if (r < 0)
905 return log_oom();
ab046dde 906
4831981d 907 _fallthrough_;
0dfaa006 908 case 'n':
69c79d3c
LP
909 arg_network_veth = true;
910 arg_private_network = true;
f757855e 911 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
912 break;
913
f6d6bad1
LP
914 case ARG_NETWORK_VETH_EXTRA:
915 r = veth_extra_parse(&arg_network_veth_extra, optarg);
916 if (r < 0)
917 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
918
919 arg_private_network = true;
920 arg_settings_mask |= SETTING_NETWORK;
921 break;
922
aa28aefe 923 case ARG_NETWORK_INTERFACE:
2f091b1b 924 r = interface_pair_parse(&arg_network_interfaces, optarg);
b390f178
DDM
925 if (r < 0)
926 return r;
927
c74e630d 928 arg_private_network = true;
f757855e 929 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
930 break;
931
932 case ARG_NETWORK_MACVLAN:
2f091b1b 933 r = macvlan_pair_parse(&arg_network_macvlan, optarg);
b390f178
DDM
934 if (r < 0)
935 return r;
936
4bbfe7ad 937 arg_private_network = true;
f757855e 938 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
939 break;
940
941 case ARG_NETWORK_IPVLAN:
2f091b1b 942 r = ipvlan_pair_parse(&arg_network_ipvlan, optarg);
b390f178
DDM
943 if (r < 0)
944 return r;
945
4831981d 946 _fallthrough_;
ff01d048
LP
947 case ARG_PRIVATE_NETWORK:
948 arg_private_network = true;
f757855e 949 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
950 break;
951
d7bea6b6 952 case ARG_NETWORK_NAMESPACE_PATH:
614b022c 953 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
d7bea6b6
DP
954 if (r < 0)
955 return r;
956
de40a303 957 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
958 break;
959
0f0dbc46 960 case 'b':
baaa35ad
ZJS
961 if (arg_start_mode == START_PID2)
962 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
963 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
964
965 arg_start_mode = START_BOOT;
966 arg_settings_mask |= SETTING_START_MODE;
967 break;
968
969 case 'a':
baaa35ad
ZJS
970 if (arg_start_mode == START_BOOT)
971 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
972 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
973
974 arg_start_mode = START_PID2;
975 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
976 break;
977
144f0fc0 978 case ARG_UUID:
aea3f594
ZJS
979 r = id128_from_string_nonzero(optarg, &arg_uuid);
980 if (r == -ENXIO)
baaa35ad
ZJS
981 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
982 "Machine UUID may not be all zeroes.");
aea3f594
ZJS
983 if (r < 0)
984 return log_error_errno(r, "Invalid UUID: %s", optarg);
f757855e
LP
985
986 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 987 break;
aa96c6cb 988
43c3fb46
LP
989 case 'S': {
990 _cleanup_free_ char *mangled = NULL;
991
992 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
993 if (r < 0)
994 return log_oom();
995
43c3fb46 996 free_and_replace(arg_slice, mangled);
de40a303 997 arg_settings_mask |= SETTING_SLICE;
144f0fc0 998 break;
43c3fb46 999 }
144f0fc0 1000
7027ff61 1001 case 'M':
c1521918 1002 if (isempty(optarg))
97b11eed 1003 arg_machine = mfree(arg_machine);
c1521918 1004 else {
52ef5dd7 1005 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1006 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1007 "Invalid machine name: %s", optarg);
7027ff61 1008
0c3c4284
LP
1009 r = free_and_strdup(&arg_machine, optarg);
1010 if (r < 0)
eb91eb18 1011 return log_oom();
eb91eb18 1012 }
9ce6d1b3 1013 break;
7027ff61 1014
3a9530e5
LP
1015 case ARG_HOSTNAME:
1016 if (isempty(optarg))
1017 arg_hostname = mfree(arg_hostname);
1018 else {
52ef5dd7 1019 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1020 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1021 "Invalid hostname: %s", optarg);
3a9530e5
LP
1022
1023 r = free_and_strdup(&arg_hostname, optarg);
1024 if (r < 0)
1025 return log_oom();
1026 }
1027
1028 arg_settings_mask |= SETTING_HOSTNAME;
1029 break;
1030
82adf6af
LP
1031 case 'Z':
1032 arg_selinux_context = optarg;
a8828ed9
DW
1033 break;
1034
82adf6af
LP
1035 case 'L':
1036 arg_selinux_apifs_context = optarg;
a8828ed9
DW
1037 break;
1038
bc2f673e
LP
1039 case ARG_READ_ONLY:
1040 arg_read_only = true;
f757855e 1041 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
1042 break;
1043
88fc9c9b
TH
1044 case ARG_AMBIENT_CAPABILITY: {
1045 uint64_t m;
1046 r = parse_capability_spec(optarg, &m);
1047 if (r <= 0)
1048 return r;
1049 arg_caps_ambient |= m;
1050 arg_settings_mask |= SETTING_CAPABILITY;
1051 break;
1052 }
420c7379
LP
1053 case ARG_CAPABILITY:
1054 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
1055 uint64_t m;
1056 r = parse_capability_spec(optarg, &m);
1057 if (r <= 0)
1058 return r;
5076f0cc 1059
8a99bd0c
ZJS
1060 if (c == ARG_CAPABILITY)
1061 plus |= m;
1062 else
1063 minus |= m;
f757855e 1064 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
1065 break;
1066 }
66edd963
LP
1067 case ARG_NO_NEW_PRIVILEGES:
1068 r = parse_boolean(optarg);
1069 if (r < 0)
1070 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1071
1072 arg_no_new_privileges = r;
1073 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1074 break;
1075
57fb9fb5
LP
1076 case 'j':
1077 arg_link_journal = LINK_GUEST;
574edc90 1078 arg_link_journal_try = true;
4e1d6aa9 1079 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1080 break;
1081
1082 case ARG_LINK_JOURNAL:
4e1d6aa9 1083 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1084 if (r < 0)
1085 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1086
4e1d6aa9 1087 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1088 break;
1089
17fe0523 1090 case ARG_BIND:
f757855e
LP
1091 case ARG_BIND_RO:
1092 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1093 if (r < 0)
1094 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1095
f757855e 1096 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1097 break;
06c17c39 1098
f757855e
LP
1099 case ARG_TMPFS:
1100 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1101 if (r < 0)
1102 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1103
f757855e 1104 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1105 break;
5a8af538
LP
1106
1107 case ARG_OVERLAY:
ad85779a
LP
1108 case ARG_OVERLAY_RO:
1109 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1110 if (r == -EADDRNOTAVAIL)
1111 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1112 if (r < 0)
1113 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1114
f757855e 1115 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1116 break;
06c17c39 1117
de40a303
LP
1118 case ARG_INACCESSIBLE:
1119 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1120 if (r < 0)
1121 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1122
1123 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1124 break;
1125
0d2a0179
ZJS
1126 case 'E':
1127 r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
aaf057c4 1128 if (r < 0)
0d2a0179 1129 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
f4889f65 1130
f757855e 1131 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65 1132 break;
f4889f65 1133
284c0b91
LP
1134 case 'q':
1135 arg_quiet = true;
1136 break;
1137
8a96d94e 1138 case ARG_SHARE_SYSTEM:
a6b5216c 1139 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1140 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1141 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1142 arg_clone_ns_flags = 0;
8a96d94e
LP
1143 break;
1144
eb91eb18
LP
1145 case ARG_REGISTER:
1146 r = parse_boolean(optarg);
1147 if (r < 0) {
1148 log_error("Failed to parse --register= argument: %s", optarg);
1149 return r;
1150 }
1151
1152 arg_register = r;
1153 break;
1154
89f7c846
LP
1155 case ARG_KEEP_UNIT:
1156 arg_keep_unit = true;
1157 break;
1158
6afc95b7
LP
1159 case ARG_PERSONALITY:
1160
ac45f971 1161 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1162 if (arg_personality == PERSONALITY_INVALID)
1163 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1164 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1165
f757855e 1166 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1167 break;
1168
4d9f07b4
LP
1169 case ARG_VOLATILE:
1170
1171 if (!optarg)
f757855e 1172 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1173 else if (streq(optarg, "help")) {
1174 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1175 return 0;
1176 } else {
f757855e 1177 VolatileMode m;
4d9f07b4 1178
f757855e 1179 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1180 if (m < 0)
1181 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1182 "Failed to parse --volatile= argument: %s", optarg);
1183 else
f757855e 1184 arg_volatile_mode = m;
6d0b55c2
LP
1185 }
1186
f757855e
LP
1187 arg_settings_mask |= SETTING_VOLATILE_MODE;
1188 break;
6d0b55c2 1189
f757855e
LP
1190 case 'p':
1191 r = expose_port_parse(&arg_expose_ports, optarg);
1192 if (r == -EEXIST)
1193 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1194 if (r < 0)
1195 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1196
f757855e 1197 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1198 break;
6d0b55c2 1199
f36933fe
LP
1200 case ARG_PROPERTY:
1201 if (strv_extend(&arg_property, optarg) < 0)
1202 return log_oom();
1203
1204 break;
1205
ae209204 1206 case ARG_PRIVATE_USERS: {
33eac552 1207 int boolean;
0de7acce 1208
ae209204
ZJS
1209 if (!optarg)
1210 boolean = true;
1211 else if (!in_charset(optarg, DIGITS))
1212 /* do *not* parse numbers as booleans */
1213 boolean = parse_boolean(optarg);
33eac552
LP
1214 else
1215 boolean = -1;
ae209204 1216
33eac552 1217 if (boolean == 0) {
0de7acce
LP
1218 /* no: User namespacing off */
1219 arg_userns_mode = USER_NAMESPACE_NO;
1220 arg_uid_shift = UID_INVALID;
1221 arg_uid_range = UINT32_C(0x10000);
33eac552 1222 } else if (boolean > 0) {
0de7acce
LP
1223 /* yes: User namespacing on, UID range is read from root dir */
1224 arg_userns_mode = USER_NAMESPACE_FIXED;
1225 arg_uid_shift = UID_INVALID;
1226 arg_uid_range = UINT32_C(0x10000);
1227 } else if (streq(optarg, "pick")) {
1228 /* pick: User namespacing on, UID range is picked randomly */
6c045a99
LP
1229 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1230 * implied by USER_NAMESPACE_PICK
33eac552 1231 * further down. */
0de7acce
LP
1232 arg_uid_shift = UID_INVALID;
1233 arg_uid_range = UINT32_C(0x10000);
33eac552
LP
1234
1235 } else if (streq(optarg, "identity")) {
6c2d70ce 1236 /* identity: User namespacing on, the 0…0xFFFF UID range is mapped to
33eac552
LP
1237 * itself, i.e. we don't actually map anything, but do take benefit of
1238 * isolation of capability sets. */
1239 arg_userns_mode = USER_NAMESPACE_FIXED;
1240 arg_uid_shift = 0;
1241 arg_uid_range = UINT32_C(0x10000);
0de7acce 1242 } else {
6c2058b3 1243 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1244 const char *range, *shift;
1245
0de7acce
LP
1246 /* anything else: User namespacing on, UID range is explicitly configured */
1247
6dac160c
LP
1248 range = strchr(optarg, ':');
1249 if (range) {
6c2058b3
ZJS
1250 buffer = strndup(optarg, range - optarg);
1251 if (!buffer)
1252 return log_oom();
1253 shift = buffer;
6dac160c
LP
1254
1255 range++;
bfd292ec
ZJS
1256 r = safe_atou32(range, &arg_uid_range);
1257 if (r < 0)
be715731 1258 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1259 } else
1260 shift = optarg;
1261
be715731
ZJS
1262 r = parse_uid(shift, &arg_uid_shift);
1263 if (r < 0)
1264 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1265
1266 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c 1267
58e13de5
LP
1268 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1269 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1270 }
be715731 1271
0de7acce 1272 arg_settings_mask |= SETTING_USERNS;
6dac160c 1273 break;
ae209204 1274 }
6dac160c 1275
0de7acce 1276 case 'U':
ccabee0d 1277 if (userns_supported()) {
6c045a99
LP
1278 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1279 * implied by USER_NAMESPACE_PICK
33eac552 1280 * further down. */
ccabee0d
LP
1281 arg_uid_shift = UID_INVALID;
1282 arg_uid_range = UINT32_C(0x10000);
1283
1284 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1285 }
1286
7336138e
LP
1287 break;
1288
0de7acce 1289 case ARG_PRIVATE_USERS_CHOWN:
6c045a99
LP
1290 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1291
1292 arg_settings_mask |= SETTING_USERNS;
1293 break;
1294
1295 case ARG_PRIVATE_USERS_OWNERSHIP:
1296 if (streq(optarg, "help")) {
1297 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1298 return 0;
1299 }
1300
1301 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1302 if (arg_userns_ownership < 0)
1303 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
0de7acce
LP
1304
1305 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1306 break;
1307
c6c8f6e2 1308 case ARG_KILL_SIGNAL:
5c828e66
LP
1309 if (streq(optarg, "help")) {
1310 DUMP_STRING_TABLE(signal, int, _NSIG);
1311 return 0;
1312 }
1313
29a3db75 1314 arg_kill_signal = signal_from_string(optarg);
baaa35ad 1315 if (arg_kill_signal < 0)
7211c853 1316 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
c6c8f6e2 1317
f757855e
LP
1318 arg_settings_mask |= SETTING_KILL_SIGNAL;
1319 break;
1320
1321 case ARG_SETTINGS:
1322
1323 /* no → do not read files
1324 * yes → read files, do not override cmdline, trust only subset
1325 * override → read files, override cmdline, trust only subset
1326 * trusted → read files, do not override cmdline, trust all
1327 */
1328
1329 r = parse_boolean(optarg);
1330 if (r < 0) {
1331 if (streq(optarg, "trusted")) {
1332 mask_all_settings = false;
1333 mask_no_settings = false;
1334 arg_settings_trusted = true;
1335
1336 } else if (streq(optarg, "override")) {
1337 mask_all_settings = false;
1338 mask_no_settings = true;
1339 arg_settings_trusted = -1;
1340 } else
1341 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1342 } else if (r > 0) {
1343 /* yes */
1344 mask_all_settings = false;
1345 mask_no_settings = false;
1346 arg_settings_trusted = -1;
1347 } else {
1348 /* no */
1349 mask_all_settings = true;
1350 mask_no_settings = false;
1351 arg_settings_trusted = false;
1352 }
1353
c6c8f6e2
LP
1354 break;
1355
5f932eb9 1356 case ARG_CHDIR:
baaa35ad
ZJS
1357 if (!path_is_absolute(optarg))
1358 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1359 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1360
1361 r = free_and_strdup(&arg_chdir, optarg);
1362 if (r < 0)
1363 return log_oom();
1364
1365 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1366 break;
1367
b53ede69
PW
1368 case ARG_PIVOT_ROOT:
1369 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1370 if (r < 0)
1371 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1372
1373 arg_settings_mask |= SETTING_PIVOT_ROOT;
1374 break;
1375
9c1e04d0
AP
1376 case ARG_NOTIFY_READY:
1377 r = parse_boolean(optarg);
baaa35ad
ZJS
1378 if (r < 0)
1379 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1380 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1381 arg_notify_ready = r;
1382 arg_settings_mask |= SETTING_NOTIFY_READY;
1383 break;
1384
4623e8e6 1385 case ARG_ROOT_HASH: {
89e62e0b 1386 _cleanup_free_ void *k = NULL;
4623e8e6
LP
1387 size_t l;
1388
1389 r = unhexmem(optarg, strlen(optarg), &k, &l);
1390 if (r < 0)
1391 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
89e62e0b 1392 if (l < sizeof(sd_id128_t))
da890466 1393 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128-bit long: %s", optarg);
4623e8e6 1394
89e62e0b
LP
1395 free_and_replace(arg_verity_settings.root_hash, k);
1396 arg_verity_settings.root_hash_size = l;
4623e8e6
LP
1397 break;
1398 }
1399
c2923fdc
LB
1400 case ARG_ROOT_HASH_SIG: {
1401 char *value;
89e62e0b
LP
1402 size_t l;
1403 void *p;
c2923fdc
LB
1404
1405 if ((value = startswith(optarg, "base64:"))) {
c2923fdc
LB
1406 r = unbase64mem(value, strlen(value), &p, &l);
1407 if (r < 0)
1408 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1409
c2923fdc 1410 } else {
89e62e0b 1411 r = read_full_file(optarg, (char**) &p, &l);
c2923fdc 1412 if (r < 0)
89e62e0b 1413 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
c2923fdc
LB
1414 }
1415
89e62e0b
LP
1416 free_and_replace(arg_verity_settings.root_hash_sig, p);
1417 arg_verity_settings.root_hash_sig_size = l;
c2923fdc
LB
1418 break;
1419 }
1420
89e62e0b 1421 case ARG_VERITY_DATA:
614b022c 1422 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
89e62e0b
LP
1423 if (r < 0)
1424 return r;
1425 break;
1426
960e4569
LP
1427 case ARG_SYSTEM_CALL_FILTER: {
1428 bool negative;
1429 const char *items;
1430
1431 negative = optarg[0] == '~';
1432 items = negative ? optarg + 1 : optarg;
1433
1434 for (;;) {
1435 _cleanup_free_ char *word = NULL;
1436
1437 r = extract_first_word(&items, &word, NULL, 0);
1438 if (r == 0)
1439 break;
1440 if (r == -ENOMEM)
1441 return log_oom();
1442 if (r < 0)
1443 return log_error_errno(r, "Failed to parse system call filter: %m");
1444
1445 if (negative)
6b000af4 1446 r = strv_extend(&arg_syscall_deny_list, word);
960e4569 1447 else
6b000af4 1448 r = strv_extend(&arg_syscall_allow_list, word);
960e4569
LP
1449 if (r < 0)
1450 return log_oom();
1451 }
1452
1453 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1454 break;
1455 }
1456
bf428efb
LP
1457 case ARG_RLIMIT: {
1458 const char *eq;
622ecfa8 1459 _cleanup_free_ char *name = NULL;
bf428efb
LP
1460 int rl;
1461
5c828e66
LP
1462 if (streq(optarg, "help")) {
1463 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1464 return 0;
1465 }
1466
bf428efb 1467 eq = strchr(optarg, '=');
baaa35ad
ZJS
1468 if (!eq)
1469 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1470 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1471
1472 name = strndup(optarg, eq - optarg);
1473 if (!name)
1474 return log_oom();
1475
1476 rl = rlimit_from_string_harder(name);
baaa35ad 1477 if (rl < 0)
7211c853 1478 return log_error_errno(rl, "Unknown resource limit: %s", name);
bf428efb
LP
1479
1480 if (!arg_rlimit[rl]) {
1481 arg_rlimit[rl] = new0(struct rlimit, 1);
1482 if (!arg_rlimit[rl])
1483 return log_oom();
1484 }
1485
1486 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1487 if (r < 0)
1488 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1489
1490 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1491 break;
1492 }
1493
81f345df
LP
1494 case ARG_OOM_SCORE_ADJUST:
1495 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1496 if (r < 0)
1497 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1498
1499 arg_oom_score_adjust_set = true;
1500 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1501 break;
1502
d107bb7d 1503 case ARG_CPU_AFFINITY: {
0985c7c4 1504 CPUSet cpuset;
d107bb7d
LP
1505
1506 r = parse_cpu_set(optarg, &cpuset);
1507 if (r < 0)
0985c7c4 1508 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1509
0985c7c4
ZJS
1510 cpu_set_reset(&arg_cpu_set);
1511 arg_cpu_set = cpuset;
d107bb7d
LP
1512 arg_settings_mask |= SETTING_CPU_AFFINITY;
1513 break;
1514 }
1515
09d423e9
LP
1516 case ARG_RESOLV_CONF:
1517 if (streq(optarg, "help")) {
1518 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1519 return 0;
1520 }
1521
1522 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad 1523 if (arg_resolv_conf < 0)
7211c853 1524 return log_error_errno(arg_resolv_conf,
baaa35ad 1525 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1526
1527 arg_settings_mask |= SETTING_RESOLV_CONF;
1528 break;
1529
1688841f
LP
1530 case ARG_TIMEZONE:
1531 if (streq(optarg, "help")) {
1532 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1533 return 0;
1534 }
1535
1536 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad 1537 if (arg_timezone < 0)
7211c853 1538 return log_error_errno(arg_timezone,
baaa35ad 1539 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1540
1541 arg_settings_mask |= SETTING_TIMEZONE;
1542 break;
1543
de40a303 1544 case ARG_CONSOLE:
dce66ffe
ZJS
1545 r = handle_arg_console(optarg);
1546 if (r <= 0)
1547 return r;
de40a303
LP
1548 break;
1549
1550 case 'P':
1551 case ARG_PIPE:
dce66ffe
ZJS
1552 r = handle_arg_console("pipe");
1553 if (r <= 0)
1554 return r;
de40a303
LP
1555 break;
1556
bb068de0
ZJS
1557 case ARG_NO_PAGER:
1558 arg_pager_flags |= PAGER_DISABLE;
1559 break;
1560
3652872a
LP
1561 case ARG_SET_CREDENTIAL: {
1562 _cleanup_free_ char *word = NULL, *data = NULL;
1563 const char *p = optarg;
1564 Credential *a;
e437538f 1565 ssize_t l;
3652872a
LP
1566
1567 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1568 if (r == -ENOMEM)
1569 return log_oom();
1570 if (r < 0)
1571 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1572 if (r == 0 || !p)
1573 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1574
1575 if (!credential_name_valid(word))
1576 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1577
12d729b2 1578 for (size_t i = 0; i < arg_n_credentials; i++)
3652872a
LP
1579 if (streq(arg_credentials[i].id, word))
1580 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1581
1582 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1583 if (l < 0)
1584 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1585
1586 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1587 if (!a)
1588 return log_oom();
1589
1590 a[arg_n_credentials++] = (Credential) {
1591 .id = TAKE_PTR(word),
1592 .data = TAKE_PTR(data),
1593 .size = l,
1594 };
1595
1596 arg_credentials = a;
1597
1598 arg_settings_mask |= SETTING_CREDENTIALS;
1599 break;
1600 }
1601
1602 case ARG_LOAD_CREDENTIAL: {
1603 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1604 _cleanup_(erase_and_freep) char *data = NULL;
1605 _cleanup_free_ char *word = NULL, *j = NULL;
1606 const char *p = optarg;
1607 Credential *a;
1608 size_t size, i;
1609
1610 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1611 if (r == -ENOMEM)
1612 return log_oom();
1613 if (r < 0)
c941b650 1614 return log_error_errno(r, "Failed to parse --load-credential= parameter: %m");
3652872a 1615 if (r == 0 || !p)
c941b650 1616 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --load-credential=: %s", optarg);
3652872a
LP
1617
1618 if (!credential_name_valid(word))
1619 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1620
1621 for (i = 0; i < arg_n_credentials; i++)
1622 if (streq(arg_credentials[i].id, word))
1623 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1624
1625 if (path_is_absolute(p))
1626 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1627 else {
1628 const char *e;
1629
786d19fd
LP
1630 r = get_credentials_dir(&e);
1631 if (r < 0)
1632 return log_error_errno(r, "Credential not available (no credentials passed at all): %s", word);
3652872a
LP
1633
1634 j = path_join(e, p);
1635 if (!j)
1636 return log_oom();
1637 }
1638
986311c2
LP
1639 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1640 flags,
1641 NULL,
1642 &data, &size);
3652872a
LP
1643 if (r < 0)
1644 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1645
1646 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1647 if (!a)
1648 return log_oom();
1649
1650 a[arg_n_credentials++] = (Credential) {
1651 .id = TAKE_PTR(word),
1652 .data = TAKE_PTR(data),
1653 .size = size,
1654 };
1655
1656 arg_credentials = a;
1657
1658 arg_settings_mask |= SETTING_CREDENTIALS;
1659 break;
1660 }
1661
2f893044
LP
1662 case ARG_BIND_USER:
1663 if (!valid_user_group_name(optarg, 0))
1664 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1665
1666 if (strv_extend(&arg_bind_user, optarg) < 0)
1667 return log_oom();
1668
1669 arg_settings_mask |= SETTING_BIND_USER;
1670 break;
1671
4a4654e0
LP
1672 case ARG_SUPPRESS_SYNC:
1673 r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1674 if (r < 0)
1675 return r;
1676
1677 arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1678 break;
1679
06e78680
YW
1680 case ARG_IMAGE_POLICY:
1681 r = parse_image_policy_argument(optarg, &arg_image_policy);
84be0c71 1682 if (r < 0)
06e78680 1683 return r;
84be0c71 1684 break;
84be0c71 1685
88213476
LP
1686 case '?':
1687 return -EINVAL;
1688
1689 default:
04499a70 1690 assert_not_reached();
88213476 1691 }
88213476 1692
60f1ec13
LP
1693 if (argc > optind) {
1694 strv_free(arg_parameters);
1695 arg_parameters = strv_copy(argv + optind);
1696 if (!arg_parameters)
1697 return log_oom();
d7bea6b6 1698
60f1ec13
LP
1699 arg_settings_mask |= SETTING_START_MODE;
1700 }
1701
1702 if (arg_ephemeral && arg_template && !arg_directory)
1703 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1704 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1705 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1706 * --directory=". */
1707 arg_directory = TAKE_PTR(arg_template);
1708
2642d22a
DDM
1709 arg_caps_retain |= plus;
1710 arg_caps_retain |= arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0;
1711
1712 /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have
1713 * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to
1714 * indicate that. */
1715 if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0)
1716 arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE);
1717
1718 arg_caps_retain &= ~minus;
60f1ec13 1719
de40a303 1720 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1721 r = parse_environment();
1722 if (r < 0)
1723 return r;
de40a303 1724
60f1ec13
LP
1725 /* Load all settings from .nspawn files */
1726 if (mask_no_settings)
1727 arg_settings_mask = 0;
1728
1729 /* Don't load any settings from .nspawn files */
1730 if (mask_all_settings)
1731 arg_settings_mask = _SETTINGS_MASK_ALL;
1732
1733 return 1;
1734}
1735
1736static int verify_arguments(void) {
1737 int r;
a6b5216c 1738
75b0d8b8
ZJS
1739 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1740 /* If we are running the stub init in the container, we don't need to look at what the init
1741 * in the container supports, because we are not using it. Let's immediately pick the right
1742 * setting based on the host system configuration.
1743 *
1744 * We only do this, if the user didn't use an environment variable to override the detection.
1745 */
1746
1747 r = cg_all_unified();
1748 if (r < 0)
1749 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1750 if (r > 0)
1751 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1752 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1753 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1754 else
1755 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1756 }
1757
4f086aab
SU
1758 if (arg_userns_mode != USER_NAMESPACE_NO)
1759 arg_mount_settings |= MOUNT_USE_USERNS;
1760
1761 if (arg_private_network)
1762 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1763
48a8d337
LB
1764 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1765 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1766 arg_register = false;
baaa35ad 1767 if (arg_start_mode != START_PID1)
60f1ec13 1768 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1769 }
eb91eb18 1770
6c045a99
LP
1771 if (arg_userns_ownership < 0)
1772 arg_userns_ownership =
f61c7f88 1773 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
6c045a99 1774 USER_NAMESPACE_OWNERSHIP_OFF;
0e7ac751 1775
60f1ec13
LP
1776 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1777 arg_kill_signal = SIGRTMIN+3;
1778
e5a4bb0d
LP
1779 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1780 arg_read_only = true;
1781
2436ea76
DDM
1782 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1783 arg_read_only = true;
1784
baaa35ad 1785 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1786 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1787 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1788 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1789
baaa35ad 1790 if (arg_directory && arg_image)
60f1ec13 1791 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1792
baaa35ad 1793 if (arg_template && arg_image)
60f1ec13 1794 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1795
baaa35ad 1796 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1797 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1798
baaa35ad 1799 if (arg_ephemeral && arg_template)
60f1ec13 1800 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1801
baaa35ad 1802 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1803 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1804
baaa35ad 1805 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1806 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1807
6c045a99 1808 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
de40a303 1809 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
6c045a99 1810 "--read-only and --private-users-ownership=chown may not be combined.");
f757855e 1811
6c045a99
LP
1812 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1813 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1814 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1815 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1816 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
e5a4bb0d 1817
679ecd36
SZ
1818 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1819 * we need to error out, to avoid conflicts between different network options. */
60f1ec13
LP
1820 if (arg_network_namespace_path &&
1821 (arg_network_interfaces || arg_network_macvlan ||
1822 arg_network_ipvlan || arg_network_veth_extra ||
1823 arg_network_bridge || arg_network_zone ||
679ecd36 1824 arg_network_veth))
de40a303 1825 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1826
60f1ec13 1827 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1828 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1829 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1830
baaa35ad 1831 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1832 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1833
baaa35ad 1834 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1835 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1836
baaa35ad 1837 if (arg_expose_ports && !arg_private_network)
60f1ec13 1838 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1839
88fc9c9b 1840 if (arg_caps_ambient) {
f5fbe71d 1841 if (arg_caps_ambient == UINT64_MAX)
88fc9c9b
TH
1842 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1843
1844 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1845 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1846
1847 if (arg_start_mode == START_BOOT)
1848 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1849 }
1850
2f893044
LP
1851 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1852 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1853
1854 /* Drop duplicate --bind-user= entries */
1855 strv_uniq(arg_bind_user);
1856
60f1ec13
LP
1857 r = custom_mount_check_all();
1858 if (r < 0)
1859 return r;
c6c8f6e2 1860
f757855e 1861 return 0;
88213476
LP
1862}
1863
2f091b1b
TM
1864static int verify_network_interfaces_initialized(void) {
1865 int r;
1866 r = test_network_interfaces_initialized(arg_network_interfaces);
1867 if (r < 0)
1868 return r;
1869
1870 r = test_network_interfaces_initialized(arg_network_macvlan);
1871 if (r < 0)
1872 return r;
1873
1874 r = test_network_interfaces_initialized(arg_network_ipvlan);
1875 if (r < 0)
1876 return r;
1877
1878 return 0;
1879}
1880
91181e07 1881int userns_lchown(const char *p, uid_t uid, gid_t gid) {
03cfe0d5
LP
1882 assert(p);
1883
0de7acce 1884 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1885 return 0;
1886
1887 if (uid == UID_INVALID && gid == GID_INVALID)
1888 return 0;
1889
1890 if (uid != UID_INVALID) {
1891 uid += arg_uid_shift;
1892
1893 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1894 return -EOVERFLOW;
1895 }
1896
1897 if (gid != GID_INVALID) {
1898 gid += (gid_t) arg_uid_shift;
1899
1900 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1901 return -EOVERFLOW;
1902 }
1903
7c248223 1904 return RET_NERRNO(lchown(p, uid, gid));
b12afc8c
LP
1905}
1906
/* Creates directory 'path' below 'root' with the given mode and hands it to the (shifted) uid/gid via
 * userns_lchown(). If the directory already exists it is left entirely untouched and 0 is returned.
 * Returns the negative errno of mkdir(), or whatever userns_lchown() returns. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) < 0) {
                if (errno == EEXIST)
                        return 0; /* Pre-existing directories keep their ownership */

                return -errno;
        }

        return userns_lchown(full, uid, gid);
}
1920
/* Matches 'path' against the two zoneinfo prefixes we recognize and returns whatever
 * PATH_STARTSWITH_SET() yields for them. Callers in this file treat a NULL result as "does not point
 * into /usr/share/zoneinfo/". */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1927
83205269
LP
1928static bool etc_writable(void) {
1929 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1930}
1931
e58a1277 1932static int setup_timezone(const char *dest) {
1688841f
LP
1933 _cleanup_free_ char *p = NULL, *etc = NULL;
1934 const char *where, *check;
1935 TimezoneMode m;
d4036145 1936 int r;
f8440af5 1937
e58a1277
LP
1938 assert(dest);
1939
1688841f 1940 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1941 r = readlink_malloc("/etc/localtime", &p);
1942 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1943 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1944 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1945 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1946 else if (r < 0) {
1947 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1948 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1949 * file.
1950 *
1951 * Example:
1952 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1953 */
1954 return 0;
1955 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1956 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1957 else
1958 m = arg_timezone;
1959 } else
1960 m = arg_timezone;
1961
1962 if (m == TIMEZONE_OFF)
1963 return 0;
1964
f461a28d 1965 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1966 if (r < 0) {
1688841f 1967 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1968 return 0;
1969 }
1970
1688841f
LP
1971 where = strjoina(etc, "/localtime");
1972
1973 switch (m) {
1974
1975 case TIMEZONE_DELETE:
1976 if (unlink(where) < 0)
1977 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1978
d4036145 1979 return 0;
d4036145 1980
1688841f
LP
1981 case TIMEZONE_SYMLINK: {
1982 _cleanup_free_ char *q = NULL;
1983 const char *z, *what;
4d1c38b8 1984
1688841f
LP
1985 z = timezone_from_path(p);
1986 if (!z) {
1987 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1988 return 0;
1688841f 1989 }
d4036145 1990
1688841f
LP
1991 r = readlink_malloc(where, &q);
1992 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1993 return 0; /* Already pointing to the right place? Then do nothing .. */
1994
1995 check = strjoina(dest, "/usr/share/zoneinfo/", z);
f461a28d 1996 r = chase(check, dest, 0, NULL, NULL);
1688841f
LP
1997 if (r < 0)
1998 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1999 else {
2000 if (unlink(where) < 0 && errno != ENOENT) {
2001 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
2002 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
2003 return 0;
2004 }
2005
2006 what = strjoina("../usr/share/zoneinfo/", z);
2007 if (symlink(what, where) < 0) {
2008 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
2009 errno, "Failed to correct timezone of container, ignoring: %m");
2010 return 0;
2011 }
2012
2013 break;
2014 }
2015
2016 _fallthrough_;
d4036145 2017 }
68fb0892 2018
1688841f
LP
2019 case TIMEZONE_BIND: {
2020 _cleanup_free_ char *resolved = NULL;
2021 int found;
2022
f461a28d 2023 found = chase(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
2024 if (found < 0) {
2025 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
2026 return 0;
2027 }
2028
2029 if (found == 0) /* missing? */
2030 (void) touch(resolved);
2031
511a8cfe 2032 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1688841f 2033 if (r >= 0)
511a8cfe 2034 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1688841f
LP
2035
2036 _fallthrough_;
79d80fc1 2037 }
4d9f07b4 2038
1688841f
LP
2039 case TIMEZONE_COPY:
2040 /* If mounting failed, try to copy */
7c2f5495 2041 r = copy_file_atomic("/etc/localtime", where, 0644, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
2042 if (r < 0) {
2043 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2044 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
2045 return 0;
2046 }
2047
2048 break;
2049
2050 default:
04499a70 2051 assert_not_reached();
d4036145 2052 }
e58a1277 2053
1688841f 2054 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
2055 r = userns_lchown(where, 0, 0);
2056 if (r < 0)
1688841f 2057 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 2058
e58a1277 2059 return 0;
88213476
LP
2060}
2061
09d423e9
LP
/* Checks via access(F_OK) whether 'path' exists. Returns 1 if it does, 0 on ENOENT, and a negative
 * errno (logged at debug level) on any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2074
7357272e 2075static int resolved_listening(void) {
b8ea7a6e 2076 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 2077 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 2078 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
2079 int r;
2080
7357272e 2081 /* Check if resolved is listening */
b053cd5f
LP
2082
2083 r = sd_bus_open_system(&bus);
2084 if (r < 0)
b8ea7a6e 2085 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 2086
7357272e 2087 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
2088 if (r < 0)
2089 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2090 if (r == 0)
2091 return 0;
7357272e 2092
7f8a85e6 2093 r = bus_get_property_string(bus, bus_resolve_mgr, "DNSStubListener", &error, &dns_stub_listener_mode);
7357272e 2094 if (r < 0)
b8ea7a6e 2095 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
2096
2097 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
2098}
2099
2547bb41 2100static int setup_resolv_conf(const char *dest) {
09d423e9
LP
2101 _cleanup_free_ char *etc = NULL;
2102 const char *where, *what;
2103 ResolvConfMode m;
2104 int r;
2547bb41
LP
2105
2106 assert(dest);
2107
09d423e9
LP
2108 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2109 if (arg_private_network)
2110 m = RESOLV_CONF_OFF;
86775e35
LP
2111 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2112 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
09d423e9 2113 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 2114 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 2115 else
83205269 2116 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
86775e35 2117
09d423e9
LP
2118 } else
2119 m = arg_resolv_conf;
2120
2121 if (m == RESOLV_CONF_OFF)
2547bb41
LP
2122 return 0;
2123
f461a28d 2124 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
2125 if (r < 0) {
2126 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2127 return 0;
2128 }
2129
2130 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
2131
2132 if (m == RESOLV_CONF_DELETE) {
2133 if (unlink(where) < 0)
2134 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2135
87447ae4
LP
2136 return 0;
2137 }
79d80fc1 2138
86775e35
LP
2139 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2140 what = PRIVATE_STATIC_RESOLV_CONF;
2141 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2142 what = PRIVATE_UPLINK_RESOLV_CONF;
2143 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2144 what = PRIVATE_STUB_RESOLV_CONF;
09d423e9
LP
2145 else
2146 what = "/etc/resolv.conf";
87447ae4 2147
86775e35 2148 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
09d423e9
LP
2149 _cleanup_free_ char *resolved = NULL;
2150 int found;
2151
d404c8d8 2152 found = chase(where, dest, CHASE_NONEXISTENT|CHASE_NOFOLLOW, &resolved, NULL);
09d423e9
LP
2153 if (found < 0) {
2154 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2155 return 0;
2156 }
3539724c 2157
87447ae4
LP
2158 if (found == 0) /* missing? */
2159 (void) touch(resolved);
5367354d 2160
511a8cfe 2161 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 2162 if (r >= 0)
511a8cfe 2163 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
86775e35
LP
2164
2165 /* If that didn't work, let's copy the file */
3539724c
LP
2166 }
2167
86775e35 2168 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
7c2f5495 2169 r = copy_file_atomic(what, where, 0644, COPY_REFLINK|COPY_REPLACE);
86775e35 2170 else
7c2f5495 2171 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, COPY_REFLINK);
79d80fc1 2172 if (r < 0) {
3539724c
LP
2173 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2174 * resolved or something similar runs inside and the symlink points there.
68a313c5 2175 *
3539724c 2176 * If the disk image is read-only, there's also no point in complaining.
68a313c5 2177 */
86775e35
LP
2178 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2179 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 2180 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
2181 return 0;
2182 }
2547bb41 2183
03cfe0d5
LP
2184 r = userns_lchown(where, 0, 0);
2185 if (r < 0)
3539724c 2186 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 2187
2547bb41
LP
2188 return 0;
2189}
2190
1e4f1671 2191static int setup_boot_id(void) {
cdde6ba6
LP
2192 _cleanup_(unlink_and_freep) char *from = NULL;
2193 _cleanup_free_ char *path = NULL;
3bbaff3e 2194 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 2195 const char *to;
04bc4a3f
LP
2196 int r;
2197
1eacc470 2198 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 2199
1eacc470 2200 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
2201 if (r < 0)
2202 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
2203
2204 r = sd_id128_randomize(&rnd);
f647962d
MS
2205 if (r < 0)
2206 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 2207
b40c8ebd 2208 r = id128_write(path, ID128_FORMAT_UUID, rnd);
f647962d
MS
2209 if (r < 0)
2210 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 2211
cdde6ba6
LP
2212 from = TAKE_PTR(path);
2213 to = "/proc/sys/kernel/random/boot_id";
2214
511a8cfe 2215 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
2216 if (r < 0)
2217 return r;
04bc4a3f 2218
511a8cfe 2219 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
2220}
2221
e58a1277 2222static int copy_devnodes(const char *dest) {
88213476
LP
2223 static const char devnodes[] =
2224 "null\0"
2225 "zero\0"
2226 "full\0"
2227 "random\0"
2228 "urandom\0"
85614d66
TG
2229 "tty\0"
2230 "net/tun\0";
88213476 2231
e58a1277 2232 int r = 0;
a258bf26
LP
2233
2234 assert(dest);
124640f1 2235
52f05ef2 2236 BLOCK_WITH_UMASK(0000);
88213476 2237
03cfe0d5
LP
2238 /* Create /dev/net, so that we can create /dev/net/tun in it */
2239 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2240 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2241
88213476 2242 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 2243 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 2244 struct stat st;
88213476 2245
c6134d3e 2246 from = path_join("/dev/", d);
8967f291
LP
2247 if (!from)
2248 return log_oom();
2249
c6134d3e 2250 to = path_join(dest, from);
8967f291
LP
2251 if (!to)
2252 return log_oom();
88213476
LP
2253
2254 if (stat(from, &st) < 0) {
2255
4a62c710
MS
2256 if (errno != ENOENT)
2257 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 2258
baaa35ad
ZJS
2259 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2260 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2261 "%s is not a char or block device, cannot copy.", from);
2262 else {
8dfce114
LP
2263 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2264
81f5049b 2265 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 2266 /* Explicitly warn the user when /dev is already populated. */
41eb4362 2267 if (errno == EEXIST)
8dbf71ec 2268 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
2269 if (errno != EPERM)
2270 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2271
8dfce114 2272 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
2273 r = touch(to);
2274 if (r < 0)
2275 return log_error_errno(r, "touch (%s) failed: %m", to);
511a8cfe 2276 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
60e76d48
ZJS
2277 if (r < 0)
2278 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 2279 }
6278cf60 2280
03cfe0d5
LP
2281 r = userns_lchown(to, 0, 0);
2282 if (r < 0)
2283 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2284
657ee2d8 2285 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2286 if (!dn)
2287 return log_oom();
2288
2289 r = userns_mkdir(dest, dn, 0755, 0, 0);
2290 if (r < 0)
2291 return log_error_errno(r, "Failed to create '%s': %m", dn);
2292
2293 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2294 return log_oom();
2295
c6134d3e 2296 prefixed = path_join(dest, sl);
8dfce114
LP
2297 if (!prefixed)
2298 return log_oom();
2299
2d9b74ba 2300 t = path_join("..", d);
8dfce114
LP
2301 if (!t)
2302 return log_oom();
2303
2304 if (symlink(t, prefixed) < 0)
2305 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2306 }
88213476
LP
2307 }
2308
e58a1277
LP
2309 return r;
2310}
88213476 2311
de40a303 2312static int make_extra_nodes(const char *dest) {
de40a303
LP
2313 size_t i;
2314 int r;
2315
52f05ef2 2316 BLOCK_WITH_UMASK(0000);
de40a303
LP
2317
2318 for (i = 0; i < arg_n_extra_nodes; i++) {
2319 _cleanup_free_ char *path = NULL;
2320 DeviceNode *n = arg_extra_nodes + i;
2321
c6134d3e 2322 path = path_join(dest, n->path);
de40a303
LP
2323 if (!path)
2324 return log_oom();
2325
2326 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2327 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2328
2329 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2330 if (r < 0)
2331 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2332 }
2333
2334 return 0;
2335}
2336
03cfe0d5
LP
2337static int setup_pts(const char *dest) {
2338 _cleanup_free_ char *options = NULL;
2339 const char *p;
709f6e46 2340 int r;
03cfe0d5 2341
349cc4a5 2342#if HAVE_SELINUX
03cfe0d5
LP
2343 if (arg_selinux_apifs_context)
2344 (void) asprintf(&options,
3dce8915 2345 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2346 arg_uid_shift + TTY_GID,
2347 arg_selinux_apifs_context);
2348 else
2349#endif
2350 (void) asprintf(&options,
3dce8915 2351 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2352 arg_uid_shift + TTY_GID);
f2d88580 2353
03cfe0d5 2354 if (!options)
f2d88580
LP
2355 return log_oom();
2356
03cfe0d5 2357 /* Mount /dev/pts itself */
cc9fce65 2358 p = prefix_roota(dest, "/dev/pts");
3f692e2e 2359 r = RET_NERRNO(mkdir(p, 0755));
dae8b82e
ZJS
2360 if (r < 0)
2361 return log_error_errno(r, "Failed to create /dev/pts: %m");
2362
511a8cfe 2363 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
60e76d48
ZJS
2364 if (r < 0)
2365 return r;
709f6e46
MS
2366 r = userns_lchown(p, 0, 0);
2367 if (r < 0)
2368 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2369
2370 /* Create /dev/ptmx symlink */
2371 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2372 if (symlink("pts/ptmx", p) < 0)
2373 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2374 r = userns_lchown(p, 0, 0);
2375 if (r < 0)
2376 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2377
03cfe0d5
LP
2378 /* And fix /dev/pts/ptmx ownership */
2379 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2380 r = userns_lchown(p, 0, 0);
2381 if (r < 0)
2382 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2383
f2d88580
LP
2384 return 0;
2385}
2386
3acc84eb 2387static int setup_stdio_as_dev_console(void) {
5bb1d7fb 2388 _cleanup_close_ int terminal = -EBADF;
e58a1277 2389 int r;
e58a1277 2390
335d2ead
LP
2391 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2392 * explicitly, if we are configured to. */
2393 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
3acc84eb
FB
2394 if (terminal < 0)
2395 return log_error_errno(terminal, "Failed to open console: %m");
e58a1277 2396
3acc84eb
FB
2397 /* Make sure we can continue logging to the original stderr, even if
2398 * stderr points elsewhere now */
2399 r = log_dup_console();
2400 if (r < 0)
2401 return log_error_errno(r, "Failed to duplicate stderr: %m");
de40a303 2402
3acc84eb
FB
2403 /* invalidates 'terminal' on success and failure */
2404 r = rearrange_stdio(terminal, terminal, terminal);
2fef50cd 2405 TAKE_FD(terminal);
f647962d 2406 if (r < 0)
3acc84eb
FB
2407 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2408
2409 return 0;
2410}
88213476 2411
3acc84eb
FB
2412static int setup_dev_console(const char *console) {
2413 _cleanup_free_ char *p = NULL;
2414 int r;
a258bf26 2415
3acc84eb
FB
2416 /* Create /dev/console symlink */
2417 r = path_make_relative("/dev", console, &p);
81f5049b 2418 if (r < 0)
3acc84eb
FB
2419 return log_error_errno(r, "Failed to create relative path: %m");
2420
2421 if (symlink(p, "/dev/console") < 0)
2422 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2423
3acc84eb 2424 return 0;
e58a1277
LP
2425}
2426
8e5430c4
LP
2427static int setup_keyring(void) {
2428 key_serial_t keyring;
2429
6b000af4
LP
2430 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2431 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2432 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2433 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2434 * into the container. */
8e5430c4
LP
2435
2436 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2437 if (keyring == -1) {
2438 if (errno == ENOSYS)
2439 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
065b4774 2440 else if (ERRNO_IS_PRIVILEGE(errno))
8e5430c4
LP
2441 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2442 else
2443 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2444 }
2445
2446 return 0;
2447}
2448
3652872a
LP
2449static int setup_credentials(const char *root) {
2450 const char *q;
2451 int r;
2452
2453 if (arg_n_credentials <= 0)
2454 return 0;
2455
2456 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2457 if (r < 0)
2458 return log_error_errno(r, "Failed to create /run/host: %m");
2459
2460 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2461 if (r < 0)
2462 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2463
2464 q = prefix_roota(root, "/run/host/credentials");
511a8cfe 2465 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
3652872a
LP
2466 if (r < 0)
2467 return r;
2468
2469 for (size_t i = 0; i < arg_n_credentials; i++) {
2470 _cleanup_free_ char *j = NULL;
254d1313 2471 _cleanup_close_ int fd = -EBADF;
3652872a
LP
2472
2473 j = path_join(q, arg_credentials[i].id);
2474 if (!j)
2475 return log_oom();
2476
2477 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2478 if (fd < 0)
2479 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2480
e22c60a9 2481 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size);
3652872a
LP
2482 if (r < 0)
2483 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2484
2485 if (fchmod(fd, 0400) < 0)
2486 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2487
2488 if (arg_userns_mode != USER_NAMESPACE_NO) {
2489 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2490 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2491 }
2492 }
2493
2494 if (chmod(q, 0500) < 0)
2495 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2496
2497 r = userns_lchown(q, 0, 0);
2498 if (r < 0)
2499 return r;
2500
2501 /* Make both mount and superblock read-only now */
511a8cfe 2502 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
3652872a
LP
2503 if (r < 0)
2504 return r;
2505
511a8cfe 2506 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
3652872a
LP
2507}
2508
5d9d3fcb 2509static int setup_kmsg(int fd_inner_socket) {
9ec5a93c
LP
2510 _cleanup_(unlink_and_freep) char *from = NULL;
2511 _cleanup_free_ char *fifo = NULL;
254d1313 2512 _cleanup_close_ int fd = -EBADF;
9ec5a93c 2513 int r;
e58a1277 2514
5d9d3fcb 2515 assert(fd_inner_socket >= 0);
a258bf26 2516
52f05ef2 2517 BLOCK_WITH_UMASK(0000);
a258bf26 2518
30fd9a2d 2519 /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2520 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2521 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2522 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2523
1eacc470 2524 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2525 if (r < 0)
2526 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2527
9ec5a93c 2528 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2529 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2530
2531 from = TAKE_PTR(fifo);
9ec5a93c 2532
511a8cfe 2533 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2534 if (r < 0)
2535 return r;
e58a1277 2536
669fc4e5 2537 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2538 if (fd < 0)
2539 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2540
9ec5a93c 2541 /* Store away the fd in the socket, so that it stays open as long as we run the child */
5d9d3fcb 2542 r = send_one_fd(fd_inner_socket, fd, 0);
d9603714
DH
2543 if (r < 0)
2544 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2545
25ea79fe 2546 return 0;
88213476
LP
2547}
2548
/* Userdata bundle consumed by on_address_change(). */
struct ExposeArgs {
        union in_addr_union address4;   /* handed to expose_port_execute() for AF_INET */
        union in_addr_union address6;   /* handed to expose_port_execute() for AF_INET6 */
        struct FirewallContext *fw_ctx; /* firewall context passed to expose_port_execute() */
};
2554
1c4baffc 2555static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
99534007 2556 struct ExposeArgs *args = ASSERT_PTR(userdata);
6d0b55c2
LP
2557
2558 assert(rtnl);
2559 assert(m);
6d0b55c2 2560
fb9044cb
LP
2561 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2562 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
6d0b55c2
LP
2563 return 0;
2564}
2565
3a74cea5 2566static int setup_hostname(void) {
c818eef1 2567 int r;
3a74cea5 2568
0c582db0 2569 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2570 return 0;
2571
c818eef1
LP
2572 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2573 if (r < 0)
2574 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2575
7027ff61 2576 return 0;
3a74cea5
LP
2577}
2578
57fb9fb5 2579static int setup_journal(const char *directory) {
0f5e1382 2580 _cleanup_free_ char *d = NULL;
5980d463 2581 const char *p, *q;
b2238e38 2582 sd_id128_t this_id;
8054d749 2583 bool try;
57fb9fb5
LP
2584 int r;
2585
df9a75e4
LP
2586 /* Don't link journals in ephemeral mode */
2587 if (arg_ephemeral)
2588 return 0;
2589
8054d749
LP
2590 if (arg_link_journal == LINK_NO)
2591 return 0;
2592
2593 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2594
4d680aee 2595 r = sd_id128_get_machine(&this_id);
f647962d
MS
2596 if (r < 0)
2597 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2598
e01ff70a 2599 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2600 log_full(try ? LOG_WARNING : LOG_ERR,
85b55869 2601 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
8054d749 2602 if (try)
4d680aee 2603 return 0;
df9a75e4 2604 return -EEXIST;
4d680aee
ZJS
2605 }
2606
369ca6da
ZJS
2607 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2608 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2609 if (r < 0) {
2610 bool ignore = r == -EROFS && try;
2611 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2612 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2613 return ignore ? 0 : r;
2614 }
2615 }
03cfe0d5 2616
85b55869 2617 p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
03cfe0d5 2618 q = prefix_roota(directory, p);
27407a01 2619
e1873695 2620 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2621 if (try)
2622 return 0;
27407a01 2623
baaa35ad
ZJS
2624 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2625 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2626 }
2627
e1873695 2628 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2629 if (try)
2630 return 0;
57fb9fb5 2631
baaa35ad
ZJS
2632 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2633 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2634 }
2635
2636 r = readlink_and_make_absolute(p, &d);
2637 if (r >= 0) {
3742095b 2638 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2639 path_equal(d, q)) {
2640
03cfe0d5 2641 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2642 if (r < 0)
709f6e46 2643 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2644 return 0;
57fb9fb5
LP
2645 }
2646
4a62c710
MS
2647 if (unlink(p) < 0)
2648 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2649 } else if (r == -EINVAL) {
2650
2651 if (arg_link_journal == LINK_GUEST &&
2652 rmdir(p) < 0) {
2653
27407a01
ZJS
2654 if (errno == ENOTDIR) {
2655 log_error("%s already exists and is neither a symlink nor a directory", p);
2656 return r;
4314d33f
MS
2657 } else
2658 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2659 }
4314d33f
MS
2660 } else if (r != -ENOENT)
2661 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2662
2663 if (arg_link_journal == LINK_GUEST) {
2664
2665 if (symlink(q, p) < 0) {
8054d749 2666 if (try) {
56f64d95 2667 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2668 return 0;
4314d33f
MS
2669 } else
2670 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2671 }
2672
03cfe0d5 2673 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2674 if (r < 0)
709f6e46 2675 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2676 return 0;
57fb9fb5
LP
2677 }
2678
2679 if (arg_link_journal == LINK_HOST) {
ccddd104 2680 /* don't create parents here — if the host doesn't have
574edc90 2681 * permanent journal set up, don't force it here */
ba8e6c4d 2682
3f692e2e 2683 r = RET_NERRNO(mkdir(p, 0755));
dae8b82e 2684 if (r < 0 && r != -EEXIST) {
8054d749 2685 if (try) {
dae8b82e 2686 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2687 return 0;
4314d33f 2688 } else
dae8b82e 2689 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2690 }
2691
27407a01
ZJS
2692 } else if (access(p, F_OK) < 0)
2693 return 0;
57fb9fb5 2694
db55bbf2 2695 if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
cdb2b9d0
LP
2696 log_warning("%s is not empty, proceeding anyway.", q);
2697
03cfe0d5 2698 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2699 if (r < 0)
2700 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2701
511a8cfe 2702 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
60e76d48 2703 if (r < 0)
4a62c710 2704 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2705
27407a01 2706 return 0;
57fb9fb5
LP
2707}
2708
de40a303
LP
2709static int drop_capabilities(uid_t uid) {
2710 CapabilityQuintet q;
2711
2712 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2713 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2714 * arg_caps_retain. */
2715
2716 if (capability_quintet_is_set(&arg_full_capabilities)) {
2717 q = arg_full_capabilities;
2718
f5fbe71d 2719 if (q.bounding == UINT64_MAX)
de40a303
LP
2720 q.bounding = uid == 0 ? arg_caps_retain : 0;
2721
f5fbe71d 2722 if (q.effective == UINT64_MAX)
de40a303
LP
2723 q.effective = uid == 0 ? q.bounding : 0;
2724
f5fbe71d 2725 if (q.inheritable == UINT64_MAX)
88fc9c9b 2726 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2727
f5fbe71d 2728 if (q.permitted == UINT64_MAX)
88fc9c9b 2729 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2730
f5fbe71d 2731 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
88fc9c9b 2732 q.ambient = arg_caps_ambient;
f66ad460
AZ
2733
2734 if (capability_quintet_mangle(&q))
2735 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2736
2737 } else {
de40a303
LP
2738 q = (CapabilityQuintet) {
2739 .bounding = arg_caps_retain,
2740 .effective = uid == 0 ? arg_caps_retain : 0,
88fc9c9b
TH
2741 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2742 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
f5fbe71d 2743 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
de40a303
LP
2744 };
2745
f66ad460
AZ
2746 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2747 * in order to maintain the same behavior as systemd < 242. */
2748 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2749 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2750 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2751
2752 }
2753
de40a303 2754 return capability_quintet_enforce(&q);
88213476
LP
2755}
2756
db999e0f
LP
2757static int reset_audit_loginuid(void) {
2758 _cleanup_free_ char *p = NULL;
2759 int r;
2760
0c582db0 2761 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2762 return 0;
2763
2764 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2765 if (r == -ENOENT)
db999e0f 2766 return 0;
f647962d
MS
2767 if (r < 0)
2768 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2769
2770 /* Already reset? */
2771 if (streq(p, "4294967295"))
2772 return 0;
2773
57512c89 2774 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2775 if (r < 0) {
10a87006
LP
2776 log_error_errno(r,
2777 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2778 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2779 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2780 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2781 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2782
db999e0f 2783 sleep(5);
77b6e194 2784 }
db999e0f
LP
2785
2786 return 0;
77b6e194
LP
2787}
2788
e79581dd 2789static int mount_tunnel_dig(const char *root) {
785890ac 2790 const char *p, *q;
709f6e46 2791 int r;
785890ac
LP
2792
2793 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2794 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2795 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2796 (void) mkdir_p(p, 0600);
2797
5a27b395 2798 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
709f6e46 2799 if (r < 0)
5a27b395 2800 return log_error_errno(r, "Failed to create /run/host: %m");
03cfe0d5 2801
e79581dd 2802 r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0);
709f6e46 2803 if (r < 0)
e79581dd 2804 return log_error_errno(r, "Failed to create "NSPAWN_MOUNT_TUNNEL": %m");
03cfe0d5 2805
e79581dd 2806 q = prefix_roota(root, NSPAWN_MOUNT_TUNNEL);
511a8cfe 2807 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
60e76d48
ZJS
2808 if (r < 0)
2809 return r;
785890ac 2810
511a8cfe 2811 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
60e76d48
ZJS
2812 if (r < 0)
2813 return r;
785890ac 2814
e79581dd
CB
2815 return 0;
2816}
2817
2818static int mount_tunnel_open(void) {
2819 int r;
2820
2821 r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
2822 if (r < 0)
2823 return r;
2824
2825 return 0;
785890ac
LP
2826}
2827
317feb4d 2828static int setup_machine_id(const char *directory) {
3bbaff3e 2829 int r;
e01ff70a 2830
317feb4d
LP
2831 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2832 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2833 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2834 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2835 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2836 * container behaves nicely). */
2837
319477f1 2838 r = id128_get_machine(directory, &arg_uuid);
bb44fd07
ZJS
2839 if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) {
2840 /* If the file is missing, empty, or uninitialized, we don't mind */
317feb4d
LP
2841 if (sd_id128_is_null(arg_uuid)) {
2842 r = sd_id128_randomize(&arg_uuid);
2843 if (r < 0)
2844 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2845 }
bb44fd07
ZJS
2846 } else if (r < 0)
2847 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2848
e01ff70a
MS
2849 return 0;
2850}
2851
7336138e
LP
2852static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2853 int r;
2854
2855 assert(directory);
2856
6c045a99 2857 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
7336138e
LP
2858 return 0;
2859
2860 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2861 if (r == -EOPNOTSUPP)
2862 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2863 if (r == -EBADE)
2864 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2865 if (r < 0)
2866 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2867 if (r == 0)
2868 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2869 else
2870 log_debug("Patched directory tree to match UID/GID range.");
2871
2872 return r;
2873}
2874
113cea80 2875/*
6d416b9c
LS
2876 * Return values:
2877 * < 0 : wait_for_terminate() failed to get the state of the
2878 * container, the container was terminated by a signal, or
2879 * failed for an unknown reason. No change is made to the
2880 * container argument.
2881 * > 0 : The program executed in the container terminated with an
2882 * error. The exit code of the program executed in the
919699ec
LP
2883 * container is returned. The container argument has been set
2884 * to CONTAINER_TERMINATED.
6d416b9c
LS
2885 * 0 : The container is being rebooted, has been shut down or exited
2886 * successfully. The container argument has been set to either
2887 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2888 *
6d416b9c
LS
2889 * That is, success is indicated by a return value of zero, and an
2890 * error is indicated by a non-zero value.
113cea80
DH
2891 */
2892static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2893 siginfo_t status;
919699ec 2894 int r;
113cea80
DH
2895
2896 r = wait_for_terminate(pid, &status);
f647962d
MS
2897 if (r < 0)
2898 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2899
2900 switch (status.si_code) {
fddbb89c 2901
113cea80 2902 case CLD_EXITED:
b5a2179b 2903 if (status.si_status == 0)
919699ec 2904 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2905 else
919699ec 2906 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2907
919699ec
LP
2908 *container = CONTAINER_TERMINATED;
2909 return status.si_status;
113cea80
DH
2910
2911 case CLD_KILLED:
2912 if (status.si_status == SIGINT) {
919699ec 2913 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2914 *container = CONTAINER_TERMINATED;
919699ec
LP
2915 return 0;
2916
113cea80 2917 } else if (status.si_status == SIGHUP) {
919699ec 2918 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2919 *container = CONTAINER_REBOOTED;
919699ec 2920 return 0;
113cea80 2921 }
919699ec 2922
4831981d 2923 _fallthrough_;
113cea80 2924 case CLD_DUMPED:
baaa35ad
ZJS
2925 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2926 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2927
2928 default:
baaa35ad
ZJS
2929 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2930 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2931 }
113cea80
DH
2932}
2933
023fb90b
LP
2934static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2935 pid_t pid;
2936
4a0b58c4 2937 pid = PTR_TO_PID(userdata);
023fb90b 2938 if (pid > 0) {
c6c8f6e2 2939 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2940 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2941 sd_event_source_set_userdata(s, NULL);
2942 return 0;
2943 }
2944 }
2945
2946 sd_event_exit(sd_event_source_get_event(s), 0);
2947 return 0;
2948}
2949
6916b164 2950static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2951 pid_t pid;
2952
2953 assert(s);
2954 assert(ssi);
2955
2956 pid = PTR_TO_PID(userdata);
2957
6916b164
AU
2958 for (;;) {
2959 siginfo_t si = {};
abdb9b08 2960
6916b164
AU
2961 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2962 return log_error_errno(errno, "Failed to waitid(): %m");
2963 if (si.si_pid == 0) /* No pending children. */
2964 break;
abdb9b08 2965 if (si.si_pid == pid) {
6916b164
AU
2966 /* The main process we care for has exited. Return from
2967 * signal handler but leave the zombie. */
2968 sd_event_exit(sd_event_source_get_event(s), 0);
2969 break;
2970 }
abdb9b08 2971
6916b164
AU
2972 /* Reap all other children. */
2973 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2974 }
2975
2976 return 0;
2977}
2978
abdb9b08
LP
2979static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2980 pid_t pid;
2981
2982 assert(m);
2983
2984 pid = PTR_TO_PID(userdata);
2985
2986 if (arg_kill_signal > 0) {
2987 log_info("Container termination requested. Attempting to halt container.");
2988 (void) kill(pid, arg_kill_signal);
2989 } else {
2990 log_info("Container termination requested. Exiting.");
2991 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2992 }
2993
2994 return 0;
2995}
2996
ec16945e 2997static int determine_names(void) {
1b9cebf6 2998 int r;
ec16945e 2999
c1521918
LP
3000 if (arg_template && !arg_directory && arg_machine) {
3001
3002 /* If --template= was specified then we should not
3003 * search for a machine, but instead create a new one
3004 * in /var/lib/machine. */
3005
657ee2d8 3006 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
3007 if (!arg_directory)
3008 return log_oom();
3009 }
3010
ec16945e 3011 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3012 if (arg_machine) {
3013 _cleanup_(image_unrefp) Image *i = NULL;
3014
d577d4a4 3015 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3a6ce860
LP
3016 if (r == -ENOENT)
3017 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
3018 if (r < 0)
3019 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 3020
eb38edce 3021 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 3022 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 3023 else
0f03c2a4 3024 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 3025 if (r < 0)
0f3be6ca 3026 return log_oom();
1b9cebf6 3027
aee327b8
LP
3028 if (!arg_ephemeral)
3029 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
3030 } else {
3031 r = safe_getcwd(&arg_directory);
3032 if (r < 0)
3033 return log_error_errno(r, "Failed to determine current directory: %m");
3034 }
ec16945e 3035
c6147113
LP
3036 if (!arg_directory && !arg_image)
3037 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
3038 }
3039
3040 if (!arg_machine) {
b9ba4dab
LP
3041 if (arg_directory && path_equal(arg_directory, "/"))
3042 arg_machine = gethostname_malloc();
e9b88a6d
LP
3043 else if (arg_image) {
3044 char *e;
4827ab48 3045
b36e39d2
LP
3046 r = path_extract_filename(arg_image, &arg_machine);
3047 if (r < 0)
3048 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
4827ab48 3049
e9b88a6d
LP
3050 /* Truncate suffix if there is one */
3051 e = endswith(arg_machine, ".raw");
3052 if (e)
3053 *e = 0;
b36e39d2
LP
3054 } else {
3055 r = path_extract_filename(arg_directory, &arg_machine);
3056 if (r < 0)
3057 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory);
3058 }
ec16945e 3059
ae691c1d 3060 hostname_cleanup(arg_machine);
52ef5dd7 3061 if (!hostname_is_valid(arg_machine, 0))
c6147113 3062 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab 3063
3603f151
LB
3064 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3065 * to match fixed config file names. */
3066 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3067 if (!arg_settings_filename)
3068 return log_oom();
3069
e9b88a6d
LP
3070 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3071 * instances at once without manually having to specify -M each time. */
3072 if (arg_ephemeral)
3073 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
b9ba4dab 3074 return log_oom();
3603f151
LB
3075 } else {
3076 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3077 if (!arg_settings_filename)
3078 return log_oom();
ec16945e
LP
3079 }
3080
3081 return 0;
3082}
3083
/* Resolves the path stored in *p via chase(), using the specified flags, and replaces *p with the result.
 * A NULL *p is left alone. Returns a negative errno-style value if resolving fails. */
static int chase_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        /* Nothing configured, nothing to resolve. */
        if (!*p)
                return 0;

        r = chase(*p, NULL, flags, &resolved, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        return free_and_replace(*p, resolved);
}
3099
03cfe0d5 3100static int determine_uid_shift(const char *directory) {
6dac160c 3101
0de7acce 3102 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 3103 arg_uid_shift = 0;
6dac160c 3104 return 0;
03cfe0d5 3105 }
6dac160c
LP
3106
3107 if (arg_uid_shift == UID_INVALID) {
3108 struct stat st;
3109
993da6d4
LP
3110 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3111
3112 if (stat(directory, &st) < 0)
03cfe0d5 3113 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
3114
3115 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3116
baaa35ad
ZJS
3117 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3118 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3119 "UID and GID base of %s don't match.", directory);
6dac160c
LP
3120
3121 arg_uid_range = UINT32_C(0x10000);
f61c7f88
LP
3122
3123 if (arg_uid_shift != 0) {
3124 /* If the image is shifted already, then we'll fall back to classic chowning, for
3125 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3126
3127 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3128 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3129 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3130 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3131 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3132 "UID base of %s is not zero, UID mapping not supported.", directory);
3133 }
6dac160c
LP
3134 }
3135
58e13de5
LP
3136 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3137 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
6dac160c 3138
6dac160c
LP
3139 return 0;
3140}
3141
de40a303
LP
3142static unsigned long effective_clone_ns_flags(void) {
3143 unsigned long flags = arg_clone_ns_flags;
3144
3145 if (arg_private_network)
3146 flags |= CLONE_NEWNET;
3147 if (arg_use_cgns)
3148 flags |= CLONE_NEWCGROUP;
3149 if (arg_userns_mode != USER_NAMESPACE_NO)
3150 flags |= CLONE_NEWUSER;
3151
3152 return flags;
3153}
3154
3155static int patch_sysctl(void) {
3156
3157 /* This table is inspired by runc's sysctl() function */
3158 static const struct {
3159 const char *key;
3160 bool prefix;
3161 unsigned long clone_flags;
3162 } safe_sysctl[] = {
3163 { "kernel.hostname", false, CLONE_NEWUTS },
3164 { "kernel.domainname", false, CLONE_NEWUTS },
3165 { "kernel.msgmax", false, CLONE_NEWIPC },
3166 { "kernel.msgmnb", false, CLONE_NEWIPC },
3167 { "kernel.msgmni", false, CLONE_NEWIPC },
3168 { "kernel.sem", false, CLONE_NEWIPC },
3169 { "kernel.shmall", false, CLONE_NEWIPC },
3170 { "kernel.shmmax", false, CLONE_NEWIPC },
3171 { "kernel.shmmni", false, CLONE_NEWIPC },
3172 { "fs.mqueue.", true, CLONE_NEWIPC },
3173 { "net.", true, CLONE_NEWNET },
3174 };
3175
3176 unsigned long flags;
de40a303
LP
3177 int r;
3178
3179 flags = effective_clone_ns_flags();
3180
3181 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3182 bool good = false;
3183 size_t i;
3184
3185 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3186
3187 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3188 continue;
3189
3190 if (safe_sysctl[i].prefix)
3191 good = startswith(*k, safe_sysctl[i].key);
3192 else
3193 good = streq(*k, safe_sysctl[i].key);
3194
3195 if (good)
3196 break;
3197 }
3198
c6147113
LP
3199 if (!good)
3200 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
3201
3202 r = sysctl_write(*k, *v);
3203 if (r < 0)
3204 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3205 }
3206
3207 return 0;
3208}
3209
03cfe0d5
LP
3210static int inner_child(
3211 Barrier *barrier,
5d9d3fcb 3212 int fd_inner_socket,
e1bb4b0d
LB
3213 FDSet *fds,
3214 char **os_release_pairs) {
69c79d3c 3215
03cfe0d5 3216 _cleanup_free_ char *home = NULL;
88614c8a 3217 size_t n_env = 1;
4ab3d29f
ZJS
3218 char *envp[] = {
3219 (char*) "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 3220 NULL, /* container */
03cfe0d5
LP
3221 NULL, /* TERM */
3222 NULL, /* HOME */
3223 NULL, /* USER */
3224 NULL, /* LOGNAME */
3225 NULL, /* container_uuid */
3226 NULL, /* LISTEN_FDS */
3227 NULL, /* LISTEN_PID */
9c1e04d0 3228 NULL, /* NOTIFY_SOCKET */
3652872a 3229 NULL, /* CREDENTIALS_DIRECTORY */
b626f695 3230 NULL, /* LANG */
03cfe0d5
LP
3231 NULL
3232 };
1a68e1e5 3233 const char *exec_target;
2371271c 3234 _cleanup_strv_free_ char **env_use = NULL;
de40a303 3235 int r, which_failed;
88213476 3236
b37469d7
LP
3237 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3238 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3239 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3240 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3241 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3242 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3243 * namespace.
3244 *
3245 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3246 * unshare(). See below. */
3247
03cfe0d5 3248 assert(barrier);
5d9d3fcb 3249 assert(fd_inner_socket >= 0);
88213476 3250
de40a303
LP
3251 log_debug("Inner child is initializing.");
3252
0de7acce 3253 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3254 /* Tell the parent, that it now can write the UID map. */
3255 (void) barrier_place(barrier); /* #1 */
7027ff61 3256
03cfe0d5 3257 /* Wait until the parent wrote the UID map */
baaa35ad 3258 if (!barrier_place_and_sync(barrier)) /* #2 */
2a2e78e9 3259 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
88213476 3260
2a2e78e9
LP
3261 /* Become the new root user inside our namespace */
3262 r = reset_uid_gid();
3263 if (r < 0)
3264 return log_error_errno(r, "Couldn't become new root: %m");
3265
3266 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3267 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3268 * propagation, but simply create new peer groups for all our mounts). */
511a8cfe 3269 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
2a2e78e9
LP
3270 if (r < 0)
3271 return r;
3272 }
6d66bd3b 3273
0de7acce 3274 r = mount_all(NULL,
4f086aab 3275 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 3276 arg_uid_shift,
0de7acce 3277 arg_selinux_apifs_context);
03cfe0d5
LP
3278 if (r < 0)
3279 return r;
3280
04413780
ZJS
3281 if (!arg_network_namespace_path && arg_private_network) {
3282 r = unshare(CLONE_NEWNET);
3283 if (r < 0)
3284 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
3285
3286 /* Tell the parent that it can setup network interfaces. */
3287 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
3288 }
3289
4f086aab 3290 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
3291 if (r < 0)
3292 return r;
3293
03cfe0d5
LP
3294 /* Wait until we are cgroup-ified, so that we
3295 * can mount the right cgroup path writable */
baaa35ad
ZJS
3296 if (!barrier_place_and_sync(barrier)) /* #4 */
3297 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3298 "Parent died too early");
88213476 3299
489fae52 3300 if (arg_use_cgns) {
0996ef00
CB
3301 r = unshare(CLONE_NEWCGROUP);
3302 if (r < 0)
04413780 3303 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
3304 r = mount_cgroups(
3305 "",
3306 arg_unified_cgroup_hierarchy,
3307 arg_userns_mode != USER_NAMESPACE_NO,
3308 arg_uid_shift,
3309 arg_uid_range,
5a8ff0e6 3310 arg_selinux_apifs_context,
ada54120 3311 true);
1433e0f2 3312 } else
0996ef00 3313 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
1433e0f2
LP
3314 if (r < 0)
3315 return r;
ec16945e 3316
1e4f1671 3317 r = setup_boot_id();
03cfe0d5
LP
3318 if (r < 0)
3319 return r;
ec16945e 3320
5d9d3fcb 3321 r = setup_kmsg(fd_inner_socket);
03cfe0d5
LP
3322 if (r < 0)
3323 return r;
ec16945e 3324
de40a303
LP
3325 r = mount_custom(
3326 "/",
3327 arg_custom_mounts,
3328 arg_n_custom_mounts,
de40a303 3329 0,
c0c8f718 3330 0,
de40a303 3331 arg_selinux_apifs_context,
5f0a6347 3332 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
de40a303
LP
3333 if (r < 0)
3334 return r;
3335
03cfe0d5
LP
3336 if (setsid() < 0)
3337 return log_error_errno(errno, "setsid() failed: %m");
3338
3339 if (arg_private_network)
df883de9 3340 (void) loopback_setup();
03cfe0d5 3341
7a8f6325 3342 if (arg_expose_ports) {
b07ee903 3343 r = expose_port_send_rtnl(fd_inner_socket);
7a8f6325
LP
3344 if (r < 0)
3345 return r;
7a8f6325 3346 }
03cfe0d5 3347
3acc84eb 3348 if (arg_console_mode != CONSOLE_PIPE) {
5bb1d7fb 3349 _cleanup_close_ int master = -EBADF;
3acc84eb
FB
3350 _cleanup_free_ char *console = NULL;
3351
3352 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3353 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3354 if (master < 0)
dc98caea 3355 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3356
3357 r = setup_dev_console(console);
3358 if (r < 0)
105a1a36 3359 return log_error_errno(r, "Failed to set up /dev/console: %m");
3acc84eb 3360
bb1aa185 3361 r = send_one_fd(fd_inner_socket, master, 0);
3acc84eb
FB
3362 if (r < 0)
3363 return log_error_errno(r, "Failed to send master fd: %m");
3acc84eb
FB
3364
3365 r = setup_stdio_as_dev_console();
3366 if (r < 0)
3367 return r;
3368 }
3369
de40a303
LP
3370 r = patch_sysctl();
3371 if (r < 0)
3372 return r;
3373
81f345df
LP
3374 if (arg_oom_score_adjust_set) {
3375 r = set_oom_score_adjust(arg_oom_score_adjust);
3376 if (r < 0)
3377 return log_error_errno(r, "Failed to adjust OOM score: %m");
3378 }
3379
0985c7c4
ZJS
3380 if (arg_cpu_set.set)
3381 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3382 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3383
c818eef1 3384 (void) setup_hostname();
03cfe0d5 3385
050f7277 3386 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3387 r = safe_personality(arg_personality);
3388 if (r < 0)
3389 return log_error_errno(r, "personality() failed: %m");
4c27749b
LP
3390#ifdef ARCHITECTURE_SECONDARY
3391 } else if (arg_architecture == ARCHITECTURE_SECONDARY) {
21022b9d
LP
3392 r = safe_personality(PER_LINUX32);
3393 if (r < 0)
3394 return log_error_errno(r, "personality() failed: %m");
4c27749b 3395#endif
af262e5f
LB
3396 } else if (!arg_quiet && arg_architecture >= 0 && arg_architecture != native_architecture())
3397 log_notice("Selected architecture '%s' not supported natively on the local CPU, assuming "
3398 "invocation with qemu userspace emulator (or equivalent) in effect.",
3399 architecture_to_string(arg_architecture));
03cfe0d5 3400
de40a303
LP
3401 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3402 if (r < 0)
3403 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3404
3405#if HAVE_SECCOMP
3406 if (arg_seccomp) {
3407
3408 if (is_seccomp_available()) {
de40a303 3409 r = seccomp_load(arg_seccomp);
3c098014
ZJS
3410 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
3411 return log_error_errno(r, "Failed to install seccomp filter: %m");
3412 if (r < 0)
de40a303
LP
3413 log_debug_errno(r, "Failed to install seccomp filter: %m");
3414 }
3415 } else
3416#endif
3417 {
6b000af4 3418 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
de40a303
LP
3419 if (r < 0)
3420 return r;
3421 }
3422
4a4654e0 3423 if (arg_suppress_sync) {
20e458ae 3424#if HAVE_SECCOMP
4a4654e0
LP
3425 r = seccomp_suppress_sync();
3426 if (r < 0)
3427 log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
20e458ae 3428#else
2db32618 3429 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
20e458ae 3430#endif
4a4654e0
LP
3431 }
3432
349cc4a5 3433#if HAVE_SELINUX
03cfe0d5 3434 if (arg_selinux_context)
2ed96880 3435 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3436 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3437#endif
3438
de40a303
LP
3439 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3440 * if we need to later on. */
3441 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3442 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3443
3444 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3462d773 3445 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
de40a303 3446 else
3462d773 3447 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
03cfe0d5
LP
3448 if (r < 0)
3449 return r;
3450
de40a303
LP
3451 r = drop_capabilities(getuid());
3452 if (r < 0)
3453 return log_error_errno(r, "Dropping capabilities failed: %m");
3454
66edd963
LP
3455 if (arg_no_new_privileges)
3456 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3457 return log_error_errno(errno, "Failed to disable new privileges: %m");
3458
6aadfa4c
ILG
3459 /* LXC sets container=lxc, so follow the scheme here */
3460 envp[n_env++] = strjoina("container=", arg_container_service_name);
3461
03cfe0d5
LP
3462 envp[n_env] = strv_find_prefix(environ, "TERM=");
3463 if (envp[n_env])
313cefa1 3464 n_env++;
03cfe0d5 3465
de40a303 3466 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
4ab3d29f 3467 if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
de40a303
LP
3468 return log_oom();
3469
3470 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
4ab3d29f 3471 if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
1da3cb81 3472 asprintf(envp + n_env++, "LOGNAME=%s", arg_user ?: "root") < 0)
de40a303 3473 return log_oom();
03cfe0d5 3474
3bbaff3e 3475 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3476
b7416360 3477 if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
e01ff70a 3478 return log_oom();
03cfe0d5
LP
3479
3480 if (fdset_size(fds) > 0) {
3481 r = fdset_cloexec(fds, false);
3482 if (r < 0)
3483 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3484
4ab3d29f
ZJS
3485 if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3486 (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
03cfe0d5
LP
3487 return log_oom();
3488 }
4ab3d29f 3489 if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
9c1e04d0 3490 return log_oom();
03cfe0d5 3491
3652872a
LP
3492 if (arg_n_credentials > 0) {
3493 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3494 if (!envp[n_env])
3495 return log_oom();
3496 n_env++;
3497 }
3498
b626f695 3499 if (arg_start_mode != START_BOOT) {
a22f5186 3500 envp[n_env] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE);
b626f695
DDM
3501 if (!envp[n_env])
3502 return log_oom();
3503 n_env++;
3504 }
3505
4ab3d29f 3506 env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
2371271c
TG
3507 if (!env_use)
3508 return log_oom();
03cfe0d5
LP
3509
3510 /* Let the parent know that we are ready and
3511 * wait until the parent is ready with the
3512 * setup, too... */
baaa35ad 3513 if (!barrier_place_and_sync(barrier)) /* #5 */
335d2ead 3514 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
03cfe0d5 3515
5f932eb9
LP
3516 if (arg_chdir)
3517 if (chdir(arg_chdir) < 0)
3518 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3519
7732f92b 3520 if (arg_start_mode == START_PID2) {
75bf701f 3521 r = stub_pid1(arg_uuid);
7732f92b
LP
3522 if (r < 0)
3523 return r;
3524 }
3525
335d2ead
LP
3526 if (arg_console_mode != CONSOLE_PIPE) {
3527 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3528 * are configured for that. Acquire it as controlling tty. */
3529 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3530 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3531 }
3532
de40a303
LP
3533 log_debug("Inner child completed, invoking payload.");
3534
8ca082b4
LP
3535 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3536 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3537 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3538 log_close();
8ca082b4 3539 log_set_open_when_needed(true);
a3b00f91 3540 log_settle_target();
8ca082b4 3541
03cfe0d5
LP
3542 (void) fdset_close_others(fds);
3543
7732f92b 3544 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3545 char **a;
3546 size_t m;
3547
3548 /* Automatically search for the init system */
3549
75f32f04
ZJS
3550 m = strv_length(arg_parameters);
3551 a = newa(char*, m + 2);
3552 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3553 a[1 + m] = NULL;
03cfe0d5 3554
a5096641
LP
3555 FOREACH_STRING(init,
3556 "/usr/lib/systemd/systemd",
3557 "/lib/systemd/systemd",
3558 "/sbin/init") {
3559 a[0] = (char*) init;
3560 execve(a[0], a, env_use);
3561 }
ced58da7
LP
3562
3563 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3564 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3565 const char *dollar_path;
3566
1a68e1e5 3567 exec_target = arg_parameters[0];
b6b180b7
LP
3568
3569 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3570 * binary. */
3571 dollar_path = strv_env_get(env_use, "PATH");
3572 if (dollar_path) {
6f646e01 3573 if (setenv("PATH", dollar_path, 1) < 0)
b6b180b7
LP
3574 return log_error_errno(errno, "Failed to update $PATH: %m");
3575 }
3576
f757855e 3577 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3578 } else {
5f932eb9 3579 if (!arg_chdir)
d929b0f9
ZJS
3580 /* If we cannot change the directory, we'll end up in /, that is expected. */
3581 (void) chdir(home ?: "/root");
5f932eb9 3582
53350c7b 3583 execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
3584 if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
3585 execle("/bin/bash", "-bash", NULL, env_use);
3586 if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
3587 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7 3588
53350c7b 3589 exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
03cfe0d5
LP
3590 }
3591
8ca082b4 3592 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3593}
3594
e96ceaba 3595static int setup_notify_child(void) {
254d1313 3596 _cleanup_close_ int fd = -EBADF;
1eb874b9 3597 static const union sockaddr_union sa = {
44ed5214
LP
3598 .un.sun_family = AF_UNIX,
3599 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3600 };
3601 int r;
3602
3603 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3604 if (fd < 0)
3605 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3606
3607 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3608 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3609
9c1e04d0 3610 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3611 if (r < 0)
44ed5214 3612 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3613
adc7d9f0 3614 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3615 if (r < 0)
adc7d9f0 3616 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3617
2ff48e98 3618 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3619 if (r < 0)
2ff48e98 3620 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3621
271f518f 3622 return TAKE_FD(fd);
9c1e04d0
AP
3623}
3624
03cfe0d5
LP
/* Body of the "outer" child process: prepares the container's file system tree at 'directory', then forks
 * the "inner" child which runs inner_child().
 *
 * Parameters:
 *   barrier          - handed through unchanged to inner_child()
 *   directory        - root of the tree to set up; replaced by a bind mount below /run/systemd/ if it is "/"
 *   dissected_image  - optional (may be NULL); if set, its file systems are mounted onto 'directory'
 *   fd_outer_socket  - socket towards the parent; carries the mount namespace fd, the UID shift, bind-user
 *                      UID/GID maps, the cgroup mode, the inner child's PID, the machine ID and the notify fd
 *   fd_inner_socket  - handed through to inner_child(); closed here once the inner child has been forked
 *   fds              - handed through unchanged to inner_child()
 *   netns_fd         - network namespace the inner child joins if arg_network_namespace_path is set
 *
 * Returns 0 on success, a negative errno-style value on failure. The forked inner child normally leaves
 * this function via _exit(). */
static int outer_child(
                Barrier *barrier,
                const char *directory,
                DissectedImage *dissected_image,
                int fd_outer_socket,
                int fd_inner_socket,
                FDSet *fds,
                int netns_fd) {

        _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
        _cleanup_strv_free_ char **os_release_pairs = NULL;
        _cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
        bool idmap = false;
        const char *p;
        pid_t pid;
        ssize_t l;
        int r;

        /* This is the "outer" child process, i.e. the one forked off by the container manager itself. It
         * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
         * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
         * namespaces. After it completed a number of initializations a second child (the "inner" one) is
         * forked off it, and it exits. */

        assert(barrier);
        assert(directory);
        assert(fd_outer_socket >= 0);
        assert(fd_inner_socket >= 0);

        log_debug("Outer child is initializing.");

        /* Best effort: a failure here is only logged at debug level. The resulting pairs are handed to
         * inner_child() further down. */
        r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
        if (r < 0)
                log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");

        if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
                return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");

        r = reset_audit_loginuid();
        if (r < 0)
                return r;

        /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
         * mounts to the real root. */
        r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
        if (r < 0)
                return r;

        if (dissected_image) {
                /* If we are operating on a disk image, then mount its root directory now, but leave out the
                 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
                 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
                 * right place right away. This makes sure ESP partitions and userns are compatible. */

                r = dissected_image_mount_and_warn(
                                dissected_image,
                                directory,
                                arg_uid_shift,
                                arg_uid_range,
                                /* userns_fd= */ -EBADF,
                                DISSECT_IMAGE_MOUNT_ROOT_ONLY|
                                DISSECT_IMAGE_DISCARD_ON_LOOP|
                                DISSECT_IMAGE_USR_NO_ROOT|
                                (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
                                (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
                if (r < 0)
                        return r;
        }

        r = determine_uid_shift(directory);
        if (r < 0)
                return r;

        if (arg_userns_mode != USER_NAMESPACE_NO) {
                /* Pin our mount namespace and hand the fd to the parent. */
                r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to pin outer mount namespace: %m");

                l = send_one_fd(fd_outer_socket, mntns_fd, 0);
                if (l < 0)
                        return log_error_errno(l, "Failed to send outer mount namespace fd: %m");
                mntns_fd = safe_close(mntns_fd);

                /* Let the parent know which UID shift we read from the image */
                l = send(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
                if (l < 0)
                        return log_error_errno(errno, "Failed to send UID shift: %m");
                if (l != sizeof(arg_uid_shift))
                        return log_error_errno(SYNTHETIC_ERRNO(EIO),
                                               "Short write while sending UID shift.");

                if (arg_userns_mode == USER_NAMESPACE_PICK) {
                        /* When we are supposed to pick the UID shift, the parent will check now whether the
                         * UID shift we just read from the image is available. If yes, it will send the UID
                         * shift back to us, if not it will pick a different one, and send it back to us. */

                        l = recv(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
                        if (l < 0)
                                return log_error_errno(errno, "Failed to recv UID shift: %m");
                        if (l != sizeof(arg_uid_shift))
                                return log_error_errno(SYNTHETIC_ERRNO(EIO),
                                                       "Short read while receiving UID shift.");
                }

                log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
                         "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
        }

        if (path_equal(directory, "/")) {
                /* If the directory we shall boot is the host, let's operate on a bind mount at a different
                 * place, so that we can make changes to its mount structure (for example, to implement
                 * --volatile=) without this interfering with our ability to access files such as
                 * /etc/localtime to copy into the container. Note that we use a fixed place for this
                 * (instead of a temporary directory, since we are living in our own mount namespace here
                 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
                (void) mkdir_p("/run/systemd/nspawn-root", 0755);

                r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
                if (r < 0)
                        return r;

                directory = "/run/systemd/nspawn-root";
        }

        /* Make sure we always have a mount that we can move to root later on. */
        r = make_mount_point(directory);
        if (r < 0)
                return r;

        /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
         * mount namespace. For the directory we are going to run our container let's turn this off, so that
         * we'll live in our own little world from now on, and propagation from the host may only happen via
         * the mount tunnel dir, or not at all. */
        r = mount_follow_verbose(LOG_ERR, NULL, directory, NULL, MS_PRIVATE|MS_REC, NULL);
        if (r < 0)
                return r;

        r = setup_pivot_root(
                        directory,
                        arg_pivot_root_new,
                        arg_pivot_root_old);
        if (r < 0)
                return r;

        r = setup_volatile_mode(
                        directory,
                        arg_volatile_mode,
                        arg_uid_shift,
                        arg_selinux_apifs_context);
        if (r < 0)
                return r;

        /* Note that this may modify the global custom mount list, which is passed by reference. */
        r = bind_user_prepare(
                        directory,
                        arg_bind_user,
                        arg_uid_shift,
                        arg_uid_range,
                        &arg_custom_mounts, &arg_n_custom_mounts,
                        &bind_user_context);
        if (r < 0)
                return r;

        if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
                /* Send the user maps we determined to the parent, so that it installs it in our user
                 * namespace UID map table */

                for (size_t i = 0; i < bind_user_context->n_data; i++) {
                        /* One quadruplet per bound user: payload UID, host UID, payload GID, host GID. */
                        uid_t map[] = {
                                bind_user_context->data[i].payload_user->uid,
                                bind_user_context->data[i].host_user->uid,
                                (uid_t) bind_user_context->data[i].payload_group->gid,
                                (uid_t) bind_user_context->data[i].host_group->gid,
                        };

                        l = send(fd_outer_socket, map, sizeof(map), MSG_NOSIGNAL);
                        if (l < 0)
                                return log_error_errno(errno, "Failed to send user UID map: %m");
                        if (l != sizeof(map))
                                return log_error_errno(SYNTHETIC_ERRNO(EIO),
                                                       "Short write while sending user UID map.");
                }
        }

        /* First pass over the custom mounts (MOUNT_ROOT_ONLY); the MOUNT_NON_ROOT_ONLY pass follows
         * further down. */
        r = mount_custom(
                        directory,
                        arg_custom_mounts,
                        arg_n_custom_mounts,
                        arg_uid_shift,
                        arg_uid_range,
                        arg_selinux_apifs_context,
                        MOUNT_ROOT_ONLY);
        if (r < 0)
                return r;

        if (arg_userns_mode != USER_NAMESPACE_NO &&
            IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
            arg_uid_shift != 0) {

                r = remount_idmap(directory, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
                if (r == -EINVAL || ERRNO_IS_NEG_NOT_SUPPORTED(r)) {
                        /* This might fail because the kernel or file system doesn't support idmapping. We
                         * can't really distinguish this nicely, nor do we have any guarantees about the
                         * error codes we see, could be EOPNOTSUPP or EINVAL. */
                        if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
                                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
                                                       "ID mapped mounts are apparently not available, sorry.");

                        log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
                        arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
                } else if (r < 0)
                        return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
                else {
                        log_debug("ID mapped mounts available, making use of them.");
                        idmap = true;
                }
        }

        if (dissected_image) {
                /* Now we know the uid shift, let's now mount everything else that might be in the image. */
                r = dissected_image_mount(
                                dissected_image,
                                directory,
                                arg_uid_shift,
                                arg_uid_range,
                                /* userns_fd= */ -EBADF,
                                DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
                                DISSECT_IMAGE_DISCARD_ON_LOOP|
                                DISSECT_IMAGE_USR_NO_ROOT|
                                (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
                                (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
                if (r == -EUCLEAN)
                        return log_error_errno(r, "File system check for image failed: %m");
                if (r < 0)
                        return log_error_errno(r, "Failed to mount image file system: %m");
        }

        if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
                /* OK, we don't know yet which cgroup mode to use. Let's figure it out, and tell the parent. */

                r = detect_unified_cgroup_hierarchy_from_image(directory);
                if (r < 0)
                        return r;

                l = send(fd_outer_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
                if (l < 0)
                        return log_error_errno(errno, "Failed to send cgroup mode: %m");
                if (l != sizeof(arg_unified_cgroup_hierarchy))
                        return log_error_errno(SYNTHETIC_ERRNO(EIO),
                                               "Short write while sending cgroup mode.");
        }

        r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
        if (r < 0)
                return r;

        r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
        if (r < 0)
                return r;

        /* Only remount the tree read-only here if neither a volatile mode nor a custom root mount is in
         * effect. */
        if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
            !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
                r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to make tree read-only: %m");
        }

        r = mount_all(directory,
                      arg_mount_settings,
                      arg_uid_shift,
                      arg_selinux_apifs_context);
        if (r < 0)
                return r;

        r = copy_devnodes(directory);
        if (r < 0)
                return r;

        r = make_extra_nodes(directory);
        if (r < 0)
                return r;

        /* The next two calls are best effort, their results are deliberately ignored. */
        (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);

        p = prefix_roota(directory, "/run/host");
        (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);

        r = setup_pts(directory);
        if (r < 0)
                return r;

        r = mount_tunnel_dig(directory);
        if (r < 0)
                return r;

        r = setup_keyring();
        if (r < 0)
                return r;

        r = setup_credentials(directory);
        if (r < 0)
                return r;

        r = bind_user_setup(bind_user_context, directory);
        if (r < 0)
                return r;

        /* Second pass over the custom mounts, this time with MOUNT_NON_ROOT_ONLY. */
        r = mount_custom(
                        directory,
                        arg_custom_mounts,
                        arg_n_custom_mounts,
                        arg_uid_shift,
                        arg_uid_range,
                        arg_selinux_apifs_context,
                        MOUNT_NON_ROOT_ONLY);
        if (r < 0)
                return r;

        r = setup_timezone(directory);
        if (r < 0)
                return r;

        r = setup_resolv_conf(directory);
        if (r < 0)
                return r;

        r = setup_machine_id(directory);
        if (r < 0)
                return r;

        r = setup_journal(directory);
        if (r < 0)
                return r;

        /* The same stuff as the $container env var, but nicely readable for the entire payload */
        p = prefix_roota(directory, "/run/host/container-manager");
        (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);

        /* The same stuff as the $container_uuid env var */
        p = prefix_roota(directory, "/run/host/container-uuid");
        (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));

        /* Without a cgroup namespace the cgroup hierarchy is mounted from here, relative to 'directory'. */
        if (!arg_use_cgns) {
                r = mount_cgroups(
                                directory,
                                arg_unified_cgroup_hierarchy,
                                arg_userns_mode != USER_NAMESPACE_NO,
                                arg_uid_shift,
                                arg_uid_range,
                                arg_selinux_apifs_context,
                                false);
                if (r < 0)
                        return r;
        }

        /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
         * mounts available in systemd services inside the container that create a new mount namespace. See
         * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
         * will inherit the shared propagation mode.
         *
         * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
         * directory mount to root later on.
         * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
         */
        r = mount_switch_root(directory, MS_SHARED);
        if (r < 0)
                return log_error_errno(r, "Failed to move root directory: %m");

        /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
         * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
         * the container. */
        r = mount_tunnel_open();
        if (r < 0)
                return r;

        if (arg_userns_mode != USER_NAMESPACE_NO) {
                /* In order to mount procfs and sysfs in an unprivileged container the kernel
                 * requires that a fully visible instance is already present in the target mount
                 * namespace. Mount one here so the inner child can mount its own instances. Later
                 * we umount the temporary instances created here before we actually exec the
                 * payload. Since the rootfs is shared the umount will propagate into the container.
                 * Note, the inner child wouldn't be able to unmount the instances on its own since
                 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
                 * this. */
                r = pin_fully_visible_fs();
                if (r < 0)
                        return r;
        }

        /* Create the notification socket now; its fd is passed to the parent after the fork below. */
        fd = setup_notify_child();
        if (fd < 0)
                return fd;

        pid = raw_clone(SIGCHLD|CLONE_NEWNS|
                        arg_clone_ns_flags |
                        (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
        if (pid < 0)
                return log_error_errno(errno, "Failed to fork inner child: %m");
        if (pid == 0) {
                /* The inner child only talks over fd_inner_socket, hence drop the outer one. */
                fd_outer_socket = safe_close(fd_outer_socket);

                /* The inner child has all namespaces that are requested, so that we all are owned by the
                 * user if user namespaces are turned on. */

                if (arg_network_namespace_path) {
                        r = namespace_enter(-1, -1, netns_fd, -1, -1);
                        if (r < 0)
                                /* NOTE(review): this returns into the caller from within the forked
                                 * child instead of calling _exit(); presumably the caller terminates
                                 * the process on a negative return, confirm. */
                                return log_error_errno(r, "Failed to join network namespace: %m");
                }

                r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs);
                if (r < 0)
                        _exit(EXIT_FAILURE);

                _exit(EXIT_SUCCESS);
        }

        /* Back in the outer child: report the inner child's PID, the machine ID and the notify socket fd
         * to the parent, in this order. */
        l = send(fd_outer_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
        if (l < 0)
                return log_error_errno(errno, "Failed to send PID: %m");
        if (l != sizeof(pid))
                return log_error_errno(SYNTHETIC_ERRNO(EIO),
                                       "Short write while sending PID.");

        l = send(fd_outer_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
        if (l < 0)
                return log_error_errno(errno, "Failed to send machine ID: %m");
        if (l != sizeof(arg_uuid))
                return log_error_errno(SYNTHETIC_ERRNO(EIO),
                                       "Short write while sending machine ID.");

        l = send_one_fd(fd_outer_socket, fd, 0);
        if (l < 0)
                return log_error_errno(l, "Failed to send notify fd: %m");

        /* Everything has been handed over, release the descriptors passed in by the caller. */
        fd_outer_socket = safe_close(fd_outer_socket);
        fd_inner_socket = safe_close(fd_inner_socket);
        netns_fd = safe_close(netns_fd);

        return 0;
}
4065
0e7ac751 4066static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 4067 bool tried_hashed = false;
0e7ac751
LP
4068 unsigned n_tries = 100;
4069 uid_t candidate;
4070 int r;
4071
4072 assert(shift);
4073 assert(ret_lock_file);
0de7acce 4074 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
4075 assert(arg_uid_range == 0x10000U);
4076
4077 candidate = *shift;
4078
4079 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4080
4081 for (;;) {
fbd0b64f 4082 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 4083 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
4084
4085 if (--n_tries <= 0)
4086 return -EBUSY;
4087
87d5e4f2 4088 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
4089 goto next;
4090 if ((candidate & UINT32_C(0xFFFF)) != 0)
4091 goto next;
4092
4093 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4094 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4095 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4096 goto next;
4097 if (r < 0)
4098 return r;
4099
4100 /* Make some superficial checks whether the range is currently known in the user database */
4101 if (getpwuid(candidate))
4102 goto next;
4103 if (getpwuid(candidate + UINT32_C(0xFFFE)))
4104 goto next;
4105 if (getgrgid(candidate))
4106 goto next;
4107 if (getgrgid(candidate + UINT32_C(0xFFFE)))
4108 goto next;
4109
4110 *ret_lock_file = lf;
4111 lf = (struct LockFile) LOCK_FILE_INIT;
4112 *shift = candidate;
4113 return 0;
4114
4115 next:
d381c8a6
LP
4116 if (arg_machine && !tried_hashed) {
4117 /* Try to hash the base from the container name */
4118
4119 static const uint8_t hash_key[] = {
4120 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4121 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4122 };
4123
4124 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4125
4126 tried_hashed = true;
4127 } else
4128 random_bytes(&candidate, sizeof(candidate));
4129
87d5e4f2 4130 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
4131 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4132 }
4133}
4134
2f893044
LP
4135static int add_one_uid_map(
4136 char **p,
4137 uid_t container_uid,
4138 uid_t host_uid,
4139 uid_t range) {
4140
4141 return strextendf(p,
4142 UID_FMT " " UID_FMT " " UID_FMT "\n",
4143 container_uid, host_uid, range);
4144}
4145
4146static int make_uid_map_string(
4147 const uid_t bind_user_uid[],
4148 size_t n_bind_user_uid,
4149 size_t offset,
4150 char **ret) {
4151
4152 _cleanup_free_ char *s = NULL;
4153 uid_t previous_uid = 0;
4154 int r;
4155
4156 assert(n_bind_user_uid == 0 || bind_user_uid);
2f092762 4157 assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
2f893044
LP
4158 assert(ret);
4159
4160 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4161 * quadruplet, consisting of host and container UID + GID. */
4162
4163 for (size_t i = 0; i < n_bind_user_uid; i++) {
05ab439a
YW
4164 uid_t payload_uid = bind_user_uid[i*4+offset],
4165 host_uid = bind_user_uid[i*4+offset+1];
2f893044
LP
4166
4167 assert(previous_uid <= payload_uid);
4168 assert(payload_uid < arg_uid_range);
4169
4170 /* Add a range to close the gap to previous entry */
4171 if (payload_uid > previous_uid) {
4172 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4173 if (r < 0)
4174 return r;
4175 }
4176
4177 /* Map this specific user */
4178 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4179 if (r < 0)
4180 return r;
4181
4182 previous_uid = payload_uid + 1;
4183 }
4184
4185 /* And add a range to close the gap to finish the range */
4186 if (arg_uid_range > previous_uid) {
4187 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4188 if (r < 0)
4189 return r;
4190 }
4191
4192 assert(s);
4193
4194 *ret = TAKE_PTR(s);
4195 return 0;
4196}
4197
4198static int setup_uid_map(
4199 pid_t pid,
4200 const uid_t bind_user_uid[],
4201 size_t n_bind_user_uid) {
4202
4203 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4204 _cleanup_free_ char *s = NULL;
03cfe0d5
LP
4205 int r;
4206
4207 assert(pid > 1);
4208
2f893044
LP
4209 /* Build the UID map string */
4210 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4211 return log_oom();
4212
03cfe0d5 4213 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2f893044 4214 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4215 if (r < 0)
4216 return log_error_errno(r, "Failed to write UID map: %m");
4217
2f893044
LP
4218 /* And now build the GID map string */
4219 s = mfree(s);
4220 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4221 return log_oom();
4222
03cfe0d5 4223 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2f893044 4224 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4225 if (r < 0)
4226 return log_error_errno(r, "Failed to write GID map: %m");
4227
4228 return 0;
4229}
4230
9c1e04d0 4231static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
4232 char buf[NOTIFY_BUFFER_MAX+1];
4233 char *p = NULL;
4234 struct iovec iovec = {
4235 .iov_base = buf,
4236 .iov_len = sizeof(buf)-1,
4237 };
fb29cdbe
LP
4238 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4239 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
9c1e04d0
AP
4240 struct msghdr msghdr = {
4241 .msg_iov = &iovec,
4242 .msg_iovlen = 1,
4243 .msg_control = &control,
4244 .msg_controllen = sizeof(control),
4245 };
371d72e0 4246 struct ucred *ucred;
9c1e04d0
AP
4247 ssize_t n;
4248 pid_t inner_child_pid;
4249 _cleanup_strv_free_ char **tags = NULL;
4bf4f50f 4250 int r;
9c1e04d0
AP
4251
4252 assert(userdata);
4253
4254 inner_child_pid = PTR_TO_PID(userdata);
4255
4256 if (revents != EPOLLIN) {
4257 log_warning("Got unexpected poll event for notify fd.");
4258 return 0;
4259 }
4260
3691bcf3 4261 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
bb44fd07
ZJS
4262 if (ERRNO_IS_NEG_TRANSIENT(n))
4263 return 0;
4264 else if (n == -EXFULL) {
4265 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4266 return 0;
4267 } else if (n < 0)
3691bcf3 4268 return log_warning_errno(n, "Couldn't read notification socket: %m");
9c1e04d0 4269
9c1e04d0
AP
4270 cmsg_close_all(&msghdr);
4271
371d72e0 4272 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
9c1e04d0 4273 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 4274 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
4275 return 0;
4276 }
4277
4278 if ((size_t) n >= sizeof(buf)) {
4279 log_warning("Received notify message exceeded maximum size. Ignoring.");
4280 return 0;
4281 }
4282
4283 buf[n] = 0;
4284 tags = strv_split(buf, "\n\r");
4285 if (!tags)
4286 return log_oom();
4287
d29cc4d6 4288 if (strv_contains(tags, "READY=1")) {
d4341b76 4289 r = sd_notify(false, "READY=1\n");
4bf4f50f
ZJS
4290 if (r < 0)
4291 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
4292 }
9c1e04d0
AP
4293
4294 p = strv_find_startswith(tags, "STATUS=");
4295 if (p)
04f590a4 4296 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
4297
4298 return 0;
4299}
4300
e96ceaba 4301static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 4302 int r;
9c1e04d0 4303
5773024d 4304 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
4305 if (r < 0)
4306 return log_error_errno(r, "Failed to allocate notify event source: %m");
4307
5773024d 4308 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
4309
4310 return 0;
4311}
4312
5d961407
LP
4313static int merge_settings(Settings *settings, const char *path) {
4314 int rl;
f757855e 4315
5d961407
LP
4316 assert(settings);
4317 assert(path);
f757855e 4318
5d961407
LP
4319 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4320 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 4321
7732f92b
LP
4322 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4323 settings->start_mode >= 0) {
4324 arg_start_mode = settings->start_mode;
130d3d22 4325 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
4326 }
4327
d3689b94
LP
4328 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
4329 settings->ephemeral >= 0)
a2f577fc
JL
4330 arg_ephemeral = settings->ephemeral;
4331
de40a303
LP
4332 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4333 settings->root) {
4334
4335 if (!arg_settings_trusted)
4336 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4337 else
4338 free_and_replace(arg_directory, settings->root);
4339 }
4340
b53ede69
PW
4341 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4342 settings->pivot_root_new) {
4343 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4344 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4345 }
4346
5f932eb9 4347 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
4348 settings->working_directory)
4349 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 4350
f757855e 4351 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
4352 settings->environment)
4353 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 4354
de40a303
LP
4355 if ((arg_settings_mask & SETTING_USER) == 0) {
4356
4357 if (settings->user)
4358 free_and_replace(arg_user, settings->user);
4359
4360 if (uid_is_valid(settings->uid))
4361 arg_uid = settings->uid;
4362 if (gid_is_valid(settings->gid))
4363 arg_gid = settings->gid;
4364 if (settings->n_supplementary_gids > 0) {
4365 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4366 arg_n_supplementary_gids = settings->n_supplementary_gids;
4367 }
4368 }
f757855e
LP
4369
4370 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 4371 uint64_t plus, minus;
7be830c6 4372 uint64_t network_minus = 0;
88fc9c9b 4373 uint64_t ambient;
f757855e 4374
de40a303
LP
4375 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4376 * Settings structure */
4377
0e265674 4378 plus = settings->capability;
a3fc6b55
LP
4379 minus = settings->drop_capability;
4380
9baa294c
LP
4381 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4382 settings_network_configured(settings)) {
a3fc6b55
LP
4383 if (settings_private_network(settings))
4384 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4385 else
7be830c6 4386 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 4387 }
0e265674
LP
4388
4389 if (!arg_settings_trusted && plus != 0) {
4390 if (settings->capability != 0)
5d961407 4391 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
4392 } else {
4393 arg_caps_retain &= ~network_minus;
520e0d54 4394 arg_caps_retain |= plus;
7be830c6 4395 }
f757855e 4396
a3fc6b55 4397 arg_caps_retain &= ~minus;
de40a303
LP
4398
4399 /* Copy the full capabilities over too */
4400 if (capability_quintet_is_set(&settings->full_capabilities)) {
4401 if (!arg_settings_trusted)
5238e957 4402 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
4403 else
4404 arg_full_capabilities = settings->full_capabilities;
4405 }
88fc9c9b
TH
4406
4407 ambient = settings->ambient_capability;
4408 if (!arg_settings_trusted && ambient != 0)
4409 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4410 else
4411 arg_caps_ambient |= ambient;
f757855e
LP
4412 }
4413
4414 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4415 settings->kill_signal > 0)
4416 arg_kill_signal = settings->kill_signal;
4417
4418 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4419 settings->personality != PERSONALITY_INVALID)
4420 arg_personality = settings->personality;
4421
4422 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4423 !sd_id128_is_null(settings->machine_id)) {
4424
4425 if (!arg_settings_trusted)
5d961407 4426 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
4427 else
4428 arg_uuid = settings->machine_id;
4429 }
4430
4431 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4432 settings->read_only >= 0)
4433 arg_read_only = settings->read_only;
4434
4435 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4436 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4437 arg_volatile_mode = settings->volatile_mode;
4438
4439 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4440 settings->n_custom_mounts > 0) {
4441
4442 if (!arg_settings_trusted)
5d961407 4443 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
4444 else {
4445 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 4446 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 4447 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
4448 settings->n_custom_mounts = 0;
4449 }
4450 }
4451
4452 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
a1dfd585 4453 settings_network_configured(settings)) {
f757855e
LP
4454
4455 if (!arg_settings_trusted)
5d961407 4456 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 4457 else {
f6d6bad1 4458 arg_network_veth = settings_network_veth(settings);
0e265674
LP
4459 arg_private_network = settings_private_network(settings);
4460
130d3d22
YW
4461 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4462 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4463 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4464 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 4465
1cc6c93a
YW
4466 free_and_replace(arg_network_bridge, settings->network_bridge);
4467 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
4468
4469 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
4470 }
4471 }
4472
4473 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4474 settings->expose_ports) {
4475
4476 if (!arg_settings_trusted)
5d961407 4477 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
4478 else {
4479 expose_port_free_all(arg_expose_ports);
1cc6c93a 4480 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
4481 }
4482 }
4483
0de7acce
LP
4484 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4485 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4486
4487 if (!arg_settings_trusted)
5d961407 4488 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
4489 else {
4490 arg_userns_mode = settings->userns_mode;
4491 arg_uid_shift = settings->uid_shift;
4492 arg_uid_range = settings->uid_range;
6c045a99 4493 arg_userns_ownership = settings->userns_ownership;
0de7acce
LP
4494 }
4495 }
4496
0cc3c9f9
LP
4497 if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
4498 !strv_isempty(settings->bind_user))
2f893044
LP
4499 strv_free_and_replace(arg_bind_user, settings->bind_user);
4500
d3689b94
LP
4501 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
4502 settings->notify_ready >= 0)
9c1e04d0
AP
4503 arg_notify_ready = settings->notify_ready;
4504
960e4569
LP
4505 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4506
2d09ea44
LP
4507 if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
4508 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4509 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4510 else {
4511 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4512 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4513 }
960e4569 4514 }
de40a303
LP
4515
4516#if HAVE_SECCOMP
2d09ea44
LP
4517 if (settings->seccomp) {
4518 if (!arg_settings_trusted)
4519 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4520 else {
4521 seccomp_release(arg_seccomp);
4522 arg_seccomp = TAKE_PTR(settings->seccomp);
4523 }
de40a303
LP
4524 }
4525#endif
960e4569
LP
4526 }
4527
bf428efb
LP
4528 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4529 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4530 continue;
4531
4532 if (!settings->rlimit[rl])
4533 continue;
4534
4535 if (!arg_settings_trusted) {
5d961407 4536 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
4537 continue;
4538 }
4539
4540 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4541 }
4542
3a9530e5
LP
4543 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4544 settings->hostname)
4545 free_and_replace(arg_hostname, settings->hostname);
4546
66edd963
LP
4547 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4548 settings->no_new_privileges >= 0)
4549 arg_no_new_privileges = settings->no_new_privileges;
4550
81f345df
LP
4551 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4552 settings->oom_score_adjust_set) {
4553
4554 if (!arg_settings_trusted)
5d961407 4555 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
4556 else {
4557 arg_oom_score_adjust = settings->oom_score_adjust;
4558 arg_oom_score_adjust_set = true;
4559 }
4560 }
4561
d107bb7d 4562 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 4563 settings->cpu_set.set) {
d107bb7d
LP
4564
4565 if (!arg_settings_trusted)
5d961407 4566 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 4567 else {
0985c7c4 4568 cpu_set_reset(&arg_cpu_set);
088d71f8 4569 arg_cpu_set = TAKE_STRUCT(settings->cpu_set);
d107bb7d
LP
4570 }
4571 }
4572
09d423e9
LP
4573 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4574 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4575 arg_resolv_conf = settings->resolv_conf;
4576
4e1d6aa9
LP
4577 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4578 settings->link_journal != _LINK_JOURNAL_INVALID) {
4579
4580 if (!arg_settings_trusted)
4581 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4582 else {
4583 arg_link_journal = settings->link_journal;
4584 arg_link_journal_try = settings->link_journal_try;
4585 }
4586 }
4587
1688841f
LP
4588 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4589 settings->timezone != _TIMEZONE_MODE_INVALID)
4590 arg_timezone = settings->timezone;
4591
de40a303
LP
4592 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4593 settings->slice) {
4594
4595 if (!arg_settings_trusted)
4596 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4597 else
4598 free_and_replace(arg_slice, settings->slice);
4599 }
4600
4601 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4602 settings->use_cgns >= 0) {
4603
4604 if (!arg_settings_trusted)
4605 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4606 else
4607 arg_use_cgns = settings->use_cgns;
4608 }
4609
4610 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
f5fbe71d 4611 settings->clone_ns_flags != ULONG_MAX) {
de40a303
LP
4612
4613 if (!arg_settings_trusted)
4614 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4615 else
4616 arg_clone_ns_flags = settings->clone_ns_flags;
4617 }
4618
4619 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4620 settings->console_mode >= 0) {
4621
4622 if (!arg_settings_trusted)
4623 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4624 else
4625 arg_console_mode = settings->console_mode;
4626 }
4627
d3689b94
LP
4628 if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
4629 settings->suppress_sync >= 0)
4a4654e0
LP
4630 arg_suppress_sync = settings->suppress_sync;
4631
de40a303
LP
4632 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4633 * don't consult arg_settings_mask for them. */
4634
4635 sd_bus_message_unref(arg_property_message);
4636 arg_property_message = TAKE_PTR(settings->properties);
4637
4638 arg_console_width = settings->console_width;
4639 arg_console_height = settings->console_height;
4640
b2645747 4641 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4642 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4643 arg_n_extra_nodes = settings->n_extra_nodes;
825210d4 4644 settings->n_extra_nodes = 0;
de40a303 4645
f757855e
LP
4646 return 0;
4647}
4648
5d961407
LP
4649static int load_settings(void) {
4650 _cleanup_(settings_freep) Settings *settings = NULL;
4651 _cleanup_fclose_ FILE *f = NULL;
3603f151 4652 _cleanup_free_ char *p = NULL;
5d961407
LP
4653 int r;
4654
de40a303
LP
4655 if (arg_oci_bundle)
4656 return 0;
4657
5d961407
LP
4658 /* If all settings are masked, there's no point in looking for
4659 * the settings file */
d7a0f1f4 4660 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
5d961407
LP
4661 return 0;
4662
5d961407
LP
4663 /* We first look in the admin's directories in /etc and /run */
4664 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4665 _cleanup_free_ char *j = NULL;
4666
3603f151 4667 j = path_join(i, arg_settings_filename);
5d961407
LP
4668 if (!j)
4669 return log_oom();
4670
4671 f = fopen(j, "re");
4672 if (f) {
4673 p = TAKE_PTR(j);
4674
4675 /* By default, we trust configuration from /etc and /run */
4676 if (arg_settings_trusted < 0)
4677 arg_settings_trusted = true;
4678
4679 break;
4680 }
4681
4682 if (errno != ENOENT)
4683 return log_error_errno(errno, "Failed to open %s: %m", j);
4684 }
4685
4686 if (!f) {
4687 /* After that, let's look for a file next to the
4688 * actual image we shall boot. */
4689
4690 if (arg_image) {
162f6477
LP
4691 r = file_in_same_dir(arg_image, arg_settings_filename, &p);
4692 if (r < 0)
4693 return log_error_errno(r, "Failed to generate settings path from image path: %m");
4694 } else if (arg_directory) {
4695 r = file_in_same_dir(arg_directory, arg_settings_filename, &p);
4696 if (r < 0 && r != -EADDRNOTAVAIL) /* if directory is root fs, don't complain */
4697 return log_error_errno(r, "Failed to generate settings path from directory path: %m");
5d961407
LP
4698 }
4699
4700 if (p) {
4701 f = fopen(p, "re");
4702 if (!f && errno != ENOENT)
4703 return log_error_errno(errno, "Failed to open %s: %m", p);
4704
4705 /* By default, we do not trust configuration from /var/lib/machines */
4706 if (arg_settings_trusted < 0)
4707 arg_settings_trusted = false;
4708 }
4709 }
4710
4711 if (!f)
4712 return 0;
4713
4714 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4715
4716 r = settings_load(f, p, &settings);
4717 if (r < 0)
4718 return r;
4719
4720 return merge_settings(settings, p);
4721}
4722
de40a303
LP
4723static int load_oci_bundle(void) {
4724 _cleanup_(settings_freep) Settings *settings = NULL;
4725 int r;
4726
4727 if (!arg_oci_bundle)
4728 return 0;
4729
4730 /* By default let's trust OCI bundles */
4731 if (arg_settings_trusted < 0)
4732 arg_settings_trusted = true;
4733
4734 r = oci_load(NULL, arg_oci_bundle, &settings);
4735 if (r < 0)
4736 return r;
4737
4738 return merge_settings(settings, arg_oci_bundle);
4739}
4740
3acc84eb 4741static int run_container(
2d845785 4742 DissectedImage *dissected_image,
b0067625
ZJS
4743 FDSet *fds,
4744 char veth_name[IFNAMSIZ], bool *veth_created,
761cf19d 4745 struct ExposeArgs *expose_args,
3acc84eb 4746 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4747
4748 static const struct sigaction sa = {
4749 .sa_handler = nop_signal_handler,
e28c7cd0 4750 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4751 };
4752
8e766630 4753 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
5bb1d7fb 4754 _cleanup_close_ int etc_passwd_lock = -EBADF;
b0067625 4755 _cleanup_close_pair_ int
19ee48a6
YW
4756 fd_inner_socket_pair[2] = PIPE_EBADF,
4757 fd_outer_socket_pair[2] = PIPE_EBADF;
8199d554 4758
5bb1d7fb 4759 _cleanup_close_ int notify_socket = -EBADF, mntns_fd = -EBADF, fd_kmsg_fifo = -EBADF;
b0067625 4760 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4761 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4762 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4763 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4764 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4765 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2f893044
LP
4766 _cleanup_free_ uid_t *bind_user_uid = NULL;
4767 size_t n_bind_user_uid = 0;
b0067625 4768 ContainerStatus container_status = 0;
b0067625
ZJS
4769 int ifi = 0, r;
4770 ssize_t l;
4771 sigset_t mask_chld;
254d1313 4772 _cleanup_close_ int child_netns_fd = -EBADF;
b0067625
ZJS
4773
4774 assert_se(sigemptyset(&mask_chld) == 0);
4775 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4776
4777 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4778 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4779 * check with getpwuid() if the specific user already exists. Note that /etc might be
4780 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4781 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4782 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4783 * really ours. */
4784
4785 etc_passwd_lock = take_etc_passwd_lock(NULL);
4786 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4787 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4788 }
4789
4790 r = barrier_create(&barrier);
4791 if (r < 0)
4792 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4793
5d9d3fcb
CB
4794 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_inner_socket_pair) < 0)
4795 return log_error_errno(errno, "Failed to create inner socket pair: %m");
4796
af06cd30
CB
4797 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_outer_socket_pair) < 0)
4798 return log_error_errno(errno, "Failed to create outer socket pair: %m");
b0067625 4799
b0067625
ZJS
4800 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4801 * parent's blocking calls and give it a chance to call wait() and terminate. */
4802 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4803 if (r < 0)
4804 return log_error_errno(errno, "Failed to change the signal mask: %m");
4805
4806 r = sigaction(SIGCHLD, &sa, NULL);
4807 if (r < 0)
4808 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4809
d7bea6b6 4810 if (arg_network_namespace_path) {
5b4855ab
DDM
4811 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4812 if (child_netns_fd < 0)
d7bea6b6
DP
4813 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4814
54c2459d 4815 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
6619ad88
LP
4816 if (r == -EUCLEAN)
4817 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4818 else if (r < 0)
d7bea6b6 4819 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4820 else if (r == 0)
4821 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4822 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4823 }
4824
b0067625
ZJS
4825 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4826 if (*pid < 0)
4827 return log_error_errno(errno, "clone() failed%s: %m",
4828 errno == EINVAL ?
4829 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4830
4831 if (*pid == 0) {
4832 /* The outer child only has a file system namespace. */
4833 barrier_set_role(&barrier, BARRIER_CHILD);
4834
5d9d3fcb 4835 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
af06cd30 4836 fd_outer_socket_pair[0] = safe_close(fd_outer_socket_pair[0]);
b0067625
ZJS
4837
4838 (void) reset_all_signal_handlers();
4839 (void) reset_signal_mask();
4840
4841 r = outer_child(&barrier,
4842 arg_directory,
2d845785 4843 dissected_image,
af06cd30 4844 fd_outer_socket_pair[1],
5d9d3fcb 4845 fd_inner_socket_pair[1],
d7bea6b6 4846 fds,
5b4855ab 4847 child_netns_fd);
b0067625
ZJS
4848 if (r < 0)
4849 _exit(EXIT_FAILURE);
4850
4851 _exit(EXIT_SUCCESS);
4852 }
4853
4854 barrier_set_role(&barrier, BARRIER_PARENT);
4855
e4077ff6 4856 fdset_close(fds);
b0067625 4857
5d9d3fcb 4858 fd_inner_socket_pair[1] = safe_close(fd_inner_socket_pair[1]);
af06cd30 4859 fd_outer_socket_pair[1] = safe_close(fd_outer_socket_pair[1]);
b0067625
ZJS
4860
4861 if (arg_userns_mode != USER_NAMESPACE_NO) {
af06cd30 4862 mntns_fd = receive_one_fd(fd_outer_socket_pair[0], 0);
b71a0192
CB
4863 if (mntns_fd < 0)
4864 return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m");
4865
b0067625 4866 /* The child just let us know the UID shift it might have read from the image. */
af06cd30 4867 l = recv(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
b0067625
ZJS
4868 if (l < 0)
4869 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4870 if (l != sizeof arg_uid_shift)
4871 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4872
4873 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4874 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4875 * image, but if that's already in use, pick a new one, and report back to the child,
4876 * which one we now picked. */
4877
4878 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4879 if (r < 0)
4880 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4881
af06cd30 4882 l = send(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
b0067625
ZJS
4883 if (l < 0)
4884 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4885 if (l != sizeof arg_uid_shift)
4886 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625 4887 }
2f893044
LP
4888
4889 n_bind_user_uid = strv_length(arg_bind_user);
4890 if (n_bind_user_uid > 0) {
4891 /* Right after the UID shift, we'll receive the list of UID mappings for the
4892 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4893
4894 bind_user_uid = new(uid_t, n_bind_user_uid*4);
4895 if (!bind_user_uid)
4896 return log_oom();
4897
4898 for (size_t i = 0; i < n_bind_user_uid; i++) {
af06cd30 4899 l = recv(fd_outer_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
2f893044
LP
4900 if (l < 0)
4901 return log_error_errno(errno, "Failed to read user UID map pair: %m");
4902 if (l != sizeof(uid_t)*4)
4903 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4904 SYNTHETIC_ERRNO(EIO),
4905 "Short read while reading bind user UID pairs.");
4906 }
4907 }
b0067625
ZJS
4908 }
4909
8199d554
LP
4910 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4911 /* The child let us know the support cgroup mode it might have read from the image. */
fefb7a6d 4912 l = recv(fd_outer_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
8199d554
LP
4913 if (l < 0)
4914 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113 4915 if (l != sizeof(arg_unified_cgroup_hierarchy))
c0f86d66 4916 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zi bytes).%s",
c6147113 4917 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4918 }
4919
b0067625 4920 /* Wait for the outer child. */
d2e0ac3d
LP
4921 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4922 if (r < 0)
4923 return r;
4924 if (r != EXIT_SUCCESS)
4925 return -EIO;
b0067625
ZJS
4926
4927 /* And now retrieve the PID of the inner child. */
af06cd30 4928 l = recv(fd_outer_socket_pair[0], pid, sizeof *pid, 0);
b0067625
ZJS
4929 if (l < 0)
4930 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4931 if (l != sizeof *pid)
4932 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4933
4934 /* We also retrieve container UUID in case it was generated by outer child */
af06cd30 4935 l = recv(fd_outer_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
b0067625
ZJS
4936 if (l < 0)
4937 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4938 if (l != sizeof(arg_uuid))
4939 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4940
4941 /* We also retrieve the socket used for notifications generated by outer child */
af06cd30 4942 notify_socket = receive_one_fd(fd_outer_socket_pair[0], 0);
b0067625
ZJS
4943 if (notify_socket < 0)
4944 return log_error_errno(notify_socket,
4945 "Failed to receive notification socket from the outer child: %m");
4946
4947 log_debug("Init process invoked as PID "PID_FMT, *pid);
4948
4949 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4950 if (!barrier_place_and_sync(&barrier)) /* #1 */
4951 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 4952
2f893044 4953 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
b0067625
ZJS
4954 if (r < 0)
4955 return r;
4956
4957 (void) barrier_place(&barrier); /* #2 */
4958 }
4959
4960 if (arg_private_network) {
75116558
PS
4961 if (!arg_network_namespace_path) {
4962 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4963 if (!barrier_place_and_sync(&barrier)) /* #3 */
4964 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4965 }
4966
5b4855ab
DDM
4967 if (child_netns_fd < 0) {
4968 /* Make sure we have an open file descriptor to the child's network
4969 * namespace so it stays alive even if the child exits. */
4970 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4971 if (r < 0)
4972 return log_error_errno(r, "Failed to open child network namespace: %m");
4973 }
4974
4975 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
b0067625
ZJS
4976 if (r < 0)
4977 return r;
4978
4979 if (arg_network_veth) {
4980 r = setup_veth(arg_machine, *pid, veth_name,
4981 arg_network_bridge || arg_network_zone);
4982 if (r < 0)
4983 return r;
4984 else if (r > 0)
4985 ifi = r;
4986
4987 if (arg_network_bridge) {
4988 /* Add the interface to a bridge */
4989 r = setup_bridge(veth_name, arg_network_bridge, false);
4990 if (r < 0)
4991 return r;
4992 if (r > 0)
4993 ifi = r;
4994 } else if (arg_network_zone) {
4995 /* Add the interface to a bridge, possibly creating it */
4996 r = setup_bridge(veth_name, arg_network_zone, true);
4997 if (r < 0)
4998 return r;
4999 if (r > 0)
5000 ifi = r;
5001 }
5002 }
5003
5004 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
5005 if (r < 0)
5006 return r;
5007
5008 /* We created the primary and extra veth links now; let's remember this, so that we know to
5009 remove them later on. Note that we don't bother with removing veth links that were created
5010 here when their setup failed half-way, because in that case the kernel should be able to
5011 remove them on its own, since they cannot be referenced by anything yet. */
5012 *veth_created = true;
5013
5014 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
5015 if (r < 0)
5016 return r;
5017
5018 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
5019 if (r < 0)
5020 return r;
5021 }
5022
abdb9b08
LP
5023 if (arg_register || !arg_keep_unit) {
5024 r = sd_bus_default_system(&bus);
5025 if (r < 0)
5026 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
5027
5028 r = sd_bus_set_close_on_exit(bus, false);
5029 if (r < 0)
5030 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
5031 }
5032
5033 if (!arg_keep_unit) {
5034 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5035 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5036 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5037
75152a4d
LP
5038 r = sd_bus_match_signal_async(
5039 bus,
5040 NULL,
5041 "org.freedesktop.systemd1",
5042 NULL,
5043 "org.freedesktop.systemd1.Scope",
5044 "RequestStop",
5045 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 5046 if (r < 0)
75152a4d 5047 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
5048 }
5049
b0067625
ZJS
5050 if (arg_register) {
5051 r = register_machine(
abdb9b08 5052 bus,
b0067625
ZJS
5053 arg_machine,
5054 *pid,
5055 arg_directory,
5056 arg_uuid,
5057 ifi,
5058 arg_slice,
5059 arg_custom_mounts, arg_n_custom_mounts,
5060 arg_kill_signal,
5061 arg_property,
de40a303 5062 arg_property_message,
b0067625
ZJS
5063 arg_keep_unit,
5064 arg_container_service_name);
5065 if (r < 0)
5066 return r;
abdb9b08 5067
cd2dfc6f
LP
5068 } else if (!arg_keep_unit) {
5069 r = allocate_scope(
abdb9b08 5070 bus,
cd2dfc6f
LP
5071 arg_machine,
5072 *pid,
5073 arg_slice,
5074 arg_custom_mounts, arg_n_custom_mounts,
5075 arg_kill_signal,
de40a303
LP
5076 arg_property,
5077 arg_property_message);
cd2dfc6f
LP
5078 if (r < 0)
5079 return r;
5080
5081 } else if (arg_slice || arg_property)
5082 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 5083
27da7ef0 5084 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
5085 if (r < 0)
5086 return r;
5087
27da7ef0 5088 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
5089 if (r < 0)
5090 return r;
b0067625 5091
de54e02d 5092 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
5093 if (r < 0)
5094 return r;
5095
5096 /* Notify the child that the parent is ready with all
5097 * its setup (including cgroup-ification), and that
5098 * the child can now hand over control to the code to
5099 * run inside the container. */
75116558 5100 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
5101
5102 /* Block SIGCHLD here, before notifying child.
5103 * process_pty() will handle it with the other signals. */
5104 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5105
5106 /* Reset signal to default */
9c274488 5107 r = default_signals(SIGCHLD);
b0067625
ZJS
5108 if (r < 0)
5109 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5110
5111 r = sd_event_new(&event);
5112 if (r < 0)
5113 return log_error_errno(r, "Failed to get default event source: %m");
5114
8fd010bb
LP
5115 (void) sd_event_set_watchdog(event, true);
5116
abdb9b08
LP
5117 if (bus) {
5118 r = sd_bus_attach_event(bus, event, 0);
5119 if (r < 0)
5120 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5121 }
5122
e96ceaba 5123 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
5124 if (r < 0)
5125 return r;
5126
b71a0192
CB
5127 if (arg_userns_mode != USER_NAMESPACE_NO) {
5128 r = wipe_fully_visible_fs(mntns_fd);
5129 if (r < 0)
5130 return r;
5131 mntns_fd = safe_close(mntns_fd);
5132 }
5133
b0067625 5134 /* Let the child know that we are ready and wait that the child is completely ready now. */
c6147113
LP
5135 if (!barrier_place_and_sync(&barrier)) /* #5 */
5136 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 5137
38ccb557 5138 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
b0067625
ZJS
5139 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5140 etc_passwd_lock = safe_close(etc_passwd_lock);
5141
04f590a4
LP
5142 (void) sd_notifyf(false,
5143 "STATUS=Container running.\n"
5144 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4bf4f50f
ZJS
5145 if (!arg_notify_ready) {
5146 r = sd_notify(false, "READY=1\n");
5147 if (r < 0)
5148 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5149 }
b0067625
ZJS
5150
5151 if (arg_kill_signal > 0) {
5152 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
5153 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5154 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
5155 } else {
5156 /* Immediately exit */
919f5ae0
LP
5157 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5158 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
5159 }
5160
988851b6
LP
5161 (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
5162
5163 r = sd_event_add_memory_pressure(event, NULL, NULL, NULL);
5164 if (r < 0)
5165 log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m");
5166
6916b164 5167 /* Exit when the child exits */
919f5ae0 5168 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625 5169
b07ee903
CB
5170 /* Retrieve the kmsg fifo allocated by inner child */
5171 fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0);
5172 if (fd_kmsg_fifo < 0)
5173 return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m");
5174
b0067625 5175 if (arg_expose_ports) {
b07ee903 5176 r = expose_port_watch_rtnl(event, fd_inner_socket_pair[0], on_address_change, expose_args, &rtnl);
b0067625
ZJS
5177 if (r < 0)
5178 return r;
5179
deff68e7
FW
5180 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5181 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5182 }
5183
3acc84eb 5184 if (arg_console_mode != CONSOLE_PIPE) {
254d1313 5185 _cleanup_close_ int fd = -EBADF;
3acc84eb 5186 PTYForwardFlags flags = 0;
de40a303 5187
3acc84eb 5188 /* Retrieve the master pty allocated by inner child */
bb1aa185 5189 fd = receive_one_fd(fd_inner_socket_pair[0], 0);
3acc84eb
FB
5190 if (fd < 0)
5191 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5192
5193 switch (arg_console_mode) {
de40a303 5194
3acc84eb
FB
5195 case CONSOLE_READ_ONLY:
5196 flags |= PTY_FORWARD_READ_ONLY;
5197
5198 _fallthrough_;
5199
5200 case CONSOLE_INTERACTIVE:
5201 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5202
5203 r = pty_forward_new(event, fd, flags, &forward);
5204 if (r < 0)
5205 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5206
f5fbe71d 5207 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
3acc84eb
FB
5208 (void) pty_forward_set_width_height(forward,
5209 arg_console_width,
5210 arg_console_height);
5211 break;
5212
5213 default:
5214 assert(arg_console_mode == CONSOLE_PASSIVE);
5215 }
5216
5217 *master = TAKE_FD(fd);
de40a303 5218 }
b0067625 5219
5d9d3fcb
CB
5220 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
5221
b0067625
ZJS
5222 r = sd_event_loop(event);
5223 if (r < 0)
5224 return log_error_errno(r, "Failed to run event loop: %m");
5225
de40a303
LP
5226 if (forward) {
5227 char last_char = 0;
b0067625 5228
de40a303
LP
5229 (void) pty_forward_get_last_char(forward, &last_char);
5230 forward = pty_forward_free(forward);
b0067625 5231
de40a303
LP
5232 if (!arg_quiet && last_char != '\n')
5233 putc('\n', stdout);
5234 }
b0067625
ZJS
5235
5236 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
5237 if (!arg_register && !arg_keep_unit && bus)
5238 terminate_scope(bus, arg_machine);
b0067625
ZJS
5239
5240 /* Normally redundant, but better safe than sorry */
c67b0082 5241 (void) kill(*pid, SIGKILL);
b0067625 5242
5d9d3fcb
CB
5243 fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
5244
5b4855ab
DDM
5245 if (arg_private_network) {
5246 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5247 * to avoid having to move the parent to the child network namespace. */
5248 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
5249 if (r < 0)
5250 return r;
5251
5252 if (r == 0) {
254d1313 5253 _cleanup_close_ int parent_netns_fd = -EBADF;
5b4855ab 5254
19b761a0 5255 r = namespace_open(getpid_cached(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5b4855ab
DDM
5256 if (r < 0) {
5257 log_error_errno(r, "Failed to open parent network namespace: %m");
5258 _exit(EXIT_FAILURE);
5259 }
5260
5261 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5262 if (r < 0) {
5263 log_error_errno(r, "Failed to enter child network namespace: %m");
5264 _exit(EXIT_FAILURE);
5265 }
5266
2f091b1b
TM
5267 /* Reverse network interfaces pair list so that interfaces get their initial name back.
5268 * This is about ensuring interfaces get their old name back when being moved back. */
5269 arg_network_interfaces = strv_reverse(arg_network_interfaces);
5270
5b4855ab
DDM
5271 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5272 if (r < 0)
5273 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5274
5275 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5276 }
5277 }
5278
8f03de53 5279 r = wait_for_container(TAKE_PID(*pid), &container_status);
b0067625 5280
0bb0a9fa
ZJS
5281 /* Tell machined that we are gone. */
5282 if (bus)
5283 (void) unregister_machine(bus, arg_machine);
5284
b0067625
ZJS
5285 if (r < 0)
5286 /* We failed to wait for the container, or the container exited abnormally. */
5287 return r;
5288 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
5289 /* r > 0 → The container exited with a non-zero status.
5290 * As a special case, we need to replace 133 with a different value,
5291 * because 133 is special-cased in the service file to reboot the container.
5292 * otherwise → The container exited with zero status and a reboot was not requested.
5293 */
2a49b612 5294 if (r == EXIT_FORCE_RESTART)
27e29a1e 5295 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 5296 *ret = r;
b0067625
ZJS
5297 return 0; /* finito */
5298 }
5299
5300 /* CONTAINER_REBOOTED, loop again */
5301
5302 if (arg_keep_unit) {
5303 /* Special handling if we are running as a service: instead of simply
5304 * restarting the machine we want to restart the entire service, so let's
5305 * inform systemd about this with the special exit code 133. The service
5306 * file uses RestartForceExitStatus=133 so that this results in a full
5307 * nspawn restart. This is necessary since we might have cgroup parameters
5308 * set we want to have flushed out. */
2a49b612
ZJS
5309 *ret = EXIT_FORCE_RESTART;
5310 return 0; /* finito */
b0067625
ZJS
5311 }
5312
deff68e7
FW
5313 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5314 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5315
5316 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5317 *veth_created = false;
5318 return 1; /* loop again */
5319}
5320
bf428efb 5321static int initialize_rlimits(void) {
852b6250 5322 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
bf428efb
LP
5323 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5324 * container execution environments. */
5325
5326 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
852b6250
LP
5327 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5328 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5329 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5330 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5331 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5332 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5333 [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5334 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5335 [RLIMIT_NICE] = { 0, 0 },
5336 [RLIMIT_NOFILE] = { 1024, 4096 },
5337 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5338 [RLIMIT_RTPRIO] = { 0, 0 },
5339 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5340 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
bf428efb
LP
5341
5342 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5343 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5344 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5345 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5346 * that PID 1 changes a number of other resource limits during early initialization which is why we
5347 * don't read the other limits from PID 1 but prefer the static table above. */
5348 };
5349
5350 int rl;
5351
5352 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
5353 /* Let's only fill in what the user hasn't explicitly configured anyway */
5354 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5355 const struct rlimit *v;
5356 struct rlimit buffer;
5357
5358 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5359 /* For these two let's read the limits off PID 1. See above for an explanation. */
5360
5361 if (prlimit(1, rl, NULL, &buffer) < 0)
5362 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5363
dbf1aca6
LP
5364 v = &buffer;
5365 } else if (rl == RLIMIT_NOFILE) {
5366 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5367 * userspace. Given that nspawn containers are often run without our PID 1,
5368 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5369 * so that container userspace gets similar resources as host userspace
5370 * gets. */
5371 buffer = kernel_defaults[rl];
5372 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
bf428efb
LP
5373 v = &buffer;
5374 } else
5375 v = kernel_defaults + rl;
5376
5377 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5378 if (!arg_rlimit[rl])
5379 return log_oom();
5380 }
5381
5382 if (DEBUG_LOGGING) {
5383 _cleanup_free_ char *k = NULL;
5384
5385 (void) rlimit_format(arg_rlimit[rl], &k);
5386 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5387 }
5388 }
5389
5390 return 0;
5391}
5392
287b7376 5393static int cant_be_in_netns(void) {
254d1313 5394 _cleanup_close_ int fd = -EBADF;
287b7376
LP
5395 struct ucred ucred;
5396 int r;
5397
5398 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5399 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5400 * nice message. */
5401
5402 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5403 return 0;
5404
5405 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5406 if (fd < 0)
5407 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5408
1861986a 5409 r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control");
bb44fd07
ZJS
5410 if (r == -ENOENT || ERRNO_IS_NEG_DISCONNECT(r))
5411 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5412 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5413 if (r < 0)
1861986a 5414 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
287b7376
LP
5415
5416 r = getpeercred(fd, &ucred);
5417 if (r < 0)
5418 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5419
f7a2dc3d 5420 r = in_same_namespace(ucred.pid, 0, NAMESPACE_NET);
287b7376 5421 if (r < 0)
f7a2dc3d
CB
5422 return log_error_errno(r, "Failed to determine network namespace of udev: %m");
5423 if (r == 0)
287b7376
LP
5424 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5425 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5426 return 0;
5427}
5428
44dbef90 5429static int run(int argc, char *argv[]) {
4c27749b 5430 bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
5bb1d7fb 5431 _cleanup_close_ int master = -EBADF;
03cfe0d5 5432 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 5433 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 5434 char veth_name[IFNAMSIZ] = "";
761cf19d 5435 struct ExposeArgs expose_args = {};
8e766630 5436 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082 5437 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 5438 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e 5439 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
761cf19d 5440 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
7bf011e3 5441 pid_t pid = 0;
03cfe0d5
LP
5442
5443 log_parse_environment();
5444 log_open();
415fc41c 5445
03cfe0d5
LP
5446 r = parse_argv(argc, argv);
5447 if (r <= 0)
5448 goto finish;
5449
38ee19c0
ZJS
5450 if (geteuid() != 0) {
5451 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5452 argc >= 2 ? "Need to be root." :
5453 "Need to be root (and some arguments are usually required).\nHint: try --help");
03cfe0d5 5454 goto finish;
38ee19c0 5455 }
fba868fa 5456
287b7376
LP
5457 r = cant_be_in_netns();
5458 if (r < 0)
5459 goto finish;
5460
bf428efb
LP
5461 r = initialize_rlimits();
5462 if (r < 0)
5463 goto finish;
5464
de40a303
LP
5465 r = load_oci_bundle();
5466 if (r < 0)
5467 goto finish;
5468
f757855e
LP
5469 r = determine_names();
5470 if (r < 0)
5471 goto finish;
5472
5473 r = load_settings();
5474 if (r < 0)
5475 goto finish;
5476
d4d99bc6 5477 r = cg_unified();
5eee8290
LP
5478 if (r < 0) {
5479 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5480 goto finish;
5481 }
5482
f757855e
LP
5483 r = verify_arguments();
5484 if (r < 0)
5485 goto finish;
03cfe0d5 5486
2f091b1b
TM
5487 r = verify_network_interfaces_initialized();
5488 if (r < 0)
5489 goto finish;
5490
49048684
ZJS
5491 /* Reapply environment settings. */
5492 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 5493
2949ff26
LP
5494 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5495 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5496 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
9c274488 5497 (void) ignore_signals(SIGPIPE);
2949ff26 5498
03cfe0d5
LP
5499 n_fd_passed = sd_listen_fds(false);
5500 if (n_fd_passed > 0) {
5501 r = fdset_new_listen_fds(&fds, false);
5502 if (r < 0) {
5503 log_error_errno(r, "Failed to collect file descriptors: %m");
5504 goto finish;
5505 }
5506 }
5507
83e803a9
ZJS
5508 /* The "default" umask. This is appropriate for most file and directory
5509 * operations performed by nspawn, and is the umask that will be used for
5510 * the child. Functions like copy_devnodes() change the umask temporarily. */
5511 umask(0022);
5512
03cfe0d5
LP
5513 if (arg_directory) {
5514 assert(!arg_image);
5515
b35ca61a
LP
5516 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5517 * /var from the host will propagate into container dynamically (because bad things happen if
5518 * two systems write to the same /var). Let's allow it for the special cases where /var is
5519 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5520 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
1406bd66
LP
5521 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5522 "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
5523 goto finish;
5524 }
5525
5526 if (arg_ephemeral) {
5527 _cleanup_free_ char *np = NULL;
5528
f461a28d 5529 r = chase_and_update(&arg_directory, 0);
3f342ec4
LP
5530 if (r < 0)
5531 goto finish;
5532
7bf011e3
LP
5533 /* If the specified path is a mount point we generate the new snapshot immediately
5534 * inside it under a random name. However if the specified is not a mount point we
5535 * create the new snapshot in the parent directory, just next to it. */
e1873695 5536 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
5537 if (r < 0) {
5538 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5539 goto finish;
5540 }
5541 if (r > 0)
770b5ce4 5542 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 5543 else
770b5ce4 5544 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 5545 if (r < 0) {
0f3be6ca 5546 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
5547 goto finish;
5548 }
5549
6992459c 5550 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
162392b7 5551 * only owned by us and no one else. */
6992459c 5552 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
5553 if (r < 0) {
5554 log_error_errno(r, "Failed to lock %s: %m", np);
5555 goto finish;
5556 }
5557
7bf011e3
LP
5558 {
5559 BLOCK_SIGNALS(SIGINT);
fab4ef72
DDM
5560 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_directory, AT_FDCWD, np,
5561 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5562 BTRFS_SNAPSHOT_FALLBACK_COPY |
5563 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5564 BTRFS_SNAPSHOT_RECURSIVE |
5565 BTRFS_SNAPSHOT_QUOTA |
5566 BTRFS_SNAPSHOT_SIGINT);
7bf011e3
LP
5567 }
5568 if (r == -EINTR) {
5569 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5570 goto finish;
5571 }
03cfe0d5
LP
5572 if (r < 0) {
5573 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5574 goto finish;
ec16945e
LP
5575 }
5576
1cc6c93a 5577 free_and_replace(arg_directory, np);
17cbb288 5578 remove_directory = true;
30535c16 5579 } else {
f461a28d 5580 r = chase_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
5581 if (r < 0)
5582 goto finish;
5583
30535c16
LP
5584 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5585 if (r == -EBUSY) {
5586 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5587 goto finish;
5588 }
5589 if (r < 0) {
5590 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 5591 goto finish;
30535c16
LP
5592 }
5593
5594 if (arg_template) {
f461a28d 5595 r = chase_and_update(&arg_template, 0);
3f342ec4
LP
5596 if (r < 0)
5597 goto finish;
5598
7bf011e3
LP
5599 {
5600 BLOCK_SIGNALS(SIGINT);
fab4ef72
DDM
5601 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_template, AT_FDCWD, arg_directory,
5602 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5603 BTRFS_SNAPSHOT_FALLBACK_COPY |
5604 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5605 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5606 BTRFS_SNAPSHOT_RECURSIVE |
5607 BTRFS_SNAPSHOT_QUOTA |
5608 BTRFS_SNAPSHOT_SIGINT);
7bf011e3 5609 }
ff6c6cc1
LP
5610 if (r == -EEXIST)
5611 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5612 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
5613 else if (r == -EINTR) {
5614 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5615 goto finish;
5616 } else if (r < 0) {
83521414 5617 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 5618 goto finish;
ff6c6cc1
LP
5619 } else
5620 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5621 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 5622 }
ec16945e
LP
5623 }
5624
7732f92b 5625 if (arg_start_mode == START_BOOT) {
aff7ae0d 5626 _cleanup_free_ char *b = NULL;
a5201ed6 5627 const char *p;
c9fe05e0 5628
aff7ae0d
LP
5629 if (arg_pivot_root_new) {
5630 b = path_join(arg_directory, arg_pivot_root_new);
5631 if (!b)
5632 return log_oom();
5633
5634 p = b;
5635 } else
a5201ed6 5636 p = arg_directory;
c9fe05e0
AR
5637
5638 if (path_is_os_tree(p) <= 0) {
aff7ae0d
LP
5639 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5640 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
1b9e5b12
LP
5641 goto finish;
5642 }
5643 } else {
aff7ae0d 5644 _cleanup_free_ char *p = NULL;
c9fe05e0 5645
a5201ed6 5646 if (arg_pivot_root_new)
aff7ae0d 5647 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
a5201ed6 5648 else
aff7ae0d
LP
5649 p = path_join(arg_directory, "/usr/");
5650 if (!p)
5651 return log_oom();
1b9e5b12 5652
aff7ae0d
LP
5653 if (laccess(p, F_OK) < 0) {
5654 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5655 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
1b9e5b12 5656 goto finish;
1b9e5b12
LP
5657 }
5658 }
ec16945e 5659
6b9132a9 5660 } else {
d04faa4e 5661 DissectImageFlags dissect_image_flags =
4b5de5dd 5662 DISSECT_IMAGE_GENERIC_ROOT |
d04faa4e
LP
5663 DISSECT_IMAGE_REQUIRE_ROOT |
5664 DISSECT_IMAGE_RELAX_VAR_CHECK |
73d88b80
LP
5665 DISSECT_IMAGE_USR_NO_ROOT |
5666 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
5667 DISSECT_IMAGE_PIN_PARTITION_DEVICES;
ec16945e
LP
5668 assert(arg_image);
5669 assert(!arg_template);
5670
f461a28d 5671 r = chase_and_update(&arg_image, 0);
3f342ec4
LP
5672 if (r < 0)
5673 goto finish;
5674
0f3be6ca
LP
5675 if (arg_ephemeral) {
5676 _cleanup_free_ char *np = NULL;
5677
5678 r = tempfn_random(arg_image, "machine.", &np);
5679 if (r < 0) {
5680 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5681 goto finish;
5682 }
5683
6992459c
LP
5684 /* Always take an exclusive lock on our own ephemeral copy. */
5685 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
5686 if (r < 0) {
5687 r = log_error_errno(r, "Failed to create image lock: %m");
5688 goto finish;
5689 }
5690
7bf011e3
LP
5691 {
5692 BLOCK_SIGNALS(SIGINT);
7c2f5495
DDM
5693 r = copy_file_full(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600,
5694 FS_NOCOW_FL, FS_NOCOW_FL,
5695 COPY_REFLINK|COPY_CRTIME|COPY_SIGINT,
5696 NULL, NULL);
7bf011e3
LP
5697 }
5698 if (r == -EINTR) {
5699 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5700 goto finish;
5701 }
0f3be6ca
LP
5702 if (r < 0) {
5703 r = log_error_errno(r, "Failed to copy image file: %m");
5704 goto finish;
5705 }
5706
1cc6c93a 5707 free_and_replace(arg_image, np);
0f3be6ca
LP
5708 remove_image = true;
5709 } else {
5710 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5711 if (r == -EBUSY) {
5712 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5713 goto finish;
5714 }
5715 if (r < 0) {
5716 r = log_error_errno(r, "Failed to create image lock: %m");
5717 goto finish;
5718 }
4623e8e6 5719
89e62e0b
LP
5720 r = verity_settings_load(
5721 &arg_verity_settings,
5722 arg_image, NULL, NULL);
e7cbe5cb
LB
5723 if (r < 0) {
5724 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5725 goto finish;
78ebe980 5726 }
89e62e0b
LP
5727
5728 if (arg_verity_settings.data_path)
5729 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
30535c16
LP
5730 }
5731
c67b0082 5732 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5733 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5734 goto finish;
1b9e5b12 5735 }
6b9132a9 5736
c67b0082
LP
5737 remove_tmprootdir = true;
5738
5739 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5740 if (!arg_directory) {
5741 r = log_oom();
5742 goto finish;
6b9132a9 5743 }
88213476 5744
89e62e0b
LP
5745 r = loop_device_make_by_path(
5746 arg_image,
5747 arg_read_only ? O_RDONLY : O_RDWR,
22ee78a8 5748 /* sector_size= */ UINT32_MAX,
89e62e0b 5749 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
7f52206a 5750 LOCK_SH,
89e62e0b 5751 &loop);
2d845785
LP
5752 if (r < 0) {
5753 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5754 goto finish;
5755 }
1b9e5b12 5756
bad31660 5757 r = dissect_loop_device_and_warn(
bad31660 5758 loop,
89e62e0b 5759 &arg_verity_settings,
84be0c71
LP
5760 /* mount_options=*/ NULL,
5761 arg_image_policy ?: &image_policy_container,
e7cbe5cb 5762 dissect_image_flags,
e0f9e7bd 5763 &dissected_image);
2d845785 5764 if (r == -ENOPKG) {
4526113f 5765 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5766 log_notice("Note that the disk image needs to\n"
5767 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5768 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
db811444 5769 " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
2d845785
LP
5770 " d) or contain a file system without a partition table\n"
5771 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5772 goto finish;
2d845785 5773 }
4526113f 5774 if (r < 0)
842f3b0f 5775 goto finish;
1b9e5b12 5776
88b3300f
LP
5777 r = dissected_image_load_verity_sig_partition(
5778 dissected_image,
5779 loop->fd,
5780 &arg_verity_settings);
5781 if (r < 0)
5782 goto finish;
5783
8ee9615e
LP
5784 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
5785 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5786 "root hash signature found! Proceeding without integrity checking.", arg_image);
4623e8e6 5787
89e62e0b
LP
5788 r = dissected_image_decrypt_interactively(
5789 dissected_image,
5790 NULL,
5791 &arg_verity_settings,
e330f97a 5792 0);
1b9e5b12
LP
5793 if (r < 0)
5794 goto finish;
0f3be6ca
LP
5795
5796 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5797 if (remove_image && unlink(arg_image) >= 0)
5798 remove_image = false;
4c27749b
LP
5799
5800 if (arg_architecture < 0)
5801 arg_architecture = dissected_image_architecture(dissected_image);
842f3b0f 5802 }
842f3b0f 5803
86c0dd4a 5804 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5805 if (r < 0)
5806 goto finish;
5807
de40a303
LP
5808 if (arg_console_mode < 0)
5809 arg_console_mode =
5810 isatty(STDIN_FILENO) > 0 &&
5811 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5812
de40a303
LP
5813 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5814 arg_quiet = true;
a258bf26 5815
9c857b9d 5816 if (!arg_quiet)
c85c2f79 5817 log_info("Spawning container %s on %s.\nPress Ctrl-] three times within 1s to kill container.",
9c857b9d
LP
5818 arg_machine, arg_image ?: arg_directory);
5819
988851b6 5820 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
a258bf26 5821
8c3fe1b5
LP
5822 r = make_reaper_process(true);
5823 if (r < 0) {
5824 log_error_errno(r, "Failed to become subreaper: %m");
03cfe0d5
LP
5825 goto finish;
5826 }
5827
761cf19d
FW
5828 if (arg_expose_ports) {
5829 r = fw_ctx_new(&fw_ctx);
5830 if (r < 0) {
5831 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5832 goto finish;
5833 }
5834 expose_args.fw_ctx = fw_ctx;
5835 }
d87be9b0 5836 for (;;) {
3acc84eb 5837 r = run_container(dissected_image,
44dbef90
LP
5838 fds,
5839 veth_name, &veth_created,
761cf19d 5840 &expose_args, &master,
44dbef90 5841 &pid, &ret);
b0067625 5842 if (r <= 0)
d87be9b0 5843 break;
d87be9b0 5844 }
88213476
LP
5845
5846finish:
04f590a4
LP
5847 (void) sd_notify(false,
5848 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5849 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5850
9444b1f2 5851 if (pid > 0)
c67b0082 5852 (void) kill(pid, SIGKILL);
88213476 5853
503546da 5854 /* Try to flush whatever is still queued in the pty */
6a0f896b 5855 if (master >= 0) {
f5fbe71d 5856 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
6a0f896b
LP
5857 master = safe_close(master);
5858 }
5859
5860 if (pid > 0)
5861 (void) wait_for_terminate(pid, NULL);
503546da 5862
50ebcf6c
LP
5863 pager_close();
5864
17cbb288 5865 if (remove_directory && arg_directory) {
ec16945e
LP
5866 int k;
5867
17cbb288 5868 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5869 if (k < 0)
17cbb288 5870 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5871 }
5872
0f3be6ca
LP
5873 if (remove_image && arg_image) {
5874 if (unlink(arg_image) < 0)
5875 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5876 }
5877
c67b0082
LP
5878 if (remove_tmprootdir) {
5879 if (rmdir(tmprootdir) < 0)
5880 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5881 }
5882
785890ac
LP
5883 if (arg_machine) {
5884 const char *p;
5885
63c372cb 5886 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5887 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5888 }
5889
deff68e7
FW
5890 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5891 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
7513c5b8
LP
5892
5893 if (veth_created)
5894 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5895 (void) remove_bridge(arg_network_zone);
f757855e 5896
f757855e
LP
5897 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5898 expose_port_free_all(arg_expose_ports);
bf428efb 5899 rlimit_free_all(arg_rlimit);
b2645747 5900 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3652872a 5901 credential_free_all(arg_credentials, arg_n_credentials);
6d0b55c2 5902
44dbef90
LP
5903 if (r < 0)
5904 return r;
5905
5906 return ret;
88213476 5907}
44dbef90
LP
5908
/* Hands run() to the main-function macro. run() returns a negative value when it failed and the
 * container's 'ret' value otherwise. NOTE(review): judging by the macro's name, this presumably
 * generates main() and treats positive return values of run() as failure exit codes too —
 * confirm against the macro's definition. */
5909DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);