]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
Merge pull request #25608 from poettering/dissect-moar
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
335d2ead 14#include <sys/ioctl.h>
8fe0087e
LP
15#include <sys/personality.h>
16#include <sys/prctl.h>
17#include <sys/types.h>
6916b164 18#include <sys/wait.h>
335d2ead 19#include <termios.h>
8fe0087e 20#include <unistd.h>
1b9e5b12 21
b053cd5f 22#include "sd-bus.h"
1f0cd86b 23#include "sd-daemon.h"
1f0cd86b 24#include "sd-id128.h"
8fe0087e 25
b5efdb8a 26#include "alloc-util.h"
8fe0087e
LP
27#include "barrier.h"
28#include "base-filesystem.h"
29#include "blkid-util.h"
30#include "btrfs-util.h"
d6b4d1c7 31#include "build.h"
b8ea7a6e 32#include "bus-error.h"
b053cd5f 33#include "bus-util.h"
8fe0087e 34#include "cap-list.h"
430f0182 35#include "capability-util.h"
04d391da 36#include "cgroup-util.h"
f461a28d 37#include "chase.h"
988851b6 38#include "common-signal.h"
8fe0087e 39#include "copy.h"
d107bb7d 40#include "cpu-set-util.h"
786d19fd 41#include "creds-util.h"
4fc9982c 42#include "dev-setup.h"
57f1b61b 43#include "discover-image.h"
2d845785 44#include "dissect-image.h"
8fe0087e 45#include "env-util.h"
3652872a 46#include "escape.h"
3ffd4af2 47#include "fd-util.h"
842f3b0f 48#include "fdset.h"
a5c32cff 49#include "fileio.h"
f97b34a6 50#include "format-util.h"
f4f15635 51#include "fs-util.h"
1b9e5b12 52#include "gpt.h"
4623e8e6 53#include "hexdecoct.h"
e2054217 54#include "hostname-setup.h"
8fe0087e 55#include "hostname-util.h"
910fd145 56#include "id128-util.h"
3652872a 57#include "io-util.h"
8fe0087e 58#include "log.h"
2d845785 59#include "loop-util.h"
8fe0087e 60#include "loopback-setup.h"
8fe0087e 61#include "macro.h"
44dbef90 62#include "main-func.h"
f5947a5e 63#include "missing_sched.h"
8fe0087e 64#include "mkdir.h"
4349cd7c 65#include "mount-util.h"
049af8ad 66#include "mountpoint-util.h"
0cb8e3d1 67#include "namespace-util.h"
8fe0087e 68#include "netlink-util.h"
2f893044 69#include "nspawn-bind-user.h"
07630cea 70#include "nspawn-cgroup.h"
3652872a 71#include "nspawn-creds.h"
3603efde 72#include "nspawn-def.h"
07630cea
LP
73#include "nspawn-expose-ports.h"
74#include "nspawn-mount.h"
75#include "nspawn-network.h"
de40a303 76#include "nspawn-oci.h"
7336138e 77#include "nspawn-patch-uid.h"
07630cea 78#include "nspawn-register.h"
910fd145 79#include "nspawn-seccomp.h"
07630cea
LP
80#include "nspawn-settings.h"
81#include "nspawn-setuid.h"
7732f92b 82#include "nspawn-stub-pid1.h"
c9394f4f 83#include "nspawn-util.h"
91181e07 84#include "nspawn.h"
d8b4d14d 85#include "nulstr-util.h"
d58ad743 86#include "os-util.h"
50ebcf6c 87#include "pager.h"
614b022c 88#include "parse-argument.h"
6bedfcbb 89#include "parse-util.h"
294bf0c3 90#include "pretty-print.h"
0b452006 91#include "process-util.h"
8fe0087e
LP
92#include "ptyfwd.h"
93#include "random-util.h"
8869a0b4 94#include "raw-clone.h"
86775e35 95#include "resolve-util.h"
bf428efb 96#include "rlimit-util.h"
8fe0087e 97#include "rm-rf.h"
de40a303
LP
98#if HAVE_SECCOMP
99#include "seccomp-util.h"
100#endif
68b02049 101#include "selinux-util.h"
8fe0087e 102#include "signal-util.h"
2583fbea 103#include "socket-util.h"
8fcde012 104#include "stat-util.h"
15a5e950 105#include "stdio-util.h"
5c828e66 106#include "string-table.h"
07630cea 107#include "string-util.h"
8fe0087e 108#include "strv.h"
de40a303 109#include "sysctl-util.h"
8fe0087e 110#include "terminal-util.h"
e4de7287 111#include "tmpfile-util.h"
affb60b1 112#include "umask-util.h"
43c3fb46 113#include "unit-name.h"
b1d4f8e1 114#include "user-util.h"
e9642be2 115
e96ceaba
LP
/* Path of the notification socket as seen from inside the container. The payload can use it to talk to
 * nspawn via the sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
/* NOTE(review): presumably the in-container directory used to pass mounts in from the host — confirm
 * against the users of this macro. */
#define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"

/* NOTE(review): presumably a dedicated exit status used to request a container restart — confirm
 * against the code that checks it. */
#define EXIT_FORCE_RESTART 133

/* How a container run ended: either it terminated, or it asked to be rebooted. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
126
/* Process-wide option state. Each arg_* variable holds the value of one command line switch,
 * environment variable or settings file entry; the initializers below are the defaults. */
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_chdir = NULL;
static char *arg_pivot_root_new = NULL;
static char *arg_pivot_root_old = NULL;
static char *arg_user = NULL;
static uid_t arg_uid = UID_INVALID;
static gid_t arg_gid = GID_INVALID;
static gid_t* arg_supplementary_gids = NULL;
static size_t arg_n_supplementary_gids = 0;
static sd_id128_t arg_uuid = {};
static char *arg_machine = NULL; /* The name used by the host to refer to this */
static char *arg_hostname = NULL; /* The name the payload sees by default */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static char *arg_slice = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static StartMode arg_start_mode = START_PID1;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
/* Default set of capabilities retained for the container, as a bit mask indexed by capability number.
 * Per the help text, --capability= adds to and --drop-capability= removes from this default. */
static uint64_t arg_caps_retain =
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_MKNOD) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_TTY_CONFIG);
static uint64_t arg_caps_ambient = 0;
static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
/* Array of custom mounts together with its element count. */
static CustomMount *arg_custom_mounts = NULL;
static size_t arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_register = true;
static bool arg_keep_unit = false;
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static char **arg_network_veth_extra = NULL;
static char *arg_network_bridge = NULL;
static char *arg_network_zone = NULL;
static char *arg_network_namespace_path = NULL;
static PagerFlags arg_pager_flags = 0;
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
static char *arg_oci_bundle = NULL;
static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
static sd_bus_message *arg_property_message = NULL;
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
static int arg_kill_signal = 0;
static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
/* Bit mask of settings that have been set explicitly (e.g. SETTING_CONSOLE_MODE, SETTING_USE_CGNS). */
static SettingsMask arg_settings_mask = 0;
static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
/* May be overridden through $SYSTEMD_NSPAWN_CONTAINER_SERVICE. */
static const char *arg_container_service_name = "systemd-nspawn";
static bool arg_notify_ready = false;
static bool arg_use_cgns = true;
/* Namespace flags that are unshared by default; the $SYSTEMD_NSPAWN_SHARE_* variables clear bits here. */
static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
static char **arg_syscall_allow_list = NULL;
static char **arg_syscall_deny_list = NULL;
#if HAVE_SECCOMP
static scmp_filter_ctx arg_seccomp = NULL;
#endif
static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
static bool arg_no_new_privileges = false;
static int arg_oom_score_adjust = 0;
static bool arg_oom_score_adjust_set = false;
static CPUSet arg_cpu_set = {};
static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
static TimezoneMode arg_timezone = TIMEZONE_AUTO;
static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
static DeviceNode* arg_extra_nodes = NULL;
static size_t arg_n_extra_nodes = 0;
static char **arg_sysctl = NULL;
static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
static Credential *arg_credentials = NULL;
static size_t arg_n_credentials = 0;
static char **arg_bind_user = NULL;
static bool arg_suppress_sync = false;
static char *arg_settings_filename = NULL;
static Architecture arg_architecture = _ARCHITECTURE_INVALID;
static ImagePolicy *arg_image_policy = NULL;
88213476 238
6145bb4f
LP
/* Pair each dynamically allocated option global with the function that releases it.
 * NOTE(review): presumably these destructors are run on exit by the main-func.h machinery — confirm. */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
/* arg_seccomp only exists when built with seccomp support, hence so does its destructor. */
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
6145bb4f 273
dce66ffe
ZJS
274static int handle_arg_console(const char *arg) {
275 if (streq(arg, "help")) {
10e8a60b
LP
276 puts("autopipe\n"
277 "interactive\n"
dce66ffe 278 "passive\n"
10e8a60b
LP
279 "pipe\n"
280 "read-only");
dce66ffe
ZJS
281 return 0;
282 }
283
284 if (streq(arg, "interactive"))
285 arg_console_mode = CONSOLE_INTERACTIVE;
286 else if (streq(arg, "read-only"))
287 arg_console_mode = CONSOLE_READ_ONLY;
288 else if (streq(arg, "passive"))
289 arg_console_mode = CONSOLE_PASSIVE;
554c4beb
LP
290 else if (streq(arg, "pipe")) {
291 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
292 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
293 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
294 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
295 "Proceeding anyway.");
296
dce66ffe 297 arg_console_mode = CONSOLE_PIPE;
10e8a60b
LP
298 } else if (streq(arg, "autopipe")) {
299 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
300 arg_console_mode = CONSOLE_INTERACTIVE;
301 else
302 arg_console_mode = CONSOLE_PIPE;
554c4beb 303 } else
dce66ffe
ZJS
304 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
305
306 arg_settings_mask |= SETTING_CONSOLE_MODE;
307 return 1;
308}
309
37ec0fdd
LP
310static int help(void) {
311 _cleanup_free_ char *link = NULL;
312 int r;
313
384c2c32 314 pager_open(arg_pager_flags);
50ebcf6c 315
37ec0fdd
LP
316 r = terminal_urlify_man("systemd-nspawn", "1", &link);
317 if (r < 0)
318 return log_oom();
319
25148653 320 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 321 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
322 " -h --help Show this help\n"
323 " --version Print version string\n"
69c79d3c 324 " -q --quiet Do not show status information\n"
bb068de0 325 " --no-pager Do not pipe output into a pager\n"
25148653
LP
326 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
327 "%3$sImage:%4$s\n"
1b9e5b12 328 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
329 " --template=PATH Initialize root directory from template directory,\n"
330 " if missing\n"
331 " -x --ephemeral Run container with snapshot of root directory, and\n"
332 " remove it after exit\n"
25e68fd3
LP
333 " -i --image=PATH Root file system disk image (or device node) for\n"
334 " the container\n"
84be0c71 335 " --image-policy=POLICY Specify disk image dissection policy\n"
de40a303 336 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
337 " --read-only Mount the root directory read-only\n"
338 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 339 " --root-hash=HASH Specify verity root hash for root disk image\n"
c2923fdc
LB
340 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
341 " as a DER encoded PKCS7, either as a path to a file\n"
342 " or as an ASCII base64 encoded string prefixed by\n"
343 " 'base64:'\n"
e7cbe5cb 344 " --verity-data=PATH Specify hash device for verity\n"
25148653
LP
345 " --pivot-root=PATH[:PATH]\n"
346 " Pivot root to given directory in the container\n\n"
347 "%3$sExecution:%4$s\n"
7732f92b 348 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 349 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 350 " --chdir=PATH Set working directory in the container\n"
0d2a0179 351 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
25148653
LP
352 " -u --user=USER Run the command under specified user or UID\n"
353 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
4a4654e0
LP
354 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
355 " --suppress-sync=BOOLEAN\n"
356 " Suppress any form of disk data synchronization\n\n"
25148653 357 "%3$sSystem Identity:%4$s\n"
a8828ed9 358 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 359 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
360 " --uuid=UUID Set a specific machine UUID for the container\n\n"
361 "%3$sProperties:%4$s\n"
a8828ed9 362 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 363 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
364 " --register=BOOLEAN Register container as machine\n"
365 " --keep-unit Do not register a scope for the machine, reuse\n"
366 " the service unit nspawn is running in\n\n"
367 "%3$sUser Namespacing:%4$s\n"
b917743d
YW
368 " --private-users=no Run without user namespacing\n"
369 " --private-users=yes|pick|identity\n"
370 " Run within user namespace, autoselect UID/GID range\n"
371 " --private-users=UIDBASE[:NUIDS]\n"
90b4a64d 372 " Similar, but with user configured UID/GID range\n"
6c045a99
LP
373 " --private-users-ownership=MODE\n"
374 " Adjust ('chown') or map ('map') OS tree ownership\n"
b917743d
YW
375 " to private UID/GID range\n"
376 " -U Equivalent to --private-users=pick and\n"
377 " --private-users-ownership=auto\n\n"
25148653 378 "%3$sNetworking:%4$s\n"
69c79d3c
LP
379 " --private-network Disable network in container\n"
380 " --network-interface=INTERFACE\n"
381 " Assign an existing network interface to the\n"
382 " container\n"
c74e630d
LP
383 " --network-macvlan=INTERFACE\n"
384 " Create a macvlan network interface based on an\n"
385 " existing network interface to the container\n"
4bbfe7ad 386 " --network-ipvlan=INTERFACE\n"
387f6955 387 " Create an ipvlan network interface based on an\n"
4bbfe7ad 388 " existing network interface to the container\n"
a8eaaee7 389 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 390 " and container\n"
f6d6bad1
LP
391 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
392 " Add an additional virtual Ethernet link between\n"
393 " host and container\n"
ab046dde 394 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
395 " Add a virtual Ethernet connection to the container\n"
396 " and attach it to an existing bridge on the host\n"
397 " --network-zone=NAME Similar, but attach the new interface to an\n"
398 " an automatically managed bridge interface\n"
d7bea6b6
DP
399 " --network-namespace-path=PATH\n"
400 " Set network namespace to the one represented by\n"
401 " the specified kernel namespace file node\n"
6d0b55c2 402 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
403 " Expose a container IP port on the host\n\n"
404 "%3$sSecurity:%4$s\n"
a8828ed9
DW
405 " --capability=CAP In addition to the default, retain specified\n"
406 " capability\n"
407 " --drop-capability=CAP Drop the specified capability from the default set\n"
88fc9c9b
TH
408 " --ambient-capability=CAP\n"
409 " Sets the specified capability for the started\n"
410 " process. Not useful if booting a machine.\n"
f4e803c8 411 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
412 " --system-call-filter=LIST|~LIST\n"
413 " Permit/prohibit specific system calls\n"
25148653
LP
414 " -Z --selinux-context=SECLABEL\n"
415 " Set the SELinux security context to be used by\n"
416 " processes in the container\n"
417 " -L --selinux-apifs-context=SECLABEL\n"
418 " Set the SELinux security context to be used by\n"
419 " API/tmpfs file systems in the container\n\n"
420 "%3$sResources:%4$s\n"
bf428efb 421 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
422 " --oom-score-adjust=VALUE\n"
423 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
424 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
425 " --personality=ARCH Pick personality for this container\n\n"
25148653 426 "%3$sIntegration:%4$s\n"
09d423e9 427 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 428 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
429 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
430 " host, try-guest, try-host\n"
431 " -j Equivalent to --link-journal=try-guest\n\n"
432 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
433 " --bind=PATH[:PATH[:OPTIONS]]\n"
434 " Bind mount a file or directory from the host into\n"
a8828ed9 435 " the container\n"
5e5bfa6e
EY
436 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
437 " Similar, but creates a read-only bind mount\n"
de40a303
LP
438 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
439 " it\n"
06c17c39 440 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
441 " --overlay=PATH[:PATH...]:PATH\n"
442 " Create an overlay mount from the host to \n"
443 " the container\n"
444 " --overlay-ro=PATH[:PATH...]:PATH\n"
2f893044
LP
445 " Similar, but creates a read-only overlay mount\n"
446 " --bind-user=NAME Bind user from host to container\n\n"
25148653 447 "%3$sInput/Output:%4$s\n"
de40a303
LP
448 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
449 " set up for the container.\n"
3652872a
LP
450 " -P --pipe Equivalent to --console=pipe\n\n"
451 "%3$sCredentials:%4$s\n"
452 " --set-credential=ID:VALUE\n"
453 " Pass a credential with literal value to container.\n"
454 " --load-credential=ID:PATH\n"
455 " Load credential to pass to container from file or\n"
456 " AF_UNIX stream socket.\n"
bc556335
DDM
457 "\nSee the %2$s for details.\n",
458 program_invocation_short_name,
459 link,
460 ansi_underline(),
461 ansi_normal(),
462 ansi_highlight(),
463 ansi_normal());
37ec0fdd
LP
464
465 return 0;
88213476
LP
466}
467
86c0dd4a 468static int custom_mount_check_all(void) {
88614c8a 469 size_t i;
5a8af538 470
5a8af538
LP
471 for (i = 0; i < arg_n_custom_mounts; i++) {
472 CustomMount *m = &arg_custom_mounts[i];
473
0de7acce 474 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
6c045a99 475 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
baaa35ad 476 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
c920b863 477 "--private-users-ownership=own may not be combined with custom root mounts.");
6c045a99 478 if (arg_uid_shift == UID_INVALID)
baaa35ad
ZJS
479 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
480 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 481 }
5a8af538
LP
482 }
483
484 return 0;
485}
486
8199d554 487static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 488 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 489 int r;
5da38d07 490
efdb0237 491 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
492
493 e = getenv(var);
494 if (!e) {
d5fc5b2f 495 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
496 var = "UNIFIED_CGROUP_HIERARCHY";
497 e = getenv(var);
c78c095b
ZJS
498 }
499
500 if (!isempty(e)) {
efdb0237
LP
501 r = parse_boolean(e);
502 if (r < 0)
c78c095b 503 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
504 if (r > 0)
505 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
506 else
507 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
508 }
509
8199d554
LP
510 return 0;
511}
512
513static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
514 int r;
515
75b0d8b8
ZJS
516 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
517 * in the image actually supports. */
b4cccbc1
LP
518 r = cg_all_unified();
519 if (r < 0)
520 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
521 if (r > 0) {
a8725a06
ZJS
522 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
523 * routine only detects 231, so we'll have a false negative here for 230. */
7e6821ed 524 r = systemd_installation_has_version(directory, "230");
a8725a06
ZJS
525 if (r < 0)
526 return log_error_errno(r, "Failed to determine systemd version in container: %m");
527 if (r > 0)
528 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
529 else
530 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 531 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b 532 /* Mixed cgroup hierarchy support was added in 233 */
7e6821ed 533 r = systemd_installation_has_version(directory, "233");
0fd9563f
ZJS
534 if (r < 0)
535 return log_error_errno(r, "Failed to determine systemd version in container: %m");
536 if (r > 0)
537 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
538 else
539 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
540 } else
5da38d07 541 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 542
8199d554
LP
543 log_debug("Using %s hierarchy for container.",
544 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
545 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
546
efdb0237
LP
547 return 0;
548}
549
8a99bd0c
ZJS
550static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
551 uint64_t mask = 0;
552 int r;
553
554 for (;;) {
555 _cleanup_free_ char *t = NULL;
556
557 r = extract_first_word(&spec, &t, ",", 0);
558 if (r < 0)
559 return log_error_errno(r, "Failed to parse capability %s.", t);
560 if (r == 0)
561 break;
562
563 if (streq(t, "help")) {
564 for (int i = 0; i < capability_list_length(); i++) {
565 const char *name;
566
567 name = capability_to_name(i);
568 if (name)
569 puts(name);
570 }
571
572 return 0; /* quit */
573 }
574
575 if (streq(t, "all"))
f5fbe71d 576 mask = UINT64_MAX;
8a99bd0c
ZJS
577 else {
578 r = capability_from_name(t);
579 if (r < 0)
580 return log_error_errno(r, "Failed to parse capability %s.", t);
581
582 mask |= 1ULL << r;
583 }
584 }
585
586 *ret_mask = mask;
587 return 1; /* continue */
588}
589
49048684 590static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
591 int r;
592
593 r = getenv_bool(name);
594 if (r == -ENXIO)
49048684 595 return 0;
0c582db0 596 if (r < 0)
49048684 597 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 598
0c582db0 599 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 600 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 601 return 0;
0c582db0
LB
602}
603
49048684 604static int parse_mount_settings_env(void) {
4f086aab 605 const char *e;
1099ceeb
LP
606 int r;
607
608 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
609 if (r < 0 && r != -ENXIO)
610 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
611 if (r >= 0)
612 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
613
614 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 615 if (streq_ptr(e, "network"))
4f086aab 616 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 617
49048684
ZJS
618 else if (e) {
619 r = parse_boolean(e);
620 if (r < 0)
621 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
622
623 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
624 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 625 }
4f086aab 626
49048684 627 return 0;
4f086aab
SU
628}
629
49048684 630static int parse_environment(void) {
d5455d2f
LP
631 const char *e;
632 int r;
633
49048684
ZJS
634 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
635 if (r < 0)
636 return r;
637 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
638 if (r < 0)
639 return r;
640 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
641 if (r < 0)
642 return r;
643 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
644 if (r < 0)
645 return r;
d5455d2f 646
49048684
ZJS
647 r = parse_mount_settings_env();
648 if (r < 0)
649 return r;
d5455d2f 650
489fae52
ZJS
651 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
652 * even if it is supported. If not supported, it has no effect. */
de40a303 653 if (!cg_ns_supported())
489fae52 654 arg_use_cgns = false;
de40a303
LP
655 else {
656 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
657 if (r < 0) {
658 if (r != -ENXIO)
49048684 659 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
660
661 arg_use_cgns = true;
662 } else {
663 arg_use_cgns = r > 0;
664 arg_settings_mask |= SETTING_USE_CGNS;
665 }
666 }
d5455d2f
LP
667
668 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
669 if (e)
670 arg_container_service_name = e;
671
4a4654e0
LP
672 r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
673 if (r >= 0)
674 arg_suppress_sync = r;
675 else if (r != -ENXIO)
676 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
677
49048684 678 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
679}
680
88213476 681static int parse_argv(int argc, char *argv[]) {
a41fe3a2 682 enum {
acbeb427
ZJS
683 ARG_VERSION = 0x100,
684 ARG_PRIVATE_NETWORK,
bc2f673e 685 ARG_UUID,
5076f0cc 686 ARG_READ_ONLY,
57fb9fb5 687 ARG_CAPABILITY,
88fc9c9b 688 ARG_AMBIENT_CAPABILITY,
420c7379 689 ARG_DROP_CAPABILITY,
17fe0523
LP
690 ARG_LINK_JOURNAL,
691 ARG_BIND,
f4889f65 692 ARG_BIND_RO,
06c17c39 693 ARG_TMPFS,
5a8af538
LP
694 ARG_OVERLAY,
695 ARG_OVERLAY_RO,
de40a303 696 ARG_INACCESSIBLE,
eb91eb18 697 ARG_SHARE_SYSTEM,
89f7c846 698 ARG_REGISTER,
aa28aefe 699 ARG_KEEP_UNIT,
69c79d3c 700 ARG_NETWORK_INTERFACE,
c74e630d 701 ARG_NETWORK_MACVLAN,
4bbfe7ad 702 ARG_NETWORK_IPVLAN,
ab046dde 703 ARG_NETWORK_BRIDGE,
22b28dfd 704 ARG_NETWORK_ZONE,
f6d6bad1 705 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 706 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 707 ARG_PERSONALITY,
4d9f07b4 708 ARG_VOLATILE,
ec16945e 709 ARG_TEMPLATE,
f36933fe 710 ARG_PROPERTY,
6dac160c 711 ARG_PRIVATE_USERS,
c6c8f6e2 712 ARG_KILL_SIGNAL,
f757855e 713 ARG_SETTINGS,
5f932eb9 714 ARG_CHDIR,
b53ede69 715 ARG_PIVOT_ROOT,
7336138e 716 ARG_PRIVATE_USERS_CHOWN,
6c045a99 717 ARG_PRIVATE_USERS_OWNERSHIP,
9c1e04d0 718 ARG_NOTIFY_READY,
4623e8e6 719 ARG_ROOT_HASH,
89e62e0b
LP
720 ARG_ROOT_HASH_SIG,
721 ARG_VERITY_DATA,
960e4569 722 ARG_SYSTEM_CALL_FILTER,
bf428efb 723 ARG_RLIMIT,
3a9530e5 724 ARG_HOSTNAME,
66edd963 725 ARG_NO_NEW_PRIVILEGES,
81f345df 726 ARG_OOM_SCORE_ADJUST,
d107bb7d 727 ARG_CPU_AFFINITY,
09d423e9 728 ARG_RESOLV_CONF,
1688841f 729 ARG_TIMEZONE,
de40a303
LP
730 ARG_CONSOLE,
731 ARG_PIPE,
732 ARG_OCI_BUNDLE,
bb068de0 733 ARG_NO_PAGER,
3652872a
LP
734 ARG_SET_CREDENTIAL,
735 ARG_LOAD_CREDENTIAL,
2f893044 736 ARG_BIND_USER,
4a4654e0 737 ARG_SUPPRESS_SYNC,
84be0c71 738 ARG_IMAGE_POLICY,
a41fe3a2
LP
739 };
740
88213476 741 static const struct option options[] = {
d7bea6b6
DP
742 { "help", no_argument, NULL, 'h' },
743 { "version", no_argument, NULL, ARG_VERSION },
744 { "directory", required_argument, NULL, 'D' },
745 { "template", required_argument, NULL, ARG_TEMPLATE },
746 { "ephemeral", no_argument, NULL, 'x' },
747 { "user", required_argument, NULL, 'u' },
748 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
749 { "as-pid2", no_argument, NULL, 'a' },
750 { "boot", no_argument, NULL, 'b' },
751 { "uuid", required_argument, NULL, ARG_UUID },
752 { "read-only", no_argument, NULL, ARG_READ_ONLY },
753 { "capability", required_argument, NULL, ARG_CAPABILITY },
88fc9c9b 754 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
d7bea6b6 755 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 756 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
757 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
758 { "bind", required_argument, NULL, ARG_BIND },
759 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
760 { "tmpfs", required_argument, NULL, ARG_TMPFS },
761 { "overlay", required_argument, NULL, ARG_OVERLAY },
762 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 763 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 764 { "machine", required_argument, NULL, 'M' },
3a9530e5 765 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
766 { "slice", required_argument, NULL, 'S' },
767 { "setenv", required_argument, NULL, 'E' },
768 { "selinux-context", required_argument, NULL, 'Z' },
769 { "selinux-apifs-context", required_argument, NULL, 'L' },
770 { "quiet", no_argument, NULL, 'q' },
771 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
772 { "register", required_argument, NULL, ARG_REGISTER },
773 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
774 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
775 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
776 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
777 { "network-veth", no_argument, NULL, 'n' },
778 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
779 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
780 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
781 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
782 { "personality", required_argument, NULL, ARG_PERSONALITY },
783 { "image", required_argument, NULL, 'i' },
784 { "volatile", optional_argument, NULL, ARG_VOLATILE },
785 { "port", required_argument, NULL, 'p' },
786 { "property", required_argument, NULL, ARG_PROPERTY },
787 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
6c045a99
LP
788 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
789 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
d7bea6b6
DP
790 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
791 { "settings", required_argument, NULL, ARG_SETTINGS },
792 { "chdir", required_argument, NULL, ARG_CHDIR },
793 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
794 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
795 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
89e62e0b
LP
796 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
797 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
d7bea6b6 798 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 799 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 800 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 801 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 802 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 803 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
804 { "console", required_argument, NULL, ARG_CONSOLE },
805 { "pipe", no_argument, NULL, ARG_PIPE },
806 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 807 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
3652872a
LP
808 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
809 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
2f893044 810 { "bind-user", required_argument, NULL, ARG_BIND_USER },
4a4654e0 811 { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
84be0c71 812 { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY },
eb9da376 813 {}
88213476
LP
814 };
815
9444b1f2 816 int c, r;
a42c8b54 817 uint64_t plus = 0, minus = 0;
f757855e 818 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
819
820 assert(argc >= 0);
821 assert(argv);
822
ef9c12b1
YW
823 /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long()
824 * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */
825 optind = 0;
de40a303 826 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
827 switch (c) {
828
829 case 'h':
37ec0fdd 830 return help();
88213476 831
acbeb427 832 case ARG_VERSION:
3f6fd1ba 833 return version();
acbeb427 834
88213476 835 case 'D':
614b022c 836 r = parse_path_argument(optarg, false, &arg_directory);
ec16945e 837 if (r < 0)
0f03c2a4 838 return r;
de40a303
LP
839
840 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
841 break;
842
843 case ARG_TEMPLATE:
614b022c 844 r = parse_path_argument(optarg, false, &arg_template);
ec16945e 845 if (r < 0)
0f03c2a4 846 return r;
de40a303
LP
847
848 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
849 break;
850
1b9e5b12 851 case 'i':
614b022c 852 r = parse_path_argument(optarg, false, &arg_image);
ec16945e 853 if (r < 0)
0f03c2a4 854 return r;
de40a303
LP
855
856 arg_settings_mask |= SETTING_DIRECTORY;
857 break;
858
859 case ARG_OCI_BUNDLE:
614b022c 860 r = parse_path_argument(optarg, false, &arg_oci_bundle);
de40a303
LP
861 if (r < 0)
862 return r;
863
ec16945e
LP
864 break;
865
866 case 'x':
867 arg_ephemeral = true;
a2f577fc 868 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
869 break;
870
687d0825 871 case 'u':
2fc09a9c
DM
872 r = free_and_strdup(&arg_user, optarg);
873 if (r < 0)
7027ff61 874 return log_oom();
687d0825 875
f757855e 876 arg_settings_mask |= SETTING_USER;
687d0825
MV
877 break;
878
22b28dfd
LP
879 case ARG_NETWORK_ZONE: {
880 char *j;
881
b910cc72 882 j = strjoin("vz-", optarg);
22b28dfd
LP
883 if (!j)
884 return log_oom();
885
886 if (!ifname_valid(j)) {
887 log_error("Network zone name not valid: %s", j);
888 free(j);
889 return -EINVAL;
890 }
891
df1fac6d 892 free_and_replace(arg_network_zone, j);
22b28dfd
LP
893
894 arg_network_veth = true;
895 arg_private_network = true;
896 arg_settings_mask |= SETTING_NETWORK;
897 break;
898 }
899
ab046dde 900 case ARG_NETWORK_BRIDGE:
ef76dff2 901
baaa35ad
ZJS
902 if (!ifname_valid(optarg))
903 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
904 "Bridge interface name not valid: %s", optarg);
ef76dff2 905
f757855e
LP
906 r = free_and_strdup(&arg_network_bridge, optarg);
907 if (r < 0)
908 return log_oom();
ab046dde 909
4831981d 910 _fallthrough_;
0dfaa006 911 case 'n':
69c79d3c
LP
912 arg_network_veth = true;
913 arg_private_network = true;
f757855e 914 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
915 break;
916
f6d6bad1
LP
917 case ARG_NETWORK_VETH_EXTRA:
918 r = veth_extra_parse(&arg_network_veth_extra, optarg);
919 if (r < 0)
920 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
921
922 arg_private_network = true;
923 arg_settings_mask |= SETTING_NETWORK;
924 break;
925
aa28aefe 926 case ARG_NETWORK_INTERFACE:
baaa35ad
ZJS
927 if (!ifname_valid(optarg))
928 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
929 "Network interface name not valid: %s", optarg);
ef76dff2 930
b390f178
DDM
931 r = test_network_interface_initialized(optarg);
932 if (r < 0)
933 return r;
934
c74e630d
LP
935 if (strv_extend(&arg_network_interfaces, optarg) < 0)
936 return log_oom();
937
938 arg_private_network = true;
f757855e 939 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
940 break;
941
942 case ARG_NETWORK_MACVLAN:
ef76dff2 943
baaa35ad
ZJS
944 if (!ifname_valid(optarg))
945 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
946 "MACVLAN network interface name not valid: %s", optarg);
ef76dff2 947
b390f178
DDM
948 r = test_network_interface_initialized(optarg);
949 if (r < 0)
950 return r;
951
c74e630d 952 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
953 return log_oom();
954
4bbfe7ad 955 arg_private_network = true;
f757855e 956 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
957 break;
958
959 case ARG_NETWORK_IPVLAN:
ef76dff2 960
baaa35ad
ZJS
961 if (!ifname_valid(optarg))
962 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
963 "IPVLAN network interface name not valid: %s", optarg);
ef76dff2 964
b390f178
DDM
965 r = test_network_interface_initialized(optarg);
966 if (r < 0)
967 return r;
968
4bbfe7ad
TG
969 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
970 return log_oom();
971
4831981d 972 _fallthrough_;
ff01d048
LP
973 case ARG_PRIVATE_NETWORK:
974 arg_private_network = true;
f757855e 975 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
976 break;
977
d7bea6b6 978 case ARG_NETWORK_NAMESPACE_PATH:
614b022c 979 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
d7bea6b6
DP
980 if (r < 0)
981 return r;
982
de40a303 983 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
984 break;
985
0f0dbc46 986 case 'b':
baaa35ad
ZJS
987 if (arg_start_mode == START_PID2)
988 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
989 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
990
991 arg_start_mode = START_BOOT;
992 arg_settings_mask |= SETTING_START_MODE;
993 break;
994
995 case 'a':
baaa35ad
ZJS
996 if (arg_start_mode == START_BOOT)
997 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
998 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
999
1000 arg_start_mode = START_PID2;
1001 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
1002 break;
1003
144f0fc0 1004 case ARG_UUID:
9444b1f2 1005 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
1006 if (r < 0)
1007 return log_error_errno(r, "Invalid UUID: %s", optarg);
1008
baaa35ad
ZJS
1009 if (sd_id128_is_null(arg_uuid))
1010 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1011 "Machine UUID may not be all zeroes.");
f757855e
LP
1012
1013 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 1014 break;
aa96c6cb 1015
43c3fb46
LP
1016 case 'S': {
1017 _cleanup_free_ char *mangled = NULL;
1018
1019 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
1020 if (r < 0)
1021 return log_oom();
1022
43c3fb46 1023 free_and_replace(arg_slice, mangled);
de40a303 1024 arg_settings_mask |= SETTING_SLICE;
144f0fc0 1025 break;
43c3fb46 1026 }
144f0fc0 1027
7027ff61 1028 case 'M':
c1521918 1029 if (isempty(optarg))
97b11eed 1030 arg_machine = mfree(arg_machine);
c1521918 1031 else {
52ef5dd7 1032 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1033 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1034 "Invalid machine name: %s", optarg);
7027ff61 1035
0c3c4284
LP
1036 r = free_and_strdup(&arg_machine, optarg);
1037 if (r < 0)
eb91eb18 1038 return log_oom();
eb91eb18 1039 }
9ce6d1b3 1040 break;
7027ff61 1041
3a9530e5
LP
1042 case ARG_HOSTNAME:
1043 if (isempty(optarg))
1044 arg_hostname = mfree(arg_hostname);
1045 else {
52ef5dd7 1046 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1047 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1048 "Invalid hostname: %s", optarg);
3a9530e5
LP
1049
1050 r = free_and_strdup(&arg_hostname, optarg);
1051 if (r < 0)
1052 return log_oom();
1053 }
1054
1055 arg_settings_mask |= SETTING_HOSTNAME;
1056 break;
1057
82adf6af
LP
1058 case 'Z':
1059 arg_selinux_context = optarg;
a8828ed9
DW
1060 break;
1061
82adf6af
LP
1062 case 'L':
1063 arg_selinux_apifs_context = optarg;
a8828ed9
DW
1064 break;
1065
bc2f673e
LP
1066 case ARG_READ_ONLY:
1067 arg_read_only = true;
f757855e 1068 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
1069 break;
1070
88fc9c9b
TH
1071 case ARG_AMBIENT_CAPABILITY: {
1072 uint64_t m;
1073 r = parse_capability_spec(optarg, &m);
1074 if (r <= 0)
1075 return r;
1076 arg_caps_ambient |= m;
1077 arg_settings_mask |= SETTING_CAPABILITY;
1078 break;
1079 }
420c7379
LP
1080 case ARG_CAPABILITY:
1081 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
1082 uint64_t m;
1083 r = parse_capability_spec(optarg, &m);
1084 if (r <= 0)
1085 return r;
5076f0cc 1086
8a99bd0c
ZJS
1087 if (c == ARG_CAPABILITY)
1088 plus |= m;
1089 else
1090 minus |= m;
f757855e 1091 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
1092 break;
1093 }
66edd963
LP
1094 case ARG_NO_NEW_PRIVILEGES:
1095 r = parse_boolean(optarg);
1096 if (r < 0)
1097 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1098
1099 arg_no_new_privileges = r;
1100 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1101 break;
1102
57fb9fb5
LP
1103 case 'j':
1104 arg_link_journal = LINK_GUEST;
574edc90 1105 arg_link_journal_try = true;
4e1d6aa9 1106 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1107 break;
1108
1109 case ARG_LINK_JOURNAL:
4e1d6aa9 1110 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1111 if (r < 0)
1112 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1113
4e1d6aa9 1114 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1115 break;
1116
17fe0523 1117 case ARG_BIND:
f757855e
LP
1118 case ARG_BIND_RO:
1119 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1120 if (r < 0)
1121 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1122
f757855e 1123 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1124 break;
06c17c39 1125
f757855e
LP
1126 case ARG_TMPFS:
1127 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1128 if (r < 0)
1129 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1130
f757855e 1131 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1132 break;
5a8af538
LP
1133
1134 case ARG_OVERLAY:
ad85779a
LP
1135 case ARG_OVERLAY_RO:
1136 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1137 if (r == -EADDRNOTAVAIL)
1138 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1139 if (r < 0)
1140 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1141
f757855e 1142 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1143 break;
06c17c39 1144
de40a303
LP
1145 case ARG_INACCESSIBLE:
1146 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1147 if (r < 0)
1148 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1149
1150 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1151 break;
1152
0d2a0179
ZJS
1153 case 'E':
1154 r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
aaf057c4 1155 if (r < 0)
0d2a0179 1156 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
f4889f65 1157
f757855e 1158 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65 1159 break;
f4889f65 1160
284c0b91
LP
1161 case 'q':
1162 arg_quiet = true;
1163 break;
1164
8a96d94e 1165 case ARG_SHARE_SYSTEM:
a6b5216c 1166 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1167 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1168 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1169 arg_clone_ns_flags = 0;
8a96d94e
LP
1170 break;
1171
eb91eb18
LP
1172 case ARG_REGISTER:
1173 r = parse_boolean(optarg);
1174 if (r < 0) {
1175 log_error("Failed to parse --register= argument: %s", optarg);
1176 return r;
1177 }
1178
1179 arg_register = r;
1180 break;
1181
89f7c846
LP
1182 case ARG_KEEP_UNIT:
1183 arg_keep_unit = true;
1184 break;
1185
6afc95b7
LP
1186 case ARG_PERSONALITY:
1187
ac45f971 1188 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1189 if (arg_personality == PERSONALITY_INVALID)
1190 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1191 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1192
f757855e 1193 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1194 break;
1195
4d9f07b4
LP
1196 case ARG_VOLATILE:
1197
1198 if (!optarg)
f757855e 1199 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1200 else if (streq(optarg, "help")) {
1201 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1202 return 0;
1203 } else {
f757855e 1204 VolatileMode m;
4d9f07b4 1205
f757855e 1206 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1207 if (m < 0)
1208 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1209 "Failed to parse --volatile= argument: %s", optarg);
1210 else
f757855e 1211 arg_volatile_mode = m;
6d0b55c2
LP
1212 }
1213
f757855e
LP
1214 arg_settings_mask |= SETTING_VOLATILE_MODE;
1215 break;
6d0b55c2 1216
f757855e
LP
1217 case 'p':
1218 r = expose_port_parse(&arg_expose_ports, optarg);
1219 if (r == -EEXIST)
1220 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1221 if (r < 0)
1222 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1223
f757855e 1224 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1225 break;
6d0b55c2 1226
f36933fe
LP
1227 case ARG_PROPERTY:
1228 if (strv_extend(&arg_property, optarg) < 0)
1229 return log_oom();
1230
1231 break;
1232
ae209204 1233 case ARG_PRIVATE_USERS: {
33eac552 1234 int boolean;
0de7acce 1235
ae209204
ZJS
1236 if (!optarg)
1237 boolean = true;
1238 else if (!in_charset(optarg, DIGITS))
1239 /* do *not* parse numbers as booleans */
1240 boolean = parse_boolean(optarg);
33eac552
LP
1241 else
1242 boolean = -1;
ae209204 1243
33eac552 1244 if (boolean == 0) {
0de7acce
LP
1245 /* no: User namespacing off */
1246 arg_userns_mode = USER_NAMESPACE_NO;
1247 arg_uid_shift = UID_INVALID;
1248 arg_uid_range = UINT32_C(0x10000);
33eac552 1249 } else if (boolean > 0) {
0de7acce
LP
1250 /* yes: User namespacing on, UID range is read from root dir */
1251 arg_userns_mode = USER_NAMESPACE_FIXED;
1252 arg_uid_shift = UID_INVALID;
1253 arg_uid_range = UINT32_C(0x10000);
1254 } else if (streq(optarg, "pick")) {
1255 /* pick: User namespacing on, UID range is picked randomly */
6c045a99
LP
1256 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1257 * implied by USER_NAMESPACE_PICK
33eac552 1258 * further down. */
0de7acce
LP
1259 arg_uid_shift = UID_INVALID;
1260 arg_uid_range = UINT32_C(0x10000);
33eac552
LP
1261
1262 } else if (streq(optarg, "identity")) {
6c2d70ce 1263 /* identity: User namespacing on, the 0…0xFFFF UID range is mapped to
33eac552
LP
1264 * itself, i.e. we don't actually map anything, but do take benefit of
1265 * isolation of capability sets. */
1266 arg_userns_mode = USER_NAMESPACE_FIXED;
1267 arg_uid_shift = 0;
1268 arg_uid_range = UINT32_C(0x10000);
0de7acce 1269 } else {
6c2058b3 1270 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1271 const char *range, *shift;
1272
0de7acce
LP
1273 /* anything else: User namespacing on, UID range is explicitly configured */
1274
6dac160c
LP
1275 range = strchr(optarg, ':');
1276 if (range) {
6c2058b3
ZJS
1277 buffer = strndup(optarg, range - optarg);
1278 if (!buffer)
1279 return log_oom();
1280 shift = buffer;
6dac160c
LP
1281
1282 range++;
bfd292ec
ZJS
1283 r = safe_atou32(range, &arg_uid_range);
1284 if (r < 0)
be715731 1285 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1286 } else
1287 shift = optarg;
1288
be715731
ZJS
1289 r = parse_uid(shift, &arg_uid_shift);
1290 if (r < 0)
1291 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1292
1293 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c 1294
58e13de5
LP
1295 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1296 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1297 }
be715731 1298
0de7acce 1299 arg_settings_mask |= SETTING_USERNS;
6dac160c 1300 break;
ae209204 1301 }
6dac160c 1302
0de7acce 1303 case 'U':
ccabee0d 1304 if (userns_supported()) {
6c045a99
LP
1305 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1306 * implied by USER_NAMESPACE_PICK
33eac552 1307 * further down. */
ccabee0d
LP
1308 arg_uid_shift = UID_INVALID;
1309 arg_uid_range = UINT32_C(0x10000);
1310
1311 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1312 }
1313
7336138e
LP
1314 break;
1315
0de7acce 1316 case ARG_PRIVATE_USERS_CHOWN:
6c045a99
LP
1317 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1318
1319 arg_settings_mask |= SETTING_USERNS;
1320 break;
1321
1322 case ARG_PRIVATE_USERS_OWNERSHIP:
1323 if (streq(optarg, "help")) {
1324 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1325 return 0;
1326 }
1327
1328 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1329 if (arg_userns_ownership < 0)
1330 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
0de7acce
LP
1331
1332 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1333 break;
1334
c6c8f6e2 1335 case ARG_KILL_SIGNAL:
5c828e66
LP
1336 if (streq(optarg, "help")) {
1337 DUMP_STRING_TABLE(signal, int, _NSIG);
1338 return 0;
1339 }
1340
29a3db75 1341 arg_kill_signal = signal_from_string(optarg);
baaa35ad 1342 if (arg_kill_signal < 0)
7211c853 1343 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
c6c8f6e2 1344
f757855e
LP
1345 arg_settings_mask |= SETTING_KILL_SIGNAL;
1346 break;
1347
1348 case ARG_SETTINGS:
1349
1350 /* no → do not read files
1351 * yes → read files, do not override cmdline, trust only subset
1352 * override → read files, override cmdline, trust only subset
1353 * trusted → read files, do not override cmdline, trust all
1354 */
1355
1356 r = parse_boolean(optarg);
1357 if (r < 0) {
1358 if (streq(optarg, "trusted")) {
1359 mask_all_settings = false;
1360 mask_no_settings = false;
1361 arg_settings_trusted = true;
1362
1363 } else if (streq(optarg, "override")) {
1364 mask_all_settings = false;
1365 mask_no_settings = true;
1366 arg_settings_trusted = -1;
1367 } else
1368 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1369 } else if (r > 0) {
1370 /* yes */
1371 mask_all_settings = false;
1372 mask_no_settings = false;
1373 arg_settings_trusted = -1;
1374 } else {
1375 /* no */
1376 mask_all_settings = true;
1377 mask_no_settings = false;
1378 arg_settings_trusted = false;
1379 }
1380
c6c8f6e2
LP
1381 break;
1382
5f932eb9 1383 case ARG_CHDIR:
baaa35ad
ZJS
1384 if (!path_is_absolute(optarg))
1385 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1386 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1387
1388 r = free_and_strdup(&arg_chdir, optarg);
1389 if (r < 0)
1390 return log_oom();
1391
1392 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1393 break;
1394
b53ede69
PW
1395 case ARG_PIVOT_ROOT:
1396 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1397 if (r < 0)
1398 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1399
1400 arg_settings_mask |= SETTING_PIVOT_ROOT;
1401 break;
1402
9c1e04d0
AP
1403 case ARG_NOTIFY_READY:
1404 r = parse_boolean(optarg);
baaa35ad
ZJS
1405 if (r < 0)
1406 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1407 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1408 arg_notify_ready = r;
1409 arg_settings_mask |= SETTING_NOTIFY_READY;
1410 break;
1411
4623e8e6 1412 case ARG_ROOT_HASH: {
89e62e0b 1413 _cleanup_free_ void *k = NULL;
4623e8e6
LP
1414 size_t l;
1415
1416 r = unhexmem(optarg, strlen(optarg), &k, &l);
1417 if (r < 0)
1418 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
89e62e0b 1419 if (l < sizeof(sd_id128_t))
c6147113 1420 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
4623e8e6 1421
89e62e0b
LP
1422 free_and_replace(arg_verity_settings.root_hash, k);
1423 arg_verity_settings.root_hash_size = l;
4623e8e6
LP
1424 break;
1425 }
1426
c2923fdc
LB
1427 case ARG_ROOT_HASH_SIG: {
1428 char *value;
89e62e0b
LP
1429 size_t l;
1430 void *p;
c2923fdc
LB
1431
1432 if ((value = startswith(optarg, "base64:"))) {
c2923fdc
LB
1433 r = unbase64mem(value, strlen(value), &p, &l);
1434 if (r < 0)
1435 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1436
c2923fdc 1437 } else {
89e62e0b 1438 r = read_full_file(optarg, (char**) &p, &l);
c2923fdc 1439 if (r < 0)
89e62e0b 1440 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
c2923fdc
LB
1441 }
1442
89e62e0b
LP
1443 free_and_replace(arg_verity_settings.root_hash_sig, p);
1444 arg_verity_settings.root_hash_sig_size = l;
c2923fdc
LB
1445 break;
1446 }
1447
89e62e0b 1448 case ARG_VERITY_DATA:
614b022c 1449 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
89e62e0b
LP
1450 if (r < 0)
1451 return r;
1452 break;
1453
960e4569
LP
1454 case ARG_SYSTEM_CALL_FILTER: {
1455 bool negative;
1456 const char *items;
1457
1458 negative = optarg[0] == '~';
1459 items = negative ? optarg + 1 : optarg;
1460
1461 for (;;) {
1462 _cleanup_free_ char *word = NULL;
1463
1464 r = extract_first_word(&items, &word, NULL, 0);
1465 if (r == 0)
1466 break;
1467 if (r == -ENOMEM)
1468 return log_oom();
1469 if (r < 0)
1470 return log_error_errno(r, "Failed to parse system call filter: %m");
1471
1472 if (negative)
6b000af4 1473 r = strv_extend(&arg_syscall_deny_list, word);
960e4569 1474 else
6b000af4 1475 r = strv_extend(&arg_syscall_allow_list, word);
960e4569
LP
1476 if (r < 0)
1477 return log_oom();
1478 }
1479
1480 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1481 break;
1482 }
1483
bf428efb
LP
1484 case ARG_RLIMIT: {
1485 const char *eq;
622ecfa8 1486 _cleanup_free_ char *name = NULL;
bf428efb
LP
1487 int rl;
1488
5c828e66
LP
1489 if (streq(optarg, "help")) {
1490 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1491 return 0;
1492 }
1493
bf428efb 1494 eq = strchr(optarg, '=');
baaa35ad
ZJS
1495 if (!eq)
1496 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1497 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1498
1499 name = strndup(optarg, eq - optarg);
1500 if (!name)
1501 return log_oom();
1502
1503 rl = rlimit_from_string_harder(name);
baaa35ad 1504 if (rl < 0)
7211c853 1505 return log_error_errno(rl, "Unknown resource limit: %s", name);
bf428efb
LP
1506
1507 if (!arg_rlimit[rl]) {
1508 arg_rlimit[rl] = new0(struct rlimit, 1);
1509 if (!arg_rlimit[rl])
1510 return log_oom();
1511 }
1512
1513 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1514 if (r < 0)
1515 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1516
1517 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1518 break;
1519 }
1520
81f345df
LP
1521 case ARG_OOM_SCORE_ADJUST:
1522 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1523 if (r < 0)
1524 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1525
1526 arg_oom_score_adjust_set = true;
1527 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1528 break;
1529
d107bb7d 1530 case ARG_CPU_AFFINITY: {
0985c7c4 1531 CPUSet cpuset;
d107bb7d
LP
1532
1533 r = parse_cpu_set(optarg, &cpuset);
1534 if (r < 0)
0985c7c4 1535 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1536
0985c7c4
ZJS
1537 cpu_set_reset(&arg_cpu_set);
1538 arg_cpu_set = cpuset;
d107bb7d
LP
1539 arg_settings_mask |= SETTING_CPU_AFFINITY;
1540 break;
1541 }
1542
09d423e9
LP
1543 case ARG_RESOLV_CONF:
1544 if (streq(optarg, "help")) {
1545 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1546 return 0;
1547 }
1548
1549 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad 1550 if (arg_resolv_conf < 0)
7211c853 1551 return log_error_errno(arg_resolv_conf,
baaa35ad 1552 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1553
1554 arg_settings_mask |= SETTING_RESOLV_CONF;
1555 break;
1556
1688841f
LP
1557 case ARG_TIMEZONE:
1558 if (streq(optarg, "help")) {
1559 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1560 return 0;
1561 }
1562
1563 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad 1564 if (arg_timezone < 0)
7211c853 1565 return log_error_errno(arg_timezone,
baaa35ad 1566 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1567
1568 arg_settings_mask |= SETTING_TIMEZONE;
1569 break;
1570
de40a303 1571 case ARG_CONSOLE:
dce66ffe
ZJS
1572 r = handle_arg_console(optarg);
1573 if (r <= 0)
1574 return r;
de40a303
LP
1575 break;
1576
1577 case 'P':
1578 case ARG_PIPE:
dce66ffe
ZJS
1579 r = handle_arg_console("pipe");
1580 if (r <= 0)
1581 return r;
de40a303
LP
1582 break;
1583
bb068de0
ZJS
1584 case ARG_NO_PAGER:
1585 arg_pager_flags |= PAGER_DISABLE;
1586 break;
1587
3652872a
LP
1588 case ARG_SET_CREDENTIAL: {
1589 _cleanup_free_ char *word = NULL, *data = NULL;
1590 const char *p = optarg;
1591 Credential *a;
e437538f 1592 ssize_t l;
3652872a
LP
1593
1594 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1595 if (r == -ENOMEM)
1596 return log_oom();
1597 if (r < 0)
1598 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1599 if (r == 0 || !p)
1600 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1601
1602 if (!credential_name_valid(word))
1603 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1604
12d729b2 1605 for (size_t i = 0; i < arg_n_credentials; i++)
3652872a
LP
1606 if (streq(arg_credentials[i].id, word))
1607 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1608
1609 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1610 if (l < 0)
1611 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1612
1613 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1614 if (!a)
1615 return log_oom();
1616
1617 a[arg_n_credentials++] = (Credential) {
1618 .id = TAKE_PTR(word),
1619 .data = TAKE_PTR(data),
1620 .size = l,
1621 };
1622
1623 arg_credentials = a;
1624
1625 arg_settings_mask |= SETTING_CREDENTIALS;
1626 break;
1627 }
1628
1629 case ARG_LOAD_CREDENTIAL: {
1630 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1631 _cleanup_(erase_and_freep) char *data = NULL;
1632 _cleanup_free_ char *word = NULL, *j = NULL;
1633 const char *p = optarg;
1634 Credential *a;
1635 size_t size, i;
1636
1637 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1638 if (r == -ENOMEM)
1639 return log_oom();
1640 if (r < 0)
c941b650 1641 return log_error_errno(r, "Failed to parse --load-credential= parameter: %m");
3652872a 1642 if (r == 0 || !p)
c941b650 1643 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --load-credential=: %s", optarg);
3652872a
LP
1644
1645 if (!credential_name_valid(word))
1646 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1647
1648 for (i = 0; i < arg_n_credentials; i++)
1649 if (streq(arg_credentials[i].id, word))
1650 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1651
1652 if (path_is_absolute(p))
1653 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1654 else {
1655 const char *e;
1656
786d19fd
LP
1657 r = get_credentials_dir(&e);
1658 if (r < 0)
1659 return log_error_errno(r, "Credential not available (no credentials passed at all): %s", word);
3652872a
LP
1660
1661 j = path_join(e, p);
1662 if (!j)
1663 return log_oom();
1664 }
1665
986311c2
LP
1666 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1667 flags,
1668 NULL,
1669 &data, &size);
3652872a
LP
1670 if (r < 0)
1671 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1672
1673 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1674 if (!a)
1675 return log_oom();
1676
1677 a[arg_n_credentials++] = (Credential) {
1678 .id = TAKE_PTR(word),
1679 .data = TAKE_PTR(data),
1680 .size = size,
1681 };
1682
1683 arg_credentials = a;
1684
1685 arg_settings_mask |= SETTING_CREDENTIALS;
1686 break;
1687 }
1688
2f893044
LP
1689 case ARG_BIND_USER:
1690 if (!valid_user_group_name(optarg, 0))
1691 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1692
1693 if (strv_extend(&arg_bind_user, optarg) < 0)
1694 return log_oom();
1695
1696 arg_settings_mask |= SETTING_BIND_USER;
1697 break;
1698
4a4654e0
LP
1699 case ARG_SUPPRESS_SYNC:
1700 r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1701 if (r < 0)
1702 return r;
1703
1704 arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1705 break;
1706
84be0c71
LP
1707 case ARG_IMAGE_POLICY: {
1708 _cleanup_(image_policy_freep) ImagePolicy *p = NULL;
1709
1710 r = image_policy_from_string(optarg, &p);
1711 if (r < 0)
1712 return log_error_errno(r, "Failed to parse image policy: %s", optarg);
1713
1714 image_policy_free(arg_image_policy);
1715 arg_image_policy = TAKE_PTR(p);
1716 break;
1717 }
1718
88213476
LP
1719 case '?':
1720 return -EINVAL;
1721
1722 default:
04499a70 1723 assert_not_reached();
88213476 1724 }
88213476 1725
60f1ec13
LP
1726 if (argc > optind) {
1727 strv_free(arg_parameters);
1728 arg_parameters = strv_copy(argv + optind);
1729 if (!arg_parameters)
1730 return log_oom();
d7bea6b6 1731
60f1ec13
LP
1732 arg_settings_mask |= SETTING_START_MODE;
1733 }
1734
1735 if (arg_ephemeral && arg_template && !arg_directory)
1736 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1737 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1738 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1739 * --directory=". */
1740 arg_directory = TAKE_PTR(arg_template);
1741
2642d22a
DDM
1742 arg_caps_retain |= plus;
1743 arg_caps_retain |= arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0;
1744
1745 /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have
1746 * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to
1747 * indicate that. */
1748 if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0)
1749 arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE);
1750
1751 arg_caps_retain &= ~minus;
60f1ec13 1752
de40a303 1753 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1754 r = parse_environment();
1755 if (r < 0)
1756 return r;
de40a303 1757
60f1ec13
LP
1758 /* Load all settings from .nspawn files */
1759 if (mask_no_settings)
1760 arg_settings_mask = 0;
1761
1762 /* Don't load any settings from .nspawn files */
1763 if (mask_all_settings)
1764 arg_settings_mask = _SETTINGS_MASK_ALL;
1765
1766 return 1;
1767}
1768
1769static int verify_arguments(void) {
1770 int r;
a6b5216c 1771
75b0d8b8
ZJS
1772 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1773 /* If we are running the stub init in the container, we don't need to look at what the init
1774 * in the container supports, because we are not using it. Let's immediately pick the right
1775 * setting based on the host system configuration.
1776 *
1777 * We only do this, if the user didn't use an environment variable to override the detection.
1778 */
1779
1780 r = cg_all_unified();
1781 if (r < 0)
1782 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1783 if (r > 0)
1784 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1785 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1786 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1787 else
1788 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1789 }
1790
4f086aab
SU
1791 if (arg_userns_mode != USER_NAMESPACE_NO)
1792 arg_mount_settings |= MOUNT_USE_USERNS;
1793
1794 if (arg_private_network)
1795 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1796
48a8d337
LB
1797 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1798 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1799 arg_register = false;
baaa35ad 1800 if (arg_start_mode != START_PID1)
60f1ec13 1801 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1802 }
eb91eb18 1803
6c045a99
LP
1804 if (arg_userns_ownership < 0)
1805 arg_userns_ownership =
f61c7f88 1806 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
6c045a99 1807 USER_NAMESPACE_OWNERSHIP_OFF;
0e7ac751 1808
60f1ec13
LP
1809 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1810 arg_kill_signal = SIGRTMIN+3;
1811
e5a4bb0d
LP
1812 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1813 arg_read_only = true;
1814
2436ea76
DDM
1815 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1816 arg_read_only = true;
1817
baaa35ad 1818 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1819 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1820 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1821 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1822
baaa35ad 1823 if (arg_directory && arg_image)
60f1ec13 1824 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1825
baaa35ad 1826 if (arg_template && arg_image)
60f1ec13 1827 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1828
baaa35ad 1829 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1830 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1831
baaa35ad 1832 if (arg_ephemeral && arg_template)
60f1ec13 1833 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1834
baaa35ad 1835 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1836 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1837
baaa35ad 1838 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1839 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1840
6c045a99 1841 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
de40a303 1842 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
6c045a99 1843 "--read-only and --private-users-ownership=chown may not be combined.");
f757855e 1844
6c045a99
LP
1845 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1846 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1847 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1848 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1849 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
e5a4bb0d 1850
679ecd36
SZ
1851 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1852 * we need to error out, to avoid conflicts between different network options. */
60f1ec13
LP
1853 if (arg_network_namespace_path &&
1854 (arg_network_interfaces || arg_network_macvlan ||
1855 arg_network_ipvlan || arg_network_veth_extra ||
1856 arg_network_bridge || arg_network_zone ||
679ecd36 1857 arg_network_veth))
de40a303 1858 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1859
60f1ec13 1860 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1861 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1862 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1863
baaa35ad 1864 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1865 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1866
baaa35ad 1867 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1868 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1869
baaa35ad 1870 if (arg_expose_ports && !arg_private_network)
60f1ec13 1871 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1872
88fc9c9b 1873 if (arg_caps_ambient) {
f5fbe71d 1874 if (arg_caps_ambient == UINT64_MAX)
88fc9c9b
TH
1875 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1876
1877 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1878 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1879
1880 if (arg_start_mode == START_BOOT)
1881 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1882 }
1883
2f893044
LP
1884 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1885 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1886
1887 /* Drop duplicate --bind-user= entries */
1888 strv_uniq(arg_bind_user);
1889
60f1ec13
LP
1890 r = custom_mount_check_all();
1891 if (r < 0)
1892 return r;
c6c8f6e2 1893
f757855e 1894 return 0;
88213476
LP
1895}
1896
91181e07 1897int userns_lchown(const char *p, uid_t uid, gid_t gid) {
03cfe0d5
LP
1898 assert(p);
1899
0de7acce 1900 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1901 return 0;
1902
1903 if (uid == UID_INVALID && gid == GID_INVALID)
1904 return 0;
1905
1906 if (uid != UID_INVALID) {
1907 uid += arg_uid_shift;
1908
1909 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1910 return -EOVERFLOW;
1911 }
1912
1913 if (gid != GID_INVALID) {
1914 gid += (gid_t) arg_uid_shift;
1915
1916 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1917 return -EOVERFLOW;
1918 }
1919
7c248223 1920 return RET_NERRNO(lchown(p, uid, gid));
b12afc8c
LP
1921}
1922
/* Creates the directory 'path' below 'root' with the given mode and hands it to the (shifted)
 * uid/gid via userns_lchown(). An already existing directory is left untouched (including its
 * ownership) and treated as success. Returns 0 on success, negative errno-style value on failure. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = RET_NERRNO(mkdir(full, mode));
        if (r >= 0)
                /* Freshly created, hence fix up ownership */
                return userns_lchown(full, uid, gid);

        return r == -EEXIST ? 0 : r;
}
1936
/* If 'path' begins with one of the two zoneinfo directory prefixes checked below, returns what
 * PATH_STARTSWITH_SET() yields for the first matching prefix; otherwise returns NULL. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path,
                                   "../usr/share/zoneinfo/",
                                   "/usr/share/zoneinfo/");

        return zone;
}
1943
83205269
LP
1944static bool etc_writable(void) {
1945 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1946}
1947
e58a1277 1948static int setup_timezone(const char *dest) {
1688841f
LP
1949 _cleanup_free_ char *p = NULL, *etc = NULL;
1950 const char *where, *check;
1951 TimezoneMode m;
d4036145 1952 int r;
f8440af5 1953
e58a1277
LP
1954 assert(dest);
1955
1688841f 1956 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1957 r = readlink_malloc("/etc/localtime", &p);
1958 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1959 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1960 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1961 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1962 else if (r < 0) {
1963 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1964 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1965 * file.
1966 *
1967 * Example:
1968 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1969 */
1970 return 0;
1971 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1972 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1973 else
1974 m = arg_timezone;
1975 } else
1976 m = arg_timezone;
1977
1978 if (m == TIMEZONE_OFF)
1979 return 0;
1980
f461a28d 1981 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1982 if (r < 0) {
1688841f 1983 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1984 return 0;
1985 }
1986
1688841f
LP
1987 where = strjoina(etc, "/localtime");
1988
1989 switch (m) {
1990
1991 case TIMEZONE_DELETE:
1992 if (unlink(where) < 0)
1993 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1994
d4036145 1995 return 0;
d4036145 1996
1688841f
LP
1997 case TIMEZONE_SYMLINK: {
1998 _cleanup_free_ char *q = NULL;
1999 const char *z, *what;
4d1c38b8 2000
1688841f
LP
2001 z = timezone_from_path(p);
2002 if (!z) {
2003 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 2004 return 0;
1688841f 2005 }
d4036145 2006
1688841f
LP
2007 r = readlink_malloc(where, &q);
2008 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
2009 return 0; /* Already pointing to the right place? Then do nothing .. */
2010
2011 check = strjoina(dest, "/usr/share/zoneinfo/", z);
f461a28d 2012 r = chase(check, dest, 0, NULL, NULL);
1688841f
LP
2013 if (r < 0)
2014 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
2015 else {
2016 if (unlink(where) < 0 && errno != ENOENT) {
2017 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
2018 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
2019 return 0;
2020 }
2021
2022 what = strjoina("../usr/share/zoneinfo/", z);
2023 if (symlink(what, where) < 0) {
2024 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
2025 errno, "Failed to correct timezone of container, ignoring: %m");
2026 return 0;
2027 }
2028
2029 break;
2030 }
2031
2032 _fallthrough_;
d4036145 2033 }
68fb0892 2034
1688841f
LP
2035 case TIMEZONE_BIND: {
2036 _cleanup_free_ char *resolved = NULL;
2037 int found;
2038
f461a28d 2039 found = chase(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
2040 if (found < 0) {
2041 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
2042 return 0;
2043 }
2044
2045 if (found == 0) /* missing? */
2046 (void) touch(resolved);
2047
511a8cfe 2048 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1688841f 2049 if (r >= 0)
511a8cfe 2050 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1688841f
LP
2051
2052 _fallthrough_;
79d80fc1 2053 }
4d9f07b4 2054
1688841f
LP
2055 case TIMEZONE_COPY:
2056 /* If mounting failed, try to copy */
7c2f5495 2057 r = copy_file_atomic("/etc/localtime", where, 0644, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
2058 if (r < 0) {
2059 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2060 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
2061 return 0;
2062 }
2063
2064 break;
2065
2066 default:
04499a70 2067 assert_not_reached();
d4036145 2068 }
e58a1277 2069
1688841f 2070 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
2071 r = userns_lchown(where, 0, 0);
2072 if (r < 0)
1688841f 2073 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 2074
e58a1277 2075 return 0;
88213476
LP
2076}
2077
09d423e9
LP
/* Checks whether 'path' exists. Returns 1 if access() succeeds, 0 if it fails with ENOENT, and a
 * negative errno-style value (after logging at debug level) on any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2090
7357272e 2091static int resolved_listening(void) {
b8ea7a6e 2092 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 2093 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 2094 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
2095 int r;
2096
7357272e 2097 /* Check if resolved is listening */
b053cd5f
LP
2098
2099 r = sd_bus_open_system(&bus);
2100 if (r < 0)
b8ea7a6e 2101 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 2102
7357272e 2103 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
2104 if (r < 0)
2105 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2106 if (r == 0)
2107 return 0;
7357272e
DM
2108
2109 r = sd_bus_get_property_string(bus,
2110 "org.freedesktop.resolve1",
2111 "/org/freedesktop/resolve1",
2112 "org.freedesktop.resolve1.Manager",
2113 "DNSStubListener",
b8ea7a6e 2114 &error,
7357272e
DM
2115 &dns_stub_listener_mode);
2116 if (r < 0)
b8ea7a6e 2117 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
2118
2119 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
2120}
2121
2547bb41 2122static int setup_resolv_conf(const char *dest) {
09d423e9
LP
2123 _cleanup_free_ char *etc = NULL;
2124 const char *where, *what;
2125 ResolvConfMode m;
2126 int r;
2547bb41
LP
2127
2128 assert(dest);
2129
09d423e9
LP
2130 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2131 if (arg_private_network)
2132 m = RESOLV_CONF_OFF;
86775e35
LP
2133 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2134 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
09d423e9 2135 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 2136 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 2137 else
83205269 2138 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
86775e35 2139
09d423e9
LP
2140 } else
2141 m = arg_resolv_conf;
2142
2143 if (m == RESOLV_CONF_OFF)
2547bb41
LP
2144 return 0;
2145
f461a28d 2146 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
2147 if (r < 0) {
2148 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2149 return 0;
2150 }
2151
2152 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
2153
2154 if (m == RESOLV_CONF_DELETE) {
2155 if (unlink(where) < 0)
2156 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2157
87447ae4
LP
2158 return 0;
2159 }
79d80fc1 2160
86775e35
LP
2161 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2162 what = PRIVATE_STATIC_RESOLV_CONF;
2163 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2164 what = PRIVATE_UPLINK_RESOLV_CONF;
2165 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2166 what = PRIVATE_STUB_RESOLV_CONF;
09d423e9
LP
2167 else
2168 what = "/etc/resolv.conf";
87447ae4 2169
86775e35 2170 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
09d423e9
LP
2171 _cleanup_free_ char *resolved = NULL;
2172 int found;
2173
f461a28d 2174 found = chase(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
09d423e9
LP
2175 if (found < 0) {
2176 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2177 return 0;
2178 }
3539724c 2179
87447ae4
LP
2180 if (found == 0) /* missing? */
2181 (void) touch(resolved);
5367354d 2182
511a8cfe 2183 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 2184 if (r >= 0)
511a8cfe 2185 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
86775e35
LP
2186
2187 /* If that didn't work, let's copy the file */
3539724c
LP
2188 }
2189
86775e35 2190 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
7c2f5495 2191 r = copy_file_atomic(what, where, 0644, COPY_REFLINK|COPY_REPLACE);
86775e35 2192 else
7c2f5495 2193 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, COPY_REFLINK);
79d80fc1 2194 if (r < 0) {
3539724c
LP
2195 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2196 * resolved or something similar runs inside and the symlink points there.
68a313c5 2197 *
3539724c 2198 * If the disk image is read-only, there's also no point in complaining.
68a313c5 2199 */
86775e35
LP
2200 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2201 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 2202 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
2203 return 0;
2204 }
2547bb41 2205
03cfe0d5
LP
2206 r = userns_lchown(where, 0, 0);
2207 if (r < 0)
3539724c 2208 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 2209
2547bb41
LP
2210 return 0;
2211}
2212
1e4f1671 2213static int setup_boot_id(void) {
cdde6ba6
LP
2214 _cleanup_(unlink_and_freep) char *from = NULL;
2215 _cleanup_free_ char *path = NULL;
3bbaff3e 2216 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 2217 const char *to;
04bc4a3f
LP
2218 int r;
2219
1eacc470 2220 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 2221
1eacc470 2222 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
2223 if (r < 0)
2224 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
2225
2226 r = sd_id128_randomize(&rnd);
f647962d
MS
2227 if (r < 0)
2228 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 2229
b40c8ebd 2230 r = id128_write(path, ID128_FORMAT_UUID, rnd);
f647962d
MS
2231 if (r < 0)
2232 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 2233
cdde6ba6
LP
2234 from = TAKE_PTR(path);
2235 to = "/proc/sys/kernel/random/boot_id";
2236
511a8cfe 2237 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
2238 if (r < 0)
2239 return r;
04bc4a3f 2240
511a8cfe 2241 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
2242}
2243
e58a1277 2244static int copy_devnodes(const char *dest) {
88213476
LP
2245 static const char devnodes[] =
2246 "null\0"
2247 "zero\0"
2248 "full\0"
2249 "random\0"
2250 "urandom\0"
85614d66
TG
2251 "tty\0"
2252 "net/tun\0";
88213476 2253
e58a1277 2254 int r = 0;
a258bf26
LP
2255
2256 assert(dest);
124640f1 2257
52f05ef2 2258 BLOCK_WITH_UMASK(0000);
88213476 2259
03cfe0d5
LP
2260 /* Create /dev/net, so that we can create /dev/net/tun in it */
2261 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2262 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2263
88213476 2264 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 2265 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 2266 struct stat st;
88213476 2267
c6134d3e 2268 from = path_join("/dev/", d);
8967f291
LP
2269 if (!from)
2270 return log_oom();
2271
c6134d3e 2272 to = path_join(dest, from);
8967f291
LP
2273 if (!to)
2274 return log_oom();
88213476
LP
2275
2276 if (stat(from, &st) < 0) {
2277
4a62c710
MS
2278 if (errno != ENOENT)
2279 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 2280
baaa35ad
ZJS
2281 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2282 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2283 "%s is not a char or block device, cannot copy.", from);
2284 else {
8dfce114
LP
2285 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2286
81f5049b 2287 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 2288 /* Explicitly warn the user when /dev is already populated. */
41eb4362 2289 if (errno == EEXIST)
8dbf71ec 2290 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
2291 if (errno != EPERM)
2292 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2293
8dfce114 2294 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
2295 r = touch(to);
2296 if (r < 0)
2297 return log_error_errno(r, "touch (%s) failed: %m", to);
511a8cfe 2298 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
60e76d48
ZJS
2299 if (r < 0)
2300 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 2301 }
6278cf60 2302
03cfe0d5
LP
2303 r = userns_lchown(to, 0, 0);
2304 if (r < 0)
2305 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2306
657ee2d8 2307 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2308 if (!dn)
2309 return log_oom();
2310
2311 r = userns_mkdir(dest, dn, 0755, 0, 0);
2312 if (r < 0)
2313 return log_error_errno(r, "Failed to create '%s': %m", dn);
2314
2315 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2316 return log_oom();
2317
c6134d3e 2318 prefixed = path_join(dest, sl);
8dfce114
LP
2319 if (!prefixed)
2320 return log_oom();
2321
2d9b74ba 2322 t = path_join("..", d);
8dfce114
LP
2323 if (!t)
2324 return log_oom();
2325
2326 if (symlink(t, prefixed) < 0)
2327 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2328 }
88213476
LP
2329 }
2330
e58a1277
LP
2331 return r;
2332}
88213476 2333
de40a303 2334static int make_extra_nodes(const char *dest) {
de40a303
LP
2335 size_t i;
2336 int r;
2337
52f05ef2 2338 BLOCK_WITH_UMASK(0000);
de40a303
LP
2339
2340 for (i = 0; i < arg_n_extra_nodes; i++) {
2341 _cleanup_free_ char *path = NULL;
2342 DeviceNode *n = arg_extra_nodes + i;
2343
c6134d3e 2344 path = path_join(dest, n->path);
de40a303
LP
2345 if (!path)
2346 return log_oom();
2347
2348 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2349 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2350
2351 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2352 if (r < 0)
2353 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2354 }
2355
2356 return 0;
2357}
2358
03cfe0d5
LP
2359static int setup_pts(const char *dest) {
2360 _cleanup_free_ char *options = NULL;
2361 const char *p;
709f6e46 2362 int r;
03cfe0d5 2363
349cc4a5 2364#if HAVE_SELINUX
03cfe0d5
LP
2365 if (arg_selinux_apifs_context)
2366 (void) asprintf(&options,
3dce8915 2367 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2368 arg_uid_shift + TTY_GID,
2369 arg_selinux_apifs_context);
2370 else
2371#endif
2372 (void) asprintf(&options,
3dce8915 2373 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2374 arg_uid_shift + TTY_GID);
f2d88580 2375
03cfe0d5 2376 if (!options)
f2d88580
LP
2377 return log_oom();
2378
03cfe0d5 2379 /* Mount /dev/pts itself */
cc9fce65 2380 p = prefix_roota(dest, "/dev/pts");
3f692e2e 2381 r = RET_NERRNO(mkdir(p, 0755));
dae8b82e
ZJS
2382 if (r < 0)
2383 return log_error_errno(r, "Failed to create /dev/pts: %m");
2384
511a8cfe 2385 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
60e76d48
ZJS
2386 if (r < 0)
2387 return r;
709f6e46
MS
2388 r = userns_lchown(p, 0, 0);
2389 if (r < 0)
2390 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2391
2392 /* Create /dev/ptmx symlink */
2393 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2394 if (symlink("pts/ptmx", p) < 0)
2395 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2396 r = userns_lchown(p, 0, 0);
2397 if (r < 0)
2398 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2399
03cfe0d5
LP
2400 /* And fix /dev/pts/ptmx ownership */
2401 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2402 r = userns_lchown(p, 0, 0);
2403 if (r < 0)
2404 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2405
f2d88580
LP
2406 return 0;
2407}
2408
3acc84eb 2409static int setup_stdio_as_dev_console(void) {
5bb1d7fb 2410 _cleanup_close_ int terminal = -EBADF;
e58a1277 2411 int r;
e58a1277 2412
335d2ead
LP
2413 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2414 * explicitly, if we are configured to. */
2415 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
3acc84eb
FB
2416 if (terminal < 0)
2417 return log_error_errno(terminal, "Failed to open console: %m");
e58a1277 2418
3acc84eb
FB
2419 /* Make sure we can continue logging to the original stderr, even if
2420 * stderr points elsewhere now */
2421 r = log_dup_console();
2422 if (r < 0)
2423 return log_error_errno(r, "Failed to duplicate stderr: %m");
de40a303 2424
3acc84eb
FB
2425 /* invalidates 'terminal' on success and failure */
2426 r = rearrange_stdio(terminal, terminal, terminal);
2fef50cd 2427 TAKE_FD(terminal);
f647962d 2428 if (r < 0)
3acc84eb
FB
2429 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2430
2431 return 0;
2432}
88213476 2433
3acc84eb
FB
2434static int setup_dev_console(const char *console) {
2435 _cleanup_free_ char *p = NULL;
2436 int r;
a258bf26 2437
3acc84eb
FB
2438 /* Create /dev/console symlink */
2439 r = path_make_relative("/dev", console, &p);
81f5049b 2440 if (r < 0)
3acc84eb
FB
2441 return log_error_errno(r, "Failed to create relative path: %m");
2442
2443 if (symlink(p, "/dev/console") < 0)
2444 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2445
3acc84eb 2446 return 0;
e58a1277
LP
2447}
2448
8e5430c4
LP
2449static int setup_keyring(void) {
2450 key_serial_t keyring;
2451
6b000af4
LP
2452 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2453 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2454 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2455 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2456 * into the container. */
8e5430c4
LP
2457
2458 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2459 if (keyring == -1) {
2460 if (errno == ENOSYS)
2461 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
065b4774 2462 else if (ERRNO_IS_PRIVILEGE(errno))
8e5430c4
LP
2463 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2464 else
2465 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2466 }
2467
2468 return 0;
2469}
2470
3652872a
LP
2471static int setup_credentials(const char *root) {
2472 const char *q;
2473 int r;
2474
2475 if (arg_n_credentials <= 0)
2476 return 0;
2477
2478 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2479 if (r < 0)
2480 return log_error_errno(r, "Failed to create /run/host: %m");
2481
2482 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2483 if (r < 0)
2484 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2485
2486 q = prefix_roota(root, "/run/host/credentials");
511a8cfe 2487 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
3652872a
LP
2488 if (r < 0)
2489 return r;
2490
2491 for (size_t i = 0; i < arg_n_credentials; i++) {
2492 _cleanup_free_ char *j = NULL;
254d1313 2493 _cleanup_close_ int fd = -EBADF;
3652872a
LP
2494
2495 j = path_join(q, arg_credentials[i].id);
2496 if (!j)
2497 return log_oom();
2498
2499 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2500 if (fd < 0)
2501 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2502
2503 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size, /* do_poll= */ false);
2504 if (r < 0)
2505 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2506
2507 if (fchmod(fd, 0400) < 0)
2508 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2509
2510 if (arg_userns_mode != USER_NAMESPACE_NO) {
2511 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2512 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2513 }
2514 }
2515
2516 if (chmod(q, 0500) < 0)
2517 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2518
2519 r = userns_lchown(q, 0, 0);
2520 if (r < 0)
2521 return r;
2522
2523 /* Make both mount and superblock read-only now */
511a8cfe 2524 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
3652872a
LP
2525 if (r < 0)
2526 return r;
2527
511a8cfe 2528 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
3652872a
LP
2529}
2530
5d9d3fcb 2531static int setup_kmsg(int fd_inner_socket) {
9ec5a93c
LP
2532 _cleanup_(unlink_and_freep) char *from = NULL;
2533 _cleanup_free_ char *fifo = NULL;
254d1313 2534 _cleanup_close_ int fd = -EBADF;
9ec5a93c 2535 int r;
e58a1277 2536
5d9d3fcb 2537 assert(fd_inner_socket >= 0);
a258bf26 2538
52f05ef2 2539 BLOCK_WITH_UMASK(0000);
a258bf26 2540
30fd9a2d 2541 /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2542 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2543 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2544 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2545
1eacc470 2546 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2547 if (r < 0)
2548 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2549
9ec5a93c 2550 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2551 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2552
2553 from = TAKE_PTR(fifo);
9ec5a93c 2554
511a8cfe 2555 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2556 if (r < 0)
2557 return r;
e58a1277 2558
669fc4e5 2559 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2560 if (fd < 0)
2561 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2562
9ec5a93c 2563 /* Store away the fd in the socket, so that it stays open as long as we run the child */
5d9d3fcb 2564 r = send_one_fd(fd_inner_socket, fd, 0);
d9603714
DH
2565 if (r < 0)
2566 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2567
25ea79fe 2568 return 0;
88213476
LP
2569}
2570
761cf19d 2571struct ExposeArgs {
deff68e7
FW
2572 union in_addr_union address4;
2573 union in_addr_union address6;
761cf19d
FW
2574 struct FirewallContext *fw_ctx;
2575};
2576
1c4baffc 2577static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
99534007 2578 struct ExposeArgs *args = ASSERT_PTR(userdata);
6d0b55c2
LP
2579
2580 assert(rtnl);
2581 assert(m);
6d0b55c2 2582
fb9044cb
LP
2583 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2584 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
6d0b55c2
LP
2585 return 0;
2586}
2587
3a74cea5 2588static int setup_hostname(void) {
c818eef1 2589 int r;
3a74cea5 2590
0c582db0 2591 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2592 return 0;
2593
c818eef1
LP
2594 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2595 if (r < 0)
2596 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2597
7027ff61 2598 return 0;
3a74cea5
LP
2599}
2600
57fb9fb5 2601static int setup_journal(const char *directory) {
0f5e1382 2602 _cleanup_free_ char *d = NULL;
5980d463 2603 const char *p, *q;
b2238e38 2604 sd_id128_t this_id;
8054d749 2605 bool try;
57fb9fb5
LP
2606 int r;
2607
df9a75e4
LP
2608 /* Don't link journals in ephemeral mode */
2609 if (arg_ephemeral)
2610 return 0;
2611
8054d749
LP
2612 if (arg_link_journal == LINK_NO)
2613 return 0;
2614
2615 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2616
4d680aee 2617 r = sd_id128_get_machine(&this_id);
f647962d
MS
2618 if (r < 0)
2619 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2620
e01ff70a 2621 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2622 log_full(try ? LOG_WARNING : LOG_ERR,
85b55869 2623 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
8054d749 2624 if (try)
4d680aee 2625 return 0;
df9a75e4 2626 return -EEXIST;
4d680aee
ZJS
2627 }
2628
369ca6da
ZJS
2629 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2630 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2631 if (r < 0) {
2632 bool ignore = r == -EROFS && try;
2633 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2634 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2635 return ignore ? 0 : r;
2636 }
2637 }
03cfe0d5 2638
85b55869 2639 p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
03cfe0d5 2640 q = prefix_roota(directory, p);
27407a01 2641
e1873695 2642 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2643 if (try)
2644 return 0;
27407a01 2645
baaa35ad
ZJS
2646 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2647 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2648 }
2649
e1873695 2650 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2651 if (try)
2652 return 0;
57fb9fb5 2653
baaa35ad
ZJS
2654 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2655 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2656 }
2657
2658 r = readlink_and_make_absolute(p, &d);
2659 if (r >= 0) {
3742095b 2660 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2661 path_equal(d, q)) {
2662
03cfe0d5 2663 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2664 if (r < 0)
709f6e46 2665 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2666 return 0;
57fb9fb5
LP
2667 }
2668
4a62c710
MS
2669 if (unlink(p) < 0)
2670 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2671 } else if (r == -EINVAL) {
2672
2673 if (arg_link_journal == LINK_GUEST &&
2674 rmdir(p) < 0) {
2675
27407a01
ZJS
2676 if (errno == ENOTDIR) {
2677 log_error("%s already exists and is neither a symlink nor a directory", p);
2678 return r;
4314d33f
MS
2679 } else
2680 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2681 }
4314d33f
MS
2682 } else if (r != -ENOENT)
2683 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2684
2685 if (arg_link_journal == LINK_GUEST) {
2686
2687 if (symlink(q, p) < 0) {
8054d749 2688 if (try) {
56f64d95 2689 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2690 return 0;
4314d33f
MS
2691 } else
2692 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2693 }
2694
03cfe0d5 2695 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2696 if (r < 0)
709f6e46 2697 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2698 return 0;
57fb9fb5
LP
2699 }
2700
2701 if (arg_link_journal == LINK_HOST) {
ccddd104 2702 /* don't create parents here — if the host doesn't have
574edc90 2703 * permanent journal set up, don't force it here */
ba8e6c4d 2704
3f692e2e 2705 r = RET_NERRNO(mkdir(p, 0755));
dae8b82e 2706 if (r < 0 && r != -EEXIST) {
8054d749 2707 if (try) {
dae8b82e 2708 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2709 return 0;
4314d33f 2710 } else
dae8b82e 2711 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2712 }
2713
27407a01
ZJS
2714 } else if (access(p, F_OK) < 0)
2715 return 0;
57fb9fb5 2716
db55bbf2 2717 if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
cdb2b9d0
LP
2718 log_warning("%s is not empty, proceeding anyway.", q);
2719
03cfe0d5 2720 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2721 if (r < 0)
2722 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2723
511a8cfe 2724 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
60e76d48 2725 if (r < 0)
4a62c710 2726 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2727
27407a01 2728 return 0;
57fb9fb5
LP
2729}
2730
de40a303
LP
2731static int drop_capabilities(uid_t uid) {
2732 CapabilityQuintet q;
2733
2734 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2735 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2736 * arg_caps_retain. */
2737
2738 if (capability_quintet_is_set(&arg_full_capabilities)) {
2739 q = arg_full_capabilities;
2740
f5fbe71d 2741 if (q.bounding == UINT64_MAX)
de40a303
LP
2742 q.bounding = uid == 0 ? arg_caps_retain : 0;
2743
f5fbe71d 2744 if (q.effective == UINT64_MAX)
de40a303
LP
2745 q.effective = uid == 0 ? q.bounding : 0;
2746
f5fbe71d 2747 if (q.inheritable == UINT64_MAX)
88fc9c9b 2748 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2749
f5fbe71d 2750 if (q.permitted == UINT64_MAX)
88fc9c9b 2751 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2752
f5fbe71d 2753 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
88fc9c9b 2754 q.ambient = arg_caps_ambient;
f66ad460
AZ
2755
2756 if (capability_quintet_mangle(&q))
2757 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2758
2759 } else {
de40a303
LP
2760 q = (CapabilityQuintet) {
2761 .bounding = arg_caps_retain,
2762 .effective = uid == 0 ? arg_caps_retain : 0,
88fc9c9b
TH
2763 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2764 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
f5fbe71d 2765 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
de40a303
LP
2766 };
2767
f66ad460
AZ
2768 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2769 * in order to maintain the same behavior as systemd < 242. */
2770 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2771 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2772 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2773
2774 }
2775
de40a303 2776 return capability_quintet_enforce(&q);
88213476
LP
2777}
2778
db999e0f
LP
2779static int reset_audit_loginuid(void) {
2780 _cleanup_free_ char *p = NULL;
2781 int r;
2782
0c582db0 2783 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2784 return 0;
2785
2786 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2787 if (r == -ENOENT)
db999e0f 2788 return 0;
f647962d
MS
2789 if (r < 0)
2790 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2791
2792 /* Already reset? */
2793 if (streq(p, "4294967295"))
2794 return 0;
2795
57512c89 2796 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2797 if (r < 0) {
10a87006
LP
2798 log_error_errno(r,
2799 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2800 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2801 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2802 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2803 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2804
db999e0f 2805 sleep(5);
77b6e194 2806 }
db999e0f
LP
2807
2808 return 0;
77b6e194
LP
2809}
2810
e79581dd 2811static int mount_tunnel_dig(const char *root) {
785890ac 2812 const char *p, *q;
709f6e46 2813 int r;
785890ac
LP
2814
2815 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2816 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2817 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2818 (void) mkdir_p(p, 0600);
2819
5a27b395 2820 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
709f6e46 2821 if (r < 0)
5a27b395 2822 return log_error_errno(r, "Failed to create /run/host: %m");
03cfe0d5 2823
e79581dd 2824 r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0);
709f6e46 2825 if (r < 0)
e79581dd 2826 return log_error_errno(r, "Failed to create "NSPAWN_MOUNT_TUNNEL": %m");
03cfe0d5 2827
e79581dd 2828 q = prefix_roota(root, NSPAWN_MOUNT_TUNNEL);
511a8cfe 2829 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
60e76d48
ZJS
2830 if (r < 0)
2831 return r;
785890ac 2832
511a8cfe 2833 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
60e76d48
ZJS
2834 if (r < 0)
2835 return r;
785890ac 2836
e79581dd
CB
2837 return 0;
2838}
2839
2840static int mount_tunnel_open(void) {
2841 int r;
2842
2843 r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
2844 if (r < 0)
2845 return r;
2846
2847 return 0;
785890ac
LP
2848}
2849
317feb4d 2850static int setup_machine_id(const char *directory) {
3bbaff3e 2851 int r;
e01ff70a 2852
317feb4d
LP
2853 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2854 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2855 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2856 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2857 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2858 * container behaves nicely). */
2859
319477f1 2860 r = id128_get_machine(directory, &arg_uuid);
317feb4d 2861 if (r < 0) {
74e795ee 2862 if (!ERRNO_IS_MACHINE_ID_UNSET(r)) /* If the file is missing, empty, or uninitialized, we don't mind */
317feb4d 2863 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2864
317feb4d
LP
2865 if (sd_id128_is_null(arg_uuid)) {
2866 r = sd_id128_randomize(&arg_uuid);
2867 if (r < 0)
2868 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2869 }
317feb4d 2870 }
691675ba 2871
e01ff70a
MS
2872 return 0;
2873}
2874
7336138e
LP
2875static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2876 int r;
2877
2878 assert(directory);
2879
6c045a99 2880 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
7336138e
LP
2881 return 0;
2882
2883 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2884 if (r == -EOPNOTSUPP)
2885 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2886 if (r == -EBADE)
2887 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2888 if (r < 0)
2889 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2890 if (r == 0)
2891 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2892 else
2893 log_debug("Patched directory tree to match UID/GID range.");
2894
2895 return r;
2896}
2897
113cea80 2898/*
6d416b9c
LS
2899 * Return values:
2900 * < 0 : wait_for_terminate() failed to get the state of the
2901 * container, the container was terminated by a signal, or
2902 * failed for an unknown reason. No change is made to the
2903 * container argument.
2904 * > 0 : The program executed in the container terminated with an
2905 * error. The exit code of the program executed in the
919699ec
LP
2906 * container is returned. The container argument has been set
2907 * to CONTAINER_TERMINATED.
6d416b9c
LS
2908 * 0 : The container is being rebooted, has been shut down or exited
2909 * successfully. The container argument has been set to either
2910 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2911 *
6d416b9c
LS
2912 * That is, success is indicated by a return value of zero, and an
2913 * error is indicated by a non-zero value.
113cea80
DH
2914 */
2915static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2916 siginfo_t status;
919699ec 2917 int r;
113cea80
DH
2918
2919 r = wait_for_terminate(pid, &status);
f647962d
MS
2920 if (r < 0)
2921 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2922
2923 switch (status.si_code) {
fddbb89c 2924
113cea80 2925 case CLD_EXITED:
b5a2179b 2926 if (status.si_status == 0)
919699ec 2927 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2928 else
919699ec 2929 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2930
919699ec
LP
2931 *container = CONTAINER_TERMINATED;
2932 return status.si_status;
113cea80
DH
2933
2934 case CLD_KILLED:
2935 if (status.si_status == SIGINT) {
919699ec 2936 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2937 *container = CONTAINER_TERMINATED;
919699ec
LP
2938 return 0;
2939
113cea80 2940 } else if (status.si_status == SIGHUP) {
919699ec 2941 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2942 *container = CONTAINER_REBOOTED;
919699ec 2943 return 0;
113cea80 2944 }
919699ec 2945
4831981d 2946 _fallthrough_;
113cea80 2947 case CLD_DUMPED:
baaa35ad
ZJS
2948 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2949 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2950
2951 default:
baaa35ad
ZJS
2952 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2953 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2954 }
113cea80
DH
2955}
2956
023fb90b
LP
2957static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2958 pid_t pid;
2959
4a0b58c4 2960 pid = PTR_TO_PID(userdata);
023fb90b 2961 if (pid > 0) {
c6c8f6e2 2962 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2963 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2964 sd_event_source_set_userdata(s, NULL);
2965 return 0;
2966 }
2967 }
2968
2969 sd_event_exit(sd_event_source_get_event(s), 0);
2970 return 0;
2971}
2972
6916b164 2973static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2974 pid_t pid;
2975
2976 assert(s);
2977 assert(ssi);
2978
2979 pid = PTR_TO_PID(userdata);
2980
6916b164
AU
2981 for (;;) {
2982 siginfo_t si = {};
abdb9b08 2983
6916b164
AU
2984 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2985 return log_error_errno(errno, "Failed to waitid(): %m");
2986 if (si.si_pid == 0) /* No pending children. */
2987 break;
abdb9b08 2988 if (si.si_pid == pid) {
6916b164
AU
2989 /* The main process we care for has exited. Return from
2990 * signal handler but leave the zombie. */
2991 sd_event_exit(sd_event_source_get_event(s), 0);
2992 break;
2993 }
abdb9b08 2994
6916b164
AU
2995 /* Reap all other children. */
2996 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2997 }
2998
2999 return 0;
3000}
3001
abdb9b08
LP
3002static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
3003 pid_t pid;
3004
3005 assert(m);
3006
3007 pid = PTR_TO_PID(userdata);
3008
3009 if (arg_kill_signal > 0) {
3010 log_info("Container termination requested. Attempting to halt container.");
3011 (void) kill(pid, arg_kill_signal);
3012 } else {
3013 log_info("Container termination requested. Exiting.");
3014 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
3015 }
3016
3017 return 0;
3018}
3019
ec16945e 3020static int determine_names(void) {
1b9cebf6 3021 int r;
ec16945e 3022
c1521918
LP
3023 if (arg_template && !arg_directory && arg_machine) {
3024
3025 /* If --template= was specified then we should not
3026 * search for a machine, but instead create a new one
3027 * in /var/lib/machine. */
3028
657ee2d8 3029 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
3030 if (!arg_directory)
3031 return log_oom();
3032 }
3033
ec16945e 3034 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3035 if (arg_machine) {
3036 _cleanup_(image_unrefp) Image *i = NULL;
3037
d577d4a4 3038 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3a6ce860
LP
3039 if (r == -ENOENT)
3040 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
3041 if (r < 0)
3042 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 3043
eb38edce 3044 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 3045 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 3046 else
0f03c2a4 3047 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 3048 if (r < 0)
0f3be6ca 3049 return log_oom();
1b9cebf6 3050
aee327b8
LP
3051 if (!arg_ephemeral)
3052 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
3053 } else {
3054 r = safe_getcwd(&arg_directory);
3055 if (r < 0)
3056 return log_error_errno(r, "Failed to determine current directory: %m");
3057 }
ec16945e 3058
c6147113
LP
3059 if (!arg_directory && !arg_image)
3060 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
3061 }
3062
3063 if (!arg_machine) {
b9ba4dab
LP
3064 if (arg_directory && path_equal(arg_directory, "/"))
3065 arg_machine = gethostname_malloc();
e9b88a6d
LP
3066 else if (arg_image) {
3067 char *e;
4827ab48 3068
b36e39d2
LP
3069 r = path_extract_filename(arg_image, &arg_machine);
3070 if (r < 0)
3071 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
4827ab48 3072
e9b88a6d
LP
3073 /* Truncate suffix if there is one */
3074 e = endswith(arg_machine, ".raw");
3075 if (e)
3076 *e = 0;
b36e39d2
LP
3077 } else {
3078 r = path_extract_filename(arg_directory, &arg_machine);
3079 if (r < 0)
3080 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory);
3081 }
ec16945e 3082
ae691c1d 3083 hostname_cleanup(arg_machine);
52ef5dd7 3084 if (!hostname_is_valid(arg_machine, 0))
c6147113 3085 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab 3086
3603f151
LB
3087 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3088 * to match fixed config file names. */
3089 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3090 if (!arg_settings_filename)
3091 return log_oom();
3092
e9b88a6d
LP
3093 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3094 * instances at once without manually having to specify -M each time. */
3095 if (arg_ephemeral)
3096 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
b9ba4dab 3097 return log_oom();
3603f151
LB
3098 } else {
3099 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3100 if (!arg_settings_filename)
3101 return log_oom();
ec16945e
LP
3102 }
3103
3104 return 0;
3105}
3106
/* Resolves the path stored in *p with chase() (passing 'flags' through) and replaces *p with the result.
 * A NULL *p is left alone.
 *
 * Returns 0 on success, a negative errno-style value if the path could not be resolved. */
static int chase_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase(*p, NULL, flags, &resolved, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                /* Frees the old string and takes over the resolved one. */
                return free_and_replace(*p, resolved);
        }

        return 0;
}
3122
03cfe0d5 3123static int determine_uid_shift(const char *directory) {
6dac160c 3124
0de7acce 3125 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 3126 arg_uid_shift = 0;
6dac160c 3127 return 0;
03cfe0d5 3128 }
6dac160c
LP
3129
3130 if (arg_uid_shift == UID_INVALID) {
3131 struct stat st;
3132
993da6d4
LP
3133 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3134
3135 if (stat(directory, &st) < 0)
03cfe0d5 3136 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
3137
3138 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3139
baaa35ad
ZJS
3140 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3141 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3142 "UID and GID base of %s don't match.", directory);
6dac160c
LP
3143
3144 arg_uid_range = UINT32_C(0x10000);
f61c7f88
LP
3145
3146 if (arg_uid_shift != 0) {
3147 /* If the image is shifted already, then we'll fall back to classic chowning, for
3148 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3149
3150 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3151 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3152 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3153 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3154 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3155 "UID base of %s is not zero, UID mapping not supported.", directory);
3156 }
6dac160c
LP
3157 }
3158
58e13de5
LP
3159 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3160 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
6dac160c 3161
6dac160c
LP
3162 return 0;
3163}
3164
de40a303
LP
3165static unsigned long effective_clone_ns_flags(void) {
3166 unsigned long flags = arg_clone_ns_flags;
3167
3168 if (arg_private_network)
3169 flags |= CLONE_NEWNET;
3170 if (arg_use_cgns)
3171 flags |= CLONE_NEWCGROUP;
3172 if (arg_userns_mode != USER_NAMESPACE_NO)
3173 flags |= CLONE_NEWUSER;
3174
3175 return flags;
3176}
3177
3178static int patch_sysctl(void) {
3179
3180 /* This table is inspired by runc's sysctl() function */
3181 static const struct {
3182 const char *key;
3183 bool prefix;
3184 unsigned long clone_flags;
3185 } safe_sysctl[] = {
3186 { "kernel.hostname", false, CLONE_NEWUTS },
3187 { "kernel.domainname", false, CLONE_NEWUTS },
3188 { "kernel.msgmax", false, CLONE_NEWIPC },
3189 { "kernel.msgmnb", false, CLONE_NEWIPC },
3190 { "kernel.msgmni", false, CLONE_NEWIPC },
3191 { "kernel.sem", false, CLONE_NEWIPC },
3192 { "kernel.shmall", false, CLONE_NEWIPC },
3193 { "kernel.shmmax", false, CLONE_NEWIPC },
3194 { "kernel.shmmni", false, CLONE_NEWIPC },
3195 { "fs.mqueue.", true, CLONE_NEWIPC },
3196 { "net.", true, CLONE_NEWNET },
3197 };
3198
3199 unsigned long flags;
de40a303
LP
3200 int r;
3201
3202 flags = effective_clone_ns_flags();
3203
3204 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3205 bool good = false;
3206 size_t i;
3207
3208 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3209
3210 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3211 continue;
3212
3213 if (safe_sysctl[i].prefix)
3214 good = startswith(*k, safe_sysctl[i].key);
3215 else
3216 good = streq(*k, safe_sysctl[i].key);
3217
3218 if (good)
3219 break;
3220 }
3221
c6147113
LP
3222 if (!good)
3223 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
3224
3225 r = sysctl_write(*k, *v);
3226 if (r < 0)
3227 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3228 }
3229
3230 return 0;
3231}
3232
03cfe0d5
LP
3233static int inner_child(
3234 Barrier *barrier,
5d9d3fcb 3235 int fd_inner_socket,
e1bb4b0d
LB
3236 FDSet *fds,
3237 char **os_release_pairs) {
69c79d3c 3238
03cfe0d5 3239 _cleanup_free_ char *home = NULL;
88614c8a 3240 size_t n_env = 1;
4ab3d29f
ZJS
3241 char *envp[] = {
3242 (char*) "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 3243 NULL, /* container */
03cfe0d5
LP
3244 NULL, /* TERM */
3245 NULL, /* HOME */
3246 NULL, /* USER */
3247 NULL, /* LOGNAME */
3248 NULL, /* container_uuid */
3249 NULL, /* LISTEN_FDS */
3250 NULL, /* LISTEN_PID */
9c1e04d0 3251 NULL, /* NOTIFY_SOCKET */
3652872a 3252 NULL, /* CREDENTIALS_DIRECTORY */
b626f695 3253 NULL, /* LANG */
03cfe0d5
LP
3254 NULL
3255 };
1a68e1e5 3256 const char *exec_target;
2371271c 3257 _cleanup_strv_free_ char **env_use = NULL;
de40a303 3258 int r, which_failed;
88213476 3259
b37469d7
LP
3260 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3261 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3262 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3263 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3264 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3265 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3266 * namespace.
3267 *
3268 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3269 * unshare(). See below. */
3270
03cfe0d5 3271 assert(barrier);
5d9d3fcb 3272 assert(fd_inner_socket >= 0);
88213476 3273
de40a303
LP
3274 log_debug("Inner child is initializing.");
3275
0de7acce 3276 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3277 /* Tell the parent, that it now can write the UID map. */
3278 (void) barrier_place(barrier); /* #1 */
7027ff61 3279
03cfe0d5 3280 /* Wait until the parent wrote the UID map */
baaa35ad 3281 if (!barrier_place_and_sync(barrier)) /* #2 */
2a2e78e9 3282 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
88213476 3283
2a2e78e9
LP
3284 /* Become the new root user inside our namespace */
3285 r = reset_uid_gid();
3286 if (r < 0)
3287 return log_error_errno(r, "Couldn't become new root: %m");
3288
3289 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3290 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3291 * propagation, but simply create new peer groups for all our mounts). */
511a8cfe 3292 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
2a2e78e9
LP
3293 if (r < 0)
3294 return r;
3295 }
6d66bd3b 3296
0de7acce 3297 r = mount_all(NULL,
4f086aab 3298 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 3299 arg_uid_shift,
0de7acce 3300 arg_selinux_apifs_context);
03cfe0d5
LP
3301 if (r < 0)
3302 return r;
3303
04413780
ZJS
3304 if (!arg_network_namespace_path && arg_private_network) {
3305 r = unshare(CLONE_NEWNET);
3306 if (r < 0)
3307 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
3308
3309 /* Tell the parent that it can setup network interfaces. */
3310 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
3311 }
3312
4f086aab 3313 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
3314 if (r < 0)
3315 return r;
3316
03cfe0d5
LP
3317 /* Wait until we are cgroup-ified, so that we
3318 * can mount the right cgroup path writable */
baaa35ad
ZJS
3319 if (!barrier_place_and_sync(barrier)) /* #4 */
3320 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3321 "Parent died too early");
88213476 3322
489fae52 3323 if (arg_use_cgns) {
0996ef00
CB
3324 r = unshare(CLONE_NEWCGROUP);
3325 if (r < 0)
04413780 3326 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
3327 r = mount_cgroups(
3328 "",
3329 arg_unified_cgroup_hierarchy,
3330 arg_userns_mode != USER_NAMESPACE_NO,
3331 arg_uid_shift,
3332 arg_uid_range,
5a8ff0e6 3333 arg_selinux_apifs_context,
ada54120 3334 true);
1433e0f2 3335 } else
0996ef00 3336 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
1433e0f2
LP
3337 if (r < 0)
3338 return r;
ec16945e 3339
1e4f1671 3340 r = setup_boot_id();
03cfe0d5
LP
3341 if (r < 0)
3342 return r;
ec16945e 3343
5d9d3fcb 3344 r = setup_kmsg(fd_inner_socket);
03cfe0d5
LP
3345 if (r < 0)
3346 return r;
ec16945e 3347
de40a303
LP
3348 r = mount_custom(
3349 "/",
3350 arg_custom_mounts,
3351 arg_n_custom_mounts,
de40a303 3352 0,
c0c8f718 3353 0,
de40a303 3354 arg_selinux_apifs_context,
5f0a6347 3355 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
de40a303
LP
3356 if (r < 0)
3357 return r;
3358
03cfe0d5
LP
3359 if (setsid() < 0)
3360 return log_error_errno(errno, "setsid() failed: %m");
3361
3362 if (arg_private_network)
df883de9 3363 (void) loopback_setup();
03cfe0d5 3364
7a8f6325 3365 if (arg_expose_ports) {
b07ee903 3366 r = expose_port_send_rtnl(fd_inner_socket);
7a8f6325
LP
3367 if (r < 0)
3368 return r;
7a8f6325 3369 }
03cfe0d5 3370
3acc84eb 3371 if (arg_console_mode != CONSOLE_PIPE) {
5bb1d7fb 3372 _cleanup_close_ int master = -EBADF;
3acc84eb
FB
3373 _cleanup_free_ char *console = NULL;
3374
3375 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3376 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3377 if (master < 0)
dc98caea 3378 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3379
3380 r = setup_dev_console(console);
3381 if (r < 0)
105a1a36 3382 return log_error_errno(r, "Failed to set up /dev/console: %m");
3acc84eb 3383
bb1aa185 3384 r = send_one_fd(fd_inner_socket, master, 0);
3acc84eb
FB
3385 if (r < 0)
3386 return log_error_errno(r, "Failed to send master fd: %m");
3acc84eb
FB
3387
3388 r = setup_stdio_as_dev_console();
3389 if (r < 0)
3390 return r;
3391 }
3392
de40a303
LP
3393 r = patch_sysctl();
3394 if (r < 0)
3395 return r;
3396
81f345df
LP
3397 if (arg_oom_score_adjust_set) {
3398 r = set_oom_score_adjust(arg_oom_score_adjust);
3399 if (r < 0)
3400 return log_error_errno(r, "Failed to adjust OOM score: %m");
3401 }
3402
0985c7c4
ZJS
3403 if (arg_cpu_set.set)
3404 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3405 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3406
c818eef1 3407 (void) setup_hostname();
03cfe0d5 3408
050f7277 3409 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3410 r = safe_personality(arg_personality);
3411 if (r < 0)
3412 return log_error_errno(r, "personality() failed: %m");
4c27749b
LP
3413#ifdef ARCHITECTURE_SECONDARY
3414 } else if (arg_architecture == ARCHITECTURE_SECONDARY) {
21022b9d
LP
3415 r = safe_personality(PER_LINUX32);
3416 if (r < 0)
3417 return log_error_errno(r, "personality() failed: %m");
4c27749b
LP
3418#endif
3419 } else if (arg_architecture >= 0 && arg_architecture != native_architecture())
3420 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3421 "Selected architecture '%s' not supported locally, refusing.",
3422 architecture_to_string(arg_architecture));
03cfe0d5 3423
de40a303
LP
3424 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3425 if (r < 0)
3426 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3427
3428#if HAVE_SECCOMP
3429 if (arg_seccomp) {
3430
3431 if (is_seccomp_available()) {
3432
3433 r = seccomp_load(arg_seccomp);
7bc5e0b1 3434 if (ERRNO_IS_SECCOMP_FATAL(r))
de40a303
LP
3435 return log_error_errno(r, "Failed to install seccomp filter: %m");
3436 if (r < 0)
3437 log_debug_errno(r, "Failed to install seccomp filter: %m");
3438 }
3439 } else
3440#endif
3441 {
6b000af4 3442 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
de40a303
LP
3443 if (r < 0)
3444 return r;
3445 }
3446
4a4654e0 3447 if (arg_suppress_sync) {
20e458ae 3448#if HAVE_SECCOMP
4a4654e0
LP
3449 r = seccomp_suppress_sync();
3450 if (r < 0)
3451 log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
20e458ae 3452#else
2db32618 3453 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
20e458ae 3454#endif
4a4654e0
LP
3455 }
3456
349cc4a5 3457#if HAVE_SELINUX
03cfe0d5 3458 if (arg_selinux_context)
2ed96880 3459 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3460 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3461#endif
3462
de40a303
LP
3463 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3464 * if we need to later on. */
3465 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3466 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3467
3468 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3462d773 3469 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
de40a303 3470 else
3462d773 3471 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
03cfe0d5
LP
3472 if (r < 0)
3473 return r;
3474
de40a303
LP
3475 r = drop_capabilities(getuid());
3476 if (r < 0)
3477 return log_error_errno(r, "Dropping capabilities failed: %m");
3478
66edd963
LP
3479 if (arg_no_new_privileges)
3480 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3481 return log_error_errno(errno, "Failed to disable new privileges: %m");
3482
6aadfa4c
ILG
3483 /* LXC sets container=lxc, so follow the scheme here */
3484 envp[n_env++] = strjoina("container=", arg_container_service_name);
3485
03cfe0d5
LP
3486 envp[n_env] = strv_find_prefix(environ, "TERM=");
3487 if (envp[n_env])
313cefa1 3488 n_env++;
03cfe0d5 3489
de40a303 3490 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
4ab3d29f 3491 if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
de40a303
LP
3492 return log_oom();
3493
3494 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
4ab3d29f 3495 if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
1da3cb81 3496 asprintf(envp + n_env++, "LOGNAME=%s", arg_user ?: "root") < 0)
de40a303 3497 return log_oom();
03cfe0d5 3498
3bbaff3e 3499 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3500
b7416360 3501 if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
e01ff70a 3502 return log_oom();
03cfe0d5
LP
3503
3504 if (fdset_size(fds) > 0) {
3505 r = fdset_cloexec(fds, false);
3506 if (r < 0)
3507 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3508
4ab3d29f
ZJS
3509 if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3510 (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
03cfe0d5
LP
3511 return log_oom();
3512 }
4ab3d29f 3513 if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
9c1e04d0 3514 return log_oom();
03cfe0d5 3515
3652872a
LP
3516 if (arg_n_credentials > 0) {
3517 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3518 if (!envp[n_env])
3519 return log_oom();
3520 n_env++;
3521 }
3522
b626f695 3523 if (arg_start_mode != START_BOOT) {
a22f5186 3524 envp[n_env] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE);
b626f695
DDM
3525 if (!envp[n_env])
3526 return log_oom();
3527 n_env++;
3528 }
3529
4ab3d29f 3530 env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
2371271c
TG
3531 if (!env_use)
3532 return log_oom();
03cfe0d5
LP
3533
3534 /* Let the parent know that we are ready and
3535 * wait until the parent is ready with the
3536 * setup, too... */
baaa35ad 3537 if (!barrier_place_and_sync(barrier)) /* #5 */
335d2ead 3538 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
03cfe0d5 3539
5f932eb9
LP
3540 if (arg_chdir)
3541 if (chdir(arg_chdir) < 0)
3542 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3543
7732f92b 3544 if (arg_start_mode == START_PID2) {
75bf701f 3545 r = stub_pid1(arg_uuid);
7732f92b
LP
3546 if (r < 0)
3547 return r;
3548 }
3549
335d2ead
LP
3550 if (arg_console_mode != CONSOLE_PIPE) {
3551 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3552 * are configured for that. Acquire it as controlling tty. */
3553 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3554 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3555 }
3556
de40a303
LP
3557 log_debug("Inner child completed, invoking payload.");
3558
8ca082b4
LP
3559 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3560 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3561 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3562 log_close();
8ca082b4 3563 log_set_open_when_needed(true);
a3b00f91 3564 log_settle_target();
8ca082b4 3565
03cfe0d5
LP
3566 (void) fdset_close_others(fds);
3567
7732f92b 3568 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3569 char **a;
3570 size_t m;
3571
3572 /* Automatically search for the init system */
3573
75f32f04
ZJS
3574 m = strv_length(arg_parameters);
3575 a = newa(char*, m + 2);
3576 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3577 a[1 + m] = NULL;
03cfe0d5 3578
a5096641
LP
3579 FOREACH_STRING(init,
3580 "/usr/lib/systemd/systemd",
3581 "/lib/systemd/systemd",
3582 "/sbin/init") {
3583 a[0] = (char*) init;
3584 execve(a[0], a, env_use);
3585 }
ced58da7
LP
3586
3587 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3588 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3589 const char *dollar_path;
3590
1a68e1e5 3591 exec_target = arg_parameters[0];
b6b180b7
LP
3592
3593 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3594 * binary. */
3595 dollar_path = strv_env_get(env_use, "PATH");
3596 if (dollar_path) {
6f646e01 3597 if (setenv("PATH", dollar_path, 1) < 0)
b6b180b7
LP
3598 return log_error_errno(errno, "Failed to update $PATH: %m");
3599 }
3600
f757855e 3601 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3602 } else {
5f932eb9 3603 if (!arg_chdir)
d929b0f9
ZJS
3604 /* If we cannot change the directory, we'll end up in /, that is expected. */
3605 (void) chdir(home ?: "/root");
5f932eb9 3606
53350c7b 3607 execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
3608 if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
3609 execle("/bin/bash", "-bash", NULL, env_use);
3610 if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
3611 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7 3612
53350c7b 3613 exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
03cfe0d5
LP
3614 }
3615
8ca082b4 3616 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3617}
3618
e96ceaba 3619static int setup_notify_child(void) {
254d1313 3620 _cleanup_close_ int fd = -EBADF;
1eb874b9 3621 static const union sockaddr_union sa = {
44ed5214
LP
3622 .un.sun_family = AF_UNIX,
3623 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3624 };
3625 int r;
3626
3627 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3628 if (fd < 0)
3629 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3630
3631 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3632 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3633
9c1e04d0 3634 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3635 if (r < 0)
44ed5214 3636 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3637
adc7d9f0 3638 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3639 if (r < 0)
adc7d9f0 3640 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3641
2ff48e98 3642 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3643 if (r < 0)
2ff48e98 3644 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3645
271f518f 3646 return TAKE_FD(fd);
9c1e04d0
AP
3647}
3648
03cfe0d5
LP
/* Body of the "outer" child: prepares the container tree below 'directory', then forks the "inner"
 * child which ends up running inner_child().
 *
 * barrier          - passed on unchanged to inner_child()
 * directory        - root of the tree to set up; replaced by a bind mount at /run/systemd/nspawn-root
 *                    if it equals "/"
 * dissected_image  - optional; if non-NULL its partitions are mounted onto 'directory'
 * fd_outer_socket  - socket to the parent; receives (depending on configuration and in this order)
 *                    the pinned mount namespace fd and the UID shift, the --bind-user= UID/GID
 *                    quadruplets, the detected cgroup mode, the inner child's PID, arg_uuid and the
 *                    notify socket fd
 * fd_inner_socket  - passed on to inner_child(); closed here once the inner child has been forked
 * fds              - passed on to inner_child()
 * netns_fd         - network namespace the inner child joins if arg_network_namespace_path is set
 *
 * Returns 0 on success and a negative value on failure. In the forked inner child the function
 * _exit()s with the result of inner_child(), except that a failure to join the network namespace is
 * returned to the caller. */
3649static int outer_child(
3650 Barrier *barrier,
3651 const char *directory,
2d845785 3652 DissectedImage *dissected_image,
af06cd30 3653 int fd_outer_socket,
5d9d3fcb 3654 int fd_inner_socket,
d7bea6b6
DP
3655 FDSet *fds,
3656 int netns_fd) {
03cfe0d5 3657
2f893044 3658 _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
e1bb4b0d 3659 _cleanup_strv_free_ char **os_release_pairs = NULL;
254d1313 3660 _cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
f61c7f88 3661 bool idmap = false;
e5f10caf 3662 const char *p;
03cfe0d5
LP
3663 pid_t pid;
3664 ssize_t l;
de40a303 3665 int r;
03cfe0d5 3666
d1d0b895
LP
3667 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It
3668 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3669 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3670 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3671 * forked off it, and it exits. */
b37469d7 3672
03cfe0d5
LP
3673 assert(barrier);
3674 assert(directory);
af06cd30 3675 assert(fd_outer_socket >= 0);
5d9d3fcb 3676 assert(fd_inner_socket >= 0);
03cfe0d5 3677
de40a303
LP
3678 log_debug("Outer child is initializing.");
3679
e1bb4b0d
LB
3680 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3681 if (r < 0)
3682 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3683
03cfe0d5
LP
3684 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3685 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3686
03cfe0d5
LP
3687 r = reset_audit_loginuid();
3688 if (r < 0)
3689 return r;
3690
2a2e78e9
LP
3691 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3692 * mounts to the real root. */
511a8cfe 3693 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
60e76d48
ZJS
3694 if (r < 0)
3695 return r;
03cfe0d5 3696
2d845785 3697 if (dissected_image) {
d1d0b895
LP
3698 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3699 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3700 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3701 * right place right away. This makes sure ESP partitions and userns are compatible. */
2d3a5a73 3702
af187ab2 3703 r = dissected_image_mount_and_warn(
d04faa4e
LP
3704 dissected_image,
3705 directory,
3706 arg_uid_shift,
21b61b1d 3707 arg_uid_range,
d04faa4e
LP
3708 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3709 DISSECT_IMAGE_DISCARD_ON_LOOP|
3710 DISSECT_IMAGE_USR_NO_ROOT|
c65f854a 3711 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
af187ab2 3712 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785 3713 if (r < 0)
af187ab2 3714 return r;
2d845785 3715 }
03cfe0d5 3716
391567f4
LP
3717 r = determine_uid_shift(directory);
3718 if (r < 0)
3719 return r;
3720
0de7acce 3721 if (arg_userns_mode != USER_NAMESPACE_NO) {
b71a0192
CB
3722 r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL);
3723 if (r < 0)
3724 return log_error_errno(r, "Failed to pin outer mount namespace: %m");
3725
af06cd30 3726 l = send_one_fd(fd_outer_socket, mntns_fd, 0);
b71a0192
CB
3727 if (l < 0)
3728 return log_error_errno(l, "Failed to send outer mount namespace fd: %m");
3729 mntns_fd = safe_close(mntns_fd);
3730
0e7ac751 3731 /* Let the parent know which UID shift we read from the image */
af06cd30 3732 l = send(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
825d5287
RM
3733 if (l < 0)
3734 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3735 if (l != sizeof(arg_uid_shift))
3736 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3737 "Short write while sending UID shift.");
0e7ac751 3738
0de7acce 3739 if (arg_userns_mode == USER_NAMESPACE_PICK) {
d1d0b895
LP
3740 /* When we are supposed to pick the UID shift, the parent will check now whether the
3741 * UID shift we just read from the image is available. If yes, it will send the UID
3742 * shift back to us, if not it will pick a different one, and send it back to us. */
0e7ac751 3743
af06cd30 3744 l = recv(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
0e7ac751
LP
3745 if (l < 0)
3746 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3747 if (l != sizeof(arg_uid_shift))
3748 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3749 "Short read while receiving UID shift.");
0e7ac751
LP
3750 }
3751
ff6c6cc1
LP
3752 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3753 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3754 }
3755
6f83d3d1
LP
3756 if (path_equal(directory, "/")) {
3757 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3758 * place, so that we can make changes to its mount structure (for example, to implement
3759 * --volatile=) without this interfering with our ability to access files such as
3760 * /etc/localtime to copy into the container. Note that we use a fixed place for this
6c2d70ce 3761 * (instead of a temporary directory, since we are living in our own mount namespace here
7802194a 3762 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
6f83d3d1
LP
3763 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3764
511a8cfe 3765 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
6f83d3d1
LP
3766 if (r < 0)
3767 return r;
3768
3769 directory = "/run/systemd/nspawn-root";
e50cd82f 3770 }
7d0ecdd6 3771
75f81732
LP
3772 /* Make sure we always have a mount that we can move to root later on. */
3773 r = make_mount_point(directory);
3774 if (r < 0)
3775 return r;
3776
3777 /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
3778 * mount namespace. For the directory we are going to run our container let's turn this off, so that
3779 * we'll live in our own little world from now on, and propagation from the host may only happen via
3780 * the mount tunnel dir, or not at all. */
3781 r = mount_follow_verbose(LOG_ERR, NULL, directory, NULL, MS_PRIVATE|MS_REC, NULL);
3782 if (r < 0)
3783 return r;
3784
7d0ecdd6
LP
3785 r = setup_pivot_root(
3786 directory,
3787 arg_pivot_root_new,
3788 arg_pivot_root_old);
3789 if (r < 0)
3790 return r;
3791
3792 r = setup_volatile_mode(
3793 directory,
3794 arg_volatile_mode,
7d0ecdd6 3795 arg_uid_shift,
8f1ed04a 3796 arg_selinux_apifs_context);
7d0ecdd6
LP
3797 if (r < 0)
3798 return r;
3799
2f893044
LP
3800 r = bind_user_prepare(
3801 directory,
3802 arg_bind_user,
3803 arg_uid_shift,
3804 arg_uid_range,
3805 &arg_custom_mounts, &arg_n_custom_mounts,
3806 &bind_user_context);
3807 if (r < 0)
3808 return r;
3809
3810 if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
d1d0b895
LP
3811 /* Send the user maps we determined to the parent, so that it installs it in our user
3812 * namespace UID map table */
2f893044
LP
3813
3814 for (size_t i = 0; i < bind_user_context->n_data; i++) {
3815 uid_t map[] = {
3816 bind_user_context->data[i].payload_user->uid,
3817 bind_user_context->data[i].host_user->uid,
3818 (uid_t) bind_user_context->data[i].payload_group->gid,
3819 (uid_t) bind_user_context->data[i].host_group->gid,
3820 };
3821
af06cd30 3822 l = send(fd_outer_socket, map, sizeof(map), MSG_NOSIGNAL);
2f893044
LP
3823 if (l < 0)
3824 return log_error_errno(errno, "Failed to send user UID map: %m");
3825 if (l != sizeof(map))
3826 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3827 "Short write while sending user UID map.");
3828 }
3829 }
3830
5f0a6347
DDM
3831 r = mount_custom(
3832 directory,
3833 arg_custom_mounts,
3834 arg_n_custom_mounts,
5f0a6347 3835 arg_uid_shift,
c0c8f718 3836 arg_uid_range,
5f0a6347
DDM
3837 arg_selinux_apifs_context,
3838 MOUNT_ROOT_ONLY);
3839 if (r < 0)
3840 return r;
3841
c0c8f718
AV
3842 if (arg_userns_mode != USER_NAMESPACE_NO &&
3843 IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3844 arg_uid_shift != 0) {
3845
2b2777ed 3846 r = remount_idmap(directory, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
c0c8f718
AV
3847 if (r == -EINVAL || ERRNO_IS_NOT_SUPPORTED(r)) {
3848 /* This might fail because the kernel or file system doesn't support idmapping. We
3849 * can't really distinguish this nicely, nor do we have any guarantees about the
3850 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3851 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3852 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3853 "ID mapped mounts are apparently not available, sorry.");
3854
3855 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3856 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3857 } else if (r < 0)
3858 return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3859 else {
3860 log_debug("ID mapped mounts available, making use of them.");
3861 idmap = true;
3862 }
3863 }
3864
2d3a5a73
LP
3865 if (dissected_image) {
3866 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
d04faa4e
LP
3867 r = dissected_image_mount(
3868 dissected_image,
3869 directory,
3870 arg_uid_shift,
21b61b1d 3871 arg_uid_range,
d04faa4e
LP
3872 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3873 DISSECT_IMAGE_DISCARD_ON_LOOP|
3874 DISSECT_IMAGE_USR_NO_ROOT|
f61c7f88
LP
3875 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3876 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
4fcb96ce
LP
3877 if (r == -EUCLEAN)
3878 return log_error_errno(r, "File system check for image failed: %m");
2d3a5a73 3879 if (r < 0)
4fcb96ce 3880 return log_error_errno(r, "Failed to mount image file system: %m");
2d3a5a73
LP
3881 }
3882
8199d554
LP
3883 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3884 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3885
3886 r = detect_unified_cgroup_hierarchy_from_image(directory);
3887 if (r < 0)
3888 return r;
3889
fefb7a6d 3890 l = send(fd_outer_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
8199d554
LP
3891 if (l < 0)
3892 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3893 if (l != sizeof(arg_unified_cgroup_hierarchy))
3894 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3895 "Short write while sending cgroup mode.");
8199d554
LP
3896 }
3897
4ad14eff
LP
3898 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3899 if (r < 0)
3900 return r;
3901
03cfe0d5
LP
3902 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3903 if (r < 0)
3904 return r;
3905
bbd407ea
DDM
3906 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3907 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
64e82c19 3908 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3909 if (r < 0)
3910 return log_error_errno(r, "Failed to make tree read-only: %m");
3911 }
3912
0de7acce 3913 r = mount_all(directory,
4f086aab 3914 arg_mount_settings,
0de7acce 3915 arg_uid_shift,
0de7acce 3916 arg_selinux_apifs_context);
03cfe0d5
LP
3917 if (r < 0)
3918 return r;
3919
07fa00f9
LP
3920 r = copy_devnodes(directory);
3921 if (r < 0)
03cfe0d5
LP
3922 return r;
3923
de40a303
LP
3924 r = make_extra_nodes(directory);
3925 if (r < 0)
3926 return r;
3927
3928 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
e5f10caf 3929
9fac5029 3930 p = prefix_roota(directory, "/run/host");
e5f10caf 3931 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
03cfe0d5 3932
07fa00f9
LP
3933 r = setup_pts(directory);
3934 if (r < 0)
03cfe0d5
LP
3935 return r;
3936
e79581dd 3937 r = mount_tunnel_dig(directory);
03cfe0d5
LP
3938 if (r < 0)
3939 return r;
3940
8e5430c4
LP
3941 r = setup_keyring();
3942 if (r < 0)
3943 return r;
3944
3652872a
LP
3945 r = setup_credentials(directory);
3946 if (r < 0)
3947 return r;
3948
2f893044
LP
3949 r = bind_user_setup(bind_user_context, directory);
3950 if (r < 0)
3951 return r;
3952
5c4deb9a
MJ
3953 r = mount_custom(
3954 directory,
3955 arg_custom_mounts,
3956 arg_n_custom_mounts,
3957 arg_uid_shift,
c0c8f718 3958 arg_uid_range,
5c4deb9a
MJ
3959 arg_selinux_apifs_context,
3960 MOUNT_NON_ROOT_ONLY);
3961 if (r < 0)
3962 return r;
3963
03cfe0d5
LP
3964 r = setup_timezone(directory);
3965 if (r < 0)
3966 return r;
3967
3968 r = setup_resolv_conf(directory);
3969 if (r < 0)
3970 return r;
3971
e01ff70a
MS
3972 r = setup_machine_id(directory);
3973 if (r < 0)
3974 return r;
3975
03cfe0d5
LP
3976 r = setup_journal(directory);
3977 if (r < 0)
3978 return r;
3979
0f48ba7b
LP
3980 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3981 p = prefix_roota(directory, "/run/host/container-manager");
3982 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3983
3984 /* The same stuff as the $container_uuid env var */
3985 p = prefix_roota(directory, "/run/host/container-uuid");
3986 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3987
489fae52 3988 if (!arg_use_cgns) {
0996ef00
CB
3989 r = mount_cgroups(
3990 directory,
3991 arg_unified_cgroup_hierarchy,
3992 arg_userns_mode != USER_NAMESPACE_NO,
3993 arg_uid_shift,
3994 arg_uid_range,
5a8ff0e6 3995 arg_selinux_apifs_context,
ada54120 3996 false);
0996ef00
CB
3997 if (r < 0)
3998 return r;
3999 }
03cfe0d5 4000
57c10a56
CB
4001 /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
4002 * mounts available in systemd services inside the container that create a new mount namespace. See
4003 * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
4004 * will inherit the shared propagation mode.
4005 *
4006 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
4007 * directory mount to root later on.
4008 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
4009 */
9d50f850 4010 r = mount_switch_root(directory, MS_SHARED);
03cfe0d5
LP
4011 if (r < 0)
4012 return log_error_errno(r, "Failed to move root directory: %m");
4013
e79581dd
CB
4014 /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
4015 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
4016 * the container. */
4017 r = mount_tunnel_open();
4018 if (r < 0)
4019 return r;
4020
b71a0192
CB
4021 if (arg_userns_mode != USER_NAMESPACE_NO) {
4022 /* In order to mount procfs and sysfs in an unprivileged container the kernel
4023 * requires that a fully visible instance is already present in the target mount
4024 * namespace. Mount one here so the inner child can mount its own instances. Later
4025 * we umount the temporary instances created here before we actually exec the
4026 * payload. Since the rootfs is shared the umount will propagate into the container.
4027 * Note, the inner child wouldn't be able to unmount the instances on its own since
4028 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
4029 * this. */
4030 r = pin_fully_visible_fs();
4031 if (r < 0)
4032 return r;
4033 }
4034
e96ceaba 4035 fd = setup_notify_child();
9c1e04d0
AP
4036 if (fd < 0)
4037 return fd;
4038
03cfe0d5 4039 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 4040 arg_clone_ns_flags |
8869a0b4 4041 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
4042 if (pid < 0)
4043 return log_error_errno(errno, "Failed to fork inner child: %m");
/* pid == 0: we are the freshly forked inner child. It drops the socket to the parent, optionally
 * joins the network namespace referenced by netns_fd, and then runs inner_child(). */
03cfe0d5 4044 if (pid == 0) {
af06cd30 4045 fd_outer_socket = safe_close(fd_outer_socket);
03cfe0d5 4046
2a2e78e9
LP
4047 /* The inner child has all namespaces that are requested, so that we all are owned by the
4048 * user if user namespaces are turned on. */
03cfe0d5 4049
d7bea6b6
DP
4050 if (arg_network_namespace_path) {
4051 r = namespace_enter(-1, -1, netns_fd, -1, -1);
4052 if (r < 0)
e2d39e54 4053 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
4054 }
4055
11875a98 4056 r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs);
03cfe0d5
LP
4057 if (r < 0)
4058 _exit(EXIT_FAILURE);
4059
4060 _exit(EXIT_SUCCESS);
4061 }
4062
/* Still in the outer child: report the inner child's PID, the machine ID (arg_uuid) and the notify
 * socket fd over fd_outer_socket, in that order. */
af06cd30 4063 l = send(fd_outer_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
03cfe0d5
LP
4064 if (l < 0)
4065 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
4066 if (l != sizeof(pid))
4067 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4068 "Short write while sending PID.");
03cfe0d5 4069
af06cd30 4070 l = send(fd_outer_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
e01ff70a
MS
4071 if (l < 0)
4072 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
4073 if (l != sizeof(arg_uuid))
4074 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4075 "Short write while sending machine ID.");
e01ff70a 4076
af06cd30 4077 l = send_one_fd(fd_outer_socket, fd, 0);
9c1e04d0 4078 if (l < 0)
ba72801d 4079 return log_error_errno(l, "Failed to send notify fd: %m");
9c1e04d0 4080
/* Everything has been handed over; close this process's copies of the sockets and the netns fd. */
af06cd30 4081 fd_outer_socket = safe_close(fd_outer_socket);
5d9d3fcb 4082 fd_inner_socket = safe_close(fd_inner_socket);
d7bea6b6 4083 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
4084
4085 return 0;
4086}
4087
0e7ac751 4088static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 4089 bool tried_hashed = false;
0e7ac751
LP
4090 unsigned n_tries = 100;
4091 uid_t candidate;
4092 int r;
4093
4094 assert(shift);
4095 assert(ret_lock_file);
0de7acce 4096 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
4097 assert(arg_uid_range == 0x10000U);
4098
4099 candidate = *shift;
4100
4101 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4102
4103 for (;;) {
fbd0b64f 4104 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 4105 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
4106
4107 if (--n_tries <= 0)
4108 return -EBUSY;
4109
87d5e4f2 4110 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
4111 goto next;
4112 if ((candidate & UINT32_C(0xFFFF)) != 0)
4113 goto next;
4114
4115 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4116 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4117 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4118 goto next;
4119 if (r < 0)
4120 return r;
4121
4122 /* Make some superficial checks whether the range is currently known in the user database */
4123 if (getpwuid(candidate))
4124 goto next;
4125 if (getpwuid(candidate + UINT32_C(0xFFFE)))
4126 goto next;
4127 if (getgrgid(candidate))
4128 goto next;
4129 if (getgrgid(candidate + UINT32_C(0xFFFE)))
4130 goto next;
4131
4132 *ret_lock_file = lf;
4133 lf = (struct LockFile) LOCK_FILE_INIT;
4134 *shift = candidate;
4135 return 0;
4136
4137 next:
d381c8a6
LP
4138 if (arg_machine && !tried_hashed) {
4139 /* Try to hash the base from the container name */
4140
4141 static const uint8_t hash_key[] = {
4142 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4143 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4144 };
4145
4146 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4147
4148 tried_hashed = true;
4149 } else
4150 random_bytes(&candidate, sizeof(candidate));
4151
87d5e4f2 4152 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
4153 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4154 }
4155}
4156
2f893044
LP
4157static int add_one_uid_map(
4158 char **p,
4159 uid_t container_uid,
4160 uid_t host_uid,
4161 uid_t range) {
4162
4163 return strextendf(p,
4164 UID_FMT " " UID_FMT " " UID_FMT "\n",
4165 container_uid, host_uid, range);
4166}
4167
4168static int make_uid_map_string(
4169 const uid_t bind_user_uid[],
4170 size_t n_bind_user_uid,
4171 size_t offset,
4172 char **ret) {
4173
4174 _cleanup_free_ char *s = NULL;
4175 uid_t previous_uid = 0;
4176 int r;
4177
4178 assert(n_bind_user_uid == 0 || bind_user_uid);
2f092762 4179 assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
2f893044
LP
4180 assert(ret);
4181
4182 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4183 * quadruplet, consisting of host and container UID + GID. */
4184
4185 for (size_t i = 0; i < n_bind_user_uid; i++) {
05ab439a
YW
4186 uid_t payload_uid = bind_user_uid[i*4+offset],
4187 host_uid = bind_user_uid[i*4+offset+1];
2f893044
LP
4188
4189 assert(previous_uid <= payload_uid);
4190 assert(payload_uid < arg_uid_range);
4191
4192 /* Add a range to close the gap to previous entry */
4193 if (payload_uid > previous_uid) {
4194 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4195 if (r < 0)
4196 return r;
4197 }
4198
4199 /* Map this specific user */
4200 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4201 if (r < 0)
4202 return r;
4203
4204 previous_uid = payload_uid + 1;
4205 }
4206
4207 /* And add a range to close the gap to finish the range */
4208 if (arg_uid_range > previous_uid) {
4209 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4210 if (r < 0)
4211 return r;
4212 }
4213
4214 assert(s);
4215
4216 *ret = TAKE_PTR(s);
4217 return 0;
4218}
4219
4220static int setup_uid_map(
4221 pid_t pid,
4222 const uid_t bind_user_uid[],
4223 size_t n_bind_user_uid) {
4224
4225 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4226 _cleanup_free_ char *s = NULL;
03cfe0d5
LP
4227 int r;
4228
4229 assert(pid > 1);
4230
2f893044
LP
4231 /* Build the UID map string */
4232 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4233 return log_oom();
4234
03cfe0d5 4235 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2f893044 4236 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4237 if (r < 0)
4238 return log_error_errno(r, "Failed to write UID map: %m");
4239
2f893044
LP
4240 /* And now build the GID map string */
4241 s = mfree(s);
4242 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4243 return log_oom();
4244
03cfe0d5 4245 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2f893044 4246 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4247 if (r < 0)
4248 return log_error_errno(r, "Failed to write GID map: %m");
4249
4250 return 0;
4251}
4252
9c1e04d0 4253static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
4254 char buf[NOTIFY_BUFFER_MAX+1];
4255 char *p = NULL;
4256 struct iovec iovec = {
4257 .iov_base = buf,
4258 .iov_len = sizeof(buf)-1,
4259 };
fb29cdbe
LP
4260 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4261 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
9c1e04d0
AP
4262 struct msghdr msghdr = {
4263 .msg_iov = &iovec,
4264 .msg_iovlen = 1,
4265 .msg_control = &control,
4266 .msg_controllen = sizeof(control),
4267 };
371d72e0 4268 struct ucred *ucred;
9c1e04d0
AP
4269 ssize_t n;
4270 pid_t inner_child_pid;
4271 _cleanup_strv_free_ char **tags = NULL;
4bf4f50f 4272 int r;
9c1e04d0
AP
4273
4274 assert(userdata);
4275
4276 inner_child_pid = PTR_TO_PID(userdata);
4277
4278 if (revents != EPOLLIN) {
4279 log_warning("Got unexpected poll event for notify fd.");
4280 return 0;
4281 }
4282
3691bcf3 4283 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
8add30a0
YW
4284 if (n < 0) {
4285 if (ERRNO_IS_TRANSIENT(n))
4286 return 0;
4287 if (n == -EXFULL) {
4288 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4289 return 0;
4290 }
3691bcf3 4291 return log_warning_errno(n, "Couldn't read notification socket: %m");
8add30a0 4292 }
9c1e04d0 4293
9c1e04d0
AP
4294 cmsg_close_all(&msghdr);
4295
371d72e0 4296 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
9c1e04d0 4297 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 4298 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
4299 return 0;
4300 }
4301
4302 if ((size_t) n >= sizeof(buf)) {
4303 log_warning("Received notify message exceeded maximum size. Ignoring.");
4304 return 0;
4305 }
4306
4307 buf[n] = 0;
4308 tags = strv_split(buf, "\n\r");
4309 if (!tags)
4310 return log_oom();
4311
d29cc4d6 4312 if (strv_contains(tags, "READY=1")) {
d4341b76 4313 r = sd_notify(false, "READY=1\n");
4bf4f50f
ZJS
4314 if (r < 0)
4315 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
4316 }
9c1e04d0
AP
4317
4318 p = strv_find_startswith(tags, "STATUS=");
4319 if (p)
04f590a4 4320 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
4321
4322 return 0;
4323}
4324
e96ceaba 4325static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 4326 int r;
9c1e04d0 4327
5773024d 4328 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
4329 if (r < 0)
4330 return log_error_errno(r, "Failed to allocate notify event source: %m");
4331
5773024d 4332 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
4333
4334 return 0;
4335}
4336
5d961407
LP
4337static int merge_settings(Settings *settings, const char *path) {
4338 int rl;
f757855e 4339
5d961407
LP
4340 assert(settings);
4341 assert(path);
f757855e 4342
5d961407
LP
4343 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4344 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 4345
7732f92b
LP
4346 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4347 settings->start_mode >= 0) {
4348 arg_start_mode = settings->start_mode;
130d3d22 4349 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
4350 }
4351
d3689b94
LP
4352 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
4353 settings->ephemeral >= 0)
a2f577fc
JL
4354 arg_ephemeral = settings->ephemeral;
4355
de40a303
LP
4356 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4357 settings->root) {
4358
4359 if (!arg_settings_trusted)
4360 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4361 else
4362 free_and_replace(arg_directory, settings->root);
4363 }
4364
b53ede69
PW
4365 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4366 settings->pivot_root_new) {
4367 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4368 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4369 }
4370
5f932eb9 4371 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
4372 settings->working_directory)
4373 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 4374
f757855e 4375 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
4376 settings->environment)
4377 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 4378
de40a303
LP
4379 if ((arg_settings_mask & SETTING_USER) == 0) {
4380
4381 if (settings->user)
4382 free_and_replace(arg_user, settings->user);
4383
4384 if (uid_is_valid(settings->uid))
4385 arg_uid = settings->uid;
4386 if (gid_is_valid(settings->gid))
4387 arg_gid = settings->gid;
4388 if (settings->n_supplementary_gids > 0) {
4389 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4390 arg_n_supplementary_gids = settings->n_supplementary_gids;
4391 }
4392 }
f757855e
LP
4393
4394 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 4395 uint64_t plus, minus;
7be830c6 4396 uint64_t network_minus = 0;
88fc9c9b 4397 uint64_t ambient;
f757855e 4398
de40a303
LP
4399 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4400 * Settings structure */
4401
0e265674 4402 plus = settings->capability;
a3fc6b55
LP
4403 minus = settings->drop_capability;
4404
9baa294c
LP
4405 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4406 settings_network_configured(settings)) {
a3fc6b55
LP
4407 if (settings_private_network(settings))
4408 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4409 else
7be830c6 4410 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 4411 }
0e265674
LP
4412
4413 if (!arg_settings_trusted && plus != 0) {
4414 if (settings->capability != 0)
5d961407 4415 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
4416 } else {
4417 arg_caps_retain &= ~network_minus;
520e0d54 4418 arg_caps_retain |= plus;
7be830c6 4419 }
f757855e 4420
a3fc6b55 4421 arg_caps_retain &= ~minus;
de40a303
LP
4422
4423 /* Copy the full capabilities over too */
4424 if (capability_quintet_is_set(&settings->full_capabilities)) {
4425 if (!arg_settings_trusted)
5238e957 4426 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
4427 else
4428 arg_full_capabilities = settings->full_capabilities;
4429 }
88fc9c9b
TH
4430
4431 ambient = settings->ambient_capability;
4432 if (!arg_settings_trusted && ambient != 0)
4433 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4434 else
4435 arg_caps_ambient |= ambient;
f757855e
LP
4436 }
4437
4438 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4439 settings->kill_signal > 0)
4440 arg_kill_signal = settings->kill_signal;
4441
4442 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4443 settings->personality != PERSONALITY_INVALID)
4444 arg_personality = settings->personality;
4445
4446 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4447 !sd_id128_is_null(settings->machine_id)) {
4448
4449 if (!arg_settings_trusted)
5d961407 4450 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
4451 else
4452 arg_uuid = settings->machine_id;
4453 }
4454
4455 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4456 settings->read_only >= 0)
4457 arg_read_only = settings->read_only;
4458
4459 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4460 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4461 arg_volatile_mode = settings->volatile_mode;
4462
4463 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4464 settings->n_custom_mounts > 0) {
4465
4466 if (!arg_settings_trusted)
5d961407 4467 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
4468 else {
4469 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 4470 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 4471 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
4472 settings->n_custom_mounts = 0;
4473 }
4474 }
4475
4476 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
a1dfd585 4477 settings_network_configured(settings)) {
f757855e
LP
4478
4479 if (!arg_settings_trusted)
5d961407 4480 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 4481 else {
f6d6bad1 4482 arg_network_veth = settings_network_veth(settings);
0e265674
LP
4483 arg_private_network = settings_private_network(settings);
4484
130d3d22
YW
4485 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4486 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4487 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4488 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 4489
1cc6c93a
YW
4490 free_and_replace(arg_network_bridge, settings->network_bridge);
4491 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
4492
4493 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
4494 }
4495 }
4496
4497 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4498 settings->expose_ports) {
4499
4500 if (!arg_settings_trusted)
5d961407 4501 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
4502 else {
4503 expose_port_free_all(arg_expose_ports);
1cc6c93a 4504 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
4505 }
4506 }
4507
0de7acce
LP
4508 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4509 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4510
4511 if (!arg_settings_trusted)
5d961407 4512 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
4513 else {
4514 arg_userns_mode = settings->userns_mode;
4515 arg_uid_shift = settings->uid_shift;
4516 arg_uid_range = settings->uid_range;
6c045a99 4517 arg_userns_ownership = settings->userns_ownership;
0de7acce
LP
4518 }
4519 }
4520
0cc3c9f9
LP
4521 if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
4522 !strv_isempty(settings->bind_user))
2f893044
LP
4523 strv_free_and_replace(arg_bind_user, settings->bind_user);
4524
d3689b94
LP
4525 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
4526 settings->notify_ready >= 0)
9c1e04d0
AP
4527 arg_notify_ready = settings->notify_ready;
4528
960e4569
LP
4529 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4530
2d09ea44
LP
4531 if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
4532 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4533 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4534 else {
4535 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4536 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4537 }
960e4569 4538 }
de40a303
LP
4539
4540#if HAVE_SECCOMP
2d09ea44
LP
4541 if (settings->seccomp) {
4542 if (!arg_settings_trusted)
4543 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4544 else {
4545 seccomp_release(arg_seccomp);
4546 arg_seccomp = TAKE_PTR(settings->seccomp);
4547 }
de40a303
LP
4548 }
4549#endif
960e4569
LP
4550 }
4551
bf428efb
LP
4552 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4553 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4554 continue;
4555
4556 if (!settings->rlimit[rl])
4557 continue;
4558
4559 if (!arg_settings_trusted) {
5d961407 4560 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
4561 continue;
4562 }
4563
4564 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4565 }
4566
3a9530e5
LP
4567 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4568 settings->hostname)
4569 free_and_replace(arg_hostname, settings->hostname);
4570
66edd963
LP
4571 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4572 settings->no_new_privileges >= 0)
4573 arg_no_new_privileges = settings->no_new_privileges;
4574
81f345df
LP
4575 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4576 settings->oom_score_adjust_set) {
4577
4578 if (!arg_settings_trusted)
5d961407 4579 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
4580 else {
4581 arg_oom_score_adjust = settings->oom_score_adjust;
4582 arg_oom_score_adjust_set = true;
4583 }
4584 }
4585
d107bb7d 4586 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 4587 settings->cpu_set.set) {
d107bb7d
LP
4588
4589 if (!arg_settings_trusted)
5d961407 4590 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 4591 else {
0985c7c4
ZJS
4592 cpu_set_reset(&arg_cpu_set);
4593 arg_cpu_set = settings->cpu_set;
4594 settings->cpu_set = (CPUSet) {};
d107bb7d
LP
4595 }
4596 }
4597
09d423e9
LP
4598 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4599 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4600 arg_resolv_conf = settings->resolv_conf;
4601
4e1d6aa9
LP
4602 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4603 settings->link_journal != _LINK_JOURNAL_INVALID) {
4604
4605 if (!arg_settings_trusted)
4606 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4607 else {
4608 arg_link_journal = settings->link_journal;
4609 arg_link_journal_try = settings->link_journal_try;
4610 }
4611 }
4612
1688841f
LP
4613 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4614 settings->timezone != _TIMEZONE_MODE_INVALID)
4615 arg_timezone = settings->timezone;
4616
de40a303
LP
4617 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4618 settings->slice) {
4619
4620 if (!arg_settings_trusted)
4621 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4622 else
4623 free_and_replace(arg_slice, settings->slice);
4624 }
4625
4626 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4627 settings->use_cgns >= 0) {
4628
4629 if (!arg_settings_trusted)
4630 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4631 else
4632 arg_use_cgns = settings->use_cgns;
4633 }
4634
4635 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
f5fbe71d 4636 settings->clone_ns_flags != ULONG_MAX) {
de40a303
LP
4637
4638 if (!arg_settings_trusted)
4639 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4640 else
4641 arg_clone_ns_flags = settings->clone_ns_flags;
4642 }
4643
4644 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4645 settings->console_mode >= 0) {
4646
4647 if (!arg_settings_trusted)
4648 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4649 else
4650 arg_console_mode = settings->console_mode;
4651 }
4652
d3689b94
LP
4653 if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
4654 settings->suppress_sync >= 0)
4a4654e0
LP
4655 arg_suppress_sync = settings->suppress_sync;
4656
de40a303
LP
4657 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4658 * don't consult arg_settings_mask for them. */
4659
4660 sd_bus_message_unref(arg_property_message);
4661 arg_property_message = TAKE_PTR(settings->properties);
4662
4663 arg_console_width = settings->console_width;
4664 arg_console_height = settings->console_height;
4665
b2645747 4666 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4667 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4668 arg_n_extra_nodes = settings->n_extra_nodes;
4669
f757855e
LP
4670 return 0;
4671}
4672
5d961407
LP
4673static int load_settings(void) {
4674 _cleanup_(settings_freep) Settings *settings = NULL;
4675 _cleanup_fclose_ FILE *f = NULL;
3603f151 4676 _cleanup_free_ char *p = NULL;
5d961407
LP
4677 int r;
4678
de40a303
LP
4679 if (arg_oci_bundle)
4680 return 0;
4681
5d961407
LP
4682 /* If all settings are masked, there's no point in looking for
4683 * the settings file */
d7a0f1f4 4684 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
5d961407
LP
4685 return 0;
4686
5d961407
LP
4687 /* We first look in the admin's directories in /etc and /run */
4688 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4689 _cleanup_free_ char *j = NULL;
4690
3603f151 4691 j = path_join(i, arg_settings_filename);
5d961407
LP
4692 if (!j)
4693 return log_oom();
4694
4695 f = fopen(j, "re");
4696 if (f) {
4697 p = TAKE_PTR(j);
4698
4699 /* By default, we trust configuration from /etc and /run */
4700 if (arg_settings_trusted < 0)
4701 arg_settings_trusted = true;
4702
4703 break;
4704 }
4705
4706 if (errno != ENOENT)
4707 return log_error_errno(errno, "Failed to open %s: %m", j);
4708 }
4709
4710 if (!f) {
4711 /* After that, let's look for a file next to the
4712 * actual image we shall boot. */
4713
4714 if (arg_image) {
162f6477
LP
4715 r = file_in_same_dir(arg_image, arg_settings_filename, &p);
4716 if (r < 0)
4717 return log_error_errno(r, "Failed to generate settings path from image path: %m");
4718 } else if (arg_directory) {
4719 r = file_in_same_dir(arg_directory, arg_settings_filename, &p);
4720 if (r < 0 && r != -EADDRNOTAVAIL) /* if directory is root fs, don't complain */
4721 return log_error_errno(r, "Failed to generate settings path from directory path: %m");
5d961407
LP
4722 }
4723
4724 if (p) {
4725 f = fopen(p, "re");
4726 if (!f && errno != ENOENT)
4727 return log_error_errno(errno, "Failed to open %s: %m", p);
4728
4729 /* By default, we do not trust configuration from /var/lib/machines */
4730 if (arg_settings_trusted < 0)
4731 arg_settings_trusted = false;
4732 }
4733 }
4734
4735 if (!f)
4736 return 0;
4737
4738 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4739
4740 r = settings_load(f, p, &settings);
4741 if (r < 0)
4742 return r;
4743
4744 return merge_settings(settings, p);
4745}
4746
de40a303
LP
4747static int load_oci_bundle(void) {
4748 _cleanup_(settings_freep) Settings *settings = NULL;
4749 int r;
4750
4751 if (!arg_oci_bundle)
4752 return 0;
4753
4754 /* By default let's trust OCI bundles */
4755 if (arg_settings_trusted < 0)
4756 arg_settings_trusted = true;
4757
4758 r = oci_load(NULL, arg_oci_bundle, &settings);
4759 if (r < 0)
4760 return r;
4761
4762 return merge_settings(settings, arg_oci_bundle);
4763}
4764
3acc84eb 4765static int run_container(
2d845785 4766 DissectedImage *dissected_image,
b0067625
ZJS
4767 FDSet *fds,
4768 char veth_name[IFNAMSIZ], bool *veth_created,
761cf19d 4769 struct ExposeArgs *expose_args,
3acc84eb 4770 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4771
4772 static const struct sigaction sa = {
4773 .sa_handler = nop_signal_handler,
e28c7cd0 4774 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4775 };
4776
8e766630 4777 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
5bb1d7fb 4778 _cleanup_close_ int etc_passwd_lock = -EBADF;
b0067625 4779 _cleanup_close_pair_ int
19ee48a6
YW
4780 fd_inner_socket_pair[2] = PIPE_EBADF,
4781 fd_outer_socket_pair[2] = PIPE_EBADF;
8199d554 4782
5bb1d7fb 4783 _cleanup_close_ int notify_socket = -EBADF, mntns_fd = -EBADF, fd_kmsg_fifo = -EBADF;
b0067625 4784 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4785 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4786 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4787 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4788 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4789 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2f893044
LP
4790 _cleanup_free_ uid_t *bind_user_uid = NULL;
4791 size_t n_bind_user_uid = 0;
b0067625 4792 ContainerStatus container_status = 0;
b0067625
ZJS
4793 int ifi = 0, r;
4794 ssize_t l;
4795 sigset_t mask_chld;
254d1313 4796 _cleanup_close_ int child_netns_fd = -EBADF;
b0067625
ZJS
4797
4798 assert_se(sigemptyset(&mask_chld) == 0);
4799 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4800
4801 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4802 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4803 * check with getpwuid() if the specific user already exists. Note that /etc might be
4804 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4805 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4806 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4807 * really ours. */
4808
4809 etc_passwd_lock = take_etc_passwd_lock(NULL);
4810 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4811 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4812 }
4813
4814 r = barrier_create(&barrier);
4815 if (r < 0)
4816 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4817
5d9d3fcb
CB
4818 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_inner_socket_pair) < 0)
4819 return log_error_errno(errno, "Failed to create inner socket pair: %m");
4820
af06cd30
CB
4821 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_outer_socket_pair) < 0)
4822 return log_error_errno(errno, "Failed to create outer socket pair: %m");
b0067625 4823
b0067625
ZJS
4824 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4825 * parent's blocking calls and give it a chance to call wait() and terminate. */
4826 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4827 if (r < 0)
4828 return log_error_errno(errno, "Failed to change the signal mask: %m");
4829
4830 r = sigaction(SIGCHLD, &sa, NULL);
4831 if (r < 0)
4832 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4833
d7bea6b6 4834 if (arg_network_namespace_path) {
5b4855ab
DDM
4835 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4836 if (child_netns_fd < 0)
d7bea6b6
DP
4837 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4838
54c2459d 4839 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
6619ad88
LP
4840 if (r == -EUCLEAN)
4841 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4842 else if (r < 0)
d7bea6b6 4843 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4844 else if (r == 0)
4845 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4846 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4847 }
4848
b0067625
ZJS
4849 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4850 if (*pid < 0)
4851 return log_error_errno(errno, "clone() failed%s: %m",
4852 errno == EINVAL ?
4853 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4854
4855 if (*pid == 0) {
4856 /* The outer child only has a file system namespace. */
4857 barrier_set_role(&barrier, BARRIER_CHILD);
4858
5d9d3fcb 4859 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
af06cd30 4860 fd_outer_socket_pair[0] = safe_close(fd_outer_socket_pair[0]);
b0067625
ZJS
4861
4862 (void) reset_all_signal_handlers();
4863 (void) reset_signal_mask();
4864
4865 r = outer_child(&barrier,
4866 arg_directory,
2d845785 4867 dissected_image,
af06cd30 4868 fd_outer_socket_pair[1],
5d9d3fcb 4869 fd_inner_socket_pair[1],
d7bea6b6 4870 fds,
5b4855ab 4871 child_netns_fd);
b0067625
ZJS
4872 if (r < 0)
4873 _exit(EXIT_FAILURE);
4874
4875 _exit(EXIT_SUCCESS);
4876 }
4877
4878 barrier_set_role(&barrier, BARRIER_PARENT);
4879
e4077ff6 4880 fdset_close(fds);
b0067625 4881
5d9d3fcb 4882 fd_inner_socket_pair[1] = safe_close(fd_inner_socket_pair[1]);
af06cd30 4883 fd_outer_socket_pair[1] = safe_close(fd_outer_socket_pair[1]);
b0067625
ZJS
4884
4885 if (arg_userns_mode != USER_NAMESPACE_NO) {
af06cd30 4886 mntns_fd = receive_one_fd(fd_outer_socket_pair[0], 0);
b71a0192
CB
4887 if (mntns_fd < 0)
4888 return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m");
4889
b0067625 4890 /* The child just let us know the UID shift it might have read from the image. */
af06cd30 4891 l = recv(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
b0067625
ZJS
4892 if (l < 0)
4893 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4894 if (l != sizeof arg_uid_shift)
4895 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4896
4897 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4898 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4899 * image, but if that's already in use, pick a new one, and report back to the child,
4900 * which one we now picked. */
4901
4902 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4903 if (r < 0)
4904 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4905
af06cd30 4906 l = send(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
b0067625
ZJS
4907 if (l < 0)
4908 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4909 if (l != sizeof arg_uid_shift)
4910 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625 4911 }
2f893044
LP
4912
4913 n_bind_user_uid = strv_length(arg_bind_user);
4914 if (n_bind_user_uid > 0) {
4915 /* Right after the UID shift, we'll receive the list of UID mappings for the
4916 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4917
4918 bind_user_uid = new(uid_t, n_bind_user_uid*4);
4919 if (!bind_user_uid)
4920 return log_oom();
4921
4922 for (size_t i = 0; i < n_bind_user_uid; i++) {
af06cd30 4923 l = recv(fd_outer_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
2f893044
LP
4924 if (l < 0)
4925 return log_error_errno(errno, "Failed to read user UID map pair: %m");
4926 if (l != sizeof(uid_t)*4)
4927 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4928 SYNTHETIC_ERRNO(EIO),
4929 "Short read while reading bind user UID pairs.");
4930 }
4931 }
b0067625
ZJS
4932 }
4933
8199d554
LP
4934 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4935 /* The child let us know the support cgroup mode it might have read from the image. */
fefb7a6d 4936 l = recv(fd_outer_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
8199d554
LP
4937 if (l < 0)
4938 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113 4939 if (l != sizeof(arg_unified_cgroup_hierarchy))
c0f86d66 4940 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zi bytes).%s",
c6147113 4941 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4942 }
4943
b0067625 4944 /* Wait for the outer child. */
d2e0ac3d
LP
4945 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4946 if (r < 0)
4947 return r;
4948 if (r != EXIT_SUCCESS)
4949 return -EIO;
b0067625
ZJS
4950
4951 /* And now retrieve the PID of the inner child. */
af06cd30 4952 l = recv(fd_outer_socket_pair[0], pid, sizeof *pid, 0);
b0067625
ZJS
4953 if (l < 0)
4954 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4955 if (l != sizeof *pid)
4956 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4957
4958 /* We also retrieve container UUID in case it was generated by outer child */
af06cd30 4959 l = recv(fd_outer_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
b0067625
ZJS
4960 if (l < 0)
4961 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4962 if (l != sizeof(arg_uuid))
4963 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4964
4965 /* We also retrieve the socket used for notifications generated by outer child */
af06cd30 4966 notify_socket = receive_one_fd(fd_outer_socket_pair[0], 0);
b0067625
ZJS
4967 if (notify_socket < 0)
4968 return log_error_errno(notify_socket,
4969 "Failed to receive notification socket from the outer child: %m");
4970
4971 log_debug("Init process invoked as PID "PID_FMT, *pid);
4972
4973 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4974 if (!barrier_place_and_sync(&barrier)) /* #1 */
4975 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 4976
2f893044 4977 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
b0067625
ZJS
4978 if (r < 0)
4979 return r;
4980
4981 (void) barrier_place(&barrier); /* #2 */
4982 }
4983
4984 if (arg_private_network) {
75116558
PS
4985 if (!arg_network_namespace_path) {
4986 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4987 if (!barrier_place_and_sync(&barrier)) /* #3 */
4988 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4989 }
4990
5b4855ab
DDM
4991 if (child_netns_fd < 0) {
4992 /* Make sure we have an open file descriptor to the child's network
4993 * namespace so it stays alive even if the child exits. */
4994 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4995 if (r < 0)
4996 return log_error_errno(r, "Failed to open child network namespace: %m");
4997 }
4998
4999 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
b0067625
ZJS
5000 if (r < 0)
5001 return r;
5002
5003 if (arg_network_veth) {
5004 r = setup_veth(arg_machine, *pid, veth_name,
5005 arg_network_bridge || arg_network_zone);
5006 if (r < 0)
5007 return r;
5008 else if (r > 0)
5009 ifi = r;
5010
5011 if (arg_network_bridge) {
5012 /* Add the interface to a bridge */
5013 r = setup_bridge(veth_name, arg_network_bridge, false);
5014 if (r < 0)
5015 return r;
5016 if (r > 0)
5017 ifi = r;
5018 } else if (arg_network_zone) {
5019 /* Add the interface to a bridge, possibly creating it */
5020 r = setup_bridge(veth_name, arg_network_zone, true);
5021 if (r < 0)
5022 return r;
5023 if (r > 0)
5024 ifi = r;
5025 }
5026 }
5027
5028 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
5029 if (r < 0)
5030 return r;
5031
5032 /* We created the primary and extra veth links now; let's remember this, so that we know to
5033 remove them later on. Note that we don't bother with removing veth links that were created
5034 here when their setup failed half-way, because in that case the kernel should be able to
5035 remove them on its own, since they cannot be referenced by anything yet. */
5036 *veth_created = true;
5037
5038 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
5039 if (r < 0)
5040 return r;
5041
5042 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
5043 if (r < 0)
5044 return r;
5045 }
5046
abdb9b08
LP
5047 if (arg_register || !arg_keep_unit) {
5048 r = sd_bus_default_system(&bus);
5049 if (r < 0)
5050 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
5051
5052 r = sd_bus_set_close_on_exit(bus, false);
5053 if (r < 0)
5054 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
5055 }
5056
5057 if (!arg_keep_unit) {
5058 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5059 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5060 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5061
75152a4d
LP
5062 r = sd_bus_match_signal_async(
5063 bus,
5064 NULL,
5065 "org.freedesktop.systemd1",
5066 NULL,
5067 "org.freedesktop.systemd1.Scope",
5068 "RequestStop",
5069 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 5070 if (r < 0)
75152a4d 5071 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
5072 }
5073
b0067625
ZJS
5074 if (arg_register) {
5075 r = register_machine(
abdb9b08 5076 bus,
b0067625
ZJS
5077 arg_machine,
5078 *pid,
5079 arg_directory,
5080 arg_uuid,
5081 ifi,
5082 arg_slice,
5083 arg_custom_mounts, arg_n_custom_mounts,
5084 arg_kill_signal,
5085 arg_property,
de40a303 5086 arg_property_message,
b0067625
ZJS
5087 arg_keep_unit,
5088 arg_container_service_name);
5089 if (r < 0)
5090 return r;
abdb9b08 5091
cd2dfc6f
LP
5092 } else if (!arg_keep_unit) {
5093 r = allocate_scope(
abdb9b08 5094 bus,
cd2dfc6f
LP
5095 arg_machine,
5096 *pid,
5097 arg_slice,
5098 arg_custom_mounts, arg_n_custom_mounts,
5099 arg_kill_signal,
de40a303
LP
5100 arg_property,
5101 arg_property_message);
cd2dfc6f
LP
5102 if (r < 0)
5103 return r;
5104
5105 } else if (arg_slice || arg_property)
5106 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 5107
27da7ef0 5108 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
5109 if (r < 0)
5110 return r;
5111
27da7ef0 5112 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
5113 if (r < 0)
5114 return r;
b0067625 5115
de54e02d 5116 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
5117 if (r < 0)
5118 return r;
5119
5120 /* Notify the child that the parent is ready with all
5121 * its setup (including cgroup-ification), and that
5122 * the child can now hand over control to the code to
5123 * run inside the container. */
75116558 5124 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
5125
5126 /* Block SIGCHLD here, before notifying child.
5127 * process_pty() will handle it with the other signals. */
5128 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5129
5130 /* Reset signal to default */
9c274488 5131 r = default_signals(SIGCHLD);
b0067625
ZJS
5132 if (r < 0)
5133 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5134
5135 r = sd_event_new(&event);
5136 if (r < 0)
5137 return log_error_errno(r, "Failed to get default event source: %m");
5138
8fd010bb
LP
5139 (void) sd_event_set_watchdog(event, true);
5140
abdb9b08
LP
5141 if (bus) {
5142 r = sd_bus_attach_event(bus, event, 0);
5143 if (r < 0)
5144 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5145 }
5146
e96ceaba 5147 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
5148 if (r < 0)
5149 return r;
5150
b71a0192
CB
5151 if (arg_userns_mode != USER_NAMESPACE_NO) {
5152 r = wipe_fully_visible_fs(mntns_fd);
5153 if (r < 0)
5154 return r;
5155 mntns_fd = safe_close(mntns_fd);
5156 }
5157
b0067625 5158 /* Let the child know that we are ready and wait that the child is completely ready now. */
c6147113
LP
5159 if (!barrier_place_and_sync(&barrier)) /* #5 */
5160 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 5161
38ccb557 5162 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
b0067625
ZJS
5163 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5164 etc_passwd_lock = safe_close(etc_passwd_lock);
5165
04f590a4
LP
5166 (void) sd_notifyf(false,
5167 "STATUS=Container running.\n"
5168 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4bf4f50f
ZJS
5169 if (!arg_notify_ready) {
5170 r = sd_notify(false, "READY=1\n");
5171 if (r < 0)
5172 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5173 }
b0067625
ZJS
5174
5175 if (arg_kill_signal > 0) {
5176 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
5177 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5178 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
5179 } else {
5180 /* Immediately exit */
919f5ae0
LP
5181 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5182 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
5183 }
5184
988851b6
LP
5185 (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
5186
5187 r = sd_event_add_memory_pressure(event, NULL, NULL, NULL);
5188 if (r < 0)
5189 log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m");
5190
6916b164 5191 /* Exit when the child exits */
919f5ae0 5192 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625 5193
b07ee903
CB
5194 /* Retrieve the kmsg fifo allocated by inner child */
5195 fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0);
5196 if (fd_kmsg_fifo < 0)
5197 return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m");
5198
b0067625 5199 if (arg_expose_ports) {
b07ee903 5200 r = expose_port_watch_rtnl(event, fd_inner_socket_pair[0], on_address_change, expose_args, &rtnl);
b0067625
ZJS
5201 if (r < 0)
5202 return r;
5203
deff68e7
FW
5204 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5205 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5206 }
5207
3acc84eb 5208 if (arg_console_mode != CONSOLE_PIPE) {
254d1313 5209 _cleanup_close_ int fd = -EBADF;
3acc84eb 5210 PTYForwardFlags flags = 0;
de40a303 5211
3acc84eb 5212 /* Retrieve the master pty allocated by inner child */
bb1aa185 5213 fd = receive_one_fd(fd_inner_socket_pair[0], 0);
3acc84eb
FB
5214 if (fd < 0)
5215 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5216
5217 switch (arg_console_mode) {
de40a303 5218
3acc84eb
FB
5219 case CONSOLE_READ_ONLY:
5220 flags |= PTY_FORWARD_READ_ONLY;
5221
5222 _fallthrough_;
5223
5224 case CONSOLE_INTERACTIVE:
5225 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5226
5227 r = pty_forward_new(event, fd, flags, &forward);
5228 if (r < 0)
5229 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5230
f5fbe71d 5231 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
3acc84eb
FB
5232 (void) pty_forward_set_width_height(forward,
5233 arg_console_width,
5234 arg_console_height);
5235 break;
5236
5237 default:
5238 assert(arg_console_mode == CONSOLE_PASSIVE);
5239 }
5240
5241 *master = TAKE_FD(fd);
de40a303 5242 }
b0067625 5243
5d9d3fcb
CB
5244 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
5245
b0067625
ZJS
5246 r = sd_event_loop(event);
5247 if (r < 0)
5248 return log_error_errno(r, "Failed to run event loop: %m");
5249
de40a303
LP
5250 if (forward) {
5251 char last_char = 0;
b0067625 5252
de40a303
LP
5253 (void) pty_forward_get_last_char(forward, &last_char);
5254 forward = pty_forward_free(forward);
b0067625 5255
de40a303
LP
5256 if (!arg_quiet && last_char != '\n')
5257 putc('\n', stdout);
5258 }
b0067625
ZJS
5259
5260 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
5261 if (!arg_register && !arg_keep_unit && bus)
5262 terminate_scope(bus, arg_machine);
b0067625
ZJS
5263
5264 /* Normally redundant, but better safe than sorry */
c67b0082 5265 (void) kill(*pid, SIGKILL);
b0067625 5266
5d9d3fcb
CB
5267 fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
5268
5b4855ab
DDM
5269 if (arg_private_network) {
5270 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5271 * to avoid having to move the parent to the child network namespace. */
5272 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
5273 if (r < 0)
5274 return r;
5275
5276 if (r == 0) {
254d1313 5277 _cleanup_close_ int parent_netns_fd = -EBADF;
5b4855ab
DDM
5278
5279 r = namespace_open(getpid(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5280 if (r < 0) {
5281 log_error_errno(r, "Failed to open parent network namespace: %m");
5282 _exit(EXIT_FAILURE);
5283 }
5284
5285 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5286 if (r < 0) {
5287 log_error_errno(r, "Failed to enter child network namespace: %m");
5288 _exit(EXIT_FAILURE);
5289 }
5290
5291 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5292 if (r < 0)
5293 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5294
5295 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5296 }
5297 }
5298
8f03de53 5299 r = wait_for_container(TAKE_PID(*pid), &container_status);
b0067625 5300
0bb0a9fa
ZJS
5301 /* Tell machined that we are gone. */
5302 if (bus)
5303 (void) unregister_machine(bus, arg_machine);
5304
b0067625
ZJS
5305 if (r < 0)
5306 /* We failed to wait for the container, or the container exited abnormally. */
5307 return r;
5308 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
5309 /* r > 0 → The container exited with a non-zero status.
5310 * As a special case, we need to replace 133 with a different value,
5311 * because 133 is special-cased in the service file to reboot the container.
5312 * otherwise → The container exited with zero status and a reboot was not requested.
5313 */
2a49b612 5314 if (r == EXIT_FORCE_RESTART)
27e29a1e 5315 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 5316 *ret = r;
b0067625
ZJS
5317 return 0; /* finito */
5318 }
5319
5320 /* CONTAINER_REBOOTED, loop again */
5321
5322 if (arg_keep_unit) {
5323 /* Special handling if we are running as a service: instead of simply
5324 * restarting the machine we want to restart the entire service, so let's
5325 * inform systemd about this with the special exit code 133. The service
5326 * file uses RestartForceExitStatus=133 so that this results in a full
5327 * nspawn restart. This is necessary since we might have cgroup parameters
5328 * set we want to have flushed out. */
2a49b612
ZJS
5329 *ret = EXIT_FORCE_RESTART;
5330 return 0; /* finito */
b0067625
ZJS
5331 }
5332
deff68e7
FW
5333 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5334 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5335
5336 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5337 *veth_created = false;
5338 return 1; /* loop again */
5339}
5340
bf428efb 5341static int initialize_rlimits(void) {
852b6250 5342 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
bf428efb
LP
5343 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5344 * container execution environments. */
5345
5346 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
852b6250
LP
5347 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5348 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5349 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5350 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5351 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5352 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5353 [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5354 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5355 [RLIMIT_NICE] = { 0, 0 },
5356 [RLIMIT_NOFILE] = { 1024, 4096 },
5357 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5358 [RLIMIT_RTPRIO] = { 0, 0 },
5359 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5360 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
bf428efb
LP
5361
5362 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5363 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5364 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5365 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5366 * that PID 1 changes a number of other resource limits during early initialization which is why we
5367 * don't read the other limits from PID 1 but prefer the static table above. */
5368 };
5369
5370 int rl;
5371
5372 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
5373 /* Let's only fill in what the user hasn't explicitly configured anyway */
5374 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5375 const struct rlimit *v;
5376 struct rlimit buffer;
5377
5378 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5379 /* For these two let's read the limits off PID 1. See above for an explanation. */
5380
5381 if (prlimit(1, rl, NULL, &buffer) < 0)
5382 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5383
dbf1aca6
LP
5384 v = &buffer;
5385 } else if (rl == RLIMIT_NOFILE) {
5386 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5387 * userspace. Given that nspawn containers are often run without our PID 1,
5388 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5389 * so that container userspace gets similar resources as host userspace
5390 * gets. */
5391 buffer = kernel_defaults[rl];
5392 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
bf428efb
LP
5393 v = &buffer;
5394 } else
5395 v = kernel_defaults + rl;
5396
5397 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5398 if (!arg_rlimit[rl])
5399 return log_oom();
5400 }
5401
5402 if (DEBUG_LOGGING) {
5403 _cleanup_free_ char *k = NULL;
5404
5405 (void) rlimit_format(arg_rlimit[rl], &k);
5406 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5407 }
5408 }
5409
5410 return 0;
5411}
5412
287b7376 5413static int cant_be_in_netns(void) {
254d1313 5414 _cleanup_close_ int fd = -EBADF;
287b7376
LP
5415 struct ucred ucred;
5416 int r;
5417
5418 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5419 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5420 * nice message. */
5421
5422 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5423 return 0;
5424
5425 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5426 if (fd < 0)
5427 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5428
1861986a
LP
5429 r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control");
5430 if (r < 0) {
5431 if (r == -ENOENT || ERRNO_IS_DISCONNECT(r))
287b7376
LP
5432 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5433 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5434
1861986a 5435 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
287b7376
LP
5436 }
5437
5438 r = getpeercred(fd, &ucred);
5439 if (r < 0)
5440 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5441
f7a2dc3d 5442 r = in_same_namespace(ucred.pid, 0, NAMESPACE_NET);
287b7376 5443 if (r < 0)
f7a2dc3d
CB
5444 return log_error_errno(r, "Failed to determine network namespace of udev: %m");
5445 if (r == 0)
287b7376
LP
5446 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5447 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5448 return 0;
5449}
5450
44dbef90 5451static int run(int argc, char *argv[]) {
4c27749b 5452 bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
5bb1d7fb 5453 _cleanup_close_ int master = -EBADF;
03cfe0d5 5454 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 5455 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 5456 char veth_name[IFNAMSIZ] = "";
761cf19d 5457 struct ExposeArgs expose_args = {};
8e766630 5458 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082 5459 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 5460 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e 5461 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
761cf19d 5462 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
7bf011e3 5463 pid_t pid = 0;
03cfe0d5
LP
5464
5465 log_parse_environment();
5466 log_open();
415fc41c 5467
03cfe0d5
LP
5468 r = parse_argv(argc, argv);
5469 if (r <= 0)
5470 goto finish;
5471
38ee19c0
ZJS
5472 if (geteuid() != 0) {
5473 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5474 argc >= 2 ? "Need to be root." :
5475 "Need to be root (and some arguments are usually required).\nHint: try --help");
03cfe0d5 5476 goto finish;
38ee19c0 5477 }
fba868fa 5478
287b7376
LP
5479 r = cant_be_in_netns();
5480 if (r < 0)
5481 goto finish;
5482
bf428efb
LP
5483 r = initialize_rlimits();
5484 if (r < 0)
5485 goto finish;
5486
de40a303
LP
5487 r = load_oci_bundle();
5488 if (r < 0)
5489 goto finish;
5490
f757855e
LP
5491 r = determine_names();
5492 if (r < 0)
5493 goto finish;
5494
5495 r = load_settings();
5496 if (r < 0)
5497 goto finish;
5498
d4d99bc6 5499 r = cg_unified();
5eee8290
LP
5500 if (r < 0) {
5501 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5502 goto finish;
5503 }
5504
f757855e
LP
5505 r = verify_arguments();
5506 if (r < 0)
5507 goto finish;
03cfe0d5 5508
49048684
ZJS
5509 /* Reapply environment settings. */
5510 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 5511
2949ff26
LP
5512 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5513 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5514 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
9c274488 5515 (void) ignore_signals(SIGPIPE);
2949ff26 5516
03cfe0d5
LP
5517 n_fd_passed = sd_listen_fds(false);
5518 if (n_fd_passed > 0) {
5519 r = fdset_new_listen_fds(&fds, false);
5520 if (r < 0) {
5521 log_error_errno(r, "Failed to collect file descriptors: %m");
5522 goto finish;
5523 }
5524 }
5525
83e803a9
ZJS
5526 /* The "default" umask. This is appropriate for most file and directory
5527 * operations performed by nspawn, and is the umask that will be used for
5528 * the child. Functions like copy_devnodes() change the umask temporarily. */
5529 umask(0022);
5530
03cfe0d5
LP
5531 if (arg_directory) {
5532 assert(!arg_image);
5533
b35ca61a
LP
5534 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5535 * /var from the host will propagate into container dynamically (because bad things happen if
5536 * two systems write to the same /var). Let's allow it for the special cases where /var is
5537 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5538 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
1406bd66
LP
5539 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5540 "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
5541 goto finish;
5542 }
5543
5544 if (arg_ephemeral) {
5545 _cleanup_free_ char *np = NULL;
5546
f461a28d 5547 r = chase_and_update(&arg_directory, 0);
3f342ec4
LP
5548 if (r < 0)
5549 goto finish;
5550
7bf011e3
LP
5551 /* If the specified path is a mount point we generate the new snapshot immediately
5552 * inside it under a random name. However if the specified is not a mount point we
5553 * create the new snapshot in the parent directory, just next to it. */
e1873695 5554 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
5555 if (r < 0) {
5556 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5557 goto finish;
5558 }
5559 if (r > 0)
770b5ce4 5560 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 5561 else
770b5ce4 5562 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 5563 if (r < 0) {
0f3be6ca 5564 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
5565 goto finish;
5566 }
5567
6992459c 5568 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
162392b7 5569 * only owned by us and no one else. */
6992459c 5570 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
5571 if (r < 0) {
5572 log_error_errno(r, "Failed to lock %s: %m", np);
5573 goto finish;
5574 }
5575
7bf011e3
LP
5576 {
5577 BLOCK_SIGNALS(SIGINT);
5578 r = btrfs_subvol_snapshot(arg_directory, np,
5579 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5580 BTRFS_SNAPSHOT_FALLBACK_COPY |
5581 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5582 BTRFS_SNAPSHOT_RECURSIVE |
5583 BTRFS_SNAPSHOT_QUOTA |
5584 BTRFS_SNAPSHOT_SIGINT);
5585 }
5586 if (r == -EINTR) {
5587 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5588 goto finish;
5589 }
03cfe0d5
LP
5590 if (r < 0) {
5591 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5592 goto finish;
ec16945e
LP
5593 }
5594
1cc6c93a 5595 free_and_replace(arg_directory, np);
17cbb288 5596 remove_directory = true;
30535c16 5597 } else {
f461a28d 5598 r = chase_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
5599 if (r < 0)
5600 goto finish;
5601
30535c16
LP
5602 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5603 if (r == -EBUSY) {
5604 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5605 goto finish;
5606 }
5607 if (r < 0) {
5608 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 5609 goto finish;
30535c16
LP
5610 }
5611
5612 if (arg_template) {
f461a28d 5613 r = chase_and_update(&arg_template, 0);
3f342ec4
LP
5614 if (r < 0)
5615 goto finish;
5616
7bf011e3
LP
5617 {
5618 BLOCK_SIGNALS(SIGINT);
5619 r = btrfs_subvol_snapshot(arg_template, arg_directory,
5620 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5621 BTRFS_SNAPSHOT_FALLBACK_COPY |
5622 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5623 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5624 BTRFS_SNAPSHOT_RECURSIVE |
5625 BTRFS_SNAPSHOT_QUOTA |
5626 BTRFS_SNAPSHOT_SIGINT);
5627 }
ff6c6cc1
LP
5628 if (r == -EEXIST)
5629 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5630 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
5631 else if (r == -EINTR) {
5632 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5633 goto finish;
5634 } else if (r < 0) {
83521414 5635 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 5636 goto finish;
ff6c6cc1
LP
5637 } else
5638 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5639 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 5640 }
ec16945e
LP
5641 }
5642
7732f92b 5643 if (arg_start_mode == START_BOOT) {
aff7ae0d 5644 _cleanup_free_ char *b = NULL;
a5201ed6 5645 const char *p;
c9fe05e0 5646
aff7ae0d
LP
5647 if (arg_pivot_root_new) {
5648 b = path_join(arg_directory, arg_pivot_root_new);
5649 if (!b)
5650 return log_oom();
5651
5652 p = b;
5653 } else
a5201ed6 5654 p = arg_directory;
c9fe05e0
AR
5655
5656 if (path_is_os_tree(p) <= 0) {
aff7ae0d
LP
5657 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5658 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
1b9e5b12
LP
5659 goto finish;
5660 }
5661 } else {
aff7ae0d 5662 _cleanup_free_ char *p = NULL;
c9fe05e0 5663
a5201ed6 5664 if (arg_pivot_root_new)
aff7ae0d 5665 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
a5201ed6 5666 else
aff7ae0d
LP
5667 p = path_join(arg_directory, "/usr/");
5668 if (!p)
5669 return log_oom();
1b9e5b12 5670
aff7ae0d
LP
5671 if (laccess(p, F_OK) < 0) {
5672 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5673 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
1b9e5b12 5674 goto finish;
1b9e5b12
LP
5675 }
5676 }
ec16945e 5677
6b9132a9 5678 } else {
d04faa4e 5679 DissectImageFlags dissect_image_flags =
4b5de5dd 5680 DISSECT_IMAGE_GENERIC_ROOT |
d04faa4e
LP
5681 DISSECT_IMAGE_REQUIRE_ROOT |
5682 DISSECT_IMAGE_RELAX_VAR_CHECK |
73d88b80
LP
5683 DISSECT_IMAGE_USR_NO_ROOT |
5684 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
5685 DISSECT_IMAGE_PIN_PARTITION_DEVICES;
ec16945e
LP
5686 assert(arg_image);
5687 assert(!arg_template);
5688
f461a28d 5689 r = chase_and_update(&arg_image, 0);
3f342ec4
LP
5690 if (r < 0)
5691 goto finish;
5692
0f3be6ca
LP
5693 if (arg_ephemeral) {
5694 _cleanup_free_ char *np = NULL;
5695
5696 r = tempfn_random(arg_image, "machine.", &np);
5697 if (r < 0) {
5698 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5699 goto finish;
5700 }
5701
6992459c
LP
5702 /* Always take an exclusive lock on our own ephemeral copy. */
5703 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
5704 if (r < 0) {
5705 r = log_error_errno(r, "Failed to create image lock: %m");
5706 goto finish;
5707 }
5708
7bf011e3
LP
5709 {
5710 BLOCK_SIGNALS(SIGINT);
7c2f5495
DDM
5711 r = copy_file_full(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600,
5712 FS_NOCOW_FL, FS_NOCOW_FL,
5713 COPY_REFLINK|COPY_CRTIME|COPY_SIGINT,
5714 NULL, NULL);
7bf011e3
LP
5715 }
5716 if (r == -EINTR) {
5717 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5718 goto finish;
5719 }
0f3be6ca
LP
5720 if (r < 0) {
5721 r = log_error_errno(r, "Failed to copy image file: %m");
5722 goto finish;
5723 }
5724
1cc6c93a 5725 free_and_replace(arg_image, np);
0f3be6ca
LP
5726 remove_image = true;
5727 } else {
5728 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5729 if (r == -EBUSY) {
5730 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5731 goto finish;
5732 }
5733 if (r < 0) {
5734 r = log_error_errno(r, "Failed to create image lock: %m");
5735 goto finish;
5736 }
4623e8e6 5737
89e62e0b
LP
5738 r = verity_settings_load(
5739 &arg_verity_settings,
5740 arg_image, NULL, NULL);
e7cbe5cb
LB
5741 if (r < 0) {
5742 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5743 goto finish;
78ebe980 5744 }
89e62e0b
LP
5745
5746 if (arg_verity_settings.data_path)
5747 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
30535c16
LP
5748 }
5749
c67b0082 5750 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5751 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5752 goto finish;
1b9e5b12 5753 }
6b9132a9 5754
c67b0082
LP
5755 remove_tmprootdir = true;
5756
5757 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5758 if (!arg_directory) {
5759 r = log_oom();
5760 goto finish;
6b9132a9 5761 }
88213476 5762
89e62e0b
LP
5763 r = loop_device_make_by_path(
5764 arg_image,
5765 arg_read_only ? O_RDONLY : O_RDWR,
22ee78a8 5766 /* sector_size= */ UINT32_MAX,
89e62e0b 5767 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
7f52206a 5768 LOCK_SH,
89e62e0b 5769 &loop);
2d845785
LP
5770 if (r < 0) {
5771 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5772 goto finish;
5773 }
1b9e5b12 5774
bad31660 5775 r = dissect_loop_device_and_warn(
bad31660 5776 loop,
89e62e0b 5777 &arg_verity_settings,
84be0c71
LP
5778 /* mount_options=*/ NULL,
5779 arg_image_policy ?: &image_policy_container,
e7cbe5cb 5780 dissect_image_flags,
e0f9e7bd 5781 &dissected_image);
2d845785 5782 if (r == -ENOPKG) {
4526113f 5783 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5784 log_notice("Note that the disk image needs to\n"
5785 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5786 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
db811444 5787 " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
2d845785
LP
5788 " d) or contain a file system without a partition table\n"
5789 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5790 goto finish;
2d845785 5791 }
4526113f 5792 if (r < 0)
842f3b0f 5793 goto finish;
1b9e5b12 5794
88b3300f
LP
5795 r = dissected_image_load_verity_sig_partition(
5796 dissected_image,
5797 loop->fd,
5798 &arg_verity_settings);
5799 if (r < 0)
5800 goto finish;
5801
8ee9615e
LP
5802 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
5803 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5804 "root hash signature found! Proceeding without integrity checking.", arg_image);
4623e8e6 5805
89e62e0b
LP
5806 r = dissected_image_decrypt_interactively(
5807 dissected_image,
5808 NULL,
5809 &arg_verity_settings,
e330f97a 5810 0);
1b9e5b12
LP
5811 if (r < 0)
5812 goto finish;
0f3be6ca
LP
5813
5814 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5815 if (remove_image && unlink(arg_image) >= 0)
5816 remove_image = false;
4c27749b
LP
5817
5818 if (arg_architecture < 0)
5819 arg_architecture = dissected_image_architecture(dissected_image);
842f3b0f 5820 }
842f3b0f 5821
86c0dd4a 5822 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5823 if (r < 0)
5824 goto finish;
5825
de40a303
LP
5826 if (arg_console_mode < 0)
5827 arg_console_mode =
5828 isatty(STDIN_FILENO) > 0 &&
5829 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5830
de40a303
LP
5831 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5832 arg_quiet = true;
a258bf26 5833
9c857b9d 5834 if (!arg_quiet)
c85c2f79 5835 log_info("Spawning container %s on %s.\nPress Ctrl-] three times within 1s to kill container.",
9c857b9d
LP
5836 arg_machine, arg_image ?: arg_directory);
5837
988851b6 5838 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
a258bf26 5839
66edd963 5840 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
5841 r = log_error_errno(errno, "Failed to become subreaper: %m");
5842 goto finish;
5843 }
5844
761cf19d
FW
5845 if (arg_expose_ports) {
5846 r = fw_ctx_new(&fw_ctx);
5847 if (r < 0) {
5848 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5849 goto finish;
5850 }
5851 expose_args.fw_ctx = fw_ctx;
5852 }
d87be9b0 5853 for (;;) {
3acc84eb 5854 r = run_container(dissected_image,
44dbef90
LP
5855 fds,
5856 veth_name, &veth_created,
761cf19d 5857 &expose_args, &master,
44dbef90 5858 &pid, &ret);
b0067625 5859 if (r <= 0)
d87be9b0 5860 break;
d87be9b0 5861 }
88213476
LP
5862
5863finish:
04f590a4
LP
5864 (void) sd_notify(false,
5865 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5866 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5867
9444b1f2 5868 if (pid > 0)
c67b0082 5869 (void) kill(pid, SIGKILL);
88213476 5870
503546da 5871 /* Try to flush whatever is still queued in the pty */
6a0f896b 5872 if (master >= 0) {
f5fbe71d 5873 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
6a0f896b
LP
5874 master = safe_close(master);
5875 }
5876
5877 if (pid > 0)
5878 (void) wait_for_terminate(pid, NULL);
503546da 5879
50ebcf6c
LP
5880 pager_close();
5881
17cbb288 5882 if (remove_directory && arg_directory) {
ec16945e
LP
5883 int k;
5884
17cbb288 5885 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5886 if (k < 0)
17cbb288 5887 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5888 }
5889
0f3be6ca
LP
5890 if (remove_image && arg_image) {
5891 if (unlink(arg_image) < 0)
5892 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5893 }
5894
c67b0082
LP
5895 if (remove_tmprootdir) {
5896 if (rmdir(tmprootdir) < 0)
5897 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5898 }
5899
785890ac
LP
5900 if (arg_machine) {
5901 const char *p;
5902
63c372cb 5903 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5904 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5905 }
5906
deff68e7
FW
5907 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5908 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
7513c5b8
LP
5909
5910 if (veth_created)
5911 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5912 (void) remove_bridge(arg_network_zone);
f757855e 5913
f757855e
LP
5914 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5915 expose_port_free_all(arg_expose_ports);
bf428efb 5916 rlimit_free_all(arg_rlimit);
b2645747 5917 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3652872a 5918 credential_free_all(arg_credentials, arg_n_credentials);
6d0b55c2 5919
44dbef90
LP
5920 if (r < 0)
5921 return r;
5922
5923 return ret;
88213476 5924}
44dbef90
LP
5925
/* Program entry point: passes run() to the DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE() macro, which is
 * defined outside this file section. run() returns a negative errno-style value on internal failure and
 * otherwise the 'ret' value filled in via run_container().
 * NOTE(review): judging by the macro's name, a positive return from run() is presumably used as the
 * process's failure exit status — confirm against the macro definition. */
5926DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);