]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
mount-util: add make_fsmount()
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
335d2ead 14#include <sys/ioctl.h>
8fe0087e
LP
15#include <sys/personality.h>
16#include <sys/prctl.h>
17#include <sys/types.h>
6916b164 18#include <sys/wait.h>
335d2ead 19#include <termios.h>
8fe0087e 20#include <unistd.h>
1b9e5b12 21
b053cd5f 22#include "sd-bus.h"
1f0cd86b 23#include "sd-daemon.h"
1f0cd86b 24#include "sd-id128.h"
8fe0087e 25
b5efdb8a 26#include "alloc-util.h"
8fe0087e
LP
27#include "barrier.h"
28#include "base-filesystem.h"
29#include "blkid-util.h"
30#include "btrfs-util.h"
d6b4d1c7 31#include "build.h"
b8ea7a6e 32#include "bus-error.h"
7f8a85e6 33#include "bus-locator.h"
b053cd5f 34#include "bus-util.h"
8fe0087e 35#include "cap-list.h"
430f0182 36#include "capability-util.h"
04d391da 37#include "cgroup-util.h"
f461a28d 38#include "chase.h"
988851b6 39#include "common-signal.h"
8fe0087e 40#include "copy.h"
d107bb7d 41#include "cpu-set-util.h"
786d19fd 42#include "creds-util.h"
4fc9982c 43#include "dev-setup.h"
57f1b61b 44#include "discover-image.h"
2d845785 45#include "dissect-image.h"
8fe0087e 46#include "env-util.h"
3652872a 47#include "escape.h"
3ffd4af2 48#include "fd-util.h"
842f3b0f 49#include "fdset.h"
a5c32cff 50#include "fileio.h"
f97b34a6 51#include "format-util.h"
f4f15635 52#include "fs-util.h"
1b9e5b12 53#include "gpt.h"
4623e8e6 54#include "hexdecoct.h"
e2054217 55#include "hostname-setup.h"
8fe0087e 56#include "hostname-util.h"
910fd145 57#include "id128-util.h"
3652872a 58#include "io-util.h"
8fe0087e 59#include "log.h"
2d845785 60#include "loop-util.h"
8fe0087e 61#include "loopback-setup.h"
8fe0087e 62#include "macro.h"
44dbef90 63#include "main-func.h"
f5947a5e 64#include "missing_sched.h"
8fe0087e 65#include "mkdir.h"
4349cd7c 66#include "mount-util.h"
049af8ad 67#include "mountpoint-util.h"
0cb8e3d1 68#include "namespace-util.h"
8fe0087e 69#include "netlink-util.h"
2f893044 70#include "nspawn-bind-user.h"
07630cea 71#include "nspawn-cgroup.h"
3652872a 72#include "nspawn-creds.h"
3603efde 73#include "nspawn-def.h"
07630cea
LP
74#include "nspawn-expose-ports.h"
75#include "nspawn-mount.h"
76#include "nspawn-network.h"
de40a303 77#include "nspawn-oci.h"
7336138e 78#include "nspawn-patch-uid.h"
07630cea 79#include "nspawn-register.h"
910fd145 80#include "nspawn-seccomp.h"
07630cea
LP
81#include "nspawn-settings.h"
82#include "nspawn-setuid.h"
7732f92b 83#include "nspawn-stub-pid1.h"
c9394f4f 84#include "nspawn-util.h"
91181e07 85#include "nspawn.h"
d8b4d14d 86#include "nulstr-util.h"
d58ad743 87#include "os-util.h"
50ebcf6c 88#include "pager.h"
614b022c 89#include "parse-argument.h"
6bedfcbb 90#include "parse-util.h"
294bf0c3 91#include "pretty-print.h"
0b452006 92#include "process-util.h"
8fe0087e
LP
93#include "ptyfwd.h"
94#include "random-util.h"
8869a0b4 95#include "raw-clone.h"
86775e35 96#include "resolve-util.h"
bf428efb 97#include "rlimit-util.h"
8fe0087e 98#include "rm-rf.h"
de40a303 99#include "seccomp-util.h"
68b02049 100#include "selinux-util.h"
8fe0087e 101#include "signal-util.h"
2583fbea 102#include "socket-util.h"
8fcde012 103#include "stat-util.h"
15a5e950 104#include "stdio-util.h"
5c828e66 105#include "string-table.h"
07630cea 106#include "string-util.h"
8fe0087e 107#include "strv.h"
de40a303 108#include "sysctl-util.h"
8fe0087e 109#include "terminal-util.h"
e4de7287 110#include "tmpfile-util.h"
affb60b1 111#include "umask-util.h"
43c3fb46 112#include "unit-name.h"
b1d4f8e1 113#include "user-util.h"
e9642be2 114
e96ceaba
LP
115/* The notify socket inside the container it can use to talk to nspawn using the sd_notify(3) protocol */
116#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
e79581dd 117#define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"
0e7ac751 118
2a49b612
ZJS
119#define EXIT_FORCE_RESTART 133
120
113cea80
DH
/* Container status values used in this file. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
125
88213476 126static char *arg_directory = NULL;
ec16945e 127static char *arg_template = NULL;
5f932eb9 128static char *arg_chdir = NULL;
b53ede69
PW
129static char *arg_pivot_root_new = NULL;
130static char *arg_pivot_root_old = NULL;
687d0825 131static char *arg_user = NULL;
de40a303
LP
132static uid_t arg_uid = UID_INVALID;
133static gid_t arg_gid = GID_INVALID;
134static gid_t* arg_supplementary_gids = NULL;
135static size_t arg_n_supplementary_gids = 0;
9444b1f2 136static sd_id128_t arg_uuid = {};
3a9530e5
LP
137static char *arg_machine = NULL; /* The name used by the host to refer to this */
138static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
139static const char *arg_selinux_context = NULL;
140static const char *arg_selinux_apifs_context = NULL;
de40a303 141static char *arg_slice = NULL;
ff01d048 142static bool arg_private_network = false;
bc2f673e 143static bool arg_read_only = false;
7732f92b 144static StartMode arg_start_mode = START_PID1;
ec16945e 145static bool arg_ephemeral = false;
57fb9fb5 146static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 147static bool arg_link_journal_try = false;
520e0d54 148static uint64_t arg_caps_retain =
50b52222
LP
149 (1ULL << CAP_AUDIT_CONTROL) |
150 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
151 (1ULL << CAP_CHOWN) |
152 (1ULL << CAP_DAC_OVERRIDE) |
153 (1ULL << CAP_DAC_READ_SEARCH) |
154 (1ULL << CAP_FOWNER) |
155 (1ULL << CAP_FSETID) |
156 (1ULL << CAP_IPC_OWNER) |
157 (1ULL << CAP_KILL) |
158 (1ULL << CAP_LEASE) |
159 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 160 (1ULL << CAP_MKNOD) |
5076f0cc
LP
161 (1ULL << CAP_NET_BIND_SERVICE) |
162 (1ULL << CAP_NET_BROADCAST) |
163 (1ULL << CAP_NET_RAW) |
5076f0cc 164 (1ULL << CAP_SETFCAP) |
50b52222 165 (1ULL << CAP_SETGID) |
5076f0cc
LP
166 (1ULL << CAP_SETPCAP) |
167 (1ULL << CAP_SETUID) |
168 (1ULL << CAP_SYS_ADMIN) |
50b52222 169 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
170 (1ULL << CAP_SYS_CHROOT) |
171 (1ULL << CAP_SYS_NICE) |
172 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 173 (1ULL << CAP_SYS_RESOURCE) |
50b52222 174 (1ULL << CAP_SYS_TTY_CONFIG);
88fc9c9b 175static uint64_t arg_caps_ambient = 0;
de40a303 176static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
5a8af538 177static CustomMount *arg_custom_mounts = NULL;
88614c8a 178static size_t arg_n_custom_mounts = 0;
f4889f65 179static char **arg_setenv = NULL;
284c0b91 180static bool arg_quiet = false;
eb91eb18 181static bool arg_register = true;
89f7c846 182static bool arg_keep_unit = false;
aa28aefe 183static char **arg_network_interfaces = NULL;
c74e630d 184static char **arg_network_macvlan = NULL;
4bbfe7ad 185static char **arg_network_ipvlan = NULL;
69c79d3c 186static bool arg_network_veth = false;
f6d6bad1 187static char **arg_network_veth_extra = NULL;
f757855e 188static char *arg_network_bridge = NULL;
22b28dfd 189static char *arg_network_zone = NULL;
d7bea6b6 190static char *arg_network_namespace_path = NULL;
bb068de0 191static PagerFlags arg_pager_flags = 0;
050f7277 192static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 193static char *arg_image = NULL;
de40a303 194static char *arg_oci_bundle = NULL;
f757855e 195static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 196static ExposePort *arg_expose_ports = NULL;
f36933fe 197static char **arg_property = NULL;
de40a303 198static sd_bus_message *arg_property_message = NULL;
0de7acce 199static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 200static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
6c045a99 201static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
c6c8f6e2 202static int arg_kill_signal = 0;
5da38d07 203static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
204static SettingsMask arg_settings_mask = 0;
205static int arg_settings_trusted = -1;
206static char **arg_parameters = NULL;
6aadfa4c 207static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 208static bool arg_notify_ready = false;
5a8ff0e6 209static bool arg_use_cgns = true;
0c582db0 210static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 211static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
aee36b4e 212static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
6b000af4
LP
213static char **arg_syscall_allow_list = NULL;
214static char **arg_syscall_deny_list = NULL;
de40a303
LP
215#if HAVE_SECCOMP
216static scmp_filter_ctx arg_seccomp = NULL;
217#endif
bf428efb 218static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 219static bool arg_no_new_privileges = false;
81f345df
LP
220static int arg_oom_score_adjust = 0;
221static bool arg_oom_score_adjust_set = false;
0985c7c4 222static CPUSet arg_cpu_set = {};
09d423e9 223static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 224static TimezoneMode arg_timezone = TIMEZONE_AUTO;
f5fbe71d 225static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
de40a303
LP
226static DeviceNode* arg_extra_nodes = NULL;
227static size_t arg_n_extra_nodes = 0;
228static char **arg_sysctl = NULL;
229static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
3652872a
LP
230static Credential *arg_credentials = NULL;
231static size_t arg_n_credentials = 0;
2f893044 232static char **arg_bind_user = NULL;
4a4654e0 233static bool arg_suppress_sync = false;
3603f151 234static char *arg_settings_filename = NULL;
4c27749b 235static Architecture arg_architecture = _ARCHITECTURE_INVALID;
84be0c71 236static ImagePolicy *arg_image_policy = NULL;
88213476 237
6145bb4f
LP
/* Cleanup handlers registered for the file-scope settings above, one per variable. */

/* Plain heap strings and arrays */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);

/* Environment and networking */
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);

/* Image, properties and payload parameters */
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);

/* System call filtering */
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif

/* Remaining settings */
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
6145bb4f 272
dce66ffe
ZJS
273static int handle_arg_console(const char *arg) {
274 if (streq(arg, "help")) {
10e8a60b
LP
275 puts("autopipe\n"
276 "interactive\n"
dce66ffe 277 "passive\n"
10e8a60b
LP
278 "pipe\n"
279 "read-only");
dce66ffe
ZJS
280 return 0;
281 }
282
283 if (streq(arg, "interactive"))
284 arg_console_mode = CONSOLE_INTERACTIVE;
285 else if (streq(arg, "read-only"))
286 arg_console_mode = CONSOLE_READ_ONLY;
287 else if (streq(arg, "passive"))
288 arg_console_mode = CONSOLE_PASSIVE;
554c4beb
LP
289 else if (streq(arg, "pipe")) {
290 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
291 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
292 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
293 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
294 "Proceeding anyway.");
295
dce66ffe 296 arg_console_mode = CONSOLE_PIPE;
10e8a60b
LP
297 } else if (streq(arg, "autopipe")) {
298 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
299 arg_console_mode = CONSOLE_INTERACTIVE;
300 else
301 arg_console_mode = CONSOLE_PIPE;
554c4beb 302 } else
dce66ffe
ZJS
303 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
304
305 arg_settings_mask |= SETTING_CONSOLE_MODE;
306 return 1;
307}
308
37ec0fdd
LP
309static int help(void) {
310 _cleanup_free_ char *link = NULL;
311 int r;
312
384c2c32 313 pager_open(arg_pager_flags);
50ebcf6c 314
37ec0fdd
LP
315 r = terminal_urlify_man("systemd-nspawn", "1", &link);
316 if (r < 0)
317 return log_oom();
318
25148653 319 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 320 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
321 " -h --help Show this help\n"
322 " --version Print version string\n"
69c79d3c 323 " -q --quiet Do not show status information\n"
bb068de0 324 " --no-pager Do not pipe output into a pager\n"
25148653
LP
325 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
326 "%3$sImage:%4$s\n"
1b9e5b12 327 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
328 " --template=PATH Initialize root directory from template directory,\n"
329 " if missing\n"
330 " -x --ephemeral Run container with snapshot of root directory, and\n"
331 " remove it after exit\n"
25e68fd3
LP
332 " -i --image=PATH Root file system disk image (or device node) for\n"
333 " the container\n"
84be0c71 334 " --image-policy=POLICY Specify disk image dissection policy\n"
de40a303 335 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
336 " --read-only Mount the root directory read-only\n"
337 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 338 " --root-hash=HASH Specify verity root hash for root disk image\n"
c2923fdc
LB
339 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
340 " as a DER encoded PKCS7, either as a path to a file\n"
341 " or as an ASCII base64 encoded string prefixed by\n"
342 " 'base64:'\n"
e7cbe5cb 343 " --verity-data=PATH Specify hash device for verity\n"
25148653
LP
344 " --pivot-root=PATH[:PATH]\n"
345 " Pivot root to given directory in the container\n\n"
346 "%3$sExecution:%4$s\n"
7732f92b 347 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 348 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 349 " --chdir=PATH Set working directory in the container\n"
0d2a0179 350 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
25148653
LP
351 " -u --user=USER Run the command under specified user or UID\n"
352 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
4a4654e0
LP
353 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
354 " --suppress-sync=BOOLEAN\n"
355 " Suppress any form of disk data synchronization\n\n"
25148653 356 "%3$sSystem Identity:%4$s\n"
a8828ed9 357 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 358 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
359 " --uuid=UUID Set a specific machine UUID for the container\n\n"
360 "%3$sProperties:%4$s\n"
a8828ed9 361 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 362 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
363 " --register=BOOLEAN Register container as machine\n"
364 " --keep-unit Do not register a scope for the machine, reuse\n"
365 " the service unit nspawn is running in\n\n"
366 "%3$sUser Namespacing:%4$s\n"
b917743d
YW
367 " --private-users=no Run without user namespacing\n"
368 " --private-users=yes|pick|identity\n"
369 " Run within user namespace, autoselect UID/GID range\n"
370 " --private-users=UIDBASE[:NUIDS]\n"
90b4a64d 371 " Similar, but with user configured UID/GID range\n"
6c045a99
LP
372 " --private-users-ownership=MODE\n"
373 " Adjust ('chown') or map ('map') OS tree ownership\n"
b917743d
YW
374 " to private UID/GID range\n"
375 " -U Equivalent to --private-users=pick and\n"
376 " --private-users-ownership=auto\n\n"
25148653 377 "%3$sNetworking:%4$s\n"
69c79d3c 378 " --private-network Disable network in container\n"
2f091b1b 379 " --network-interface=HOSTIF[:CONTAINERIF]\n"
69c79d3c
LP
380 " Assign an existing network interface to the\n"
381 " container\n"
2f091b1b 382 " --network-macvlan=HOSTIF[:CONTAINERIF]\n"
c74e630d
LP
383 " Create a macvlan network interface based on an\n"
384 " existing network interface to the container\n"
2f091b1b 385 " --network-ipvlan=HOSTIF[:CONTAINERIF]\n"
387f6955 386 " Create an ipvlan network interface based on an\n"
4bbfe7ad 387 " existing network interface to the container\n"
a8eaaee7 388 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 389 " and container\n"
f6d6bad1
LP
390 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
391 " Add an additional virtual Ethernet link between\n"
392 " host and container\n"
ab046dde 393 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
394 " Add a virtual Ethernet connection to the container\n"
395 " and attach it to an existing bridge on the host\n"
396 " --network-zone=NAME Similar, but attach the new interface to an\n"
397 " an automatically managed bridge interface\n"
d7bea6b6
DP
398 " --network-namespace-path=PATH\n"
399 " Set network namespace to the one represented by\n"
400 " the specified kernel namespace file node\n"
6d0b55c2 401 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
402 " Expose a container IP port on the host\n\n"
403 "%3$sSecurity:%4$s\n"
a8828ed9
DW
404 " --capability=CAP In addition to the default, retain specified\n"
405 " capability\n"
406 " --drop-capability=CAP Drop the specified capability from the default set\n"
88fc9c9b
TH
407 " --ambient-capability=CAP\n"
408 " Sets the specified capability for the started\n"
409 " process. Not useful if booting a machine.\n"
f4e803c8 410 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
411 " --system-call-filter=LIST|~LIST\n"
412 " Permit/prohibit specific system calls\n"
25148653
LP
413 " -Z --selinux-context=SECLABEL\n"
414 " Set the SELinux security context to be used by\n"
415 " processes in the container\n"
416 " -L --selinux-apifs-context=SECLABEL\n"
417 " Set the SELinux security context to be used by\n"
418 " API/tmpfs file systems in the container\n\n"
419 "%3$sResources:%4$s\n"
bf428efb 420 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
421 " --oom-score-adjust=VALUE\n"
422 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
423 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
424 " --personality=ARCH Pick personality for this container\n\n"
25148653 425 "%3$sIntegration:%4$s\n"
09d423e9 426 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 427 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
428 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
429 " host, try-guest, try-host\n"
430 " -j Equivalent to --link-journal=try-guest\n\n"
431 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
432 " --bind=PATH[:PATH[:OPTIONS]]\n"
433 " Bind mount a file or directory from the host into\n"
a8828ed9 434 " the container\n"
5e5bfa6e
EY
435 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
436 " Similar, but creates a read-only bind mount\n"
de40a303
LP
437 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
438 " it\n"
06c17c39 439 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
440 " --overlay=PATH[:PATH...]:PATH\n"
441 " Create an overlay mount from the host to \n"
442 " the container\n"
443 " --overlay-ro=PATH[:PATH...]:PATH\n"
2f893044
LP
444 " Similar, but creates a read-only overlay mount\n"
445 " --bind-user=NAME Bind user from host to container\n\n"
25148653 446 "%3$sInput/Output:%4$s\n"
de40a303
LP
447 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
448 " set up for the container.\n"
3652872a
LP
449 " -P --pipe Equivalent to --console=pipe\n\n"
450 "%3$sCredentials:%4$s\n"
451 " --set-credential=ID:VALUE\n"
452 " Pass a credential with literal value to container.\n"
453 " --load-credential=ID:PATH\n"
454 " Load credential to pass to container from file or\n"
455 " AF_UNIX stream socket.\n"
bc556335
DDM
456 "\nSee the %2$s for details.\n",
457 program_invocation_short_name,
458 link,
459 ansi_underline(),
460 ansi_normal(),
461 ansi_highlight(),
462 ansi_normal());
37ec0fdd
LP
463
464 return 0;
88213476
LP
465}
466
86c0dd4a 467static int custom_mount_check_all(void) {
88614c8a 468 size_t i;
5a8af538 469
5a8af538
LP
470 for (i = 0; i < arg_n_custom_mounts; i++) {
471 CustomMount *m = &arg_custom_mounts[i];
472
0de7acce 473 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
6c045a99 474 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
baaa35ad 475 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
c920b863 476 "--private-users-ownership=own may not be combined with custom root mounts.");
6c045a99 477 if (arg_uid_shift == UID_INVALID)
baaa35ad
ZJS
478 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
479 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 480 }
5a8af538
LP
481 }
482
483 return 0;
484}
485
8199d554 486static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 487 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 488 int r;
5da38d07 489
efdb0237 490 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
491
492 e = getenv(var);
493 if (!e) {
d5fc5b2f 494 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
495 var = "UNIFIED_CGROUP_HIERARCHY";
496 e = getenv(var);
c78c095b
ZJS
497 }
498
499 if (!isempty(e)) {
efdb0237
LP
500 r = parse_boolean(e);
501 if (r < 0)
c78c095b 502 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
503 if (r > 0)
504 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
505 else
506 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
507 }
508
8199d554
LP
509 return 0;
510}
511
512static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
513 int r;
514
75b0d8b8
ZJS
515 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
516 * in the image actually supports. */
b4cccbc1
LP
517 r = cg_all_unified();
518 if (r < 0)
519 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
520 if (r > 0) {
a8725a06
ZJS
521 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
522 * routine only detects 231, so we'll have a false negative here for 230. */
7e6821ed 523 r = systemd_installation_has_version(directory, "230");
a8725a06
ZJS
524 if (r < 0)
525 return log_error_errno(r, "Failed to determine systemd version in container: %m");
526 if (r > 0)
527 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
528 else
529 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 530 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b 531 /* Mixed cgroup hierarchy support was added in 233 */
7e6821ed 532 r = systemd_installation_has_version(directory, "233");
0fd9563f
ZJS
533 if (r < 0)
534 return log_error_errno(r, "Failed to determine systemd version in container: %m");
535 if (r > 0)
536 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
537 else
538 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
539 } else
5da38d07 540 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 541
8199d554
LP
542 log_debug("Using %s hierarchy for container.",
543 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
544 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
545
efdb0237
LP
546 return 0;
547}
548
8a99bd0c
ZJS
549static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
550 uint64_t mask = 0;
551 int r;
552
553 for (;;) {
554 _cleanup_free_ char *t = NULL;
555
556 r = extract_first_word(&spec, &t, ",", 0);
557 if (r < 0)
558 return log_error_errno(r, "Failed to parse capability %s.", t);
559 if (r == 0)
560 break;
561
562 if (streq(t, "help")) {
563 for (int i = 0; i < capability_list_length(); i++) {
564 const char *name;
565
566 name = capability_to_name(i);
567 if (name)
568 puts(name);
569 }
570
571 return 0; /* quit */
572 }
573
574 if (streq(t, "all"))
f5fbe71d 575 mask = UINT64_MAX;
8a99bd0c
ZJS
576 else {
577 r = capability_from_name(t);
578 if (r < 0)
579 return log_error_errno(r, "Failed to parse capability %s.", t);
580
581 mask |= 1ULL << r;
582 }
583 }
584
585 *ret_mask = mask;
586 return 1; /* continue */
587}
588
49048684 589static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
590 int r;
591
592 r = getenv_bool(name);
593 if (r == -ENXIO)
49048684 594 return 0;
0c582db0 595 if (r < 0)
49048684 596 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 597
0c582db0 598 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 599 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 600 return 0;
0c582db0
LB
601}
602
49048684 603static int parse_mount_settings_env(void) {
4f086aab 604 const char *e;
1099ceeb
LP
605 int r;
606
607 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
608 if (r < 0 && r != -ENXIO)
609 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
610 if (r >= 0)
611 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
612
613 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 614 if (streq_ptr(e, "network"))
4f086aab 615 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 616
49048684
ZJS
617 else if (e) {
618 r = parse_boolean(e);
619 if (r < 0)
620 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
621
622 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
623 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 624 }
4f086aab 625
49048684 626 return 0;
4f086aab
SU
627}
628
49048684 629static int parse_environment(void) {
d5455d2f
LP
630 const char *e;
631 int r;
632
49048684
ZJS
633 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
634 if (r < 0)
635 return r;
636 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
637 if (r < 0)
638 return r;
639 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
640 if (r < 0)
641 return r;
642 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
643 if (r < 0)
644 return r;
d5455d2f 645
49048684
ZJS
646 r = parse_mount_settings_env();
647 if (r < 0)
648 return r;
d5455d2f 649
489fae52
ZJS
650 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
651 * even if it is supported. If not supported, it has no effect. */
de40a303 652 if (!cg_ns_supported())
489fae52 653 arg_use_cgns = false;
de40a303
LP
654 else {
655 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
656 if (r < 0) {
657 if (r != -ENXIO)
49048684 658 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
659
660 arg_use_cgns = true;
661 } else {
662 arg_use_cgns = r > 0;
663 arg_settings_mask |= SETTING_USE_CGNS;
664 }
665 }
d5455d2f
LP
666
667 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
668 if (e)
669 arg_container_service_name = e;
670
4a4654e0
LP
671 r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
672 if (r >= 0)
673 arg_suppress_sync = r;
674 else if (r != -ENXIO)
675 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
676
49048684 677 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
678}
679
88213476 680static int parse_argv(int argc, char *argv[]) {
a41fe3a2 681 enum {
acbeb427
ZJS
682 ARG_VERSION = 0x100,
683 ARG_PRIVATE_NETWORK,
bc2f673e 684 ARG_UUID,
5076f0cc 685 ARG_READ_ONLY,
57fb9fb5 686 ARG_CAPABILITY,
88fc9c9b 687 ARG_AMBIENT_CAPABILITY,
420c7379 688 ARG_DROP_CAPABILITY,
17fe0523
LP
689 ARG_LINK_JOURNAL,
690 ARG_BIND,
f4889f65 691 ARG_BIND_RO,
06c17c39 692 ARG_TMPFS,
5a8af538
LP
693 ARG_OVERLAY,
694 ARG_OVERLAY_RO,
de40a303 695 ARG_INACCESSIBLE,
eb91eb18 696 ARG_SHARE_SYSTEM,
89f7c846 697 ARG_REGISTER,
aa28aefe 698 ARG_KEEP_UNIT,
69c79d3c 699 ARG_NETWORK_INTERFACE,
c74e630d 700 ARG_NETWORK_MACVLAN,
4bbfe7ad 701 ARG_NETWORK_IPVLAN,
ab046dde 702 ARG_NETWORK_BRIDGE,
22b28dfd 703 ARG_NETWORK_ZONE,
f6d6bad1 704 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 705 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 706 ARG_PERSONALITY,
4d9f07b4 707 ARG_VOLATILE,
ec16945e 708 ARG_TEMPLATE,
f36933fe 709 ARG_PROPERTY,
6dac160c 710 ARG_PRIVATE_USERS,
c6c8f6e2 711 ARG_KILL_SIGNAL,
f757855e 712 ARG_SETTINGS,
5f932eb9 713 ARG_CHDIR,
b53ede69 714 ARG_PIVOT_ROOT,
7336138e 715 ARG_PRIVATE_USERS_CHOWN,
6c045a99 716 ARG_PRIVATE_USERS_OWNERSHIP,
9c1e04d0 717 ARG_NOTIFY_READY,
4623e8e6 718 ARG_ROOT_HASH,
89e62e0b
LP
719 ARG_ROOT_HASH_SIG,
720 ARG_VERITY_DATA,
960e4569 721 ARG_SYSTEM_CALL_FILTER,
bf428efb 722 ARG_RLIMIT,
3a9530e5 723 ARG_HOSTNAME,
66edd963 724 ARG_NO_NEW_PRIVILEGES,
81f345df 725 ARG_OOM_SCORE_ADJUST,
d107bb7d 726 ARG_CPU_AFFINITY,
09d423e9 727 ARG_RESOLV_CONF,
1688841f 728 ARG_TIMEZONE,
de40a303
LP
729 ARG_CONSOLE,
730 ARG_PIPE,
731 ARG_OCI_BUNDLE,
bb068de0 732 ARG_NO_PAGER,
3652872a
LP
733 ARG_SET_CREDENTIAL,
734 ARG_LOAD_CREDENTIAL,
2f893044 735 ARG_BIND_USER,
4a4654e0 736 ARG_SUPPRESS_SYNC,
84be0c71 737 ARG_IMAGE_POLICY,
a41fe3a2
LP
738 };
739
88213476 740 static const struct option options[] = {
d7bea6b6
DP
741 { "help", no_argument, NULL, 'h' },
742 { "version", no_argument, NULL, ARG_VERSION },
743 { "directory", required_argument, NULL, 'D' },
744 { "template", required_argument, NULL, ARG_TEMPLATE },
745 { "ephemeral", no_argument, NULL, 'x' },
746 { "user", required_argument, NULL, 'u' },
747 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
748 { "as-pid2", no_argument, NULL, 'a' },
749 { "boot", no_argument, NULL, 'b' },
750 { "uuid", required_argument, NULL, ARG_UUID },
751 { "read-only", no_argument, NULL, ARG_READ_ONLY },
752 { "capability", required_argument, NULL, ARG_CAPABILITY },
88fc9c9b 753 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
d7bea6b6 754 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 755 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
756 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
757 { "bind", required_argument, NULL, ARG_BIND },
758 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
759 { "tmpfs", required_argument, NULL, ARG_TMPFS },
760 { "overlay", required_argument, NULL, ARG_OVERLAY },
761 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 762 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 763 { "machine", required_argument, NULL, 'M' },
3a9530e5 764 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
765 { "slice", required_argument, NULL, 'S' },
766 { "setenv", required_argument, NULL, 'E' },
767 { "selinux-context", required_argument, NULL, 'Z' },
768 { "selinux-apifs-context", required_argument, NULL, 'L' },
769 { "quiet", no_argument, NULL, 'q' },
770 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
771 { "register", required_argument, NULL, ARG_REGISTER },
772 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
773 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
774 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
775 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
776 { "network-veth", no_argument, NULL, 'n' },
777 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
778 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
779 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
780 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
781 { "personality", required_argument, NULL, ARG_PERSONALITY },
782 { "image", required_argument, NULL, 'i' },
783 { "volatile", optional_argument, NULL, ARG_VOLATILE },
784 { "port", required_argument, NULL, 'p' },
785 { "property", required_argument, NULL, ARG_PROPERTY },
786 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
6c045a99
LP
787 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
788 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
d7bea6b6
DP
789 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
790 { "settings", required_argument, NULL, ARG_SETTINGS },
791 { "chdir", required_argument, NULL, ARG_CHDIR },
792 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
793 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
794 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
89e62e0b
LP
795 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
796 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
d7bea6b6 797 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 798 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 799 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 800 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 801 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 802 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
803 { "console", required_argument, NULL, ARG_CONSOLE },
804 { "pipe", no_argument, NULL, ARG_PIPE },
805 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 806 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
3652872a
LP
807 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
808 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
2f893044 809 { "bind-user", required_argument, NULL, ARG_BIND_USER },
4a4654e0 810 { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
84be0c71 811 { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY },
eb9da376 812 {}
88213476
LP
813 };
814
9444b1f2 815 int c, r;
a42c8b54 816 uint64_t plus = 0, minus = 0;
f757855e 817 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
818
819 assert(argc >= 0);
820 assert(argv);
821
ef9c12b1
YW
822 /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long()
823 * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */
824 optind = 0;
de40a303 825 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
826 switch (c) {
827
828 case 'h':
37ec0fdd 829 return help();
88213476 830
acbeb427 831 case ARG_VERSION:
3f6fd1ba 832 return version();
acbeb427 833
88213476 834 case 'D':
614b022c 835 r = parse_path_argument(optarg, false, &arg_directory);
ec16945e 836 if (r < 0)
0f03c2a4 837 return r;
de40a303
LP
838
839 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
840 break;
841
842 case ARG_TEMPLATE:
614b022c 843 r = parse_path_argument(optarg, false, &arg_template);
ec16945e 844 if (r < 0)
0f03c2a4 845 return r;
de40a303
LP
846
847 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
848 break;
849
1b9e5b12 850 case 'i':
614b022c 851 r = parse_path_argument(optarg, false, &arg_image);
ec16945e 852 if (r < 0)
0f03c2a4 853 return r;
de40a303
LP
854
855 arg_settings_mask |= SETTING_DIRECTORY;
856 break;
857
858 case ARG_OCI_BUNDLE:
614b022c 859 r = parse_path_argument(optarg, false, &arg_oci_bundle);
de40a303
LP
860 if (r < 0)
861 return r;
862
ec16945e
LP
863 break;
864
865 case 'x':
866 arg_ephemeral = true;
a2f577fc 867 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
868 break;
869
687d0825 870 case 'u':
2fc09a9c
DM
871 r = free_and_strdup(&arg_user, optarg);
872 if (r < 0)
7027ff61 873 return log_oom();
687d0825 874
f757855e 875 arg_settings_mask |= SETTING_USER;
687d0825
MV
876 break;
877
22b28dfd 878 case ARG_NETWORK_ZONE: {
fee9f7b5 879 _cleanup_free_ char *j = NULL;
22b28dfd 880
b910cc72 881 j = strjoin("vz-", optarg);
22b28dfd
LP
882 if (!j)
883 return log_oom();
884
fee9f7b5
FS
885 if (!ifname_valid(j))
886 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
887 "Network zone name not valid: %s", j);
22b28dfd 888
df1fac6d 889 free_and_replace(arg_network_zone, j);
22b28dfd
LP
890
891 arg_network_veth = true;
892 arg_private_network = true;
893 arg_settings_mask |= SETTING_NETWORK;
894 break;
895 }
896
ab046dde 897 case ARG_NETWORK_BRIDGE:
ef76dff2 898
baaa35ad
ZJS
899 if (!ifname_valid(optarg))
900 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
901 "Bridge interface name not valid: %s", optarg);
ef76dff2 902
f757855e
LP
903 r = free_and_strdup(&arg_network_bridge, optarg);
904 if (r < 0)
905 return log_oom();
ab046dde 906
4831981d 907 _fallthrough_;
0dfaa006 908 case 'n':
69c79d3c
LP
909 arg_network_veth = true;
910 arg_private_network = true;
f757855e 911 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
912 break;
913
f6d6bad1
LP
914 case ARG_NETWORK_VETH_EXTRA:
915 r = veth_extra_parse(&arg_network_veth_extra, optarg);
916 if (r < 0)
917 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
918
919 arg_private_network = true;
920 arg_settings_mask |= SETTING_NETWORK;
921 break;
922
aa28aefe 923 case ARG_NETWORK_INTERFACE:
2f091b1b 924 r = interface_pair_parse(&arg_network_interfaces, optarg);
b390f178
DDM
925 if (r < 0)
926 return r;
927
c74e630d 928 arg_private_network = true;
f757855e 929 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
930 break;
931
932 case ARG_NETWORK_MACVLAN:
2f091b1b 933 r = macvlan_pair_parse(&arg_network_macvlan, optarg);
b390f178
DDM
934 if (r < 0)
935 return r;
936
4bbfe7ad 937 arg_private_network = true;
f757855e 938 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
939 break;
940
941 case ARG_NETWORK_IPVLAN:
2f091b1b 942 r = ipvlan_pair_parse(&arg_network_ipvlan, optarg);
b390f178
DDM
943 if (r < 0)
944 return r;
945
4831981d 946 _fallthrough_;
ff01d048
LP
947 case ARG_PRIVATE_NETWORK:
948 arg_private_network = true;
f757855e 949 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
950 break;
951
d7bea6b6 952 case ARG_NETWORK_NAMESPACE_PATH:
614b022c 953 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
d7bea6b6
DP
954 if (r < 0)
955 return r;
956
de40a303 957 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
958 break;
959
0f0dbc46 960 case 'b':
baaa35ad
ZJS
961 if (arg_start_mode == START_PID2)
962 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
963 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
964
965 arg_start_mode = START_BOOT;
966 arg_settings_mask |= SETTING_START_MODE;
967 break;
968
969 case 'a':
baaa35ad
ZJS
970 if (arg_start_mode == START_BOOT)
971 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
972 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
973
974 arg_start_mode = START_PID2;
975 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
976 break;
977
144f0fc0 978 case ARG_UUID:
aea3f594
ZJS
979 r = id128_from_string_nonzero(optarg, &arg_uuid);
980 if (r == -ENXIO)
baaa35ad
ZJS
981 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
982 "Machine UUID may not be all zeroes.");
aea3f594
ZJS
983 if (r < 0)
984 return log_error_errno(r, "Invalid UUID: %s", optarg);
f757855e
LP
985
986 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 987 break;
aa96c6cb 988
43c3fb46
LP
989 case 'S': {
990 _cleanup_free_ char *mangled = NULL;
991
992 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
993 if (r < 0)
994 return log_oom();
995
43c3fb46 996 free_and_replace(arg_slice, mangled);
de40a303 997 arg_settings_mask |= SETTING_SLICE;
144f0fc0 998 break;
43c3fb46 999 }
144f0fc0 1000
7027ff61 1001 case 'M':
c1521918 1002 if (isempty(optarg))
97b11eed 1003 arg_machine = mfree(arg_machine);
c1521918 1004 else {
52ef5dd7 1005 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1006 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1007 "Invalid machine name: %s", optarg);
7027ff61 1008
0c3c4284
LP
1009 r = free_and_strdup(&arg_machine, optarg);
1010 if (r < 0)
eb91eb18 1011 return log_oom();
eb91eb18 1012 }
9ce6d1b3 1013 break;
7027ff61 1014
3a9530e5
LP
1015 case ARG_HOSTNAME:
1016 if (isempty(optarg))
1017 arg_hostname = mfree(arg_hostname);
1018 else {
52ef5dd7 1019 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1020 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1021 "Invalid hostname: %s", optarg);
3a9530e5
LP
1022
1023 r = free_and_strdup(&arg_hostname, optarg);
1024 if (r < 0)
1025 return log_oom();
1026 }
1027
1028 arg_settings_mask |= SETTING_HOSTNAME;
1029 break;
1030
82adf6af
LP
1031 case 'Z':
1032 arg_selinux_context = optarg;
a8828ed9
DW
1033 break;
1034
82adf6af
LP
1035 case 'L':
1036 arg_selinux_apifs_context = optarg;
a8828ed9
DW
1037 break;
1038
bc2f673e
LP
1039 case ARG_READ_ONLY:
1040 arg_read_only = true;
f757855e 1041 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
1042 break;
1043
88fc9c9b
TH
1044 case ARG_AMBIENT_CAPABILITY: {
1045 uint64_t m;
1046 r = parse_capability_spec(optarg, &m);
1047 if (r <= 0)
1048 return r;
1049 arg_caps_ambient |= m;
1050 arg_settings_mask |= SETTING_CAPABILITY;
1051 break;
1052 }
420c7379
LP
1053 case ARG_CAPABILITY:
1054 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
1055 uint64_t m;
1056 r = parse_capability_spec(optarg, &m);
1057 if (r <= 0)
1058 return r;
5076f0cc 1059
8a99bd0c
ZJS
1060 if (c == ARG_CAPABILITY)
1061 plus |= m;
1062 else
1063 minus |= m;
f757855e 1064 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
1065 break;
1066 }
66edd963
LP
1067 case ARG_NO_NEW_PRIVILEGES:
1068 r = parse_boolean(optarg);
1069 if (r < 0)
1070 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1071
1072 arg_no_new_privileges = r;
1073 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1074 break;
1075
57fb9fb5
LP
1076 case 'j':
1077 arg_link_journal = LINK_GUEST;
574edc90 1078 arg_link_journal_try = true;
4e1d6aa9 1079 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1080 break;
1081
1082 case ARG_LINK_JOURNAL:
4e1d6aa9 1083 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1084 if (r < 0)
1085 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1086
4e1d6aa9 1087 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1088 break;
1089
17fe0523 1090 case ARG_BIND:
f757855e
LP
1091 case ARG_BIND_RO:
1092 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1093 if (r < 0)
1094 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1095
f757855e 1096 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1097 break;
06c17c39 1098
f757855e
LP
1099 case ARG_TMPFS:
1100 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1101 if (r < 0)
1102 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1103
f757855e 1104 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1105 break;
5a8af538
LP
1106
1107 case ARG_OVERLAY:
ad85779a
LP
1108 case ARG_OVERLAY_RO:
1109 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1110 if (r == -EADDRNOTAVAIL)
1111 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1112 if (r < 0)
1113 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1114
f757855e 1115 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1116 break;
06c17c39 1117
de40a303
LP
1118 case ARG_INACCESSIBLE:
1119 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1120 if (r < 0)
1121 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1122
1123 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1124 break;
1125
0d2a0179
ZJS
1126 case 'E':
1127 r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
aaf057c4 1128 if (r < 0)
0d2a0179 1129 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
f4889f65 1130
f757855e 1131 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65 1132 break;
f4889f65 1133
284c0b91
LP
1134 case 'q':
1135 arg_quiet = true;
1136 break;
1137
8a96d94e 1138 case ARG_SHARE_SYSTEM:
a6b5216c 1139 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1140 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1141 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1142 arg_clone_ns_flags = 0;
8a96d94e
LP
1143 break;
1144
eb91eb18
LP
1145 case ARG_REGISTER:
1146 r = parse_boolean(optarg);
1147 if (r < 0) {
1148 log_error("Failed to parse --register= argument: %s", optarg);
1149 return r;
1150 }
1151
1152 arg_register = r;
1153 break;
1154
89f7c846
LP
1155 case ARG_KEEP_UNIT:
1156 arg_keep_unit = true;
1157 break;
1158
6afc95b7
LP
1159 case ARG_PERSONALITY:
1160
ac45f971 1161 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1162 if (arg_personality == PERSONALITY_INVALID)
1163 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1164 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1165
f757855e 1166 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1167 break;
1168
4d9f07b4
LP
1169 case ARG_VOLATILE:
1170
1171 if (!optarg)
f757855e 1172 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1173 else if (streq(optarg, "help")) {
1174 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1175 return 0;
1176 } else {
f757855e 1177 VolatileMode m;
4d9f07b4 1178
f757855e 1179 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1180 if (m < 0)
1181 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1182 "Failed to parse --volatile= argument: %s", optarg);
1183 else
f757855e 1184 arg_volatile_mode = m;
6d0b55c2
LP
1185 }
1186
f757855e
LP
1187 arg_settings_mask |= SETTING_VOLATILE_MODE;
1188 break;
6d0b55c2 1189
f757855e
LP
1190 case 'p':
1191 r = expose_port_parse(&arg_expose_ports, optarg);
1192 if (r == -EEXIST)
1193 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1194 if (r < 0)
1195 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1196
f757855e 1197 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1198 break;
6d0b55c2 1199
f36933fe
LP
1200 case ARG_PROPERTY:
1201 if (strv_extend(&arg_property, optarg) < 0)
1202 return log_oom();
1203
1204 break;
1205
ae209204 1206 case ARG_PRIVATE_USERS: {
33eac552 1207 int boolean;
0de7acce 1208
ae209204
ZJS
1209 if (!optarg)
1210 boolean = true;
1211 else if (!in_charset(optarg, DIGITS))
1212 /* do *not* parse numbers as booleans */
1213 boolean = parse_boolean(optarg);
33eac552
LP
1214 else
1215 boolean = -1;
ae209204 1216
33eac552 1217 if (boolean == 0) {
0de7acce
LP
1218 /* no: User namespacing off */
1219 arg_userns_mode = USER_NAMESPACE_NO;
1220 arg_uid_shift = UID_INVALID;
1221 arg_uid_range = UINT32_C(0x10000);
33eac552 1222 } else if (boolean > 0) {
0de7acce
LP
1223 /* yes: User namespacing on, UID range is read from root dir */
1224 arg_userns_mode = USER_NAMESPACE_FIXED;
1225 arg_uid_shift = UID_INVALID;
1226 arg_uid_range = UINT32_C(0x10000);
1227 } else if (streq(optarg, "pick")) {
1228 /* pick: User namespacing on, UID range is picked randomly */
6c045a99
LP
1229 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1230 * implied by USER_NAMESPACE_PICK
33eac552 1231 * further down. */
0de7acce
LP
1232 arg_uid_shift = UID_INVALID;
1233 arg_uid_range = UINT32_C(0x10000);
33eac552
LP
1234
1235 } else if (streq(optarg, "identity")) {
6c2d70ce 1236 /* identity: User namespaces on, the 0…0xFFFF UID range is mapped to
33eac552
LP
1237 * itself, i.e. we don't actually map anything, but do take benefit of
1238 * isolation of capability sets. */
1239 arg_userns_mode = USER_NAMESPACE_FIXED;
1240 arg_uid_shift = 0;
1241 arg_uid_range = UINT32_C(0x10000);
0de7acce 1242 } else {
6c2058b3 1243 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1244 const char *range, *shift;
1245
0de7acce
LP
1246 /* anything else: User namespacing on, UID range is explicitly configured */
1247
6dac160c
LP
1248 range = strchr(optarg, ':');
1249 if (range) {
6c2058b3
ZJS
1250 buffer = strndup(optarg, range - optarg);
1251 if (!buffer)
1252 return log_oom();
1253 shift = buffer;
6dac160c
LP
1254
1255 range++;
bfd292ec
ZJS
1256 r = safe_atou32(range, &arg_uid_range);
1257 if (r < 0)
be715731 1258 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1259 } else
1260 shift = optarg;
1261
be715731
ZJS
1262 r = parse_uid(shift, &arg_uid_shift);
1263 if (r < 0)
1264 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1265
1266 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c 1267
58e13de5
LP
1268 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1269 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1270 }
be715731 1271
0de7acce 1272 arg_settings_mask |= SETTING_USERNS;
6dac160c 1273 break;
ae209204 1274 }
6dac160c 1275
0de7acce 1276 case 'U':
ccabee0d 1277 if (userns_supported()) {
6c045a99
LP
1278 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1279 * implied by USER_NAMESPACE_PICK
33eac552 1280 * further down. */
ccabee0d
LP
1281 arg_uid_shift = UID_INVALID;
1282 arg_uid_range = UINT32_C(0x10000);
1283
1284 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1285 }
1286
7336138e
LP
1287 break;
1288
0de7acce 1289 case ARG_PRIVATE_USERS_CHOWN:
6c045a99
LP
1290 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1291
1292 arg_settings_mask |= SETTING_USERNS;
1293 break;
1294
1295 case ARG_PRIVATE_USERS_OWNERSHIP:
1296 if (streq(optarg, "help")) {
1297 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1298 return 0;
1299 }
1300
1301 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1302 if (arg_userns_ownership < 0)
1303 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
0de7acce
LP
1304
1305 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1306 break;
1307
c6c8f6e2 1308 case ARG_KILL_SIGNAL:
5c828e66
LP
1309 if (streq(optarg, "help")) {
1310 DUMP_STRING_TABLE(signal, int, _NSIG);
1311 return 0;
1312 }
1313
29a3db75 1314 arg_kill_signal = signal_from_string(optarg);
baaa35ad 1315 if (arg_kill_signal < 0)
7211c853 1316 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
c6c8f6e2 1317
f757855e
LP
1318 arg_settings_mask |= SETTING_KILL_SIGNAL;
1319 break;
1320
1321 case ARG_SETTINGS:
1322
1323 /* no → do not read files
1324 * yes → read files, do not override cmdline, trust only subset
1325 * override → read files, override cmdline, trust only subset
1326 * trusted → read files, do not override cmdline, trust all
1327 */
1328
1329 r = parse_boolean(optarg);
1330 if (r < 0) {
1331 if (streq(optarg, "trusted")) {
1332 mask_all_settings = false;
1333 mask_no_settings = false;
1334 arg_settings_trusted = true;
1335
1336 } else if (streq(optarg, "override")) {
1337 mask_all_settings = false;
1338 mask_no_settings = true;
1339 arg_settings_trusted = -1;
1340 } else
1341 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1342 } else if (r > 0) {
1343 /* yes */
1344 mask_all_settings = false;
1345 mask_no_settings = false;
1346 arg_settings_trusted = -1;
1347 } else {
1348 /* no */
1349 mask_all_settings = true;
1350 mask_no_settings = false;
1351 arg_settings_trusted = false;
1352 }
1353
c6c8f6e2
LP
1354 break;
1355
5f932eb9 1356 case ARG_CHDIR:
baaa35ad
ZJS
1357 if (!path_is_absolute(optarg))
1358 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1359 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1360
1361 r = free_and_strdup(&arg_chdir, optarg);
1362 if (r < 0)
1363 return log_oom();
1364
1365 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1366 break;
1367
b53ede69
PW
1368 case ARG_PIVOT_ROOT:
1369 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1370 if (r < 0)
1371 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1372
1373 arg_settings_mask |= SETTING_PIVOT_ROOT;
1374 break;
1375
9c1e04d0
AP
1376 case ARG_NOTIFY_READY:
1377 r = parse_boolean(optarg);
baaa35ad
ZJS
1378 if (r < 0)
1379 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1380 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1381 arg_notify_ready = r;
1382 arg_settings_mask |= SETTING_NOTIFY_READY;
1383 break;
1384
4623e8e6 1385 case ARG_ROOT_HASH: {
89e62e0b 1386 _cleanup_free_ void *k = NULL;
4623e8e6
LP
1387 size_t l;
1388
1389 r = unhexmem(optarg, strlen(optarg), &k, &l);
1390 if (r < 0)
1391 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
89e62e0b 1392 if (l < sizeof(sd_id128_t))
da890466 1393 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128-bit long: %s", optarg);
4623e8e6 1394
89e62e0b
LP
1395 free_and_replace(arg_verity_settings.root_hash, k);
1396 arg_verity_settings.root_hash_size = l;
4623e8e6
LP
1397 break;
1398 }
1399
c2923fdc
LB
1400 case ARG_ROOT_HASH_SIG: {
1401 char *value;
89e62e0b
LP
1402 size_t l;
1403 void *p;
c2923fdc
LB
1404
1405 if ((value = startswith(optarg, "base64:"))) {
c2923fdc
LB
1406 r = unbase64mem(value, strlen(value), &p, &l);
1407 if (r < 0)
1408 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1409
c2923fdc 1410 } else {
89e62e0b 1411 r = read_full_file(optarg, (char**) &p, &l);
c2923fdc 1412 if (r < 0)
89e62e0b 1413 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
c2923fdc
LB
1414 }
1415
89e62e0b
LP
1416 free_and_replace(arg_verity_settings.root_hash_sig, p);
1417 arg_verity_settings.root_hash_sig_size = l;
c2923fdc
LB
1418 break;
1419 }
1420
89e62e0b 1421 case ARG_VERITY_DATA:
614b022c 1422 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
89e62e0b
LP
1423 if (r < 0)
1424 return r;
1425 break;
1426
960e4569
LP
1427 case ARG_SYSTEM_CALL_FILTER: {
1428 bool negative;
1429 const char *items;
1430
1431 negative = optarg[0] == '~';
1432 items = negative ? optarg + 1 : optarg;
1433
1434 for (;;) {
1435 _cleanup_free_ char *word = NULL;
1436
1437 r = extract_first_word(&items, &word, NULL, 0);
1438 if (r == 0)
1439 break;
1440 if (r == -ENOMEM)
1441 return log_oom();
1442 if (r < 0)
1443 return log_error_errno(r, "Failed to parse system call filter: %m");
1444
1445 if (negative)
6b000af4 1446 r = strv_extend(&arg_syscall_deny_list, word);
960e4569 1447 else
6b000af4 1448 r = strv_extend(&arg_syscall_allow_list, word);
960e4569
LP
1449 if (r < 0)
1450 return log_oom();
1451 }
1452
1453 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1454 break;
1455 }
1456
bf428efb
LP
1457 case ARG_RLIMIT: {
1458 const char *eq;
622ecfa8 1459 _cleanup_free_ char *name = NULL;
bf428efb
LP
1460 int rl;
1461
5c828e66
LP
1462 if (streq(optarg, "help")) {
1463 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1464 return 0;
1465 }
1466
bf428efb 1467 eq = strchr(optarg, '=');
baaa35ad
ZJS
1468 if (!eq)
1469 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1470 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1471
1472 name = strndup(optarg, eq - optarg);
1473 if (!name)
1474 return log_oom();
1475
1476 rl = rlimit_from_string_harder(name);
baaa35ad 1477 if (rl < 0)
7211c853 1478 return log_error_errno(rl, "Unknown resource limit: %s", name);
bf428efb
LP
1479
1480 if (!arg_rlimit[rl]) {
1481 arg_rlimit[rl] = new0(struct rlimit, 1);
1482 if (!arg_rlimit[rl])
1483 return log_oom();
1484 }
1485
1486 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1487 if (r < 0)
1488 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1489
1490 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1491 break;
1492 }
1493
81f345df
LP
1494 case ARG_OOM_SCORE_ADJUST:
1495 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1496 if (r < 0)
1497 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1498
1499 arg_oom_score_adjust_set = true;
1500 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1501 break;
1502
d107bb7d 1503 case ARG_CPU_AFFINITY: {
0985c7c4 1504 CPUSet cpuset;
d107bb7d
LP
1505
1506 r = parse_cpu_set(optarg, &cpuset);
1507 if (r < 0)
0985c7c4 1508 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1509
0985c7c4
ZJS
1510 cpu_set_reset(&arg_cpu_set);
1511 arg_cpu_set = cpuset;
d107bb7d
LP
1512 arg_settings_mask |= SETTING_CPU_AFFINITY;
1513 break;
1514 }
1515
09d423e9
LP
1516 case ARG_RESOLV_CONF:
1517 if (streq(optarg, "help")) {
1518 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1519 return 0;
1520 }
1521
1522 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad 1523 if (arg_resolv_conf < 0)
7211c853 1524 return log_error_errno(arg_resolv_conf,
baaa35ad 1525 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1526
1527 arg_settings_mask |= SETTING_RESOLV_CONF;
1528 break;
1529
1688841f
LP
1530 case ARG_TIMEZONE:
1531 if (streq(optarg, "help")) {
1532 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1533 return 0;
1534 }
1535
1536 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad 1537 if (arg_timezone < 0)
7211c853 1538 return log_error_errno(arg_timezone,
baaa35ad 1539 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1540
1541 arg_settings_mask |= SETTING_TIMEZONE;
1542 break;
1543
de40a303 1544 case ARG_CONSOLE:
dce66ffe
ZJS
1545 r = handle_arg_console(optarg);
1546 if (r <= 0)
1547 return r;
de40a303
LP
1548 break;
1549
1550 case 'P':
1551 case ARG_PIPE:
dce66ffe
ZJS
1552 r = handle_arg_console("pipe");
1553 if (r <= 0)
1554 return r;
de40a303
LP
1555 break;
1556
bb068de0
ZJS
1557 case ARG_NO_PAGER:
1558 arg_pager_flags |= PAGER_DISABLE;
1559 break;
1560
3652872a
LP
1561 case ARG_SET_CREDENTIAL: {
1562 _cleanup_free_ char *word = NULL, *data = NULL;
1563 const char *p = optarg;
1564 Credential *a;
e437538f 1565 ssize_t l;
3652872a
LP
1566
1567 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1568 if (r == -ENOMEM)
1569 return log_oom();
1570 if (r < 0)
1571 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1572 if (r == 0 || !p)
1573 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1574
1575 if (!credential_name_valid(word))
1576 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1577
12d729b2 1578 for (size_t i = 0; i < arg_n_credentials; i++)
3652872a
LP
1579 if (streq(arg_credentials[i].id, word))
1580 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1581
1582 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1583 if (l < 0)
1584 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1585
1586 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1587 if (!a)
1588 return log_oom();
1589
1590 a[arg_n_credentials++] = (Credential) {
1591 .id = TAKE_PTR(word),
1592 .data = TAKE_PTR(data),
1593 .size = l,
1594 };
1595
1596 arg_credentials = a;
1597
1598 arg_settings_mask |= SETTING_CREDENTIALS;
1599 break;
1600 }
1601
1602 case ARG_LOAD_CREDENTIAL: {
1603 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1604 _cleanup_(erase_and_freep) char *data = NULL;
1605 _cleanup_free_ char *word = NULL, *j = NULL;
1606 const char *p = optarg;
1607 Credential *a;
1608 size_t size, i;
1609
1610 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1611 if (r == -ENOMEM)
1612 return log_oom();
1613 if (r < 0)
c941b650 1614 return log_error_errno(r, "Failed to parse --load-credential= parameter: %m");
3652872a 1615 if (r == 0 || !p)
c941b650 1616 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --load-credential=: %s", optarg);
3652872a
LP
1617
1618 if (!credential_name_valid(word))
1619 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1620
1621 for (i = 0; i < arg_n_credentials; i++)
1622 if (streq(arg_credentials[i].id, word))
1623 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1624
1625 if (path_is_absolute(p))
1626 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1627 else {
1628 const char *e;
1629
786d19fd
LP
1630 r = get_credentials_dir(&e);
1631 if (r < 0)
1632 return log_error_errno(r, "Credential not available (no credentials passed at all): %s", word);
3652872a
LP
1633
1634 j = path_join(e, p);
1635 if (!j)
1636 return log_oom();
1637 }
1638
986311c2
LP
1639 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1640 flags,
1641 NULL,
1642 &data, &size);
3652872a
LP
1643 if (r < 0)
1644 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1645
1646 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1647 if (!a)
1648 return log_oom();
1649
1650 a[arg_n_credentials++] = (Credential) {
1651 .id = TAKE_PTR(word),
1652 .data = TAKE_PTR(data),
1653 .size = size,
1654 };
1655
1656 arg_credentials = a;
1657
1658 arg_settings_mask |= SETTING_CREDENTIALS;
1659 break;
1660 }
1661
2f893044
LP
1662 case ARG_BIND_USER:
1663 if (!valid_user_group_name(optarg, 0))
1664 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1665
1666 if (strv_extend(&arg_bind_user, optarg) < 0)
1667 return log_oom();
1668
1669 arg_settings_mask |= SETTING_BIND_USER;
1670 break;
1671
4a4654e0
LP
1672 case ARG_SUPPRESS_SYNC:
1673 r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1674 if (r < 0)
1675 return r;
1676
1677 arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1678 break;
1679
06e78680
YW
1680 case ARG_IMAGE_POLICY:
1681 r = parse_image_policy_argument(optarg, &arg_image_policy);
84be0c71 1682 if (r < 0)
06e78680 1683 return r;
84be0c71 1684 break;
84be0c71 1685
88213476
LP
1686 case '?':
1687 return -EINVAL;
1688
1689 default:
04499a70 1690 assert_not_reached();
88213476 1691 }
88213476 1692
60f1ec13
LP
1693 if (argc > optind) {
1694 strv_free(arg_parameters);
1695 arg_parameters = strv_copy(argv + optind);
1696 if (!arg_parameters)
1697 return log_oom();
d7bea6b6 1698
60f1ec13
LP
1699 arg_settings_mask |= SETTING_START_MODE;
1700 }
1701
1702 if (arg_ephemeral && arg_template && !arg_directory)
1703 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1704 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1705 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1706 * --directory=". */
1707 arg_directory = TAKE_PTR(arg_template);
1708
2642d22a
DDM
1709 arg_caps_retain |= plus;
1710 arg_caps_retain |= arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0;
1711
1712 /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have
1713 * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to
1714 * indicate that. */
1715 if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0)
1716 arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE);
1717
1718 arg_caps_retain &= ~minus;
60f1ec13 1719
de40a303 1720 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1721 r = parse_environment();
1722 if (r < 0)
1723 return r;
de40a303 1724
60f1ec13
LP
1725 /* Load all settings from .nspawn files */
1726 if (mask_no_settings)
1727 arg_settings_mask = 0;
1728
1729 /* Don't load any settings from .nspawn files */
1730 if (mask_all_settings)
1731 arg_settings_mask = _SETTINGS_MASK_ALL;
1732
1733 return 1;
1734}
1735
1736static int verify_arguments(void) {
1737 int r;
a6b5216c 1738
75b0d8b8
ZJS
1739 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1740 /* If we are running the stub init in the container, we don't need to look at what the init
1741 * in the container supports, because we are not using it. Let's immediately pick the right
1742 * setting based on the host system configuration.
1743 *
1744 * We only do this, if the user didn't use an environment variable to override the detection.
1745 */
1746
1747 r = cg_all_unified();
1748 if (r < 0)
1749 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1750 if (r > 0)
1751 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1752 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1753 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1754 else
1755 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1756 }
1757
4f086aab
SU
1758 if (arg_userns_mode != USER_NAMESPACE_NO)
1759 arg_mount_settings |= MOUNT_USE_USERNS;
1760
1761 if (arg_private_network)
1762 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1763
48a8d337
LB
1764 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1765 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1766 arg_register = false;
baaa35ad 1767 if (arg_start_mode != START_PID1)
60f1ec13 1768 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1769 }
eb91eb18 1770
6c045a99
LP
1771 if (arg_userns_ownership < 0)
1772 arg_userns_ownership =
f61c7f88 1773 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
6c045a99 1774 USER_NAMESPACE_OWNERSHIP_OFF;
0e7ac751 1775
60f1ec13
LP
1776 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1777 arg_kill_signal = SIGRTMIN+3;
1778
e5a4bb0d
LP
1779 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1780 arg_read_only = true;
1781
2436ea76
DDM
1782 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1783 arg_read_only = true;
1784
baaa35ad 1785 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1786 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1787 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1788 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1789
baaa35ad 1790 if (arg_directory && arg_image)
60f1ec13 1791 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1792
baaa35ad 1793 if (arg_template && arg_image)
60f1ec13 1794 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1795
baaa35ad 1796 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1797 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1798
baaa35ad 1799 if (arg_ephemeral && arg_template)
60f1ec13 1800 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1801
baaa35ad 1802 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1803 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1804
baaa35ad 1805 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1806 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1807
6c045a99 1808 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
de40a303 1809 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
6c045a99 1810 "--read-only and --private-users-ownership=chown may not be combined.");
f757855e 1811
6c045a99
LP
1812 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1813 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1814 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1815 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1816 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
e5a4bb0d 1817
679ecd36
SZ
1818 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1819 * we need to error out, to avoid conflicts between different network options. */
60f1ec13
LP
1820 if (arg_network_namespace_path &&
1821 (arg_network_interfaces || arg_network_macvlan ||
1822 arg_network_ipvlan || arg_network_veth_extra ||
1823 arg_network_bridge || arg_network_zone ||
679ecd36 1824 arg_network_veth))
de40a303 1825 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1826
60f1ec13 1827 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1828 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1829 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1830
baaa35ad 1831 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1832 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1833
baaa35ad 1834 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1835 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1836
baaa35ad 1837 if (arg_expose_ports && !arg_private_network)
60f1ec13 1838 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1839
88fc9c9b 1840 if (arg_caps_ambient) {
f5fbe71d 1841 if (arg_caps_ambient == UINT64_MAX)
88fc9c9b
TH
1842 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1843
1844 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1845 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1846
1847 if (arg_start_mode == START_BOOT)
1848 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1849 }
1850
2f893044
LP
1851 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1852 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1853
1854 /* Drop duplicate --bind-user= entries */
1855 strv_uniq(arg_bind_user);
1856
60f1ec13
LP
1857 r = custom_mount_check_all();
1858 if (r < 0)
1859 return r;
c6c8f6e2 1860
f757855e 1861 return 0;
88213476
LP
1862}
1863
2f091b1b
TM
1864static int verify_network_interfaces_initialized(void) {
1865 int r;
1866 r = test_network_interfaces_initialized(arg_network_interfaces);
1867 if (r < 0)
1868 return r;
1869
1870 r = test_network_interfaces_initialized(arg_network_macvlan);
1871 if (r < 0)
1872 return r;
1873
1874 r = test_network_interfaces_initialized(arg_network_ipvlan);
1875 if (r < 0)
1876 return r;
1877
1878 return 0;
1879}
1880
91181e07 1881int userns_lchown(const char *p, uid_t uid, gid_t gid) {
03cfe0d5
LP
1882 assert(p);
1883
0de7acce 1884 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1885 return 0;
1886
1887 if (uid == UID_INVALID && gid == GID_INVALID)
1888 return 0;
1889
1890 if (uid != UID_INVALID) {
1891 uid += arg_uid_shift;
1892
1893 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1894 return -EOVERFLOW;
1895 }
1896
1897 if (gid != GID_INVALID) {
1898 gid += (gid_t) arg_uid_shift;
1899
1900 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1901 return -EOVERFLOW;
1902 }
1903
7c248223 1904 return RET_NERRNO(lchown(p, uid, gid));
b12afc8c
LP
1905}
1906
/* Creates the directory 'path' below 'root' with the given mode and hands it to userns_lchown() for
 * ownership adjustment. A directory that already exists is left untouched (no chown) and reported as
 * success. Returns 0 on success, a negative errno-style value otherwise. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = RET_NERRNO(mkdir(full, mode));
        if (r < 0)
                return r == -EEXIST ? 0 : r;

        return userns_lchown(full, uid, gid);
}
1920
/* Matches 'path' against the two zoneinfo directory prefixes (relative "../usr/share/zoneinfo/" and
 * absolute "/usr/share/zoneinfo/") and returns whatever PATH_STARTSWITH_SET() yields for them.
 * NOTE(review): presumably that is the part of the path following the prefix, or NULL if neither
 * prefix matches -- confirm against the macro definition. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1927
83205269
LP
1928static bool etc_writable(void) {
1929 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1930}
1931
e58a1277 1932static int setup_timezone(const char *dest) {
1688841f
LP
1933 _cleanup_free_ char *p = NULL, *etc = NULL;
1934 const char *where, *check;
1935 TimezoneMode m;
d4036145 1936 int r;
f8440af5 1937
e58a1277
LP
1938 assert(dest);
1939
1688841f 1940 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1941 r = readlink_malloc("/etc/localtime", &p);
1942 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1943 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1944 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1945 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1946 else if (r < 0) {
1947 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1948 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1949 * file.
1950 *
1951 * Example:
1952 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1953 */
1954 return 0;
1955 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1956 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1957 else
1958 m = arg_timezone;
1959 } else
1960 m = arg_timezone;
1961
1962 if (m == TIMEZONE_OFF)
1963 return 0;
1964
f461a28d 1965 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1966 if (r < 0) {
1688841f 1967 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1968 return 0;
1969 }
1970
1688841f
LP
1971 where = strjoina(etc, "/localtime");
1972
1973 switch (m) {
1974
1975 case TIMEZONE_DELETE:
1976 if (unlink(where) < 0)
1977 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1978
d4036145 1979 return 0;
d4036145 1980
1688841f
LP
1981 case TIMEZONE_SYMLINK: {
1982 _cleanup_free_ char *q = NULL;
1983 const char *z, *what;
4d1c38b8 1984
1688841f
LP
1985 z = timezone_from_path(p);
1986 if (!z) {
1987 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1988 return 0;
1688841f 1989 }
d4036145 1990
1688841f
LP
1991 r = readlink_malloc(where, &q);
1992 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1993 return 0; /* Already pointing to the right place? Then do nothing .. */
1994
1995 check = strjoina(dest, "/usr/share/zoneinfo/", z);
f461a28d 1996 r = chase(check, dest, 0, NULL, NULL);
1688841f
LP
1997 if (r < 0)
1998 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1999 else {
2000 if (unlink(where) < 0 && errno != ENOENT) {
2001 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
2002 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
2003 return 0;
2004 }
2005
2006 what = strjoina("../usr/share/zoneinfo/", z);
2007 if (symlink(what, where) < 0) {
2008 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
2009 errno, "Failed to correct timezone of container, ignoring: %m");
2010 return 0;
2011 }
2012
2013 break;
2014 }
2015
2016 _fallthrough_;
d4036145 2017 }
68fb0892 2018
1688841f
LP
2019 case TIMEZONE_BIND: {
2020 _cleanup_free_ char *resolved = NULL;
2021 int found;
2022
f461a28d 2023 found = chase(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
2024 if (found < 0) {
2025 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
2026 return 0;
2027 }
2028
2029 if (found == 0) /* missing? */
2030 (void) touch(resolved);
2031
511a8cfe 2032 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1688841f 2033 if (r >= 0)
511a8cfe 2034 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1688841f
LP
2035
2036 _fallthrough_;
79d80fc1 2037 }
4d9f07b4 2038
1688841f
LP
2039 case TIMEZONE_COPY:
2040 /* If mounting failed, try to copy */
7c2f5495 2041 r = copy_file_atomic("/etc/localtime", where, 0644, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
2042 if (r < 0) {
2043 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2044 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
2045 return 0;
2046 }
2047
2048 break;
2049
2050 default:
04499a70 2051 assert_not_reached();
d4036145 2052 }
e58a1277 2053
1688841f 2054 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
2055 r = userns_lchown(where, 0, 0);
2056 if (r < 0)
1688841f 2057 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 2058
e58a1277 2059 return 0;
88213476
LP
2060}
2061
09d423e9
LP
/* Checks whether 'path' exists. Returns 1 if it does, 0 if it is missing (ENOENT), and a negative
 * errno-style value (logged at debug level) if the check itself failed. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2074
7357272e 2075static int resolved_listening(void) {
b8ea7a6e 2076 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 2077 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 2078 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
2079 int r;
2080
7357272e 2081 /* Check if resolved is listening */
b053cd5f
LP
2082
2083 r = sd_bus_open_system(&bus);
2084 if (r < 0)
b8ea7a6e 2085 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 2086
7357272e 2087 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
2088 if (r < 0)
2089 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2090 if (r == 0)
2091 return 0;
7357272e 2092
7f8a85e6 2093 r = bus_get_property_string(bus, bus_resolve_mgr, "DNSStubListener", &error, &dns_stub_listener_mode);
7357272e 2094 if (r < 0)
b8ea7a6e 2095 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
2096
2097 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
2098}
2099
2547bb41 2100static int setup_resolv_conf(const char *dest) {
09d423e9
LP
2101 _cleanup_free_ char *etc = NULL;
2102 const char *where, *what;
2103 ResolvConfMode m;
2104 int r;
2547bb41
LP
2105
2106 assert(dest);
2107
09d423e9
LP
2108 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2109 if (arg_private_network)
2110 m = RESOLV_CONF_OFF;
86775e35
LP
2111 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2112 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
09d423e9 2113 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 2114 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 2115 else
83205269 2116 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
86775e35 2117
09d423e9
LP
2118 } else
2119 m = arg_resolv_conf;
2120
2121 if (m == RESOLV_CONF_OFF)
2547bb41
LP
2122 return 0;
2123
f461a28d 2124 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
2125 if (r < 0) {
2126 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2127 return 0;
2128 }
2129
2130 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
2131
2132 if (m == RESOLV_CONF_DELETE) {
2133 if (unlink(where) < 0)
2134 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2135
87447ae4
LP
2136 return 0;
2137 }
79d80fc1 2138
86775e35
LP
2139 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2140 what = PRIVATE_STATIC_RESOLV_CONF;
2141 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2142 what = PRIVATE_UPLINK_RESOLV_CONF;
2143 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2144 what = PRIVATE_STUB_RESOLV_CONF;
09d423e9
LP
2145 else
2146 what = "/etc/resolv.conf";
87447ae4 2147
86775e35 2148 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
09d423e9
LP
2149 _cleanup_free_ char *resolved = NULL;
2150 int found;
2151
d404c8d8 2152 found = chase(where, dest, CHASE_NONEXISTENT|CHASE_NOFOLLOW, &resolved, NULL);
09d423e9
LP
2153 if (found < 0) {
2154 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2155 return 0;
2156 }
3539724c 2157
87447ae4
LP
2158 if (found == 0) /* missing? */
2159 (void) touch(resolved);
5367354d 2160
511a8cfe 2161 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 2162 if (r >= 0)
511a8cfe 2163 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
86775e35
LP
2164
2165 /* If that didn't work, let's copy the file */
3539724c
LP
2166 }
2167
86775e35 2168 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
7c2f5495 2169 r = copy_file_atomic(what, where, 0644, COPY_REFLINK|COPY_REPLACE);
86775e35 2170 else
7c2f5495 2171 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, COPY_REFLINK);
79d80fc1 2172 if (r < 0) {
3539724c
LP
2173 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2174 * resolved or something similar runs inside and the symlink points there.
68a313c5 2175 *
3539724c 2176 * If the disk image is read-only, there's also no point in complaining.
68a313c5 2177 */
86775e35
LP
2178 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2179 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 2180 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
2181 return 0;
2182 }
2547bb41 2183
03cfe0d5
LP
2184 r = userns_lchown(where, 0, 0);
2185 if (r < 0)
3539724c 2186 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 2187
2547bb41
LP
2188 return 0;
2189}
2190
1e4f1671 2191static int setup_boot_id(void) {
cdde6ba6
LP
2192 _cleanup_(unlink_and_freep) char *from = NULL;
2193 _cleanup_free_ char *path = NULL;
3bbaff3e 2194 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 2195 const char *to;
04bc4a3f
LP
2196 int r;
2197
1eacc470 2198 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 2199
1eacc470 2200 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
2201 if (r < 0)
2202 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
2203
2204 r = sd_id128_randomize(&rnd);
f647962d
MS
2205 if (r < 0)
2206 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 2207
b40c8ebd 2208 r = id128_write(path, ID128_FORMAT_UUID, rnd);
f647962d
MS
2209 if (r < 0)
2210 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 2211
cdde6ba6
LP
2212 from = TAKE_PTR(path);
2213 to = "/proc/sys/kernel/random/boot_id";
2214
511a8cfe 2215 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
2216 if (r < 0)
2217 return r;
04bc4a3f 2218
511a8cfe 2219 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
2220}
2221
e58a1277 2222static int copy_devnodes(const char *dest) {
88213476
LP
2223 static const char devnodes[] =
2224 "null\0"
2225 "zero\0"
2226 "full\0"
2227 "random\0"
2228 "urandom\0"
85614d66
TG
2229 "tty\0"
2230 "net/tun\0";
88213476 2231
e58a1277 2232 int r = 0;
a258bf26
LP
2233
2234 assert(dest);
124640f1 2235
52f05ef2 2236 BLOCK_WITH_UMASK(0000);
88213476 2237
03cfe0d5
LP
2238 /* Create /dev/net, so that we can create /dev/net/tun in it */
2239 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2240 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2241
88213476 2242 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 2243 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 2244 struct stat st;
88213476 2245
c6134d3e 2246 from = path_join("/dev/", d);
8967f291
LP
2247 if (!from)
2248 return log_oom();
2249
c6134d3e 2250 to = path_join(dest, from);
8967f291
LP
2251 if (!to)
2252 return log_oom();
88213476
LP
2253
2254 if (stat(from, &st) < 0) {
2255
4a62c710
MS
2256 if (errno != ENOENT)
2257 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 2258
baaa35ad
ZJS
2259 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2260 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2261 "%s is not a char or block device, cannot copy.", from);
2262 else {
8dfce114
LP
2263 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2264
81f5049b 2265 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 2266 /* Explicitly warn the user when /dev is already populated. */
41eb4362 2267 if (errno == EEXIST)
8dbf71ec 2268 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
2269 if (errno != EPERM)
2270 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2271
8dfce114 2272 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
2273 r = touch(to);
2274 if (r < 0)
2275 return log_error_errno(r, "touch (%s) failed: %m", to);
511a8cfe 2276 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
60e76d48
ZJS
2277 if (r < 0)
2278 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 2279 }
6278cf60 2280
03cfe0d5
LP
2281 r = userns_lchown(to, 0, 0);
2282 if (r < 0)
2283 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2284
657ee2d8 2285 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2286 if (!dn)
2287 return log_oom();
2288
2289 r = userns_mkdir(dest, dn, 0755, 0, 0);
2290 if (r < 0)
2291 return log_error_errno(r, "Failed to create '%s': %m", dn);
2292
2293 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2294 return log_oom();
2295
c6134d3e 2296 prefixed = path_join(dest, sl);
8dfce114
LP
2297 if (!prefixed)
2298 return log_oom();
2299
2d9b74ba 2300 t = path_join("..", d);
8dfce114
LP
2301 if (!t)
2302 return log_oom();
2303
2304 if (symlink(t, prefixed) < 0)
2305 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2306 }
88213476
LP
2307 }
2308
e58a1277
LP
2309 return r;
2310}
88213476 2311
de40a303 2312static int make_extra_nodes(const char *dest) {
de40a303
LP
2313 size_t i;
2314 int r;
2315
52f05ef2 2316 BLOCK_WITH_UMASK(0000);
de40a303
LP
2317
2318 for (i = 0; i < arg_n_extra_nodes; i++) {
2319 _cleanup_free_ char *path = NULL;
2320 DeviceNode *n = arg_extra_nodes + i;
2321
c6134d3e 2322 path = path_join(dest, n->path);
de40a303
LP
2323 if (!path)
2324 return log_oom();
2325
2326 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2327 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2328
2329 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2330 if (r < 0)
2331 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2332 }
2333
2334 return 0;
2335}
2336
03cfe0d5
LP
2337static int setup_pts(const char *dest) {
2338 _cleanup_free_ char *options = NULL;
2339 const char *p;
709f6e46 2340 int r;
03cfe0d5 2341
349cc4a5 2342#if HAVE_SELINUX
03cfe0d5
LP
2343 if (arg_selinux_apifs_context)
2344 (void) asprintf(&options,
3dce8915 2345 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2346 arg_uid_shift + TTY_GID,
2347 arg_selinux_apifs_context);
2348 else
2349#endif
2350 (void) asprintf(&options,
3dce8915 2351 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2352 arg_uid_shift + TTY_GID);
f2d88580 2353
03cfe0d5 2354 if (!options)
f2d88580
LP
2355 return log_oom();
2356
03cfe0d5 2357 /* Mount /dev/pts itself */
cc9fce65 2358 p = prefix_roota(dest, "/dev/pts");
3f692e2e 2359 r = RET_NERRNO(mkdir(p, 0755));
dae8b82e
ZJS
2360 if (r < 0)
2361 return log_error_errno(r, "Failed to create /dev/pts: %m");
2362
511a8cfe 2363 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
60e76d48
ZJS
2364 if (r < 0)
2365 return r;
709f6e46
MS
2366 r = userns_lchown(p, 0, 0);
2367 if (r < 0)
2368 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2369
2370 /* Create /dev/ptmx symlink */
2371 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2372 if (symlink("pts/ptmx", p) < 0)
2373 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2374 r = userns_lchown(p, 0, 0);
2375 if (r < 0)
2376 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2377
03cfe0d5
LP
2378 /* And fix /dev/pts/ptmx ownership */
2379 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2380 r = userns_lchown(p, 0, 0);
2381 if (r < 0)
2382 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2383
f2d88580
LP
2384 return 0;
2385}
2386
3acc84eb 2387static int setup_stdio_as_dev_console(void) {
5bb1d7fb 2388 _cleanup_close_ int terminal = -EBADF;
e58a1277 2389 int r;
e58a1277 2390
335d2ead
LP
2391 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2392 * explicitly, if we are configured to. */
2393 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
3acc84eb
FB
2394 if (terminal < 0)
2395 return log_error_errno(terminal, "Failed to open console: %m");
e58a1277 2396
3acc84eb
FB
2397 /* Make sure we can continue logging to the original stderr, even if
2398 * stderr points elsewhere now */
2399 r = log_dup_console();
2400 if (r < 0)
2401 return log_error_errno(r, "Failed to duplicate stderr: %m");
de40a303 2402
3acc84eb
FB
2403 /* invalidates 'terminal' on success and failure */
2404 r = rearrange_stdio(terminal, terminal, terminal);
2fef50cd 2405 TAKE_FD(terminal);
f647962d 2406 if (r < 0)
3acc84eb
FB
2407 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2408
2409 return 0;
2410}
88213476 2411
3acc84eb
FB
2412static int setup_dev_console(const char *console) {
2413 _cleanup_free_ char *p = NULL;
2414 int r;
a258bf26 2415
3acc84eb
FB
2416 /* Create /dev/console symlink */
2417 r = path_make_relative("/dev", console, &p);
81f5049b 2418 if (r < 0)
3acc84eb
FB
2419 return log_error_errno(r, "Failed to create relative path: %m");
2420
2421 if (symlink(p, "/dev/console") < 0)
2422 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2423
3acc84eb 2424 return 0;
e58a1277
LP
2425}
2426
8e5430c4
LP
2427static int setup_keyring(void) {
2428 key_serial_t keyring;
2429
6b000af4
LP
2430 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2431 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2432 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2433 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2434 * into the container. */
8e5430c4
LP
2435
2436 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2437 if (keyring == -1) {
2438 if (errno == ENOSYS)
2439 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
065b4774 2440 else if (ERRNO_IS_PRIVILEGE(errno))
8e5430c4
LP
2441 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2442 else
2443 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2444 }
2445
2446 return 0;
2447}
2448
3652872a
LP
2449static int setup_credentials(const char *root) {
2450 const char *q;
2451 int r;
2452
2453 if (arg_n_credentials <= 0)
2454 return 0;
2455
2456 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2457 if (r < 0)
2458 return log_error_errno(r, "Failed to create /run/host: %m");
2459
2460 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2461 if (r < 0)
2462 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2463
2464 q = prefix_roota(root, "/run/host/credentials");
511a8cfe 2465 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
3652872a
LP
2466 if (r < 0)
2467 return r;
2468
2469 for (size_t i = 0; i < arg_n_credentials; i++) {
2470 _cleanup_free_ char *j = NULL;
254d1313 2471 _cleanup_close_ int fd = -EBADF;
3652872a
LP
2472
2473 j = path_join(q, arg_credentials[i].id);
2474 if (!j)
2475 return log_oom();
2476
2477 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2478 if (fd < 0)
2479 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2480
e22c60a9 2481 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size);
3652872a
LP
2482 if (r < 0)
2483 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2484
2485 if (fchmod(fd, 0400) < 0)
2486 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2487
2488 if (arg_userns_mode != USER_NAMESPACE_NO) {
2489 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2490 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2491 }
2492 }
2493
2494 if (chmod(q, 0500) < 0)
2495 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2496
2497 r = userns_lchown(q, 0, 0);
2498 if (r < 0)
2499 return r;
2500
2501 /* Make both mount and superblock read-only now */
511a8cfe 2502 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
3652872a
LP
2503 if (r < 0)
2504 return r;
2505
511a8cfe 2506 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
3652872a
LP
2507}
2508
5d9d3fcb 2509static int setup_kmsg(int fd_inner_socket) {
9ec5a93c
LP
2510 _cleanup_(unlink_and_freep) char *from = NULL;
2511 _cleanup_free_ char *fifo = NULL;
254d1313 2512 _cleanup_close_ int fd = -EBADF;
9ec5a93c 2513 int r;
e58a1277 2514
5d9d3fcb 2515 assert(fd_inner_socket >= 0);
a258bf26 2516
52f05ef2 2517 BLOCK_WITH_UMASK(0000);
a258bf26 2518
30fd9a2d 2519 /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2520 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2521 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2522 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2523
1eacc470 2524 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2525 if (r < 0)
2526 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2527
9ec5a93c 2528 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2529 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2530
2531 from = TAKE_PTR(fifo);
9ec5a93c 2532
511a8cfe 2533 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2534 if (r < 0)
2535 return r;
e58a1277 2536
669fc4e5 2537 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2538 if (fd < 0)
2539 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2540
9ec5a93c 2541 /* Store away the fd in the socket, so that it stays open as long as we run the child */
5d9d3fcb 2542 r = send_one_fd(fd_inner_socket, fd, 0);
d9603714
DH
2543 if (r < 0)
2544 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2545
25ea79fe 2546 return 0;
88213476
LP
2547}
2548
761cf19d 2549struct ExposeArgs {
deff68e7
FW
2550 union in_addr_union address4;
2551 union in_addr_union address6;
761cf19d
FW
2552 struct FirewallContext *fw_ctx;
2553};
2554
1c4baffc 2555static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
99534007 2556 struct ExposeArgs *args = ASSERT_PTR(userdata);
6d0b55c2
LP
2557
2558 assert(rtnl);
2559 assert(m);
6d0b55c2 2560
fb9044cb
LP
2561 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2562 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
6d0b55c2
LP
2563 return 0;
2564}
2565
3a74cea5 2566static int setup_hostname(void) {
c818eef1 2567 int r;
3a74cea5 2568
0c582db0 2569 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2570 return 0;
2571
c818eef1
LP
2572 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2573 if (r < 0)
2574 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2575
7027ff61 2576 return 0;
3a74cea5
LP
2577}
2578
57fb9fb5 2579static int setup_journal(const char *directory) {
0f5e1382 2580 _cleanup_free_ char *d = NULL;
5980d463 2581 const char *p, *q;
b2238e38 2582 sd_id128_t this_id;
8054d749 2583 bool try;
57fb9fb5
LP
2584 int r;
2585
df9a75e4
LP
2586 /* Don't link journals in ephemeral mode */
2587 if (arg_ephemeral)
2588 return 0;
2589
8054d749
LP
2590 if (arg_link_journal == LINK_NO)
2591 return 0;
2592
2593 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2594
4d680aee 2595 r = sd_id128_get_machine(&this_id);
f647962d
MS
2596 if (r < 0)
2597 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2598
e01ff70a 2599 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2600 log_full(try ? LOG_WARNING : LOG_ERR,
85b55869 2601 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
8054d749 2602 if (try)
4d680aee 2603 return 0;
df9a75e4 2604 return -EEXIST;
4d680aee
ZJS
2605 }
2606
369ca6da
ZJS
2607 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2608 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2609 if (r < 0) {
2610 bool ignore = r == -EROFS && try;
2611 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2612 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2613 return ignore ? 0 : r;
2614 }
2615 }
03cfe0d5 2616
85b55869 2617 p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
03cfe0d5 2618 q = prefix_roota(directory, p);
27407a01 2619
e1873695 2620 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2621 if (try)
2622 return 0;
27407a01 2623
baaa35ad
ZJS
2624 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2625 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2626 }
2627
e1873695 2628 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2629 if (try)
2630 return 0;
57fb9fb5 2631
baaa35ad
ZJS
2632 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2633 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2634 }
2635
2636 r = readlink_and_make_absolute(p, &d);
2637 if (r >= 0) {
3742095b 2638 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2639 path_equal(d, q)) {
2640
03cfe0d5 2641 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2642 if (r < 0)
709f6e46 2643 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2644 return 0;
57fb9fb5
LP
2645 }
2646
4a62c710
MS
2647 if (unlink(p) < 0)
2648 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2649 } else if (r == -EINVAL) {
2650
2651 if (arg_link_journal == LINK_GUEST &&
2652 rmdir(p) < 0) {
2653
27407a01
ZJS
2654 if (errno == ENOTDIR) {
2655 log_error("%s already exists and is neither a symlink nor a directory", p);
2656 return r;
4314d33f
MS
2657 } else
2658 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2659 }
4314d33f
MS
2660 } else if (r != -ENOENT)
2661 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2662
2663 if (arg_link_journal == LINK_GUEST) {
2664
2665 if (symlink(q, p) < 0) {
8054d749 2666 if (try) {
56f64d95 2667 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2668 return 0;
4314d33f
MS
2669 } else
2670 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2671 }
2672
03cfe0d5 2673 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2674 if (r < 0)
709f6e46 2675 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2676 return 0;
57fb9fb5
LP
2677 }
2678
2679 if (arg_link_journal == LINK_HOST) {
ccddd104 2680 /* don't create parents here — if the host doesn't have
574edc90 2681 * permanent journal set up, don't force it here */
ba8e6c4d 2682
3f692e2e 2683 r = RET_NERRNO(mkdir(p, 0755));
dae8b82e 2684 if (r < 0 && r != -EEXIST) {
8054d749 2685 if (try) {
dae8b82e 2686 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2687 return 0;
4314d33f 2688 } else
dae8b82e 2689 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2690 }
2691
27407a01
ZJS
2692 } else if (access(p, F_OK) < 0)
2693 return 0;
57fb9fb5 2694
db55bbf2 2695 if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
cdb2b9d0
LP
2696 log_warning("%s is not empty, proceeding anyway.", q);
2697
03cfe0d5 2698 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2699 if (r < 0)
2700 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2701
511a8cfe 2702 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
60e76d48 2703 if (r < 0)
4a62c710 2704 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2705
27407a01 2706 return 0;
57fb9fb5
LP
2707}
2708
de40a303
LP
2709static int drop_capabilities(uid_t uid) {
2710 CapabilityQuintet q;
2711
2712 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2713 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2714 * arg_caps_retain. */
2715
2716 if (capability_quintet_is_set(&arg_full_capabilities)) {
2717 q = arg_full_capabilities;
2718
f5fbe71d 2719 if (q.bounding == UINT64_MAX)
de40a303
LP
2720 q.bounding = uid == 0 ? arg_caps_retain : 0;
2721
f5fbe71d 2722 if (q.effective == UINT64_MAX)
de40a303
LP
2723 q.effective = uid == 0 ? q.bounding : 0;
2724
f5fbe71d 2725 if (q.inheritable == UINT64_MAX)
88fc9c9b 2726 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2727
f5fbe71d 2728 if (q.permitted == UINT64_MAX)
88fc9c9b 2729 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2730
f5fbe71d 2731 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
88fc9c9b 2732 q.ambient = arg_caps_ambient;
f66ad460
AZ
2733
2734 if (capability_quintet_mangle(&q))
2735 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2736
2737 } else {
de40a303
LP
2738 q = (CapabilityQuintet) {
2739 .bounding = arg_caps_retain,
2740 .effective = uid == 0 ? arg_caps_retain : 0,
88fc9c9b
TH
2741 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2742 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
f5fbe71d 2743 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
de40a303
LP
2744 };
2745
f66ad460
AZ
2746 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2747 * in order to maintain the same behavior as systemd < 242. */
2748 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2749 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2750 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2751
2752 }
2753
de40a303 2754 return capability_quintet_enforce(&q);
88213476
LP
2755}
2756
db999e0f
LP
2757static int reset_audit_loginuid(void) {
2758 _cleanup_free_ char *p = NULL;
2759 int r;
2760
0c582db0 2761 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2762 return 0;
2763
2764 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2765 if (r == -ENOENT)
db999e0f 2766 return 0;
f647962d
MS
2767 if (r < 0)
2768 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2769
2770 /* Already reset? */
2771 if (streq(p, "4294967295"))
2772 return 0;
2773
57512c89 2774 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2775 if (r < 0) {
10a87006
LP
2776 log_error_errno(r,
2777 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2778 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2779 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2780 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2781 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2782
db999e0f 2783 sleep(5);
77b6e194 2784 }
db999e0f
LP
2785
2786 return 0;
77b6e194
LP
2787}
2788
e79581dd 2789static int mount_tunnel_dig(const char *root) {
785890ac 2790 const char *p, *q;
709f6e46 2791 int r;
785890ac
LP
2792
2793 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2794 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2795 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2796 (void) mkdir_p(p, 0600);
2797
5a27b395 2798 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
709f6e46 2799 if (r < 0)
5a27b395 2800 return log_error_errno(r, "Failed to create /run/host: %m");
03cfe0d5 2801
e79581dd 2802 r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0);
709f6e46 2803 if (r < 0)
e79581dd 2804 return log_error_errno(r, "Failed to create "NSPAWN_MOUNT_TUNNEL": %m");
03cfe0d5 2805
e79581dd 2806 q = prefix_roota(root, NSPAWN_MOUNT_TUNNEL);
511a8cfe 2807 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
60e76d48
ZJS
2808 if (r < 0)
2809 return r;
785890ac 2810
511a8cfe 2811 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
60e76d48
ZJS
2812 if (r < 0)
2813 return r;
785890ac 2814
e79581dd
CB
2815 return 0;
2816}
2817
2818static int mount_tunnel_open(void) {
2819 int r;
2820
2821 r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
2822 if (r < 0)
2823 return r;
2824
2825 return 0;
785890ac
LP
2826}
2827
317feb4d 2828static int setup_machine_id(const char *directory) {
3bbaff3e 2829 int r;
e01ff70a 2830
317feb4d
LP
2831 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2832 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2833 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2834 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2835 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2836 * container behaves nicely). */
2837
319477f1 2838 r = id128_get_machine(directory, &arg_uuid);
bb44fd07
ZJS
2839 if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) {
2840 /* If the file is missing, empty, or uninitialized, we don't mind */
317feb4d
LP
2841 if (sd_id128_is_null(arg_uuid)) {
2842 r = sd_id128_randomize(&arg_uuid);
2843 if (r < 0)
2844 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2845 }
bb44fd07
ZJS
2846 } else if (r < 0)
2847 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2848
e01ff70a
MS
2849 return 0;
2850}
2851
7336138e
LP
2852static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2853 int r;
2854
2855 assert(directory);
2856
6c045a99 2857 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
7336138e
LP
2858 return 0;
2859
2860 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2861 if (r == -EOPNOTSUPP)
2862 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2863 if (r == -EBADE)
2864 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2865 if (r < 0)
2866 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2867 if (r == 0)
2868 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2869 else
2870 log_debug("Patched directory tree to match UID/GID range.");
2871
2872 return r;
2873}
2874
113cea80 2875/*
6d416b9c
LS
2876 * Return values:
2877 * < 0 : wait_for_terminate() failed to get the state of the
2878 * container, the container was terminated by a signal, or
2879 * failed for an unknown reason. No change is made to the
2880 * container argument.
2881 * > 0 : The program executed in the container terminated with an
2882 * error. The exit code of the program executed in the
919699ec
LP
2883 * container is returned. The container argument has been set
2884 * to CONTAINER_TERMINATED.
6d416b9c
LS
2885 * 0 : The container is being rebooted, has been shut down or exited
2886 * successfully. The container argument has been set to either
2887 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2888 *
6d416b9c
LS
2889 * That is, success is indicated by a return value of zero, and an
2890 * error is indicated by a non-zero value.
113cea80
DH
2891 */
2892static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2893 siginfo_t status;
919699ec 2894 int r;
113cea80
DH
2895
2896 r = wait_for_terminate(pid, &status);
f647962d
MS
2897 if (r < 0)
2898 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2899
2900 switch (status.si_code) {
fddbb89c 2901
113cea80 2902 case CLD_EXITED:
b5a2179b 2903 if (status.si_status == 0)
919699ec 2904 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2905 else
919699ec 2906 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2907
919699ec
LP
2908 *container = CONTAINER_TERMINATED;
2909 return status.si_status;
113cea80
DH
2910
2911 case CLD_KILLED:
2912 if (status.si_status == SIGINT) {
919699ec 2913 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2914 *container = CONTAINER_TERMINATED;
919699ec
LP
2915 return 0;
2916
113cea80 2917 } else if (status.si_status == SIGHUP) {
919699ec 2918 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2919 *container = CONTAINER_REBOOTED;
919699ec 2920 return 0;
113cea80 2921 }
919699ec 2922
4831981d 2923 _fallthrough_;
113cea80 2924 case CLD_DUMPED:
baaa35ad
ZJS
2925 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2926 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2927
2928 default:
baaa35ad
ZJS
2929 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2930 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2931 }
113cea80
DH
2932}
2933
023fb90b
LP
2934static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2935 pid_t pid;
2936
4a0b58c4 2937 pid = PTR_TO_PID(userdata);
023fb90b 2938 if (pid > 0) {
c6c8f6e2 2939 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2940 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2941 sd_event_source_set_userdata(s, NULL);
2942 return 0;
2943 }
2944 }
2945
2946 sd_event_exit(sd_event_source_get_event(s), 0);
2947 return 0;
2948}
2949
6916b164 2950static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2951 pid_t pid;
2952
2953 assert(s);
2954 assert(ssi);
2955
2956 pid = PTR_TO_PID(userdata);
2957
6916b164
AU
2958 for (;;) {
2959 siginfo_t si = {};
abdb9b08 2960
6916b164
AU
2961 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2962 return log_error_errno(errno, "Failed to waitid(): %m");
2963 if (si.si_pid == 0) /* No pending children. */
2964 break;
abdb9b08 2965 if (si.si_pid == pid) {
6916b164
AU
2966 /* The main process we care for has exited. Return from
2967 * signal handler but leave the zombie. */
2968 sd_event_exit(sd_event_source_get_event(s), 0);
2969 break;
2970 }
abdb9b08 2971
6916b164
AU
2972 /* Reap all other children. */
2973 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2974 }
2975
2976 return 0;
2977}
2978
abdb9b08
LP
2979static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2980 pid_t pid;
2981
2982 assert(m);
2983
2984 pid = PTR_TO_PID(userdata);
2985
2986 if (arg_kill_signal > 0) {
2987 log_info("Container termination requested. Attempting to halt container.");
2988 (void) kill(pid, arg_kill_signal);
2989 } else {
2990 log_info("Container termination requested. Exiting.");
2991 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2992 }
2993
2994 return 0;
2995}
2996
ec16945e 2997static int determine_names(void) {
1b9cebf6 2998 int r;
ec16945e 2999
c1521918
LP
3000 if (arg_template && !arg_directory && arg_machine) {
3001
3002 /* If --template= was specified then we should not
3003 * search for a machine, but instead create a new one
3004 * in /var/lib/machine. */
3005
657ee2d8 3006 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
3007 if (!arg_directory)
3008 return log_oom();
3009 }
3010
ec16945e 3011 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3012 if (arg_machine) {
3013 _cleanup_(image_unrefp) Image *i = NULL;
3014
d577d4a4 3015 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3a6ce860
LP
3016 if (r == -ENOENT)
3017 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
3018 if (r < 0)
3019 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 3020
eb38edce 3021 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 3022 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 3023 else
0f03c2a4 3024 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 3025 if (r < 0)
0f3be6ca 3026 return log_oom();
1b9cebf6 3027
aee327b8
LP
3028 if (!arg_ephemeral)
3029 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
3030 } else {
3031 r = safe_getcwd(&arg_directory);
3032 if (r < 0)
3033 return log_error_errno(r, "Failed to determine current directory: %m");
3034 }
ec16945e 3035
c6147113
LP
3036 if (!arg_directory && !arg_image)
3037 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
3038 }
3039
3040 if (!arg_machine) {
b9ba4dab
LP
3041 if (arg_directory && path_equal(arg_directory, "/"))
3042 arg_machine = gethostname_malloc();
e9b88a6d
LP
3043 else if (arg_image) {
3044 char *e;
4827ab48 3045
b36e39d2
LP
3046 r = path_extract_filename(arg_image, &arg_machine);
3047 if (r < 0)
3048 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
4827ab48 3049
e9b88a6d
LP
3050 /* Truncate suffix if there is one */
3051 e = endswith(arg_machine, ".raw");
3052 if (e)
3053 *e = 0;
b36e39d2
LP
3054 } else {
3055 r = path_extract_filename(arg_directory, &arg_machine);
3056 if (r < 0)
3057 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory);
3058 }
ec16945e 3059
ae691c1d 3060 hostname_cleanup(arg_machine);
52ef5dd7 3061 if (!hostname_is_valid(arg_machine, 0))
c6147113 3062 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab 3063
3603f151
LB
3064 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3065 * to match fixed config file names. */
3066 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3067 if (!arg_settings_filename)
3068 return log_oom();
3069
e9b88a6d
LP
3070 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3071 * instances at once without manually having to specify -M each time. */
3072 if (arg_ephemeral)
3073 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
b9ba4dab 3074 return log_oom();
3603f151
LB
3075 } else {
3076 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3077 if (!arg_settings_filename)
3078 return log_oom();
ec16945e
LP
3079 }
3080
3081 return 0;
3082}
3083
/* Resolves the path stored in *p via chase() with the given 'flags', and replaces *p by the result.
 *
 * A NULL *p is left alone. Returns a negative errno-style value if resolving fails, otherwise the
 * result of free_and_replace(). */
static int chase_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0;

        r = chase(*p, NULL, flags, &resolved, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        /* Drops the old string and takes over the resolved one. */
        return free_and_replace(*p, resolved);
}
3099
03cfe0d5 3100static int determine_uid_shift(const char *directory) {
6dac160c 3101
0de7acce 3102 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 3103 arg_uid_shift = 0;
6dac160c 3104 return 0;
03cfe0d5 3105 }
6dac160c
LP
3106
3107 if (arg_uid_shift == UID_INVALID) {
3108 struct stat st;
3109
993da6d4
LP
3110 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3111
3112 if (stat(directory, &st) < 0)
03cfe0d5 3113 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
3114
3115 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3116
baaa35ad
ZJS
3117 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3118 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3119 "UID and GID base of %s don't match.", directory);
6dac160c
LP
3120
3121 arg_uid_range = UINT32_C(0x10000);
f61c7f88
LP
3122
3123 if (arg_uid_shift != 0) {
3124 /* If the image is shifted already, then we'll fall back to classic chowning, for
3125 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3126
3127 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3128 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3129 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3130 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3131 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3132 "UID base of %s is not zero, UID mapping not supported.", directory);
3133 }
6dac160c
LP
3134 }
3135
58e13de5
LP
3136 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3137 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
6dac160c 3138
6dac160c
LP
3139 return 0;
3140}
3141
de40a303
LP
3142static unsigned long effective_clone_ns_flags(void) {
3143 unsigned long flags = arg_clone_ns_flags;
3144
3145 if (arg_private_network)
3146 flags |= CLONE_NEWNET;
3147 if (arg_use_cgns)
3148 flags |= CLONE_NEWCGROUP;
3149 if (arg_userns_mode != USER_NAMESPACE_NO)
3150 flags |= CLONE_NEWUSER;
3151
3152 return flags;
3153}
3154
3155static int patch_sysctl(void) {
3156
3157 /* This table is inspired by runc's sysctl() function */
3158 static const struct {
3159 const char *key;
3160 bool prefix;
3161 unsigned long clone_flags;
3162 } safe_sysctl[] = {
3163 { "kernel.hostname", false, CLONE_NEWUTS },
3164 { "kernel.domainname", false, CLONE_NEWUTS },
3165 { "kernel.msgmax", false, CLONE_NEWIPC },
3166 { "kernel.msgmnb", false, CLONE_NEWIPC },
3167 { "kernel.msgmni", false, CLONE_NEWIPC },
3168 { "kernel.sem", false, CLONE_NEWIPC },
3169 { "kernel.shmall", false, CLONE_NEWIPC },
3170 { "kernel.shmmax", false, CLONE_NEWIPC },
3171 { "kernel.shmmni", false, CLONE_NEWIPC },
3172 { "fs.mqueue.", true, CLONE_NEWIPC },
3173 { "net.", true, CLONE_NEWNET },
3174 };
3175
3176 unsigned long flags;
de40a303
LP
3177 int r;
3178
3179 flags = effective_clone_ns_flags();
3180
3181 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3182 bool good = false;
3183 size_t i;
3184
3185 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3186
3187 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3188 continue;
3189
3190 if (safe_sysctl[i].prefix)
3191 good = startswith(*k, safe_sysctl[i].key);
3192 else
3193 good = streq(*k, safe_sysctl[i].key);
3194
3195 if (good)
3196 break;
3197 }
3198
c6147113
LP
3199 if (!good)
3200 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
3201
3202 r = sysctl_write(*k, *v);
3203 if (r < 0)
3204 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3205 }
3206
3207 return 0;
3208}
3209
03cfe0d5
LP
3210static int inner_child(
3211 Barrier *barrier,
5d9d3fcb 3212 int fd_inner_socket,
e1bb4b0d
LB
3213 FDSet *fds,
3214 char **os_release_pairs) {
69c79d3c 3215
03cfe0d5 3216 _cleanup_free_ char *home = NULL;
88614c8a 3217 size_t n_env = 1;
4ab3d29f
ZJS
3218 char *envp[] = {
3219 (char*) "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 3220 NULL, /* container */
03cfe0d5
LP
3221 NULL, /* TERM */
3222 NULL, /* HOME */
3223 NULL, /* USER */
3224 NULL, /* LOGNAME */
3225 NULL, /* container_uuid */
3226 NULL, /* LISTEN_FDS */
3227 NULL, /* LISTEN_PID */
9c1e04d0 3228 NULL, /* NOTIFY_SOCKET */
3652872a 3229 NULL, /* CREDENTIALS_DIRECTORY */
b626f695 3230 NULL, /* LANG */
03cfe0d5
LP
3231 NULL
3232 };
1a68e1e5 3233 const char *exec_target;
2371271c 3234 _cleanup_strv_free_ char **env_use = NULL;
de40a303 3235 int r, which_failed;
88213476 3236
b37469d7
LP
3237 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3238 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3239 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3240 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3241 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3242 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3243 * namespace.
3244 *
3245 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3246 * unshare(). See below. */
3247
03cfe0d5 3248 assert(barrier);
5d9d3fcb 3249 assert(fd_inner_socket >= 0);
88213476 3250
de40a303
LP
3251 log_debug("Inner child is initializing.");
3252
0de7acce 3253 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3254 /* Tell the parent, that it now can write the UID map. */
3255 (void) barrier_place(barrier); /* #1 */
7027ff61 3256
03cfe0d5 3257 /* Wait until the parent wrote the UID map */
baaa35ad 3258 if (!barrier_place_and_sync(barrier)) /* #2 */
2a2e78e9 3259 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
88213476 3260
2a2e78e9
LP
3261 /* Become the new root user inside our namespace */
3262 r = reset_uid_gid();
3263 if (r < 0)
3264 return log_error_errno(r, "Couldn't become new root: %m");
3265
3266 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3267 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3268 * propagation, but simply create new peer groups for all our mounts). */
511a8cfe 3269 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
2a2e78e9
LP
3270 if (r < 0)
3271 return r;
3272 }
6d66bd3b 3273
0de7acce 3274 r = mount_all(NULL,
4f086aab 3275 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 3276 arg_uid_shift,
0de7acce 3277 arg_selinux_apifs_context);
03cfe0d5
LP
3278 if (r < 0)
3279 return r;
3280
04413780
ZJS
3281 if (!arg_network_namespace_path && arg_private_network) {
3282 r = unshare(CLONE_NEWNET);
3283 if (r < 0)
3284 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
3285
3286 /* Tell the parent that it can setup network interfaces. */
3287 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
3288 }
3289
4f086aab 3290 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
3291 if (r < 0)
3292 return r;
3293
03cfe0d5
LP
3294 /* Wait until we are cgroup-ified, so that we
3295 * can mount the right cgroup path writable */
baaa35ad
ZJS
3296 if (!barrier_place_and_sync(barrier)) /* #4 */
3297 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3298 "Parent died too early");
88213476 3299
489fae52 3300 if (arg_use_cgns) {
0996ef00
CB
3301 r = unshare(CLONE_NEWCGROUP);
3302 if (r < 0)
04413780 3303 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
3304 r = mount_cgroups(
3305 "",
3306 arg_unified_cgroup_hierarchy,
3307 arg_userns_mode != USER_NAMESPACE_NO,
3308 arg_uid_shift,
3309 arg_uid_range,
5a8ff0e6 3310 arg_selinux_apifs_context,
ada54120 3311 true);
1433e0f2 3312 } else
0996ef00 3313 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
1433e0f2
LP
3314 if (r < 0)
3315 return r;
ec16945e 3316
1e4f1671 3317 r = setup_boot_id();
03cfe0d5
LP
3318 if (r < 0)
3319 return r;
ec16945e 3320
5d9d3fcb 3321 r = setup_kmsg(fd_inner_socket);
03cfe0d5
LP
3322 if (r < 0)
3323 return r;
ec16945e 3324
de40a303
LP
3325 r = mount_custom(
3326 "/",
3327 arg_custom_mounts,
3328 arg_n_custom_mounts,
de40a303 3329 0,
c0c8f718 3330 0,
de40a303 3331 arg_selinux_apifs_context,
5f0a6347 3332 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
de40a303
LP
3333 if (r < 0)
3334 return r;
3335
03cfe0d5
LP
3336 if (setsid() < 0)
3337 return log_error_errno(errno, "setsid() failed: %m");
3338
3339 if (arg_private_network)
df883de9 3340 (void) loopback_setup();
03cfe0d5 3341
7a8f6325 3342 if (arg_expose_ports) {
b07ee903 3343 r = expose_port_send_rtnl(fd_inner_socket);
7a8f6325
LP
3344 if (r < 0)
3345 return r;
7a8f6325 3346 }
03cfe0d5 3347
3acc84eb 3348 if (arg_console_mode != CONSOLE_PIPE) {
5bb1d7fb 3349 _cleanup_close_ int master = -EBADF;
3acc84eb
FB
3350 _cleanup_free_ char *console = NULL;
3351
3352 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3353 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3354 if (master < 0)
dc98caea 3355 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3356
3357 r = setup_dev_console(console);
3358 if (r < 0)
105a1a36 3359 return log_error_errno(r, "Failed to set up /dev/console: %m");
3acc84eb 3360
bb1aa185 3361 r = send_one_fd(fd_inner_socket, master, 0);
3acc84eb
FB
3362 if (r < 0)
3363 return log_error_errno(r, "Failed to send master fd: %m");
3acc84eb
FB
3364
3365 r = setup_stdio_as_dev_console();
3366 if (r < 0)
3367 return r;
3368 }
3369
de40a303
LP
3370 r = patch_sysctl();
3371 if (r < 0)
3372 return r;
3373
81f345df
LP
3374 if (arg_oom_score_adjust_set) {
3375 r = set_oom_score_adjust(arg_oom_score_adjust);
3376 if (r < 0)
3377 return log_error_errno(r, "Failed to adjust OOM score: %m");
3378 }
3379
0985c7c4
ZJS
3380 if (arg_cpu_set.set)
3381 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3382 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3383
c818eef1 3384 (void) setup_hostname();
03cfe0d5 3385
050f7277 3386 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3387 r = safe_personality(arg_personality);
3388 if (r < 0)
3389 return log_error_errno(r, "personality() failed: %m");
4c27749b
LP
3390#ifdef ARCHITECTURE_SECONDARY
3391 } else if (arg_architecture == ARCHITECTURE_SECONDARY) {
21022b9d
LP
3392 r = safe_personality(PER_LINUX32);
3393 if (r < 0)
3394 return log_error_errno(r, "personality() failed: %m");
4c27749b 3395#endif
af262e5f
LB
3396 } else if (!arg_quiet && arg_architecture >= 0 && arg_architecture != native_architecture())
3397 log_notice("Selected architecture '%s' not supported natively on the local CPU, assuming "
3398 "invocation with qemu userspace emulator (or equivalent) in effect.",
3399 architecture_to_string(arg_architecture));
03cfe0d5 3400
de40a303
LP
3401 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3402 if (r < 0)
3403 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3404
3405#if HAVE_SECCOMP
3406 if (arg_seccomp) {
3407
3408 if (is_seccomp_available()) {
de40a303 3409 r = seccomp_load(arg_seccomp);
3c098014
ZJS
3410 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
3411 return log_error_errno(r, "Failed to install seccomp filter: %m");
3412 if (r < 0)
de40a303
LP
3413 log_debug_errno(r, "Failed to install seccomp filter: %m");
3414 }
3415 } else
3416#endif
3417 {
6b000af4 3418 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
de40a303
LP
3419 if (r < 0)
3420 return r;
3421 }
3422
4a4654e0 3423 if (arg_suppress_sync) {
20e458ae 3424#if HAVE_SECCOMP
4a4654e0
LP
3425 r = seccomp_suppress_sync();
3426 if (r < 0)
3427 log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
20e458ae 3428#else
2db32618 3429 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
20e458ae 3430#endif
4a4654e0
LP
3431 }
3432
349cc4a5 3433#if HAVE_SELINUX
03cfe0d5 3434 if (arg_selinux_context)
2ed96880 3435 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3436 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3437#endif
3438
de40a303
LP
3439 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3440 * if we need to later on. */
3441 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3442 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3443
3444 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3462d773 3445 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
de40a303 3446 else
3462d773 3447 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
03cfe0d5
LP
3448 if (r < 0)
3449 return r;
3450
de40a303
LP
3451 r = drop_capabilities(getuid());
3452 if (r < 0)
3453 return log_error_errno(r, "Dropping capabilities failed: %m");
3454
66edd963
LP
3455 if (arg_no_new_privileges)
3456 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3457 return log_error_errno(errno, "Failed to disable new privileges: %m");
3458
6aadfa4c
ILG
3459 /* LXC sets container=lxc, so follow the scheme here */
3460 envp[n_env++] = strjoina("container=", arg_container_service_name);
3461
03cfe0d5
LP
3462 envp[n_env] = strv_find_prefix(environ, "TERM=");
3463 if (envp[n_env])
313cefa1 3464 n_env++;
03cfe0d5 3465
de40a303 3466 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
4ab3d29f 3467 if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
de40a303
LP
3468 return log_oom();
3469
3470 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
4ab3d29f 3471 if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
1da3cb81 3472 asprintf(envp + n_env++, "LOGNAME=%s", arg_user ?: "root") < 0)
de40a303 3473 return log_oom();
03cfe0d5 3474
3bbaff3e 3475 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3476
b7416360 3477 if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
e01ff70a 3478 return log_oom();
03cfe0d5
LP
3479
3480 if (fdset_size(fds) > 0) {
3481 r = fdset_cloexec(fds, false);
3482 if (r < 0)
3483 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3484
4ab3d29f
ZJS
3485 if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3486 (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
03cfe0d5
LP
3487 return log_oom();
3488 }
4ab3d29f 3489 if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
9c1e04d0 3490 return log_oom();
03cfe0d5 3491
3652872a
LP
3492 if (arg_n_credentials > 0) {
3493 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3494 if (!envp[n_env])
3495 return log_oom();
3496 n_env++;
3497 }
3498
b626f695 3499 if (arg_start_mode != START_BOOT) {
a22f5186 3500 envp[n_env] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE);
b626f695
DDM
3501 if (!envp[n_env])
3502 return log_oom();
3503 n_env++;
3504 }
3505
4ab3d29f 3506 env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
2371271c
TG
3507 if (!env_use)
3508 return log_oom();
03cfe0d5
LP
3509
3510 /* Let the parent know that we are ready and
3511 * wait until the parent is ready with the
3512 * setup, too... */
baaa35ad 3513 if (!barrier_place_and_sync(barrier)) /* #5 */
335d2ead 3514 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
03cfe0d5 3515
5f932eb9
LP
3516 if (arg_chdir)
3517 if (chdir(arg_chdir) < 0)
3518 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3519
7732f92b 3520 if (arg_start_mode == START_PID2) {
75bf701f 3521 r = stub_pid1(arg_uuid);
7732f92b
LP
3522 if (r < 0)
3523 return r;
3524 }
3525
335d2ead
LP
3526 if (arg_console_mode != CONSOLE_PIPE) {
3527 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3528 * are configured for that. Acquire it as controlling tty. */
3529 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3530 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3531 }
3532
de40a303
LP
3533 log_debug("Inner child completed, invoking payload.");
3534
8ca082b4
LP
3535 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3536 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3537 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3538 log_close();
8ca082b4 3539 log_set_open_when_needed(true);
a3b00f91 3540 log_settle_target();
8ca082b4 3541
03cfe0d5
LP
3542 (void) fdset_close_others(fds);
3543
7732f92b 3544 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3545 char **a;
3546 size_t m;
3547
3548 /* Automatically search for the init system */
3549
75f32f04
ZJS
3550 m = strv_length(arg_parameters);
3551 a = newa(char*, m + 2);
3552 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3553 a[1 + m] = NULL;
03cfe0d5 3554
a5096641
LP
3555 FOREACH_STRING(init,
3556 "/usr/lib/systemd/systemd",
3557 "/lib/systemd/systemd",
3558 "/sbin/init") {
3559 a[0] = (char*) init;
3560 execve(a[0], a, env_use);
3561 }
ced58da7
LP
3562
3563 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3564 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3565 const char *dollar_path;
3566
1a68e1e5 3567 exec_target = arg_parameters[0];
b6b180b7
LP
3568
3569 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3570 * binary. */
3571 dollar_path = strv_env_get(env_use, "PATH");
3572 if (dollar_path) {
6f646e01 3573 if (setenv("PATH", dollar_path, 1) < 0)
b6b180b7
LP
3574 return log_error_errno(errno, "Failed to update $PATH: %m");
3575 }
3576
f757855e 3577 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3578 } else {
5f932eb9 3579 if (!arg_chdir)
d929b0f9
ZJS
3580 /* If we cannot change the directory, we'll end up in /, that is expected. */
3581 (void) chdir(home ?: "/root");
5f932eb9 3582
53350c7b 3583 execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
3584 if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
3585 execle("/bin/bash", "-bash", NULL, env_use);
3586 if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
3587 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7 3588
53350c7b 3589 exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
03cfe0d5
LP
3590 }
3591
8ca082b4 3592 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3593}
3594
e96ceaba 3595static int setup_notify_child(void) {
254d1313 3596 _cleanup_close_ int fd = -EBADF;
1eb874b9 3597 static const union sockaddr_union sa = {
44ed5214
LP
3598 .un.sun_family = AF_UNIX,
3599 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3600 };
3601 int r;
3602
3603 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3604 if (fd < 0)
3605 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3606
3607 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3608 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3609
9c1e04d0 3610 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3611 if (r < 0)
44ed5214 3612 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3613
adc7d9f0 3614 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3615 if (r < 0)
adc7d9f0 3616 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3617
2ff48e98 3618 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3619 if (r < 0)
2ff48e98 3620 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3621
271f518f 3622 return TAKE_FD(fd);
9c1e04d0
AP
3623}
3624
03cfe0d5
LP
3625static int outer_child(
3626 Barrier *barrier,
3627 const char *directory,
2d845785 3628 DissectedImage *dissected_image,
af06cd30 3629 int fd_outer_socket,
5d9d3fcb 3630 int fd_inner_socket,
d7bea6b6
DP
3631 FDSet *fds,
3632 int netns_fd) {
03cfe0d5 3633
2f893044 3634 _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
e1bb4b0d 3635 _cleanup_strv_free_ char **os_release_pairs = NULL;
254d1313 3636 _cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
f61c7f88 3637 bool idmap = false;
e5f10caf 3638 const char *p;
03cfe0d5
LP
3639 pid_t pid;
3640 ssize_t l;
de40a303 3641 int r;
03cfe0d5 3642
d1d0b895
LP
3643 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It
3644 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3645 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3646 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3647 * forked off it, and it exits. */
b37469d7 3648
03cfe0d5
LP
3649 assert(barrier);
3650 assert(directory);
af06cd30 3651 assert(fd_outer_socket >= 0);
5d9d3fcb 3652 assert(fd_inner_socket >= 0);
03cfe0d5 3653
de40a303
LP
3654 log_debug("Outer child is initializing.");
3655
e1bb4b0d
LB
3656 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3657 if (r < 0)
3658 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3659
03cfe0d5
LP
3660 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3661 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3662
03cfe0d5
LP
3663 r = reset_audit_loginuid();
3664 if (r < 0)
3665 return r;
3666
2a2e78e9
LP
3667 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3668 * mounts to the real root. */
511a8cfe 3669 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
60e76d48
ZJS
3670 if (r < 0)
3671 return r;
03cfe0d5 3672
2d845785 3673 if (dissected_image) {
d1d0b895
LP
3674 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3675 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3676 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3677 * right place right away. This makes sure ESP partitions and userns are compatible. */
2d3a5a73 3678
af187ab2 3679 r = dissected_image_mount_and_warn(
d04faa4e
LP
3680 dissected_image,
3681 directory,
3682 arg_uid_shift,
21b61b1d 3683 arg_uid_range,
d04faa4e
LP
3684 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3685 DISSECT_IMAGE_DISCARD_ON_LOOP|
3686 DISSECT_IMAGE_USR_NO_ROOT|
c65f854a 3687 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
af187ab2 3688 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785 3689 if (r < 0)
af187ab2 3690 return r;
2d845785 3691 }
03cfe0d5 3692
391567f4
LP
3693 r = determine_uid_shift(directory);
3694 if (r < 0)
3695 return r;
3696
0de7acce 3697 if (arg_userns_mode != USER_NAMESPACE_NO) {
b71a0192
CB
3698 r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL);
3699 if (r < 0)
3700 return log_error_errno(r, "Failed to pin outer mount namespace: %m");
3701
af06cd30 3702 l = send_one_fd(fd_outer_socket, mntns_fd, 0);
b71a0192
CB
3703 if (l < 0)
3704 return log_error_errno(l, "Failed to send outer mount namespace fd: %m");
3705 mntns_fd = safe_close(mntns_fd);
3706
0e7ac751 3707 /* Let the parent know which UID shift we read from the image */
af06cd30 3708 l = send(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
825d5287
RM
3709 if (l < 0)
3710 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3711 if (l != sizeof(arg_uid_shift))
3712 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3713 "Short write while sending UID shift.");
0e7ac751 3714
0de7acce 3715 if (arg_userns_mode == USER_NAMESPACE_PICK) {
d1d0b895
LP
3716 /* When we are supposed to pick the UID shift, the parent will check now whether the
3717 * UID shift we just read from the image is available. If yes, it will send the UID
3718 * shift back to us, if not it will pick a different one, and send it back to us. */
0e7ac751 3719
af06cd30 3720 l = recv(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
0e7ac751
LP
3721 if (l < 0)
3722 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3723 if (l != sizeof(arg_uid_shift))
3724 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3725 "Short read while receiving UID shift.");
0e7ac751
LP
3726 }
3727
ff6c6cc1
LP
3728 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3729 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3730 }
3731
6f83d3d1
LP
3732 if (path_equal(directory, "/")) {
3733 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3734 * place, so that we can make changes to its mount structure (for example, to implement
3735 * --volatile=) without this interfering with our ability to access files such as
3736 * /etc/localtime to copy into the container. Note that we use a fixed place for this
6c2d70ce 3737 * (instead of a temporary directory, since we are living in our own mount namespace here
7802194a 3738 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
6f83d3d1
LP
3739 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3740
511a8cfe 3741 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
6f83d3d1
LP
3742 if (r < 0)
3743 return r;
3744
3745 directory = "/run/systemd/nspawn-root";
e50cd82f 3746 }
7d0ecdd6 3747
75f81732
LP
3748 /* Make sure we always have a mount that we can move to root later on. */
3749 r = make_mount_point(directory);
3750 if (r < 0)
3751 return r;
3752
3753 /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
3754 * mount namespace. For the directory we are going to run our container let's turn this off, so that
3755 * we'll live in our own little world from now on, and propagation from the host may only happen via
3756 * the mount tunnel dir, or not at all. */
3757 r = mount_follow_verbose(LOG_ERR, NULL, directory, NULL, MS_PRIVATE|MS_REC, NULL);
3758 if (r < 0)
3759 return r;
3760
7d0ecdd6
LP
3761 r = setup_pivot_root(
3762 directory,
3763 arg_pivot_root_new,
3764 arg_pivot_root_old);
3765 if (r < 0)
3766 return r;
3767
3768 r = setup_volatile_mode(
3769 directory,
3770 arg_volatile_mode,
7d0ecdd6 3771 arg_uid_shift,
8f1ed04a 3772 arg_selinux_apifs_context);
7d0ecdd6
LP
3773 if (r < 0)
3774 return r;
3775
2f893044
LP
3776 r = bind_user_prepare(
3777 directory,
3778 arg_bind_user,
3779 arg_uid_shift,
3780 arg_uid_range,
3781 &arg_custom_mounts, &arg_n_custom_mounts,
3782 &bind_user_context);
3783 if (r < 0)
3784 return r;
3785
3786 if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
d1d0b895
LP
3787 /* Send the user maps we determined to the parent, so that it installs it in our user
3788 * namespace UID map table */
2f893044
LP
3789
3790 for (size_t i = 0; i < bind_user_context->n_data; i++) {
3791 uid_t map[] = {
3792 bind_user_context->data[i].payload_user->uid,
3793 bind_user_context->data[i].host_user->uid,
3794 (uid_t) bind_user_context->data[i].payload_group->gid,
3795 (uid_t) bind_user_context->data[i].host_group->gid,
3796 };
3797
af06cd30 3798 l = send(fd_outer_socket, map, sizeof(map), MSG_NOSIGNAL);
2f893044
LP
3799 if (l < 0)
3800 return log_error_errno(errno, "Failed to send user UID map: %m");
3801 if (l != sizeof(map))
3802 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3803 "Short write while sending user UID map.");
3804 }
3805 }
3806
5f0a6347
DDM
3807 r = mount_custom(
3808 directory,
3809 arg_custom_mounts,
3810 arg_n_custom_mounts,
5f0a6347 3811 arg_uid_shift,
c0c8f718 3812 arg_uid_range,
5f0a6347
DDM
3813 arg_selinux_apifs_context,
3814 MOUNT_ROOT_ONLY);
3815 if (r < 0)
3816 return r;
3817
c0c8f718
AV
3818 if (arg_userns_mode != USER_NAMESPACE_NO &&
3819 IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3820 arg_uid_shift != 0) {
3821
2b2777ed 3822 r = remount_idmap(directory, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
bb44fd07
ZJS
3823 if (r == -EINVAL || ERRNO_IS_NEG_NOT_SUPPORTED(r)) {
3824 /* This might fail because the kernel or file system doesn't support idmapping. We
3825 * can't really distinguish this nicely, nor do we have any guarantees about the
3826 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3827 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3828 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3829 "ID mapped mounts are apparently not available, sorry.");
3830
3831 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3832 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3833 } else if (r < 0)
3834 return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3835 else {
c0c8f718
AV
3836 log_debug("ID mapped mounts available, making use of them.");
3837 idmap = true;
3838 }
3839 }
3840
2d3a5a73
LP
3841 if (dissected_image) {
3842 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
d04faa4e
LP
3843 r = dissected_image_mount(
3844 dissected_image,
3845 directory,
3846 arg_uid_shift,
21b61b1d 3847 arg_uid_range,
d04faa4e
LP
3848 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3849 DISSECT_IMAGE_DISCARD_ON_LOOP|
3850 DISSECT_IMAGE_USR_NO_ROOT|
f61c7f88
LP
3851 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3852 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
4fcb96ce
LP
3853 if (r == -EUCLEAN)
3854 return log_error_errno(r, "File system check for image failed: %m");
2d3a5a73 3855 if (r < 0)
4fcb96ce 3856 return log_error_errno(r, "Failed to mount image file system: %m");
2d3a5a73
LP
3857 }
3858
8199d554
LP
3859 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3860 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3861
3862 r = detect_unified_cgroup_hierarchy_from_image(directory);
3863 if (r < 0)
3864 return r;
3865
fefb7a6d 3866 l = send(fd_outer_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
8199d554
LP
3867 if (l < 0)
3868 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3869 if (l != sizeof(arg_unified_cgroup_hierarchy))
3870 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3871 "Short write while sending cgroup mode.");
8199d554
LP
3872 }
3873
4ad14eff
LP
3874 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3875 if (r < 0)
3876 return r;
3877
03cfe0d5
LP
3878 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3879 if (r < 0)
3880 return r;
3881
bbd407ea
DDM
3882 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3883 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
64e82c19 3884 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3885 if (r < 0)
3886 return log_error_errno(r, "Failed to make tree read-only: %m");
3887 }
3888
0de7acce 3889 r = mount_all(directory,
4f086aab 3890 arg_mount_settings,
0de7acce 3891 arg_uid_shift,
0de7acce 3892 arg_selinux_apifs_context);
03cfe0d5
LP
3893 if (r < 0)
3894 return r;
3895
07fa00f9
LP
3896 r = copy_devnodes(directory);
3897 if (r < 0)
03cfe0d5
LP
3898 return r;
3899
de40a303
LP
3900 r = make_extra_nodes(directory);
3901 if (r < 0)
3902 return r;
3903
3904 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
e5f10caf 3905
9fac5029 3906 p = prefix_roota(directory, "/run/host");
e5f10caf 3907 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
03cfe0d5 3908
07fa00f9
LP
3909 r = setup_pts(directory);
3910 if (r < 0)
03cfe0d5
LP
3911 return r;
3912
e79581dd 3913 r = mount_tunnel_dig(directory);
03cfe0d5
LP
3914 if (r < 0)
3915 return r;
3916
8e5430c4
LP
3917 r = setup_keyring();
3918 if (r < 0)
3919 return r;
3920
3652872a
LP
3921 r = setup_credentials(directory);
3922 if (r < 0)
3923 return r;
3924
2f893044
LP
3925 r = bind_user_setup(bind_user_context, directory);
3926 if (r < 0)
3927 return r;
3928
5c4deb9a
MJ
3929 r = mount_custom(
3930 directory,
3931 arg_custom_mounts,
3932 arg_n_custom_mounts,
3933 arg_uid_shift,
c0c8f718 3934 arg_uid_range,
5c4deb9a
MJ
3935 arg_selinux_apifs_context,
3936 MOUNT_NON_ROOT_ONLY);
3937 if (r < 0)
3938 return r;
3939
03cfe0d5
LP
3940 r = setup_timezone(directory);
3941 if (r < 0)
3942 return r;
3943
3944 r = setup_resolv_conf(directory);
3945 if (r < 0)
3946 return r;
3947
e01ff70a
MS
3948 r = setup_machine_id(directory);
3949 if (r < 0)
3950 return r;
3951
03cfe0d5
LP
3952 r = setup_journal(directory);
3953 if (r < 0)
3954 return r;
3955
0f48ba7b
LP
3956 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3957 p = prefix_roota(directory, "/run/host/container-manager");
3958 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3959
3960 /* The same stuff as the $container_uuid env var */
3961 p = prefix_roota(directory, "/run/host/container-uuid");
3962 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3963
489fae52 3964 if (!arg_use_cgns) {
0996ef00
CB
3965 r = mount_cgroups(
3966 directory,
3967 arg_unified_cgroup_hierarchy,
3968 arg_userns_mode != USER_NAMESPACE_NO,
3969 arg_uid_shift,
3970 arg_uid_range,
5a8ff0e6 3971 arg_selinux_apifs_context,
ada54120 3972 false);
0996ef00
CB
3973 if (r < 0)
3974 return r;
3975 }
03cfe0d5 3976
57c10a56
CB
3977 /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
3978 * mounts available in systemd services inside the container that create a new mount namespace. See
3979 * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
3980 * will inherit the shared propagation mode.
3981 *
3982 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
3983 * directory mount to root later on.
3984 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3985 */
9d50f850 3986 r = mount_switch_root(directory, MS_SHARED);
03cfe0d5
LP
3987 if (r < 0)
3988 return log_error_errno(r, "Failed to move root directory: %m");
3989
e79581dd
CB
3990 /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
3991 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
3992 * the container. */
3993 r = mount_tunnel_open();
3994 if (r < 0)
3995 return r;
3996
b71a0192
CB
3997 if (arg_userns_mode != USER_NAMESPACE_NO) {
3998 /* In order to mount procfs and sysfs in an unprivileged container the kernel
3999 * requires that a fully visible instance is already present in the target mount
4000 * namespace. Mount one here so the inner child can mount its own instances. Later
4001 * we umount the temporary instances created here before we actually exec the
4002 * payload. Since the rootfs is shared the umount will propagate into the container.
4003 * Note, the inner child wouldn't be able to unmount the instances on its own since
4004 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
4005 * this. */
4006 r = pin_fully_visible_fs();
4007 if (r < 0)
4008 return r;
4009 }
4010
e96ceaba 4011 fd = setup_notify_child();
9c1e04d0
AP
4012 if (fd < 0)
4013 return fd;
4014
03cfe0d5 4015 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 4016 arg_clone_ns_flags |
8869a0b4 4017 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
4018 if (pid < 0)
4019 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5 4020 if (pid == 0) {
af06cd30 4021 fd_outer_socket = safe_close(fd_outer_socket);
03cfe0d5 4022
2a2e78e9
LP
4023 /* The inner child has all namespaces that are requested, so that we all are owned by the
4024 * user if user namespaces are turned on. */
03cfe0d5 4025
d7bea6b6
DP
4026 if (arg_network_namespace_path) {
4027 r = namespace_enter(-1, -1, netns_fd, -1, -1);
4028 if (r < 0)
e2d39e54 4029 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
4030 }
4031
11875a98 4032 r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs);
03cfe0d5
LP
4033 if (r < 0)
4034 _exit(EXIT_FAILURE);
4035
4036 _exit(EXIT_SUCCESS);
4037 }
4038
af06cd30 4039 l = send(fd_outer_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
03cfe0d5
LP
4040 if (l < 0)
4041 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
4042 if (l != sizeof(pid))
4043 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4044 "Short write while sending PID.");
03cfe0d5 4045
af06cd30 4046 l = send(fd_outer_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
e01ff70a
MS
4047 if (l < 0)
4048 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
4049 if (l != sizeof(arg_uuid))
4050 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4051 "Short write while sending machine ID.");
e01ff70a 4052
af06cd30 4053 l = send_one_fd(fd_outer_socket, fd, 0);
9c1e04d0 4054 if (l < 0)
ba72801d 4055 return log_error_errno(l, "Failed to send notify fd: %m");
9c1e04d0 4056
af06cd30 4057 fd_outer_socket = safe_close(fd_outer_socket);
5d9d3fcb 4058 fd_inner_socket = safe_close(fd_inner_socket);
d7bea6b6 4059 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
4060
4061 return 0;
4062}
4063
0e7ac751 4064static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 4065 bool tried_hashed = false;
0e7ac751
LP
4066 unsigned n_tries = 100;
4067 uid_t candidate;
4068 int r;
4069
4070 assert(shift);
4071 assert(ret_lock_file);
0de7acce 4072 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
4073 assert(arg_uid_range == 0x10000U);
4074
4075 candidate = *shift;
4076
4077 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4078
4079 for (;;) {
fbd0b64f 4080 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 4081 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
4082
4083 if (--n_tries <= 0)
4084 return -EBUSY;
4085
87d5e4f2 4086 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
4087 goto next;
4088 if ((candidate & UINT32_C(0xFFFF)) != 0)
4089 goto next;
4090
4091 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4092 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4093 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4094 goto next;
4095 if (r < 0)
4096 return r;
4097
4098 /* Make some superficial checks whether the range is currently known in the user database */
4099 if (getpwuid(candidate))
4100 goto next;
4101 if (getpwuid(candidate + UINT32_C(0xFFFE)))
4102 goto next;
4103 if (getgrgid(candidate))
4104 goto next;
4105 if (getgrgid(candidate + UINT32_C(0xFFFE)))
4106 goto next;
4107
4108 *ret_lock_file = lf;
4109 lf = (struct LockFile) LOCK_FILE_INIT;
4110 *shift = candidate;
4111 return 0;
4112
4113 next:
d381c8a6
LP
4114 if (arg_machine && !tried_hashed) {
4115 /* Try to hash the base from the container name */
4116
4117 static const uint8_t hash_key[] = {
4118 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4119 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4120 };
4121
4122 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4123
4124 tried_hashed = true;
4125 } else
4126 random_bytes(&candidate, sizeof(candidate));
4127
87d5e4f2 4128 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
4129 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4130 }
4131}
4132
2f893044
LP
4133static int add_one_uid_map(
4134 char **p,
4135 uid_t container_uid,
4136 uid_t host_uid,
4137 uid_t range) {
4138
4139 return strextendf(p,
4140 UID_FMT " " UID_FMT " " UID_FMT "\n",
4141 container_uid, host_uid, range);
4142}
4143
4144static int make_uid_map_string(
4145 const uid_t bind_user_uid[],
4146 size_t n_bind_user_uid,
4147 size_t offset,
4148 char **ret) {
4149
4150 _cleanup_free_ char *s = NULL;
4151 uid_t previous_uid = 0;
4152 int r;
4153
4154 assert(n_bind_user_uid == 0 || bind_user_uid);
2f092762 4155 assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
2f893044
LP
4156 assert(ret);
4157
4158 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4159 * quadruplet, consisting of host and container UID + GID. */
4160
4161 for (size_t i = 0; i < n_bind_user_uid; i++) {
05ab439a
YW
4162 uid_t payload_uid = bind_user_uid[i*4+offset],
4163 host_uid = bind_user_uid[i*4+offset+1];
2f893044
LP
4164
4165 assert(previous_uid <= payload_uid);
4166 assert(payload_uid < arg_uid_range);
4167
4168 /* Add a range to close the gap to previous entry */
4169 if (payload_uid > previous_uid) {
4170 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4171 if (r < 0)
4172 return r;
4173 }
4174
4175 /* Map this specific user */
4176 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4177 if (r < 0)
4178 return r;
4179
4180 previous_uid = payload_uid + 1;
4181 }
4182
4183 /* And add a range to close the gap to finish the range */
4184 if (arg_uid_range > previous_uid) {
4185 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4186 if (r < 0)
4187 return r;
4188 }
4189
4190 assert(s);
4191
4192 *ret = TAKE_PTR(s);
4193 return 0;
4194}
4195
4196static int setup_uid_map(
4197 pid_t pid,
4198 const uid_t bind_user_uid[],
4199 size_t n_bind_user_uid) {
4200
4201 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4202 _cleanup_free_ char *s = NULL;
03cfe0d5
LP
4203 int r;
4204
4205 assert(pid > 1);
4206
2f893044
LP
4207 /* Build the UID map string */
4208 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4209 return log_oom();
4210
03cfe0d5 4211 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2f893044 4212 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4213 if (r < 0)
4214 return log_error_errno(r, "Failed to write UID map: %m");
4215
2f893044
LP
4216 /* And now build the GID map string */
4217 s = mfree(s);
4218 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4219 return log_oom();
4220
03cfe0d5 4221 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2f893044 4222 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4223 if (r < 0)
4224 return log_error_errno(r, "Failed to write GID map: %m");
4225
4226 return 0;
4227}
4228
9c1e04d0 4229static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
4230 char buf[NOTIFY_BUFFER_MAX+1];
4231 char *p = NULL;
4232 struct iovec iovec = {
4233 .iov_base = buf,
4234 .iov_len = sizeof(buf)-1,
4235 };
fb29cdbe
LP
4236 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4237 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
9c1e04d0
AP
4238 struct msghdr msghdr = {
4239 .msg_iov = &iovec,
4240 .msg_iovlen = 1,
4241 .msg_control = &control,
4242 .msg_controllen = sizeof(control),
4243 };
371d72e0 4244 struct ucred *ucred;
9c1e04d0
AP
4245 ssize_t n;
4246 pid_t inner_child_pid;
4247 _cleanup_strv_free_ char **tags = NULL;
4bf4f50f 4248 int r;
9c1e04d0
AP
4249
4250 assert(userdata);
4251
4252 inner_child_pid = PTR_TO_PID(userdata);
4253
4254 if (revents != EPOLLIN) {
4255 log_warning("Got unexpected poll event for notify fd.");
4256 return 0;
4257 }
4258
3691bcf3 4259 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
bb44fd07
ZJS
4260 if (ERRNO_IS_NEG_TRANSIENT(n))
4261 return 0;
4262 else if (n == -EXFULL) {
4263 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4264 return 0;
4265 } else if (n < 0)
3691bcf3 4266 return log_warning_errno(n, "Couldn't read notification socket: %m");
9c1e04d0 4267
9c1e04d0
AP
4268 cmsg_close_all(&msghdr);
4269
371d72e0 4270 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
9c1e04d0 4271 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 4272 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
4273 return 0;
4274 }
4275
4276 if ((size_t) n >= sizeof(buf)) {
4277 log_warning("Received notify message exceeded maximum size. Ignoring.");
4278 return 0;
4279 }
4280
4281 buf[n] = 0;
4282 tags = strv_split(buf, "\n\r");
4283 if (!tags)
4284 return log_oom();
4285
d29cc4d6 4286 if (strv_contains(tags, "READY=1")) {
d4341b76 4287 r = sd_notify(false, "READY=1\n");
4bf4f50f
ZJS
4288 if (r < 0)
4289 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
4290 }
9c1e04d0
AP
4291
4292 p = strv_find_startswith(tags, "STATUS=");
4293 if (p)
04f590a4 4294 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
4295
4296 return 0;
4297}
4298
e96ceaba 4299static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 4300 int r;
9c1e04d0 4301
5773024d 4302 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
4303 if (r < 0)
4304 return log_error_errno(r, "Failed to allocate notify event source: %m");
4305
5773024d 4306 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
4307
4308 return 0;
4309}
4310
5d961407
LP
4311static int merge_settings(Settings *settings, const char *path) {
4312 int rl;
f757855e 4313
5d961407
LP
4314 assert(settings);
4315 assert(path);
f757855e 4316
5d961407
LP
4317 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4318 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 4319
7732f92b
LP
4320 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4321 settings->start_mode >= 0) {
4322 arg_start_mode = settings->start_mode;
130d3d22 4323 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
4324 }
4325
d3689b94
LP
4326 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
4327 settings->ephemeral >= 0)
a2f577fc
JL
4328 arg_ephemeral = settings->ephemeral;
4329
de40a303
LP
4330 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4331 settings->root) {
4332
4333 if (!arg_settings_trusted)
4334 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4335 else
4336 free_and_replace(arg_directory, settings->root);
4337 }
4338
b53ede69
PW
4339 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4340 settings->pivot_root_new) {
4341 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4342 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4343 }
4344
5f932eb9 4345 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
4346 settings->working_directory)
4347 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 4348
f757855e 4349 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
4350 settings->environment)
4351 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 4352
de40a303
LP
4353 if ((arg_settings_mask & SETTING_USER) == 0) {
4354
4355 if (settings->user)
4356 free_and_replace(arg_user, settings->user);
4357
4358 if (uid_is_valid(settings->uid))
4359 arg_uid = settings->uid;
4360 if (gid_is_valid(settings->gid))
4361 arg_gid = settings->gid;
4362 if (settings->n_supplementary_gids > 0) {
4363 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4364 arg_n_supplementary_gids = settings->n_supplementary_gids;
4365 }
4366 }
f757855e
LP
4367
4368 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 4369 uint64_t plus, minus;
7be830c6 4370 uint64_t network_minus = 0;
88fc9c9b 4371 uint64_t ambient;
f757855e 4372
de40a303
LP
4373 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4374 * Settings structure */
4375
0e265674 4376 plus = settings->capability;
a3fc6b55
LP
4377 minus = settings->drop_capability;
4378
9baa294c
LP
4379 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4380 settings_network_configured(settings)) {
a3fc6b55
LP
4381 if (settings_private_network(settings))
4382 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4383 else
7be830c6 4384 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 4385 }
0e265674
LP
4386
4387 if (!arg_settings_trusted && plus != 0) {
4388 if (settings->capability != 0)
5d961407 4389 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
4390 } else {
4391 arg_caps_retain &= ~network_minus;
520e0d54 4392 arg_caps_retain |= plus;
7be830c6 4393 }
f757855e 4394
a3fc6b55 4395 arg_caps_retain &= ~minus;
de40a303
LP
4396
4397 /* Copy the full capabilities over too */
4398 if (capability_quintet_is_set(&settings->full_capabilities)) {
4399 if (!arg_settings_trusted)
5238e957 4400 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
4401 else
4402 arg_full_capabilities = settings->full_capabilities;
4403 }
88fc9c9b
TH
4404
4405 ambient = settings->ambient_capability;
4406 if (!arg_settings_trusted && ambient != 0)
4407 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4408 else
4409 arg_caps_ambient |= ambient;
f757855e
LP
4410 }
4411
4412 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4413 settings->kill_signal > 0)
4414 arg_kill_signal = settings->kill_signal;
4415
4416 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4417 settings->personality != PERSONALITY_INVALID)
4418 arg_personality = settings->personality;
4419
4420 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4421 !sd_id128_is_null(settings->machine_id)) {
4422
4423 if (!arg_settings_trusted)
5d961407 4424 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
4425 else
4426 arg_uuid = settings->machine_id;
4427 }
4428
4429 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4430 settings->read_only >= 0)
4431 arg_read_only = settings->read_only;
4432
4433 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4434 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4435 arg_volatile_mode = settings->volatile_mode;
4436
4437 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4438 settings->n_custom_mounts > 0) {
4439
4440 if (!arg_settings_trusted)
5d961407 4441 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
4442 else {
4443 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 4444 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 4445 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
4446 settings->n_custom_mounts = 0;
4447 }
4448 }
4449
4450 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
a1dfd585 4451 settings_network_configured(settings)) {
f757855e
LP
4452
4453 if (!arg_settings_trusted)
5d961407 4454 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 4455 else {
f6d6bad1 4456 arg_network_veth = settings_network_veth(settings);
0e265674
LP
4457 arg_private_network = settings_private_network(settings);
4458
130d3d22
YW
4459 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4460 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4461 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4462 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 4463
1cc6c93a
YW
4464 free_and_replace(arg_network_bridge, settings->network_bridge);
4465 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
4466
4467 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
4468 }
4469 }
4470
4471 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4472 settings->expose_ports) {
4473
4474 if (!arg_settings_trusted)
5d961407 4475 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
4476 else {
4477 expose_port_free_all(arg_expose_ports);
1cc6c93a 4478 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
4479 }
4480 }
4481
0de7acce
LP
4482 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4483 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4484
4485 if (!arg_settings_trusted)
5d961407 4486 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
4487 else {
4488 arg_userns_mode = settings->userns_mode;
4489 arg_uid_shift = settings->uid_shift;
4490 arg_uid_range = settings->uid_range;
6c045a99 4491 arg_userns_ownership = settings->userns_ownership;
0de7acce
LP
4492 }
4493 }
4494
0cc3c9f9
LP
4495 if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
4496 !strv_isempty(settings->bind_user))
2f893044
LP
4497 strv_free_and_replace(arg_bind_user, settings->bind_user);
4498
d3689b94
LP
4499 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
4500 settings->notify_ready >= 0)
9c1e04d0
AP
4501 arg_notify_ready = settings->notify_ready;
4502
960e4569
LP
4503 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4504
2d09ea44
LP
4505 if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
4506 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4507 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4508 else {
4509 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4510 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4511 }
960e4569 4512 }
de40a303
LP
4513
4514#if HAVE_SECCOMP
2d09ea44
LP
4515 if (settings->seccomp) {
4516 if (!arg_settings_trusted)
4517 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4518 else {
4519 seccomp_release(arg_seccomp);
4520 arg_seccomp = TAKE_PTR(settings->seccomp);
4521 }
de40a303
LP
4522 }
4523#endif
960e4569
LP
4524 }
4525
bf428efb
LP
4526 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4527 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4528 continue;
4529
4530 if (!settings->rlimit[rl])
4531 continue;
4532
4533 if (!arg_settings_trusted) {
5d961407 4534 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
4535 continue;
4536 }
4537
4538 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4539 }
4540
3a9530e5
LP
4541 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4542 settings->hostname)
4543 free_and_replace(arg_hostname, settings->hostname);
4544
66edd963
LP
4545 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4546 settings->no_new_privileges >= 0)
4547 arg_no_new_privileges = settings->no_new_privileges;
4548
81f345df
LP
4549 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4550 settings->oom_score_adjust_set) {
4551
4552 if (!arg_settings_trusted)
5d961407 4553 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
4554 else {
4555 arg_oom_score_adjust = settings->oom_score_adjust;
4556 arg_oom_score_adjust_set = true;
4557 }
4558 }
4559
d107bb7d 4560 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 4561 settings->cpu_set.set) {
d107bb7d
LP
4562
4563 if (!arg_settings_trusted)
5d961407 4564 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 4565 else {
0985c7c4 4566 cpu_set_reset(&arg_cpu_set);
088d71f8 4567 arg_cpu_set = TAKE_STRUCT(settings->cpu_set);
d107bb7d
LP
4568 }
4569 }
4570
09d423e9
LP
4571 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4572 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4573 arg_resolv_conf = settings->resolv_conf;
4574
4e1d6aa9
LP
4575 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4576 settings->link_journal != _LINK_JOURNAL_INVALID) {
4577
4578 if (!arg_settings_trusted)
4579 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4580 else {
4581 arg_link_journal = settings->link_journal;
4582 arg_link_journal_try = settings->link_journal_try;
4583 }
4584 }
4585
1688841f
LP
4586 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4587 settings->timezone != _TIMEZONE_MODE_INVALID)
4588 arg_timezone = settings->timezone;
4589
de40a303
LP
4590 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4591 settings->slice) {
4592
4593 if (!arg_settings_trusted)
4594 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4595 else
4596 free_and_replace(arg_slice, settings->slice);
4597 }
4598
4599 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4600 settings->use_cgns >= 0) {
4601
4602 if (!arg_settings_trusted)
4603 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4604 else
4605 arg_use_cgns = settings->use_cgns;
4606 }
4607
4608 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
f5fbe71d 4609 settings->clone_ns_flags != ULONG_MAX) {
de40a303
LP
4610
4611 if (!arg_settings_trusted)
4612 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4613 else
4614 arg_clone_ns_flags = settings->clone_ns_flags;
4615 }
4616
4617 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4618 settings->console_mode >= 0) {
4619
4620 if (!arg_settings_trusted)
4621 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4622 else
4623 arg_console_mode = settings->console_mode;
4624 }
4625
d3689b94
LP
4626 if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
4627 settings->suppress_sync >= 0)
4a4654e0
LP
4628 arg_suppress_sync = settings->suppress_sync;
4629
de40a303
LP
4630 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4631 * don't consult arg_settings_mask for them. */
4632
4633 sd_bus_message_unref(arg_property_message);
4634 arg_property_message = TAKE_PTR(settings->properties);
4635
4636 arg_console_width = settings->console_width;
4637 arg_console_height = settings->console_height;
4638
b2645747 4639 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4640 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4641 arg_n_extra_nodes = settings->n_extra_nodes;
825210d4 4642 settings->n_extra_nodes = 0;
de40a303 4643
f757855e
LP
4644 return 0;
4645}
4646
5d961407
LP
4647static int load_settings(void) {
4648 _cleanup_(settings_freep) Settings *settings = NULL;
4649 _cleanup_fclose_ FILE *f = NULL;
3603f151 4650 _cleanup_free_ char *p = NULL;
5d961407
LP
4651 int r;
4652
de40a303
LP
4653 if (arg_oci_bundle)
4654 return 0;
4655
5d961407
LP
4656 /* If all settings are masked, there's no point in looking for
4657 * the settings file */
d7a0f1f4 4658 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
5d961407
LP
4659 return 0;
4660
5d961407
LP
4661 /* We first look in the admin's directories in /etc and /run */
4662 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4663 _cleanup_free_ char *j = NULL;
4664
3603f151 4665 j = path_join(i, arg_settings_filename);
5d961407
LP
4666 if (!j)
4667 return log_oom();
4668
4669 f = fopen(j, "re");
4670 if (f) {
4671 p = TAKE_PTR(j);
4672
4673 /* By default, we trust configuration from /etc and /run */
4674 if (arg_settings_trusted < 0)
4675 arg_settings_trusted = true;
4676
4677 break;
4678 }
4679
4680 if (errno != ENOENT)
4681 return log_error_errno(errno, "Failed to open %s: %m", j);
4682 }
4683
4684 if (!f) {
4685 /* After that, let's look for a file next to the
4686 * actual image we shall boot. */
4687
4688 if (arg_image) {
162f6477
LP
4689 r = file_in_same_dir(arg_image, arg_settings_filename, &p);
4690 if (r < 0)
4691 return log_error_errno(r, "Failed to generate settings path from image path: %m");
4692 } else if (arg_directory) {
4693 r = file_in_same_dir(arg_directory, arg_settings_filename, &p);
4694 if (r < 0 && r != -EADDRNOTAVAIL) /* if directory is root fs, don't complain */
4695 return log_error_errno(r, "Failed to generate settings path from directory path: %m");
5d961407
LP
4696 }
4697
4698 if (p) {
4699 f = fopen(p, "re");
4700 if (!f && errno != ENOENT)
4701 return log_error_errno(errno, "Failed to open %s: %m", p);
4702
4703 /* By default, we do not trust configuration from /var/lib/machines */
4704 if (arg_settings_trusted < 0)
4705 arg_settings_trusted = false;
4706 }
4707 }
4708
4709 if (!f)
4710 return 0;
4711
4712 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4713
4714 r = settings_load(f, p, &settings);
4715 if (r < 0)
4716 return r;
4717
4718 return merge_settings(settings, p);
4719}
4720
de40a303
LP
4721static int load_oci_bundle(void) {
4722 _cleanup_(settings_freep) Settings *settings = NULL;
4723 int r;
4724
4725 if (!arg_oci_bundle)
4726 return 0;
4727
4728 /* By default let's trust OCI bundles */
4729 if (arg_settings_trusted < 0)
4730 arg_settings_trusted = true;
4731
4732 r = oci_load(NULL, arg_oci_bundle, &settings);
4733 if (r < 0)
4734 return r;
4735
4736 return merge_settings(settings, arg_oci_bundle);
4737}
4738
3acc84eb 4739static int run_container(
2d845785 4740 DissectedImage *dissected_image,
b0067625
ZJS
4741 FDSet *fds,
4742 char veth_name[IFNAMSIZ], bool *veth_created,
761cf19d 4743 struct ExposeArgs *expose_args,
3acc84eb 4744 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4745
4746 static const struct sigaction sa = {
4747 .sa_handler = nop_signal_handler,
e28c7cd0 4748 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4749 };
4750
8e766630 4751 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
5bb1d7fb 4752 _cleanup_close_ int etc_passwd_lock = -EBADF;
b0067625 4753 _cleanup_close_pair_ int
19ee48a6
YW
4754 fd_inner_socket_pair[2] = PIPE_EBADF,
4755 fd_outer_socket_pair[2] = PIPE_EBADF;
8199d554 4756
5bb1d7fb 4757 _cleanup_close_ int notify_socket = -EBADF, mntns_fd = -EBADF, fd_kmsg_fifo = -EBADF;
b0067625 4758 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4759 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4760 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4761 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4762 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4763 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2f893044
LP
4764 _cleanup_free_ uid_t *bind_user_uid = NULL;
4765 size_t n_bind_user_uid = 0;
b0067625 4766 ContainerStatus container_status = 0;
b0067625
ZJS
4767 int ifi = 0, r;
4768 ssize_t l;
4769 sigset_t mask_chld;
254d1313 4770 _cleanup_close_ int child_netns_fd = -EBADF;
b0067625
ZJS
4771
4772 assert_se(sigemptyset(&mask_chld) == 0);
4773 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4774
4775 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4776 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4777 * check with getpwuid() if the specific user already exists. Note that /etc might be
4778 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4779 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4780 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4781 * really ours. */
4782
4783 etc_passwd_lock = take_etc_passwd_lock(NULL);
4784 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4785 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4786 }
4787
4788 r = barrier_create(&barrier);
4789 if (r < 0)
4790 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4791
5d9d3fcb
CB
4792 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_inner_socket_pair) < 0)
4793 return log_error_errno(errno, "Failed to create inner socket pair: %m");
4794
af06cd30
CB
4795 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_outer_socket_pair) < 0)
4796 return log_error_errno(errno, "Failed to create outer socket pair: %m");
b0067625 4797
b0067625
ZJS
4798 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4799 * parent's blocking calls and give it a chance to call wait() and terminate. */
4800 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4801 if (r < 0)
4802 return log_error_errno(errno, "Failed to change the signal mask: %m");
4803
4804 r = sigaction(SIGCHLD, &sa, NULL);
4805 if (r < 0)
4806 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4807
d7bea6b6 4808 if (arg_network_namespace_path) {
5b4855ab
DDM
4809 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4810 if (child_netns_fd < 0)
d7bea6b6
DP
4811 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4812
54c2459d 4813 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
6619ad88
LP
4814 if (r == -EUCLEAN)
4815 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4816 else if (r < 0)
d7bea6b6 4817 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4818 else if (r == 0)
4819 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4820 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4821 }
4822
b0067625
ZJS
4823 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4824 if (*pid < 0)
4825 return log_error_errno(errno, "clone() failed%s: %m",
4826 errno == EINVAL ?
4827 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4828
4829 if (*pid == 0) {
4830 /* The outer child only has a file system namespace. */
4831 barrier_set_role(&barrier, BARRIER_CHILD);
4832
5d9d3fcb 4833 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
af06cd30 4834 fd_outer_socket_pair[0] = safe_close(fd_outer_socket_pair[0]);
b0067625
ZJS
4835
4836 (void) reset_all_signal_handlers();
4837 (void) reset_signal_mask();
4838
4839 r = outer_child(&barrier,
4840 arg_directory,
2d845785 4841 dissected_image,
af06cd30 4842 fd_outer_socket_pair[1],
5d9d3fcb 4843 fd_inner_socket_pair[1],
d7bea6b6 4844 fds,
5b4855ab 4845 child_netns_fd);
b0067625
ZJS
4846 if (r < 0)
4847 _exit(EXIT_FAILURE);
4848
4849 _exit(EXIT_SUCCESS);
4850 }
4851
4852 barrier_set_role(&barrier, BARRIER_PARENT);
4853
e4077ff6 4854 fdset_close(fds);
b0067625 4855
5d9d3fcb 4856 fd_inner_socket_pair[1] = safe_close(fd_inner_socket_pair[1]);
af06cd30 4857 fd_outer_socket_pair[1] = safe_close(fd_outer_socket_pair[1]);
b0067625
ZJS
4858
4859 if (arg_userns_mode != USER_NAMESPACE_NO) {
af06cd30 4860 mntns_fd = receive_one_fd(fd_outer_socket_pair[0], 0);
b71a0192
CB
4861 if (mntns_fd < 0)
4862 return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m");
4863
b0067625 4864 /* The child just let us know the UID shift it might have read from the image. */
af06cd30 4865 l = recv(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
b0067625
ZJS
4866 if (l < 0)
4867 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4868 if (l != sizeof arg_uid_shift)
4869 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4870
4871 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4872 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4873 * image, but if that's already in use, pick a new one, and report back to the child,
4874 * which one we now picked. */
4875
4876 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4877 if (r < 0)
4878 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4879
af06cd30 4880 l = send(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
b0067625
ZJS
4881 if (l < 0)
4882 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4883 if (l != sizeof arg_uid_shift)
4884 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625 4885 }
2f893044
LP
4886
4887 n_bind_user_uid = strv_length(arg_bind_user);
4888 if (n_bind_user_uid > 0) {
4889 /* Right after the UID shift, we'll receive the list of UID mappings for the
4890 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4891
4892 bind_user_uid = new(uid_t, n_bind_user_uid*4);
4893 if (!bind_user_uid)
4894 return log_oom();
4895
4896 for (size_t i = 0; i < n_bind_user_uid; i++) {
af06cd30 4897 l = recv(fd_outer_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
2f893044
LP
4898 if (l < 0)
4899 return log_error_errno(errno, "Failed to read user UID map pair: %m");
4900 if (l != sizeof(uid_t)*4)
4901 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4902 SYNTHETIC_ERRNO(EIO),
4903 "Short read while reading bind user UID pairs.");
4904 }
4905 }
b0067625
ZJS
4906 }
4907
8199d554
LP
4908 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4909 /* The child let us know the support cgroup mode it might have read from the image. */
fefb7a6d 4910 l = recv(fd_outer_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
8199d554
LP
4911 if (l < 0)
4912 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113 4913 if (l != sizeof(arg_unified_cgroup_hierarchy))
c0f86d66 4914 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zi bytes).%s",
c6147113 4915 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4916 }
4917
b0067625 4918 /* Wait for the outer child. */
d2e0ac3d
LP
4919 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4920 if (r < 0)
4921 return r;
4922 if (r != EXIT_SUCCESS)
4923 return -EIO;
b0067625
ZJS
4924
4925 /* And now retrieve the PID of the inner child. */
af06cd30 4926 l = recv(fd_outer_socket_pair[0], pid, sizeof *pid, 0);
b0067625
ZJS
4927 if (l < 0)
4928 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4929 if (l != sizeof *pid)
4930 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4931
4932 /* We also retrieve container UUID in case it was generated by outer child */
af06cd30 4933 l = recv(fd_outer_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
b0067625
ZJS
4934 if (l < 0)
4935 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4936 if (l != sizeof(arg_uuid))
4937 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4938
4939 /* We also retrieve the socket used for notifications generated by outer child */
af06cd30 4940 notify_socket = receive_one_fd(fd_outer_socket_pair[0], 0);
b0067625
ZJS
4941 if (notify_socket < 0)
4942 return log_error_errno(notify_socket,
4943 "Failed to receive notification socket from the outer child: %m");
4944
4945 log_debug("Init process invoked as PID "PID_FMT, *pid);
4946
4947 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4948 if (!barrier_place_and_sync(&barrier)) /* #1 */
4949 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 4950
2f893044 4951 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
b0067625
ZJS
4952 if (r < 0)
4953 return r;
4954
4955 (void) barrier_place(&barrier); /* #2 */
4956 }
4957
4958 if (arg_private_network) {
75116558
PS
4959 if (!arg_network_namespace_path) {
4960 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4961 if (!barrier_place_and_sync(&barrier)) /* #3 */
4962 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4963 }
4964
5b4855ab
DDM
4965 if (child_netns_fd < 0) {
4966 /* Make sure we have an open file descriptor to the child's network
4967 * namespace so it stays alive even if the child exits. */
4968 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4969 if (r < 0)
4970 return log_error_errno(r, "Failed to open child network namespace: %m");
4971 }
4972
4973 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
b0067625
ZJS
4974 if (r < 0)
4975 return r;
4976
4977 if (arg_network_veth) {
4978 r = setup_veth(arg_machine, *pid, veth_name,
4979 arg_network_bridge || arg_network_zone);
4980 if (r < 0)
4981 return r;
4982 else if (r > 0)
4983 ifi = r;
4984
4985 if (arg_network_bridge) {
4986 /* Add the interface to a bridge */
4987 r = setup_bridge(veth_name, arg_network_bridge, false);
4988 if (r < 0)
4989 return r;
4990 if (r > 0)
4991 ifi = r;
4992 } else if (arg_network_zone) {
4993 /* Add the interface to a bridge, possibly creating it */
4994 r = setup_bridge(veth_name, arg_network_zone, true);
4995 if (r < 0)
4996 return r;
4997 if (r > 0)
4998 ifi = r;
4999 }
5000 }
5001
5002 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
5003 if (r < 0)
5004 return r;
5005
5006 /* We created the primary and extra veth links now; let's remember this, so that we know to
5007 remove them later on. Note that we don't bother with removing veth links that were created
5008 here when their setup failed half-way, because in that case the kernel should be able to
5009 remove them on its own, since they cannot be referenced by anything yet. */
5010 *veth_created = true;
5011
5012 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
5013 if (r < 0)
5014 return r;
5015
5016 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
5017 if (r < 0)
5018 return r;
5019 }
5020
abdb9b08
LP
5021 if (arg_register || !arg_keep_unit) {
5022 r = sd_bus_default_system(&bus);
5023 if (r < 0)
5024 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
5025
5026 r = sd_bus_set_close_on_exit(bus, false);
5027 if (r < 0)
5028 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
5029 }
5030
5031 if (!arg_keep_unit) {
5032 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5033 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5034 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5035
75152a4d
LP
5036 r = sd_bus_match_signal_async(
5037 bus,
5038 NULL,
5039 "org.freedesktop.systemd1",
5040 NULL,
5041 "org.freedesktop.systemd1.Scope",
5042 "RequestStop",
5043 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 5044 if (r < 0)
75152a4d 5045 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
5046 }
5047
b0067625
ZJS
5048 if (arg_register) {
5049 r = register_machine(
abdb9b08 5050 bus,
b0067625
ZJS
5051 arg_machine,
5052 *pid,
5053 arg_directory,
5054 arg_uuid,
5055 ifi,
5056 arg_slice,
5057 arg_custom_mounts, arg_n_custom_mounts,
5058 arg_kill_signal,
5059 arg_property,
de40a303 5060 arg_property_message,
b0067625
ZJS
5061 arg_keep_unit,
5062 arg_container_service_name);
5063 if (r < 0)
5064 return r;
abdb9b08 5065
cd2dfc6f
LP
5066 } else if (!arg_keep_unit) {
5067 r = allocate_scope(
abdb9b08 5068 bus,
cd2dfc6f
LP
5069 arg_machine,
5070 *pid,
5071 arg_slice,
5072 arg_custom_mounts, arg_n_custom_mounts,
5073 arg_kill_signal,
de40a303
LP
5074 arg_property,
5075 arg_property_message);
cd2dfc6f
LP
5076 if (r < 0)
5077 return r;
5078
5079 } else if (arg_slice || arg_property)
5080 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 5081
27da7ef0 5082 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
5083 if (r < 0)
5084 return r;
5085
27da7ef0 5086 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
5087 if (r < 0)
5088 return r;
b0067625 5089
de54e02d 5090 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
5091 if (r < 0)
5092 return r;
5093
5094 /* Notify the child that the parent is ready with all
5095 * its setup (including cgroup-ification), and that
5096 * the child can now hand over control to the code to
5097 * run inside the container. */
75116558 5098 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
5099
5100 /* Block SIGCHLD here, before notifying child.
5101 * process_pty() will handle it with the other signals. */
5102 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5103
5104 /* Reset signal to default */
9c274488 5105 r = default_signals(SIGCHLD);
b0067625
ZJS
5106 if (r < 0)
5107 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5108
5109 r = sd_event_new(&event);
5110 if (r < 0)
5111 return log_error_errno(r, "Failed to get default event source: %m");
5112
8fd010bb
LP
5113 (void) sd_event_set_watchdog(event, true);
5114
abdb9b08
LP
5115 if (bus) {
5116 r = sd_bus_attach_event(bus, event, 0);
5117 if (r < 0)
5118 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5119 }
5120
e96ceaba 5121 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
5122 if (r < 0)
5123 return r;
5124
b71a0192
CB
5125 if (arg_userns_mode != USER_NAMESPACE_NO) {
5126 r = wipe_fully_visible_fs(mntns_fd);
5127 if (r < 0)
5128 return r;
5129 mntns_fd = safe_close(mntns_fd);
5130 }
5131
b0067625 5132 /* Let the child know that we are ready and wait that the child is completely ready now. */
c6147113
LP
5133 if (!barrier_place_and_sync(&barrier)) /* #5 */
5134 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 5135
38ccb557 5136 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
b0067625
ZJS
5137 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5138 etc_passwd_lock = safe_close(etc_passwd_lock);
5139
04f590a4
LP
5140 (void) sd_notifyf(false,
5141 "STATUS=Container running.\n"
5142 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4bf4f50f
ZJS
5143 if (!arg_notify_ready) {
5144 r = sd_notify(false, "READY=1\n");
5145 if (r < 0)
5146 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5147 }
b0067625
ZJS
5148
5149 if (arg_kill_signal > 0) {
5150 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
5151 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5152 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
5153 } else {
5154 /* Immediately exit */
919f5ae0
LP
5155 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5156 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
5157 }
5158
988851b6
LP
5159 (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
5160
5161 r = sd_event_add_memory_pressure(event, NULL, NULL, NULL);
5162 if (r < 0)
5163 log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m");
5164
6916b164 5165 /* Exit when the child exits */
919f5ae0 5166 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625 5167
b07ee903
CB
5168 /* Retrieve the kmsg fifo allocated by inner child */
5169 fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0);
5170 if (fd_kmsg_fifo < 0)
5171 return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m");
5172
b0067625 5173 if (arg_expose_ports) {
b07ee903 5174 r = expose_port_watch_rtnl(event, fd_inner_socket_pair[0], on_address_change, expose_args, &rtnl);
b0067625
ZJS
5175 if (r < 0)
5176 return r;
5177
deff68e7
FW
5178 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5179 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5180 }
5181
3acc84eb 5182 if (arg_console_mode != CONSOLE_PIPE) {
254d1313 5183 _cleanup_close_ int fd = -EBADF;
3acc84eb 5184 PTYForwardFlags flags = 0;
de40a303 5185
3acc84eb 5186 /* Retrieve the master pty allocated by inner child */
bb1aa185 5187 fd = receive_one_fd(fd_inner_socket_pair[0], 0);
3acc84eb
FB
5188 if (fd < 0)
5189 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5190
5191 switch (arg_console_mode) {
de40a303 5192
3acc84eb
FB
5193 case CONSOLE_READ_ONLY:
5194 flags |= PTY_FORWARD_READ_ONLY;
5195
5196 _fallthrough_;
5197
5198 case CONSOLE_INTERACTIVE:
5199 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5200
5201 r = pty_forward_new(event, fd, flags, &forward);
5202 if (r < 0)
5203 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5204
f5fbe71d 5205 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
3acc84eb
FB
5206 (void) pty_forward_set_width_height(forward,
5207 arg_console_width,
5208 arg_console_height);
5209 break;
5210
5211 default:
5212 assert(arg_console_mode == CONSOLE_PASSIVE);
5213 }
5214
5215 *master = TAKE_FD(fd);
de40a303 5216 }
b0067625 5217
5d9d3fcb
CB
5218 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
5219
b0067625
ZJS
5220 r = sd_event_loop(event);
5221 if (r < 0)
5222 return log_error_errno(r, "Failed to run event loop: %m");
5223
de40a303
LP
5224 if (forward) {
5225 char last_char = 0;
b0067625 5226
de40a303
LP
5227 (void) pty_forward_get_last_char(forward, &last_char);
5228 forward = pty_forward_free(forward);
b0067625 5229
de40a303
LP
5230 if (!arg_quiet && last_char != '\n')
5231 putc('\n', stdout);
5232 }
b0067625
ZJS
5233
5234 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
5235 if (!arg_register && !arg_keep_unit && bus)
5236 terminate_scope(bus, arg_machine);
b0067625
ZJS
5237
5238 /* Normally redundant, but better safe than sorry */
c67b0082 5239 (void) kill(*pid, SIGKILL);
b0067625 5240
5d9d3fcb
CB
5241 fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
5242
5b4855ab
DDM
5243 if (arg_private_network) {
5244 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5245 * to avoid having to move the parent to the child network namespace. */
5246 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
5247 if (r < 0)
5248 return r;
5249
5250 if (r == 0) {
254d1313 5251 _cleanup_close_ int parent_netns_fd = -EBADF;
5b4855ab 5252
19b761a0 5253 r = namespace_open(getpid_cached(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5b4855ab
DDM
5254 if (r < 0) {
5255 log_error_errno(r, "Failed to open parent network namespace: %m");
5256 _exit(EXIT_FAILURE);
5257 }
5258
5259 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5260 if (r < 0) {
5261 log_error_errno(r, "Failed to enter child network namespace: %m");
5262 _exit(EXIT_FAILURE);
5263 }
5264
2f091b1b
TM
5265 /* Reverse network interfaces pair list so that interfaces get their initial name back.
5266 * This is about ensuring interfaces get their old name back when being moved back. */
5267 arg_network_interfaces = strv_reverse(arg_network_interfaces);
5268
5b4855ab
DDM
5269 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5270 if (r < 0)
5271 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5272
5273 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5274 }
5275 }
5276
8f03de53 5277 r = wait_for_container(TAKE_PID(*pid), &container_status);
b0067625 5278
0bb0a9fa
ZJS
5279 /* Tell machined that we are gone. */
5280 if (bus)
5281 (void) unregister_machine(bus, arg_machine);
5282
b0067625
ZJS
5283 if (r < 0)
5284 /* We failed to wait for the container, or the container exited abnormally. */
5285 return r;
5286 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
5287 /* r > 0 → The container exited with a non-zero status.
5288 * As a special case, we need to replace 133 with a different value,
5289 * because 133 is special-cased in the service file to reboot the container.
5290 * otherwise → The container exited with zero status and a reboot was not requested.
5291 */
2a49b612 5292 if (r == EXIT_FORCE_RESTART)
27e29a1e 5293 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 5294 *ret = r;
b0067625
ZJS
5295 return 0; /* finito */
5296 }
5297
5298 /* CONTAINER_REBOOTED, loop again */
5299
5300 if (arg_keep_unit) {
5301 /* Special handling if we are running as a service: instead of simply
5302 * restarting the machine we want to restart the entire service, so let's
5303 * inform systemd about this with the special exit code 133. The service
5304 * file uses RestartForceExitStatus=133 so that this results in a full
5305 * nspawn restart. This is necessary since we might have cgroup parameters
5306 * set we want to have flushed out. */
2a49b612
ZJS
5307 *ret = EXIT_FORCE_RESTART;
5308 return 0; /* finito */
b0067625
ZJS
5309 }
5310
deff68e7
FW
5311 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5312 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5313
5314 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5315 *veth_created = false;
5316 return 1; /* loop again */
5317}
5318
bf428efb 5319static int initialize_rlimits(void) {
852b6250 5320 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
bf428efb
LP
5321 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5322 * container execution environments. */
5323
5324 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
852b6250
LP
5325 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5326 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5327 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5328 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5329 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5330 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5331 [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5332 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5333 [RLIMIT_NICE] = { 0, 0 },
5334 [RLIMIT_NOFILE] = { 1024, 4096 },
5335 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5336 [RLIMIT_RTPRIO] = { 0, 0 },
5337 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5338 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
bf428efb
LP
5339
5340 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5341 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5342 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5343 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5344 * that PID 1 changes a number of other resource limits during early initialization which is why we
5345 * don't read the other limits from PID 1 but prefer the static table above. */
5346 };
5347
5348 int rl;
5349
5350 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
5351 /* Let's only fill in what the user hasn't explicitly configured anyway */
5352 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5353 const struct rlimit *v;
5354 struct rlimit buffer;
5355
5356 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5357 /* For these two let's read the limits off PID 1. See above for an explanation. */
5358
5359 if (prlimit(1, rl, NULL, &buffer) < 0)
5360 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5361
dbf1aca6
LP
5362 v = &buffer;
5363 } else if (rl == RLIMIT_NOFILE) {
5364 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5365 * userspace. Given that nspawn containers are often run without our PID 1,
5366 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5367 * so that container userspace gets similar resources as host userspace
5368 * gets. */
5369 buffer = kernel_defaults[rl];
5370 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
bf428efb
LP
5371 v = &buffer;
5372 } else
5373 v = kernel_defaults + rl;
5374
5375 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5376 if (!arg_rlimit[rl])
5377 return log_oom();
5378 }
5379
5380 if (DEBUG_LOGGING) {
5381 _cleanup_free_ char *k = NULL;
5382
5383 (void) rlimit_format(arg_rlimit[rl], &k);
5384 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5385 }
5386 }
5387
5388 return 0;
5389}
5390
287b7376 5391static int cant_be_in_netns(void) {
254d1313 5392 _cleanup_close_ int fd = -EBADF;
287b7376
LP
5393 struct ucred ucred;
5394 int r;
5395
5396 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5397 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5398 * nice message. */
5399
5400 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5401 return 0;
5402
5403 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5404 if (fd < 0)
5405 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5406
1861986a 5407 r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control");
bb44fd07
ZJS
5408 if (r == -ENOENT || ERRNO_IS_NEG_DISCONNECT(r))
5409 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5410 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5411 if (r < 0)
1861986a 5412 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
287b7376
LP
5413
5414 r = getpeercred(fd, &ucred);
5415 if (r < 0)
5416 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5417
f7a2dc3d 5418 r = in_same_namespace(ucred.pid, 0, NAMESPACE_NET);
287b7376 5419 if (r < 0)
f7a2dc3d
CB
5420 return log_error_errno(r, "Failed to determine network namespace of udev: %m");
5421 if (r == 0)
287b7376
LP
5422 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5423 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5424 return 0;
5425}
5426
44dbef90 5427static int run(int argc, char *argv[]) {
4c27749b 5428 bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
5bb1d7fb 5429 _cleanup_close_ int master = -EBADF;
03cfe0d5 5430 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 5431 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 5432 char veth_name[IFNAMSIZ] = "";
761cf19d 5433 struct ExposeArgs expose_args = {};
8e766630 5434 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082 5435 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 5436 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e 5437 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
761cf19d 5438 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
7bf011e3 5439 pid_t pid = 0;
03cfe0d5
LP
5440
5441 log_parse_environment();
5442 log_open();
415fc41c 5443
03cfe0d5
LP
5444 r = parse_argv(argc, argv);
5445 if (r <= 0)
5446 goto finish;
5447
38ee19c0
ZJS
5448 if (geteuid() != 0) {
5449 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5450 argc >= 2 ? "Need to be root." :
5451 "Need to be root (and some arguments are usually required).\nHint: try --help");
03cfe0d5 5452 goto finish;
38ee19c0 5453 }
fba868fa 5454
287b7376
LP
5455 r = cant_be_in_netns();
5456 if (r < 0)
5457 goto finish;
5458
bf428efb
LP
5459 r = initialize_rlimits();
5460 if (r < 0)
5461 goto finish;
5462
de40a303
LP
5463 r = load_oci_bundle();
5464 if (r < 0)
5465 goto finish;
5466
f757855e
LP
5467 r = determine_names();
5468 if (r < 0)
5469 goto finish;
5470
5471 r = load_settings();
5472 if (r < 0)
5473 goto finish;
5474
d4d99bc6 5475 r = cg_unified();
5eee8290
LP
5476 if (r < 0) {
5477 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5478 goto finish;
5479 }
5480
f757855e
LP
5481 r = verify_arguments();
5482 if (r < 0)
5483 goto finish;
03cfe0d5 5484
2f091b1b
TM
5485 r = verify_network_interfaces_initialized();
5486 if (r < 0)
5487 goto finish;
5488
49048684
ZJS
5489 /* Reapply environment settings. */
5490 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 5491
2949ff26
LP
5492 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5493 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5494 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
9c274488 5495 (void) ignore_signals(SIGPIPE);
2949ff26 5496
03cfe0d5
LP
5497 n_fd_passed = sd_listen_fds(false);
5498 if (n_fd_passed > 0) {
5499 r = fdset_new_listen_fds(&fds, false);
5500 if (r < 0) {
5501 log_error_errno(r, "Failed to collect file descriptors: %m");
5502 goto finish;
5503 }
5504 }
5505
83e803a9
ZJS
5506 /* The "default" umask. This is appropriate for most file and directory
5507 * operations performed by nspawn, and is the umask that will be used for
5508 * the child. Functions like copy_devnodes() change the umask temporarily. */
5509 umask(0022);
5510
03cfe0d5
LP
5511 if (arg_directory) {
5512 assert(!arg_image);
5513
b35ca61a
LP
5514 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5515 * /var from the host will propagate into container dynamically (because bad things happen if
5516 * two systems write to the same /var). Let's allow it for the special cases where /var is
5517 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5518 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
1406bd66
LP
5519 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5520 "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
5521 goto finish;
5522 }
5523
5524 if (arg_ephemeral) {
5525 _cleanup_free_ char *np = NULL;
5526
f461a28d 5527 r = chase_and_update(&arg_directory, 0);
3f342ec4
LP
5528 if (r < 0)
5529 goto finish;
5530
7bf011e3
LP
5531 /* If the specified path is a mount point we generate the new snapshot immediately
5532 * inside it under a random name. However if the specified is not a mount point we
5533 * create the new snapshot in the parent directory, just next to it. */
e1873695 5534 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
5535 if (r < 0) {
5536 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5537 goto finish;
5538 }
5539 if (r > 0)
770b5ce4 5540 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 5541 else
770b5ce4 5542 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 5543 if (r < 0) {
0f3be6ca 5544 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
5545 goto finish;
5546 }
5547
6992459c 5548 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
162392b7 5549 * only owned by us and no one else. */
6992459c 5550 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
5551 if (r < 0) {
5552 log_error_errno(r, "Failed to lock %s: %m", np);
5553 goto finish;
5554 }
5555
7bf011e3
LP
5556 {
5557 BLOCK_SIGNALS(SIGINT);
fab4ef72
DDM
5558 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_directory, AT_FDCWD, np,
5559 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5560 BTRFS_SNAPSHOT_FALLBACK_COPY |
5561 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5562 BTRFS_SNAPSHOT_RECURSIVE |
5563 BTRFS_SNAPSHOT_QUOTA |
5564 BTRFS_SNAPSHOT_SIGINT);
7bf011e3
LP
5565 }
5566 if (r == -EINTR) {
5567 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5568 goto finish;
5569 }
03cfe0d5
LP
5570 if (r < 0) {
5571 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5572 goto finish;
ec16945e
LP
5573 }
5574
1cc6c93a 5575 free_and_replace(arg_directory, np);
17cbb288 5576 remove_directory = true;
30535c16 5577 } else {
f461a28d 5578 r = chase_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
5579 if (r < 0)
5580 goto finish;
5581
30535c16
LP
5582 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5583 if (r == -EBUSY) {
5584 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5585 goto finish;
5586 }
5587 if (r < 0) {
5588 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 5589 goto finish;
30535c16
LP
5590 }
5591
5592 if (arg_template) {
f461a28d 5593 r = chase_and_update(&arg_template, 0);
3f342ec4
LP
5594 if (r < 0)
5595 goto finish;
5596
7bf011e3
LP
5597 {
5598 BLOCK_SIGNALS(SIGINT);
fab4ef72
DDM
5599 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_template, AT_FDCWD, arg_directory,
5600 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5601 BTRFS_SNAPSHOT_FALLBACK_COPY |
5602 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5603 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5604 BTRFS_SNAPSHOT_RECURSIVE |
5605 BTRFS_SNAPSHOT_QUOTA |
5606 BTRFS_SNAPSHOT_SIGINT);
7bf011e3 5607 }
ff6c6cc1
LP
5608 if (r == -EEXIST)
5609 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5610 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
5611 else if (r == -EINTR) {
5612 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5613 goto finish;
5614 } else if (r < 0) {
83521414 5615 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 5616 goto finish;
ff6c6cc1
LP
5617 } else
5618 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5619 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 5620 }
ec16945e
LP
5621 }
5622
7732f92b 5623 if (arg_start_mode == START_BOOT) {
aff7ae0d 5624 _cleanup_free_ char *b = NULL;
a5201ed6 5625 const char *p;
c9fe05e0 5626
aff7ae0d
LP
5627 if (arg_pivot_root_new) {
5628 b = path_join(arg_directory, arg_pivot_root_new);
5629 if (!b)
5630 return log_oom();
5631
5632 p = b;
5633 } else
a5201ed6 5634 p = arg_directory;
c9fe05e0
AR
5635
5636 if (path_is_os_tree(p) <= 0) {
aff7ae0d
LP
5637 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5638 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
1b9e5b12
LP
5639 goto finish;
5640 }
5641 } else {
aff7ae0d 5642 _cleanup_free_ char *p = NULL;
c9fe05e0 5643
a5201ed6 5644 if (arg_pivot_root_new)
aff7ae0d 5645 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
a5201ed6 5646 else
aff7ae0d
LP
5647 p = path_join(arg_directory, "/usr/");
5648 if (!p)
5649 return log_oom();
1b9e5b12 5650
aff7ae0d
LP
5651 if (laccess(p, F_OK) < 0) {
5652 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5653 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
1b9e5b12 5654 goto finish;
1b9e5b12
LP
5655 }
5656 }
ec16945e 5657
6b9132a9 5658 } else {
d04faa4e 5659 DissectImageFlags dissect_image_flags =
4b5de5dd 5660 DISSECT_IMAGE_GENERIC_ROOT |
d04faa4e
LP
5661 DISSECT_IMAGE_REQUIRE_ROOT |
5662 DISSECT_IMAGE_RELAX_VAR_CHECK |
73d88b80
LP
5663 DISSECT_IMAGE_USR_NO_ROOT |
5664 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
5665 DISSECT_IMAGE_PIN_PARTITION_DEVICES;
ec16945e
LP
5666 assert(arg_image);
5667 assert(!arg_template);
5668
f461a28d 5669 r = chase_and_update(&arg_image, 0);
3f342ec4
LP
5670 if (r < 0)
5671 goto finish;
5672
0f3be6ca
LP
5673 if (arg_ephemeral) {
5674 _cleanup_free_ char *np = NULL;
5675
5676 r = tempfn_random(arg_image, "machine.", &np);
5677 if (r < 0) {
5678 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5679 goto finish;
5680 }
5681
6992459c
LP
5682 /* Always take an exclusive lock on our own ephemeral copy. */
5683 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
5684 if (r < 0) {
5685 r = log_error_errno(r, "Failed to create image lock: %m");
5686 goto finish;
5687 }
5688
7bf011e3
LP
5689 {
5690 BLOCK_SIGNALS(SIGINT);
7c2f5495
DDM
5691 r = copy_file_full(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600,
5692 FS_NOCOW_FL, FS_NOCOW_FL,
5693 COPY_REFLINK|COPY_CRTIME|COPY_SIGINT,
5694 NULL, NULL);
7bf011e3
LP
5695 }
5696 if (r == -EINTR) {
5697 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5698 goto finish;
5699 }
0f3be6ca
LP
5700 if (r < 0) {
5701 r = log_error_errno(r, "Failed to copy image file: %m");
5702 goto finish;
5703 }
5704
1cc6c93a 5705 free_and_replace(arg_image, np);
0f3be6ca
LP
5706 remove_image = true;
5707 } else {
5708 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5709 if (r == -EBUSY) {
5710 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5711 goto finish;
5712 }
5713 if (r < 0) {
5714 r = log_error_errno(r, "Failed to create image lock: %m");
5715 goto finish;
5716 }
4623e8e6 5717
89e62e0b
LP
5718 r = verity_settings_load(
5719 &arg_verity_settings,
5720 arg_image, NULL, NULL);
e7cbe5cb
LB
5721 if (r < 0) {
5722 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5723 goto finish;
78ebe980 5724 }
89e62e0b
LP
5725
5726 if (arg_verity_settings.data_path)
5727 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
30535c16
LP
5728 }
5729
c67b0082 5730 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5731 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5732 goto finish;
1b9e5b12 5733 }
6b9132a9 5734
c67b0082
LP
5735 remove_tmprootdir = true;
5736
5737 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5738 if (!arg_directory) {
5739 r = log_oom();
5740 goto finish;
6b9132a9 5741 }
88213476 5742
89e62e0b
LP
5743 r = loop_device_make_by_path(
5744 arg_image,
5745 arg_read_only ? O_RDONLY : O_RDWR,
22ee78a8 5746 /* sector_size= */ UINT32_MAX,
89e62e0b 5747 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
7f52206a 5748 LOCK_SH,
89e62e0b 5749 &loop);
2d845785
LP
5750 if (r < 0) {
5751 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5752 goto finish;
5753 }
1b9e5b12 5754
bad31660 5755 r = dissect_loop_device_and_warn(
bad31660 5756 loop,
89e62e0b 5757 &arg_verity_settings,
84be0c71
LP
5758 /* mount_options=*/ NULL,
5759 arg_image_policy ?: &image_policy_container,
e7cbe5cb 5760 dissect_image_flags,
e0f9e7bd 5761 &dissected_image);
2d845785 5762 if (r == -ENOPKG) {
4526113f 5763 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5764 log_notice("Note that the disk image needs to\n"
5765 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5766 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
db811444 5767 " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
2d845785
LP
5768 " d) or contain a file system without a partition table\n"
5769 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5770 goto finish;
2d845785 5771 }
4526113f 5772 if (r < 0)
842f3b0f 5773 goto finish;
1b9e5b12 5774
88b3300f
LP
5775 r = dissected_image_load_verity_sig_partition(
5776 dissected_image,
5777 loop->fd,
5778 &arg_verity_settings);
5779 if (r < 0)
5780 goto finish;
5781
8ee9615e
LP
5782 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
5783 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5784 "root hash signature found! Proceeding without integrity checking.", arg_image);
4623e8e6 5785
89e62e0b
LP
5786 r = dissected_image_decrypt_interactively(
5787 dissected_image,
5788 NULL,
5789 &arg_verity_settings,
e330f97a 5790 0);
1b9e5b12
LP
5791 if (r < 0)
5792 goto finish;
0f3be6ca
LP
5793
5794 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5795 if (remove_image && unlink(arg_image) >= 0)
5796 remove_image = false;
4c27749b
LP
5797
5798 if (arg_architecture < 0)
5799 arg_architecture = dissected_image_architecture(dissected_image);
842f3b0f 5800 }
842f3b0f 5801
86c0dd4a 5802 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5803 if (r < 0)
5804 goto finish;
5805
de40a303
LP
5806 if (arg_console_mode < 0)
5807 arg_console_mode =
5808 isatty(STDIN_FILENO) > 0 &&
5809 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5810
de40a303
LP
5811 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5812 arg_quiet = true;
a258bf26 5813
9c857b9d 5814 if (!arg_quiet)
c85c2f79 5815 log_info("Spawning container %s on %s.\nPress Ctrl-] three times within 1s to kill container.",
9c857b9d
LP
5816 arg_machine, arg_image ?: arg_directory);
5817
988851b6 5818 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
a258bf26 5819
8c3fe1b5
LP
5820 r = make_reaper_process(true);
5821 if (r < 0) {
5822 log_error_errno(r, "Failed to become subreaper: %m");
03cfe0d5
LP
5823 goto finish;
5824 }
5825
761cf19d
FW
5826 if (arg_expose_ports) {
5827 r = fw_ctx_new(&fw_ctx);
5828 if (r < 0) {
5829 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5830 goto finish;
5831 }
5832 expose_args.fw_ctx = fw_ctx;
5833 }
d87be9b0 5834 for (;;) {
3acc84eb 5835 r = run_container(dissected_image,
44dbef90
LP
5836 fds,
5837 veth_name, &veth_created,
761cf19d 5838 &expose_args, &master,
44dbef90 5839 &pid, &ret);
b0067625 5840 if (r <= 0)
d87be9b0 5841 break;
d87be9b0 5842 }
88213476
LP
5843
5844finish:
04f590a4
LP
5845 (void) sd_notify(false,
5846 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5847 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5848
9444b1f2 5849 if (pid > 0)
c67b0082 5850 (void) kill(pid, SIGKILL);
88213476 5851
503546da 5852 /* Try to flush whatever is still queued in the pty */
6a0f896b 5853 if (master >= 0) {
f5fbe71d 5854 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
6a0f896b
LP
5855 master = safe_close(master);
5856 }
5857
5858 if (pid > 0)
5859 (void) wait_for_terminate(pid, NULL);
503546da 5860
50ebcf6c
LP
5861 pager_close();
5862
17cbb288 5863 if (remove_directory && arg_directory) {
ec16945e
LP
5864 int k;
5865
17cbb288 5866 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5867 if (k < 0)
17cbb288 5868 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5869 }
5870
0f3be6ca
LP
5871 if (remove_image && arg_image) {
5872 if (unlink(arg_image) < 0)
5873 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5874 }
5875
c67b0082
LP
5876 if (remove_tmprootdir) {
5877 if (rmdir(tmprootdir) < 0)
5878 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5879 }
5880
785890ac
LP
5881 if (arg_machine) {
5882 const char *p;
5883
63c372cb 5884 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5885 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5886 }
5887
deff68e7
FW
5888 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5889 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
7513c5b8
LP
5890
5891 if (veth_created)
5892 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5893 (void) remove_bridge(arg_network_zone);
f757855e 5894
f757855e
LP
5895 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5896 expose_port_free_all(arg_expose_ports);
bf428efb 5897 rlimit_free_all(arg_rlimit);
b2645747 5898 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3652872a 5899 credential_free_all(arg_credentials, arg_n_credentials);
6d0b55c2 5900
44dbef90
LP
5901 if (r < 0)
5902 return r;
5903
5904 return ret;
88213476 5905}
44dbef90
LP
5906
/* Program entry point: hands run() to the main-function wrapper macro. run() returns its negative 'r'
 * when something failed, and otherwise 'ret', the value filled in by run_container().
 * NOTE(review): the macro is defined outside this file chunk; judging by its name it presumably treats
 * positive return values of run() as the process' failure exit status — confirm against its definition. */
5907DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);