]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
Merge pull request #19913 from yuwata/network-fix-counter
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
88213476 2
349cc4a5 3#if HAVE_BLKID
8fe0087e 4#endif
88213476 5#include <errno.h>
88213476 6#include <getopt.h>
503f480f 7#include <linux/fs.h>
1b9e5b12 8#include <linux/loop.h>
349cc4a5 9#if HAVE_SELINUX
8fe0087e 10#include <selinux/selinux.h>
1b9e5b12 11#endif
8fe0087e 12#include <stdlib.h>
8fe0087e 13#include <sys/file.h>
335d2ead 14#include <sys/ioctl.h>
8fe0087e
LP
15#include <sys/personality.h>
16#include <sys/prctl.h>
17#include <sys/types.h>
6916b164 18#include <sys/wait.h>
335d2ead 19#include <termios.h>
8fe0087e 20#include <unistd.h>
1b9e5b12 21
b053cd5f 22#include "sd-bus.h"
1f0cd86b 23#include "sd-daemon.h"
1f0cd86b 24#include "sd-id128.h"
8fe0087e 25
b5efdb8a 26#include "alloc-util.h"
8fe0087e
LP
27#include "barrier.h"
28#include "base-filesystem.h"
29#include "blkid-util.h"
30#include "btrfs-util.h"
b8ea7a6e 31#include "bus-error.h"
b053cd5f 32#include "bus-util.h"
8fe0087e 33#include "cap-list.h"
430f0182 34#include "capability-util.h"
04d391da 35#include "cgroup-util.h"
8fe0087e 36#include "copy.h"
d107bb7d 37#include "cpu-set-util.h"
786d19fd 38#include "creds-util.h"
4fc9982c 39#include "dev-setup.h"
57f1b61b 40#include "discover-image.h"
2d845785 41#include "dissect-image.h"
8fe0087e 42#include "env-util.h"
3652872a 43#include "escape.h"
3ffd4af2 44#include "fd-util.h"
842f3b0f 45#include "fdset.h"
a5c32cff 46#include "fileio.h"
f97b34a6 47#include "format-util.h"
f4f15635 48#include "fs-util.h"
1b9e5b12 49#include "gpt.h"
4623e8e6 50#include "hexdecoct.h"
e2054217 51#include "hostname-setup.h"
8fe0087e 52#include "hostname-util.h"
910fd145 53#include "id128-util.h"
3652872a 54#include "io-util.h"
8fe0087e 55#include "log.h"
2d845785 56#include "loop-util.h"
8fe0087e 57#include "loopback-setup.h"
8fe0087e 58#include "macro.h"
44dbef90 59#include "main-func.h"
f5947a5e 60#include "missing_sched.h"
8fe0087e 61#include "mkdir.h"
4349cd7c 62#include "mount-util.h"
049af8ad 63#include "mountpoint-util.h"
0cb8e3d1 64#include "namespace-util.h"
8fe0087e 65#include "netlink-util.h"
2f893044 66#include "nspawn-bind-user.h"
07630cea 67#include "nspawn-cgroup.h"
3652872a 68#include "nspawn-creds.h"
3603efde 69#include "nspawn-def.h"
07630cea
LP
70#include "nspawn-expose-ports.h"
71#include "nspawn-mount.h"
72#include "nspawn-network.h"
de40a303 73#include "nspawn-oci.h"
7336138e 74#include "nspawn-patch-uid.h"
07630cea 75#include "nspawn-register.h"
910fd145 76#include "nspawn-seccomp.h"
07630cea
LP
77#include "nspawn-settings.h"
78#include "nspawn-setuid.h"
7732f92b 79#include "nspawn-stub-pid1.h"
91181e07 80#include "nspawn.h"
d8b4d14d 81#include "nulstr-util.h"
d58ad743 82#include "os-util.h"
50ebcf6c 83#include "pager.h"
614b022c 84#include "parse-argument.h"
6bedfcbb 85#include "parse-util.h"
8fe0087e 86#include "path-util.h"
294bf0c3 87#include "pretty-print.h"
0b452006 88#include "process-util.h"
8fe0087e
LP
89#include "ptyfwd.h"
90#include "random-util.h"
8869a0b4 91#include "raw-clone.h"
86775e35 92#include "resolve-util.h"
bf428efb 93#include "rlimit-util.h"
8fe0087e 94#include "rm-rf.h"
de40a303
LP
95#if HAVE_SECCOMP
96#include "seccomp-util.h"
97#endif
68b02049 98#include "selinux-util.h"
8fe0087e 99#include "signal-util.h"
2583fbea 100#include "socket-util.h"
8fcde012 101#include "stat-util.h"
15a5e950 102#include "stdio-util.h"
5c828e66 103#include "string-table.h"
07630cea 104#include "string-util.h"
8fe0087e 105#include "strv.h"
de40a303 106#include "sysctl-util.h"
8fe0087e 107#include "terminal-util.h"
e4de7287 108#include "tmpfile-util.h"
affb60b1 109#include "umask-util.h"
43c3fb46 110#include "unit-name.h"
b1d4f8e1 111#include "user-util.h"
8fe0087e 112#include "util.h"
e9642be2 113
e96ceaba
LP
114/* The notify socket inside the container it can use to talk to nspawn using the sd_notify(3) protocol */
115#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
0e7ac751 116
2a49b612
ZJS
117#define EXIT_FORCE_RESTART 133
118
113cea80
DH
/* Result codes describing how a container run ended. The numeric values are spelled out, they are the
 * same ones the implicit enumeration assigns. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
123
88213476 124static char *arg_directory = NULL;
ec16945e 125static char *arg_template = NULL;
5f932eb9 126static char *arg_chdir = NULL;
b53ede69
PW
127static char *arg_pivot_root_new = NULL;
128static char *arg_pivot_root_old = NULL;
687d0825 129static char *arg_user = NULL;
de40a303
LP
130static uid_t arg_uid = UID_INVALID;
131static gid_t arg_gid = GID_INVALID;
132static gid_t* arg_supplementary_gids = NULL;
133static size_t arg_n_supplementary_gids = 0;
9444b1f2 134static sd_id128_t arg_uuid = {};
3a9530e5
LP
135static char *arg_machine = NULL; /* The name used by the host to refer to this */
136static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
137static const char *arg_selinux_context = NULL;
138static const char *arg_selinux_apifs_context = NULL;
de40a303 139static char *arg_slice = NULL;
ff01d048 140static bool arg_private_network = false;
bc2f673e 141static bool arg_read_only = false;
7732f92b 142static StartMode arg_start_mode = START_PID1;
ec16945e 143static bool arg_ephemeral = false;
57fb9fb5 144static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 145static bool arg_link_journal_try = false;
520e0d54 146static uint64_t arg_caps_retain =
50b52222
LP
147 (1ULL << CAP_AUDIT_CONTROL) |
148 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
149 (1ULL << CAP_CHOWN) |
150 (1ULL << CAP_DAC_OVERRIDE) |
151 (1ULL << CAP_DAC_READ_SEARCH) |
152 (1ULL << CAP_FOWNER) |
153 (1ULL << CAP_FSETID) |
154 (1ULL << CAP_IPC_OWNER) |
155 (1ULL << CAP_KILL) |
156 (1ULL << CAP_LEASE) |
157 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 158 (1ULL << CAP_MKNOD) |
5076f0cc
LP
159 (1ULL << CAP_NET_BIND_SERVICE) |
160 (1ULL << CAP_NET_BROADCAST) |
161 (1ULL << CAP_NET_RAW) |
5076f0cc 162 (1ULL << CAP_SETFCAP) |
50b52222 163 (1ULL << CAP_SETGID) |
5076f0cc
LP
164 (1ULL << CAP_SETPCAP) |
165 (1ULL << CAP_SETUID) |
166 (1ULL << CAP_SYS_ADMIN) |
50b52222 167 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
168 (1ULL << CAP_SYS_CHROOT) |
169 (1ULL << CAP_SYS_NICE) |
170 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 171 (1ULL << CAP_SYS_RESOURCE) |
50b52222 172 (1ULL << CAP_SYS_TTY_CONFIG);
88fc9c9b 173static uint64_t arg_caps_ambient = 0;
de40a303 174static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
5a8af538 175static CustomMount *arg_custom_mounts = NULL;
88614c8a 176static size_t arg_n_custom_mounts = 0;
f4889f65 177static char **arg_setenv = NULL;
284c0b91 178static bool arg_quiet = false;
eb91eb18 179static bool arg_register = true;
89f7c846 180static bool arg_keep_unit = false;
aa28aefe 181static char **arg_network_interfaces = NULL;
c74e630d 182static char **arg_network_macvlan = NULL;
4bbfe7ad 183static char **arg_network_ipvlan = NULL;
69c79d3c 184static bool arg_network_veth = false;
f6d6bad1 185static char **arg_network_veth_extra = NULL;
f757855e 186static char *arg_network_bridge = NULL;
22b28dfd 187static char *arg_network_zone = NULL;
d7bea6b6 188static char *arg_network_namespace_path = NULL;
bb068de0 189static PagerFlags arg_pager_flags = 0;
050f7277 190static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 191static char *arg_image = NULL;
de40a303 192static char *arg_oci_bundle = NULL;
f757855e 193static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 194static ExposePort *arg_expose_ports = NULL;
f36933fe 195static char **arg_property = NULL;
de40a303 196static sd_bus_message *arg_property_message = NULL;
0de7acce 197static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 198static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
6c045a99 199static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
c6c8f6e2 200static int arg_kill_signal = 0;
5da38d07 201static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
202static SettingsMask arg_settings_mask = 0;
203static int arg_settings_trusted = -1;
204static char **arg_parameters = NULL;
6aadfa4c 205static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 206static bool arg_notify_ready = false;
5a8ff0e6 207static bool arg_use_cgns = true;
0c582db0 208static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 209static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
aee36b4e 210static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
6b000af4
LP
211static char **arg_syscall_allow_list = NULL;
212static char **arg_syscall_deny_list = NULL;
de40a303
LP
213#if HAVE_SECCOMP
214static scmp_filter_ctx arg_seccomp = NULL;
215#endif
bf428efb 216static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 217static bool arg_no_new_privileges = false;
81f345df
LP
218static int arg_oom_score_adjust = 0;
219static bool arg_oom_score_adjust_set = false;
0985c7c4 220static CPUSet arg_cpu_set = {};
09d423e9 221static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 222static TimezoneMode arg_timezone = TIMEZONE_AUTO;
f5fbe71d 223static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
de40a303
LP
224static DeviceNode* arg_extra_nodes = NULL;
225static size_t arg_n_extra_nodes = 0;
226static char **arg_sysctl = NULL;
227static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
3652872a
LP
228static Credential *arg_credentials = NULL;
229static size_t arg_n_credentials = 0;
2f893044 230static char **arg_bind_user = NULL;
88213476 231
6145bb4f
LP
/* Cleanup handlers for the option variables above. Each line pairs one variable with the function that
 * releases it; the order of registration is kept as is. */

/* Plain heap strings/arrays */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);

/* Environment and networking */
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);

/* Image, unit properties and payload command line */
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);

/* System call filter; the seccomp context only exists when built with seccomp support */
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif

STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
6145bb4f 264
dce66ffe
ZJS
265static int handle_arg_console(const char *arg) {
266 if (streq(arg, "help")) {
10e8a60b
LP
267 puts("autopipe\n"
268 "interactive\n"
dce66ffe 269 "passive\n"
10e8a60b
LP
270 "pipe\n"
271 "read-only");
dce66ffe
ZJS
272 return 0;
273 }
274
275 if (streq(arg, "interactive"))
276 arg_console_mode = CONSOLE_INTERACTIVE;
277 else if (streq(arg, "read-only"))
278 arg_console_mode = CONSOLE_READ_ONLY;
279 else if (streq(arg, "passive"))
280 arg_console_mode = CONSOLE_PASSIVE;
554c4beb
LP
281 else if (streq(arg, "pipe")) {
282 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
283 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
284 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
285 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
286 "Proceeding anyway.");
287
dce66ffe 288 arg_console_mode = CONSOLE_PIPE;
10e8a60b
LP
289 } else if (streq(arg, "autopipe")) {
290 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
291 arg_console_mode = CONSOLE_INTERACTIVE;
292 else
293 arg_console_mode = CONSOLE_PIPE;
554c4beb 294 } else
dce66ffe
ZJS
295 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
296
297 arg_settings_mask |= SETTING_CONSOLE_MODE;
298 return 1;
299}
300
37ec0fdd
LP
301static int help(void) {
302 _cleanup_free_ char *link = NULL;
303 int r;
304
bb068de0 305 (void) pager_open(arg_pager_flags);
50ebcf6c 306
37ec0fdd
LP
307 r = terminal_urlify_man("systemd-nspawn", "1", &link);
308 if (r < 0)
309 return log_oom();
310
25148653 311 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
37a92352 312 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
a8828ed9
DW
313 " -h --help Show this help\n"
314 " --version Print version string\n"
69c79d3c 315 " -q --quiet Do not show status information\n"
bb068de0 316 " --no-pager Do not pipe output into a pager\n"
25148653
LP
317 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
318 "%3$sImage:%4$s\n"
1b9e5b12 319 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
320 " --template=PATH Initialize root directory from template directory,\n"
321 " if missing\n"
322 " -x --ephemeral Run container with snapshot of root directory, and\n"
323 " remove it after exit\n"
25e68fd3
LP
324 " -i --image=PATH Root file system disk image (or device node) for\n"
325 " the container\n"
de40a303 326 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
327 " --read-only Mount the root directory read-only\n"
328 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 329 " --root-hash=HASH Specify verity root hash for root disk image\n"
c2923fdc
LB
330 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
331 " as a DER encoded PKCS7, either as a path to a file\n"
332 " or as an ASCII base64 encoded string prefixed by\n"
333 " 'base64:'\n"
e7cbe5cb 334 " --verity-data=PATH Specify hash device for verity\n"
25148653
LP
335 " --pivot-root=PATH[:PATH]\n"
336 " Pivot root to given directory in the container\n\n"
337 "%3$sExecution:%4$s\n"
7732f92b 338 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 339 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 340 " --chdir=PATH Set working directory in the container\n"
25148653
LP
341 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
342 " -u --user=USER Run the command under specified user or UID\n"
343 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
344 " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
345 "%3$sSystem Identity:%4$s\n"
a8828ed9 346 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 347 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
348 " --uuid=UUID Set a specific machine UUID for the container\n\n"
349 "%3$sProperties:%4$s\n"
a8828ed9 350 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 351 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
352 " --register=BOOLEAN Register container as machine\n"
353 " --keep-unit Do not register a scope for the machine, reuse\n"
354 " the service unit nspawn is running in\n\n"
355 "%3$sUser Namespacing:%4$s\n"
90b4a64d 356 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 357 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 358 " Similar, but with user configured UID/GID range\n"
6c045a99
LP
359 " --private-users-ownership=MODE\n"
360 " Adjust ('chown') or map ('map') OS tree ownership\n"
361 " to private UID/GID range\n\n"
25148653 362 "%3$sNetworking:%4$s\n"
69c79d3c
LP
363 " --private-network Disable network in container\n"
364 " --network-interface=INTERFACE\n"
365 " Assign an existing network interface to the\n"
366 " container\n"
c74e630d
LP
367 " --network-macvlan=INTERFACE\n"
368 " Create a macvlan network interface based on an\n"
369 " existing network interface to the container\n"
4bbfe7ad
TG
370 " --network-ipvlan=INTERFACE\n"
371 " Create a ipvlan network interface based on an\n"
372 " existing network interface to the container\n"
a8eaaee7 373 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 374 " and container\n"
f6d6bad1
LP
375 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
376 " Add an additional virtual Ethernet link between\n"
377 " host and container\n"
ab046dde 378 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
379 " Add a virtual Ethernet connection to the container\n"
380 " and attach it to an existing bridge on the host\n"
381 " --network-zone=NAME Similar, but attach the new interface to an\n"
382 " an automatically managed bridge interface\n"
d7bea6b6
DP
383 " --network-namespace-path=PATH\n"
384 " Set network namespace to the one represented by\n"
385 " the specified kernel namespace file node\n"
6d0b55c2 386 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
387 " Expose a container IP port on the host\n\n"
388 "%3$sSecurity:%4$s\n"
a8828ed9
DW
389 " --capability=CAP In addition to the default, retain specified\n"
390 " capability\n"
391 " --drop-capability=CAP Drop the specified capability from the default set\n"
88fc9c9b
TH
392 " --ambient-capability=CAP\n"
393 " Sets the specified capability for the started\n"
394 " process. Not useful if booting a machine.\n"
f4e803c8 395 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
396 " --system-call-filter=LIST|~LIST\n"
397 " Permit/prohibit specific system calls\n"
25148653
LP
398 " -Z --selinux-context=SECLABEL\n"
399 " Set the SELinux security context to be used by\n"
400 " processes in the container\n"
401 " -L --selinux-apifs-context=SECLABEL\n"
402 " Set the SELinux security context to be used by\n"
403 " API/tmpfs file systems in the container\n\n"
404 "%3$sResources:%4$s\n"
bf428efb 405 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
406 " --oom-score-adjust=VALUE\n"
407 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
408 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
409 " --personality=ARCH Pick personality for this container\n\n"
25148653 410 "%3$sIntegration:%4$s\n"
09d423e9 411 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 412 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
413 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
414 " host, try-guest, try-host\n"
415 " -j Equivalent to --link-journal=try-guest\n\n"
416 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
417 " --bind=PATH[:PATH[:OPTIONS]]\n"
418 " Bind mount a file or directory from the host into\n"
a8828ed9 419 " the container\n"
5e5bfa6e
EY
420 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
421 " Similar, but creates a read-only bind mount\n"
de40a303
LP
422 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
423 " it\n"
06c17c39 424 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
425 " --overlay=PATH[:PATH...]:PATH\n"
426 " Create an overlay mount from the host to \n"
427 " the container\n"
428 " --overlay-ro=PATH[:PATH...]:PATH\n"
2f893044
LP
429 " Similar, but creates a read-only overlay mount\n"
430 " --bind-user=NAME Bind user from host to container\n\n"
25148653 431 "%3$sInput/Output:%4$s\n"
de40a303
LP
432 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
433 " set up for the container.\n"
3652872a
LP
434 " -P --pipe Equivalent to --console=pipe\n\n"
435 "%3$sCredentials:%4$s\n"
436 " --set-credential=ID:VALUE\n"
437 " Pass a credential with literal value to container.\n"
438 " --load-credential=ID:PATH\n"
439 " Load credential to pass to container from file or\n"
440 " AF_UNIX stream socket.\n"
bc556335
DDM
441 "\nSee the %2$s for details.\n",
442 program_invocation_short_name,
443 link,
444 ansi_underline(),
445 ansi_normal(),
446 ansi_highlight(),
447 ansi_normal());
37ec0fdd
LP
448
449 return 0;
88213476
LP
450}
451
86c0dd4a 452static int custom_mount_check_all(void) {
88614c8a 453 size_t i;
5a8af538 454
5a8af538
LP
455 for (i = 0; i < arg_n_custom_mounts; i++) {
456 CustomMount *m = &arg_custom_mounts[i];
457
0de7acce 458 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
6c045a99 459 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
baaa35ad 460 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
c920b863 461 "--private-users-ownership=own may not be combined with custom root mounts.");
6c045a99 462 if (arg_uid_shift == UID_INVALID)
baaa35ad
ZJS
463 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
464 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 465 }
5a8af538
LP
466 }
467
468 return 0;
469}
470
8199d554 471static int detect_unified_cgroup_hierarchy_from_environment(void) {
c78c095b 472 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
415fc41c 473 int r;
5da38d07 474
efdb0237 475 /* Allow the user to control whether the unified hierarchy is used */
c78c095b
ZJS
476
477 e = getenv(var);
478 if (!e) {
d5fc5b2f 479 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
c78c095b
ZJS
480 var = "UNIFIED_CGROUP_HIERARCHY";
481 e = getenv(var);
c78c095b
ZJS
482 }
483
484 if (!isempty(e)) {
efdb0237
LP
485 r = parse_boolean(e);
486 if (r < 0)
c78c095b 487 return log_error_errno(r, "Failed to parse $%s: %m", var);
5da38d07
TH
488 if (r > 0)
489 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
490 else
491 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
492 }
493
8199d554
LP
494 return 0;
495}
496
497static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
498 int r;
499
75b0d8b8
ZJS
500 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
501 * in the image actually supports. */
b4cccbc1
LP
502 r = cg_all_unified();
503 if (r < 0)
504 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
505 if (r > 0) {
a8725a06
ZJS
506 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
507 * routine only detects 231, so we'll have a false negative here for 230. */
508 r = systemd_installation_has_version(directory, 230);
509 if (r < 0)
510 return log_error_errno(r, "Failed to determine systemd version in container: %m");
511 if (r > 0)
512 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
513 else
514 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 515 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
516 /* Mixed cgroup hierarchy support was added in 233 */
517 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
518 if (r < 0)
519 return log_error_errno(r, "Failed to determine systemd version in container: %m");
520 if (r > 0)
521 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
522 else
523 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
524 } else
5da38d07 525 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 526
8199d554
LP
527 log_debug("Using %s hierarchy for container.",
528 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
529 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
530
efdb0237
LP
531 return 0;
532}
533
8a99bd0c
ZJS
534static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
535 uint64_t mask = 0;
536 int r;
537
538 for (;;) {
539 _cleanup_free_ char *t = NULL;
540
541 r = extract_first_word(&spec, &t, ",", 0);
542 if (r < 0)
543 return log_error_errno(r, "Failed to parse capability %s.", t);
544 if (r == 0)
545 break;
546
547 if (streq(t, "help")) {
548 for (int i = 0; i < capability_list_length(); i++) {
549 const char *name;
550
551 name = capability_to_name(i);
552 if (name)
553 puts(name);
554 }
555
556 return 0; /* quit */
557 }
558
559 if (streq(t, "all"))
f5fbe71d 560 mask = UINT64_MAX;
8a99bd0c
ZJS
561 else {
562 r = capability_from_name(t);
563 if (r < 0)
564 return log_error_errno(r, "Failed to parse capability %s.", t);
565
566 mask |= 1ULL << r;
567 }
568 }
569
570 *ret_mask = mask;
571 return 1; /* continue */
572}
573
49048684 574static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
0c582db0
LB
575 int r;
576
577 r = getenv_bool(name);
578 if (r == -ENXIO)
49048684 579 return 0;
0c582db0 580 if (r < 0)
49048684 581 return log_error_errno(r, "Failed to parse $%s: %m", name);
de40a303 582
0c582db0 583 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 584 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
49048684 585 return 0;
0c582db0
LB
586}
587
49048684 588static int parse_mount_settings_env(void) {
4f086aab 589 const char *e;
1099ceeb
LP
590 int r;
591
592 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
49048684
ZJS
593 if (r < 0 && r != -ENXIO)
594 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
1099ceeb
LP
595 if (r >= 0)
596 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
4f086aab
SU
597
598 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
49048684 599 if (streq_ptr(e, "network"))
4f086aab 600 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
4f086aab 601
49048684
ZJS
602 else if (e) {
603 r = parse_boolean(e);
604 if (r < 0)
605 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
606
607 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
608 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
ab8ee0f2 609 }
4f086aab 610
49048684 611 return 0;
4f086aab
SU
612}
613
49048684 614static int parse_environment(void) {
d5455d2f
LP
615 const char *e;
616 int r;
617
49048684
ZJS
618 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
619 if (r < 0)
620 return r;
621 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
622 if (r < 0)
623 return r;
624 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
625 if (r < 0)
626 return r;
627 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
628 if (r < 0)
629 return r;
d5455d2f 630
49048684
ZJS
631 r = parse_mount_settings_env();
632 if (r < 0)
633 return r;
d5455d2f 634
489fae52
ZJS
635 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
636 * even if it is supported. If not supported, it has no effect. */
de40a303 637 if (!cg_ns_supported())
489fae52 638 arg_use_cgns = false;
de40a303
LP
639 else {
640 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
641 if (r < 0) {
642 if (r != -ENXIO)
49048684 643 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
de40a303
LP
644
645 arg_use_cgns = true;
646 } else {
647 arg_use_cgns = r > 0;
648 arg_settings_mask |= SETTING_USE_CGNS;
649 }
650 }
d5455d2f
LP
651
652 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
653 if (e)
654 arg_container_service_name = e;
655
49048684 656 return detect_unified_cgroup_hierarchy_from_environment();
d5455d2f
LP
657}
658
88213476 659static int parse_argv(int argc, char *argv[]) {
a41fe3a2 660 enum {
acbeb427
ZJS
661 ARG_VERSION = 0x100,
662 ARG_PRIVATE_NETWORK,
bc2f673e 663 ARG_UUID,
5076f0cc 664 ARG_READ_ONLY,
57fb9fb5 665 ARG_CAPABILITY,
88fc9c9b 666 ARG_AMBIENT_CAPABILITY,
420c7379 667 ARG_DROP_CAPABILITY,
17fe0523
LP
668 ARG_LINK_JOURNAL,
669 ARG_BIND,
f4889f65 670 ARG_BIND_RO,
06c17c39 671 ARG_TMPFS,
5a8af538
LP
672 ARG_OVERLAY,
673 ARG_OVERLAY_RO,
de40a303 674 ARG_INACCESSIBLE,
eb91eb18 675 ARG_SHARE_SYSTEM,
89f7c846 676 ARG_REGISTER,
aa28aefe 677 ARG_KEEP_UNIT,
69c79d3c 678 ARG_NETWORK_INTERFACE,
c74e630d 679 ARG_NETWORK_MACVLAN,
4bbfe7ad 680 ARG_NETWORK_IPVLAN,
ab046dde 681 ARG_NETWORK_BRIDGE,
22b28dfd 682 ARG_NETWORK_ZONE,
f6d6bad1 683 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 684 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 685 ARG_PERSONALITY,
4d9f07b4 686 ARG_VOLATILE,
ec16945e 687 ARG_TEMPLATE,
f36933fe 688 ARG_PROPERTY,
6dac160c 689 ARG_PRIVATE_USERS,
c6c8f6e2 690 ARG_KILL_SIGNAL,
f757855e 691 ARG_SETTINGS,
5f932eb9 692 ARG_CHDIR,
b53ede69 693 ARG_PIVOT_ROOT,
7336138e 694 ARG_PRIVATE_USERS_CHOWN,
6c045a99 695 ARG_PRIVATE_USERS_OWNERSHIP,
9c1e04d0 696 ARG_NOTIFY_READY,
4623e8e6 697 ARG_ROOT_HASH,
89e62e0b
LP
698 ARG_ROOT_HASH_SIG,
699 ARG_VERITY_DATA,
960e4569 700 ARG_SYSTEM_CALL_FILTER,
bf428efb 701 ARG_RLIMIT,
3a9530e5 702 ARG_HOSTNAME,
66edd963 703 ARG_NO_NEW_PRIVILEGES,
81f345df 704 ARG_OOM_SCORE_ADJUST,
d107bb7d 705 ARG_CPU_AFFINITY,
09d423e9 706 ARG_RESOLV_CONF,
1688841f 707 ARG_TIMEZONE,
de40a303
LP
708 ARG_CONSOLE,
709 ARG_PIPE,
710 ARG_OCI_BUNDLE,
bb068de0 711 ARG_NO_PAGER,
3652872a
LP
712 ARG_SET_CREDENTIAL,
713 ARG_LOAD_CREDENTIAL,
2f893044 714 ARG_BIND_USER,
a41fe3a2
LP
715 };
716
88213476 717 static const struct option options[] = {
d7bea6b6
DP
718 { "help", no_argument, NULL, 'h' },
719 { "version", no_argument, NULL, ARG_VERSION },
720 { "directory", required_argument, NULL, 'D' },
721 { "template", required_argument, NULL, ARG_TEMPLATE },
722 { "ephemeral", no_argument, NULL, 'x' },
723 { "user", required_argument, NULL, 'u' },
724 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
725 { "as-pid2", no_argument, NULL, 'a' },
726 { "boot", no_argument, NULL, 'b' },
727 { "uuid", required_argument, NULL, ARG_UUID },
728 { "read-only", no_argument, NULL, ARG_READ_ONLY },
729 { "capability", required_argument, NULL, ARG_CAPABILITY },
88fc9c9b 730 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
d7bea6b6 731 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 732 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
733 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
734 { "bind", required_argument, NULL, ARG_BIND },
735 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
736 { "tmpfs", required_argument, NULL, ARG_TMPFS },
737 { "overlay", required_argument, NULL, ARG_OVERLAY },
738 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 739 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 740 { "machine", required_argument, NULL, 'M' },
3a9530e5 741 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
742 { "slice", required_argument, NULL, 'S' },
743 { "setenv", required_argument, NULL, 'E' },
744 { "selinux-context", required_argument, NULL, 'Z' },
745 { "selinux-apifs-context", required_argument, NULL, 'L' },
746 { "quiet", no_argument, NULL, 'q' },
747 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
748 { "register", required_argument, NULL, ARG_REGISTER },
749 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
750 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
751 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
752 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
753 { "network-veth", no_argument, NULL, 'n' },
754 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
755 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
756 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
757 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
758 { "personality", required_argument, NULL, ARG_PERSONALITY },
759 { "image", required_argument, NULL, 'i' },
760 { "volatile", optional_argument, NULL, ARG_VOLATILE },
761 { "port", required_argument, NULL, 'p' },
762 { "property", required_argument, NULL, ARG_PROPERTY },
763 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
6c045a99
LP
764 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
765 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
d7bea6b6
DP
766 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
767 { "settings", required_argument, NULL, ARG_SETTINGS },
768 { "chdir", required_argument, NULL, ARG_CHDIR },
769 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
770 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
771 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
89e62e0b
LP
772 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
773 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
d7bea6b6 774 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 775 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 776 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 777 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 778 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 779 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
780 { "console", required_argument, NULL, ARG_CONSOLE },
781 { "pipe", no_argument, NULL, ARG_PIPE },
782 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 783 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
3652872a
LP
784 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
785 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
2f893044 786 { "bind-user", required_argument, NULL, ARG_BIND_USER },
eb9da376 787 {}
88213476
LP
788 };
789
9444b1f2 790 int c, r;
a42c8b54 791 uint64_t plus = 0, minus = 0;
f757855e 792 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
793
794 assert(argc >= 0);
795 assert(argv);
796
de40a303 797 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
798 switch (c) {
799
800 case 'h':
37ec0fdd 801 return help();
88213476 802
acbeb427 803 case ARG_VERSION:
3f6fd1ba 804 return version();
acbeb427 805
88213476 806 case 'D':
614b022c 807 r = parse_path_argument(optarg, false, &arg_directory);
ec16945e 808 if (r < 0)
0f03c2a4 809 return r;
de40a303
LP
810
811 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
812 break;
813
814 case ARG_TEMPLATE:
614b022c 815 r = parse_path_argument(optarg, false, &arg_template);
ec16945e 816 if (r < 0)
0f03c2a4 817 return r;
de40a303
LP
818
819 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
820 break;
821
1b9e5b12 822 case 'i':
614b022c 823 r = parse_path_argument(optarg, false, &arg_image);
ec16945e 824 if (r < 0)
0f03c2a4 825 return r;
de40a303
LP
826
827 arg_settings_mask |= SETTING_DIRECTORY;
828 break;
829
830 case ARG_OCI_BUNDLE:
614b022c 831 r = parse_path_argument(optarg, false, &arg_oci_bundle);
de40a303
LP
832 if (r < 0)
833 return r;
834
ec16945e
LP
835 break;
836
837 case 'x':
838 arg_ephemeral = true;
a2f577fc 839 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
840 break;
841
687d0825 842 case 'u':
2fc09a9c
DM
843 r = free_and_strdup(&arg_user, optarg);
844 if (r < 0)
7027ff61 845 return log_oom();
687d0825 846
f757855e 847 arg_settings_mask |= SETTING_USER;
687d0825
MV
848 break;
849
22b28dfd
LP
850 case ARG_NETWORK_ZONE: {
851 char *j;
852
b910cc72 853 j = strjoin("vz-", optarg);
22b28dfd
LP
854 if (!j)
855 return log_oom();
856
857 if (!ifname_valid(j)) {
858 log_error("Network zone name not valid: %s", j);
859 free(j);
860 return -EINVAL;
861 }
862
df1fac6d 863 free_and_replace(arg_network_zone, j);
22b28dfd
LP
864
865 arg_network_veth = true;
866 arg_private_network = true;
867 arg_settings_mask |= SETTING_NETWORK;
868 break;
869 }
870
ab046dde 871 case ARG_NETWORK_BRIDGE:
ef76dff2 872
baaa35ad
ZJS
873 if (!ifname_valid(optarg))
874 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
875 "Bridge interface name not valid: %s", optarg);
ef76dff2 876
f757855e
LP
877 r = free_and_strdup(&arg_network_bridge, optarg);
878 if (r < 0)
879 return log_oom();
ab046dde 880
4831981d 881 _fallthrough_;
0dfaa006 882 case 'n':
69c79d3c
LP
883 arg_network_veth = true;
884 arg_private_network = true;
f757855e 885 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
886 break;
887
f6d6bad1
LP
888 case ARG_NETWORK_VETH_EXTRA:
889 r = veth_extra_parse(&arg_network_veth_extra, optarg);
890 if (r < 0)
891 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
892
893 arg_private_network = true;
894 arg_settings_mask |= SETTING_NETWORK;
895 break;
896
aa28aefe 897 case ARG_NETWORK_INTERFACE:
baaa35ad
ZJS
898 if (!ifname_valid(optarg))
899 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
900 "Network interface name not valid: %s", optarg);
ef76dff2 901
b390f178
DDM
902 r = test_network_interface_initialized(optarg);
903 if (r < 0)
904 return r;
905
c74e630d
LP
906 if (strv_extend(&arg_network_interfaces, optarg) < 0)
907 return log_oom();
908
909 arg_private_network = true;
f757855e 910 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
911 break;
912
913 case ARG_NETWORK_MACVLAN:
ef76dff2 914
baaa35ad
ZJS
915 if (!ifname_valid(optarg))
916 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
917 "MACVLAN network interface name not valid: %s", optarg);
ef76dff2 918
b390f178
DDM
919 r = test_network_interface_initialized(optarg);
920 if (r < 0)
921 return r;
922
c74e630d 923 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
924 return log_oom();
925
4bbfe7ad 926 arg_private_network = true;
f757855e 927 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
928 break;
929
930 case ARG_NETWORK_IPVLAN:
ef76dff2 931
baaa35ad
ZJS
932 if (!ifname_valid(optarg))
933 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
934 "IPVLAN network interface name not valid: %s", optarg);
ef76dff2 935
b390f178
DDM
936 r = test_network_interface_initialized(optarg);
937 if (r < 0)
938 return r;
939
4bbfe7ad
TG
940 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
941 return log_oom();
942
4831981d 943 _fallthrough_;
ff01d048
LP
944 case ARG_PRIVATE_NETWORK:
945 arg_private_network = true;
f757855e 946 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
947 break;
948
d7bea6b6 949 case ARG_NETWORK_NAMESPACE_PATH:
614b022c 950 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
d7bea6b6
DP
951 if (r < 0)
952 return r;
953
de40a303 954 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
955 break;
956
0f0dbc46 957 case 'b':
baaa35ad
ZJS
958 if (arg_start_mode == START_PID2)
959 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
960 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
961
962 arg_start_mode = START_BOOT;
963 arg_settings_mask |= SETTING_START_MODE;
964 break;
965
966 case 'a':
baaa35ad
ZJS
967 if (arg_start_mode == START_BOOT)
968 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
969 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
970
971 arg_start_mode = START_PID2;
972 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
973 break;
974
144f0fc0 975 case ARG_UUID:
9444b1f2 976 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
977 if (r < 0)
978 return log_error_errno(r, "Invalid UUID: %s", optarg);
979
baaa35ad
ZJS
980 if (sd_id128_is_null(arg_uuid))
981 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
982 "Machine UUID may not be all zeroes.");
f757855e
LP
983
984 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 985 break;
aa96c6cb 986
43c3fb46
LP
987 case 'S': {
988 _cleanup_free_ char *mangled = NULL;
989
990 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
de40a303
LP
991 if (r < 0)
992 return log_oom();
993
43c3fb46 994 free_and_replace(arg_slice, mangled);
de40a303 995 arg_settings_mask |= SETTING_SLICE;
144f0fc0 996 break;
43c3fb46 997 }
144f0fc0 998
7027ff61 999 case 'M':
c1521918 1000 if (isempty(optarg))
97b11eed 1001 arg_machine = mfree(arg_machine);
c1521918 1002 else {
52ef5dd7 1003 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1004 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1005 "Invalid machine name: %s", optarg);
7027ff61 1006
0c3c4284
LP
1007 r = free_and_strdup(&arg_machine, optarg);
1008 if (r < 0)
eb91eb18 1009 return log_oom();
eb91eb18 1010 }
9ce6d1b3 1011 break;
7027ff61 1012
3a9530e5
LP
1013 case ARG_HOSTNAME:
1014 if (isempty(optarg))
1015 arg_hostname = mfree(arg_hostname);
1016 else {
52ef5dd7 1017 if (!hostname_is_valid(optarg, 0))
baaa35ad
ZJS
1018 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1019 "Invalid hostname: %s", optarg);
3a9530e5
LP
1020
1021 r = free_and_strdup(&arg_hostname, optarg);
1022 if (r < 0)
1023 return log_oom();
1024 }
1025
1026 arg_settings_mask |= SETTING_HOSTNAME;
1027 break;
1028
82adf6af
LP
1029 case 'Z':
1030 arg_selinux_context = optarg;
a8828ed9
DW
1031 break;
1032
82adf6af
LP
1033 case 'L':
1034 arg_selinux_apifs_context = optarg;
a8828ed9
DW
1035 break;
1036
bc2f673e
LP
1037 case ARG_READ_ONLY:
1038 arg_read_only = true;
f757855e 1039 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
1040 break;
1041
88fc9c9b
TH
1042 case ARG_AMBIENT_CAPABILITY: {
1043 uint64_t m;
1044 r = parse_capability_spec(optarg, &m);
1045 if (r <= 0)
1046 return r;
1047 arg_caps_ambient |= m;
1048 arg_settings_mask |= SETTING_CAPABILITY;
1049 break;
1050 }
420c7379
LP
1051 case ARG_CAPABILITY:
1052 case ARG_DROP_CAPABILITY: {
8a99bd0c
ZJS
1053 uint64_t m;
1054 r = parse_capability_spec(optarg, &m);
1055 if (r <= 0)
1056 return r;
5076f0cc 1057
8a99bd0c
ZJS
1058 if (c == ARG_CAPABILITY)
1059 plus |= m;
1060 else
1061 minus |= m;
f757855e 1062 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
1063 break;
1064 }
66edd963
LP
1065 case ARG_NO_NEW_PRIVILEGES:
1066 r = parse_boolean(optarg);
1067 if (r < 0)
1068 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1069
1070 arg_no_new_privileges = r;
1071 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1072 break;
1073
57fb9fb5
LP
1074 case 'j':
1075 arg_link_journal = LINK_GUEST;
574edc90 1076 arg_link_journal_try = true;
4e1d6aa9 1077 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1078 break;
1079
1080 case ARG_LINK_JOURNAL:
4e1d6aa9 1081 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
c6147113
LP
1082 if (r < 0)
1083 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5 1084
4e1d6aa9 1085 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
1086 break;
1087
17fe0523 1088 case ARG_BIND:
f757855e
LP
1089 case ARG_BIND_RO:
1090 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1091 if (r < 0)
1092 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 1093
f757855e 1094 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 1095 break;
06c17c39 1096
f757855e
LP
1097 case ARG_TMPFS:
1098 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1099 if (r < 0)
1100 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 1101
f757855e 1102 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 1103 break;
5a8af538
LP
1104
1105 case ARG_OVERLAY:
ad85779a
LP
1106 case ARG_OVERLAY_RO:
1107 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1108 if (r == -EADDRNOTAVAIL)
1109 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1110 if (r < 0)
1111 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 1112
f757855e 1113 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 1114 break;
06c17c39 1115
de40a303
LP
1116 case ARG_INACCESSIBLE:
1117 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1118 if (r < 0)
1119 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1120
1121 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1122 break;
1123
a5f1cb3b 1124 case 'E': {
baaa35ad
ZJS
1125 if (!env_assignment_is_valid(optarg))
1126 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1127 "Environment variable assignment '%s' is not valid.", optarg);
aaf057c4
ZJS
1128 r = strv_env_replace_strdup(&arg_setenv, optarg);
1129 if (r < 0)
1130 return r;
f4889f65 1131
f757855e 1132 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
1133 break;
1134 }
1135
284c0b91
LP
1136 case 'q':
1137 arg_quiet = true;
1138 break;
1139
8a96d94e 1140 case ARG_SHARE_SYSTEM:
a6b5216c 1141 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1142 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1143 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1144 arg_clone_ns_flags = 0;
8a96d94e
LP
1145 break;
1146
eb91eb18
LP
1147 case ARG_REGISTER:
1148 r = parse_boolean(optarg);
1149 if (r < 0) {
1150 log_error("Failed to parse --register= argument: %s", optarg);
1151 return r;
1152 }
1153
1154 arg_register = r;
1155 break;
1156
89f7c846
LP
1157 case ARG_KEEP_UNIT:
1158 arg_keep_unit = true;
1159 break;
1160
6afc95b7
LP
1161 case ARG_PERSONALITY:
1162
ac45f971 1163 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1164 if (arg_personality == PERSONALITY_INVALID)
1165 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1166 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1167
f757855e 1168 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1169 break;
1170
4d9f07b4
LP
1171 case ARG_VOLATILE:
1172
1173 if (!optarg)
f757855e 1174 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1175 else if (streq(optarg, "help")) {
1176 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1177 return 0;
1178 } else {
f757855e 1179 VolatileMode m;
4d9f07b4 1180
f757855e 1181 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1182 if (m < 0)
1183 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1184 "Failed to parse --volatile= argument: %s", optarg);
1185 else
f757855e 1186 arg_volatile_mode = m;
6d0b55c2
LP
1187 }
1188
f757855e
LP
1189 arg_settings_mask |= SETTING_VOLATILE_MODE;
1190 break;
6d0b55c2 1191
f757855e
LP
1192 case 'p':
1193 r = expose_port_parse(&arg_expose_ports, optarg);
1194 if (r == -EEXIST)
1195 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1196 if (r < 0)
1197 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1198
f757855e 1199 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1200 break;
6d0b55c2 1201
f36933fe
LP
1202 case ARG_PROPERTY:
1203 if (strv_extend(&arg_property, optarg) < 0)
1204 return log_oom();
1205
1206 break;
1207
ae209204 1208 case ARG_PRIVATE_USERS: {
33eac552 1209 int boolean;
0de7acce 1210
ae209204
ZJS
1211 if (!optarg)
1212 boolean = true;
1213 else if (!in_charset(optarg, DIGITS))
1214 /* do *not* parse numbers as booleans */
1215 boolean = parse_boolean(optarg);
33eac552
LP
1216 else
1217 boolean = -1;
ae209204 1218
33eac552 1219 if (boolean == 0) {
0de7acce
LP
1220 /* no: User namespacing off */
1221 arg_userns_mode = USER_NAMESPACE_NO;
1222 arg_uid_shift = UID_INVALID;
1223 arg_uid_range = UINT32_C(0x10000);
33eac552 1224 } else if (boolean > 0) {
0de7acce
LP
1225 /* yes: User namespacing on, UID range is read from root dir */
1226 arg_userns_mode = USER_NAMESPACE_FIXED;
1227 arg_uid_shift = UID_INVALID;
1228 arg_uid_range = UINT32_C(0x10000);
1229 } else if (streq(optarg, "pick")) {
1230 /* pick: User namespacing on, UID range is picked randomly */
6c045a99
LP
1231 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1232 * implied by USER_NAMESPACE_PICK
33eac552 1233 * further down. */
0de7acce
LP
1234 arg_uid_shift = UID_INVALID;
1235 arg_uid_range = UINT32_C(0x10000);
33eac552
LP
1236
1237 } else if (streq(optarg, "identity")) {
1238 /* identitiy: User namespaces on, UID range is map the 0…0xFFFF range to
1239 * itself, i.e. we don't actually map anything, but do take benefit of
1240 * isolation of capability sets. */
1241 arg_userns_mode = USER_NAMESPACE_FIXED;
1242 arg_uid_shift = 0;
1243 arg_uid_range = UINT32_C(0x10000);
0de7acce 1244 } else {
6c2058b3 1245 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1246 const char *range, *shift;
1247
0de7acce
LP
1248 /* anything else: User namespacing on, UID range is explicitly configured */
1249
6dac160c
LP
1250 range = strchr(optarg, ':');
1251 if (range) {
6c2058b3
ZJS
1252 buffer = strndup(optarg, range - optarg);
1253 if (!buffer)
1254 return log_oom();
1255 shift = buffer;
6dac160c
LP
1256
1257 range++;
bfd292ec
ZJS
1258 r = safe_atou32(range, &arg_uid_range);
1259 if (r < 0)
be715731 1260 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1261 } else
1262 shift = optarg;
1263
be715731
ZJS
1264 r = parse_uid(shift, &arg_uid_shift);
1265 if (r < 0)
1266 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1267
1268 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c 1269
58e13de5
LP
1270 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1271 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1272 }
be715731 1273
0de7acce 1274 arg_settings_mask |= SETTING_USERNS;
6dac160c 1275 break;
ae209204 1276 }
6dac160c 1277
0de7acce 1278 case 'U':
ccabee0d 1279 if (userns_supported()) {
6c045a99
LP
1280 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1281 * implied by USER_NAMESPACE_PICK
33eac552 1282 * further down. */
ccabee0d
LP
1283 arg_uid_shift = UID_INVALID;
1284 arg_uid_range = UINT32_C(0x10000);
1285
1286 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1287 }
1288
7336138e
LP
1289 break;
1290
0de7acce 1291 case ARG_PRIVATE_USERS_CHOWN:
6c045a99
LP
1292 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1293
1294 arg_settings_mask |= SETTING_USERNS;
1295 break;
1296
1297 case ARG_PRIVATE_USERS_OWNERSHIP:
1298 if (streq(optarg, "help")) {
1299 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1300 return 0;
1301 }
1302
1303 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1304 if (arg_userns_ownership < 0)
1305 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
0de7acce
LP
1306
1307 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1308 break;
1309
c6c8f6e2 1310 case ARG_KILL_SIGNAL:
5c828e66
LP
1311 if (streq(optarg, "help")) {
1312 DUMP_STRING_TABLE(signal, int, _NSIG);
1313 return 0;
1314 }
1315
29a3db75 1316 arg_kill_signal = signal_from_string(optarg);
baaa35ad 1317 if (arg_kill_signal < 0)
7211c853 1318 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
c6c8f6e2 1319
f757855e
LP
1320 arg_settings_mask |= SETTING_KILL_SIGNAL;
1321 break;
1322
1323 case ARG_SETTINGS:
1324
1325 /* no → do not read files
1326 * yes → read files, do not override cmdline, trust only subset
1327 * override → read files, override cmdline, trust only subset
1328 * trusted → read files, do not override cmdline, trust all
1329 */
1330
1331 r = parse_boolean(optarg);
1332 if (r < 0) {
1333 if (streq(optarg, "trusted")) {
1334 mask_all_settings = false;
1335 mask_no_settings = false;
1336 arg_settings_trusted = true;
1337
1338 } else if (streq(optarg, "override")) {
1339 mask_all_settings = false;
1340 mask_no_settings = true;
1341 arg_settings_trusted = -1;
1342 } else
1343 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1344 } else if (r > 0) {
1345 /* yes */
1346 mask_all_settings = false;
1347 mask_no_settings = false;
1348 arg_settings_trusted = -1;
1349 } else {
1350 /* no */
1351 mask_all_settings = true;
1352 mask_no_settings = false;
1353 arg_settings_trusted = false;
1354 }
1355
c6c8f6e2
LP
1356 break;
1357
5f932eb9 1358 case ARG_CHDIR:
baaa35ad
ZJS
1359 if (!path_is_absolute(optarg))
1360 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1361 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1362
1363 r = free_and_strdup(&arg_chdir, optarg);
1364 if (r < 0)
1365 return log_oom();
1366
1367 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1368 break;
1369
b53ede69
PW
1370 case ARG_PIVOT_ROOT:
1371 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1372 if (r < 0)
1373 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1374
1375 arg_settings_mask |= SETTING_PIVOT_ROOT;
1376 break;
1377
9c1e04d0
AP
1378 case ARG_NOTIFY_READY:
1379 r = parse_boolean(optarg);
baaa35ad
ZJS
1380 if (r < 0)
1381 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1382 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1383 arg_notify_ready = r;
1384 arg_settings_mask |= SETTING_NOTIFY_READY;
1385 break;
1386
4623e8e6 1387 case ARG_ROOT_HASH: {
89e62e0b 1388 _cleanup_free_ void *k = NULL;
4623e8e6
LP
1389 size_t l;
1390
1391 r = unhexmem(optarg, strlen(optarg), &k, &l);
1392 if (r < 0)
1393 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
89e62e0b 1394 if (l < sizeof(sd_id128_t))
c6147113 1395 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
4623e8e6 1396
89e62e0b
LP
1397 free_and_replace(arg_verity_settings.root_hash, k);
1398 arg_verity_settings.root_hash_size = l;
4623e8e6
LP
1399 break;
1400 }
1401
c2923fdc
LB
1402 case ARG_ROOT_HASH_SIG: {
1403 char *value;
89e62e0b
LP
1404 size_t l;
1405 void *p;
c2923fdc
LB
1406
1407 if ((value = startswith(optarg, "base64:"))) {
c2923fdc
LB
1408 r = unbase64mem(value, strlen(value), &p, &l);
1409 if (r < 0)
1410 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1411
c2923fdc 1412 } else {
89e62e0b 1413 r = read_full_file(optarg, (char**) &p, &l);
c2923fdc 1414 if (r < 0)
89e62e0b 1415 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
c2923fdc
LB
1416 }
1417
89e62e0b
LP
1418 free_and_replace(arg_verity_settings.root_hash_sig, p);
1419 arg_verity_settings.root_hash_sig_size = l;
c2923fdc
LB
1420 break;
1421 }
1422
89e62e0b 1423 case ARG_VERITY_DATA:
614b022c 1424 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
89e62e0b
LP
1425 if (r < 0)
1426 return r;
1427 break;
1428
960e4569
LP
1429 case ARG_SYSTEM_CALL_FILTER: {
1430 bool negative;
1431 const char *items;
1432
1433 negative = optarg[0] == '~';
1434 items = negative ? optarg + 1 : optarg;
1435
1436 for (;;) {
1437 _cleanup_free_ char *word = NULL;
1438
1439 r = extract_first_word(&items, &word, NULL, 0);
1440 if (r == 0)
1441 break;
1442 if (r == -ENOMEM)
1443 return log_oom();
1444 if (r < 0)
1445 return log_error_errno(r, "Failed to parse system call filter: %m");
1446
1447 if (negative)
6b000af4 1448 r = strv_extend(&arg_syscall_deny_list, word);
960e4569 1449 else
6b000af4 1450 r = strv_extend(&arg_syscall_allow_list, word);
960e4569
LP
1451 if (r < 0)
1452 return log_oom();
1453 }
1454
1455 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1456 break;
1457 }
1458
bf428efb
LP
1459 case ARG_RLIMIT: {
1460 const char *eq;
622ecfa8 1461 _cleanup_free_ char *name = NULL;
bf428efb
LP
1462 int rl;
1463
5c828e66
LP
1464 if (streq(optarg, "help")) {
1465 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1466 return 0;
1467 }
1468
bf428efb 1469 eq = strchr(optarg, '=');
baaa35ad
ZJS
1470 if (!eq)
1471 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1472 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1473
1474 name = strndup(optarg, eq - optarg);
1475 if (!name)
1476 return log_oom();
1477
1478 rl = rlimit_from_string_harder(name);
baaa35ad 1479 if (rl < 0)
7211c853 1480 return log_error_errno(rl, "Unknown resource limit: %s", name);
bf428efb
LP
1481
1482 if (!arg_rlimit[rl]) {
1483 arg_rlimit[rl] = new0(struct rlimit, 1);
1484 if (!arg_rlimit[rl])
1485 return log_oom();
1486 }
1487
1488 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1489 if (r < 0)
1490 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1491
1492 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1493 break;
1494 }
1495
81f345df
LP
1496 case ARG_OOM_SCORE_ADJUST:
1497 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1498 if (r < 0)
1499 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1500
1501 arg_oom_score_adjust_set = true;
1502 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1503 break;
1504
d107bb7d 1505 case ARG_CPU_AFFINITY: {
0985c7c4 1506 CPUSet cpuset;
d107bb7d
LP
1507
1508 r = parse_cpu_set(optarg, &cpuset);
1509 if (r < 0)
0985c7c4 1510 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
d107bb7d 1511
0985c7c4
ZJS
1512 cpu_set_reset(&arg_cpu_set);
1513 arg_cpu_set = cpuset;
d107bb7d
LP
1514 arg_settings_mask |= SETTING_CPU_AFFINITY;
1515 break;
1516 }
1517
09d423e9
LP
1518 case ARG_RESOLV_CONF:
1519 if (streq(optarg, "help")) {
1520 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1521 return 0;
1522 }
1523
1524 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad 1525 if (arg_resolv_conf < 0)
7211c853 1526 return log_error_errno(arg_resolv_conf,
baaa35ad 1527 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1528
1529 arg_settings_mask |= SETTING_RESOLV_CONF;
1530 break;
1531
1688841f
LP
1532 case ARG_TIMEZONE:
1533 if (streq(optarg, "help")) {
1534 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1535 return 0;
1536 }
1537
1538 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad 1539 if (arg_timezone < 0)
7211c853 1540 return log_error_errno(arg_timezone,
baaa35ad 1541 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1542
1543 arg_settings_mask |= SETTING_TIMEZONE;
1544 break;
1545
de40a303 1546 case ARG_CONSOLE:
dce66ffe
ZJS
1547 r = handle_arg_console(optarg);
1548 if (r <= 0)
1549 return r;
de40a303
LP
1550 break;
1551
1552 case 'P':
1553 case ARG_PIPE:
dce66ffe
ZJS
1554 r = handle_arg_console("pipe");
1555 if (r <= 0)
1556 return r;
de40a303
LP
1557 break;
1558
bb068de0
ZJS
1559 case ARG_NO_PAGER:
1560 arg_pager_flags |= PAGER_DISABLE;
1561 break;
1562
3652872a
LP
1563 case ARG_SET_CREDENTIAL: {
1564 _cleanup_free_ char *word = NULL, *data = NULL;
1565 const char *p = optarg;
1566 Credential *a;
1567 size_t i;
1568 int l;
1569
1570 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1571 if (r == -ENOMEM)
1572 return log_oom();
1573 if (r < 0)
1574 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1575 if (r == 0 || !p)
1576 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1577
1578 if (!credential_name_valid(word))
1579 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1580
1581 for (i = 0; i < arg_n_credentials; i++)
1582 if (streq(arg_credentials[i].id, word))
1583 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1584
1585 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1586 if (l < 0)
1587 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1588
1589 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1590 if (!a)
1591 return log_oom();
1592
1593 a[arg_n_credentials++] = (Credential) {
1594 .id = TAKE_PTR(word),
1595 .data = TAKE_PTR(data),
1596 .size = l,
1597 };
1598
1599 arg_credentials = a;
1600
1601 arg_settings_mask |= SETTING_CREDENTIALS;
1602 break;
1603 }
1604
1605 case ARG_LOAD_CREDENTIAL: {
1606 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1607 _cleanup_(erase_and_freep) char *data = NULL;
1608 _cleanup_free_ char *word = NULL, *j = NULL;
1609 const char *p = optarg;
1610 Credential *a;
1611 size_t size, i;
1612
1613 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1614 if (r == -ENOMEM)
1615 return log_oom();
1616 if (r < 0)
1617 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1618 if (r == 0 || !p)
1619 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1620
1621 if (!credential_name_valid(word))
1622 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1623
1624 for (i = 0; i < arg_n_credentials; i++)
1625 if (streq(arg_credentials[i].id, word))
1626 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1627
1628 if (path_is_absolute(p))
1629 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1630 else {
1631 const char *e;
1632
786d19fd
LP
1633 r = get_credentials_dir(&e);
1634 if (r < 0)
1635 return log_error_errno(r, "Credential not available (no credentials passed at all): %s", word);
3652872a
LP
1636
1637 j = path_join(e, p);
1638 if (!j)
1639 return log_oom();
1640 }
1641
986311c2
LP
1642 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1643 flags,
1644 NULL,
1645 &data, &size);
3652872a
LP
1646 if (r < 0)
1647 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1648
1649 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1650 if (!a)
1651 return log_oom();
1652
1653 a[arg_n_credentials++] = (Credential) {
1654 .id = TAKE_PTR(word),
1655 .data = TAKE_PTR(data),
1656 .size = size,
1657 };
1658
1659 arg_credentials = a;
1660
1661 arg_settings_mask |= SETTING_CREDENTIALS;
1662 break;
1663 }
1664
2f893044
LP
1665 case ARG_BIND_USER:
1666 if (!valid_user_group_name(optarg, 0))
1667 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1668
1669 if (strv_extend(&arg_bind_user, optarg) < 0)
1670 return log_oom();
1671
1672 arg_settings_mask |= SETTING_BIND_USER;
1673 break;
1674
88213476
LP
1675 case '?':
1676 return -EINVAL;
1677
1678 default:
eb9da376 1679 assert_not_reached("Unhandled option");
88213476 1680 }
88213476 1681
60f1ec13
LP
1682 if (argc > optind) {
1683 strv_free(arg_parameters);
1684 arg_parameters = strv_copy(argv + optind);
1685 if (!arg_parameters)
1686 return log_oom();
d7bea6b6 1687
60f1ec13
LP
1688 arg_settings_mask |= SETTING_START_MODE;
1689 }
1690
1691 if (arg_ephemeral && arg_template && !arg_directory)
1692 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1693 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1694 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1695 * --directory=". */
1696 arg_directory = TAKE_PTR(arg_template);
1697
bd4b15f2 1698 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
60f1ec13 1699
de40a303 1700 /* Make sure to parse environment before we reset the settings mask below */
49048684
ZJS
1701 r = parse_environment();
1702 if (r < 0)
1703 return r;
de40a303 1704
60f1ec13
LP
1705 /* Load all settings from .nspawn files */
1706 if (mask_no_settings)
1707 arg_settings_mask = 0;
1708
1709 /* Don't load any settings from .nspawn files */
1710 if (mask_all_settings)
1711 arg_settings_mask = _SETTINGS_MASK_ALL;
1712
1713 return 1;
1714}
1715
1716static int verify_arguments(void) {
1717 int r;
a6b5216c 1718
75b0d8b8
ZJS
1719 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1720 /* If we are running the stub init in the container, we don't need to look at what the init
1721 * in the container supports, because we are not using it. Let's immediately pick the right
1722 * setting based on the host system configuration.
1723 *
1724 * We only do this, if the user didn't use an environment variable to override the detection.
1725 */
1726
1727 r = cg_all_unified();
1728 if (r < 0)
1729 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1730 if (r > 0)
1731 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1732 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1733 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1734 else
1735 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1736 }
1737
4f086aab
SU
1738 if (arg_userns_mode != USER_NAMESPACE_NO)
1739 arg_mount_settings |= MOUNT_USE_USERNS;
1740
1741 if (arg_private_network)
1742 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1743
48a8d337
LB
1744 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1745 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1746 arg_register = false;
baaa35ad 1747 if (arg_start_mode != START_PID1)
60f1ec13 1748 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1749 }
eb91eb18 1750
6c045a99
LP
1751 if (arg_userns_ownership < 0)
1752 arg_userns_ownership =
f61c7f88 1753 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
6c045a99 1754 USER_NAMESPACE_OWNERSHIP_OFF;
0e7ac751 1755
60f1ec13
LP
1756 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1757 arg_kill_signal = SIGRTMIN+3;
1758
e5a4bb0d
LP
1759 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1760 arg_read_only = true;
1761
2436ea76
DDM
1762 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1763 arg_read_only = true;
1764
baaa35ad 1765 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1766 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1767 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1768 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1769
baaa35ad 1770 if (arg_directory && arg_image)
60f1ec13 1771 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1772
baaa35ad 1773 if (arg_template && arg_image)
60f1ec13 1774 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1775
baaa35ad 1776 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1777 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1778
baaa35ad 1779 if (arg_ephemeral && arg_template)
60f1ec13 1780 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1781
baaa35ad 1782 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1783 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1784
baaa35ad 1785 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1786 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1787
6c045a99 1788 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
de40a303 1789 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
6c045a99 1790 "--read-only and --private-users-ownership=chown may not be combined.");
f757855e 1791
6c045a99
LP
1792 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1793 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1794 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1795 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1796 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
e5a4bb0d 1797
679ecd36
SZ
1798 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1799 * we need to error out, to avoid conflicts between different network options. */
60f1ec13
LP
1800 if (arg_network_namespace_path &&
1801 (arg_network_interfaces || arg_network_macvlan ||
1802 arg_network_ipvlan || arg_network_veth_extra ||
1803 arg_network_bridge || arg_network_zone ||
679ecd36 1804 arg_network_veth))
de40a303 1805 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1806
60f1ec13 1807 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1808 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1809 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1810
baaa35ad 1811 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1812 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1813
baaa35ad 1814 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1815 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1816
baaa35ad 1817 if (arg_expose_ports && !arg_private_network)
60f1ec13 1818 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1819
88fc9c9b 1820 if (arg_caps_ambient) {
f5fbe71d 1821 if (arg_caps_ambient == UINT64_MAX)
88fc9c9b
TH
1822 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1823
1824 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1825 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1826
1827 if (arg_start_mode == START_BOOT)
1828 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1829 }
1830
2f893044
LP
1831 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1832 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1833
1834 /* Drop duplicate --bind-user= entries */
1835 strv_uniq(arg_bind_user);
1836
60f1ec13
LP
1837 r = custom_mount_check_all();
1838 if (r < 0)
1839 return r;
c6c8f6e2 1840
f757855e 1841 return 0;
88213476
LP
1842}
1843
91181e07 1844int userns_lchown(const char *p, uid_t uid, gid_t gid) {
03cfe0d5
LP
1845 assert(p);
1846
0de7acce 1847 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1848 return 0;
1849
1850 if (uid == UID_INVALID && gid == GID_INVALID)
1851 return 0;
1852
1853 if (uid != UID_INVALID) {
1854 uid += arg_uid_shift;
1855
1856 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1857 return -EOVERFLOW;
1858 }
1859
1860 if (gid != GID_INVALID) {
1861 gid += (gid_t) arg_uid_shift;
1862
1863 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1864 return -EOVERFLOW;
1865 }
1866
1867 if (lchown(p, uid, gid) < 0)
1868 return -errno;
b12afc8c
LP
1869
1870 return 0;
1871}
1872
/* Creates directory 'path' below 'root' with the given mode and hands it to the (shifted) uid/gid via
 * userns_lchown(). A directory that already exists is left untouched, including its ownership, and
 * counts as success. Returns 0 on success, negative errno-style value otherwise. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = mkdir_errno_wrapper(full, mode);
        if (r < 0)
                return r == -EEXIST ? 0 : r;

        return userns_lchown(full, uid, gid);
}
1886
/* Matches 'path' against the two accepted zoneinfo prefixes (relative "../usr/share/zoneinfo/" and
 * absolute "/usr/share/zoneinfo/") and returns whatever PATH_STARTSWITH_SET() yields for it. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path, "../usr/share/zoneinfo/", "/usr/share/zoneinfo/");
        return zone;
}
1893
83205269
LP
1894static bool etc_writable(void) {
1895 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1896}
1897
e58a1277 1898static int setup_timezone(const char *dest) {
1688841f
LP
1899 _cleanup_free_ char *p = NULL, *etc = NULL;
1900 const char *where, *check;
1901 TimezoneMode m;
d4036145 1902 int r;
f8440af5 1903
e58a1277
LP
1904 assert(dest);
1905
1688841f 1906 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1907 r = readlink_malloc("/etc/localtime", &p);
1908 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1909 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1910 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1911 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1912 else if (r < 0) {
1913 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1914 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1915 * file.
1916 *
1917 * Example:
1918 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1919 */
1920 return 0;
1921 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1922 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1923 else
1924 m = arg_timezone;
1925 } else
1926 m = arg_timezone;
1927
1928 if (m == TIMEZONE_OFF)
1929 return 0;
1930
a5648b80 1931 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
d4036145 1932 if (r < 0) {
1688841f 1933 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1934 return 0;
1935 }
1936
1688841f
LP
1937 where = strjoina(etc, "/localtime");
1938
1939 switch (m) {
1940
1941 case TIMEZONE_DELETE:
1942 if (unlink(where) < 0)
1943 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1944
d4036145 1945 return 0;
d4036145 1946
1688841f
LP
1947 case TIMEZONE_SYMLINK: {
1948 _cleanup_free_ char *q = NULL;
1949 const char *z, *what;
4d1c38b8 1950
1688841f
LP
1951 z = timezone_from_path(p);
1952 if (!z) {
1953 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1954 return 0;
1688841f 1955 }
d4036145 1956
1688841f
LP
1957 r = readlink_malloc(where, &q);
1958 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1959 return 0; /* Already pointing to the right place? Then do nothing .. */
1960
1961 check = strjoina(dest, "/usr/share/zoneinfo/", z);
a5648b80 1962 r = chase_symlinks(check, dest, 0, NULL, NULL);
1688841f
LP
1963 if (r < 0)
1964 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1965 else {
1966 if (unlink(where) < 0 && errno != ENOENT) {
1967 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1968 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1969 return 0;
1970 }
1971
1972 what = strjoina("../usr/share/zoneinfo/", z);
1973 if (symlink(what, where) < 0) {
1974 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1975 errno, "Failed to correct timezone of container, ignoring: %m");
1976 return 0;
1977 }
1978
1979 break;
1980 }
1981
1982 _fallthrough_;
d4036145 1983 }
68fb0892 1984
1688841f
LP
1985 case TIMEZONE_BIND: {
1986 _cleanup_free_ char *resolved = NULL;
1987 int found;
1988
a5648b80 1989 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1688841f
LP
1990 if (found < 0) {
1991 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1992 return 0;
1993 }
1994
1995 if (found == 0) /* missing? */
1996 (void) touch(resolved);
1997
511a8cfe 1998 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1688841f 1999 if (r >= 0)
511a8cfe 2000 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1688841f
LP
2001
2002 _fallthrough_;
79d80fc1 2003 }
4d9f07b4 2004
1688841f
LP
2005 case TIMEZONE_COPY:
2006 /* If mounting failed, try to copy */
8a016c74 2007 r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
1688841f
LP
2008 if (r < 0) {
2009 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2010 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
2011 return 0;
2012 }
2013
2014 break;
2015
2016 default:
2017 assert_not_reached("unexpected mode");
d4036145 2018 }
e58a1277 2019
1688841f 2020 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
2021 r = userns_lchown(where, 0, 0);
2022 if (r < 0)
1688841f 2023 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 2024
e58a1277 2025 return 0;
88213476
LP
2026}
2027
09d423e9
LP
/* Checks via access(F_OK) whether 'path' exists. Returns 1 if it does, 0 if it is missing (ENOENT),
 * and a negative errno-style value (logged at debug level) on any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2040
7357272e 2041static int resolved_listening(void) {
b8ea7a6e 2042 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 2043 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 2044 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
2045 int r;
2046
7357272e 2047 /* Check if resolved is listening */
b053cd5f
LP
2048
2049 r = sd_bus_open_system(&bus);
2050 if (r < 0)
b8ea7a6e 2051 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 2052
7357272e 2053 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
2054 if (r < 0)
2055 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2056 if (r == 0)
2057 return 0;
7357272e
DM
2058
2059 r = sd_bus_get_property_string(bus,
2060 "org.freedesktop.resolve1",
2061 "/org/freedesktop/resolve1",
2062 "org.freedesktop.resolve1.Manager",
2063 "DNSStubListener",
b8ea7a6e 2064 &error,
7357272e
DM
2065 &dns_stub_listener_mode);
2066 if (r < 0)
b8ea7a6e 2067 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
2068
2069 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
2070}
2071
2547bb41 2072static int setup_resolv_conf(const char *dest) {
09d423e9
LP
2073 _cleanup_free_ char *etc = NULL;
2074 const char *where, *what;
2075 ResolvConfMode m;
2076 int r;
2547bb41
LP
2077
2078 assert(dest);
2079
09d423e9
LP
2080 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2081 if (arg_private_network)
2082 m = RESOLV_CONF_OFF;
86775e35
LP
2083 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2084 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
09d423e9 2085 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 2086 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 2087 else
83205269 2088 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
86775e35 2089
09d423e9
LP
2090 } else
2091 m = arg_resolv_conf;
2092
2093 if (m == RESOLV_CONF_OFF)
2547bb41
LP
2094 return 0;
2095
a5648b80 2096 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
87447ae4
LP
2097 if (r < 0) {
2098 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2099 return 0;
2100 }
2101
2102 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
2103
2104 if (m == RESOLV_CONF_DELETE) {
2105 if (unlink(where) < 0)
2106 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2107
87447ae4
LP
2108 return 0;
2109 }
79d80fc1 2110
86775e35
LP
2111 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2112 what = PRIVATE_STATIC_RESOLV_CONF;
2113 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2114 what = PRIVATE_UPLINK_RESOLV_CONF;
2115 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2116 what = PRIVATE_STUB_RESOLV_CONF;
09d423e9
LP
2117 else
2118 what = "/etc/resolv.conf";
87447ae4 2119
86775e35 2120 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
09d423e9
LP
2121 _cleanup_free_ char *resolved = NULL;
2122 int found;
2123
a5648b80 2124 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
09d423e9
LP
2125 if (found < 0) {
2126 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2127 return 0;
2128 }
3539724c 2129
87447ae4
LP
2130 if (found == 0) /* missing? */
2131 (void) touch(resolved);
5367354d 2132
511a8cfe 2133 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 2134 if (r >= 0)
511a8cfe 2135 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
86775e35
LP
2136
2137 /* If that didn't work, let's copy the file */
3539724c
LP
2138 }
2139
86775e35
LP
2140 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2141 r = copy_file_atomic(what, where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
2142 else
2143 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
79d80fc1 2144 if (r < 0) {
3539724c
LP
2145 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2146 * resolved or something similar runs inside and the symlink points there.
68a313c5 2147 *
3539724c 2148 * If the disk image is read-only, there's also no point in complaining.
68a313c5 2149 */
86775e35
LP
2150 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2151 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 2152 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
2153 return 0;
2154 }
2547bb41 2155
03cfe0d5
LP
2156 r = userns_lchown(where, 0, 0);
2157 if (r < 0)
3539724c 2158 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 2159
2547bb41
LP
2160 return 0;
2161}
2162
1e4f1671 2163static int setup_boot_id(void) {
cdde6ba6
LP
2164 _cleanup_(unlink_and_freep) char *from = NULL;
2165 _cleanup_free_ char *path = NULL;
3bbaff3e 2166 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 2167 const char *to;
04bc4a3f
LP
2168 int r;
2169
1eacc470 2170 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
04bc4a3f 2171
1eacc470 2172 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
cdde6ba6
LP
2173 if (r < 0)
2174 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
2175
2176 r = sd_id128_randomize(&rnd);
f647962d
MS
2177 if (r < 0)
2178 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 2179
cdde6ba6 2180 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
2181 if (r < 0)
2182 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 2183
cdde6ba6
LP
2184 from = TAKE_PTR(path);
2185 to = "/proc/sys/kernel/random/boot_id";
2186
511a8cfe 2187 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
2188 if (r < 0)
2189 return r;
04bc4a3f 2190
511a8cfe 2191 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
2192}
2193
e58a1277 2194static int copy_devnodes(const char *dest) {
88213476
LP
2195 static const char devnodes[] =
2196 "null\0"
2197 "zero\0"
2198 "full\0"
2199 "random\0"
2200 "urandom\0"
85614d66
TG
2201 "tty\0"
2202 "net/tun\0";
88213476 2203
de40a303 2204 _cleanup_umask_ mode_t u;
88213476 2205 const char *d;
e58a1277 2206 int r = 0;
a258bf26
LP
2207
2208 assert(dest);
124640f1
LP
2209
2210 u = umask(0000);
88213476 2211
03cfe0d5
LP
2212 /* Create /dev/net, so that we can create /dev/net/tun in it */
2213 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2214 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2215
88213476 2216 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 2217 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 2218 struct stat st;
88213476 2219
c6134d3e 2220 from = path_join("/dev/", d);
8967f291
LP
2221 if (!from)
2222 return log_oom();
2223
c6134d3e 2224 to = path_join(dest, from);
8967f291
LP
2225 if (!to)
2226 return log_oom();
88213476
LP
2227
2228 if (stat(from, &st) < 0) {
2229
4a62c710
MS
2230 if (errno != ENOENT)
2231 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 2232
baaa35ad
ZJS
2233 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2234 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2235 "%s is not a char or block device, cannot copy.", from);
2236 else {
8dfce114
LP
2237 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2238
81f5049b 2239 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 2240 /* Explicitly warn the user when /dev is already populated. */
41eb4362 2241 if (errno == EEXIST)
8dbf71ec 2242 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
2243 if (errno != EPERM)
2244 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2245
8dfce114 2246 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
2247 r = touch(to);
2248 if (r < 0)
2249 return log_error_errno(r, "touch (%s) failed: %m", to);
511a8cfe 2250 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
60e76d48
ZJS
2251 if (r < 0)
2252 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 2253 }
6278cf60 2254
03cfe0d5
LP
2255 r = userns_lchown(to, 0, 0);
2256 if (r < 0)
2257 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114 2258
657ee2d8 2259 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
8dfce114
LP
2260 if (!dn)
2261 return log_oom();
2262
2263 r = userns_mkdir(dest, dn, 0755, 0, 0);
2264 if (r < 0)
2265 return log_error_errno(r, "Failed to create '%s': %m", dn);
2266
2267 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2268 return log_oom();
2269
c6134d3e 2270 prefixed = path_join(dest, sl);
8dfce114
LP
2271 if (!prefixed)
2272 return log_oom();
2273
2d9b74ba 2274 t = path_join("..", d);
8dfce114
LP
2275 if (!t)
2276 return log_oom();
2277
2278 if (symlink(t, prefixed) < 0)
2279 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 2280 }
88213476
LP
2281 }
2282
e58a1277
LP
2283 return r;
2284}
88213476 2285
de40a303
LP
2286static int make_extra_nodes(const char *dest) {
2287 _cleanup_umask_ mode_t u;
2288 size_t i;
2289 int r;
2290
2291 u = umask(0000);
2292
2293 for (i = 0; i < arg_n_extra_nodes; i++) {
2294 _cleanup_free_ char *path = NULL;
2295 DeviceNode *n = arg_extra_nodes + i;
2296
c6134d3e 2297 path = path_join(dest, n->path);
de40a303
LP
2298 if (!path)
2299 return log_oom();
2300
2301 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2302 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2303
2304 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2305 if (r < 0)
2306 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2307 }
2308
2309 return 0;
2310}
2311
03cfe0d5
LP
2312static int setup_pts(const char *dest) {
2313 _cleanup_free_ char *options = NULL;
2314 const char *p;
709f6e46 2315 int r;
03cfe0d5 2316
349cc4a5 2317#if HAVE_SELINUX
03cfe0d5
LP
2318 if (arg_selinux_apifs_context)
2319 (void) asprintf(&options,
3dce8915 2320 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2321 arg_uid_shift + TTY_GID,
2322 arg_selinux_apifs_context);
2323 else
2324#endif
2325 (void) asprintf(&options,
3dce8915 2326 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2327 arg_uid_shift + TTY_GID);
f2d88580 2328
03cfe0d5 2329 if (!options)
f2d88580
LP
2330 return log_oom();
2331
03cfe0d5 2332 /* Mount /dev/pts itself */
cc9fce65 2333 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
2334 r = mkdir_errno_wrapper(p, 0755);
2335 if (r < 0)
2336 return log_error_errno(r, "Failed to create /dev/pts: %m");
2337
511a8cfe 2338 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
60e76d48
ZJS
2339 if (r < 0)
2340 return r;
709f6e46
MS
2341 r = userns_lchown(p, 0, 0);
2342 if (r < 0)
2343 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2344
2345 /* Create /dev/ptmx symlink */
2346 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2347 if (symlink("pts/ptmx", p) < 0)
2348 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2349 r = userns_lchown(p, 0, 0);
2350 if (r < 0)
2351 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2352
03cfe0d5
LP
2353 /* And fix /dev/pts/ptmx ownership */
2354 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2355 r = userns_lchown(p, 0, 0);
2356 if (r < 0)
2357 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2358
f2d88580
LP
2359 return 0;
2360}
2361
3acc84eb 2362static int setup_stdio_as_dev_console(void) {
2fef50cd 2363 _cleanup_close_ int terminal = -1;
e58a1277 2364 int r;
e58a1277 2365
335d2ead
LP
2366 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2367 * explicitly, if we are configured to. */
2368 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
3acc84eb
FB
2369 if (terminal < 0)
2370 return log_error_errno(terminal, "Failed to open console: %m");
e58a1277 2371
3acc84eb
FB
2372 /* Make sure we can continue logging to the original stderr, even if
2373 * stderr points elsewhere now */
2374 r = log_dup_console();
2375 if (r < 0)
2376 return log_error_errno(r, "Failed to duplicate stderr: %m");
de40a303 2377
3acc84eb
FB
2378 /* invalidates 'terminal' on success and failure */
2379 r = rearrange_stdio(terminal, terminal, terminal);
2fef50cd 2380 TAKE_FD(terminal);
f647962d 2381 if (r < 0)
3acc84eb
FB
2382 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2383
2384 return 0;
2385}
88213476 2386
3acc84eb
FB
2387static int setup_dev_console(const char *console) {
2388 _cleanup_free_ char *p = NULL;
2389 int r;
a258bf26 2390
3acc84eb
FB
2391 /* Create /dev/console symlink */
2392 r = path_make_relative("/dev", console, &p);
81f5049b 2393 if (r < 0)
3acc84eb
FB
2394 return log_error_errno(r, "Failed to create relative path: %m");
2395
2396 if (symlink(p, "/dev/console") < 0)
2397 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
a258bf26 2398
3acc84eb 2399 return 0;
e58a1277
LP
2400}
2401
8e5430c4
LP
2402static int setup_keyring(void) {
2403 key_serial_t keyring;
2404
6b000af4
LP
2405 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2406 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2407 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2408 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2409 * into the container. */
8e5430c4
LP
2410
2411 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2412 if (keyring == -1) {
2413 if (errno == ENOSYS)
2414 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
065b4774 2415 else if (ERRNO_IS_PRIVILEGE(errno))
8e5430c4
LP
2416 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2417 else
2418 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2419 }
2420
2421 return 0;
2422}
2423
3652872a
LP
2424static int setup_credentials(const char *root) {
2425 const char *q;
2426 int r;
2427
2428 if (arg_n_credentials <= 0)
2429 return 0;
2430
2431 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2432 if (r < 0)
2433 return log_error_errno(r, "Failed to create /run/host: %m");
2434
2435 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2436 if (r < 0)
2437 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2438
2439 q = prefix_roota(root, "/run/host/credentials");
511a8cfe 2440 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
3652872a
LP
2441 if (r < 0)
2442 return r;
2443
2444 for (size_t i = 0; i < arg_n_credentials; i++) {
2445 _cleanup_free_ char *j = NULL;
2446 _cleanup_close_ int fd = -1;
2447
2448 j = path_join(q, arg_credentials[i].id);
2449 if (!j)
2450 return log_oom();
2451
2452 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2453 if (fd < 0)
2454 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2455
2456 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size, /* do_poll= */ false);
2457 if (r < 0)
2458 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2459
2460 if (fchmod(fd, 0400) < 0)
2461 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2462
2463 if (arg_userns_mode != USER_NAMESPACE_NO) {
2464 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2465 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2466 }
2467 }
2468
2469 if (chmod(q, 0500) < 0)
2470 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2471
2472 r = userns_lchown(q, 0, 0);
2473 if (r < 0)
2474 return r;
2475
2476 /* Make both mount and superblock read-only now */
511a8cfe 2477 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
3652872a
LP
2478 if (r < 0)
2479 return r;
2480
511a8cfe 2481 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
3652872a
LP
2482}
2483
1e4f1671 2484static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
2485 _cleanup_(unlink_and_freep) char *from = NULL;
2486 _cleanup_free_ char *fifo = NULL;
2487 _cleanup_close_ int fd = -1;
7fd1b19b 2488 _cleanup_umask_ mode_t u;
9ec5a93c 2489 int r;
e58a1277 2490
e58a1277 2491 assert(kmsg_socket >= 0);
a258bf26 2492
e58a1277 2493 u = umask(0000);
a258bf26 2494
1eacc470 2495 /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
9ec5a93c
LP
2496 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2497 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2498 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2499
1eacc470 2500 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
9ec5a93c
LP
2501 if (r < 0)
2502 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2503
9ec5a93c 2504 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2505 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2506
2507 from = TAKE_PTR(fifo);
9ec5a93c 2508
511a8cfe 2509 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
60e76d48
ZJS
2510 if (r < 0)
2511 return r;
e58a1277 2512
669fc4e5 2513 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2514 if (fd < 0)
2515 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2516
9ec5a93c 2517 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 2518 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
2519 if (r < 0)
2520 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2521
25ea79fe 2522 return 0;
88213476
LP
2523}
2524
761cf19d 2525struct ExposeArgs {
deff68e7
FW
2526 union in_addr_union address4;
2527 union in_addr_union address6;
761cf19d
FW
2528 struct FirewallContext *fw_ctx;
2529};
2530
1c4baffc 2531static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
761cf19d 2532 struct ExposeArgs *args = userdata;
6d0b55c2
LP
2533
2534 assert(rtnl);
2535 assert(m);
761cf19d 2536 assert(args);
6d0b55c2 2537
deff68e7
FW
2538 expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2539 expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
6d0b55c2
LP
2540 return 0;
2541}
2542
3a74cea5 2543static int setup_hostname(void) {
c818eef1 2544 int r;
3a74cea5 2545
0c582db0 2546 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2547 return 0;
2548
c818eef1
LP
2549 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2550 if (r < 0)
2551 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2552
7027ff61 2553 return 0;
3a74cea5
LP
2554}
2555
57fb9fb5 2556static int setup_journal(const char *directory) {
0f5e1382 2557 _cleanup_free_ char *d = NULL;
5905d7cf 2558 char id[SD_ID128_STRING_MAX];
b2238e38
LP
2559 const char *dirname, *p, *q;
2560 sd_id128_t this_id;
8054d749 2561 bool try;
57fb9fb5
LP
2562 int r;
2563
df9a75e4
LP
2564 /* Don't link journals in ephemeral mode */
2565 if (arg_ephemeral)
2566 return 0;
2567
8054d749
LP
2568 if (arg_link_journal == LINK_NO)
2569 return 0;
2570
2571 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2572
4d680aee 2573 r = sd_id128_get_machine(&this_id);
f647962d
MS
2574 if (r < 0)
2575 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2576
e01ff70a 2577 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2578 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 2579 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 2580 if (try)
4d680aee 2581 return 0;
df9a75e4 2582 return -EEXIST;
4d680aee
ZJS
2583 }
2584
369ca6da
ZJS
2585 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2586 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2587 if (r < 0) {
2588 bool ignore = r == -EROFS && try;
2589 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2590 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2591 return ignore ? 0 : r;
2592 }
2593 }
03cfe0d5 2594
e01ff70a
MS
2595 (void) sd_id128_to_string(arg_uuid, id);
2596
03cfe0d5
LP
2597 p = strjoina("/var/log/journal/", id);
2598 q = prefix_roota(directory, p);
27407a01 2599
e1873695 2600 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2601 if (try)
2602 return 0;
27407a01 2603
baaa35ad
ZJS
2604 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2605 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2606 }
2607
e1873695 2608 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2609 if (try)
2610 return 0;
57fb9fb5 2611
baaa35ad
ZJS
2612 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2613 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2614 }
2615
2616 r = readlink_and_make_absolute(p, &d);
2617 if (r >= 0) {
3742095b 2618 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2619 path_equal(d, q)) {
2620
03cfe0d5 2621 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2622 if (r < 0)
709f6e46 2623 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2624 return 0;
57fb9fb5
LP
2625 }
2626
4a62c710
MS
2627 if (unlink(p) < 0)
2628 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2629 } else if (r == -EINVAL) {
2630
2631 if (arg_link_journal == LINK_GUEST &&
2632 rmdir(p) < 0) {
2633
27407a01
ZJS
2634 if (errno == ENOTDIR) {
2635 log_error("%s already exists and is neither a symlink nor a directory", p);
2636 return r;
4314d33f
MS
2637 } else
2638 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2639 }
4314d33f
MS
2640 } else if (r != -ENOENT)
2641 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2642
2643 if (arg_link_journal == LINK_GUEST) {
2644
2645 if (symlink(q, p) < 0) {
8054d749 2646 if (try) {
56f64d95 2647 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2648 return 0;
4314d33f
MS
2649 } else
2650 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2651 }
2652
03cfe0d5 2653 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2654 if (r < 0)
709f6e46 2655 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2656 return 0;
57fb9fb5
LP
2657 }
2658
2659 if (arg_link_journal == LINK_HOST) {
ccddd104 2660 /* don't create parents here — if the host doesn't have
574edc90 2661 * permanent journal set up, don't force it here */
ba8e6c4d 2662
dae8b82e
ZJS
2663 r = mkdir_errno_wrapper(p, 0755);
2664 if (r < 0 && r != -EEXIST) {
8054d749 2665 if (try) {
dae8b82e 2666 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2667 return 0;
4314d33f 2668 } else
dae8b82e 2669 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2670 }
2671
27407a01
ZJS
2672 } else if (access(p, F_OK) < 0)
2673 return 0;
57fb9fb5 2674
cdb2b9d0
LP
2675 if (dir_is_empty(q) == 0)
2676 log_warning("%s is not empty, proceeding anyway.", q);
2677
03cfe0d5 2678 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2679 if (r < 0)
2680 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2681
511a8cfe 2682 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
60e76d48 2683 if (r < 0)
4a62c710 2684 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2685
27407a01 2686 return 0;
57fb9fb5
LP
2687}
2688
de40a303
LP
2689static int drop_capabilities(uid_t uid) {
2690 CapabilityQuintet q;
2691
2692 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2693 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2694 * arg_caps_retain. */
2695
2696 if (capability_quintet_is_set(&arg_full_capabilities)) {
2697 q = arg_full_capabilities;
2698
f5fbe71d 2699 if (q.bounding == UINT64_MAX)
de40a303
LP
2700 q.bounding = uid == 0 ? arg_caps_retain : 0;
2701
f5fbe71d 2702 if (q.effective == UINT64_MAX)
de40a303
LP
2703 q.effective = uid == 0 ? q.bounding : 0;
2704
f5fbe71d 2705 if (q.inheritable == UINT64_MAX)
88fc9c9b 2706 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2707
f5fbe71d 2708 if (q.permitted == UINT64_MAX)
88fc9c9b 2709 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
de40a303 2710
f5fbe71d 2711 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
88fc9c9b 2712 q.ambient = arg_caps_ambient;
f66ad460
AZ
2713
2714 if (capability_quintet_mangle(&q))
2715 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2716
2717 } else {
de40a303
LP
2718 q = (CapabilityQuintet) {
2719 .bounding = arg_caps_retain,
2720 .effective = uid == 0 ? arg_caps_retain : 0,
88fc9c9b
TH
2721 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2722 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
f5fbe71d 2723 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
de40a303
LP
2724 };
2725
f66ad460
AZ
2726 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2727 * in order to maintain the same behavior as systemd < 242. */
2728 if (capability_quintet_mangle(&q))
0ccdaa79
JT
2729 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2730 "Some capabilities will not be set because they are not in the current bounding set.");
f66ad460
AZ
2731
2732 }
2733
de40a303 2734 return capability_quintet_enforce(&q);
88213476
LP
2735}
2736
db999e0f
LP
2737static int reset_audit_loginuid(void) {
2738 _cleanup_free_ char *p = NULL;
2739 int r;
2740
0c582db0 2741 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2742 return 0;
2743
2744 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2745 if (r == -ENOENT)
db999e0f 2746 return 0;
f647962d
MS
2747 if (r < 0)
2748 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2749
2750 /* Already reset? */
2751 if (streq(p, "4294967295"))
2752 return 0;
2753
57512c89 2754 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2755 if (r < 0) {
10a87006
LP
2756 log_error_errno(r,
2757 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2758 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2759 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2760 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2761 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2762
db999e0f 2763 sleep(5);
77b6e194 2764 }
db999e0f
LP
2765
2766 return 0;
77b6e194
LP
2767}
2768
785890ac
LP
2769static int setup_propagate(const char *root) {
2770 const char *p, *q;
709f6e46 2771 int r;
785890ac
LP
2772
2773 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2774 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2775 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2776 (void) mkdir_p(p, 0600);
2777
5a27b395 2778 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
709f6e46 2779 if (r < 0)
5a27b395 2780 return log_error_errno(r, "Failed to create /run/host: %m");
03cfe0d5 2781
5a27b395 2782 r = userns_mkdir(root, "/run/host/incoming", 0600, 0, 0);
709f6e46 2783 if (r < 0)
5a27b395 2784 return log_error_errno(r, "Failed to create /run/host/incoming: %m");
03cfe0d5 2785
5a27b395 2786 q = prefix_roota(root, "/run/host/incoming");
511a8cfe 2787 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
60e76d48
ZJS
2788 if (r < 0)
2789 return r;
785890ac 2790
511a8cfe 2791 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
60e76d48
ZJS
2792 if (r < 0)
2793 return r;
785890ac 2794
5a27b395 2795 /* machined will MS_MOVE into that directory, and that's only supported for non-shared mounts. */
511a8cfe 2796 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2797}
2798
317feb4d 2799static int setup_machine_id(const char *directory) {
691675ba
LP
2800 const char *etc_machine_id;
2801 sd_id128_t id;
3bbaff3e 2802 int r;
e01ff70a 2803
317feb4d
LP
2804 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2805 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2806 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2807 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2808 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2809 * container behaves nicely). */
2810
e01ff70a
MS
2811 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2812
c5fbeedb 2813 r = id128_read(etc_machine_id, ID128_PLAIN_OR_UNINIT, &id);
317feb4d
LP
2814 if (r < 0) {
2815 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2816 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2817
317feb4d
LP
2818 if (sd_id128_is_null(arg_uuid)) {
2819 r = sd_id128_randomize(&arg_uuid);
2820 if (r < 0)
2821 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2822 }
2823 } else {
baaa35ad
ZJS
2824 if (sd_id128_is_null(id))
2825 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2826 "Machine ID in container image is zero, refusing.");
e01ff70a 2827
317feb4d
LP
2828 arg_uuid = id;
2829 }
691675ba 2830
e01ff70a
MS
2831 return 0;
2832}
2833
7336138e
LP
2834static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2835 int r;
2836
2837 assert(directory);
2838
6c045a99 2839 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
7336138e
LP
2840 return 0;
2841
2842 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2843 if (r == -EOPNOTSUPP)
2844 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2845 if (r == -EBADE)
2846 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2847 if (r < 0)
2848 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2849 if (r == 0)
2850 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2851 else
2852 log_debug("Patched directory tree to match UID/GID range.");
2853
2854 return r;
2855}
2856
113cea80 2857/*
6d416b9c
LS
2858 * Return values:
2859 * < 0 : wait_for_terminate() failed to get the state of the
2860 * container, the container was terminated by a signal, or
2861 * failed for an unknown reason. No change is made to the
2862 * container argument.
2863 * > 0 : The program executed in the container terminated with an
2864 * error. The exit code of the program executed in the
919699ec
LP
2865 * container is returned. The container argument has been set
2866 * to CONTAINER_TERMINATED.
6d416b9c
LS
2867 * 0 : The container is being rebooted, has been shut down or exited
2868 * successfully. The container argument has been set to either
2869 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2870 *
6d416b9c
LS
2871 * That is, success is indicated by a return value of zero, and an
2872 * error is indicated by a non-zero value.
113cea80
DH
2873 */
2874static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2875 siginfo_t status;
919699ec 2876 int r;
113cea80
DH
2877
2878 r = wait_for_terminate(pid, &status);
f647962d
MS
2879 if (r < 0)
2880 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2881
2882 switch (status.si_code) {
fddbb89c 2883
113cea80 2884 case CLD_EXITED:
b5a2179b 2885 if (status.si_status == 0)
919699ec 2886 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2887 else
919699ec 2888 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2889
919699ec
LP
2890 *container = CONTAINER_TERMINATED;
2891 return status.si_status;
113cea80
DH
2892
2893 case CLD_KILLED:
2894 if (status.si_status == SIGINT) {
919699ec 2895 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2896 *container = CONTAINER_TERMINATED;
919699ec
LP
2897 return 0;
2898
113cea80 2899 } else if (status.si_status == SIGHUP) {
919699ec 2900 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2901 *container = CONTAINER_REBOOTED;
919699ec 2902 return 0;
113cea80 2903 }
919699ec 2904
4831981d 2905 _fallthrough_;
113cea80 2906 case CLD_DUMPED:
baaa35ad
ZJS
2907 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2908 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2909
2910 default:
baaa35ad
ZJS
2911 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2912 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2913 }
113cea80
DH
2914}
2915
023fb90b
LP
2916static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2917 pid_t pid;
2918
4a0b58c4 2919 pid = PTR_TO_PID(userdata);
023fb90b 2920 if (pid > 0) {
c6c8f6e2 2921 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2922 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2923 sd_event_source_set_userdata(s, NULL);
2924 return 0;
2925 }
2926 }
2927
2928 sd_event_exit(sd_event_source_get_event(s), 0);
2929 return 0;
2930}
2931
6916b164 2932static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2933 pid_t pid;
2934
2935 assert(s);
2936 assert(ssi);
2937
2938 pid = PTR_TO_PID(userdata);
2939
6916b164
AU
2940 for (;;) {
2941 siginfo_t si = {};
abdb9b08 2942
6916b164
AU
2943 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2944 return log_error_errno(errno, "Failed to waitid(): %m");
2945 if (si.si_pid == 0) /* No pending children. */
2946 break;
abdb9b08 2947 if (si.si_pid == pid) {
6916b164
AU
2948 /* The main process we care for has exited. Return from
2949 * signal handler but leave the zombie. */
2950 sd_event_exit(sd_event_source_get_event(s), 0);
2951 break;
2952 }
abdb9b08 2953
6916b164
AU
2954 /* Reap all other children. */
2955 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2956 }
2957
2958 return 0;
2959}
2960
abdb9b08
LP
2961static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2962 pid_t pid;
2963
2964 assert(m);
2965
2966 pid = PTR_TO_PID(userdata);
2967
2968 if (arg_kill_signal > 0) {
2969 log_info("Container termination requested. Attempting to halt container.");
2970 (void) kill(pid, arg_kill_signal);
2971 } else {
2972 log_info("Container termination requested. Exiting.");
2973 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2974 }
2975
2976 return 0;
2977}
2978
ec16945e 2979static int determine_names(void) {
1b9cebf6 2980 int r;
ec16945e 2981
c1521918
LP
2982 if (arg_template && !arg_directory && arg_machine) {
2983
2984 /* If --template= was specified then we should not
2985 * search for a machine, but instead create a new one
2986 * in /var/lib/machine. */
2987
657ee2d8 2988 arg_directory = path_join("/var/lib/machines", arg_machine);
c1521918
LP
2989 if (!arg_directory)
2990 return log_oom();
2991 }
2992
ec16945e 2993 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2994 if (arg_machine) {
2995 _cleanup_(image_unrefp) Image *i = NULL;
2996
d577d4a4 2997 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3a6ce860
LP
2998 if (r == -ENOENT)
2999 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
3000 if (r < 0)
3001 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 3002
eb38edce 3003 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 3004 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 3005 else
0f03c2a4 3006 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 3007 if (r < 0)
0f3be6ca 3008 return log_oom();
1b9cebf6 3009
aee327b8
LP
3010 if (!arg_ephemeral)
3011 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
3012 } else {
3013 r = safe_getcwd(&arg_directory);
3014 if (r < 0)
3015 return log_error_errno(r, "Failed to determine current directory: %m");
3016 }
ec16945e 3017
c6147113
LP
3018 if (!arg_directory && !arg_image)
3019 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
ec16945e
LP
3020 }
3021
3022 if (!arg_machine) {
b9ba4dab
LP
3023 if (arg_directory && path_equal(arg_directory, "/"))
3024 arg_machine = gethostname_malloc();
e9b88a6d
LP
3025 else if (arg_image) {
3026 char *e;
4827ab48 3027
e9b88a6d 3028 arg_machine = strdup(basename(arg_image));
4827ab48 3029
e9b88a6d
LP
3030 /* Truncate suffix if there is one */
3031 e = endswith(arg_machine, ".raw");
3032 if (e)
3033 *e = 0;
3034 } else
3035 arg_machine = strdup(basename(arg_directory));
ec16945e
LP
3036 if (!arg_machine)
3037 return log_oom();
3038
ae691c1d 3039 hostname_cleanup(arg_machine);
52ef5dd7 3040 if (!hostname_is_valid(arg_machine, 0))
c6147113 3041 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
b9ba4dab 3042
e9b88a6d
LP
3043 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3044 * instances at once without manually having to specify -M each time. */
3045 if (arg_ephemeral)
3046 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
b9ba4dab 3047 return log_oom();
ec16945e
LP
3048 }
3049
3050 return 0;
3051}
3052
/* Resolves the path stored in *p with chase_symlinks() and replaces *p by the result. A NULL *p is
 * left alone.
 *
 * p:     address of the (heap allocated) path string to update in place; must not be NULL itself.
 * flags: passed through to chase_symlinks().
 *
 * Returns 0 if there was nothing to do; otherwise the value of free_and_replace() on success, or a
 * negative errno-style value if resolving failed (in which case *p is unchanged). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        return free_and_replace(*p, resolved);
}
3068
03cfe0d5 3069static int determine_uid_shift(const char *directory) {
6dac160c 3070
0de7acce 3071 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 3072 arg_uid_shift = 0;
6dac160c 3073 return 0;
03cfe0d5 3074 }
6dac160c
LP
3075
3076 if (arg_uid_shift == UID_INVALID) {
3077 struct stat st;
3078
993da6d4
LP
3079 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3080
3081 if (stat(directory, &st) < 0)
03cfe0d5 3082 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
3083
3084 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3085
baaa35ad
ZJS
3086 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3087 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3088 "UID and GID base of %s don't match.", directory);
6dac160c
LP
3089
3090 arg_uid_range = UINT32_C(0x10000);
f61c7f88
LP
3091
3092 if (arg_uid_shift != 0) {
3093 /* If the image is shifted already, then we'll fall back to classic chowning, for
3094 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3095
3096 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3097 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3098 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3099 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3100 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3101 "UID base of %s is not zero, UID mapping not supported.", directory);
3102 }
6dac160c
LP
3103 }
3104
58e13de5
LP
3105 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3106 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
6dac160c 3107
6dac160c
LP
3108 return 0;
3109}
3110
de40a303
LP
3111static unsigned long effective_clone_ns_flags(void) {
3112 unsigned long flags = arg_clone_ns_flags;
3113
3114 if (arg_private_network)
3115 flags |= CLONE_NEWNET;
3116 if (arg_use_cgns)
3117 flags |= CLONE_NEWCGROUP;
3118 if (arg_userns_mode != USER_NAMESPACE_NO)
3119 flags |= CLONE_NEWUSER;
3120
3121 return flags;
3122}
3123
3124static int patch_sysctl(void) {
3125
3126 /* This table is inspired by runc's sysctl() function */
3127 static const struct {
3128 const char *key;
3129 bool prefix;
3130 unsigned long clone_flags;
3131 } safe_sysctl[] = {
3132 { "kernel.hostname", false, CLONE_NEWUTS },
3133 { "kernel.domainname", false, CLONE_NEWUTS },
3134 { "kernel.msgmax", false, CLONE_NEWIPC },
3135 { "kernel.msgmnb", false, CLONE_NEWIPC },
3136 { "kernel.msgmni", false, CLONE_NEWIPC },
3137 { "kernel.sem", false, CLONE_NEWIPC },
3138 { "kernel.shmall", false, CLONE_NEWIPC },
3139 { "kernel.shmmax", false, CLONE_NEWIPC },
3140 { "kernel.shmmni", false, CLONE_NEWIPC },
3141 { "fs.mqueue.", true, CLONE_NEWIPC },
3142 { "net.", true, CLONE_NEWNET },
3143 };
3144
3145 unsigned long flags;
3146 char **k, **v;
3147 int r;
3148
3149 flags = effective_clone_ns_flags();
3150
3151 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3152 bool good = false;
3153 size_t i;
3154
3155 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3156
3157 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3158 continue;
3159
3160 if (safe_sysctl[i].prefix)
3161 good = startswith(*k, safe_sysctl[i].key);
3162 else
3163 good = streq(*k, safe_sysctl[i].key);
3164
3165 if (good)
3166 break;
3167 }
3168
c6147113
LP
3169 if (!good)
3170 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
de40a303
LP
3171
3172 r = sysctl_write(*k, *v);
3173 if (r < 0)
3174 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3175 }
3176
3177 return 0;
3178}
3179
03cfe0d5
LP
3180static int inner_child(
3181 Barrier *barrier,
3182 const char *directory,
3183 bool secondary,
3184 int kmsg_socket,
3185 int rtnl_socket,
3acc84eb 3186 int master_pty_socket,
e1bb4b0d
LB
3187 FDSet *fds,
3188 char **os_release_pairs) {
69c79d3c 3189
03cfe0d5 3190 _cleanup_free_ char *home = NULL;
b5ea030d 3191 char as_uuid[ID128_UUID_STRING_MAX];
88614c8a 3192 size_t n_env = 1;
03cfe0d5 3193 const char *envp[] = {
0c300adf 3194 "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 3195 NULL, /* container */
03cfe0d5
LP
3196 NULL, /* TERM */
3197 NULL, /* HOME */
3198 NULL, /* USER */
3199 NULL, /* LOGNAME */
3200 NULL, /* container_uuid */
3201 NULL, /* LISTEN_FDS */
3202 NULL, /* LISTEN_PID */
9c1e04d0 3203 NULL, /* NOTIFY_SOCKET */
3652872a 3204 NULL, /* CREDENTIALS_DIRECTORY */
03cfe0d5
LP
3205 NULL
3206 };
1a68e1e5 3207 const char *exec_target;
2371271c 3208 _cleanup_strv_free_ char **env_use = NULL;
de40a303 3209 int r, which_failed;
88213476 3210
b37469d7
LP
3211 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3212 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3213 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3214 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3215 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3216 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3217 * namespace.
3218 *
3219 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3220 * unshare(). See below. */
3221
03cfe0d5
LP
3222 assert(barrier);
3223 assert(directory);
3224 assert(kmsg_socket >= 0);
88213476 3225
de40a303
LP
3226 log_debug("Inner child is initializing.");
3227
0de7acce 3228 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3229 /* Tell the parent, that it now can write the UID map. */
3230 (void) barrier_place(barrier); /* #1 */
7027ff61 3231
03cfe0d5 3232 /* Wait until the parent wrote the UID map */
baaa35ad 3233 if (!barrier_place_and_sync(barrier)) /* #2 */
2a2e78e9 3234 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
88213476 3235
2a2e78e9
LP
3236 /* Become the new root user inside our namespace */
3237 r = reset_uid_gid();
3238 if (r < 0)
3239 return log_error_errno(r, "Couldn't become new root: %m");
3240
3241 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3242 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3243 * propagation, but simply create new peer groups for all our mounts). */
511a8cfe 3244 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
2a2e78e9
LP
3245 if (r < 0)
3246 return r;
3247 }
6d66bd3b 3248
0de7acce 3249 r = mount_all(NULL,
4f086aab 3250 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 3251 arg_uid_shift,
0de7acce 3252 arg_selinux_apifs_context);
03cfe0d5
LP
3253 if (r < 0)
3254 return r;
3255
04413780
ZJS
3256 if (!arg_network_namespace_path && arg_private_network) {
3257 r = unshare(CLONE_NEWNET);
3258 if (r < 0)
3259 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
3260
3261 /* Tell the parent that it can setup network interfaces. */
3262 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
3263 }
3264
4f086aab 3265 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
3266 if (r < 0)
3267 return r;
3268
03cfe0d5
LP
3269 /* Wait until we are cgroup-ified, so that we
3270 * can mount the right cgroup path writable */
baaa35ad
ZJS
3271 if (!barrier_place_and_sync(barrier)) /* #4 */
3272 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3273 "Parent died too early");
88213476 3274
489fae52 3275 if (arg_use_cgns) {
0996ef00
CB
3276 r = unshare(CLONE_NEWCGROUP);
3277 if (r < 0)
04413780 3278 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
3279 r = mount_cgroups(
3280 "",
3281 arg_unified_cgroup_hierarchy,
3282 arg_userns_mode != USER_NAMESPACE_NO,
3283 arg_uid_shift,
3284 arg_uid_range,
5a8ff0e6 3285 arg_selinux_apifs_context,
ada54120 3286 true);
1433e0f2 3287 } else
0996ef00 3288 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
1433e0f2
LP
3289 if (r < 0)
3290 return r;
ec16945e 3291
1e4f1671 3292 r = setup_boot_id();
03cfe0d5
LP
3293 if (r < 0)
3294 return r;
ec16945e 3295
1e4f1671 3296 r = setup_kmsg(kmsg_socket);
03cfe0d5
LP
3297 if (r < 0)
3298 return r;
3299 kmsg_socket = safe_close(kmsg_socket);
ec16945e 3300
de40a303
LP
3301 r = mount_custom(
3302 "/",
3303 arg_custom_mounts,
3304 arg_n_custom_mounts,
de40a303
LP
3305 0,
3306 arg_selinux_apifs_context,
5f0a6347 3307 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
de40a303
LP
3308 if (r < 0)
3309 return r;
3310
03cfe0d5
LP
3311 if (setsid() < 0)
3312 return log_error_errno(errno, "setsid() failed: %m");
3313
3314 if (arg_private_network)
df883de9 3315 (void) loopback_setup();
03cfe0d5 3316
7a8f6325
LP
3317 if (arg_expose_ports) {
3318 r = expose_port_send_rtnl(rtnl_socket);
3319 if (r < 0)
3320 return r;
3321 rtnl_socket = safe_close(rtnl_socket);
3322 }
03cfe0d5 3323
3acc84eb 3324 if (arg_console_mode != CONSOLE_PIPE) {
cd132992 3325 _cleanup_close_ int master = -1;
3acc84eb
FB
3326 _cleanup_free_ char *console = NULL;
3327
3328 /* Allocate a pty and make it available as /dev/console. */
dc98caea 3329 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3acc84eb 3330 if (master < 0)
dc98caea 3331 return log_error_errno(master, "Failed to allocate a pty: %m");
3acc84eb
FB
3332
3333 r = setup_dev_console(console);
3334 if (r < 0)
105a1a36 3335 return log_error_errno(r, "Failed to set up /dev/console: %m");
3acc84eb
FB
3336
3337 r = send_one_fd(master_pty_socket, master, 0);
3338 if (r < 0)
3339 return log_error_errno(r, "Failed to send master fd: %m");
3340 master_pty_socket = safe_close(master_pty_socket);
3341
3342 r = setup_stdio_as_dev_console();
3343 if (r < 0)
3344 return r;
3345 }
3346
de40a303
LP
3347 r = patch_sysctl();
3348 if (r < 0)
3349 return r;
3350
81f345df
LP
3351 if (arg_oom_score_adjust_set) {
3352 r = set_oom_score_adjust(arg_oom_score_adjust);
3353 if (r < 0)
3354 return log_error_errno(r, "Failed to adjust OOM score: %m");
3355 }
3356
0985c7c4
ZJS
3357 if (arg_cpu_set.set)
3358 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
d107bb7d
LP
3359 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3360
c818eef1 3361 (void) setup_hostname();
03cfe0d5 3362
050f7277 3363 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
3364 r = safe_personality(arg_personality);
3365 if (r < 0)
3366 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 3367 } else if (secondary) {
21022b9d
LP
3368 r = safe_personality(PER_LINUX32);
3369 if (r < 0)
3370 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
3371 }
3372
de40a303
LP
3373 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3374 if (r < 0)
3375 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3376
3377#if HAVE_SECCOMP
3378 if (arg_seccomp) {
3379
3380 if (is_seccomp_available()) {
3381
3382 r = seccomp_load(arg_seccomp);
7bc5e0b1 3383 if (ERRNO_IS_SECCOMP_FATAL(r))
de40a303
LP
3384 return log_error_errno(r, "Failed to install seccomp filter: %m");
3385 if (r < 0)
3386 log_debug_errno(r, "Failed to install seccomp filter: %m");
3387 }
3388 } else
3389#endif
3390 {
6b000af4 3391 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
de40a303
LP
3392 if (r < 0)
3393 return r;
3394 }
3395
349cc4a5 3396#if HAVE_SELINUX
03cfe0d5 3397 if (arg_selinux_context)
2ed96880 3398 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
3399 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3400#endif
3401
de40a303
LP
3402 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3403 * if we need to later on. */
3404 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3405 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3406
3407 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3462d773 3408 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
de40a303 3409 else
3462d773 3410 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
03cfe0d5
LP
3411 if (r < 0)
3412 return r;
3413
de40a303
LP
3414 r = drop_capabilities(getuid());
3415 if (r < 0)
3416 return log_error_errno(r, "Dropping capabilities failed: %m");
3417
66edd963
LP
3418 if (arg_no_new_privileges)
3419 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3420 return log_error_errno(errno, "Failed to disable new privileges: %m");
3421
6aadfa4c
ILG
3422 /* LXC sets container=lxc, so follow the scheme here */
3423 envp[n_env++] = strjoina("container=", arg_container_service_name);
3424
03cfe0d5
LP
3425 envp[n_env] = strv_find_prefix(environ, "TERM=");
3426 if (envp[n_env])
313cefa1 3427 n_env++;
03cfe0d5 3428
de40a303
LP
3429 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3430 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3431 return log_oom();
3432
3433 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3434 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
3435 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3436 return log_oom();
03cfe0d5 3437
3bbaff3e 3438 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3439
691675ba 3440 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 3441 return log_oom();
03cfe0d5
LP
3442
3443 if (fdset_size(fds) > 0) {
3444 r = fdset_cloexec(fds, false);
3445 if (r < 0)
3446 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3447
3448 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3449 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3450 return log_oom();
3451 }
9c1e04d0
AP
3452 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3453 return log_oom();
03cfe0d5 3454
3652872a
LP
3455 if (arg_n_credentials > 0) {
3456 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3457 if (!envp[n_env])
3458 return log_oom();
3459 n_env++;
3460 }
3461
ed4512d0 3462 env_use = strv_env_merge(3, envp, os_release_pairs, arg_setenv);
2371271c
TG
3463 if (!env_use)
3464 return log_oom();
03cfe0d5
LP
3465
3466 /* Let the parent know that we are ready and
3467 * wait until the parent is ready with the
3468 * setup, too... */
baaa35ad 3469 if (!barrier_place_and_sync(barrier)) /* #5 */
335d2ead 3470 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
03cfe0d5 3471
5f932eb9
LP
3472 if (arg_chdir)
3473 if (chdir(arg_chdir) < 0)
3474 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3475
7732f92b 3476 if (arg_start_mode == START_PID2) {
75bf701f 3477 r = stub_pid1(arg_uuid);
7732f92b
LP
3478 if (r < 0)
3479 return r;
3480 }
3481
335d2ead
LP
3482 if (arg_console_mode != CONSOLE_PIPE) {
3483 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3484 * are configured for that. Acquire it as controlling tty. */
3485 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3486 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3487 }
3488
de40a303
LP
3489 log_debug("Inner child completed, invoking payload.");
3490
8ca082b4
LP
3491 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3492 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3493 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3494 log_close();
8ca082b4
LP
3495 log_set_open_when_needed(true);
3496
03cfe0d5
LP
3497 (void) fdset_close_others(fds);
3498
7732f92b 3499 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3500 char **a;
3501 size_t m;
3502
3503 /* Automatically search for the init system */
3504
75f32f04
ZJS
3505 m = strv_length(arg_parameters);
3506 a = newa(char*, m + 2);
3507 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3508 a[1 + m] = NULL;
03cfe0d5 3509
ced58da7 3510 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
3511 execve(a[0], a, env_use);
3512
ced58da7 3513 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
3514 execve(a[0], a, env_use);
3515
ced58da7 3516 a[0] = (char*) "/sbin/init";
03cfe0d5 3517 execve(a[0], a, env_use);
ced58da7
LP
3518
3519 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3520 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3521 const char *dollar_path;
3522
1a68e1e5 3523 exec_target = arg_parameters[0];
b6b180b7
LP
3524
3525 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3526 * binary. */
3527 dollar_path = strv_env_get(env_use, "PATH");
3528 if (dollar_path) {
6f646e01 3529 if (setenv("PATH", dollar_path, 1) < 0)
b6b180b7
LP
3530 return log_error_errno(errno, "Failed to update $PATH: %m");
3531 }
3532
f757855e 3533 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3534 } else {
5f932eb9 3535 if (!arg_chdir)
d929b0f9
ZJS
3536 /* If we cannot change the directory, we'll end up in /, that is expected. */
3537 (void) chdir(home ?: "/root");
5f932eb9 3538
03cfe0d5
LP
3539 execle("/bin/bash", "-bash", NULL, env_use);
3540 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
3541
3542 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
3543 }
3544
8ca082b4 3545 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3546}
3547
e96ceaba 3548static int setup_notify_child(void) {
271f518f 3549 _cleanup_close_ int fd = -1;
9c1e04d0 3550 union sockaddr_union sa = {
44ed5214
LP
3551 .un.sun_family = AF_UNIX,
3552 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3553 };
3554 int r;
3555
3556 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3557 if (fd < 0)
3558 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3559
3560 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3561 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3562
9c1e04d0 3563 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3564 if (r < 0)
44ed5214 3565 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3566
adc7d9f0 3567 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3568 if (r < 0)
adc7d9f0 3569 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3570
2ff48e98 3571 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3572 if (r < 0)
2ff48e98 3573 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3574
271f518f 3575 return TAKE_FD(fd);
9c1e04d0
AP
3576}
3577
03cfe0d5
LP
3578static int outer_child(
3579 Barrier *barrier,
3580 const char *directory,
2d845785 3581 DissectedImage *dissected_image,
03cfe0d5
LP
3582 bool secondary,
3583 int pid_socket,
e01ff70a 3584 int uuid_socket,
9c1e04d0 3585 int notify_socket,
03cfe0d5
LP
3586 int kmsg_socket,
3587 int rtnl_socket,
825d5287 3588 int uid_shift_socket,
3acc84eb 3589 int master_pty_socket,
8199d554 3590 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
3591 FDSet *fds,
3592 int netns_fd) {
03cfe0d5 3593
2f893044 3594 _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
e1bb4b0d 3595 _cleanup_strv_free_ char **os_release_pairs = NULL;
bf428efb 3596 _cleanup_close_ int fd = -1;
f61c7f88 3597 bool idmap = false;
e5f10caf 3598 const char *p;
03cfe0d5
LP
3599 pid_t pid;
3600 ssize_t l;
de40a303 3601 int r;
03cfe0d5 3602
b37469d7
LP
3603 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
3604 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3605 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3606 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3607
03cfe0d5
LP
3608 assert(barrier);
3609 assert(directory);
03cfe0d5 3610 assert(pid_socket >= 0);
e01ff70a 3611 assert(uuid_socket >= 0);
9c1e04d0 3612 assert(notify_socket >= 0);
3acc84eb 3613 assert(master_pty_socket >= 0);
03cfe0d5
LP
3614 assert(kmsg_socket >= 0);
3615
de40a303
LP
3616 log_debug("Outer child is initializing.");
3617
e1bb4b0d
LB
3618 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3619 if (r < 0)
3620 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3621
03cfe0d5
LP
3622 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3623 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3624
03cfe0d5
LP
3625 r = reset_audit_loginuid();
3626 if (r < 0)
3627 return r;
3628
2a2e78e9
LP
3629 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3630 * mounts to the real root. */
511a8cfe 3631 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
60e76d48
ZJS
3632 if (r < 0)
3633 return r;
03cfe0d5 3634
2d845785 3635 if (dissected_image) {
2d3a5a73
LP
3636 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3637 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3638 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3639 * makes sure ESP partitions and userns are compatible. */
3640
af187ab2 3641 r = dissected_image_mount_and_warn(
d04faa4e
LP
3642 dissected_image,
3643 directory,
3644 arg_uid_shift,
21b61b1d 3645 arg_uid_range,
d04faa4e
LP
3646 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3647 DISSECT_IMAGE_DISCARD_ON_LOOP|
3648 DISSECT_IMAGE_USR_NO_ROOT|
c65f854a 3649 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
af187ab2 3650 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785 3651 if (r < 0)
af187ab2 3652 return r;
2d845785 3653 }
03cfe0d5 3654
391567f4
LP
3655 r = determine_uid_shift(directory);
3656 if (r < 0)
3657 return r;
3658
0de7acce 3659 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 3660 /* Let the parent know which UID shift we read from the image */
825d5287
RM
3661 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3662 if (l < 0)
3663 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3664 if (l != sizeof(arg_uid_shift))
3665 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3666 "Short write while sending UID shift.");
0e7ac751 3667
0de7acce 3668 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3669 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3670 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3671 * not it will pick a different one, and send it back to us. */
3672
3673 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3674 if (l < 0)
3675 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3676 if (l != sizeof(arg_uid_shift))
3677 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3678 "Short read while receiving UID shift.");
0e7ac751
LP
3679 }
3680
ff6c6cc1
LP
3681 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3682 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3683 }
3684
6f83d3d1
LP
3685 if (path_equal(directory, "/")) {
3686 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3687 * place, so that we can make changes to its mount structure (for example, to implement
3688 * --volatile=) without this interfering with our ability to access files such as
3689 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3690 * (instead of a temporary directory, since we are living in our own mount namspace here
3691 * already, and thus don't need to be afraid of colliding with anyone else's mounts).*/
3692 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3693
511a8cfe 3694 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
6f83d3d1
LP
3695 if (r < 0)
3696 return r;
3697
3698 directory = "/run/systemd/nspawn-root";
e50cd82f 3699 }
7d0ecdd6 3700
f61c7f88
LP
3701 if (arg_userns_mode != USER_NAMESPACE_NO &&
3702 IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3703 arg_uid_shift != 0) {
3704 r = make_mount_point(directory);
3705 if (r < 0)
3706 return r;
3707
3708 r = remount_idmap(directory, arg_uid_shift, arg_uid_range);
3709 if (r == -EINVAL || ERRNO_IS_NOT_SUPPORTED(r)) {
3710 /* This might fail because the kernel or file system doesn't support idmapping. We
3711 * can't really distinguish this nicely, nor do we have any guarantees about the
3712 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3713 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3714 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3715 "ID mapped mounts are apparently not available, sorry.");
3716
3717 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3718 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3719 } else if (r < 0)
3720 return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3721 else {
3722 log_debug("ID mapped mounts available, making use of them.");
3723 idmap = true;
3724 }
3725 }
3726
7d0ecdd6
LP
3727 r = setup_pivot_root(
3728 directory,
3729 arg_pivot_root_new,
3730 arg_pivot_root_old);
3731 if (r < 0)
3732 return r;
3733
3734 r = setup_volatile_mode(
3735 directory,
3736 arg_volatile_mode,
7d0ecdd6 3737 arg_uid_shift,
8f1ed04a 3738 arg_selinux_apifs_context);
7d0ecdd6
LP
3739 if (r < 0)
3740 return r;
3741
2f893044
LP
3742 r = bind_user_prepare(
3743 directory,
3744 arg_bind_user,
3745 arg_uid_shift,
3746 arg_uid_range,
3747 &arg_custom_mounts, &arg_n_custom_mounts,
3748 &bind_user_context);
3749 if (r < 0)
3750 return r;
3751
3752 if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
3753 /* Send the user maps we determined to the parent, so that it installs it in our user namespace UID map table */
3754
3755 for (size_t i = 0; i < bind_user_context->n_data; i++) {
3756 uid_t map[] = {
3757 bind_user_context->data[i].payload_user->uid,
3758 bind_user_context->data[i].host_user->uid,
3759 (uid_t) bind_user_context->data[i].payload_group->gid,
3760 (uid_t) bind_user_context->data[i].host_group->gid,
3761 };
3762
3763 l = send(uid_shift_socket, map, sizeof(map), MSG_NOSIGNAL);
3764 if (l < 0)
3765 return log_error_errno(errno, "Failed to send user UID map: %m");
3766 if (l != sizeof(map))
3767 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3768 "Short write while sending user UID map.");
3769 }
3770 }
3771
5f0a6347
DDM
3772 r = mount_custom(
3773 directory,
3774 arg_custom_mounts,
3775 arg_n_custom_mounts,
5f0a6347 3776 arg_uid_shift,
5f0a6347
DDM
3777 arg_selinux_apifs_context,
3778 MOUNT_ROOT_ONLY);
3779 if (r < 0)
3780 return r;
3781
5530dc87 3782 /* Make sure we always have a mount that we can move to root later on. */
14a25e1f
LP
3783 r = make_mount_point(directory);
3784 if (r < 0)
3785 return r;
5530dc87 3786
2d3a5a73
LP
3787 if (dissected_image) {
3788 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
d04faa4e
LP
3789 r = dissected_image_mount(
3790 dissected_image,
3791 directory,
3792 arg_uid_shift,
21b61b1d 3793 arg_uid_range,
d04faa4e
LP
3794 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3795 DISSECT_IMAGE_DISCARD_ON_LOOP|
3796 DISSECT_IMAGE_USR_NO_ROOT|
f61c7f88
LP
3797 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3798 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
4fcb96ce
LP
3799 if (r == -EUCLEAN)
3800 return log_error_errno(r, "File system check for image failed: %m");
2d3a5a73 3801 if (r < 0)
4fcb96ce 3802 return log_error_errno(r, "Failed to mount image file system: %m");
2d3a5a73
LP
3803 }
3804
8199d554
LP
3805 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3806 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3807
3808 r = detect_unified_cgroup_hierarchy_from_image(directory);
3809 if (r < 0)
3810 return r;
3811
3812 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3813 if (l < 0)
3814 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3815 if (l != sizeof(arg_unified_cgroup_hierarchy))
3816 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3817 "Short write while sending cgroup mode.");
8199d554
LP
3818
3819 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3820 }
3821
4ad14eff
LP
3822 /* Mark everything as shared so our mounts get propagated down. This is
3823 * required to make new bind mounts available in systemd services
5238e957 3824 * inside the container that create a new mount namespace.
4ad14eff
LP
3825 * See https://github.com/systemd/systemd/issues/3860
3826 * Further submounts (such as /dev) done after this will inherit the
5f0a6347
DDM
3827 * shared propagation mode.
3828 *
3829 * IMPORTANT: Do not overmount the root directory anymore from now on to
3830 * enable moving the root directory mount to root later on.
3831 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3832 */
511a8cfe 3833 r = mount_nofollow_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
4ad14eff
LP
3834 if (r < 0)
3835 return r;
3836
3837 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3838 if (r < 0)
3839 return r;
3840
03cfe0d5
LP
3841 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3842 if (r < 0)
3843 return r;
3844
bbd407ea
DDM
3845 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3846 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
64e82c19 3847 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3848 if (r < 0)
3849 return log_error_errno(r, "Failed to make tree read-only: %m");
3850 }
3851
0de7acce 3852 r = mount_all(directory,
4f086aab 3853 arg_mount_settings,
0de7acce 3854 arg_uid_shift,
0de7acce 3855 arg_selinux_apifs_context);
03cfe0d5
LP
3856 if (r < 0)
3857 return r;
3858
07fa00f9
LP
3859 r = copy_devnodes(directory);
3860 if (r < 0)
03cfe0d5
LP
3861 return r;
3862
de40a303
LP
3863 r = make_extra_nodes(directory);
3864 if (r < 0)
3865 return r;
3866
3867 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
e5f10caf 3868
9fac5029 3869 p = prefix_roota(directory, "/run/host");
e5f10caf 3870 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
03cfe0d5 3871
07fa00f9
LP
3872 r = setup_pts(directory);
3873 if (r < 0)
03cfe0d5
LP
3874 return r;
3875
3876 r = setup_propagate(directory);
3877 if (r < 0)
3878 return r;
3879
8e5430c4
LP
3880 r = setup_keyring();
3881 if (r < 0)
3882 return r;
3883
3652872a
LP
3884 r = setup_credentials(directory);
3885 if (r < 0)
3886 return r;
3887
2f893044
LP
3888 r = bind_user_setup(bind_user_context, directory);
3889 if (r < 0)
3890 return r;
3891
5c4deb9a
MJ
3892 r = mount_custom(
3893 directory,
3894 arg_custom_mounts,
3895 arg_n_custom_mounts,
3896 arg_uid_shift,
3897 arg_selinux_apifs_context,
3898 MOUNT_NON_ROOT_ONLY);
3899 if (r < 0)
3900 return r;
3901
03cfe0d5
LP
3902 r = setup_timezone(directory);
3903 if (r < 0)
3904 return r;
3905
3906 r = setup_resolv_conf(directory);
3907 if (r < 0)
3908 return r;
3909
e01ff70a
MS
3910 r = setup_machine_id(directory);
3911 if (r < 0)
3912 return r;
3913
03cfe0d5
LP
3914 r = setup_journal(directory);
3915 if (r < 0)
3916 return r;
3917
0f48ba7b
LP
3918 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3919 p = prefix_roota(directory, "/run/host/container-manager");
3920 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3921
3922 /* The same stuff as the $container_uuid env var */
3923 p = prefix_roota(directory, "/run/host/container-uuid");
3924 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3925
489fae52 3926 if (!arg_use_cgns) {
0996ef00
CB
3927 r = mount_cgroups(
3928 directory,
3929 arg_unified_cgroup_hierarchy,
3930 arg_userns_mode != USER_NAMESPACE_NO,
3931 arg_uid_shift,
3932 arg_uid_range,
5a8ff0e6 3933 arg_selinux_apifs_context,
ada54120 3934 false);
0996ef00
CB
3935 if (r < 0)
3936 return r;
3937 }
03cfe0d5
LP
3938
3939 r = mount_move_root(directory);
3940 if (r < 0)
3941 return log_error_errno(r, "Failed to move root directory: %m");
3942
e96ceaba 3943 fd = setup_notify_child();
9c1e04d0
AP
3944 if (fd < 0)
3945 return fd;
3946
03cfe0d5 3947 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3948 arg_clone_ns_flags |
8869a0b4 3949 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3950 if (pid < 0)
3951 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3952 if (pid == 0) {
3953 pid_socket = safe_close(pid_socket);
e01ff70a 3954 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3955 notify_socket = safe_close(notify_socket);
825d5287 3956 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5 3957
2a2e78e9
LP
3958 /* The inner child has all namespaces that are requested, so that we all are owned by the
3959 * user if user namespaces are turned on. */
03cfe0d5 3960
d7bea6b6
DP
3961 if (arg_network_namespace_path) {
3962 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3963 if (r < 0)
e2d39e54 3964 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
3965 }
3966
e1bb4b0d 3967 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds, os_release_pairs);
03cfe0d5
LP
3968 if (r < 0)
3969 _exit(EXIT_FAILURE);
3970
3971 _exit(EXIT_SUCCESS);
3972 }
3973
3974 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3975 if (l < 0)
3976 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
3977 if (l != sizeof(pid))
3978 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3979 "Short write while sending PID.");
03cfe0d5 3980
e01ff70a
MS
3981 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3982 if (l < 0)
3983 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
3984 if (l != sizeof(arg_uuid))
3985 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3986 "Short write while sending machine ID.");
e01ff70a 3987
9c1e04d0
AP
3988 l = send_one_fd(notify_socket, fd, 0);
3989 if (l < 0)
ba72801d 3990 return log_error_errno(l, "Failed to send notify fd: %m");
9c1e04d0 3991
03cfe0d5 3992 pid_socket = safe_close(pid_socket);
e01ff70a 3993 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3994 notify_socket = safe_close(notify_socket);
3acc84eb 3995 master_pty_socket = safe_close(master_pty_socket);
327e26d6
KN
3996 kmsg_socket = safe_close(kmsg_socket);
3997 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3998 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3999
4000 return 0;
4001}
4002
0e7ac751 4003static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 4004 bool tried_hashed = false;
0e7ac751
LP
4005 unsigned n_tries = 100;
4006 uid_t candidate;
4007 int r;
4008
4009 assert(shift);
4010 assert(ret_lock_file);
0de7acce 4011 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
4012 assert(arg_uid_range == 0x10000U);
4013
4014 candidate = *shift;
4015
4016 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4017
4018 for (;;) {
fbd0b64f 4019 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 4020 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
4021
4022 if (--n_tries <= 0)
4023 return -EBUSY;
4024
87d5e4f2 4025 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
4026 goto next;
4027 if ((candidate & UINT32_C(0xFFFF)) != 0)
4028 goto next;
4029
4030 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4031 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4032 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4033 goto next;
4034 if (r < 0)
4035 return r;
4036
4037 /* Make some superficial checks whether the range is currently known in the user database */
4038 if (getpwuid(candidate))
4039 goto next;
4040 if (getpwuid(candidate + UINT32_C(0xFFFE)))
4041 goto next;
4042 if (getgrgid(candidate))
4043 goto next;
4044 if (getgrgid(candidate + UINT32_C(0xFFFE)))
4045 goto next;
4046
4047 *ret_lock_file = lf;
4048 lf = (struct LockFile) LOCK_FILE_INIT;
4049 *shift = candidate;
4050 return 0;
4051
4052 next:
d381c8a6
LP
4053 if (arg_machine && !tried_hashed) {
4054 /* Try to hash the base from the container name */
4055
4056 static const uint8_t hash_key[] = {
4057 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4058 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4059 };
4060
4061 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4062
4063 tried_hashed = true;
4064 } else
4065 random_bytes(&candidate, sizeof(candidate));
4066
87d5e4f2 4067 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
4068 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4069 }
4070}
4071
2f893044
LP
4072static int add_one_uid_map(
4073 char **p,
4074 uid_t container_uid,
4075 uid_t host_uid,
4076 uid_t range) {
4077
4078 return strextendf(p,
4079 UID_FMT " " UID_FMT " " UID_FMT "\n",
4080 container_uid, host_uid, range);
4081}
4082
4083static int make_uid_map_string(
4084 const uid_t bind_user_uid[],
4085 size_t n_bind_user_uid,
4086 size_t offset,
4087 char **ret) {
4088
4089 _cleanup_free_ char *s = NULL;
4090 uid_t previous_uid = 0;
4091 int r;
4092
4093 assert(n_bind_user_uid == 0 || bind_user_uid);
4094 assert(offset == 0 || offset == 2); /* used to switch between UID and GID map */
4095 assert(ret);
4096
4097 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4098 * quadruplet, consisting of host and container UID + GID. */
4099
4100 for (size_t i = 0; i < n_bind_user_uid; i++) {
4101 uid_t payload_uid = bind_user_uid[i*2+offset],
4102 host_uid = bind_user_uid[i*2+offset+1];
4103
4104 assert(previous_uid <= payload_uid);
4105 assert(payload_uid < arg_uid_range);
4106
4107 /* Add a range to close the gap to previous entry */
4108 if (payload_uid > previous_uid) {
4109 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4110 if (r < 0)
4111 return r;
4112 }
4113
4114 /* Map this specific user */
4115 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4116 if (r < 0)
4117 return r;
4118
4119 previous_uid = payload_uid + 1;
4120 }
4121
4122 /* And add a range to close the gap to finish the range */
4123 if (arg_uid_range > previous_uid) {
4124 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4125 if (r < 0)
4126 return r;
4127 }
4128
4129 assert(s);
4130
4131 *ret = TAKE_PTR(s);
4132 return 0;
4133}
4134
4135static int setup_uid_map(
4136 pid_t pid,
4137 const uid_t bind_user_uid[],
4138 size_t n_bind_user_uid) {
4139
4140 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4141 _cleanup_free_ char *s = NULL;
03cfe0d5
LP
4142 int r;
4143
4144 assert(pid > 1);
4145
2f893044
LP
4146 /* Build the UID map string */
4147 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4148 return log_oom();
4149
03cfe0d5 4150 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2f893044 4151 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4152 if (r < 0)
4153 return log_error_errno(r, "Failed to write UID map: %m");
4154
2f893044
LP
4155 /* And now build the GID map string */
4156 s = mfree(s);
4157 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4158 return log_oom();
4159
03cfe0d5 4160 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2f893044 4161 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
4162 if (r < 0)
4163 return log_error_errno(r, "Failed to write GID map: %m");
4164
4165 return 0;
4166}
4167
9c1e04d0 4168static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
4169 char buf[NOTIFY_BUFFER_MAX+1];
4170 char *p = NULL;
4171 struct iovec iovec = {
4172 .iov_base = buf,
4173 .iov_len = sizeof(buf)-1,
4174 };
fb29cdbe
LP
4175 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4176 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
9c1e04d0
AP
4177 struct msghdr msghdr = {
4178 .msg_iov = &iovec,
4179 .msg_iovlen = 1,
4180 .msg_control = &control,
4181 .msg_controllen = sizeof(control),
4182 };
371d72e0 4183 struct ucred *ucred;
9c1e04d0
AP
4184 ssize_t n;
4185 pid_t inner_child_pid;
4186 _cleanup_strv_free_ char **tags = NULL;
4187
4188 assert(userdata);
4189
4190 inner_child_pid = PTR_TO_PID(userdata);
4191
4192 if (revents != EPOLLIN) {
4193 log_warning("Got unexpected poll event for notify fd.");
4194 return 0;
4195 }
4196
3691bcf3
LP
4197 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
4198 if (IN_SET(n, -EAGAIN, -EINTR))
4199 return 0;
741bfd7f
LP
4200 if (n == -EXFULL) {
4201 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4202 return 0;
4203 }
3691bcf3
LP
4204 if (n < 0)
4205 return log_warning_errno(n, "Couldn't read notification socket: %m");
9c1e04d0 4206
9c1e04d0
AP
4207 cmsg_close_all(&msghdr);
4208
371d72e0 4209 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
9c1e04d0 4210 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 4211 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
4212 return 0;
4213 }
4214
4215 if ((size_t) n >= sizeof(buf)) {
4216 log_warning("Received notify message exceeded maximum size. Ignoring.");
4217 return 0;
4218 }
4219
4220 buf[n] = 0;
4221 tags = strv_split(buf, "\n\r");
4222 if (!tags)
4223 return log_oom();
4224
4225 if (strv_find(tags, "READY=1"))
04f590a4 4226 (void) sd_notifyf(false, "READY=1\n");
9c1e04d0
AP
4227
4228 p = strv_find_startswith(tags, "STATUS=");
4229 if (p)
04f590a4 4230 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
4231
4232 return 0;
4233}
4234
e96ceaba 4235static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 4236 int r;
9c1e04d0 4237
5773024d 4238 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
4239 if (r < 0)
4240 return log_error_errno(r, "Failed to allocate notify event source: %m");
4241
5773024d 4242 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
4243
4244 return 0;
4245}
4246
5d961407
LP
4247static int merge_settings(Settings *settings, const char *path) {
4248 int rl;
f757855e 4249
5d961407
LP
4250 assert(settings);
4251 assert(path);
f757855e 4252
5d961407
LP
4253 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4254 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 4255
7732f92b
LP
4256 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4257 settings->start_mode >= 0) {
4258 arg_start_mode = settings->start_mode;
130d3d22 4259 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
4260 }
4261
a2f577fc
JL
4262 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
4263 arg_ephemeral = settings->ephemeral;
4264
de40a303
LP
4265 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4266 settings->root) {
4267
4268 if (!arg_settings_trusted)
4269 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4270 else
4271 free_and_replace(arg_directory, settings->root);
4272 }
4273
b53ede69
PW
4274 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4275 settings->pivot_root_new) {
4276 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4277 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4278 }
4279
5f932eb9 4280 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
4281 settings->working_directory)
4282 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 4283
f757855e 4284 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
4285 settings->environment)
4286 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 4287
de40a303
LP
4288 if ((arg_settings_mask & SETTING_USER) == 0) {
4289
4290 if (settings->user)
4291 free_and_replace(arg_user, settings->user);
4292
4293 if (uid_is_valid(settings->uid))
4294 arg_uid = settings->uid;
4295 if (gid_is_valid(settings->gid))
4296 arg_gid = settings->gid;
4297 if (settings->n_supplementary_gids > 0) {
4298 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4299 arg_n_supplementary_gids = settings->n_supplementary_gids;
4300 }
4301 }
f757855e
LP
4302
4303 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 4304 uint64_t plus, minus;
7be830c6 4305 uint64_t network_minus = 0;
88fc9c9b 4306 uint64_t ambient;
f757855e 4307
de40a303
LP
4308 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4309 * Settings structure */
4310
0e265674 4311 plus = settings->capability;
a3fc6b55
LP
4312 minus = settings->drop_capability;
4313
4314 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
4315 if (settings_private_network(settings))
4316 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4317 else
7be830c6 4318 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
a3fc6b55 4319 }
0e265674
LP
4320
4321 if (!arg_settings_trusted && plus != 0) {
4322 if (settings->capability != 0)
5d961407 4323 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
7be830c6
TH
4324 } else {
4325 arg_caps_retain &= ~network_minus;
520e0d54 4326 arg_caps_retain |= plus;
7be830c6 4327 }
f757855e 4328
a3fc6b55 4329 arg_caps_retain &= ~minus;
de40a303
LP
4330
4331 /* Copy the full capabilities over too */
4332 if (capability_quintet_is_set(&settings->full_capabilities)) {
4333 if (!arg_settings_trusted)
5238e957 4334 log_warning("Ignoring capability settings, file %s is not trusted.", path);
de40a303
LP
4335 else
4336 arg_full_capabilities = settings->full_capabilities;
4337 }
88fc9c9b
TH
4338
4339 ambient = settings->ambient_capability;
4340 if (!arg_settings_trusted && ambient != 0)
4341 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4342 else
4343 arg_caps_ambient |= ambient;
f757855e
LP
4344 }
4345
4346 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4347 settings->kill_signal > 0)
4348 arg_kill_signal = settings->kill_signal;
4349
4350 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4351 settings->personality != PERSONALITY_INVALID)
4352 arg_personality = settings->personality;
4353
4354 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4355 !sd_id128_is_null(settings->machine_id)) {
4356
4357 if (!arg_settings_trusted)
5d961407 4358 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
4359 else
4360 arg_uuid = settings->machine_id;
4361 }
4362
4363 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4364 settings->read_only >= 0)
4365 arg_read_only = settings->read_only;
4366
4367 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4368 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4369 arg_volatile_mode = settings->volatile_mode;
4370
4371 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4372 settings->n_custom_mounts > 0) {
4373
4374 if (!arg_settings_trusted)
5d961407 4375 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
4376 else {
4377 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 4378 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 4379 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
4380 settings->n_custom_mounts = 0;
4381 }
4382 }
4383
4384 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4385 (settings->private_network >= 0 ||
4386 settings->network_veth >= 0 ||
4387 settings->network_bridge ||
22b28dfd 4388 settings->network_zone ||
f757855e
LP
4389 settings->network_interfaces ||
4390 settings->network_macvlan ||
f6d6bad1 4391 settings->network_ipvlan ||
de40a303
LP
4392 settings->network_veth_extra ||
4393 settings->network_namespace_path)) {
f757855e
LP
4394
4395 if (!arg_settings_trusted)
5d961407 4396 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 4397 else {
f6d6bad1 4398 arg_network_veth = settings_network_veth(settings);
0e265674
LP
4399 arg_private_network = settings_private_network(settings);
4400
130d3d22
YW
4401 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4402 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4403 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4404 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 4405
1cc6c93a
YW
4406 free_and_replace(arg_network_bridge, settings->network_bridge);
4407 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
4408
4409 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
4410 }
4411 }
4412
4413 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4414 settings->expose_ports) {
4415
4416 if (!arg_settings_trusted)
5d961407 4417 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
4418 else {
4419 expose_port_free_all(arg_expose_ports);
1cc6c93a 4420 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
4421 }
4422 }
4423
0de7acce
LP
4424 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4425 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4426
4427 if (!arg_settings_trusted)
5d961407 4428 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
4429 else {
4430 arg_userns_mode = settings->userns_mode;
4431 arg_uid_shift = settings->uid_shift;
4432 arg_uid_range = settings->uid_range;
6c045a99 4433 arg_userns_ownership = settings->userns_ownership;
0de7acce
LP
4434 }
4435 }
4436
2f893044
LP
4437 if ((arg_settings_mask & SETTING_BIND_USER) == 0)
4438 strv_free_and_replace(arg_bind_user, settings->bind_user);
4439
9c1e04d0
AP
4440 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
4441 arg_notify_ready = settings->notify_ready;
4442
960e4569
LP
4443 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4444
6b000af4 4445 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
5d961407 4446 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 4447 else {
6b000af4
LP
4448 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4449 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
960e4569 4450 }
de40a303
LP
4451
4452#if HAVE_SECCOMP
4453 if (!arg_settings_trusted && settings->seccomp)
4454 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4455 else {
4456 seccomp_release(arg_seccomp);
4457 arg_seccomp = TAKE_PTR(settings->seccomp);
4458 }
4459#endif
960e4569
LP
4460 }
4461
bf428efb
LP
4462 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4463 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4464 continue;
4465
4466 if (!settings->rlimit[rl])
4467 continue;
4468
4469 if (!arg_settings_trusted) {
5d961407 4470 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
4471 continue;
4472 }
4473
4474 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4475 }
4476
3a9530e5
LP
4477 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4478 settings->hostname)
4479 free_and_replace(arg_hostname, settings->hostname);
4480
66edd963
LP
4481 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4482 settings->no_new_privileges >= 0)
4483 arg_no_new_privileges = settings->no_new_privileges;
4484
81f345df
LP
4485 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4486 settings->oom_score_adjust_set) {
4487
4488 if (!arg_settings_trusted)
5d961407 4489 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
4490 else {
4491 arg_oom_score_adjust = settings->oom_score_adjust;
4492 arg_oom_score_adjust_set = true;
4493 }
4494 }
4495
d107bb7d 4496 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
0985c7c4 4497 settings->cpu_set.set) {
d107bb7d
LP
4498
4499 if (!arg_settings_trusted)
5d961407 4500 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d 4501 else {
0985c7c4
ZJS
4502 cpu_set_reset(&arg_cpu_set);
4503 arg_cpu_set = settings->cpu_set;
4504 settings->cpu_set = (CPUSet) {};
d107bb7d
LP
4505 }
4506 }
4507
09d423e9
LP
4508 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4509 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4510 arg_resolv_conf = settings->resolv_conf;
4511
4e1d6aa9
LP
4512 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4513 settings->link_journal != _LINK_JOURNAL_INVALID) {
4514
4515 if (!arg_settings_trusted)
4516 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4517 else {
4518 arg_link_journal = settings->link_journal;
4519 arg_link_journal_try = settings->link_journal_try;
4520 }
4521 }
4522
1688841f
LP
4523 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4524 settings->timezone != _TIMEZONE_MODE_INVALID)
4525 arg_timezone = settings->timezone;
4526
de40a303
LP
4527 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4528 settings->slice) {
4529
4530 if (!arg_settings_trusted)
4531 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4532 else
4533 free_and_replace(arg_slice, settings->slice);
4534 }
4535
4536 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4537 settings->use_cgns >= 0) {
4538
4539 if (!arg_settings_trusted)
4540 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4541 else
4542 arg_use_cgns = settings->use_cgns;
4543 }
4544
4545 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
f5fbe71d 4546 settings->clone_ns_flags != ULONG_MAX) {
de40a303
LP
4547
4548 if (!arg_settings_trusted)
4549 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4550 else
4551 arg_clone_ns_flags = settings->clone_ns_flags;
4552 }
4553
4554 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4555 settings->console_mode >= 0) {
4556
4557 if (!arg_settings_trusted)
4558 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4559 else
4560 arg_console_mode = settings->console_mode;
4561 }
4562
4563 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4564 * don't consult arg_settings_mask for them. */
4565
4566 sd_bus_message_unref(arg_property_message);
4567 arg_property_message = TAKE_PTR(settings->properties);
4568
4569 arg_console_width = settings->console_width;
4570 arg_console_height = settings->console_height;
4571
b2645747 4572 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
4573 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4574 arg_n_extra_nodes = settings->n_extra_nodes;
4575
f757855e
LP
4576 return 0;
4577}
4578
5d961407
LP
4579static int load_settings(void) {
4580 _cleanup_(settings_freep) Settings *settings = NULL;
4581 _cleanup_fclose_ FILE *f = NULL;
4582 _cleanup_free_ char *p = NULL;
4583 const char *fn, *i;
4584 int r;
4585
de40a303
LP
4586 if (arg_oci_bundle)
4587 return 0;
4588
5d961407
LP
4589 /* If all settings are masked, there's no point in looking for
4590 * the settings file */
d7a0f1f4 4591 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
5d961407
LP
4592 return 0;
4593
4594 fn = strjoina(arg_machine, ".nspawn");
4595
4596 /* We first look in the admin's directories in /etc and /run */
4597 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4598 _cleanup_free_ char *j = NULL;
4599
657ee2d8 4600 j = path_join(i, fn);
5d961407
LP
4601 if (!j)
4602 return log_oom();
4603
4604 f = fopen(j, "re");
4605 if (f) {
4606 p = TAKE_PTR(j);
4607
4608 /* By default, we trust configuration from /etc and /run */
4609 if (arg_settings_trusted < 0)
4610 arg_settings_trusted = true;
4611
4612 break;
4613 }
4614
4615 if (errno != ENOENT)
4616 return log_error_errno(errno, "Failed to open %s: %m", j);
4617 }
4618
4619 if (!f) {
4620 /* After that, let's look for a file next to the
4621 * actual image we shall boot. */
4622
4623 if (arg_image) {
4624 p = file_in_same_dir(arg_image, fn);
4625 if (!p)
4626 return log_oom();
cd6e3914 4627 } else if (arg_directory && !path_equal(arg_directory, "/")) {
5d961407
LP
4628 p = file_in_same_dir(arg_directory, fn);
4629 if (!p)
4630 return log_oom();
4631 }
4632
4633 if (p) {
4634 f = fopen(p, "re");
4635 if (!f && errno != ENOENT)
4636 return log_error_errno(errno, "Failed to open %s: %m", p);
4637
4638 /* By default, we do not trust configuration from /var/lib/machines */
4639 if (arg_settings_trusted < 0)
4640 arg_settings_trusted = false;
4641 }
4642 }
4643
4644 if (!f)
4645 return 0;
4646
4647 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4648
4649 r = settings_load(f, p, &settings);
4650 if (r < 0)
4651 return r;
4652
4653 return merge_settings(settings, p);
4654}
4655
de40a303
LP
4656static int load_oci_bundle(void) {
4657 _cleanup_(settings_freep) Settings *settings = NULL;
4658 int r;
4659
4660 if (!arg_oci_bundle)
4661 return 0;
4662
4663 /* By default let's trust OCI bundles */
4664 if (arg_settings_trusted < 0)
4665 arg_settings_trusted = true;
4666
4667 r = oci_load(NULL, arg_oci_bundle, &settings);
4668 if (r < 0)
4669 return r;
4670
4671 return merge_settings(settings, arg_oci_bundle);
4672}
4673
3acc84eb 4674static int run_container(
2d845785 4675 DissectedImage *dissected_image,
b0067625
ZJS
4676 bool secondary,
4677 FDSet *fds,
4678 char veth_name[IFNAMSIZ], bool *veth_created,
761cf19d 4679 struct ExposeArgs *expose_args,
3acc84eb 4680 int *master, pid_t *pid, int *ret) {
b0067625
ZJS
4681
4682 static const struct sigaction sa = {
4683 .sa_handler = nop_signal_handler,
e28c7cd0 4684 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4685 };
4686
8e766630 4687 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
4688 _cleanup_close_ int etc_passwd_lock = -1;
4689 _cleanup_close_pair_ int
4690 kmsg_socket_pair[2] = { -1, -1 },
4691 rtnl_socket_pair[2] = { -1, -1 },
4692 pid_socket_pair[2] = { -1, -1 },
4693 uuid_socket_pair[2] = { -1, -1 },
4694 notify_socket_pair[2] = { -1, -1 },
8199d554 4695 uid_shift_socket_pair[2] = { -1, -1 },
3acc84eb 4696 master_pty_socket_pair[2] = { -1, -1 },
8199d554
LP
4697 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4698
3acc84eb 4699 _cleanup_close_ int notify_socket = -1;
b0067625 4700 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4701 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4702 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4703 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4704 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4705 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2f893044
LP
4706 _cleanup_free_ uid_t *bind_user_uid = NULL;
4707 size_t n_bind_user_uid = 0;
b0067625 4708 ContainerStatus container_status = 0;
b0067625
ZJS
4709 int ifi = 0, r;
4710 ssize_t l;
4711 sigset_t mask_chld;
5b4855ab 4712 _cleanup_close_ int child_netns_fd = -1;
b0067625
ZJS
4713
4714 assert_se(sigemptyset(&mask_chld) == 0);
4715 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4716
4717 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4718 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4719 * check with getpwuid() if the specific user already exists. Note that /etc might be
4720 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4721 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4722 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4723 * really ours. */
4724
4725 etc_passwd_lock = take_etc_passwd_lock(NULL);
4726 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4727 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4728 }
4729
4730 r = barrier_create(&barrier);
4731 if (r < 0)
4732 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4733
4734 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4735 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4736
4737 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4738 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4739
4740 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4741 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4742
4743 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4744 return log_error_errno(errno, "Failed to create id socket pair: %m");
4745
4746 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4747 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4748
3acc84eb
FB
4749 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4750 return log_error_errno(errno, "Failed to create console socket pair: %m");
4751
b0067625
ZJS
4752 if (arg_userns_mode != USER_NAMESPACE_NO)
4753 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4754 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4755
8199d554
LP
4756 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4757 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4758 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4759
b0067625
ZJS
4760 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4761 * parent's blocking calls and give it a chance to call wait() and terminate. */
4762 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4763 if (r < 0)
4764 return log_error_errno(errno, "Failed to change the signal mask: %m");
4765
4766 r = sigaction(SIGCHLD, &sa, NULL);
4767 if (r < 0)
4768 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4769
d7bea6b6 4770 if (arg_network_namespace_path) {
5b4855ab
DDM
4771 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4772 if (child_netns_fd < 0)
d7bea6b6
DP
4773 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4774
54c2459d 4775 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
6619ad88
LP
4776 if (r == -EUCLEAN)
4777 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4778 else if (r < 0)
d7bea6b6 4779 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
c6147113
LP
4780 else if (r == 0)
4781 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4782 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4783 }
4784
b0067625
ZJS
4785 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4786 if (*pid < 0)
4787 return log_error_errno(errno, "clone() failed%s: %m",
4788 errno == EINVAL ?
4789 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4790
4791 if (*pid == 0) {
4792 /* The outer child only has a file system namespace. */
4793 barrier_set_role(&barrier, BARRIER_CHILD);
4794
b0067625
ZJS
4795 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4796 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4797 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4798 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4799 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3acc84eb 4800 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
b0067625 4801 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 4802 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
4803
4804 (void) reset_all_signal_handlers();
4805 (void) reset_signal_mask();
4806
4807 r = outer_child(&barrier,
4808 arg_directory,
2d845785 4809 dissected_image,
b0067625
ZJS
4810 secondary,
4811 pid_socket_pair[1],
4812 uuid_socket_pair[1],
4813 notify_socket_pair[1],
4814 kmsg_socket_pair[1],
4815 rtnl_socket_pair[1],
4816 uid_shift_socket_pair[1],
3acc84eb 4817 master_pty_socket_pair[1],
8199d554 4818 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6 4819 fds,
5b4855ab 4820 child_netns_fd);
b0067625
ZJS
4821 if (r < 0)
4822 _exit(EXIT_FAILURE);
4823
4824 _exit(EXIT_SUCCESS);
4825 }
4826
4827 barrier_set_role(&barrier, BARRIER_PARENT);
4828
e4077ff6 4829 fdset_close(fds);
b0067625
ZJS
4830
4831 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4832 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4833 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4834 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4835 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3acc84eb 4836 master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
b0067625 4837 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 4838 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
4839
4840 if (arg_userns_mode != USER_NAMESPACE_NO) {
4841 /* The child just let us know the UID shift it might have read from the image. */
4842 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4843 if (l < 0)
4844 return log_error_errno(errno, "Failed to read UID shift: %m");
c6147113
LP
4845 if (l != sizeof arg_uid_shift)
4846 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
b0067625
ZJS
4847
4848 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4849 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4850 * image, but if that's already in use, pick a new one, and report back to the child,
4851 * which one we now picked. */
4852
4853 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4854 if (r < 0)
4855 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4856
4857 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4858 if (l < 0)
4859 return log_error_errno(errno, "Failed to send UID shift: %m");
c6147113
LP
4860 if (l != sizeof arg_uid_shift)
4861 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
b0067625 4862 }
2f893044
LP
4863
4864 n_bind_user_uid = strv_length(arg_bind_user);
4865 if (n_bind_user_uid > 0) {
4866 /* Right after the UID shift, we'll receive the list of UID mappings for the
4867 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4868
4869 bind_user_uid = new(uid_t, n_bind_user_uid*4);
4870 if (!bind_user_uid)
4871 return log_oom();
4872
4873 for (size_t i = 0; i < n_bind_user_uid; i++) {
4874 l = recv(uid_shift_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
4875 if (l < 0)
4876 return log_error_errno(errno, "Failed to read user UID map pair: %m");
4877 if (l != sizeof(uid_t)*4)
4878 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4879 SYNTHETIC_ERRNO(EIO),
4880 "Short read while reading bind user UID pairs.");
4881 }
4882 }
b0067625
ZJS
4883 }
4884
8199d554
LP
4885 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4886 /* The child let us know the support cgroup mode it might have read from the image. */
4887 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4888 if (l < 0)
4889 return log_error_errno(errno, "Failed to read cgroup mode: %m");
c6147113
LP
4890 if (l != sizeof(arg_unified_cgroup_hierarchy))
4891 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4892 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4893 }
4894
b0067625 4895 /* Wait for the outer child. */
d2e0ac3d
LP
4896 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4897 if (r < 0)
4898 return r;
4899 if (r != EXIT_SUCCESS)
4900 return -EIO;
b0067625
ZJS
4901
4902 /* And now retrieve the PID of the inner child. */
4903 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4904 if (l < 0)
4905 return log_error_errno(errno, "Failed to read inner child PID: %m");
c6147113
LP
4906 if (l != sizeof *pid)
4907 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
b0067625
ZJS
4908
4909 /* We also retrieve container UUID in case it was generated by outer child */
4910 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4911 if (l < 0)
4912 return log_error_errno(errno, "Failed to read container machine ID: %m");
c6147113
LP
4913 if (l != sizeof(arg_uuid))
4914 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
b0067625
ZJS
4915
4916 /* We also retrieve the socket used for notifications generated by outer child */
4917 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4918 if (notify_socket < 0)
4919 return log_error_errno(notify_socket,
4920 "Failed to receive notification socket from the outer child: %m");
4921
4922 log_debug("Init process invoked as PID "PID_FMT, *pid);
4923
4924 if (arg_userns_mode != USER_NAMESPACE_NO) {
c6147113
LP
4925 if (!barrier_place_and_sync(&barrier)) /* #1 */
4926 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 4927
2f893044 4928 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
b0067625
ZJS
4929 if (r < 0)
4930 return r;
4931
4932 (void) barrier_place(&barrier); /* #2 */
4933 }
4934
4935 if (arg_private_network) {
75116558
PS
4936 if (!arg_network_namespace_path) {
4937 /* Wait until the child has unshared its network namespace. */
c6147113
LP
4938 if (!barrier_place_and_sync(&barrier)) /* #3 */
4939 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
75116558
PS
4940 }
4941
5b4855ab
DDM
4942 if (child_netns_fd < 0) {
4943 /* Make sure we have an open file descriptor to the child's network
4944 * namespace so it stays alive even if the child exits. */
4945 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4946 if (r < 0)
4947 return log_error_errno(r, "Failed to open child network namespace: %m");
4948 }
4949
4950 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
b0067625
ZJS
4951 if (r < 0)
4952 return r;
4953
4954 if (arg_network_veth) {
4955 r = setup_veth(arg_machine, *pid, veth_name,
4956 arg_network_bridge || arg_network_zone);
4957 if (r < 0)
4958 return r;
4959 else if (r > 0)
4960 ifi = r;
4961
4962 if (arg_network_bridge) {
4963 /* Add the interface to a bridge */
4964 r = setup_bridge(veth_name, arg_network_bridge, false);
4965 if (r < 0)
4966 return r;
4967 if (r > 0)
4968 ifi = r;
4969 } else if (arg_network_zone) {
4970 /* Add the interface to a bridge, possibly creating it */
4971 r = setup_bridge(veth_name, arg_network_zone, true);
4972 if (r < 0)
4973 return r;
4974 if (r > 0)
4975 ifi = r;
4976 }
4977 }
4978
4979 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4980 if (r < 0)
4981 return r;
4982
4983 /* We created the primary and extra veth links now; let's remember this, so that we know to
4984 remove them later on. Note that we don't bother with removing veth links that were created
4985 here when their setup failed half-way, because in that case the kernel should be able to
4986 remove them on its own, since they cannot be referenced by anything yet. */
4987 *veth_created = true;
4988
4989 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4990 if (r < 0)
4991 return r;
4992
4993 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4994 if (r < 0)
4995 return r;
4996 }
4997
abdb9b08
LP
4998 if (arg_register || !arg_keep_unit) {
4999 r = sd_bus_default_system(&bus);
5000 if (r < 0)
5001 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
5002
5003 r = sd_bus_set_close_on_exit(bus, false);
5004 if (r < 0)
5005 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
5006 }
5007
5008 if (!arg_keep_unit) {
5009 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5010 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5011 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5012
75152a4d
LP
5013 r = sd_bus_match_signal_async(
5014 bus,
5015 NULL,
5016 "org.freedesktop.systemd1",
5017 NULL,
5018 "org.freedesktop.systemd1.Scope",
5019 "RequestStop",
5020 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 5021 if (r < 0)
75152a4d 5022 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
5023 }
5024
b0067625
ZJS
5025 if (arg_register) {
5026 r = register_machine(
abdb9b08 5027 bus,
b0067625
ZJS
5028 arg_machine,
5029 *pid,
5030 arg_directory,
5031 arg_uuid,
5032 ifi,
5033 arg_slice,
5034 arg_custom_mounts, arg_n_custom_mounts,
5035 arg_kill_signal,
5036 arg_property,
de40a303 5037 arg_property_message,
b0067625
ZJS
5038 arg_keep_unit,
5039 arg_container_service_name);
5040 if (r < 0)
5041 return r;
abdb9b08 5042
cd2dfc6f
LP
5043 } else if (!arg_keep_unit) {
5044 r = allocate_scope(
abdb9b08 5045 bus,
cd2dfc6f
LP
5046 arg_machine,
5047 *pid,
5048 arg_slice,
5049 arg_custom_mounts, arg_n_custom_mounts,
5050 arg_kill_signal,
de40a303
LP
5051 arg_property,
5052 arg_property_message);
cd2dfc6f
LP
5053 if (r < 0)
5054 return r;
5055
5056 } else if (arg_slice || arg_property)
5057 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 5058
27da7ef0 5059 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
5060 if (r < 0)
5061 return r;
5062
27da7ef0 5063 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
5064 if (r < 0)
5065 return r;
b0067625 5066
de54e02d 5067 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
5068 if (r < 0)
5069 return r;
5070
5071 /* Notify the child that the parent is ready with all
5072 * its setup (including cgroup-ification), and that
5073 * the child can now hand over control to the code to
5074 * run inside the container. */
75116558 5075 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
5076
5077 /* Block SIGCHLD here, before notifying child.
5078 * process_pty() will handle it with the other signals. */
5079 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5080
5081 /* Reset signal to default */
9c274488 5082 r = default_signals(SIGCHLD);
b0067625
ZJS
5083 if (r < 0)
5084 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5085
5086 r = sd_event_new(&event);
5087 if (r < 0)
5088 return log_error_errno(r, "Failed to get default event source: %m");
5089
8fd010bb
LP
5090 (void) sd_event_set_watchdog(event, true);
5091
abdb9b08
LP
5092 if (bus) {
5093 r = sd_bus_attach_event(bus, event, 0);
5094 if (r < 0)
5095 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5096 }
5097
e96ceaba 5098 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
5099 if (r < 0)
5100 return r;
5101
5102 /* Let the child know that we are ready and wait that the child is completely ready now. */
c6147113
LP
5103 if (!barrier_place_and_sync(&barrier)) /* #5 */
5104 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
b0067625 5105
38ccb557 5106 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
b0067625
ZJS
5107 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5108 etc_passwd_lock = safe_close(etc_passwd_lock);
5109
04f590a4
LP
5110 (void) sd_notifyf(false,
5111 "STATUS=Container running.\n"
5112 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
b0067625 5113 if (!arg_notify_ready)
919f5ae0 5114 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
5115
5116 if (arg_kill_signal > 0) {
5117 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
5118 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5119 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
5120 } else {
5121 /* Immediately exit */
919f5ae0
LP
5122 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5123 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
5124 }
5125
6916b164 5126 /* Exit when the child exits */
919f5ae0 5127 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
5128
5129 if (arg_expose_ports) {
761cf19d 5130 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, expose_args, &rtnl);
b0067625
ZJS
5131 if (r < 0)
5132 return r;
5133
deff68e7
FW
5134 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5135 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5136 }
5137
5138 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
5139
3acc84eb
FB
5140 if (arg_console_mode != CONSOLE_PIPE) {
5141 _cleanup_close_ int fd = -1;
5142 PTYForwardFlags flags = 0;
de40a303 5143
3acc84eb
FB
5144 /* Retrieve the master pty allocated by inner child */
5145 fd = receive_one_fd(master_pty_socket_pair[0], 0);
5146 if (fd < 0)
5147 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5148
5149 switch (arg_console_mode) {
de40a303 5150
3acc84eb
FB
5151 case CONSOLE_READ_ONLY:
5152 flags |= PTY_FORWARD_READ_ONLY;
5153
5154 _fallthrough_;
5155
5156 case CONSOLE_INTERACTIVE:
5157 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5158
5159 r = pty_forward_new(event, fd, flags, &forward);
5160 if (r < 0)
5161 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5162
f5fbe71d 5163 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
3acc84eb
FB
5164 (void) pty_forward_set_width_height(forward,
5165 arg_console_width,
5166 arg_console_height);
5167 break;
5168
5169 default:
5170 assert(arg_console_mode == CONSOLE_PASSIVE);
5171 }
5172
5173 *master = TAKE_FD(fd);
de40a303 5174 }
b0067625
ZJS
5175
5176 r = sd_event_loop(event);
5177 if (r < 0)
5178 return log_error_errno(r, "Failed to run event loop: %m");
5179
de40a303
LP
5180 if (forward) {
5181 char last_char = 0;
b0067625 5182
de40a303
LP
5183 (void) pty_forward_get_last_char(forward, &last_char);
5184 forward = pty_forward_free(forward);
b0067625 5185
de40a303
LP
5186 if (!arg_quiet && last_char != '\n')
5187 putc('\n', stdout);
5188 }
b0067625
ZJS
5189
5190 /* Kill if it is not dead yet anyway */
0bb0a9fa
ZJS
5191 if (!arg_register && !arg_keep_unit && bus)
5192 terminate_scope(bus, arg_machine);
b0067625
ZJS
5193
5194 /* Normally redundant, but better safe than sorry */
c67b0082 5195 (void) kill(*pid, SIGKILL);
b0067625 5196
5b4855ab
DDM
5197 if (arg_private_network) {
5198 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5199 * to avoid having to move the parent to the child network namespace. */
5200 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
5201 if (r < 0)
5202 return r;
5203
5204 if (r == 0) {
5205 _cleanup_close_ int parent_netns_fd = -1;
5206
5207 r = namespace_open(getpid(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5208 if (r < 0) {
5209 log_error_errno(r, "Failed to open parent network namespace: %m");
5210 _exit(EXIT_FAILURE);
5211 }
5212
5213 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5214 if (r < 0) {
5215 log_error_errno(r, "Failed to enter child network namespace: %m");
5216 _exit(EXIT_FAILURE);
5217 }
5218
5219 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5220 if (r < 0)
5221 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5222
5223 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5224 }
5225 }
5226
b0067625
ZJS
5227 r = wait_for_container(*pid, &container_status);
5228 *pid = 0;
5229
0bb0a9fa
ZJS
5230 /* Tell machined that we are gone. */
5231 if (bus)
5232 (void) unregister_machine(bus, arg_machine);
5233
b0067625
ZJS
5234 if (r < 0)
5235 /* We failed to wait for the container, or the container exited abnormally. */
5236 return r;
5237 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
5238 /* r > 0 → The container exited with a non-zero status.
5239 * As a special case, we need to replace 133 with a different value,
5240 * because 133 is special-cased in the service file to reboot the container.
5241 * otherwise → The container exited with zero status and a reboot was not requested.
5242 */
2a49b612 5243 if (r == EXIT_FORCE_RESTART)
27e29a1e 5244 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 5245 *ret = r;
b0067625
ZJS
5246 return 0; /* finito */
5247 }
5248
5249 /* CONTAINER_REBOOTED, loop again */
5250
5251 if (arg_keep_unit) {
5252 /* Special handling if we are running as a service: instead of simply
5253 * restarting the machine we want to restart the entire service, so let's
5254 * inform systemd about this with the special exit code 133. The service
5255 * file uses RestartForceExitStatus=133 so that this results in a full
5256 * nspawn restart. This is necessary since we might have cgroup parameters
5257 * set we want to have flushed out. */
2a49b612
ZJS
5258 *ret = EXIT_FORCE_RESTART;
5259 return 0; /* finito */
b0067625
ZJS
5260 }
5261
deff68e7
FW
5262 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5263 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
b0067625
ZJS
5264
5265 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5266 *veth_created = false;
5267 return 1; /* loop again */
5268}
5269
bf428efb 5270static int initialize_rlimits(void) {
bf428efb
LP
5271 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
5272 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5273 * container execution environments. */
5274
5275 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
5276 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5277 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5278 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5279 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5280 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5281 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5282 [RLIMIT_MEMLOCK] = { 65536, 65536 },
5283 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5284 [RLIMIT_NICE] = { 0, 0 },
5285 [RLIMIT_NOFILE] = { 1024, 4096 },
5286 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5287 [RLIMIT_RTPRIO] = { 0, 0 },
5288 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5289 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
5290
5291 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5292 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5293 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5294 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5295 * that PID 1 changes a number of other resource limits during early initialization which is why we
5296 * don't read the other limits from PID 1 but prefer the static table above. */
5297 };
5298
5299 int rl;
5300
5301 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
5302 /* Let's only fill in what the user hasn't explicitly configured anyway */
5303 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5304 const struct rlimit *v;
5305 struct rlimit buffer;
5306
5307 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5308 /* For these two let's read the limits off PID 1. See above for an explanation. */
5309
5310 if (prlimit(1, rl, NULL, &buffer) < 0)
5311 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5312
5313 v = &buffer;
5314 } else
5315 v = kernel_defaults + rl;
5316
5317 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5318 if (!arg_rlimit[rl])
5319 return log_oom();
5320 }
5321
5322 if (DEBUG_LOGGING) {
5323 _cleanup_free_ char *k = NULL;
5324
5325 (void) rlimit_format(arg_rlimit[rl], &k);
5326 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5327 }
5328 }
5329
5330 return 0;
5331}
5332
287b7376
LP
5333static int cant_be_in_netns(void) {
5334 union sockaddr_union sa = {
5335 .un = {
5336 .sun_family = AF_UNIX,
5337 .sun_path = "/run/udev/control",
5338 },
5339 };
5340 char udev_path[STRLEN("/proc//ns/net") + DECIMAL_STR_MAX(pid_t)];
5341 _cleanup_free_ char *udev_ns = NULL, *our_ns = NULL;
5342 _cleanup_close_ int fd = -1;
5343 struct ucred ucred;
5344 int r;
5345
5346 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5347 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5348 * nice message. */
5349
5350 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5351 return 0;
5352
5353 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5354 if (fd < 0)
5355 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5356
5357 if (connect(fd, &sa.un, SOCKADDR_UN_LEN(sa.un)) < 0) {
5358
5359 if (errno == ENOENT || ERRNO_IS_DISCONNECT(errno))
5360 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5361 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5362
5363 return log_error_errno(errno, "Failed to connect socket to udev control socket: %m");
5364 }
5365
5366 r = getpeercred(fd, &ucred);
5367 if (r < 0)
5368 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5369
5370 xsprintf(udev_path, "/proc/" PID_FMT "/ns/net", ucred.pid);
5371 r = readlink_malloc(udev_path, &udev_ns);
5372 if (r < 0)
5373 return log_error_errno(r, "Failed to read network namespace of udev: %m");
5374
5375 r = readlink_malloc("/proc/self/ns/net", &our_ns);
5376 if (r < 0)
5377 return log_error_errno(r, "Failed to read our own network namespace: %m");
5378
5379 if (!streq(our_ns, udev_ns))
5380 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5381 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5382 return 0;
5383}
5384
44dbef90 5385static int run(int argc, char *argv[]) {
7bf011e3
LP
5386 bool secondary = false, remove_directory = false, remove_image = false,
5387 veth_created = false, remove_tmprootdir = false;
2d845785 5388 _cleanup_close_ int master = -1;
03cfe0d5 5389 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 5390 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 5391 char veth_name[IFNAMSIZ] = "";
761cf19d 5392 struct ExposeArgs expose_args = {};
8e766630 5393 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082 5394 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 5395 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
5396 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
5397 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
761cf19d 5398 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
7bf011e3 5399 pid_t pid = 0;
03cfe0d5
LP
5400
5401 log_parse_environment();
5402 log_open();
415fc41c 5403
03cfe0d5
LP
5404 r = parse_argv(argc, argv);
5405 if (r <= 0)
5406 goto finish;
5407
38ee19c0
ZJS
5408 if (geteuid() != 0) {
5409 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5410 argc >= 2 ? "Need to be root." :
5411 "Need to be root (and some arguments are usually required).\nHint: try --help");
03cfe0d5 5412 goto finish;
38ee19c0 5413 }
fba868fa 5414
287b7376
LP
5415 r = cant_be_in_netns();
5416 if (r < 0)
5417 goto finish;
5418
bf428efb
LP
5419 r = initialize_rlimits();
5420 if (r < 0)
5421 goto finish;
5422
de40a303
LP
5423 r = load_oci_bundle();
5424 if (r < 0)
5425 goto finish;
5426
f757855e
LP
5427 r = determine_names();
5428 if (r < 0)
5429 goto finish;
5430
5431 r = load_settings();
5432 if (r < 0)
5433 goto finish;
5434
d4d99bc6 5435 r = cg_unified();
5eee8290
LP
5436 if (r < 0) {
5437 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5438 goto finish;
5439 }
5440
f757855e
LP
5441 r = verify_arguments();
5442 if (r < 0)
5443 goto finish;
03cfe0d5 5444
49048684
ZJS
5445 /* Reapply environment settings. */
5446 (void) detect_unified_cgroup_hierarchy_from_environment();
8199d554 5447
2949ff26
LP
5448 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5449 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5450 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
9c274488 5451 (void) ignore_signals(SIGPIPE);
2949ff26 5452
03cfe0d5
LP
5453 n_fd_passed = sd_listen_fds(false);
5454 if (n_fd_passed > 0) {
5455 r = fdset_new_listen_fds(&fds, false);
5456 if (r < 0) {
5457 log_error_errno(r, "Failed to collect file descriptors: %m");
5458 goto finish;
5459 }
5460 }
5461
83e803a9
ZJS
5462 /* The "default" umask. This is appropriate for most file and directory
5463 * operations performed by nspawn, and is the umask that will be used for
5464 * the child. Functions like copy_devnodes() change the umask temporarily. */
5465 umask(0022);
5466
03cfe0d5
LP
5467 if (arg_directory) {
5468 assert(!arg_image);
5469
b35ca61a
LP
5470 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5471 * /var from the host will propagate into container dynamically (because bad things happen if
5472 * two systems write to the same /var). Let's allow it for the special cases where /var is
5473 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5474 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5475 log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
03cfe0d5
LP
5476 r = -EINVAL;
5477 goto finish;
5478 }
5479
5480 if (arg_ephemeral) {
5481 _cleanup_free_ char *np = NULL;
5482
8d4aa2bb 5483 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
5484 if (r < 0)
5485 goto finish;
5486
7bf011e3
LP
5487 /* If the specified path is a mount point we generate the new snapshot immediately
5488 * inside it under a random name. However if the specified is not a mount point we
5489 * create the new snapshot in the parent directory, just next to it. */
e1873695 5490 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
5491 if (r < 0) {
5492 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5493 goto finish;
5494 }
5495 if (r > 0)
770b5ce4 5496 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 5497 else
770b5ce4 5498 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 5499 if (r < 0) {
0f3be6ca 5500 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
5501 goto finish;
5502 }
5503
6992459c 5504 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
162392b7 5505 * only owned by us and no one else. */
6992459c 5506 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
03cfe0d5
LP
5507 if (r < 0) {
5508 log_error_errno(r, "Failed to lock %s: %m", np);
5509 goto finish;
5510 }
5511
7bf011e3
LP
5512 {
5513 BLOCK_SIGNALS(SIGINT);
5514 r = btrfs_subvol_snapshot(arg_directory, np,
5515 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5516 BTRFS_SNAPSHOT_FALLBACK_COPY |
5517 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5518 BTRFS_SNAPSHOT_RECURSIVE |
5519 BTRFS_SNAPSHOT_QUOTA |
5520 BTRFS_SNAPSHOT_SIGINT);
5521 }
5522 if (r == -EINTR) {
5523 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5524 goto finish;
5525 }
03cfe0d5
LP
5526 if (r < 0) {
5527 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5528 goto finish;
ec16945e
LP
5529 }
5530
1cc6c93a 5531 free_and_replace(arg_directory, np);
17cbb288 5532 remove_directory = true;
30535c16 5533 } else {
cb638b5e 5534 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
5535 if (r < 0)
5536 goto finish;
5537
30535c16
LP
5538 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5539 if (r == -EBUSY) {
5540 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5541 goto finish;
5542 }
5543 if (r < 0) {
5544 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 5545 goto finish;
30535c16
LP
5546 }
5547
5548 if (arg_template) {
8d4aa2bb 5549 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
5550 if (r < 0)
5551 goto finish;
5552
7bf011e3
LP
5553 {
5554 BLOCK_SIGNALS(SIGINT);
5555 r = btrfs_subvol_snapshot(arg_template, arg_directory,
5556 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5557 BTRFS_SNAPSHOT_FALLBACK_COPY |
5558 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5559 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5560 BTRFS_SNAPSHOT_RECURSIVE |
5561 BTRFS_SNAPSHOT_QUOTA |
5562 BTRFS_SNAPSHOT_SIGINT);
5563 }
ff6c6cc1
LP
5564 if (r == -EEXIST)
5565 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5566 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
7bf011e3
LP
5567 else if (r == -EINTR) {
5568 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5569 goto finish;
5570 } else if (r < 0) {
83521414 5571 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 5572 goto finish;
ff6c6cc1
LP
5573 } else
5574 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5575 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 5576 }
ec16945e
LP
5577 }
5578
7732f92b 5579 if (arg_start_mode == START_BOOT) {
a5201ed6 5580 const char *p;
c9fe05e0 5581
a5201ed6
LP
5582 if (arg_pivot_root_new)
5583 p = prefix_roota(arg_directory, arg_pivot_root_new);
5584 else
5585 p = arg_directory;
c9fe05e0
AR
5586
5587 if (path_is_os_tree(p) <= 0) {
5588 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 5589 r = -EINVAL;
1b9e5b12
LP
5590 goto finish;
5591 }
5592 } else {
c9fe05e0
AR
5593 const char *p, *q;
5594
a5201ed6
LP
5595 if (arg_pivot_root_new)
5596 p = prefix_roota(arg_directory, arg_pivot_root_new);
5597 else
5598 p = arg_directory;
c9fe05e0
AR
5599
5600 q = strjoina(p, "/usr/");
1b9e5b12 5601
c9fe05e0
AR
5602 if (laccess(q, F_OK) < 0) {
5603 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 5604 r = -EINVAL;
1b9e5b12 5605 goto finish;
1b9e5b12
LP
5606 }
5607 }
ec16945e 5608
6b9132a9 5609 } else {
d04faa4e 5610 DissectImageFlags dissect_image_flags =
4b5de5dd 5611 DISSECT_IMAGE_GENERIC_ROOT |
d04faa4e
LP
5612 DISSECT_IMAGE_REQUIRE_ROOT |
5613 DISSECT_IMAGE_RELAX_VAR_CHECK |
5614 DISSECT_IMAGE_USR_NO_ROOT;
ec16945e
LP
5615 assert(arg_image);
5616 assert(!arg_template);
5617
8d4aa2bb 5618 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
5619 if (r < 0)
5620 goto finish;
5621
0f3be6ca
LP
5622 if (arg_ephemeral) {
5623 _cleanup_free_ char *np = NULL;
5624
5625 r = tempfn_random(arg_image, "machine.", &np);
5626 if (r < 0) {
5627 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5628 goto finish;
5629 }
5630
6992459c
LP
5631 /* Always take an exclusive lock on our own ephemeral copy. */
5632 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
0f3be6ca
LP
5633 if (r < 0) {
5634 r = log_error_errno(r, "Failed to create image lock: %m");
5635 goto finish;
5636 }
5637
7bf011e3
LP
5638 {
5639 BLOCK_SIGNALS(SIGINT);
5640 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
5641 }
5642 if (r == -EINTR) {
5643 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5644 goto finish;
5645 }
0f3be6ca
LP
5646 if (r < 0) {
5647 r = log_error_errno(r, "Failed to copy image file: %m");
5648 goto finish;
5649 }
5650
1cc6c93a 5651 free_and_replace(arg_image, np);
0f3be6ca
LP
5652 remove_image = true;
5653 } else {
5654 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5655 if (r == -EBUSY) {
5656 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5657 goto finish;
5658 }
5659 if (r < 0) {
5660 r = log_error_errno(r, "Failed to create image lock: %m");
5661 goto finish;
5662 }
4623e8e6 5663
89e62e0b
LP
5664 r = verity_settings_load(
5665 &arg_verity_settings,
5666 arg_image, NULL, NULL);
e7cbe5cb
LB
5667 if (r < 0) {
5668 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5669 goto finish;
78ebe980 5670 }
89e62e0b
LP
5671
5672 if (arg_verity_settings.data_path)
5673 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
30535c16
LP
5674 }
5675
c67b0082 5676 if (!mkdtemp(tmprootdir)) {
0f3be6ca 5677 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 5678 goto finish;
1b9e5b12 5679 }
6b9132a9 5680
c67b0082
LP
5681 remove_tmprootdir = true;
5682
5683 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
5684 if (!arg_directory) {
5685 r = log_oom();
5686 goto finish;
6b9132a9 5687 }
88213476 5688
89e62e0b
LP
5689 r = loop_device_make_by_path(
5690 arg_image,
5691 arg_read_only ? O_RDONLY : O_RDWR,
5692 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
5693 &loop);
2d845785
LP
5694 if (r < 0) {
5695 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
5696 goto finish;
5697 }
1b9e5b12 5698
4526113f 5699 r = dissect_image_and_warn(
e0f9e7bd 5700 loop->fd,
4526113f 5701 arg_image,
89e62e0b 5702 &arg_verity_settings,
18d73705 5703 NULL,
75dc190d 5704 loop->uevent_seqnum_not_before,
4a62257d 5705 loop->timestamp_not_before,
e7cbe5cb 5706 dissect_image_flags,
e0f9e7bd 5707 &dissected_image);
2d845785 5708 if (r == -ENOPKG) {
4526113f 5709 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
5710 log_notice("Note that the disk image needs to\n"
5711 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5712 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
19ac32cd 5713 " c) or follow https://systemd.io/DISCOVERABLE_PARTITIONS\n"
2d845785
LP
5714 " d) or contain a file system without a partition table\n"
5715 "in order to be bootable with systemd-nspawn.");
1b9e5b12 5716 goto finish;
2d845785 5717 }
4526113f 5718 if (r < 0)
842f3b0f 5719 goto finish;
1b9e5b12 5720
89e62e0b 5721 if (!arg_verity_settings.root_hash && dissected_image->can_verity)
4623e8e6
LP
5722 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
5723
89e62e0b
LP
5724 r = dissected_image_decrypt_interactively(
5725 dissected_image,
5726 NULL,
5727 &arg_verity_settings,
5728 0,
5729 &decrypted_image);
1b9e5b12
LP
5730 if (r < 0)
5731 goto finish;
0f3be6ca
LP
5732
5733 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5734 if (remove_image && unlink(arg_image) >= 0)
5735 remove_image = false;
842f3b0f 5736 }
842f3b0f 5737
86c0dd4a 5738 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
5739 if (r < 0)
5740 goto finish;
5741
de40a303
LP
5742 if (arg_console_mode < 0)
5743 arg_console_mode =
5744 isatty(STDIN_FILENO) > 0 &&
5745 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 5746
de40a303
LP
5747 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5748 arg_quiet = true;
a258bf26 5749
9c857b9d
LP
5750 if (!arg_quiet)
5751 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5752 arg_machine, arg_image ?: arg_directory);
5753
72c0a2c2 5754 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 5755
66edd963 5756 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
5757 r = log_error_errno(errno, "Failed to become subreaper: %m");
5758 goto finish;
5759 }
5760
761cf19d
FW
5761 if (arg_expose_ports) {
5762 r = fw_ctx_new(&fw_ctx);
5763 if (r < 0) {
5764 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5765 goto finish;
5766 }
5767 expose_args.fw_ctx = fw_ctx;
5768 }
d87be9b0 5769 for (;;) {
3acc84eb 5770 r = run_container(dissected_image,
44dbef90
LP
5771 secondary,
5772 fds,
5773 veth_name, &veth_created,
761cf19d 5774 &expose_args, &master,
44dbef90 5775 &pid, &ret);
b0067625 5776 if (r <= 0)
d87be9b0 5777 break;
d87be9b0 5778 }
88213476
LP
5779
5780finish:
04f590a4
LP
5781 (void) sd_notify(false,
5782 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5783 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5784
9444b1f2 5785 if (pid > 0)
c67b0082 5786 (void) kill(pid, SIGKILL);
88213476 5787
503546da 5788 /* Try to flush whatever is still queued in the pty */
6a0f896b 5789 if (master >= 0) {
f5fbe71d 5790 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
6a0f896b
LP
5791 master = safe_close(master);
5792 }
5793
5794 if (pid > 0)
5795 (void) wait_for_terminate(pid, NULL);
503546da 5796
50ebcf6c
LP
5797 pager_close();
5798
17cbb288 5799 if (remove_directory && arg_directory) {
ec16945e
LP
5800 int k;
5801
17cbb288 5802 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5803 if (k < 0)
17cbb288 5804 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5805 }
5806
0f3be6ca
LP
5807 if (remove_image && arg_image) {
5808 if (unlink(arg_image) < 0)
5809 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5810 }
5811
c67b0082
LP
5812 if (remove_tmprootdir) {
5813 if (rmdir(tmprootdir) < 0)
5814 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5815 }
5816
785890ac
LP
5817 if (arg_machine) {
5818 const char *p;
5819
63c372cb 5820 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5821 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5822 }
5823
deff68e7
FW
5824 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5825 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
7513c5b8
LP
5826
5827 if (veth_created)
5828 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5829 (void) remove_bridge(arg_network_zone);
f757855e 5830
f757855e
LP
5831 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5832 expose_port_free_all(arg_expose_ports);
bf428efb 5833 rlimit_free_all(arg_rlimit);
b2645747 5834 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3652872a 5835 credential_free_all(arg_credentials, arg_n_credentials);
6d0b55c2 5836
44dbef90
LP
5837 if (r < 0)
5838 return r;
5839
5840 return ret;
88213476 5841}
44dbef90
LP
5842
/* Program entry point, generated around run() by a macro defined outside this file.
 * NOTE(review): going by the macro name, positive return values of run() are presumably treated as failure
 * exit codes — confirm against the macro definition. */
5843DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);