]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
missing: add FS_PROJINHERIT_FL
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
88213476 2
349cc4a5 3#if HAVE_BLKID
6b5cf3ea 4#include <blkid.h>
8fe0087e 5#endif
88213476 6#include <errno.h>
88213476 7#include <getopt.h>
0e7ac751 8#include <grp.h>
503f480f 9#include <linux/fs.h>
1b9e5b12 10#include <linux/loop.h>
0e7ac751 11#include <pwd.h>
8fe0087e 12#include <sched.h>
349cc4a5 13#if HAVE_SELINUX
8fe0087e 14#include <selinux/selinux.h>
1b9e5b12 15#endif
8fe0087e
LP
16#include <signal.h>
17#include <stdio.h>
18#include <stdlib.h>
19#include <string.h>
20#include <sys/file.h>
8fe0087e
LP
21#include <sys/personality.h>
22#include <sys/prctl.h>
23#include <sys/types.h>
6916b164 24#include <sys/wait.h>
8fe0087e 25#include <unistd.h>
1b9e5b12 26
b053cd5f 27#include "sd-bus.h"
1f0cd86b 28#include "sd-daemon.h"
1f0cd86b 29#include "sd-id128.h"
8fe0087e 30
b5efdb8a 31#include "alloc-util.h"
8fe0087e
LP
32#include "barrier.h"
33#include "base-filesystem.h"
34#include "blkid-util.h"
35#include "btrfs-util.h"
b8ea7a6e 36#include "bus-error.h"
b053cd5f 37#include "bus-util.h"
8fe0087e 38#include "cap-list.h"
430f0182 39#include "capability-util.h"
04d391da 40#include "cgroup-util.h"
8fe0087e 41#include "copy.h"
d107bb7d 42#include "cpu-set-util.h"
4fc9982c 43#include "dev-setup.h"
2d845785 44#include "dissect-image.h"
8fe0087e 45#include "env-util.h"
3ffd4af2 46#include "fd-util.h"
842f3b0f 47#include "fdset.h"
a5c32cff 48#include "fileio.h"
f97b34a6 49#include "format-util.h"
f4f15635 50#include "fs-util.h"
1b9e5b12 51#include "gpt.h"
4623e8e6 52#include "hexdecoct.h"
8fe0087e 53#include "hostname-util.h"
910fd145 54#include "id128-util.h"
8fe0087e 55#include "log.h"
2d845785 56#include "loop-util.h"
8fe0087e 57#include "loopback-setup.h"
1b9cebf6 58#include "machine-image.h"
8fe0087e 59#include "macro.h"
44dbef90 60#include "main-func.h"
8fe0087e
LP
61#include "missing.h"
62#include "mkdir.h"
4349cd7c 63#include "mount-util.h"
049af8ad 64#include "mountpoint-util.h"
0cb8e3d1 65#include "namespace-util.h"
8fe0087e 66#include "netlink-util.h"
07630cea 67#include "nspawn-cgroup.h"
3603efde 68#include "nspawn-def.h"
07630cea
LP
69#include "nspawn-expose-ports.h"
70#include "nspawn-mount.h"
71#include "nspawn-network.h"
de40a303 72#include "nspawn-oci.h"
7336138e 73#include "nspawn-patch-uid.h"
07630cea 74#include "nspawn-register.h"
910fd145 75#include "nspawn-seccomp.h"
07630cea
LP
76#include "nspawn-settings.h"
77#include "nspawn-setuid.h"
7732f92b 78#include "nspawn-stub-pid1.h"
d8b4d14d 79#include "nulstr-util.h"
d58ad743 80#include "os-util.h"
50ebcf6c 81#include "pager.h"
6bedfcbb 82#include "parse-util.h"
8fe0087e 83#include "path-util.h"
294bf0c3 84#include "pretty-print.h"
0b452006 85#include "process-util.h"
8fe0087e
LP
86#include "ptyfwd.h"
87#include "random-util.h"
8869a0b4 88#include "raw-clone.h"
bf428efb 89#include "rlimit-util.h"
8fe0087e 90#include "rm-rf.h"
de40a303
LP
91#if HAVE_SECCOMP
92#include "seccomp-util.h"
93#endif
68b02049 94#include "selinux-util.h"
8fe0087e 95#include "signal-util.h"
2583fbea 96#include "socket-util.h"
8fcde012 97#include "stat-util.h"
15a5e950 98#include "stdio-util.h"
5c828e66 99#include "string-table.h"
07630cea 100#include "string-util.h"
8fe0087e 101#include "strv.h"
de40a303 102#include "sysctl-util.h"
8fe0087e 103#include "terminal-util.h"
e4de7287 104#include "tmpfile-util.h"
affb60b1 105#include "umask-util.h"
b1d4f8e1 106#include "user-util.h"
8fe0087e 107#include "util.h"
e9642be2 108
62b1e758
YW
/* Location of the static resolv.conf file, chosen at build time: below /lib when HAVE_SPLIT_USR is set,
 * below /usr/lib otherwise. */
#if HAVE_SPLIT_USR
#define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
#else
#define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
#endif
114
9c1e04d0
AP
/* Path of the notification socket nspawn listens on. The path is relative to the container's root. The init
 * process running inside the container can send messages to nspawn through this socket, following the
 * sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 119
2a49b612
ZJS
/* NOTE(review): presumably the exit status that requests a restart of the container; the code using this
 * constant lies outside the part of the file reviewed here — confirm before relying on it. */
#define EXIT_FORCE_RESTART 133
121
113cea80
DH
/* The two ways a container run can end. NOTE(review): meaning inferred from the enumerator names; the code
 * producing and consuming these values is further down the file. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
126
/* Effective configuration of this nspawn invocation. The variables start out with the defaults given here and
 * are overridden by parse_environment() and parse_argv() below. */

/* Where the container's file system comes from and how the payload is started */
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_chdir = NULL;
static char *arg_pivot_root_new = NULL;
static char *arg_pivot_root_old = NULL;
static char *arg_user = NULL;
static uid_t arg_uid = UID_INVALID;
static gid_t arg_gid = GID_INVALID;
static gid_t* arg_supplementary_gids = NULL;
static size_t arg_n_supplementary_gids = 0;
static sd_id128_t arg_uuid = {};
static char *arg_machine = NULL; /* The name used by the host to refer to this */
static char *arg_hostname = NULL; /* The name the payload sees by default */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static char *arg_slice = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static StartMode arg_start_mode = START_PID1;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;

/* Bit mask (one bit per CAP_* number) of the capabilities kept by default; the help text describes
 * --capability= and --drop-capability= as modifying this default set. */
static uint64_t arg_caps_retain =
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_MKNOD) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_TTY_CONFIG);
static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;

/* Mounts, environment, registration and networking */
static CustomMount *arg_custom_mounts = NULL;
static size_t arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_register = true;
static bool arg_keep_unit = false;
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static char **arg_network_veth_extra = NULL;
static char *arg_network_bridge = NULL;
static char *arg_network_zone = NULL;
static char *arg_network_namespace_path = NULL;
static PagerFlags arg_pager_flags = 0;
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
static char *arg_oci_bundle = NULL;
static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
static sd_bus_message *arg_property_message = NULL;

/* User namespacing */
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns_chown = false;
static int arg_kill_signal = 0;

/* Set by detect_unified_cgroup_hierarchy_from_environment() or ..._from_image() */
static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;

/* A SETTING_* bit is ORed in here whenever the corresponding setting is given explicitly on the command line or
 * through the environment. */
static SettingsMask arg_settings_mask = 0;
static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
static const char *arg_container_service_name = "systemd-nspawn"; /* $SYSTEMD_NSPAWN_CONTAINER_SERVICE overrides this */
static bool arg_notify_ready = false;
static bool arg_use_cgns = true; /* See parse_environment() */
static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS; /* See parse_share_ns_env() */
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP; /* See parse_mount_settings_env() */
static void *arg_root_hash = NULL;
static size_t arg_root_hash_size = 0;

/* System call filtering, resource limits and scheduling */
static char **arg_syscall_whitelist = NULL;
static char **arg_syscall_blacklist = NULL;
#if HAVE_SECCOMP
static scmp_filter_ctx arg_seccomp = NULL;
#endif
static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
static bool arg_no_new_privileges = false;
static int arg_oom_score_adjust = 0;
static bool arg_oom_score_adjust_set = false;
static cpu_set_t *arg_cpuset = NULL;
static unsigned arg_cpuset_ncpus = 0;

/* Integration with the host and console set-up */
static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
static TimezoneMode arg_timezone = TIMEZONE_AUTO;
static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
static DeviceNode* arg_extra_nodes = NULL;
static size_t arg_n_extra_nodes = 0;
static char **arg_sysctl = NULL;
static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
88213476 232
6145bb4f
LP
/* Pair each option variable that owns a resource with the function that releases it: freep for plain
 * allocations, strv_freep for string vectors, and type-specific helpers for the rest.
 * NOTE(review): when these destructors actually run is decided by the STATIC_DESTRUCTOR_REGISTER machinery,
 * which is defined outside this file — confirm there. */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_root_hash, freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_whitelist, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_blacklist, strv_freep);
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpuset, CPU_FREEp);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
264
37ec0fdd
LP
265static int help(void) {
266 _cleanup_free_ char *link = NULL;
267 int r;
268
bb068de0 269 (void) pager_open(arg_pager_flags);
50ebcf6c 270
37ec0fdd
LP
271 r = terminal_urlify_man("systemd-nspawn", "1", &link);
272 if (r < 0)
273 return log_oom();
274
25148653 275 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
a7e2e50d 276 "Spawn a command or OS in a light-weight container.\n\n"
a8828ed9
DW
277 " -h --help Show this help\n"
278 " --version Print version string\n"
69c79d3c 279 " -q --quiet Do not show status information\n"
bb068de0 280 " --no-pager Do not pipe output into a pager\n"
25148653
LP
281 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
282 "%3$sImage:%4$s\n"
1b9e5b12 283 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
284 " --template=PATH Initialize root directory from template directory,\n"
285 " if missing\n"
286 " -x --ephemeral Run container with snapshot of root directory, and\n"
287 " remove it after exit\n"
25e68fd3
LP
288 " -i --image=PATH Root file system disk image (or device node) for\n"
289 " the container\n"
de40a303 290 " --oci-bundle=PATH OCI bundle directory\n"
25148653
LP
291 " --read-only Mount the root directory read-only\n"
292 " --volatile[=MODE] Run the system in volatile mode\n"
25e68fd3 293 " --root-hash=HASH Specify verity root hash for root disk image\n"
25148653
LP
294 " --pivot-root=PATH[:PATH]\n"
295 " Pivot root to given directory in the container\n\n"
296 "%3$sExecution:%4$s\n"
7732f92b 297 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 298 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 299 " --chdir=PATH Set working directory in the container\n"
25148653
LP
300 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
301 " -u --user=USER Run the command under specified user or UID\n"
302 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
303 " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
304 "%3$sSystem Identity:%4$s\n"
a8828ed9 305 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 306 " --hostname=NAME Override the hostname for the container\n"
25148653
LP
307 " --uuid=UUID Set a specific machine UUID for the container\n\n"
308 "%3$sProperties:%4$s\n"
a8828ed9 309 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 310 " --property=NAME=VALUE Set scope unit property\n"
25148653
LP
311 " --register=BOOLEAN Register container as machine\n"
312 " --keep-unit Do not register a scope for the machine, reuse\n"
313 " the service unit nspawn is running in\n\n"
314 "%3$sUser Namespacing:%4$s\n"
90b4a64d 315 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 316 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 317 " Similar, but with user configured UID/GID range\n"
25148653
LP
318 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
319 "%3$sNetworking:%4$s\n"
69c79d3c
LP
320 " --private-network Disable network in container\n"
321 " --network-interface=INTERFACE\n"
322 " Assign an existing network interface to the\n"
323 " container\n"
c74e630d
LP
324 " --network-macvlan=INTERFACE\n"
325 " Create a macvlan network interface based on an\n"
326 " existing network interface to the container\n"
4bbfe7ad
TG
327 " --network-ipvlan=INTERFACE\n"
328 " Create a ipvlan network interface based on an\n"
329 " existing network interface to the container\n"
a8eaaee7 330 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 331 " and container\n"
f6d6bad1
LP
332 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
333 " Add an additional virtual Ethernet link between\n"
334 " host and container\n"
ab046dde 335 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
336 " Add a virtual Ethernet connection to the container\n"
337 " and attach it to an existing bridge on the host\n"
338 " --network-zone=NAME Similar, but attach the new interface to an\n"
339 " an automatically managed bridge interface\n"
d7bea6b6
DP
340 " --network-namespace-path=PATH\n"
341 " Set network namespace to the one represented by\n"
342 " the specified kernel namespace file node\n"
6d0b55c2 343 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
25148653
LP
344 " Expose a container IP port on the host\n\n"
345 "%3$sSecurity:%4$s\n"
a8828ed9
DW
346 " --capability=CAP In addition to the default, retain specified\n"
347 " capability\n"
348 " --drop-capability=CAP Drop the specified capability from the default set\n"
f4e803c8 349 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
960e4569
LP
350 " --system-call-filter=LIST|~LIST\n"
351 " Permit/prohibit specific system calls\n"
25148653
LP
352 " -Z --selinux-context=SECLABEL\n"
353 " Set the SELinux security context to be used by\n"
354 " processes in the container\n"
355 " -L --selinux-apifs-context=SECLABEL\n"
356 " Set the SELinux security context to be used by\n"
357 " API/tmpfs file systems in the container\n\n"
358 "%3$sResources:%4$s\n"
bf428efb 359 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
360 " --oom-score-adjust=VALUE\n"
361 " Adjust the OOM score value for the payload\n"
f4e803c8
LP
362 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
363 " --personality=ARCH Pick personality for this container\n\n"
25148653 364 "%3$sIntegration:%4$s\n"
09d423e9 365 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 366 " --timezone=MODE Select mode of /etc/localtime initialization\n"
25148653
LP
367 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
368 " host, try-guest, try-host\n"
369 " -j Equivalent to --link-journal=try-guest\n\n"
370 "%3$sMounts:%4$s\n"
5e5bfa6e
EY
371 " --bind=PATH[:PATH[:OPTIONS]]\n"
372 " Bind mount a file or directory from the host into\n"
a8828ed9 373 " the container\n"
5e5bfa6e
EY
374 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
375 " Similar, but creates a read-only bind mount\n"
de40a303
LP
376 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
377 " it\n"
06c17c39 378 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
379 " --overlay=PATH[:PATH...]:PATH\n"
380 " Create an overlay mount from the host to \n"
381 " the container\n"
382 " --overlay-ro=PATH[:PATH...]:PATH\n"
25148653
LP
383 " Similar, but creates a read-only overlay mount\n\n"
384 "%3$sInput/Output:%4$s\n"
de40a303
LP
385 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
386 " set up for the container.\n"
387 " -P --pipe Equivalent to --console=pipe\n"
25148653 388 "\nSee the %2$s for details.\n"
37ec0fdd
LP
389 , program_invocation_short_name
390 , link
25148653 391 , ansi_underline(), ansi_normal());
37ec0fdd
LP
392
393 return 0;
88213476
LP
394}
395
86c0dd4a 396static int custom_mount_check_all(void) {
88614c8a 397 size_t i;
5a8af538 398
5a8af538
LP
399 for (i = 0; i < arg_n_custom_mounts; i++) {
400 CustomMount *m = &arg_custom_mounts[i];
401
0de7acce 402 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
baaa35ad
ZJS
403 if (arg_userns_chown)
404 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
405 "--private-users-chown may not be combined with custom root mounts.");
406 else if (arg_uid_shift == UID_INVALID)
407 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
408 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 409 }
5a8af538
LP
410 }
411
412 return 0;
413}
414
8199d554 415static int detect_unified_cgroup_hierarchy_from_environment(void) {
efdb0237 416 const char *e;
415fc41c 417 int r;
5da38d07 418
efdb0237
LP
419 /* Allow the user to control whether the unified hierarchy is used */
420 e = getenv("UNIFIED_CGROUP_HIERARCHY");
421 if (e) {
422 r = parse_boolean(e);
423 if (r < 0)
424 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
425 if (r > 0)
426 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
427 else
428 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
429 }
430
8199d554
LP
431 return 0;
432}
433
434static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
435 int r;
436
437 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd in the
438 * image actually supports. */
b4cccbc1
LP
439 r = cg_all_unified();
440 if (r < 0)
441 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
442 if (r > 0) {
a8725a06
ZJS
443 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
444 * routine only detects 231, so we'll have a false negative here for 230. */
445 r = systemd_installation_has_version(directory, 230);
446 if (r < 0)
447 return log_error_errno(r, "Failed to determine systemd version in container: %m");
448 if (r > 0)
449 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
450 else
451 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 452 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
453 /* Mixed cgroup hierarchy support was added in 233 */
454 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
455 if (r < 0)
456 return log_error_errno(r, "Failed to determine systemd version in container: %m");
457 if (r > 0)
458 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
459 else
460 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
461 } else
5da38d07 462 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 463
8199d554
LP
464 log_debug("Using %s hierarchy for container.",
465 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
466 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
467
efdb0237
LP
468 return 0;
469}
470
0c582db0
LB
471static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
472 int r;
473
474 r = getenv_bool(name);
475 if (r == -ENXIO)
476 return;
477 if (r < 0)
478 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
de40a303 479
0c582db0 480 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
de40a303 481 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
0c582db0
LB
482}
483
4f086aab 484static void parse_mount_settings_env(void) {
4f086aab 485 const char *e;
1099ceeb
LP
486 int r;
487
488 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
489 if (r >= 0)
490 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
491 else if (r != -ENXIO)
492 log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP, ignoring: %m");
4f086aab
SU
493
494 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
495 if (!e)
496 return;
497
498 if (streq(e, "network")) {
499 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
500 return;
501 }
502
503 r = parse_boolean(e);
504 if (r < 0) {
505 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
506 return;
ab8ee0f2 507 }
4f086aab 508
ab8ee0f2
ZJS
509 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
510 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
4f086aab
SU
511}
512
d5455d2f
LP
513static void parse_environment(void) {
514 const char *e;
515 int r;
516
517 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
518 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
519 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
520 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
521
522 parse_mount_settings_env();
523
489fae52
ZJS
524 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
525 * even if it is supported. If not supported, it has no effect. */
de40a303 526 if (!cg_ns_supported())
489fae52 527 arg_use_cgns = false;
de40a303
LP
528 else {
529 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
530 if (r < 0) {
531 if (r != -ENXIO)
532 log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS, ignoring: %m");
533
534 arg_use_cgns = true;
535 } else {
536 arg_use_cgns = r > 0;
537 arg_settings_mask |= SETTING_USE_CGNS;
538 }
539 }
d5455d2f
LP
540
541 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
542 if (e)
543 arg_container_service_name = e;
544
545 detect_unified_cgroup_hierarchy_from_environment();
546}
547
88213476 548static int parse_argv(int argc, char *argv[]) {
a41fe3a2 549 enum {
acbeb427
ZJS
550 ARG_VERSION = 0x100,
551 ARG_PRIVATE_NETWORK,
bc2f673e 552 ARG_UUID,
5076f0cc 553 ARG_READ_ONLY,
57fb9fb5 554 ARG_CAPABILITY,
420c7379 555 ARG_DROP_CAPABILITY,
17fe0523
LP
556 ARG_LINK_JOURNAL,
557 ARG_BIND,
f4889f65 558 ARG_BIND_RO,
06c17c39 559 ARG_TMPFS,
5a8af538
LP
560 ARG_OVERLAY,
561 ARG_OVERLAY_RO,
de40a303 562 ARG_INACCESSIBLE,
eb91eb18 563 ARG_SHARE_SYSTEM,
89f7c846 564 ARG_REGISTER,
aa28aefe 565 ARG_KEEP_UNIT,
69c79d3c 566 ARG_NETWORK_INTERFACE,
c74e630d 567 ARG_NETWORK_MACVLAN,
4bbfe7ad 568 ARG_NETWORK_IPVLAN,
ab046dde 569 ARG_NETWORK_BRIDGE,
22b28dfd 570 ARG_NETWORK_ZONE,
f6d6bad1 571 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 572 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 573 ARG_PERSONALITY,
4d9f07b4 574 ARG_VOLATILE,
ec16945e 575 ARG_TEMPLATE,
f36933fe 576 ARG_PROPERTY,
6dac160c 577 ARG_PRIVATE_USERS,
c6c8f6e2 578 ARG_KILL_SIGNAL,
f757855e 579 ARG_SETTINGS,
5f932eb9 580 ARG_CHDIR,
b53ede69 581 ARG_PIVOT_ROOT,
7336138e 582 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 583 ARG_NOTIFY_READY,
4623e8e6 584 ARG_ROOT_HASH,
960e4569 585 ARG_SYSTEM_CALL_FILTER,
bf428efb 586 ARG_RLIMIT,
3a9530e5 587 ARG_HOSTNAME,
66edd963 588 ARG_NO_NEW_PRIVILEGES,
81f345df 589 ARG_OOM_SCORE_ADJUST,
d107bb7d 590 ARG_CPU_AFFINITY,
09d423e9 591 ARG_RESOLV_CONF,
1688841f 592 ARG_TIMEZONE,
de40a303
LP
593 ARG_CONSOLE,
594 ARG_PIPE,
595 ARG_OCI_BUNDLE,
bb068de0 596 ARG_NO_PAGER,
a41fe3a2
LP
597 };
598
88213476 599 static const struct option options[] = {
d7bea6b6
DP
600 { "help", no_argument, NULL, 'h' },
601 { "version", no_argument, NULL, ARG_VERSION },
602 { "directory", required_argument, NULL, 'D' },
603 { "template", required_argument, NULL, ARG_TEMPLATE },
604 { "ephemeral", no_argument, NULL, 'x' },
605 { "user", required_argument, NULL, 'u' },
606 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
607 { "as-pid2", no_argument, NULL, 'a' },
608 { "boot", no_argument, NULL, 'b' },
609 { "uuid", required_argument, NULL, ARG_UUID },
610 { "read-only", no_argument, NULL, ARG_READ_ONLY },
611 { "capability", required_argument, NULL, ARG_CAPABILITY },
612 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 613 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
614 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
615 { "bind", required_argument, NULL, ARG_BIND },
616 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
617 { "tmpfs", required_argument, NULL, ARG_TMPFS },
618 { "overlay", required_argument, NULL, ARG_OVERLAY },
619 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
de40a303 620 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
d7bea6b6 621 { "machine", required_argument, NULL, 'M' },
3a9530e5 622 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
623 { "slice", required_argument, NULL, 'S' },
624 { "setenv", required_argument, NULL, 'E' },
625 { "selinux-context", required_argument, NULL, 'Z' },
626 { "selinux-apifs-context", required_argument, NULL, 'L' },
627 { "quiet", no_argument, NULL, 'q' },
628 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
629 { "register", required_argument, NULL, ARG_REGISTER },
630 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
631 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
632 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
633 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
634 { "network-veth", no_argument, NULL, 'n' },
635 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
636 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
637 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
638 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
639 { "personality", required_argument, NULL, ARG_PERSONALITY },
640 { "image", required_argument, NULL, 'i' },
641 { "volatile", optional_argument, NULL, ARG_VOLATILE },
642 { "port", required_argument, NULL, 'p' },
643 { "property", required_argument, NULL, ARG_PROPERTY },
644 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
645 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
646 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
647 { "settings", required_argument, NULL, ARG_SETTINGS },
648 { "chdir", required_argument, NULL, ARG_CHDIR },
649 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
650 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
651 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
652 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 653 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 654 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 655 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 656 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 657 { "timezone", required_argument, NULL, ARG_TIMEZONE },
de40a303
LP
658 { "console", required_argument, NULL, ARG_CONSOLE },
659 { "pipe", no_argument, NULL, ARG_PIPE },
660 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
bb068de0 661 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
eb9da376 662 {}
88213476
LP
663 };
664
9444b1f2 665 int c, r;
d5455d2f 666 const char *p;
a42c8b54 667 uint64_t plus = 0, minus = 0;
f757855e 668 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
669
670 assert(argc >= 0);
671 assert(argv);
672
de40a303 673 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
88213476
LP
674 switch (c) {
675
676 case 'h':
37ec0fdd 677 return help();
88213476 678
acbeb427 679 case ARG_VERSION:
3f6fd1ba 680 return version();
acbeb427 681
88213476 682 case 'D':
0f03c2a4 683 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 684 if (r < 0)
0f03c2a4 685 return r;
de40a303
LP
686
687 arg_settings_mask |= SETTING_DIRECTORY;
ec16945e
LP
688 break;
689
690 case ARG_TEMPLATE:
0f03c2a4 691 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 692 if (r < 0)
0f03c2a4 693 return r;
de40a303
LP
694
695 arg_settings_mask |= SETTING_DIRECTORY;
88213476
LP
696 break;
697
1b9e5b12 698 case 'i':
0f03c2a4 699 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 700 if (r < 0)
0f03c2a4 701 return r;
de40a303
LP
702
703 arg_settings_mask |= SETTING_DIRECTORY;
704 break;
705
706 case ARG_OCI_BUNDLE:
707 r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
708 if (r < 0)
709 return r;
710
ec16945e
LP
711 break;
712
713 case 'x':
714 arg_ephemeral = true;
a2f577fc 715 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
716 break;
717
687d0825 718 case 'u':
2fc09a9c
DM
719 r = free_and_strdup(&arg_user, optarg);
720 if (r < 0)
7027ff61 721 return log_oom();
687d0825 722
f757855e 723 arg_settings_mask |= SETTING_USER;
687d0825
MV
724 break;
725
22b28dfd
LP
726 case ARG_NETWORK_ZONE: {
727 char *j;
728
729 j = strappend("vz-", optarg);
730 if (!j)
731 return log_oom();
732
733 if (!ifname_valid(j)) {
734 log_error("Network zone name not valid: %s", j);
735 free(j);
736 return -EINVAL;
737 }
738
df1fac6d 739 free_and_replace(arg_network_zone, j);
22b28dfd
LP
740
741 arg_network_veth = true;
742 arg_private_network = true;
743 arg_settings_mask |= SETTING_NETWORK;
744 break;
745 }
746
ab046dde 747 case ARG_NETWORK_BRIDGE:
ef76dff2 748
baaa35ad
ZJS
749 if (!ifname_valid(optarg))
750 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
751 "Bridge interface name not valid: %s", optarg);
ef76dff2 752
f757855e
LP
753 r = free_and_strdup(&arg_network_bridge, optarg);
754 if (r < 0)
755 return log_oom();
ab046dde 756
4831981d 757 _fallthrough_;
0dfaa006 758 case 'n':
69c79d3c
LP
759 arg_network_veth = true;
760 arg_private_network = true;
f757855e 761 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
762 break;
763
f6d6bad1
LP
764 case ARG_NETWORK_VETH_EXTRA:
765 r = veth_extra_parse(&arg_network_veth_extra, optarg);
766 if (r < 0)
767 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
768
769 arg_private_network = true;
770 arg_settings_mask |= SETTING_NETWORK;
771 break;
772
aa28aefe 773 case ARG_NETWORK_INTERFACE:
baaa35ad
ZJS
774 if (!ifname_valid(optarg))
775 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
776 "Network interface name not valid: %s", optarg);
ef76dff2 777
c74e630d
LP
778 if (strv_extend(&arg_network_interfaces, optarg) < 0)
779 return log_oom();
780
781 arg_private_network = true;
f757855e 782 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
783 break;
784
785 case ARG_NETWORK_MACVLAN:
ef76dff2 786
baaa35ad
ZJS
787 if (!ifname_valid(optarg))
788 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
789 "MACVLAN network interface name not valid: %s", optarg);
ef76dff2 790
c74e630d 791 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
792 return log_oom();
793
4bbfe7ad 794 arg_private_network = true;
f757855e 795 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
796 break;
797
798 case ARG_NETWORK_IPVLAN:
ef76dff2 799
baaa35ad
ZJS
800 if (!ifname_valid(optarg))
801 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
802 "IPVLAN network interface name not valid: %s", optarg);
ef76dff2 803
4bbfe7ad
TG
804 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
805 return log_oom();
806
4831981d 807 _fallthrough_;
ff01d048
LP
808 case ARG_PRIVATE_NETWORK:
809 arg_private_network = true;
f757855e 810 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
811 break;
812
d7bea6b6
DP
813 case ARG_NETWORK_NAMESPACE_PATH:
814 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
815 if (r < 0)
816 return r;
817
de40a303 818 arg_settings_mask |= SETTING_NETWORK;
d7bea6b6
DP
819 break;
820
0f0dbc46 821 case 'b':
baaa35ad
ZJS
822 if (arg_start_mode == START_PID2)
823 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
824 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
825
826 arg_start_mode = START_BOOT;
827 arg_settings_mask |= SETTING_START_MODE;
828 break;
829
830 case 'a':
baaa35ad
ZJS
831 if (arg_start_mode == START_BOOT)
832 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
833 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
834
835 arg_start_mode = START_PID2;
836 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
837 break;
838
144f0fc0 839 case ARG_UUID:
9444b1f2 840 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
841 if (r < 0)
842 return log_error_errno(r, "Invalid UUID: %s", optarg);
843
baaa35ad
ZJS
844 if (sd_id128_is_null(arg_uuid))
845 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
846 "Machine UUID may not be all zeroes.");
f757855e
LP
847
848 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 849 break;
aa96c6cb 850
9444b1f2 851 case 'S':
de40a303
LP
852 r = free_and_strdup(&arg_slice, optarg);
853 if (r < 0)
854 return log_oom();
855
856 arg_settings_mask |= SETTING_SLICE;
144f0fc0
LP
857 break;
858
7027ff61 859 case 'M':
c1521918 860 if (isempty(optarg))
97b11eed 861 arg_machine = mfree(arg_machine);
c1521918 862 else {
baaa35ad
ZJS
863 if (!machine_name_is_valid(optarg))
864 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
865 "Invalid machine name: %s", optarg);
7027ff61 866
0c3c4284
LP
867 r = free_and_strdup(&arg_machine, optarg);
868 if (r < 0)
eb91eb18 869 return log_oom();
eb91eb18 870 }
9ce6d1b3 871 break;
7027ff61 872
3a9530e5
LP
873 case ARG_HOSTNAME:
874 if (isempty(optarg))
875 arg_hostname = mfree(arg_hostname);
876 else {
baaa35ad
ZJS
877 if (!hostname_is_valid(optarg, false))
878 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
879 "Invalid hostname: %s", optarg);
3a9530e5
LP
880
881 r = free_and_strdup(&arg_hostname, optarg);
882 if (r < 0)
883 return log_oom();
884 }
885
886 arg_settings_mask |= SETTING_HOSTNAME;
887 break;
888
82adf6af
LP
889 case 'Z':
890 arg_selinux_context = optarg;
a8828ed9
DW
891 break;
892
82adf6af
LP
893 case 'L':
894 arg_selinux_apifs_context = optarg;
a8828ed9
DW
895 break;
896
bc2f673e
LP
897 case ARG_READ_ONLY:
898 arg_read_only = true;
f757855e 899 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
900 break;
901
420c7379
LP
902 case ARG_CAPABILITY:
903 case ARG_DROP_CAPABILITY: {
6cbe4ed1 904 p = optarg;
9ed794a3 905 for (;;) {
6cbe4ed1 906 _cleanup_free_ char *t = NULL;
5076f0cc 907
6cbe4ed1
SS
908 r = extract_first_word(&p, &t, ",", 0);
909 if (r < 0)
910 return log_error_errno(r, "Failed to parse capability %s.", t);
6cbe4ed1
SS
911 if (r == 0)
912 break;
5076f0cc 913
39ed67d1
LP
914 if (streq(t, "all")) {
915 if (c == ARG_CAPABILITY)
a42c8b54 916 plus = (uint64_t) -1;
39ed67d1 917 else
a42c8b54 918 minus = (uint64_t) -1;
39ed67d1 919 } else {
acf4d158
YW
920 r = capability_from_name(t);
921 if (r < 0)
922 return log_error_errno(r, "Failed to parse capability %s.", t);
39ed67d1
LP
923
924 if (c == ARG_CAPABILITY)
acf4d158 925 plus |= 1ULL << r;
39ed67d1 926 else
acf4d158 927 minus |= 1ULL << r;
5076f0cc 928 }
5076f0cc
LP
929 }
930
f757855e 931 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
932 break;
933 }
934
66edd963
LP
935 case ARG_NO_NEW_PRIVILEGES:
936 r = parse_boolean(optarg);
937 if (r < 0)
938 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
939
940 arg_no_new_privileges = r;
941 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
942 break;
943
57fb9fb5
LP
944 case 'j':
945 arg_link_journal = LINK_GUEST;
574edc90 946 arg_link_journal_try = true;
4e1d6aa9 947 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
948 break;
949
950 case ARG_LINK_JOURNAL:
4e1d6aa9
LP
951 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
952 if (r < 0) {
953 log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5
LP
954 return -EINVAL;
955 }
956
4e1d6aa9 957 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
958 break;
959
17fe0523 960 case ARG_BIND:
f757855e
LP
961 case ARG_BIND_RO:
962 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
963 if (r < 0)
964 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 965
f757855e 966 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 967 break;
06c17c39 968
f757855e
LP
969 case ARG_TMPFS:
970 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
971 if (r < 0)
972 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 973
f757855e 974 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 975 break;
5a8af538
LP
976
977 case ARG_OVERLAY:
ad85779a
LP
978 case ARG_OVERLAY_RO:
979 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
980 if (r == -EADDRNOTAVAIL)
981 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
982 if (r < 0)
983 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 984
f757855e 985 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 986 break;
06c17c39 987
de40a303
LP
988 case ARG_INACCESSIBLE:
989 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
990 if (r < 0)
991 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
992
993 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
994 break;
995
a5f1cb3b 996 case 'E': {
f4889f65
LP
997 char **n;
998
baaa35ad
ZJS
999 if (!env_assignment_is_valid(optarg))
1000 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1001 "Environment variable assignment '%s' is not valid.", optarg);
f4889f65
LP
1002
1003 n = strv_env_set(arg_setenv, optarg);
1004 if (!n)
1005 return log_oom();
1006
130d3d22 1007 strv_free_and_replace(arg_setenv, n);
f757855e 1008 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
1009 break;
1010 }
1011
284c0b91
LP
1012 case 'q':
1013 arg_quiet = true;
1014 break;
1015
8a96d94e 1016 case ARG_SHARE_SYSTEM:
a6b5216c 1017 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 1018 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 1019 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 1020 arg_clone_ns_flags = 0;
8a96d94e
LP
1021 break;
1022
eb91eb18
LP
1023 case ARG_REGISTER:
1024 r = parse_boolean(optarg);
1025 if (r < 0) {
1026 log_error("Failed to parse --register= argument: %s", optarg);
1027 return r;
1028 }
1029
1030 arg_register = r;
1031 break;
1032
89f7c846
LP
1033 case ARG_KEEP_UNIT:
1034 arg_keep_unit = true;
1035 break;
1036
6afc95b7
LP
1037 case ARG_PERSONALITY:
1038
ac45f971 1039 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
1040 if (arg_personality == PERSONALITY_INVALID)
1041 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1042 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 1043
f757855e 1044 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
1045 break;
1046
4d9f07b4
LP
1047 case ARG_VOLATILE:
1048
1049 if (!optarg)
f757855e 1050 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
1051 else if (streq(optarg, "help")) {
1052 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1053 return 0;
1054 } else {
f757855e 1055 VolatileMode m;
4d9f07b4 1056
f757855e 1057 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
1058 if (m < 0)
1059 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1060 "Failed to parse --volatile= argument: %s", optarg);
1061 else
f757855e 1062 arg_volatile_mode = m;
6d0b55c2
LP
1063 }
1064
f757855e
LP
1065 arg_settings_mask |= SETTING_VOLATILE_MODE;
1066 break;
6d0b55c2 1067
f757855e
LP
1068 case 'p':
1069 r = expose_port_parse(&arg_expose_ports, optarg);
1070 if (r == -EEXIST)
1071 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1072 if (r < 0)
1073 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 1074
f757855e 1075 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 1076 break;
6d0b55c2 1077
f36933fe
LP
1078 case ARG_PROPERTY:
1079 if (strv_extend(&arg_property, optarg) < 0)
1080 return log_oom();
1081
1082 break;
1083
ae209204
ZJS
1084 case ARG_PRIVATE_USERS: {
1085 int boolean = -1;
0de7acce 1086
ae209204
ZJS
1087 if (!optarg)
1088 boolean = true;
1089 else if (!in_charset(optarg, DIGITS))
1090 /* do *not* parse numbers as booleans */
1091 boolean = parse_boolean(optarg);
1092
1093 if (boolean == false) {
0de7acce
LP
1094 /* no: User namespacing off */
1095 arg_userns_mode = USER_NAMESPACE_NO;
1096 arg_uid_shift = UID_INVALID;
1097 arg_uid_range = UINT32_C(0x10000);
ae209204 1098 } else if (boolean == true) {
0de7acce
LP
1099 /* yes: User namespacing on, UID range is read from root dir */
1100 arg_userns_mode = USER_NAMESPACE_FIXED;
1101 arg_uid_shift = UID_INVALID;
1102 arg_uid_range = UINT32_C(0x10000);
1103 } else if (streq(optarg, "pick")) {
1104 /* pick: User namespacing on, UID range is picked randomly */
1105 arg_userns_mode = USER_NAMESPACE_PICK;
1106 arg_uid_shift = UID_INVALID;
1107 arg_uid_range = UINT32_C(0x10000);
1108 } else {
6c2058b3 1109 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
1110 const char *range, *shift;
1111
0de7acce
LP
1112 /* anything else: User namespacing on, UID range is explicitly configured */
1113
6dac160c
LP
1114 range = strchr(optarg, ':');
1115 if (range) {
6c2058b3
ZJS
1116 buffer = strndup(optarg, range - optarg);
1117 if (!buffer)
1118 return log_oom();
1119 shift = buffer;
6dac160c
LP
1120
1121 range++;
bfd292ec
ZJS
1122 r = safe_atou32(range, &arg_uid_range);
1123 if (r < 0)
be715731 1124 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
1125 } else
1126 shift = optarg;
1127
be715731
ZJS
1128 r = parse_uid(shift, &arg_uid_shift);
1129 if (r < 0)
1130 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1131
1132 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
1133 }
1134
baaa35ad
ZJS
1135 if (arg_uid_range <= 0)
1136 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1137 "UID range cannot be 0.");
be715731 1138
0de7acce 1139 arg_settings_mask |= SETTING_USERNS;
6dac160c 1140 break;
ae209204 1141 }
6dac160c 1142
0de7acce 1143 case 'U':
ccabee0d
LP
1144 if (userns_supported()) {
1145 arg_userns_mode = USER_NAMESPACE_PICK;
1146 arg_uid_shift = UID_INVALID;
1147 arg_uid_range = UINT32_C(0x10000);
1148
1149 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1150 }
1151
7336138e
LP
1152 break;
1153
0de7acce 1154 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1155 arg_userns_chown = true;
0de7acce
LP
1156
1157 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1158 break;
1159
c6c8f6e2 1160 case ARG_KILL_SIGNAL:
5c828e66
LP
1161 if (streq(optarg, "help")) {
1162 DUMP_STRING_TABLE(signal, int, _NSIG);
1163 return 0;
1164 }
1165
29a3db75 1166 arg_kill_signal = signal_from_string(optarg);
baaa35ad
ZJS
1167 if (arg_kill_signal < 0)
1168 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1169 "Cannot parse signal: %s", optarg);
c6c8f6e2 1170
f757855e
LP
1171 arg_settings_mask |= SETTING_KILL_SIGNAL;
1172 break;
1173
1174 case ARG_SETTINGS:
1175
1176 /* no → do not read files
1177 * yes → read files, do not override cmdline, trust only subset
1178 * override → read files, override cmdline, trust only subset
1179 * trusted → read files, do not override cmdline, trust all
1180 */
1181
1182 r = parse_boolean(optarg);
1183 if (r < 0) {
1184 if (streq(optarg, "trusted")) {
1185 mask_all_settings = false;
1186 mask_no_settings = false;
1187 arg_settings_trusted = true;
1188
1189 } else if (streq(optarg, "override")) {
1190 mask_all_settings = false;
1191 mask_no_settings = true;
1192 arg_settings_trusted = -1;
1193 } else
1194 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1195 } else if (r > 0) {
1196 /* yes */
1197 mask_all_settings = false;
1198 mask_no_settings = false;
1199 arg_settings_trusted = -1;
1200 } else {
1201 /* no */
1202 mask_all_settings = true;
1203 mask_no_settings = false;
1204 arg_settings_trusted = false;
1205 }
1206
c6c8f6e2
LP
1207 break;
1208
5f932eb9 1209 case ARG_CHDIR:
baaa35ad
ZJS
1210 if (!path_is_absolute(optarg))
1211 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1212 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1213
1214 r = free_and_strdup(&arg_chdir, optarg);
1215 if (r < 0)
1216 return log_oom();
1217
1218 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1219 break;
1220
b53ede69
PW
1221 case ARG_PIVOT_ROOT:
1222 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1223 if (r < 0)
1224 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1225
1226 arg_settings_mask |= SETTING_PIVOT_ROOT;
1227 break;
1228
9c1e04d0
AP
1229 case ARG_NOTIFY_READY:
1230 r = parse_boolean(optarg);
baaa35ad
ZJS
1231 if (r < 0)
1232 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1233 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1234 arg_notify_ready = r;
1235 arg_settings_mask |= SETTING_NOTIFY_READY;
1236 break;
1237
4623e8e6
LP
1238 case ARG_ROOT_HASH: {
1239 void *k;
1240 size_t l;
1241
1242 r = unhexmem(optarg, strlen(optarg), &k, &l);
1243 if (r < 0)
1244 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1245 if (l < sizeof(sd_id128_t)) {
1246 log_error("Root hash must be at least 128bit long: %s", optarg);
1247 free(k);
1248 return -EINVAL;
1249 }
1250
1251 free(arg_root_hash);
1252 arg_root_hash = k;
1253 arg_root_hash_size = l;
1254 break;
1255 }
1256
960e4569
LP
1257 case ARG_SYSTEM_CALL_FILTER: {
1258 bool negative;
1259 const char *items;
1260
1261 negative = optarg[0] == '~';
1262 items = negative ? optarg + 1 : optarg;
1263
1264 for (;;) {
1265 _cleanup_free_ char *word = NULL;
1266
1267 r = extract_first_word(&items, &word, NULL, 0);
1268 if (r == 0)
1269 break;
1270 if (r == -ENOMEM)
1271 return log_oom();
1272 if (r < 0)
1273 return log_error_errno(r, "Failed to parse system call filter: %m");
1274
1275 if (negative)
1276 r = strv_extend(&arg_syscall_blacklist, word);
1277 else
1278 r = strv_extend(&arg_syscall_whitelist, word);
1279 if (r < 0)
1280 return log_oom();
1281 }
1282
1283 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1284 break;
1285 }
1286
bf428efb
LP
1287 case ARG_RLIMIT: {
1288 const char *eq;
1289 char *name;
1290 int rl;
1291
5c828e66
LP
1292 if (streq(optarg, "help")) {
1293 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1294 return 0;
1295 }
1296
bf428efb 1297 eq = strchr(optarg, '=');
baaa35ad
ZJS
1298 if (!eq)
1299 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1300 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1301
1302 name = strndup(optarg, eq - optarg);
1303 if (!name)
1304 return log_oom();
1305
1306 rl = rlimit_from_string_harder(name);
baaa35ad
ZJS
1307 if (rl < 0)
1308 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1309 "Unknown resource limit: %s", name);
bf428efb
LP
1310
1311 if (!arg_rlimit[rl]) {
1312 arg_rlimit[rl] = new0(struct rlimit, 1);
1313 if (!arg_rlimit[rl])
1314 return log_oom();
1315 }
1316
1317 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1318 if (r < 0)
1319 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1320
1321 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1322 break;
1323 }
1324
81f345df
LP
1325 case ARG_OOM_SCORE_ADJUST:
1326 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1327 if (r < 0)
1328 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1329
1330 arg_oom_score_adjust_set = true;
1331 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1332 break;
1333
d107bb7d
LP
1334 case ARG_CPU_AFFINITY: {
1335 _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
1336
1337 r = parse_cpu_set(optarg, &cpuset);
1338 if (r < 0)
1339 return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
1340
1341 if (arg_cpuset)
1342 CPU_FREE(arg_cpuset);
1343
1344 arg_cpuset = TAKE_PTR(cpuset);
1345 arg_cpuset_ncpus = r;
1346 arg_settings_mask |= SETTING_CPU_AFFINITY;
1347 break;
1348 }
1349
09d423e9
LP
1350 case ARG_RESOLV_CONF:
1351 if (streq(optarg, "help")) {
1352 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1353 return 0;
1354 }
1355
1356 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad
ZJS
1357 if (arg_resolv_conf < 0)
1358 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1359 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1360
1361 arg_settings_mask |= SETTING_RESOLV_CONF;
1362 break;
1363
1688841f
LP
1364 case ARG_TIMEZONE:
1365 if (streq(optarg, "help")) {
1366 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1367 return 0;
1368 }
1369
1370 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad
ZJS
1371 if (arg_timezone < 0)
1372 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1373 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1374
1375 arg_settings_mask |= SETTING_TIMEZONE;
1376 break;
1377
de40a303
LP
1378 case ARG_CONSOLE:
1379 if (streq(optarg, "interactive"))
1380 arg_console_mode = CONSOLE_INTERACTIVE;
1381 else if (streq(optarg, "read-only"))
1382 arg_console_mode = CONSOLE_READ_ONLY;
1383 else if (streq(optarg, "passive"))
1384 arg_console_mode = CONSOLE_PASSIVE;
1385 else if (streq(optarg, "pipe"))
1386 arg_console_mode = CONSOLE_PIPE;
1387 else if (streq(optarg, "help"))
1388 puts("interactive\n"
1389 "read-only\n"
1390 "passive\n"
1391 "pipe");
1392 else {
1393 log_error("Unknown console mode: %s", optarg);
1394 return -EINVAL;
1395 }
1396
1397 arg_settings_mask |= SETTING_CONSOLE_MODE;
1398 break;
1399
1400 case 'P':
1401 case ARG_PIPE:
1402 arg_console_mode = CONSOLE_PIPE;
1403 arg_settings_mask |= SETTING_CONSOLE_MODE;
1404 break;
1405
bb068de0
ZJS
1406 case ARG_NO_PAGER:
1407 arg_pager_flags |= PAGER_DISABLE;
1408 break;
1409
88213476
LP
1410 case '?':
1411 return -EINVAL;
1412
1413 default:
eb9da376 1414 assert_not_reached("Unhandled option");
88213476 1415 }
88213476 1416
60f1ec13
LP
1417 if (argc > optind) {
1418 strv_free(arg_parameters);
1419 arg_parameters = strv_copy(argv + optind);
1420 if (!arg_parameters)
1421 return log_oom();
d7bea6b6 1422
60f1ec13
LP
1423 arg_settings_mask |= SETTING_START_MODE;
1424 }
1425
1426 if (arg_ephemeral && arg_template && !arg_directory)
1427 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1428 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1429 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1430 * --directory=". */
1431 arg_directory = TAKE_PTR(arg_template);
1432
bd4b15f2 1433 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
60f1ec13 1434
de40a303
LP
1435 /* Make sure to parse environment before we reset the settings mask below */
1436 parse_environment();
1437
60f1ec13
LP
1438 /* Load all settings from .nspawn files */
1439 if (mask_no_settings)
1440 arg_settings_mask = 0;
1441
1442 /* Don't load any settings from .nspawn files */
1443 if (mask_all_settings)
1444 arg_settings_mask = _SETTINGS_MASK_ALL;
1445
1446 return 1;
1447}
1448
1449static int verify_arguments(void) {
1450 int r;
a6b5216c 1451
4f086aab
SU
1452 if (arg_userns_mode != USER_NAMESPACE_NO)
1453 arg_mount_settings |= MOUNT_USE_USERNS;
1454
1455 if (arg_private_network)
1456 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1457
48a8d337
LB
1458 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1459 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1460 arg_register = false;
baaa35ad 1461 if (arg_start_mode != START_PID1)
60f1ec13 1462 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
0c582db0 1463 }
eb91eb18 1464
0de7acce 1465 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1466 arg_userns_chown = true;
1467
60f1ec13
LP
1468 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1469 arg_kill_signal = SIGRTMIN+3;
1470
e5a4bb0d
LP
1471 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1472 arg_read_only = true;
1473
baaa35ad 1474 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1475 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1476 * The latter is not technically a user session, but we don't need to labour the point. */
60f1ec13 1477 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1478
baaa35ad 1479 if (arg_directory && arg_image)
60f1ec13 1480 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1b9e5b12 1481
baaa35ad 1482 if (arg_template && arg_image)
60f1ec13 1483 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
8cd328d8 1484
baaa35ad 1485 if (arg_template && !(arg_directory || arg_machine))
60f1ec13 1486 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
ec16945e 1487
baaa35ad 1488 if (arg_ephemeral && arg_template)
60f1ec13 1489 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
ec16945e 1490
baaa35ad 1491 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
60f1ec13 1492 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1493
baaa35ad 1494 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
60f1ec13 1495 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1496
baaa35ad 1497 if (arg_userns_chown && arg_read_only)
de40a303
LP
1498 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1499 "--read-only and --private-users-chown may not be combined.");
f757855e 1500
e5a4bb0d
LP
1501 /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1502 * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
1503 * copy-up (in case of overlay) making the entire excercise pointless. */
1504 if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1505 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1506
de40a303
LP
1507 /* If --network-namespace-path is given with any other network-related option, we need to error out,
1508 * to avoid conflicts between different network options. */
60f1ec13
LP
1509 if (arg_network_namespace_path &&
1510 (arg_network_interfaces || arg_network_macvlan ||
1511 arg_network_ipvlan || arg_network_veth_extra ||
1512 arg_network_bridge || arg_network_zone ||
1513 arg_network_veth || arg_private_network))
de40a303 1514 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
86c0dd4a 1515
60f1ec13 1516 if (arg_network_bridge && arg_network_zone)
de40a303
LP
1517 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1518 "--network-bridge= and --network-zone= may not be combined.");
f757855e 1519
baaa35ad 1520 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
60f1ec13 1521 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1522
baaa35ad 1523 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
60f1ec13 1524 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
f757855e 1525
baaa35ad 1526 if (arg_expose_ports && !arg_private_network)
60f1ec13 1527 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
6d0b55c2 1528
349cc4a5 1529#if ! HAVE_LIBIPTC
baaa35ad 1530 if (arg_expose_ports)
60f1ec13 1531 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1c1ea217
EV
1532#endif
1533
60f1ec13
LP
1534 r = custom_mount_check_all();
1535 if (r < 0)
1536 return r;
c6c8f6e2 1537
f757855e 1538 return 0;
88213476
LP
1539}
1540
03cfe0d5
LP
1541static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1542 assert(p);
1543
0de7acce 1544 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1545 return 0;
1546
1547 if (uid == UID_INVALID && gid == GID_INVALID)
1548 return 0;
1549
1550 if (uid != UID_INVALID) {
1551 uid += arg_uid_shift;
1552
1553 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1554 return -EOVERFLOW;
1555 }
1556
1557 if (gid != GID_INVALID) {
1558 gid += (gid_t) arg_uid_shift;
1559
1560 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1561 return -EOVERFLOW;
1562 }
1563
1564 if (lchown(p, uid, gid) < 0)
1565 return -errno;
b12afc8c
LP
1566
1567 return 0;
1568}
1569
03cfe0d5
LP
/* Creates the directory 'path' below 'root' with the given mode and, if it was newly created, hands it to
 * userns_lchown() with the given UID/GID. An already existing directory is left alone and reported as
 * success (0). Other mkdir failures are returned as-is. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int k;

        full = prefix_roota(root, path);

        k = mkdir_errno_wrapper(full, mode);
        if (k < 0)
                return k == -EEXIST ? 0 : k;

        return userns_lchown(full, uid, gid);
}
1583
/* Matches 'path' against the two zoneinfo directory prefixes (relative "../usr/share/zoneinfo/" first,
 * then absolute "/usr/share/zoneinfo/") and returns whatever PATH_STARTSWITH_SET() yields for it. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path, "../usr/share/zoneinfo/", "/usr/share/zoneinfo/");
        return zone;
}
1590
83205269
LP
1591static bool etc_writable(void) {
1592 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1593}
1594
e58a1277 1595static int setup_timezone(const char *dest) {
1688841f
LP
1596 _cleanup_free_ char *p = NULL, *etc = NULL;
1597 const char *where, *check;
1598 TimezoneMode m;
d4036145 1599 int r;
f8440af5 1600
e58a1277
LP
1601 assert(dest);
1602
1688841f 1603 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1604 r = readlink_malloc("/etc/localtime", &p);
1605 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
83205269 1606 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1688841f 1607 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
83205269 1608 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1688841f
LP
1609 else if (r < 0) {
1610 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1611 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1612 * file.
1613 *
1614 * Example:
1615 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1616 */
1617 return 0;
1618 } else if (arg_timezone == TIMEZONE_AUTO)
83205269 1619 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1688841f
LP
1620 else
1621 m = arg_timezone;
1622 } else
1623 m = arg_timezone;
1624
1625 if (m == TIMEZONE_OFF)
1626 return 0;
1627
1628 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
d4036145 1629 if (r < 0) {
1688841f 1630 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1631 return 0;
1632 }
1633
1688841f
LP
1634 where = strjoina(etc, "/localtime");
1635
1636 switch (m) {
1637
1638 case TIMEZONE_DELETE:
1639 if (unlink(where) < 0)
1640 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1641
d4036145 1642 return 0;
d4036145 1643
1688841f
LP
1644 case TIMEZONE_SYMLINK: {
1645 _cleanup_free_ char *q = NULL;
1646 const char *z, *what;
4d1c38b8 1647
1688841f
LP
1648 z = timezone_from_path(p);
1649 if (!z) {
1650 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1651 return 0;
1688841f 1652 }
d4036145 1653
1688841f
LP
1654 r = readlink_malloc(where, &q);
1655 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1656 return 0; /* Already pointing to the right place? Then do nothing .. */
1657
1658 check = strjoina(dest, "/usr/share/zoneinfo/", z);
1659 r = chase_symlinks(check, dest, 0, NULL);
1660 if (r < 0)
1661 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1662 else {
1663 if (unlink(where) < 0 && errno != ENOENT) {
1664 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1665 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1666 return 0;
1667 }
1668
1669 what = strjoina("../usr/share/zoneinfo/", z);
1670 if (symlink(what, where) < 0) {
1671 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1672 errno, "Failed to correct timezone of container, ignoring: %m");
1673 return 0;
1674 }
1675
1676 break;
1677 }
1678
1679 _fallthrough_;
d4036145 1680 }
68fb0892 1681
1688841f
LP
1682 case TIMEZONE_BIND: {
1683 _cleanup_free_ char *resolved = NULL;
1684 int found;
1685
1686 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1687 if (found < 0) {
1688 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1689 return 0;
1690 }
1691
1692 if (found == 0) /* missing? */
1693 (void) touch(resolved);
1694
1695 r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1696 if (r >= 0)
1697 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1698
1699 _fallthrough_;
79d80fc1 1700 }
4d9f07b4 1701
1688841f
LP
1702 case TIMEZONE_COPY:
1703 /* If mounting failed, try to copy */
1704 r = copy_file_atomic("/etc/localtime", where, 0644, 0, COPY_REFLINK|COPY_REPLACE);
1705 if (r < 0) {
1706 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1707 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1708 return 0;
1709 }
1710
1711 break;
1712
1713 default:
1714 assert_not_reached("unexpected mode");
d4036145 1715 }
e58a1277 1716
1688841f 1717 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
1718 r = userns_lchown(where, 0, 0);
1719 if (r < 0)
1688841f 1720 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 1721
e58a1277 1722 return 0;
88213476
LP
1723}
1724
09d423e9
LP
/* Tests whether a file exists at 'path'.
 *
 * Returns 1 if access(F_OK) succeeds and 0 if it fails with ENOENT. Any other access() failure is logged at
 * debug level and the value produced by log_debug_errno() is returned. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        /* A missing file is an expected outcome, not an error. */
        if (errno != ENOENT)
                return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);

        return 0;
}
1737
7357272e 1738static int resolved_listening(void) {
b8ea7a6e 1739 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 1740 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1741 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1742 int r;
1743
7357272e 1744 /* Check if resolved is listening */
b053cd5f
LP
1745
1746 r = sd_bus_open_system(&bus);
1747 if (r < 0)
b8ea7a6e 1748 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 1749
7357272e 1750 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
1751 if (r < 0)
1752 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1753 if (r == 0)
1754 return 0;
7357272e
DM
1755
1756 r = sd_bus_get_property_string(bus,
1757 "org.freedesktop.resolve1",
1758 "/org/freedesktop/resolve1",
1759 "org.freedesktop.resolve1.Manager",
1760 "DNSStubListener",
b8ea7a6e 1761 &error,
7357272e
DM
1762 &dns_stub_listener_mode);
1763 if (r < 0)
b8ea7a6e 1764 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
1765
1766 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
1767}
1768
2547bb41 1769static int setup_resolv_conf(const char *dest) {
09d423e9
LP
1770 _cleanup_free_ char *etc = NULL;
1771 const char *where, *what;
1772 ResolvConfMode m;
1773 int r;
2547bb41
LP
1774
1775 assert(dest);
1776
09d423e9
LP
1777 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1778 if (arg_private_network)
1779 m = RESOLV_CONF_OFF;
1780 else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
83205269 1781 m = etc_writable() ? RESOLV_CONF_COPY_STATIC : RESOLV_CONF_BIND_STATIC;
09d423e9 1782 else if (have_resolv_conf("/etc/resolv.conf") > 0)
83205269 1783 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
09d423e9 1784 else
83205269 1785 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
09d423e9
LP
1786 } else
1787 m = arg_resolv_conf;
1788
1789 if (m == RESOLV_CONF_OFF)
2547bb41
LP
1790 return 0;
1791
87447ae4
LP
1792 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1793 if (r < 0) {
1794 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1795 return 0;
1796 }
1797
1798 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
1799
1800 if (m == RESOLV_CONF_DELETE) {
1801 if (unlink(where) < 0)
1802 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1803
87447ae4
LP
1804 return 0;
1805 }
79d80fc1 1806
09d423e9
LP
1807 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_COPY_STATIC))
1808 what = STATIC_RESOLV_CONF;
1809 else
1810 what = "/etc/resolv.conf";
87447ae4 1811
09d423e9
LP
1812 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC)) {
1813 _cleanup_free_ char *resolved = NULL;
1814 int found;
1815
1816 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1817 if (found < 0) {
1818 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1819 return 0;
1820 }
3539724c 1821
87447ae4
LP
1822 if (found == 0) /* missing? */
1823 (void) touch(resolved);
5367354d 1824
09d423e9 1825 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 1826 if (r >= 0)
87447ae4 1827 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1828 }
1829
1830 /* If that didn't work, let's copy the file */
09d423e9 1831 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
79d80fc1 1832 if (r < 0) {
3539724c
LP
1833 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1834 * resolved or something similar runs inside and the symlink points there.
68a313c5 1835 *
3539724c 1836 * If the disk image is read-only, there's also no point in complaining.
68a313c5 1837 */
09d423e9 1838 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC) && IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1839 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1840 return 0;
1841 }
2547bb41 1842
03cfe0d5
LP
1843 r = userns_lchown(where, 0, 0);
1844 if (r < 0)
3539724c 1845 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1846
2547bb41
LP
1847 return 0;
1848}
1849
1e4f1671 1850static int setup_boot_id(void) {
cdde6ba6
LP
1851 _cleanup_(unlink_and_freep) char *from = NULL;
1852 _cleanup_free_ char *path = NULL;
3bbaff3e 1853 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 1854 const char *to;
04bc4a3f
LP
1855 int r;
1856
04bc4a3f
LP
1857 /* Generate a new randomized boot ID, so that each boot-up of
1858 * the container gets a new one */
1859
cdde6ba6
LP
1860 r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
1861 if (r < 0)
1862 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
1863
1864 r = sd_id128_randomize(&rnd);
f647962d
MS
1865 if (r < 0)
1866 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1867
cdde6ba6 1868 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
1869 if (r < 0)
1870 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1871
cdde6ba6
LP
1872 from = TAKE_PTR(path);
1873 to = "/proc/sys/kernel/random/boot_id";
1874
60e76d48 1875 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
1876 if (r < 0)
1877 return r;
04bc4a3f 1878
cdde6ba6 1879 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
1880}
1881
e58a1277 1882static int copy_devnodes(const char *dest) {
88213476
LP
1883 static const char devnodes[] =
1884 "null\0"
1885 "zero\0"
1886 "full\0"
1887 "random\0"
1888 "urandom\0"
85614d66
TG
1889 "tty\0"
1890 "net/tun\0";
88213476 1891
de40a303 1892 _cleanup_umask_ mode_t u;
88213476 1893 const char *d;
e58a1277 1894 int r = 0;
a258bf26
LP
1895
1896 assert(dest);
124640f1
LP
1897
1898 u = umask(0000);
88213476 1899
03cfe0d5
LP
1900 /* Create /dev/net, so that we can create /dev/net/tun in it */
1901 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1902 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1903
88213476 1904 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1905 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1906 struct stat st;
88213476 1907
7f112f50 1908 from = strappend("/dev/", d);
8967f291
LP
1909 if (!from)
1910 return log_oom();
1911
03cfe0d5 1912 to = prefix_root(dest, from);
8967f291
LP
1913 if (!to)
1914 return log_oom();
88213476
LP
1915
1916 if (stat(from, &st) < 0) {
1917
4a62c710
MS
1918 if (errno != ENOENT)
1919 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1920
baaa35ad
ZJS
1921 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
1922 return log_error_errno(SYNTHETIC_ERRNO(EIO),
1923 "%s is not a char or block device, cannot copy.", from);
1924 else {
8dfce114
LP
1925 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
1926
81f5049b 1927 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 1928 /* Explicitly warn the user when /dev is already populated. */
41eb4362 1929 if (errno == EEXIST)
8dbf71ec 1930 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
1931 if (errno != EPERM)
1932 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1933
8dfce114 1934 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
1935 r = touch(to);
1936 if (r < 0)
1937 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1938 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1939 if (r < 0)
1940 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1941 }
6278cf60 1942
03cfe0d5
LP
1943 r = userns_lchown(to, 0, 0);
1944 if (r < 0)
1945 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114
LP
1946
1947 dn = strjoin("/dev/", S_ISCHR(st.st_mode) ? "char" : "block");
1948 if (!dn)
1949 return log_oom();
1950
1951 r = userns_mkdir(dest, dn, 0755, 0, 0);
1952 if (r < 0)
1953 return log_error_errno(r, "Failed to create '%s': %m", dn);
1954
1955 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
1956 return log_oom();
1957
1958 prefixed = prefix_root(dest, sl);
1959 if (!prefixed)
1960 return log_oom();
1961
1962 t = strjoin("../", d);
1963 if (!t)
1964 return log_oom();
1965
1966 if (symlink(t, prefixed) < 0)
1967 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 1968 }
88213476
LP
1969 }
1970
e58a1277
LP
1971 return r;
1972}
88213476 1973
de40a303
LP
1974static int make_extra_nodes(const char *dest) {
1975 _cleanup_umask_ mode_t u;
1976 size_t i;
1977 int r;
1978
1979 u = umask(0000);
1980
1981 for (i = 0; i < arg_n_extra_nodes; i++) {
1982 _cleanup_free_ char *path = NULL;
1983 DeviceNode *n = arg_extra_nodes + i;
1984
1985 path = prefix_root(dest, n->path);
1986 if (!path)
1987 return log_oom();
1988
1989 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
1990 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
1991
1992 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
1993 if (r < 0)
1994 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
1995 }
1996
1997 return 0;
1998}
1999
03cfe0d5
LP
2000static int setup_pts(const char *dest) {
2001 _cleanup_free_ char *options = NULL;
2002 const char *p;
709f6e46 2003 int r;
03cfe0d5 2004
349cc4a5 2005#if HAVE_SELINUX
03cfe0d5
LP
2006 if (arg_selinux_apifs_context)
2007 (void) asprintf(&options,
3dce8915 2008 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2009 arg_uid_shift + TTY_GID,
2010 arg_selinux_apifs_context);
2011 else
2012#endif
2013 (void) asprintf(&options,
3dce8915 2014 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2015 arg_uid_shift + TTY_GID);
f2d88580 2016
03cfe0d5 2017 if (!options)
f2d88580
LP
2018 return log_oom();
2019
03cfe0d5 2020 /* Mount /dev/pts itself */
cc9fce65 2021 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
2022 r = mkdir_errno_wrapper(p, 0755);
2023 if (r < 0)
2024 return log_error_errno(r, "Failed to create /dev/pts: %m");
2025
60e76d48
ZJS
2026 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2027 if (r < 0)
2028 return r;
709f6e46
MS
2029 r = userns_lchown(p, 0, 0);
2030 if (r < 0)
2031 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
2032
2033 /* Create /dev/ptmx symlink */
2034 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2035 if (symlink("pts/ptmx", p) < 0)
2036 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
2037 r = userns_lchown(p, 0, 0);
2038 if (r < 0)
2039 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 2040
03cfe0d5
LP
2041 /* And fix /dev/pts/ptmx ownership */
2042 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
2043 r = userns_lchown(p, 0, 0);
2044 if (r < 0)
2045 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2046
f2d88580
LP
2047 return 0;
2048}
2049
e58a1277 2050static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
2051 _cleanup_umask_ mode_t u;
2052 const char *to;
e58a1277 2053 int r;
e58a1277
LP
2054
2055 assert(dest);
e58a1277
LP
2056
2057 u = umask(0000);
2058
de40a303
LP
2059 if (!console)
2060 return 0;
2061
03cfe0d5 2062 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
2063 if (r < 0)
2064 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 2065
a258bf26
LP
2066 /* We need to bind mount the right tty to /dev/console since
2067 * ptys can only exist on pts file systems. To have something
81f5049b 2068 * to bind mount things on we create a empty regular file. */
a258bf26 2069
03cfe0d5 2070 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
2071 r = touch(to);
2072 if (r < 0)
2073 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 2074
60e76d48 2075 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
2076}
2077
8e5430c4
LP
2078static int setup_keyring(void) {
2079 key_serial_t keyring;
2080
2081 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
2082 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
2083 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
2084 * these system calls let's make sure we don't leak anything into the container. */
2085
2086 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2087 if (keyring == -1) {
2088 if (errno == ENOSYS)
2089 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2090 else if (IN_SET(errno, EACCES, EPERM))
2091 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2092 else
2093 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2094 }
2095
2096 return 0;
2097}
2098
1e4f1671 2099static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
2100 _cleanup_(unlink_and_freep) char *from = NULL;
2101 _cleanup_free_ char *fifo = NULL;
2102 _cleanup_close_ int fd = -1;
7fd1b19b 2103 _cleanup_umask_ mode_t u;
9ec5a93c
LP
2104 const char *to;
2105 int r;
e58a1277 2106
e58a1277 2107 assert(kmsg_socket >= 0);
a258bf26 2108
e58a1277 2109 u = umask(0000);
a258bf26 2110
9ec5a93c
LP
2111 /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
2112 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2113 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2114 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2115
2116 r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
2117 if (r < 0)
2118 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 2119
9ec5a93c 2120 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 2121 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
2122
2123 from = TAKE_PTR(fifo);
2124 to = "/proc/kmsg";
2125
60e76d48
ZJS
2126 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
2127 if (r < 0)
2128 return r;
e58a1277 2129
669fc4e5 2130 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
2131 if (fd < 0)
2132 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2133
9ec5a93c 2134 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 2135 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
2136 if (r < 0)
2137 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 2138
25ea79fe 2139 return 0;
88213476
LP
2140}
2141
1c4baffc 2142static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
2143 union in_addr_union *exposed = userdata;
2144
2145 assert(rtnl);
2146 assert(m);
2147 assert(exposed);
2148
7a8f6325 2149 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
2150 return 0;
2151}
2152
3a74cea5 2153static int setup_hostname(void) {
c818eef1 2154 int r;
3a74cea5 2155
0c582db0 2156 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
2157 return 0;
2158
c818eef1
LP
2159 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2160 if (r < 0)
2161 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 2162
7027ff61 2163 return 0;
3a74cea5
LP
2164}
2165
57fb9fb5 2166static int setup_journal(const char *directory) {
0f5e1382 2167 _cleanup_free_ char *d = NULL;
b2238e38
LP
2168 const char *dirname, *p, *q;
2169 sd_id128_t this_id;
2170 char id[33];
8054d749 2171 bool try;
57fb9fb5
LP
2172 int r;
2173
df9a75e4
LP
2174 /* Don't link journals in ephemeral mode */
2175 if (arg_ephemeral)
2176 return 0;
2177
8054d749
LP
2178 if (arg_link_journal == LINK_NO)
2179 return 0;
2180
2181 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2182
4d680aee 2183 r = sd_id128_get_machine(&this_id);
f647962d
MS
2184 if (r < 0)
2185 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2186
e01ff70a 2187 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2188 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 2189 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 2190 if (try)
4d680aee 2191 return 0;
df9a75e4 2192 return -EEXIST;
4d680aee
ZJS
2193 }
2194
369ca6da
ZJS
2195 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2196 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2197 if (r < 0) {
2198 bool ignore = r == -EROFS && try;
2199 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2200 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2201 return ignore ? 0 : r;
2202 }
2203 }
03cfe0d5 2204
e01ff70a
MS
2205 (void) sd_id128_to_string(arg_uuid, id);
2206
03cfe0d5
LP
2207 p = strjoina("/var/log/journal/", id);
2208 q = prefix_roota(directory, p);
27407a01 2209
e1873695 2210 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2211 if (try)
2212 return 0;
27407a01 2213
baaa35ad
ZJS
2214 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2215 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2216 }
2217
e1873695 2218 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2219 if (try)
2220 return 0;
57fb9fb5 2221
baaa35ad
ZJS
2222 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2223 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2224 }
2225
2226 r = readlink_and_make_absolute(p, &d);
2227 if (r >= 0) {
3742095b 2228 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2229 path_equal(d, q)) {
2230
03cfe0d5 2231 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2232 if (r < 0)
709f6e46 2233 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2234 return 0;
57fb9fb5
LP
2235 }
2236
4a62c710
MS
2237 if (unlink(p) < 0)
2238 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2239 } else if (r == -EINVAL) {
2240
2241 if (arg_link_journal == LINK_GUEST &&
2242 rmdir(p) < 0) {
2243
27407a01
ZJS
2244 if (errno == ENOTDIR) {
2245 log_error("%s already exists and is neither a symlink nor a directory", p);
2246 return r;
4314d33f
MS
2247 } else
2248 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2249 }
4314d33f
MS
2250 } else if (r != -ENOENT)
2251 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2252
2253 if (arg_link_journal == LINK_GUEST) {
2254
2255 if (symlink(q, p) < 0) {
8054d749 2256 if (try) {
56f64d95 2257 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2258 return 0;
4314d33f
MS
2259 } else
2260 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2261 }
2262
03cfe0d5 2263 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2264 if (r < 0)
709f6e46 2265 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2266 return 0;
57fb9fb5
LP
2267 }
2268
2269 if (arg_link_journal == LINK_HOST) {
ccddd104 2270 /* don't create parents here — if the host doesn't have
574edc90 2271 * permanent journal set up, don't force it here */
ba8e6c4d 2272
dae8b82e
ZJS
2273 r = mkdir_errno_wrapper(p, 0755);
2274 if (r < 0 && r != -EEXIST) {
8054d749 2275 if (try) {
dae8b82e 2276 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2277 return 0;
4314d33f 2278 } else
dae8b82e 2279 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2280 }
2281
27407a01
ZJS
2282 } else if (access(p, F_OK) < 0)
2283 return 0;
57fb9fb5 2284
cdb2b9d0
LP
2285 if (dir_is_empty(q) == 0)
2286 log_warning("%s is not empty, proceeding anyway.", q);
2287
03cfe0d5 2288 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2289 if (r < 0)
2290 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2291
60e76d48
ZJS
2292 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2293 if (r < 0)
4a62c710 2294 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2295
27407a01 2296 return 0;
57fb9fb5
LP
2297}
2298
de40a303
LP
2299static int drop_capabilities(uid_t uid) {
2300 CapabilityQuintet q;
2301
2302 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2303 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2304 * arg_caps_retain. */
2305
2306 if (capability_quintet_is_set(&arg_full_capabilities)) {
2307 q = arg_full_capabilities;
2308
2309 if (q.bounding == (uint64_t) -1)
2310 q.bounding = uid == 0 ? arg_caps_retain : 0;
2311
2312 if (q.effective == (uint64_t) -1)
2313 q.effective = uid == 0 ? q.bounding : 0;
2314
2315 if (q.inheritable == (uint64_t) -1)
2316 q.inheritable = uid == 0 ? q.bounding : 0;
2317
2318 if (q.permitted == (uint64_t) -1)
2319 q.permitted = uid == 0 ? q.bounding : 0;
2320
2321 if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
2322 q.ambient = 0;
2323 } else
2324 q = (CapabilityQuintet) {
2325 .bounding = arg_caps_retain,
2326 .effective = uid == 0 ? arg_caps_retain : 0,
2327 .inheritable = uid == 0 ? arg_caps_retain : 0,
2328 .permitted = uid == 0 ? arg_caps_retain : 0,
2329 .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
2330 };
2331
2332 return capability_quintet_enforce(&q);
88213476
LP
2333}
2334
db999e0f
LP
2335static int reset_audit_loginuid(void) {
2336 _cleanup_free_ char *p = NULL;
2337 int r;
2338
0c582db0 2339 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2340 return 0;
2341
2342 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2343 if (r == -ENOENT)
db999e0f 2344 return 0;
f647962d
MS
2345 if (r < 0)
2346 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2347
2348 /* Already reset? */
2349 if (streq(p, "4294967295"))
2350 return 0;
2351
57512c89 2352 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2353 if (r < 0) {
10a87006
LP
2354 log_error_errno(r,
2355 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2356 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2357 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2358 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2359 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2360
db999e0f 2361 sleep(5);
77b6e194 2362 }
db999e0f
LP
2363
2364 return 0;
77b6e194
LP
2365}
2366
785890ac
LP
2367static int setup_propagate(const char *root) {
2368 const char *p, *q;
709f6e46 2369 int r;
785890ac
LP
2370
2371 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2372 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2373 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2374 (void) mkdir_p(p, 0600);
2375
709f6e46
MS
2376 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2377 if (r < 0)
2378 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 2379
709f6e46
MS
2380 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2381 if (r < 0)
2382 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 2383
709f6e46
MS
2384 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2385 if (r < 0)
2386 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 2387
03cfe0d5 2388 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
2389 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2390 if (r < 0)
2391 return r;
785890ac 2392
60e76d48
ZJS
2393 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2394 if (r < 0)
2395 return r;
785890ac 2396
19caffac
AC
2397 /* machined will MS_MOVE into that directory, and that's only
2398 * supported for non-shared mounts. */
60e76d48 2399 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2400}
2401
317feb4d 2402static int setup_machine_id(const char *directory) {
691675ba
LP
2403 const char *etc_machine_id;
2404 sd_id128_t id;
3bbaff3e 2405 int r;
e01ff70a 2406
317feb4d
LP
2407 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2408 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2409 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2410 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2411 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2412 * container behaves nicely). */
2413
e01ff70a
MS
2414 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2415
691675ba 2416 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2417 if (r < 0) {
2418 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2419 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2420
317feb4d
LP
2421 if (sd_id128_is_null(arg_uuid)) {
2422 r = sd_id128_randomize(&arg_uuid);
2423 if (r < 0)
2424 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2425 }
2426 } else {
baaa35ad
ZJS
2427 if (sd_id128_is_null(id))
2428 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2429 "Machine ID in container image is zero, refusing.");
e01ff70a 2430
317feb4d
LP
2431 arg_uuid = id;
2432 }
691675ba 2433
e01ff70a
MS
2434 return 0;
2435}
2436
7336138e
LP
2437static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2438 int r;
2439
2440 assert(directory);
2441
0de7acce 2442 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2443 return 0;
2444
2445 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2446 if (r == -EOPNOTSUPP)
2447 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2448 if (r == -EBADE)
2449 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2450 if (r < 0)
2451 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2452 if (r == 0)
2453 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2454 else
2455 log_debug("Patched directory tree to match UID/GID range.");
2456
2457 return r;
2458}
2459
113cea80 2460/*
6d416b9c
LS
2461 * Return values:
2462 * < 0 : wait_for_terminate() failed to get the state of the
2463 * container, the container was terminated by a signal, or
2464 * failed for an unknown reason. No change is made to the
2465 * container argument.
2466 * > 0 : The program executed in the container terminated with an
2467 * error. The exit code of the program executed in the
919699ec
LP
2468 * container is returned. The container argument has been set
2469 * to CONTAINER_TERMINATED.
6d416b9c
LS
2470 * 0 : The container is being rebooted, has been shut down or exited
2471 * successfully. The container argument has been set to either
2472 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2473 *
6d416b9c
LS
2474 * That is, success is indicated by a return value of zero, and an
2475 * error is indicated by a non-zero value.
113cea80
DH
2476 */
2477static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2478 siginfo_t status;
919699ec 2479 int r;
113cea80
DH
2480
2481 r = wait_for_terminate(pid, &status);
f647962d
MS
2482 if (r < 0)
2483 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2484
2485 switch (status.si_code) {
fddbb89c 2486
113cea80 2487 case CLD_EXITED:
b5a2179b 2488 if (status.si_status == 0)
919699ec 2489 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2490 else
919699ec 2491 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2492
919699ec
LP
2493 *container = CONTAINER_TERMINATED;
2494 return status.si_status;
113cea80
DH
2495
2496 case CLD_KILLED:
2497 if (status.si_status == SIGINT) {
919699ec 2498 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2499 *container = CONTAINER_TERMINATED;
919699ec
LP
2500 return 0;
2501
113cea80 2502 } else if (status.si_status == SIGHUP) {
919699ec 2503 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2504 *container = CONTAINER_REBOOTED;
919699ec 2505 return 0;
113cea80 2506 }
919699ec 2507
4831981d 2508 _fallthrough_;
113cea80 2509 case CLD_DUMPED:
baaa35ad
ZJS
2510 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2511 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2512
2513 default:
baaa35ad
ZJS
2514 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2515 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2516 }
113cea80
DH
2517}
2518
023fb90b
LP
2519static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2520 pid_t pid;
2521
4a0b58c4 2522 pid = PTR_TO_PID(userdata);
023fb90b 2523 if (pid > 0) {
c6c8f6e2 2524 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2525 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2526 sd_event_source_set_userdata(s, NULL);
2527 return 0;
2528 }
2529 }
2530
2531 sd_event_exit(sd_event_source_get_event(s), 0);
2532 return 0;
2533}
2534
6916b164 2535static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2536 pid_t pid;
2537
2538 assert(s);
2539 assert(ssi);
2540
2541 pid = PTR_TO_PID(userdata);
2542
6916b164
AU
2543 for (;;) {
2544 siginfo_t si = {};
abdb9b08 2545
6916b164
AU
2546 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2547 return log_error_errno(errno, "Failed to waitid(): %m");
2548 if (si.si_pid == 0) /* No pending children. */
2549 break;
abdb9b08 2550 if (si.si_pid == pid) {
6916b164
AU
2551 /* The main process we care for has exited. Return from
2552 * signal handler but leave the zombie. */
2553 sd_event_exit(sd_event_source_get_event(s), 0);
2554 break;
2555 }
abdb9b08 2556
6916b164
AU
2557 /* Reap all other children. */
2558 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2559 }
2560
2561 return 0;
2562}
2563
abdb9b08
LP
2564static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2565 pid_t pid;
2566
2567 assert(m);
2568
2569 pid = PTR_TO_PID(userdata);
2570
2571 if (arg_kill_signal > 0) {
2572 log_info("Container termination requested. Attempting to halt container.");
2573 (void) kill(pid, arg_kill_signal);
2574 } else {
2575 log_info("Container termination requested. Exiting.");
2576 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2577 }
2578
2579 return 0;
2580}
2581
ec16945e 2582static int determine_names(void) {
1b9cebf6 2583 int r;
ec16945e 2584
c1521918
LP
2585 if (arg_template && !arg_directory && arg_machine) {
2586
2587 /* If --template= was specified then we should not
2588 * search for a machine, but instead create a new one
2589 * in /var/lib/machine. */
2590
605405c6 2591 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
2592 if (!arg_directory)
2593 return log_oom();
2594 }
2595
ec16945e 2596 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2597 if (arg_machine) {
2598 _cleanup_(image_unrefp) Image *i = NULL;
2599
5ef46e5f 2600 r = image_find(IMAGE_MACHINE, arg_machine, &i);
3a6ce860
LP
2601 if (r == -ENOENT)
2602 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2603 if (r < 0)
2604 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 2605
eb38edce 2606 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2607 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2608 else
0f03c2a4 2609 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2610 if (r < 0)
0f3be6ca 2611 return log_oom();
1b9cebf6 2612
aee327b8
LP
2613 if (!arg_ephemeral)
2614 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2615 } else {
2616 r = safe_getcwd(&arg_directory);
2617 if (r < 0)
2618 return log_error_errno(r, "Failed to determine current directory: %m");
2619 }
ec16945e 2620
0f3be6ca 2621 if (!arg_directory && !arg_image) {
1b9cebf6 2622 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2623 return -EINVAL;
2624 }
2625 }
2626
2627 if (!arg_machine) {
b9ba4dab
LP
2628 if (arg_directory && path_equal(arg_directory, "/"))
2629 arg_machine = gethostname_malloc();
4827ab48
LP
2630 else {
2631 if (arg_image) {
2632 char *e;
2633
2634 arg_machine = strdup(basename(arg_image));
2635
2636 /* Truncate suffix if there is one */
2637 e = endswith(arg_machine, ".raw");
2638 if (e)
2639 *e = 0;
2640 } else
2641 arg_machine = strdup(basename(arg_directory));
2642 }
ec16945e
LP
2643 if (!arg_machine)
2644 return log_oom();
2645
ae691c1d 2646 hostname_cleanup(arg_machine);
ec16945e
LP
2647 if (!machine_name_is_valid(arg_machine)) {
2648 log_error("Failed to determine machine name automatically, please use -M.");
2649 return -EINVAL;
2650 }
b9ba4dab
LP
2651
2652 if (arg_ephemeral) {
2653 char *b;
2654
2655 /* Add a random suffix when this is an
2656 * ephemeral machine, so that we can run many
2657 * instances at once without manually having
2658 * to specify -M each time. */
2659
2660 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2661 return log_oom();
2662
2663 free(arg_machine);
2664 arg_machine = b;
2665 }
ec16945e
LP
2666 }
2667
2668 return 0;
2669}
2670
/* Resolves the path stored in *p via chase_symlinks() and replaces *p with the resolved path.
 *
 * A NULL *p is left untouched and 0 is returned. On failure the error is logged, *p is left as it
 * was, and the negative error is returned. On success the (non-negative) return value of
 * chase_symlinks() is passed through. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        /* Frees the old string and takes ownership of the resolved one. */
        free_and_replace(*p, resolved);

        /* r might be an fd here in case we ever use CHASE_OPEN in flags */
        return r;
}
2687
03cfe0d5 2688static int determine_uid_shift(const char *directory) {
6dac160c
LP
2689 int r;
2690
0de7acce 2691 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2692 arg_uid_shift = 0;
6dac160c 2693 return 0;
03cfe0d5 2694 }
6dac160c
LP
2695
2696 if (arg_uid_shift == UID_INVALID) {
2697 struct stat st;
2698
03cfe0d5 2699 r = stat(directory, &st);
6dac160c 2700 if (r < 0)
03cfe0d5 2701 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2702
2703 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2704
baaa35ad
ZJS
2705 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
2706 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2707 "UID and GID base of %s don't match.", directory);
6dac160c
LP
2708
2709 arg_uid_range = UINT32_C(0x10000);
2710 }
2711
baaa35ad
ZJS
2712 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
2713 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2714 "UID base too high for UID range.");
6dac160c 2715
6dac160c
LP
2716 return 0;
2717}
2718
de40a303
LP
2719static unsigned long effective_clone_ns_flags(void) {
2720 unsigned long flags = arg_clone_ns_flags;
2721
2722 if (arg_private_network)
2723 flags |= CLONE_NEWNET;
2724 if (arg_use_cgns)
2725 flags |= CLONE_NEWCGROUP;
2726 if (arg_userns_mode != USER_NAMESPACE_NO)
2727 flags |= CLONE_NEWUSER;
2728
2729 return flags;
2730}
2731
2732static int patch_sysctl(void) {
2733
2734 /* This table is inspired by runc's sysctl() function */
2735 static const struct {
2736 const char *key;
2737 bool prefix;
2738 unsigned long clone_flags;
2739 } safe_sysctl[] = {
2740 { "kernel.hostname", false, CLONE_NEWUTS },
2741 { "kernel.domainname", false, CLONE_NEWUTS },
2742 { "kernel.msgmax", false, CLONE_NEWIPC },
2743 { "kernel.msgmnb", false, CLONE_NEWIPC },
2744 { "kernel.msgmni", false, CLONE_NEWIPC },
2745 { "kernel.sem", false, CLONE_NEWIPC },
2746 { "kernel.shmall", false, CLONE_NEWIPC },
2747 { "kernel.shmmax", false, CLONE_NEWIPC },
2748 { "kernel.shmmni", false, CLONE_NEWIPC },
2749 { "fs.mqueue.", true, CLONE_NEWIPC },
2750 { "net.", true, CLONE_NEWNET },
2751 };
2752
2753 unsigned long flags;
2754 char **k, **v;
2755 int r;
2756
2757 flags = effective_clone_ns_flags();
2758
2759 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
2760 bool good = false;
2761 size_t i;
2762
2763 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
2764
2765 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
2766 continue;
2767
2768 if (safe_sysctl[i].prefix)
2769 good = startswith(*k, safe_sysctl[i].key);
2770 else
2771 good = streq(*k, safe_sysctl[i].key);
2772
2773 if (good)
2774 break;
2775 }
2776
2777 if (!good) {
2778 log_error("Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
2779 return -EPERM;
2780 }
2781
2782 r = sysctl_write(*k, *v);
2783 if (r < 0)
2784 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
2785 }
2786
2787 return 0;
2788}
2789
03cfe0d5
LP
2790static int inner_child(
2791 Barrier *barrier,
2792 const char *directory,
2793 bool secondary,
2794 int kmsg_socket,
2795 int rtnl_socket,
f757855e 2796 FDSet *fds) {
69c79d3c 2797
03cfe0d5 2798 _cleanup_free_ char *home = NULL;
e01ff70a 2799 char as_uuid[37];
88614c8a 2800 size_t n_env = 1;
03cfe0d5 2801 const char *envp[] = {
0c300adf 2802 "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 2803 NULL, /* container */
03cfe0d5
LP
2804 NULL, /* TERM */
2805 NULL, /* HOME */
2806 NULL, /* USER */
2807 NULL, /* LOGNAME */
2808 NULL, /* container_uuid */
2809 NULL, /* LISTEN_FDS */
2810 NULL, /* LISTEN_PID */
9c1e04d0 2811 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2812 NULL
2813 };
1a68e1e5 2814 const char *exec_target;
2371271c 2815 _cleanup_strv_free_ char **env_use = NULL;
de40a303 2816 int r, which_failed;
88213476 2817
b37469d7
LP
2818 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
2819 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
2820 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
2821 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
2822 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
2823 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
2824 * namespace.
2825 *
2826 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
2827 * unshare(). See below. */
2828
03cfe0d5
LP
2829 assert(barrier);
2830 assert(directory);
2831 assert(kmsg_socket >= 0);
88213476 2832
de40a303
LP
2833 log_debug("Inner child is initializing.");
2834
0de7acce 2835 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2836 /* Tell the parent, that it now can write the UID map. */
2837 (void) barrier_place(barrier); /* #1 */
7027ff61 2838
03cfe0d5 2839 /* Wait until the parent wrote the UID map */
baaa35ad
ZJS
2840 if (!barrier_place_and_sync(barrier)) /* #2 */
2841 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2842 "Parent died too early");
88213476
LP
2843 }
2844
6d66bd3b
EV
2845 r = reset_uid_gid();
2846 if (r < 0)
2847 return log_error_errno(r, "Couldn't become new root: %m");
2848
0de7acce 2849 r = mount_all(NULL,
4f086aab 2850 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 2851 arg_uid_shift,
0de7acce 2852 arg_selinux_apifs_context);
03cfe0d5
LP
2853 if (r < 0)
2854 return r;
2855
04413780
ZJS
2856 if (!arg_network_namespace_path && arg_private_network) {
2857 r = unshare(CLONE_NEWNET);
2858 if (r < 0)
2859 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
2860
2861 /* Tell the parent that it can setup network interfaces. */
2862 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
2863 }
2864
4f086aab 2865 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2866 if (r < 0)
2867 return r;
2868
03cfe0d5
LP
2869 /* Wait until we are cgroup-ified, so that we
2870 * can mount the right cgroup path writable */
baaa35ad
ZJS
2871 if (!barrier_place_and_sync(barrier)) /* #4 */
2872 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2873 "Parent died too early");
88213476 2874
489fae52 2875 if (arg_use_cgns) {
0996ef00
CB
2876 r = unshare(CLONE_NEWCGROUP);
2877 if (r < 0)
04413780 2878 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
2879 r = mount_cgroups(
2880 "",
2881 arg_unified_cgroup_hierarchy,
2882 arg_userns_mode != USER_NAMESPACE_NO,
2883 arg_uid_shift,
2884 arg_uid_range,
5a8ff0e6 2885 arg_selinux_apifs_context,
ada54120 2886 true);
0996ef00
CB
2887 if (r < 0)
2888 return r;
2889 } else {
2890 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2891 if (r < 0)
2892 return r;
2893 }
ec16945e 2894
1e4f1671 2895 r = setup_boot_id();
03cfe0d5
LP
2896 if (r < 0)
2897 return r;
ec16945e 2898
1e4f1671 2899 r = setup_kmsg(kmsg_socket);
03cfe0d5
LP
2900 if (r < 0)
2901 return r;
2902 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2903
de40a303
LP
2904 r = mount_custom(
2905 "/",
2906 arg_custom_mounts,
2907 arg_n_custom_mounts,
2908 false,
2909 0,
2910 0,
2911 arg_selinux_apifs_context,
2912 true);
2913 if (r < 0)
2914 return r;
2915
03cfe0d5
LP
2916 if (setsid() < 0)
2917 return log_error_errno(errno, "setsid() failed: %m");
2918
2919 if (arg_private_network)
2920 loopback_setup();
2921
7a8f6325
LP
2922 if (arg_expose_ports) {
2923 r = expose_port_send_rtnl(rtnl_socket);
2924 if (r < 0)
2925 return r;
2926 rtnl_socket = safe_close(rtnl_socket);
2927 }
03cfe0d5 2928
de40a303
LP
2929 r = patch_sysctl();
2930 if (r < 0)
2931 return r;
2932
81f345df
LP
2933 if (arg_oom_score_adjust_set) {
2934 r = set_oom_score_adjust(arg_oom_score_adjust);
2935 if (r < 0)
2936 return log_error_errno(r, "Failed to adjust OOM score: %m");
2937 }
2938
d107bb7d
LP
2939 if (arg_cpuset)
2940 if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
2941 return log_error_errno(errno, "Failed to set CPU affinity: %m");
2942
c818eef1 2943 (void) setup_hostname();
03cfe0d5 2944
050f7277 2945 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
2946 r = safe_personality(arg_personality);
2947 if (r < 0)
2948 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 2949 } else if (secondary) {
21022b9d
LP
2950 r = safe_personality(PER_LINUX32);
2951 if (r < 0)
2952 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
2953 }
2954
de40a303
LP
2955 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
2956 if (r < 0)
2957 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
2958
2959#if HAVE_SECCOMP
2960 if (arg_seccomp) {
2961
2962 if (is_seccomp_available()) {
2963
2964 r = seccomp_load(arg_seccomp);
2965 if (IN_SET(r, -EPERM, -EACCES))
2966 return log_error_errno(r, "Failed to install seccomp filter: %m");
2967 if (r < 0)
2968 log_debug_errno(r, "Failed to install seccomp filter: %m");
2969 }
2970 } else
2971#endif
2972 {
2973 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
2974 if (r < 0)
2975 return r;
2976 }
2977
349cc4a5 2978#if HAVE_SELINUX
03cfe0d5 2979 if (arg_selinux_context)
2ed96880 2980 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2981 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2982#endif
2983
de40a303
LP
2984 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
2985 * if we need to later on. */
2986 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
2987 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
2988
2989 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
2990 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids);
2991 else
2992 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2993 if (r < 0)
2994 return r;
2995
de40a303
LP
2996 r = drop_capabilities(getuid());
2997 if (r < 0)
2998 return log_error_errno(r, "Dropping capabilities failed: %m");
2999
66edd963
LP
3000 if (arg_no_new_privileges)
3001 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3002 return log_error_errno(errno, "Failed to disable new privileges: %m");
3003
6aadfa4c
ILG
3004 /* LXC sets container=lxc, so follow the scheme here */
3005 envp[n_env++] = strjoina("container=", arg_container_service_name);
3006
03cfe0d5
LP
3007 envp[n_env] = strv_find_prefix(environ, "TERM=");
3008 if (envp[n_env])
313cefa1 3009 n_env++;
03cfe0d5 3010
de40a303
LP
3011 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3012 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3013 return log_oom();
3014
3015 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3016 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
3017 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3018 return log_oom();
03cfe0d5 3019
3bbaff3e 3020 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 3021
691675ba 3022 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 3023 return log_oom();
03cfe0d5
LP
3024
3025 if (fdset_size(fds) > 0) {
3026 r = fdset_cloexec(fds, false);
3027 if (r < 0)
3028 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3029
3030 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3031 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3032 return log_oom();
3033 }
9c1e04d0
AP
3034 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3035 return log_oom();
03cfe0d5 3036
2371271c
TG
3037 env_use = strv_env_merge(2, envp, arg_setenv);
3038 if (!env_use)
3039 return log_oom();
03cfe0d5
LP
3040
3041 /* Let the parent know that we are ready and
3042 * wait until the parent is ready with the
3043 * setup, too... */
baaa35ad
ZJS
3044 if (!barrier_place_and_sync(barrier)) /* #5 */
3045 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3046 "Parent died too early");
03cfe0d5 3047
5f932eb9
LP
3048 if (arg_chdir)
3049 if (chdir(arg_chdir) < 0)
3050 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3051
7732f92b 3052 if (arg_start_mode == START_PID2) {
75bf701f 3053 r = stub_pid1(arg_uuid);
7732f92b
LP
3054 if (r < 0)
3055 return r;
3056 }
3057
de40a303
LP
3058 log_debug("Inner child completed, invoking payload.");
3059
8ca082b4
LP
3060 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3061 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3062 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 3063 log_close();
8ca082b4
LP
3064 log_set_open_when_needed(true);
3065
03cfe0d5
LP
3066 (void) fdset_close_others(fds);
3067
7732f92b 3068 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
3069 char **a;
3070 size_t m;
3071
3072 /* Automatically search for the init system */
3073
75f32f04
ZJS
3074 m = strv_length(arg_parameters);
3075 a = newa(char*, m + 2);
3076 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3077 a[1 + m] = NULL;
03cfe0d5 3078
ced58da7 3079 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
3080 execve(a[0], a, env_use);
3081
ced58da7 3082 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
3083 execve(a[0], a, env_use);
3084
ced58da7 3085 a[0] = (char*) "/sbin/init";
03cfe0d5 3086 execve(a[0], a, env_use);
ced58da7
LP
3087
3088 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 3089 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
3090 const char *dollar_path;
3091
1a68e1e5 3092 exec_target = arg_parameters[0];
b6b180b7
LP
3093
3094 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3095 * binary. */
3096 dollar_path = strv_env_get(env_use, "PATH");
3097 if (dollar_path) {
3098 if (putenv((char*) dollar_path) != 0)
3099 return log_error_errno(errno, "Failed to update $PATH: %m");
3100 }
3101
f757855e 3102 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 3103 } else {
5f932eb9 3104 if (!arg_chdir)
d929b0f9
ZJS
3105 /* If we cannot change the directory, we'll end up in /, that is expected. */
3106 (void) chdir(home ?: "/root");
5f932eb9 3107
03cfe0d5
LP
3108 execle("/bin/bash", "-bash", NULL, env_use);
3109 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
3110
3111 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
3112 }
3113
8ca082b4 3114 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
3115}
3116
9c1e04d0 3117static int setup_sd_notify_child(void) {
271f518f 3118 _cleanup_close_ int fd = -1;
9c1e04d0 3119 union sockaddr_union sa = {
44ed5214
LP
3120 .un.sun_family = AF_UNIX,
3121 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
3122 };
3123 int r;
3124
3125 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3126 if (fd < 0)
3127 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3128
3129 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 3130 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 3131
9c1e04d0 3132 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 3133 if (r < 0)
44ed5214 3134 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 3135
adc7d9f0 3136 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 3137 if (r < 0)
adc7d9f0 3138 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 3139
2ff48e98 3140 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 3141 if (r < 0)
2ff48e98 3142 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 3143
271f518f 3144 return TAKE_FD(fd);
9c1e04d0
AP
3145}
3146
03cfe0d5
LP
3147static int outer_child(
3148 Barrier *barrier,
3149 const char *directory,
3150 const char *console,
2d845785 3151 DissectedImage *dissected_image,
03cfe0d5
LP
3152 bool secondary,
3153 int pid_socket,
e01ff70a 3154 int uuid_socket,
9c1e04d0 3155 int notify_socket,
03cfe0d5
LP
3156 int kmsg_socket,
3157 int rtnl_socket,
825d5287 3158 int uid_shift_socket,
8199d554 3159 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
3160 FDSet *fds,
3161 int netns_fd) {
03cfe0d5 3162
bf428efb 3163 _cleanup_close_ int fd = -1;
03cfe0d5
LP
3164 pid_t pid;
3165 ssize_t l;
de40a303 3166 int r;
03cfe0d5 3167
b37469d7
LP
3168 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
3169 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3170 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3171 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3172
03cfe0d5
LP
3173 assert(barrier);
3174 assert(directory);
03cfe0d5 3175 assert(pid_socket >= 0);
e01ff70a 3176 assert(uuid_socket >= 0);
9c1e04d0 3177 assert(notify_socket >= 0);
03cfe0d5
LP
3178 assert(kmsg_socket >= 0);
3179
de40a303
LP
3180 log_debug("Outer child is initializing.");
3181
03cfe0d5
LP
3182 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3183 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3184
de40a303 3185 if (arg_console_mode != CONSOLE_PIPE) {
2b33ab09 3186 int terminal;
03cfe0d5 3187
de40a303
LP
3188 assert(console);
3189
2b33ab09
LP
3190 terminal = open_terminal(console, O_RDWR);
3191 if (terminal < 0)
3192 return log_error_errno(terminal, "Failed to open console: %m");
03cfe0d5 3193
17cac366
LP
3194 /* Make sure we can continue logging to the original stderr, even if stderr points elsewhere now */
3195 r = log_dup_console();
3196 if (r < 0)
3197 return log_error_errno(r, "Failed to duplicate stderr: %m");
3198
2b33ab09
LP
3199 r = rearrange_stdio(terminal, terminal, terminal); /* invalidates 'terminal' on success and failure */
3200 if (r < 0)
3201 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
03cfe0d5
LP
3202 }
3203
3204 r = reset_audit_loginuid();
3205 if (r < 0)
3206 return r;
3207
3208 /* Mark everything as slave, so that we still
3209 * receive mounts from the real root, but don't
3210 * propagate mounts to the real root. */
60e76d48
ZJS
3211 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3212 if (r < 0)
3213 return r;
03cfe0d5 3214
2d845785 3215 if (dissected_image) {
2d3a5a73
LP
3216 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3217 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3218 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3219 * makes sure ESP partitions and userns are compatible. */
3220
3221 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
03bcb6d4
LP
3222 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
3223 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)|
3224 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785
LP
3225 if (r < 0)
3226 return r;
3227 }
03cfe0d5 3228
391567f4
LP
3229 r = determine_uid_shift(directory);
3230 if (r < 0)
3231 return r;
3232
0de7acce 3233 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 3234 /* Let the parent know which UID shift we read from the image */
825d5287
RM
3235 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3236 if (l < 0)
3237 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
3238 if (l != sizeof(arg_uid_shift))
3239 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3240 "Short write while sending UID shift.");
0e7ac751 3241
0de7acce 3242 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3243 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3244 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3245 * not it will pick a different one, and send it back to us. */
3246
3247 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3248 if (l < 0)
3249 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
3250 if (l != sizeof(arg_uid_shift))
3251 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3252 "Short read while receiving UID shift.");
0e7ac751
LP
3253 }
3254
ff6c6cc1
LP
3255 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3256 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3257 }
3258
e50cd82f
LP
3259 if (!dissected_image) {
3260 /* Turn directory into bind mount */
3261 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3262 if (r < 0)
3263 return r;
3264 }
7d0ecdd6
LP
3265
3266 r = setup_pivot_root(
3267 directory,
3268 arg_pivot_root_new,
3269 arg_pivot_root_old);
3270 if (r < 0)
3271 return r;
3272
3273 r = setup_volatile_mode(
3274 directory,
3275 arg_volatile_mode,
3276 arg_userns_mode != USER_NAMESPACE_NO,
3277 arg_uid_shift,
3278 arg_uid_range,
3279 arg_selinux_context);
3280 if (r < 0)
3281 return r;
3282
2d3a5a73
LP
3283 if (dissected_image) {
3284 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3285 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
3286 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
3287 if (r < 0)
3288 return r;
3289 }
3290
8199d554
LP
3291 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3292 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3293
3294 r = detect_unified_cgroup_hierarchy_from_image(directory);
3295 if (r < 0)
3296 return r;
3297
3298 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3299 if (l < 0)
3300 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
3301 if (l != sizeof(arg_unified_cgroup_hierarchy))
3302 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3303 "Short write while sending cgroup mode.");
8199d554
LP
3304
3305 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3306 }
3307
4ad14eff
LP
3308 /* Mark everything as shared so our mounts get propagated down. This is
3309 * required to make new bind mounts available in systemd services
3310 * inside the containter that create a new mount namespace.
3311 * See https://github.com/systemd/systemd/issues/3860
3312 * Further submounts (such as /dev) done after this will inherit the
13e785f7 3313 * shared propagation mode. */
4ad14eff
LP
3314 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3315 if (r < 0)
3316 return r;
3317
3318 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3319 if (r < 0)
3320 return r;
3321
03cfe0d5
LP
3322 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3323 if (r < 0)
3324 return r;
3325
e5a4bb0d 3326 if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
64e82c19 3327 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
03cfe0d5
LP
3328 if (r < 0)
3329 return log_error_errno(r, "Failed to make tree read-only: %m");
3330 }
3331
0de7acce 3332 r = mount_all(directory,
4f086aab 3333 arg_mount_settings,
0de7acce 3334 arg_uid_shift,
0de7acce 3335 arg_selinux_apifs_context);
03cfe0d5
LP
3336 if (r < 0)
3337 return r;
3338
07fa00f9
LP
3339 r = copy_devnodes(directory);
3340 if (r < 0)
03cfe0d5
LP
3341 return r;
3342
de40a303
LP
3343 r = make_extra_nodes(directory);
3344 if (r < 0)
3345 return r;
3346
3347 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3348 (void) make_inaccessible_nodes(directory, arg_uid_shift, arg_uid_shift);
03cfe0d5 3349
07fa00f9
LP
3350 r = setup_pts(directory);
3351 if (r < 0)
03cfe0d5
LP
3352 return r;
3353
3354 r = setup_propagate(directory);
3355 if (r < 0)
3356 return r;
3357
3358 r = setup_dev_console(directory, console);
3359 if (r < 0)
3360 return r;
3361
8e5430c4
LP
3362 r = setup_keyring();
3363 if (r < 0)
3364 return r;
3365
03cfe0d5
LP
3366 r = setup_timezone(directory);
3367 if (r < 0)
3368 return r;
3369
3370 r = setup_resolv_conf(directory);
3371 if (r < 0)
3372 return r;
3373
e01ff70a
MS
3374 r = setup_machine_id(directory);
3375 if (r < 0)
3376 return r;
3377
03cfe0d5
LP
3378 r = setup_journal(directory);
3379 if (r < 0)
3380 return r;
3381
0de7acce
LP
3382 r = mount_custom(
3383 directory,
3384 arg_custom_mounts,
3385 arg_n_custom_mounts,
3386 arg_userns_mode != USER_NAMESPACE_NO,
3387 arg_uid_shift,
3388 arg_uid_range,
de40a303
LP
3389 arg_selinux_apifs_context,
3390 false);
03cfe0d5
LP
3391 if (r < 0)
3392 return r;
3393
489fae52 3394 if (!arg_use_cgns) {
0996ef00
CB
3395 r = mount_cgroups(
3396 directory,
3397 arg_unified_cgroup_hierarchy,
3398 arg_userns_mode != USER_NAMESPACE_NO,
3399 arg_uid_shift,
3400 arg_uid_range,
5a8ff0e6 3401 arg_selinux_apifs_context,
ada54120 3402 false);
0996ef00
CB
3403 if (r < 0)
3404 return r;
3405 }
03cfe0d5
LP
3406
3407 r = mount_move_root(directory);
3408 if (r < 0)
3409 return log_error_errno(r, "Failed to move root directory: %m");
3410
9c1e04d0
AP
3411 fd = setup_sd_notify_child();
3412 if (fd < 0)
3413 return fd;
3414
03cfe0d5 3415 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3416 arg_clone_ns_flags |
8869a0b4 3417 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3418 if (pid < 0)
3419 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3420 if (pid == 0) {
3421 pid_socket = safe_close(pid_socket);
e01ff70a 3422 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3423 notify_socket = safe_close(notify_socket);
825d5287 3424 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3425
3426 /* The inner child has all namespaces that are
3427 * requested, so that we all are owned by the user if
3428 * user namespaces are turned on. */
3429
d7bea6b6
DP
3430 if (arg_network_namespace_path) {
3431 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3432 if (r < 0)
e2d39e54 3433 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
3434 }
3435
f757855e 3436 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
3437 if (r < 0)
3438 _exit(EXIT_FAILURE);
3439
3440 _exit(EXIT_SUCCESS);
3441 }
3442
3443 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3444 if (l < 0)
3445 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
3446 if (l != sizeof(pid))
3447 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3448 "Short write while sending PID.");
03cfe0d5 3449
e01ff70a
MS
3450 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3451 if (l < 0)
3452 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
3453 if (l != sizeof(arg_uuid))
3454 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3455 "Short write while sending machine ID.");
e01ff70a 3456
9c1e04d0
AP
3457 l = send_one_fd(notify_socket, fd, 0);
3458 if (l < 0)
3459 return log_error_errno(errno, "Failed to send notify fd: %m");
3460
03cfe0d5 3461 pid_socket = safe_close(pid_socket);
e01ff70a 3462 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3463 notify_socket = safe_close(notify_socket);
327e26d6
KN
3464 kmsg_socket = safe_close(kmsg_socket);
3465 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3466 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3467
3468 return 0;
3469}
3470
0e7ac751 3471static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 3472 bool tried_hashed = false;
0e7ac751
LP
3473 unsigned n_tries = 100;
3474 uid_t candidate;
3475 int r;
3476
3477 assert(shift);
3478 assert(ret_lock_file);
0de7acce 3479 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3480 assert(arg_uid_range == 0x10000U);
3481
3482 candidate = *shift;
3483
3484 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3485
3486 for (;;) {
fbd0b64f 3487 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 3488 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
3489
3490 if (--n_tries <= 0)
3491 return -EBUSY;
3492
87d5e4f2 3493 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
3494 goto next;
3495 if ((candidate & UINT32_C(0xFFFF)) != 0)
3496 goto next;
3497
3498 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3499 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3500 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3501 goto next;
3502 if (r < 0)
3503 return r;
3504
3505 /* Make some superficial checks whether the range is currently known in the user database */
3506 if (getpwuid(candidate))
3507 goto next;
3508 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3509 goto next;
3510 if (getgrgid(candidate))
3511 goto next;
3512 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3513 goto next;
3514
3515 *ret_lock_file = lf;
3516 lf = (struct LockFile) LOCK_FILE_INIT;
3517 *shift = candidate;
3518 return 0;
3519
3520 next:
d381c8a6
LP
3521 if (arg_machine && !tried_hashed) {
3522 /* Try to hash the base from the container name */
3523
3524 static const uint8_t hash_key[] = {
3525 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3526 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3527 };
3528
3529 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3530
3531 tried_hashed = true;
3532 } else
3533 random_bytes(&candidate, sizeof(candidate));
3534
87d5e4f2 3535 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
3536 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3537 }
3538}
3539
03cfe0d5 3540static int setup_uid_map(pid_t pid) {
fbd0b64f 3541 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
03cfe0d5
LP
3542 int r;
3543
3544 assert(pid > 1);
3545
3546 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3547 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
57512c89 3548 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3549 if (r < 0)
3550 return log_error_errno(r, "Failed to write UID map: %m");
3551
3552 /* We always assign the same UID and GID ranges */
3553 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
57512c89 3554 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3555 if (r < 0)
3556 return log_error_errno(r, "Failed to write GID map: %m");
3557
3558 return 0;
3559}
3560
9c1e04d0 3561static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3562 char buf[NOTIFY_BUFFER_MAX+1];
3563 char *p = NULL;
3564 struct iovec iovec = {
3565 .iov_base = buf,
3566 .iov_len = sizeof(buf)-1,
3567 };
3568 union {
3569 struct cmsghdr cmsghdr;
3570 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3571 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3572 } control = {};
3573 struct msghdr msghdr = {
3574 .msg_iov = &iovec,
3575 .msg_iovlen = 1,
3576 .msg_control = &control,
3577 .msg_controllen = sizeof(control),
3578 };
3579 struct cmsghdr *cmsg;
3580 struct ucred *ucred = NULL;
3581 ssize_t n;
3582 pid_t inner_child_pid;
3583 _cleanup_strv_free_ char **tags = NULL;
3584
3585 assert(userdata);
3586
3587 inner_child_pid = PTR_TO_PID(userdata);
3588
3589 if (revents != EPOLLIN) {
3590 log_warning("Got unexpected poll event for notify fd.");
3591 return 0;
3592 }
3593
3594 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3595 if (n < 0) {
3742095b 3596 if (IN_SET(errno, EAGAIN, EINTR))
9c1e04d0
AP
3597 return 0;
3598
3599 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3600 }
3601 cmsg_close_all(&msghdr);
3602
3603 CMSG_FOREACH(cmsg, &msghdr) {
3604 if (cmsg->cmsg_level == SOL_SOCKET &&
3605 cmsg->cmsg_type == SCM_CREDENTIALS &&
3606 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3607
3608 ucred = (struct ucred*) CMSG_DATA(cmsg);
3609 }
3610 }
3611
3612 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 3613 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
3614 return 0;
3615 }
3616
3617 if ((size_t) n >= sizeof(buf)) {
3618 log_warning("Received notify message exceeded maximum size. Ignoring.");
3619 return 0;
3620 }
3621
3622 buf[n] = 0;
3623 tags = strv_split(buf, "\n\r");
3624 if (!tags)
3625 return log_oom();
3626
3627 if (strv_find(tags, "READY=1"))
04f590a4 3628 (void) sd_notifyf(false, "READY=1\n");
9c1e04d0
AP
3629
3630 p = strv_find_startswith(tags, "STATUS=");
3631 if (p)
04f590a4 3632 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
9c1e04d0
AP
3633
3634 return 0;
3635}
3636
5773024d 3637static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 3638 int r;
9c1e04d0 3639
5773024d 3640 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
3641 if (r < 0)
3642 return log_error_errno(r, "Failed to allocate notify event source: %m");
3643
5773024d 3644 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
3645
3646 return 0;
3647}
3648
5d961407
LP
3649static int merge_settings(Settings *settings, const char *path) {
3650 int rl;
f757855e 3651
5d961407
LP
3652 assert(settings);
3653 assert(path);
f757855e 3654
5d961407
LP
3655 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3656 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 3657
7732f92b
LP
3658 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3659 settings->start_mode >= 0) {
3660 arg_start_mode = settings->start_mode;
130d3d22 3661 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
3662 }
3663
a2f577fc
JL
3664 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
3665 arg_ephemeral = settings->ephemeral;
3666
de40a303
LP
3667 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
3668 settings->root) {
3669
3670 if (!arg_settings_trusted)
3671 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
3672 else
3673 free_and_replace(arg_directory, settings->root);
3674 }
3675
b53ede69
PW
3676 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3677 settings->pivot_root_new) {
3678 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3679 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3680 }
3681
5f932eb9 3682 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
3683 settings->working_directory)
3684 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 3685
f757855e 3686 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
3687 settings->environment)
3688 strv_free_and_replace(arg_setenv, settings->environment);
f757855e 3689
de40a303
LP
3690 if ((arg_settings_mask & SETTING_USER) == 0) {
3691
3692 if (settings->user)
3693 free_and_replace(arg_user, settings->user);
3694
3695 if (uid_is_valid(settings->uid))
3696 arg_uid = settings->uid;
3697 if (gid_is_valid(settings->gid))
3698 arg_gid = settings->gid;
3699 if (settings->n_supplementary_gids > 0) {
3700 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
3701 arg_n_supplementary_gids = settings->n_supplementary_gids;
3702 }
3703 }
f757855e
LP
3704
3705 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
a3fc6b55 3706 uint64_t plus, minus;
f757855e 3707
de40a303
LP
3708 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
3709 * Settings structure */
3710
0e265674 3711 plus = settings->capability;
a3fc6b55
LP
3712 minus = settings->drop_capability;
3713
3714 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
3715 if (settings_private_network(settings))
3716 plus |= UINT64_C(1) << CAP_NET_ADMIN;
3717 else
3718 minus |= UINT64_C(1) << CAP_NET_ADMIN;
3719 }
0e265674
LP
3720
3721 if (!arg_settings_trusted && plus != 0) {
3722 if (settings->capability != 0)
5d961407 3723 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
0e265674 3724 } else
520e0d54 3725 arg_caps_retain |= plus;
f757855e 3726
a3fc6b55 3727 arg_caps_retain &= ~minus;
de40a303
LP
3728
3729 /* Copy the full capabilities over too */
3730 if (capability_quintet_is_set(&settings->full_capabilities)) {
3731 if (!arg_settings_trusted)
3732 log_warning("Ignoring capabilitiy settings, file %s is not trusted.", path);
3733 else
3734 arg_full_capabilities = settings->full_capabilities;
3735 }
f757855e
LP
3736 }
3737
3738 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3739 settings->kill_signal > 0)
3740 arg_kill_signal = settings->kill_signal;
3741
3742 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3743 settings->personality != PERSONALITY_INVALID)
3744 arg_personality = settings->personality;
3745
3746 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3747 !sd_id128_is_null(settings->machine_id)) {
3748
3749 if (!arg_settings_trusted)
5d961407 3750 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
3751 else
3752 arg_uuid = settings->machine_id;
3753 }
3754
3755 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3756 settings->read_only >= 0)
3757 arg_read_only = settings->read_only;
3758
3759 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3760 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3761 arg_volatile_mode = settings->volatile_mode;
3762
3763 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3764 settings->n_custom_mounts > 0) {
3765
3766 if (!arg_settings_trusted)
5d961407 3767 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
3768 else {
3769 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 3770 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 3771 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
3772 settings->n_custom_mounts = 0;
3773 }
3774 }
3775
3776 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3777 (settings->private_network >= 0 ||
3778 settings->network_veth >= 0 ||
3779 settings->network_bridge ||
22b28dfd 3780 settings->network_zone ||
f757855e
LP
3781 settings->network_interfaces ||
3782 settings->network_macvlan ||
f6d6bad1 3783 settings->network_ipvlan ||
de40a303
LP
3784 settings->network_veth_extra ||
3785 settings->network_namespace_path)) {
f757855e
LP
3786
3787 if (!arg_settings_trusted)
5d961407 3788 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 3789 else {
f6d6bad1 3790 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3791 arg_private_network = settings_private_network(settings);
3792
130d3d22
YW
3793 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3794 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3795 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3796 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 3797
1cc6c93a
YW
3798 free_and_replace(arg_network_bridge, settings->network_bridge);
3799 free_and_replace(arg_network_zone, settings->network_zone);
de40a303
LP
3800
3801 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
f757855e
LP
3802 }
3803 }
3804
3805 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3806 settings->expose_ports) {
3807
3808 if (!arg_settings_trusted)
5d961407 3809 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
3810 else {
3811 expose_port_free_all(arg_expose_ports);
1cc6c93a 3812 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
3813 }
3814 }
3815
0de7acce
LP
3816 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3817 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3818
3819 if (!arg_settings_trusted)
5d961407 3820 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
3821 else {
3822 arg_userns_mode = settings->userns_mode;
3823 arg_uid_shift = settings->uid_shift;
3824 arg_uid_range = settings->uid_range;
3825 arg_userns_chown = settings->userns_chown;
3826 }
3827 }
3828
9c1e04d0
AP
3829 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3830 arg_notify_ready = settings->notify_ready;
3831
960e4569
LP
3832 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3833
de40a303 3834 if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist))
5d961407 3835 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 3836 else {
130d3d22
YW
3837 strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3838 strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
960e4569 3839 }
de40a303
LP
3840
3841#if HAVE_SECCOMP
3842 if (!arg_settings_trusted && settings->seccomp)
3843 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
3844 else {
3845 seccomp_release(arg_seccomp);
3846 arg_seccomp = TAKE_PTR(settings->seccomp);
3847 }
3848#endif
960e4569
LP
3849 }
3850
bf428efb
LP
3851 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3852 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3853 continue;
3854
3855 if (!settings->rlimit[rl])
3856 continue;
3857
3858 if (!arg_settings_trusted) {
5d961407 3859 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
3860 continue;
3861 }
3862
3863 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3864 }
3865
3a9530e5
LP
3866 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3867 settings->hostname)
3868 free_and_replace(arg_hostname, settings->hostname);
3869
66edd963
LP
3870 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3871 settings->no_new_privileges >= 0)
3872 arg_no_new_privileges = settings->no_new_privileges;
3873
81f345df
LP
3874 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3875 settings->oom_score_adjust_set) {
3876
3877 if (!arg_settings_trusted)
5d961407 3878 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
3879 else {
3880 arg_oom_score_adjust = settings->oom_score_adjust;
3881 arg_oom_score_adjust_set = true;
3882 }
3883 }
3884
d107bb7d
LP
3885 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
3886 settings->cpuset) {
3887
3888 if (!arg_settings_trusted)
5d961407 3889 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d
LP
3890 else {
3891 if (arg_cpuset)
3892 CPU_FREE(arg_cpuset);
3893 arg_cpuset = TAKE_PTR(settings->cpuset);
3894 arg_cpuset_ncpus = settings->cpuset_ncpus;
3895 }
3896 }
3897
09d423e9
LP
3898 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
3899 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
3900 arg_resolv_conf = settings->resolv_conf;
3901
4e1d6aa9
LP
3902 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
3903 settings->link_journal != _LINK_JOURNAL_INVALID) {
3904
3905 if (!arg_settings_trusted)
3906 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
3907 else {
3908 arg_link_journal = settings->link_journal;
3909 arg_link_journal_try = settings->link_journal_try;
3910 }
3911 }
3912
1688841f
LP
3913 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
3914 settings->timezone != _TIMEZONE_MODE_INVALID)
3915 arg_timezone = settings->timezone;
3916
de40a303
LP
3917 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
3918 settings->slice) {
3919
3920 if (!arg_settings_trusted)
3921 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
3922 else
3923 free_and_replace(arg_slice, settings->slice);
3924 }
3925
3926 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
3927 settings->use_cgns >= 0) {
3928
3929 if (!arg_settings_trusted)
3930 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
3931 else
3932 arg_use_cgns = settings->use_cgns;
3933 }
3934
3935 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
3936 settings->clone_ns_flags != (unsigned long) -1) {
3937
3938 if (!arg_settings_trusted)
3939 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
3940 else
3941 arg_clone_ns_flags = settings->clone_ns_flags;
3942 }
3943
3944 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
3945 settings->console_mode >= 0) {
3946
3947 if (!arg_settings_trusted)
3948 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
3949 else
3950 arg_console_mode = settings->console_mode;
3951 }
3952
3953 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
3954 * don't consult arg_settings_mask for them. */
3955
3956 sd_bus_message_unref(arg_property_message);
3957 arg_property_message = TAKE_PTR(settings->properties);
3958
3959 arg_console_width = settings->console_width;
3960 arg_console_height = settings->console_height;
3961
b2645747 3962 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
de40a303
LP
3963 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
3964 arg_n_extra_nodes = settings->n_extra_nodes;
3965
f757855e
LP
3966 return 0;
3967}
3968
5d961407
LP
3969static int load_settings(void) {
3970 _cleanup_(settings_freep) Settings *settings = NULL;
3971 _cleanup_fclose_ FILE *f = NULL;
3972 _cleanup_free_ char *p = NULL;
3973 const char *fn, *i;
3974 int r;
3975
de40a303
LP
3976 if (arg_oci_bundle)
3977 return 0;
3978
5d961407
LP
3979 /* If all settings are masked, there's no point in looking for
3980 * the settings file */
3981 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3982 return 0;
3983
3984 fn = strjoina(arg_machine, ".nspawn");
3985
3986 /* We first look in the admin's directories in /etc and /run */
3987 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3988 _cleanup_free_ char *j = NULL;
3989
3990 j = strjoin(i, "/", fn);
3991 if (!j)
3992 return log_oom();
3993
3994 f = fopen(j, "re");
3995 if (f) {
3996 p = TAKE_PTR(j);
3997
3998 /* By default, we trust configuration from /etc and /run */
3999 if (arg_settings_trusted < 0)
4000 arg_settings_trusted = true;
4001
4002 break;
4003 }
4004
4005 if (errno != ENOENT)
4006 return log_error_errno(errno, "Failed to open %s: %m", j);
4007 }
4008
4009 if (!f) {
4010 /* After that, let's look for a file next to the
4011 * actual image we shall boot. */
4012
4013 if (arg_image) {
4014 p = file_in_same_dir(arg_image, fn);
4015 if (!p)
4016 return log_oom();
4017 } else if (arg_directory) {
4018 p = file_in_same_dir(arg_directory, fn);
4019 if (!p)
4020 return log_oom();
4021 }
4022
4023 if (p) {
4024 f = fopen(p, "re");
4025 if (!f && errno != ENOENT)
4026 return log_error_errno(errno, "Failed to open %s: %m", p);
4027
4028 /* By default, we do not trust configuration from /var/lib/machines */
4029 if (arg_settings_trusted < 0)
4030 arg_settings_trusted = false;
4031 }
4032 }
4033
4034 if (!f)
4035 return 0;
4036
4037 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4038
4039 r = settings_load(f, p, &settings);
4040 if (r < 0)
4041 return r;
4042
4043 return merge_settings(settings, p);
4044}
4045
de40a303
LP
4046static int load_oci_bundle(void) {
4047 _cleanup_(settings_freep) Settings *settings = NULL;
4048 int r;
4049
4050 if (!arg_oci_bundle)
4051 return 0;
4052
4053 /* By default let's trust OCI bundles */
4054 if (arg_settings_trusted < 0)
4055 arg_settings_trusted = true;
4056
4057 r = oci_load(NULL, arg_oci_bundle, &settings);
4058 if (r < 0)
4059 return r;
4060
4061 return merge_settings(settings, arg_oci_bundle);
4062}
4063
44dbef90 4064static int run_container(int master,
b0067625 4065 const char* console,
2d845785 4066 DissectedImage *dissected_image,
b0067625
ZJS
4067 bool secondary,
4068 FDSet *fds,
4069 char veth_name[IFNAMSIZ], bool *veth_created,
4070 union in_addr_union *exposed,
4071 pid_t *pid, int *ret) {
4072
4073 static const struct sigaction sa = {
4074 .sa_handler = nop_signal_handler,
e28c7cd0 4075 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
4076 };
4077
8e766630 4078 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
4079 _cleanup_close_ int etc_passwd_lock = -1;
4080 _cleanup_close_pair_ int
4081 kmsg_socket_pair[2] = { -1, -1 },
4082 rtnl_socket_pair[2] = { -1, -1 },
4083 pid_socket_pair[2] = { -1, -1 },
4084 uuid_socket_pair[2] = { -1, -1 },
4085 notify_socket_pair[2] = { -1, -1 },
8199d554
LP
4086 uid_shift_socket_pair[2] = { -1, -1 },
4087 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4088
b0067625
ZJS
4089 _cleanup_close_ int notify_socket= -1;
4090 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 4091 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
4092 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4093 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4094 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 4095 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
b0067625 4096 ContainerStatus container_status = 0;
b0067625
ZJS
4097 int ifi = 0, r;
4098 ssize_t l;
4099 sigset_t mask_chld;
d7bea6b6 4100 _cleanup_close_ int netns_fd = -1;
b0067625
ZJS
4101
4102 assert_se(sigemptyset(&mask_chld) == 0);
4103 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4104
4105 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4106 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4107 * check with getpwuid() if the specific user already exists. Note that /etc might be
4108 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4109 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4110 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4111 * really ours. */
4112
4113 etc_passwd_lock = take_etc_passwd_lock(NULL);
4114 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4115 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4116 }
4117
4118 r = barrier_create(&barrier);
4119 if (r < 0)
4120 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4121
4122 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4123 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4124
4125 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4126 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4127
4128 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4129 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4130
4131 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4132 return log_error_errno(errno, "Failed to create id socket pair: %m");
4133
4134 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4135 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4136
4137 if (arg_userns_mode != USER_NAMESPACE_NO)
4138 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4139 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4140
8199d554
LP
4141 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4142 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4143 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4144
b0067625
ZJS
4145 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4146 * parent's blocking calls and give it a chance to call wait() and terminate. */
4147 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4148 if (r < 0)
4149 return log_error_errno(errno, "Failed to change the signal mask: %m");
4150
4151 r = sigaction(SIGCHLD, &sa, NULL);
4152 if (r < 0)
4153 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4154
d7bea6b6
DP
4155 if (arg_network_namespace_path) {
4156 netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4157 if (netns_fd < 0)
4158 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4159
4160 r = fd_is_network_ns(netns_fd);
6619ad88
LP
4161 if (r == -EUCLEAN)
4162 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4163 else if (r < 0)
d7bea6b6 4164 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
6619ad88
LP
4165 else if (r == 0) {
4166 log_error("Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
4167 return -EINVAL;
4168 }
4169 }
4170
b0067625
ZJS
4171 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4172 if (*pid < 0)
4173 return log_error_errno(errno, "clone() failed%s: %m",
4174 errno == EINVAL ?
4175 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4176
4177 if (*pid == 0) {
4178 /* The outer child only has a file system namespace. */
4179 barrier_set_role(&barrier, BARRIER_CHILD);
4180
4181 master = safe_close(master);
4182
4183 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4184 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4185 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4186 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4187 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
4188 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 4189 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
4190
4191 (void) reset_all_signal_handlers();
4192 (void) reset_signal_mask();
4193
4194 r = outer_child(&barrier,
4195 arg_directory,
4196 console,
2d845785 4197 dissected_image,
b0067625
ZJS
4198 secondary,
4199 pid_socket_pair[1],
4200 uuid_socket_pair[1],
4201 notify_socket_pair[1],
4202 kmsg_socket_pair[1],
4203 rtnl_socket_pair[1],
4204 uid_shift_socket_pair[1],
8199d554 4205 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6
DP
4206 fds,
4207 netns_fd);
b0067625
ZJS
4208 if (r < 0)
4209 _exit(EXIT_FAILURE);
4210
4211 _exit(EXIT_SUCCESS);
4212 }
4213
4214 barrier_set_role(&barrier, BARRIER_PARENT);
4215
e4077ff6 4216 fdset_close(fds);
b0067625
ZJS
4217
4218 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4219 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4220 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4221 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4222 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
4223 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 4224 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
4225
4226 if (arg_userns_mode != USER_NAMESPACE_NO) {
4227 /* The child just let us know the UID shift it might have read from the image. */
4228 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4229 if (l < 0)
4230 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
4231 if (l != sizeof arg_uid_shift) {
4232 log_error("Short read while reading UID shift.");
4233 return -EIO;
4234 }
4235
4236 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4237 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4238 * image, but if that's already in use, pick a new one, and report back to the child,
4239 * which one we now picked. */
4240
4241 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4242 if (r < 0)
4243 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4244
4245 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4246 if (l < 0)
4247 return log_error_errno(errno, "Failed to send UID shift: %m");
4248 if (l != sizeof arg_uid_shift) {
4249 log_error("Short write while writing UID shift.");
4250 return -EIO;
4251 }
4252 }
4253 }
4254
8199d554
LP
4255 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4256 /* The child let us know the support cgroup mode it might have read from the image. */
4257 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4258 if (l < 0)
4259 return log_error_errno(errno, "Failed to read cgroup mode: %m");
4260 if (l != sizeof(arg_unified_cgroup_hierarchy)) {
bd897e72
ZJS
4261 log_error("Short read while reading cgroup mode (%zu bytes).%s",
4262 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
4263 return -EIO;
4264 }
4265 }
4266
b0067625 4267 /* Wait for the outer child. */
d2e0ac3d
LP
4268 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4269 if (r < 0)
4270 return r;
4271 if (r != EXIT_SUCCESS)
4272 return -EIO;
b0067625
ZJS
4273
4274 /* And now retrieve the PID of the inner child. */
4275 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4276 if (l < 0)
4277 return log_error_errno(errno, "Failed to read inner child PID: %m");
4278 if (l != sizeof *pid) {
4279 log_error("Short read while reading inner child PID.");
4280 return -EIO;
4281 }
4282
4283 /* We also retrieve container UUID in case it was generated by outer child */
4284 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4285 if (l < 0)
4286 return log_error_errno(errno, "Failed to read container machine ID: %m");
4287 if (l != sizeof(arg_uuid)) {
4288 log_error("Short read while reading container machined ID.");
4289 return -EIO;
4290 }
4291
4292 /* We also retrieve the socket used for notifications generated by outer child */
4293 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4294 if (notify_socket < 0)
4295 return log_error_errno(notify_socket,
4296 "Failed to receive notification socket from the outer child: %m");
4297
4298 log_debug("Init process invoked as PID "PID_FMT, *pid);
4299
4300 if (arg_userns_mode != USER_NAMESPACE_NO) {
4301 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4302 log_error("Child died too early.");
4303 return -ESRCH;
4304 }
4305
4306 r = setup_uid_map(*pid);
4307 if (r < 0)
4308 return r;
4309
4310 (void) barrier_place(&barrier); /* #2 */
4311 }
4312
4313 if (arg_private_network) {
75116558
PS
4314 if (!arg_network_namespace_path) {
4315 /* Wait until the child has unshared its network namespace. */
4316 if (!barrier_place_and_sync(&barrier)) { /* #3 */
4317 log_error("Child died too early");
4318 return -ESRCH;
4319 }
4320 }
4321
b0067625
ZJS
4322 r = move_network_interfaces(*pid, arg_network_interfaces);
4323 if (r < 0)
4324 return r;
4325
4326 if (arg_network_veth) {
4327 r = setup_veth(arg_machine, *pid, veth_name,
4328 arg_network_bridge || arg_network_zone);
4329 if (r < 0)
4330 return r;
4331 else if (r > 0)
4332 ifi = r;
4333
4334 if (arg_network_bridge) {
4335 /* Add the interface to a bridge */
4336 r = setup_bridge(veth_name, arg_network_bridge, false);
4337 if (r < 0)
4338 return r;
4339 if (r > 0)
4340 ifi = r;
4341 } else if (arg_network_zone) {
4342 /* Add the interface to a bridge, possibly creating it */
4343 r = setup_bridge(veth_name, arg_network_zone, true);
4344 if (r < 0)
4345 return r;
4346 if (r > 0)
4347 ifi = r;
4348 }
4349 }
4350
4351 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4352 if (r < 0)
4353 return r;
4354
4355 /* We created the primary and extra veth links now; let's remember this, so that we know to
4356 remove them later on. Note that we don't bother with removing veth links that were created
4357 here when their setup failed half-way, because in that case the kernel should be able to
4358 remove them on its own, since they cannot be referenced by anything yet. */
4359 *veth_created = true;
4360
4361 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4362 if (r < 0)
4363 return r;
4364
4365 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4366 if (r < 0)
4367 return r;
4368 }
4369
abdb9b08
LP
4370 if (arg_register || !arg_keep_unit) {
4371 r = sd_bus_default_system(&bus);
4372 if (r < 0)
4373 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
4374
4375 r = sd_bus_set_close_on_exit(bus, false);
4376 if (r < 0)
4377 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
4378 }
4379
4380 if (!arg_keep_unit) {
4381 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4382 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4383 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4384
75152a4d
LP
4385 r = sd_bus_match_signal_async(
4386 bus,
4387 NULL,
4388 "org.freedesktop.systemd1",
4389 NULL,
4390 "org.freedesktop.systemd1.Scope",
4391 "RequestStop",
4392 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 4393 if (r < 0)
75152a4d 4394 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
4395 }
4396
b0067625
ZJS
4397 if (arg_register) {
4398 r = register_machine(
abdb9b08 4399 bus,
b0067625
ZJS
4400 arg_machine,
4401 *pid,
4402 arg_directory,
4403 arg_uuid,
4404 ifi,
4405 arg_slice,
4406 arg_custom_mounts, arg_n_custom_mounts,
4407 arg_kill_signal,
4408 arg_property,
de40a303 4409 arg_property_message,
b0067625
ZJS
4410 arg_keep_unit,
4411 arg_container_service_name);
4412 if (r < 0)
4413 return r;
abdb9b08 4414
cd2dfc6f
LP
4415 } else if (!arg_keep_unit) {
4416 r = allocate_scope(
abdb9b08 4417 bus,
cd2dfc6f
LP
4418 arg_machine,
4419 *pid,
4420 arg_slice,
4421 arg_custom_mounts, arg_n_custom_mounts,
4422 arg_kill_signal,
de40a303
LP
4423 arg_property,
4424 arg_property_message);
cd2dfc6f
LP
4425 if (r < 0)
4426 return r;
4427
4428 } else if (arg_slice || arg_property)
4429 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 4430
27da7ef0 4431 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
b0067625
ZJS
4432 if (r < 0)
4433 return r;
4434
27da7ef0 4435 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
720f0a2f
LP
4436 if (r < 0)
4437 return r;
b0067625 4438
de54e02d 4439 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
4440 if (r < 0)
4441 return r;
4442
4443 /* Notify the child that the parent is ready with all
4444 * its setup (including cgroup-ification), and that
4445 * the child can now hand over control to the code to
4446 * run inside the container. */
75116558 4447 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
4448
4449 /* Block SIGCHLD here, before notifying child.
4450 * process_pty() will handle it with the other signals. */
4451 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4452
4453 /* Reset signal to default */
4454 r = default_signals(SIGCHLD, -1);
4455 if (r < 0)
4456 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4457
4458 r = sd_event_new(&event);
4459 if (r < 0)
4460 return log_error_errno(r, "Failed to get default event source: %m");
4461
8fd010bb
LP
4462 (void) sd_event_set_watchdog(event, true);
4463
abdb9b08
LP
4464 if (bus) {
4465 r = sd_bus_attach_event(bus, event, 0);
4466 if (r < 0)
4467 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4468 }
4469
5773024d 4470 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
4471 if (r < 0)
4472 return r;
4473
4474 /* Let the child know that we are ready and wait that the child is completely ready now. */
75116558 4475 if (!barrier_place_and_sync(&barrier)) { /* #5 */
b0067625
ZJS
4476 log_error("Child died too early.");
4477 return -ESRCH;
4478 }
4479
4480 /* At this point we have made use of the UID we picked, and thus nss-mymachines
4481 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4482 etc_passwd_lock = safe_close(etc_passwd_lock);
4483
04f590a4
LP
4484 (void) sd_notifyf(false,
4485 "STATUS=Container running.\n"
4486 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
b0067625 4487 if (!arg_notify_ready)
919f5ae0 4488 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
4489
4490 if (arg_kill_signal > 0) {
4491 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
4492 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4493 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
4494 } else {
4495 /* Immediately exit */
919f5ae0
LP
4496 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4497 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
4498 }
4499
6916b164 4500 /* Exit when the child exits */
919f5ae0 4501 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
4502
4503 if (arg_expose_ports) {
4504 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4505 if (r < 0)
4506 return r;
4507
4508 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4509 }
4510
4511 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4512
de40a303
LP
4513 if (IN_SET(arg_console_mode, CONSOLE_INTERACTIVE, CONSOLE_READ_ONLY)) {
4514 assert(master >= 0);
4515
4516 r = pty_forward_new(event, master,
4517 PTY_FORWARD_IGNORE_VHANGUP | (arg_console_mode == CONSOLE_READ_ONLY ? PTY_FORWARD_READ_ONLY : 0),
4518 &forward);
4519 if (r < 0)
4520 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4521
4522 if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
4523 (void) pty_forward_set_width_height(forward, arg_console_width, arg_console_height);
4524 }
b0067625
ZJS
4525
4526 r = sd_event_loop(event);
4527 if (r < 0)
4528 return log_error_errno(r, "Failed to run event loop: %m");
4529
de40a303
LP
4530 if (forward) {
4531 char last_char = 0;
b0067625 4532
de40a303
LP
4533 (void) pty_forward_get_last_char(forward, &last_char);
4534 forward = pty_forward_free(forward);
b0067625 4535
de40a303
LP
4536 if (!arg_quiet && last_char != '\n')
4537 putc('\n', stdout);
4538 }
b0067625
ZJS
4539
4540 /* Kill if it is not dead yet anyway */
1d78fea2
LP
4541 if (bus) {
4542 if (arg_register)
4543 terminate_machine(bus, arg_machine);
4544 else if (!arg_keep_unit)
4545 terminate_scope(bus, arg_machine);
4546 }
b0067625
ZJS
4547
4548 /* Normally redundant, but better safe than sorry */
c67b0082 4549 (void) kill(*pid, SIGKILL);
b0067625
ZJS
4550
4551 r = wait_for_container(*pid, &container_status);
4552 *pid = 0;
4553
4554 if (r < 0)
4555 /* We failed to wait for the container, or the container exited abnormally. */
4556 return r;
4557 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
4558 /* r > 0 → The container exited with a non-zero status.
4559 * As a special case, we need to replace 133 with a different value,
4560 * because 133 is special-cased in the service file to reboot the container.
4561 * otherwise → The container exited with zero status and a reboot was not requested.
4562 */
2a49b612 4563 if (r == EXIT_FORCE_RESTART)
27e29a1e 4564 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 4565 *ret = r;
b0067625
ZJS
4566 return 0; /* finito */
4567 }
4568
4569 /* CONTAINER_REBOOTED, loop again */
4570
4571 if (arg_keep_unit) {
4572 /* Special handling if we are running as a service: instead of simply
4573 * restarting the machine we want to restart the entire service, so let's
4574 * inform systemd about this with the special exit code 133. The service
4575 * file uses RestartForceExitStatus=133 so that this results in a full
4576 * nspawn restart. This is necessary since we might have cgroup parameters
4577 * set we want to have flushed out. */
2a49b612
ZJS
4578 *ret = EXIT_FORCE_RESTART;
4579 return 0; /* finito */
b0067625
ZJS
4580 }
4581
4582 expose_port_flush(arg_expose_ports, exposed);
4583
4584 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4585 *veth_created = false;
4586 return 1; /* loop again */
4587}
4588
bf428efb 4589static int initialize_rlimits(void) {
bf428efb
LP
4590 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4591 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4592 * container execution environments. */
4593
4594 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4595 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4596 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4597 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4598 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4599 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4600 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4601 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4602 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4603 [RLIMIT_NICE] = { 0, 0 },
4604 [RLIMIT_NOFILE] = { 1024, 4096 },
4605 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4606 [RLIMIT_RTPRIO] = { 0, 0 },
4607 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4608 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4609
4610 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4611 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
4612 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
4613 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
4614 * that PID 1 changes a number of other resource limits during early initialization which is why we
4615 * don't read the other limits from PID 1 but prefer the static table above. */
4616 };
4617
4618 int rl;
4619
4620 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
4621 /* Let's only fill in what the user hasn't explicitly configured anyway */
4622 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4623 const struct rlimit *v;
4624 struct rlimit buffer;
4625
4626 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4627 /* For these two let's read the limits off PID 1. See above for an explanation. */
4628
4629 if (prlimit(1, rl, NULL, &buffer) < 0)
4630 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4631
4632 v = &buffer;
4633 } else
4634 v = kernel_defaults + rl;
4635
4636 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4637 if (!arg_rlimit[rl])
4638 return log_oom();
4639 }
4640
4641 if (DEBUG_LOGGING) {
4642 _cleanup_free_ char *k = NULL;
4643
4644 (void) rlimit_format(arg_rlimit[rl], &k);
4645 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4646 }
4647 }
4648
4649 return 0;
4650}
4651
44dbef90 4652static int run(int argc, char *argv[]) {
2d845785
LP
4653 _cleanup_free_ char *console = NULL;
4654 _cleanup_close_ int master = -1;
03cfe0d5 4655 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 4656 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 4657 char veth_name[IFNAMSIZ] = "";
17cbb288 4658 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 4659 pid_t pid = 0;
03cfe0d5 4660 union in_addr_union exposed = {};
8e766630 4661 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
de40a303 4662 bool veth_created = false, remove_tmprootdir = false;
c67b0082 4663 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 4664 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
4665 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4666 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
03cfe0d5
LP
4667
4668 log_parse_environment();
4669 log_open();
415fc41c 4670
03cfe0d5
LP
4671 r = parse_argv(argc, argv);
4672 if (r <= 0)
4673 goto finish;
4674
fba868fa
LP
4675 r = must_be_root();
4676 if (r < 0)
03cfe0d5 4677 goto finish;
fba868fa 4678
bf428efb
LP
4679 r = initialize_rlimits();
4680 if (r < 0)
4681 goto finish;
4682
de40a303
LP
4683 r = load_oci_bundle();
4684 if (r < 0)
4685 goto finish;
4686
f757855e
LP
4687 r = determine_names();
4688 if (r < 0)
4689 goto finish;
4690
4691 r = load_settings();
4692 if (r < 0)
4693 goto finish;
4694
5eee8290
LP
4695 r = cg_unified_flush();
4696 if (r < 0) {
4697 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
4698 goto finish;
4699 }
4700
f757855e
LP
4701 r = verify_arguments();
4702 if (r < 0)
4703 goto finish;
03cfe0d5 4704
8199d554
LP
4705 r = detect_unified_cgroup_hierarchy_from_environment();
4706 if (r < 0)
4707 goto finish;
4708
2949ff26
LP
4709 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
4710 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
4711 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
4712 (void) ignore_signals(SIGPIPE, -1);
4713
03cfe0d5
LP
4714 n_fd_passed = sd_listen_fds(false);
4715 if (n_fd_passed > 0) {
4716 r = fdset_new_listen_fds(&fds, false);
4717 if (r < 0) {
4718 log_error_errno(r, "Failed to collect file descriptors: %m");
4719 goto finish;
4720 }
4721 }
4722
83e803a9
ZJS
4723 /* The "default" umask. This is appropriate for most file and directory
4724 * operations performed by nspawn, and is the umask that will be used for
4725 * the child. Functions like copy_devnodes() change the umask temporarily. */
4726 umask(0022);
4727
03cfe0d5
LP
4728 if (arg_directory) {
4729 assert(!arg_image);
4730
4731 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4732 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4733 r = -EINVAL;
4734 goto finish;
4735 }
4736
4737 if (arg_ephemeral) {
4738 _cleanup_free_ char *np = NULL;
4739
8d4aa2bb 4740 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
4741 if (r < 0)
4742 goto finish;
4743
03cfe0d5
LP
4744 /* If the specified path is a mount point we
4745 * generate the new snapshot immediately
4746 * inside it under a random name. However if
4747 * the specified is not a mount point we
4748 * create the new snapshot in the parent
4749 * directory, just next to it. */
e1873695 4750 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
4751 if (r < 0) {
4752 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4753 goto finish;
4754 }
4755 if (r > 0)
770b5ce4 4756 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4757 else
770b5ce4 4758 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 4759 if (r < 0) {
0f3be6ca 4760 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
4761 goto finish;
4762 }
4763
4764 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4765 if (r < 0) {
4766 log_error_errno(r, "Failed to lock %s: %m", np);
4767 goto finish;
4768 }
4769
17cbb288
LP
4770 r = btrfs_subvol_snapshot(arg_directory, np,
4771 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4772 BTRFS_SNAPSHOT_FALLBACK_COPY |
4773 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4774 BTRFS_SNAPSHOT_RECURSIVE |
4775 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
4776 if (r < 0) {
4777 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4778 goto finish;
ec16945e
LP
4779 }
4780
1cc6c93a 4781 free_and_replace(arg_directory, np);
ec16945e 4782
17cbb288 4783 remove_directory = true;
30535c16
LP
4784
4785 } else {
cb638b5e 4786 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
4787 if (r < 0)
4788 goto finish;
4789
30535c16
LP
4790 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4791 if (r == -EBUSY) {
4792 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4793 goto finish;
4794 }
4795 if (r < 0) {
4796 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 4797 goto finish;
30535c16
LP
4798 }
4799
4800 if (arg_template) {
8d4aa2bb 4801 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
4802 if (r < 0)
4803 goto finish;
4804
17cbb288
LP
4805 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4806 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4807 BTRFS_SNAPSHOT_FALLBACK_COPY |
4808 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4809 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4810 BTRFS_SNAPSHOT_RECURSIVE |
4811 BTRFS_SNAPSHOT_QUOTA);
ff6c6cc1
LP
4812 if (r == -EEXIST)
4813 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4814 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4815 else if (r < 0) {
83521414 4816 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 4817 goto finish;
ff6c6cc1
LP
4818 } else
4819 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4820 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 4821 }
ec16945e
LP
4822 }
4823
7732f92b 4824 if (arg_start_mode == START_BOOT) {
a5201ed6 4825 const char *p;
c9fe05e0 4826
a5201ed6
LP
4827 if (arg_pivot_root_new)
4828 p = prefix_roota(arg_directory, arg_pivot_root_new);
4829 else
4830 p = arg_directory;
c9fe05e0
AR
4831
4832 if (path_is_os_tree(p) <= 0) {
4833 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 4834 r = -EINVAL;
1b9e5b12
LP
4835 goto finish;
4836 }
4837 } else {
c9fe05e0
AR
4838 const char *p, *q;
4839
a5201ed6
LP
4840 if (arg_pivot_root_new)
4841 p = prefix_roota(arg_directory, arg_pivot_root_new);
4842 else
4843 p = arg_directory;
c9fe05e0
AR
4844
4845 q = strjoina(p, "/usr/");
1b9e5b12 4846
c9fe05e0
AR
4847 if (laccess(q, F_OK) < 0) {
4848 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 4849 r = -EINVAL;
1b9e5b12 4850 goto finish;
1b9e5b12
LP
4851 }
4852 }
ec16945e 4853
6b9132a9 4854 } else {
ec16945e
LP
4855 assert(arg_image);
4856 assert(!arg_template);
4857
8d4aa2bb 4858 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
4859 if (r < 0)
4860 goto finish;
4861
0f3be6ca
LP
4862 if (arg_ephemeral) {
4863 _cleanup_free_ char *np = NULL;
4864
4865 r = tempfn_random(arg_image, "machine.", &np);
4866 if (r < 0) {
4867 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4868 goto finish;
4869 }
4870
4871 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4872 if (r < 0) {
4873 r = log_error_errno(r, "Failed to create image lock: %m");
4874 goto finish;
4875 }
4876
adc6f43b 4877 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME);
0f3be6ca
LP
4878 if (r < 0) {
4879 r = log_error_errno(r, "Failed to copy image file: %m");
4880 goto finish;
4881 }
4882
1cc6c93a 4883 free_and_replace(arg_image, np);
0f3be6ca
LP
4884
4885 remove_image = true;
4886 } else {
4887 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4888 if (r == -EBUSY) {
4889 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4890 goto finish;
4891 }
4892 if (r < 0) {
4893 r = log_error_errno(r, "Failed to create image lock: %m");
4894 goto finish;
4895 }
4623e8e6 4896
78ebe980
LP
4897 if (!arg_root_hash) {
4898 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
4899 if (r < 0) {
4900 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
4901 goto finish;
4902 }
4903 }
30535c16
LP
4904 }
4905
c67b0082 4906 if (!mkdtemp(tmprootdir)) {
0f3be6ca 4907 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 4908 goto finish;
1b9e5b12 4909 }
6b9132a9 4910
c67b0082
LP
4911 remove_tmprootdir = true;
4912
4913 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
4914 if (!arg_directory) {
4915 r = log_oom();
4916 goto finish;
6b9132a9 4917 }
88213476 4918
2d845785
LP
4919 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
4920 if (r < 0) {
4921 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
4922 goto finish;
4923 }
1b9e5b12 4924
4526113f 4925 r = dissect_image_and_warn(
e0f9e7bd 4926 loop->fd,
4526113f 4927 arg_image,
e0f9e7bd
LP
4928 arg_root_hash, arg_root_hash_size,
4929 DISSECT_IMAGE_REQUIRE_ROOT,
4930 &dissected_image);
2d845785 4931 if (r == -ENOPKG) {
4526113f 4932 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
4933 log_notice("Note that the disk image needs to\n"
4934 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
4935 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
4936 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
4937 " d) or contain a file system without a partition table\n"
4938 "in order to be bootable with systemd-nspawn.");
1b9e5b12 4939 goto finish;
2d845785 4940 }
4526113f 4941 if (r < 0)
842f3b0f 4942 goto finish;
1b9e5b12 4943
4623e8e6
LP
4944 if (!arg_root_hash && dissected_image->can_verity)
4945 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
4946
4947 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
1b9e5b12
LP
4948 if (r < 0)
4949 goto finish;
0f3be6ca
LP
4950
4951 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
4952 if (remove_image && unlink(arg_image) >= 0)
4953 remove_image = false;
842f3b0f 4954 }
842f3b0f 4955
86c0dd4a 4956 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
4957 if (r < 0)
4958 goto finish;
4959
de40a303
LP
4960 if (arg_console_mode < 0)
4961 arg_console_mode =
4962 isatty(STDIN_FILENO) > 0 &&
4963 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
9c857b9d 4964
de40a303
LP
4965 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
4966 arg_quiet = true;
a258bf26 4967
de40a303
LP
4968 if (arg_console_mode != CONSOLE_PIPE) {
4969 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
4970 if (master < 0) {
4971 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4972 goto finish;
4973 }
68b02049 4974
de40a303
LP
4975 r = ptsname_malloc(master, &console);
4976 if (r < 0) {
4977 r = log_error_errno(r, "Failed to determine tty name: %m");
68b02049 4978 goto finish;
de40a303 4979 }
a258bf26 4980
de40a303
LP
4981 if (arg_selinux_apifs_context) {
4982 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4983 if (r < 0)
4984 goto finish;
4985 }
4986
4987 if (unlockpt(master) < 0) {
4988 r = log_error_errno(errno, "Failed to unlock tty: %m");
4989 goto finish;
4990 }
a258bf26
LP
4991 }
4992
9c857b9d
LP
4993 if (!arg_quiet)
4994 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4995 arg_machine, arg_image ?: arg_directory);
4996
72c0a2c2 4997 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4998
66edd963 4999 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
5000 r = log_error_errno(errno, "Failed to become subreaper: %m");
5001 goto finish;
5002 }
5003
d87be9b0 5004 for (;;) {
44dbef90
LP
5005 r = run_container(master,
5006 console,
5007 dissected_image,
5008 secondary,
5009 fds,
5010 veth_name, &veth_created,
5011 &exposed,
5012 &pid, &ret);
b0067625 5013 if (r <= 0)
d87be9b0 5014 break;
d87be9b0 5015 }
88213476
LP
5016
5017finish:
04f590a4
LP
5018 (void) sd_notify(false,
5019 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5020 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 5021
9444b1f2 5022 if (pid > 0)
c67b0082 5023 (void) kill(pid, SIGKILL);
88213476 5024
503546da 5025 /* Try to flush whatever is still queued in the pty */
6a0f896b 5026 if (master >= 0) {
1c876927 5027 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
5028 master = safe_close(master);
5029 }
5030
5031 if (pid > 0)
5032 (void) wait_for_terminate(pid, NULL);
503546da 5033
50ebcf6c
LP
5034 pager_close();
5035
17cbb288 5036 if (remove_directory && arg_directory) {
ec16945e
LP
5037 int k;
5038
17cbb288 5039 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 5040 if (k < 0)
17cbb288 5041 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
5042 }
5043
0f3be6ca
LP
5044 if (remove_image && arg_image) {
5045 if (unlink(arg_image) < 0)
5046 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5047 }
5048
c67b0082
LP
5049 if (remove_tmprootdir) {
5050 if (rmdir(tmprootdir) < 0)
5051 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5052 }
5053
785890ac
LP
5054 if (arg_machine) {
5055 const char *p;
5056
63c372cb 5057 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5058 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5059 }
5060
7a8f6325 5061 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
5062
5063 if (veth_created)
5064 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 5065 (void) remove_bridge(arg_network_zone);
f757855e 5066
f757855e
LP
5067 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5068 expose_port_free_all(arg_expose_ports);
bf428efb 5069 rlimit_free_all(arg_rlimit);
b2645747 5070 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
6d0b55c2 5071
44dbef90
LP
5072 if (r < 0)
5073 return r;
5074
5075 return ret;
88213476 5076}
44dbef90
LP
5077
5078DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);