1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
5 #include <linux/loop.h>
7 #include <selinux/selinux.h>
11 #include <sys/ioctl.h>
12 #include <sys/mount.h>
13 #include <sys/personality.h>
14 #include <sys/prctl.h>
15 #include <sys/types.h>
20 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
23 #include "sd-daemon.h"
26 #include "alloc-util.h"
27 #include "ether-addr-util.h"
29 #include "base-filesystem.h"
30 #include "blkid-util.h"
31 #include "btrfs-util.h"
33 #include "bus-error.h"
34 #include "bus-locator.h"
37 #include "capability-util.h"
38 #include "cgroup-util.h"
40 #include "common-signal.h"
42 #include "cpu-set-util.h"
43 #include "creds-util.h"
44 #include "dev-setup.h"
45 #include "discover-image.h"
46 #include "dissect-image.h"
52 #include "format-util.h"
55 #include "hexdecoct.h"
56 #include "hostname-setup.h"
57 #include "hostname-util.h"
58 #include "id128-util.h"
61 #include "loop-util.h"
62 #include "loopback-setup.h"
63 #include "machine-credential.h"
65 #include "main-func.h"
66 #include "missing_sched.h"
68 #include "mount-util.h"
69 #include "mountpoint-util.h"
70 #include "namespace-util.h"
71 #include "netlink-util.h"
72 #include "nspawn-bind-user.h"
73 #include "nspawn-cgroup.h"
74 #include "nspawn-def.h"
75 #include "nspawn-expose-ports.h"
76 #include "nspawn-mount.h"
77 #include "nspawn-network.h"
78 #include "nspawn-oci.h"
79 #include "nspawn-patch-uid.h"
80 #include "nspawn-register.h"
81 #include "nspawn-seccomp.h"
82 #include "nspawn-settings.h"
83 #include "nspawn-setuid.h"
84 #include "nspawn-stub-pid1.h"
85 #include "nspawn-util.h"
87 #include "nsresource.h"
88 #include "nulstr-util.h"
91 #include "parse-argument.h"
92 #include "parse-util.h"
93 #include "pretty-print.h"
94 #include "process-util.h"
96 #include "random-util.h"
97 #include "raw-clone.h"
98 #include "resolve-util.h"
99 #include "rlimit-util.h"
101 #include "seccomp-util.h"
102 #include "selinux-util.h"
103 #include "signal-util.h"
104 #include "socket-util.h"
105 #include "stat-util.h"
106 #include "stdio-util.h"
107 #include "string-table.h"
108 #include "string-util.h"
110 #include "sysctl-util.h"
111 #include "terminal-util.h"
112 #include "tmpfile-util.h"
113 #include "umask-util.h"
114 #include "unit-name.h"
115 #include "user-util.h"
118 /* Path of the notify socket inside the container; the container can use it to talk to nspawn using the sd_notify(3) protocol */
119 #define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
120 #define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"
122 #define EXIT_FORCE_RESTART 133
124 typedef enum ContainerStatus
{
125 CONTAINER_TERMINATED
,
/* Global configuration state of this program. Every arg_* variable is initialized to its default
 * here; many of them are overwritten later by the command line and environment parsers in this file. */
129 static char *arg_directory
= NULL
;
130 static char *arg_template
= NULL
;
131 static char *arg_chdir
= NULL
;
132 static char *arg_pivot_root_new
= NULL
;
133 static char *arg_pivot_root_old
= NULL
;
134 static char *arg_user
= NULL
;
135 static uid_t arg_uid
= UID_INVALID
;
136 static gid_t arg_gid
= GID_INVALID
;
137 static gid_t
* arg_supplementary_gids
= NULL
;
138 static size_t arg_n_supplementary_gids
= 0;
139 static sd_id128_t arg_uuid
= {};
140 static char *arg_machine
= NULL
; /* The name used by the host to refer to this */
141 static char *arg_hostname
= NULL
; /* The name the payload sees by default */
/* The two SELinux contexts are const pointers: parse_argv() assigns optarg to them directly rather
 * than storing a copy. */
142 static const char *arg_selinux_context
= NULL
;
143 static const char *arg_selinux_apifs_context
= NULL
;
144 static char *arg_slice
= NULL
;
145 static bool arg_private_network
= false;
146 static bool arg_read_only
= false;
147 static StartMode arg_start_mode
= START_PID1
;
148 static bool arg_ephemeral
= false;
149 static LinkJournal arg_link_journal
= LINK_AUTO
;
150 static bool arg_link_journal_try
= false;
/* Default capability bitmask: one bit (1ULL << CAP_*) is set for each capability listed below. */
151 static uint64_t arg_caps_retain
=
152 (1ULL << CAP_AUDIT_CONTROL
) |
153 (1ULL << CAP_AUDIT_WRITE
) |
154 (1ULL << CAP_CHOWN
) |
155 (1ULL << CAP_DAC_OVERRIDE
) |
156 (1ULL << CAP_DAC_READ_SEARCH
) |
157 (1ULL << CAP_FOWNER
) |
158 (1ULL << CAP_FSETID
) |
159 (1ULL << CAP_IPC_OWNER
) |
161 (1ULL << CAP_LEASE
) |
162 (1ULL << CAP_LINUX_IMMUTABLE
) |
163 (1ULL << CAP_MKNOD
) |
164 (1ULL << CAP_NET_BIND_SERVICE
) |
165 (1ULL << CAP_NET_BROADCAST
) |
166 (1ULL << CAP_NET_RAW
) |
167 (1ULL << CAP_SETFCAP
) |
168 (1ULL << CAP_SETGID
) |
169 (1ULL << CAP_SETPCAP
) |
170 (1ULL << CAP_SETUID
) |
171 (1ULL << CAP_SYS_ADMIN
) |
172 (1ULL << CAP_SYS_BOOT
) |
173 (1ULL << CAP_SYS_CHROOT
) |
174 (1ULL << CAP_SYS_NICE
) |
175 (1ULL << CAP_SYS_PTRACE
) |
176 (1ULL << CAP_SYS_RESOURCE
) |
177 (1ULL << CAP_SYS_TTY_CONFIG
);
178 static uint64_t arg_caps_ambient
= 0;
179 static CapabilityQuintet arg_full_capabilities
= CAPABILITY_QUINTET_NULL
;
180 static CustomMount
*arg_custom_mounts
= NULL
;
181 static size_t arg_n_custom_mounts
= 0;
182 static char **arg_setenv
= NULL
;
183 static bool arg_quiet
= false;
184 static bool arg_register
= true;
185 static bool arg_keep_unit
= false;
186 static char **arg_network_interfaces
= NULL
;
187 static char **arg_network_macvlan
= NULL
;
188 static char **arg_network_ipvlan
= NULL
;
189 static bool arg_network_veth
= false;
190 static char **arg_network_veth_extra
= NULL
;
191 static char *arg_network_bridge
= NULL
;
192 static char *arg_network_zone
= NULL
;
193 static char *arg_network_namespace_path
= NULL
;
/* NOTE(review): unlike every other variable in this list this one is not declared static, so it has
 * external linkage — confirm whether another translation unit relies on that before changing it. */
194 struct ether_addr arg_network_provided_mac
= {};
195 static PagerFlags arg_pager_flags
= 0;
196 static unsigned long arg_personality
= PERSONALITY_INVALID
;
197 static char *arg_image
= NULL
;
198 static char *arg_oci_bundle
= NULL
;
199 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
200 static ExposePort
*arg_expose_ports
= NULL
;
201 static char **arg_property
= NULL
;
202 static sd_bus_message
*arg_property_message
= NULL
;
203 static UserNamespaceMode arg_userns_mode
= USER_NAMESPACE_NO
;
204 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
205 static UserNamespaceOwnership arg_userns_ownership
= _USER_NAMESPACE_OWNERSHIP_INVALID
;
206 static int arg_kill_signal
= 0;
207 static CGroupUnified arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_UNKNOWN
;
/* Bitmask of SETTING_* flags; the parsers in this file OR a flag in whenever they set the
 * corresponding option explicitly. */
208 static SettingsMask arg_settings_mask
= 0;
209 static int arg_settings_trusted
= -1;
210 static char **arg_parameters
= NULL
;
211 static const char *arg_container_service_name
= "systemd-nspawn";
212 static bool arg_notify_ready
= false;
213 static bool arg_use_cgns
= true;
/* Namespace flags enabled by default; parse_share_ns_env() can clear individual bits. */
214 static unsigned long arg_clone_ns_flags
= CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
;
215 static MountSettingsMask arg_mount_settings
= MOUNT_APPLY_APIVFS_RO
|MOUNT_APPLY_TMPFS_TMP
;
216 static VeritySettings arg_verity_settings
= VERITY_SETTINGS_DEFAULT
;
217 static char **arg_syscall_allow_list
= NULL
;
218 static char **arg_syscall_deny_list
= NULL
;
220 static scmp_filter_ctx arg_seccomp
= NULL
;
222 static struct rlimit
*arg_rlimit
[_RLIMIT_MAX
] = {};
223 static bool arg_no_new_privileges
= false;
224 static int arg_oom_score_adjust
= 0;
225 static bool arg_oom_score_adjust_set
= false;
226 static CPUSet arg_cpu_set
= {};
227 static ResolvConfMode arg_resolv_conf
= RESOLV_CONF_AUTO
;
228 static TimezoneMode arg_timezone
= TIMEZONE_AUTO
;
229 static unsigned arg_console_width
= UINT_MAX
, arg_console_height
= UINT_MAX
;
230 static DeviceNode
* arg_extra_nodes
= NULL
;
231 static size_t arg_n_extra_nodes
= 0;
232 static char **arg_sysctl
= NULL
;
233 static ConsoleMode arg_console_mode
= _CONSOLE_MODE_INVALID
;
234 static MachineCredentialContext arg_credentials
= {};
235 static char **arg_bind_user
= NULL
;
236 static bool arg_suppress_sync
= false;
237 static char *arg_settings_filename
= NULL
;
238 static Architecture arg_architecture
= _ARCHITECTURE_INVALID
;
239 static ImagePolicy
*arg_image_policy
= NULL
;
240 static char *arg_background
= NULL
;
241 static bool arg_privileged
= false;
/* Pairs each resource-owning global with the function that releases it: freep for plain
 * allocations, strv_freep for string vectors, and type-specific helpers for the rest.
 * NOTE(review): presumably these handlers run on program exit — confirm against the definition of
 * STATIC_DESTRUCTOR_REGISTER, which is not visible here. */
243 STATIC_DESTRUCTOR_REGISTER(arg_directory
, freep
);
244 STATIC_DESTRUCTOR_REGISTER(arg_template
, freep
);
245 STATIC_DESTRUCTOR_REGISTER(arg_chdir
, freep
);
246 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new
, freep
);
247 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old
, freep
);
248 STATIC_DESTRUCTOR_REGISTER(arg_user
, freep
);
249 STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids
, freep
);
250 STATIC_DESTRUCTOR_REGISTER(arg_machine
, freep
);
251 STATIC_DESTRUCTOR_REGISTER(arg_hostname
, freep
);
252 STATIC_DESTRUCTOR_REGISTER(arg_slice
, freep
);
253 STATIC_DESTRUCTOR_REGISTER(arg_setenv
, strv_freep
);
254 STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces
, strv_freep
);
255 STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan
, strv_freep
);
256 STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan
, strv_freep
);
257 STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra
, strv_freep
);
258 STATIC_DESTRUCTOR_REGISTER(arg_network_bridge
, freep
);
259 STATIC_DESTRUCTOR_REGISTER(arg_network_zone
, freep
);
260 STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path
, freep
);
261 STATIC_DESTRUCTOR_REGISTER(arg_image
, freep
);
262 STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle
, freep
);
263 STATIC_DESTRUCTOR_REGISTER(arg_property
, strv_freep
);
264 STATIC_DESTRUCTOR_REGISTER(arg_property_message
, sd_bus_message_unrefp
);
265 STATIC_DESTRUCTOR_REGISTER(arg_parameters
, strv_freep
);
266 STATIC_DESTRUCTOR_REGISTER(arg_verity_settings
, verity_settings_done
);
267 STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list
, strv_freep
);
268 STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list
, strv_freep
);
270 STATIC_DESTRUCTOR_REGISTER(arg_seccomp
, seccomp_releasep
);
272 STATIC_DESTRUCTOR_REGISTER(arg_credentials
, machine_credential_context_done
);
273 STATIC_DESTRUCTOR_REGISTER(arg_cpu_set
, cpu_set_reset
);
274 STATIC_DESTRUCTOR_REGISTER(arg_sysctl
, strv_freep
);
275 STATIC_DESTRUCTOR_REGISTER(arg_bind_user
, strv_freep
);
276 STATIC_DESTRUCTOR_REGISTER(arg_settings_filename
, freep
);
277 STATIC_DESTRUCTOR_REGISTER(arg_image_policy
, image_policy_freep
);
278 STATIC_DESTRUCTOR_REGISTER(arg_background
, freep
);
/* Parses a console mode name and stores the result in arg_console_mode.
 *
 * Values handled here: "help", "interactive", "read-only", "passive", "pipe" and "autopipe". After a
 * mode has been selected, SETTING_CONSOLE_MODE is added to arg_settings_mask. An unrecognized value
 * is logged and reported through log_error_errno() with EINVAL. */
280 static int handle_arg_console(const char *arg
) {
281 if (streq(arg
, "help")) {
290 if (streq(arg
, "interactive"))
291 arg_console_mode
= CONSOLE_INTERACTIVE
;
292 else if (streq(arg
, "read-only"))
293 arg_console_mode
= CONSOLE_READ_ONLY
;
294 else if (streq(arg
, "passive"))
295 arg_console_mode
= CONSOLE_PASSIVE
;
296 else if (streq(arg
, "pipe")) {
/* "pipe" on an interactive TTY is accepted but most likely a mistake, so log a hint. The hint is
 * demoted to debug level when arg_quiet is set. */
297 if (isatty(STDIN_FILENO
) && isatty(STDOUT_FILENO
))
298 log_full(arg_quiet
? LOG_DEBUG
: LOG_NOTICE
,
299 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
300 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
301 "Proceeding anyway.");
303 arg_console_mode
= CONSOLE_PIPE
;
304 } else if (streq(arg
, "autopipe")) {
/* "autopipe" chooses between interactive and pipe mode depending on whether both stdin and stdout
 * are TTYs. */
305 if (isatty(STDIN_FILENO
) && isatty(STDOUT_FILENO
))
306 arg_console_mode
= CONSOLE_INTERACTIVE
;
308 arg_console_mode
= CONSOLE_PIPE
;
/* Report the string we were actually asked to parse. The getopt global optarg is only equal to it
 * when the caller happens to pass optarg in, and may be NULL or stale otherwise. */
310 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Unknown console mode: %s", arg
);
312 arg_settings_mask
|= SETTING_CONSOLE_MODE
;
/* Prints the command line usage text with printf(), after opening the pager according to
 * arg_pager_flags.
 *
 * The format string uses positional arguments: %1$s is the program name
 * (program_invocation_short_name). %2$s is presumably the man page reference produced by
 * terminal_urlify_man(), and %3$s..%6$s presumably switch heading highlighting on and off — the
 * corresponding arguments are not visible here, so confirm before relying on this. */
316 static int help(void) {
317 _cleanup_free_
char *link
= NULL
;
320 pager_open(arg_pager_flags
);
322 r
= terminal_urlify_man("systemd-nspawn", "1", &link
);
326 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
327 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
328 " -h --help Show this help\n"
329 " --version Print version string\n"
330 " -q --quiet Do not show status information\n"
331 " --no-pager Do not pipe output into a pager\n"
332 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
334 " -D --directory=PATH Root directory for the container\n"
335 " --template=PATH Initialize root directory from template directory,\n"
337 " -x --ephemeral Run container with snapshot of root directory, and\n"
338 " remove it after exit\n"
339 " -i --image=PATH Root file system disk image (or device node) for\n"
341 " --image-policy=POLICY Specify disk image dissection policy\n"
342 " --oci-bundle=PATH OCI bundle directory\n"
343 " --read-only Mount the root directory read-only\n"
344 " --volatile[=MODE] Run the system in volatile mode\n"
345 " --root-hash=HASH Specify verity root hash for root disk image\n"
346 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
347 " as a DER encoded PKCS7, either as a path to a file\n"
348 " or as an ASCII base64 encoded string prefixed by\n"
350 " --verity-data=PATH Specify hash device for verity\n"
351 " --pivot-root=PATH[:PATH]\n"
352 " Pivot root to given directory in the container\n"
353 "\n%3$sExecution:%4$s\n"
354 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
355 " -b --boot Boot up full system (i.e. invoke init)\n"
356 " --chdir=PATH Set working directory in the container\n"
357 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
358 " -u --user=USER Run the command under specified user or UID\n"
359 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
360 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
361 " --suppress-sync=BOOLEAN\n"
362 " Suppress any form of disk data synchronization\n"
363 "\n%3$sSystem Identity:%4$s\n"
364 " -M --machine=NAME Set the machine name for the container\n"
365 " --hostname=NAME Override the hostname for the container\n"
366 " --uuid=UUID Set a specific machine UUID for the container\n"
367 "\n%3$sProperties:%4$s\n"
368 " -S --slice=SLICE Place the container in the specified slice\n"
369 " --property=NAME=VALUE Set scope unit property\n"
370 " --register=BOOLEAN Register container as machine\n"
371 " --keep-unit Do not register a scope for the machine, reuse\n"
372 " the service unit nspawn is running in\n"
373 "\n%3$sUser Namespacing:%4$s\n"
374 " --private-users=no Run without user namespacing\n"
375 " --private-users=yes|pick|identity\n"
376 " Run within user namespace, autoselect UID/GID range\n"
377 " --private-users=UIDBASE[:NUIDS]\n"
378 " Similar, but with user configured UID/GID range\n"
379 " --private-users-ownership=MODE\n"
380 " Adjust ('chown') or map ('map') OS tree ownership\n"
381 " to private UID/GID range\n"
382 " -U Equivalent to --private-users=pick and\n"
383 " --private-users-ownership=auto\n"
384 "\n%3$sNetworking:%4$s\n"
385 " --private-network Disable network in container\n"
386 " --network-interface=HOSTIF[:CONTAINERIF]\n"
387 " Assign an existing network interface to the\n"
389 " --network-macvlan=HOSTIF[:CONTAINERIF]\n"
390 " Create a macvlan network interface based on an\n"
391 " existing network interface to the container\n"
392 " --network-ipvlan=HOSTIF[:CONTAINERIF]\n"
393 " Create an ipvlan network interface based on an\n"
394 " existing network interface to the container\n"
395 " -n --network-veth Add a virtual Ethernet connection between host\n"
397 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
398 " Add an additional virtual Ethernet link between\n"
399 " host and container\n"
400 " --network-bridge=INTERFACE\n"
401 " Add a virtual Ethernet connection to the container\n"
402 " and attach it to an existing bridge on the host\n"
403 " --network-zone=NAME Similar, but attach the new interface to\n"
404 " an automatically managed bridge interface\n"
405 " --network-namespace-path=PATH\n"
406 " Set network namespace to the one represented by\n"
407 " the specified kernel namespace file node\n"
408 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
409 " Expose a container IP port on the host\n"
410 "\n%3$sSecurity:%4$s\n"
411 " --capability=CAP In addition to the default, retain specified\n"
413 " --drop-capability=CAP Drop the specified capability from the default set\n"
414 " --ambient-capability=CAP\n"
415 " Sets the specified capability for the started\n"
416 " process. Not useful if booting a machine.\n"
417 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
418 " --system-call-filter=LIST|~LIST\n"
419 " Permit/prohibit specific system calls\n"
420 " -Z --selinux-context=SECLABEL\n"
421 " Set the SELinux security context to be used by\n"
422 " processes in the container\n"
423 " -L --selinux-apifs-context=SECLABEL\n"
424 " Set the SELinux security context to be used by\n"
425 " API/tmpfs file systems in the container\n"
426 "\n%3$sResources:%4$s\n"
427 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
428 " --oom-score-adjust=VALUE\n"
429 " Adjust the OOM score value for the payload\n"
430 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
431 " --personality=ARCH Pick personality for this container\n"
432 "\n%3$sIntegration:%4$s\n"
433 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
434 " --timezone=MODE Select mode of /etc/localtime initialization\n"
435 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
436 " host, try-guest, try-host\n"
437 " -j Equivalent to --link-journal=try-guest\n"
438 "\n%3$sMounts:%4$s\n"
439 " --bind=PATH[:PATH[:OPTIONS]]\n"
440 " Bind mount a file or directory from the host into\n"
442 " --bind-ro=PATH[:PATH[:OPTIONS]]\n"
443 " Similar, but creates a read-only bind mount\n"
444 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
446 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
447 " --overlay=PATH[:PATH...]:PATH\n"
448 " Create an overlay mount from the host to \n"
450 " --overlay-ro=PATH[:PATH...]:PATH\n"
451 " Similar, but creates a read-only overlay mount\n"
452 " --bind-user=NAME Bind user from host to container\n"
453 "\n%3$sInput/Output:%4$s\n"
454 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
455 " set up for the container.\n"
456 " -P --pipe Equivalent to --console=pipe\n"
457 " --background=COLOR Set ANSI color for background\n"
458 "\n%3$sCredentials:%4$s\n"
459 " --set-credential=ID:VALUE\n"
460 " Pass a credential with literal value to container.\n"
461 " --load-credential=ID:PATH\n"
462 " Load credential to pass to container from file or\n"
463 " AF_UNIX stream socket.\n"
464 "\nSee the %2$s for details.\n",
465 program_invocation_short_name
,
/* Validates the configured custom mounts (arg_custom_mounts[0..arg_n_custom_mounts)) against the user
 * namespace settings.
 *
 * A custom mount whose destination is "/" is rejected with EINVAL when user namespacing is enabled
 * and either an ownership mode other than USER_NAMESPACE_OWNERSHIP_OFF is selected, or no explicit
 * UID shift has been configured (arg_uid_shift == UID_INVALID). */
475 static int custom_mount_check_all(void) {
478 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
479 CustomMount
*m
= &arg_custom_mounts
[i
];
481 if (path_equal(m
->destination
, "/") && arg_userns_mode
!= USER_NAMESPACE_NO
) {
/* NOTE(review): the message names only "own", but the condition rejects every mode other than
 * OFF — confirm whether the wording should be broader. */
482 if (arg_userns_ownership
!= USER_NAMESPACE_OWNERSHIP_OFF
)
483 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
484 "--private-users-ownership=own may not be combined with custom root mounts.");
485 if (arg_uid_shift
== UID_INVALID
)
486 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
487 "--private-users with automatic UID shift may not be combined with custom root mounts.");
/* Lets the user choose the cgroup hierarchy mode through the environment.
 *
 * $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY is the current variable name and $UNIFIED_CGROUP_HIERARCHY the
 * older one it replaced. The value is parsed with parse_boolean() and selects between
 * CGROUP_UNIFIED_ALL and CGROUP_UNIFIED_NONE for arg_unified_cgroup_hierarchy. A value that fails
 * to parse is logged together with the name of the offending variable. */
494 static int detect_unified_cgroup_hierarchy_from_environment(void) {
495 const char *e
, *var
= "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
498 /* Allow the user to control whether the unified hierarchy is used */
502 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
503 var
= "UNIFIED_CGROUP_HIERARCHY";
508 r
= parse_boolean(e
);
510 return log_error_errno(r
, "Failed to parse $%s: %m", var
);
512 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_ALL
;
514 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
/* Chooses arg_unified_cgroup_hierarchy for the OS tree at 'directory'.
 *
 * Unprivileged runs always use CGROUP_UNIFIED_ALL. Otherwise the host's mode (cg_all_unified(),
 * cg_unified_controller()) is combined with the systemd version found in the image
 * (systemd_installation_has_version()) to pick CGROUP_UNIFIED_ALL, CGROUP_UNIFIED_SYSTEMD or
 * CGROUP_UNIFIED_NONE. The chosen mode is logged at debug level as "legacy", "hybrid" or "unified". */
520 static int detect_unified_cgroup_hierarchy_from_image(const char *directory
) {
523 if (!arg_privileged
) {
524 /* We only support the unified mode when running unprivileged */
525 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_ALL
;
529 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
530 * in the image actually supports. */
531 r
= cg_all_unified();
533 return log_error_errno(r
, "Failed to determine whether we are in all unified mode.");
535 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
536 * routine only detects 231, so we'll have a false negative here for 230. */
537 r
= systemd_installation_has_version(directory
, "230");
539 return log_error_errno(r
, "Failed to determine systemd version in container: %m");
541 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_ALL
;
543 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
544 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
) > 0) {
545 /* Mixed cgroup hierarchy support was added in 233 */
546 r
= systemd_installation_has_version(directory
, "233");
548 return log_error_errno(r
, "Failed to determine systemd version in container: %m");
550 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_SYSTEMD
;
552 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
554 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
556 log_debug("Using %s hierarchy for container.",
557 arg_unified_cgroup_hierarchy
== CGROUP_UNIFIED_NONE
? "legacy" :
558 arg_unified_cgroup_hierarchy
== CGROUP_UNIFIED_SYSTEMD
? "hybrid" : "unified");
/* Parses a comma-separated list of capability names from 'spec'; the result is presumably returned
 * as a bitmask through 'ret_mask' (the store is not visible here — confirm).
 *
 * Each word is taken off the list with extract_first_word() and resolved with
 * capability_from_name(). The word "help" instead walks all known capabilities via
 * capability_list_length() and capability_to_name(). Parse failures are logged and propagated. */
563 static int parse_capability_spec(const char *spec
, uint64_t *ret_mask
) {
568 _cleanup_free_
char *t
= NULL
;
570 r
= extract_first_word(&spec
, &t
, ",", 0);
/* NOTE(review): if extract_first_word() fails before storing a word, 't' is still NULL when it is
 * formatted with %s below — confirm and consider logging without it. */
572 return log_error_errno(r
, "Failed to parse capability %s.", t
);
576 if (streq(t
, "help")) {
577 for (int i
= 0; i
< capability_list_length(); i
++) {
580 name
= capability_to_name(i
);
591 r
= capability_from_name(t
);
593 return log_error_errno(r
, "Failed to parse capability %s.", t
);
600 return 1; /* continue */
/* Applies one boolean "share namespace" environment variable to arg_clone_ns_flags.
 *
 * 'name' is the environment variable to read with getenv_bool(), 'ns_flag' the CLONE_NEW* bit(s) it
 * controls. A true value clears 'ns_flag' from arg_clone_ns_flags, any other accepted value sets it.
 * SETTING_CLONE_NS_FLAGS is then recorded in arg_settings_mask. Parse errors are logged with the
 * variable name and propagated. */
603 static int parse_share_ns_env(const char *name
, unsigned long ns_flag
) {
606 r
= getenv_bool(name
);
610 return log_error_errno(r
, "Failed to parse $%s: %m", name
);
/* Drop the bits first, then add them back unless the variable was true. */
612 arg_clone_ns_flags
= (arg_clone_ns_flags
& ~ns_flag
) | (r
> 0 ? 0 : ns_flag
);
613 arg_settings_mask
|= SETTING_CLONE_NS_FLAGS
;
/* Adjusts arg_mount_settings from two environment variables.
 *
 * $SYSTEMD_NSPAWN_TMPFS_TMP is a boolean controlling MOUNT_APPLY_TMPFS_TMP; an unset variable
 * (-ENXIO) is not treated as an error. $SYSTEMD_NSPAWN_API_VFS_WRITABLE is either the literal
 * "network", which sets both MOUNT_APPLY_APIVFS_RO and MOUNT_APPLY_APIVFS_NETNS, or a boolean:
 * false sets MOUNT_APPLY_APIVFS_RO, true clears it, and MOUNT_APPLY_APIVFS_NETNS is cleared either
 * way. Unparsable values are logged and propagated. */
617 static int parse_mount_settings_env(void) {
621 r
= getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
622 if (r
< 0 && r
!= -ENXIO
)
623 return log_error_errno(r
, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
625 SET_FLAG(arg_mount_settings
, MOUNT_APPLY_TMPFS_TMP
, r
> 0);
627 e
= getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
628 if (streq_ptr(e
, "network"))
629 arg_mount_settings
|= MOUNT_APPLY_APIVFS_RO
|MOUNT_APPLY_APIVFS_NETNS
;
631 r
= parse_boolean(e
);
633 return log_error_errno(r
, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
/* "Writable" is the inverse of the read-only flag, hence r == 0. */
635 SET_FLAG(arg_mount_settings
, MOUNT_APPLY_APIVFS_RO
, r
== 0);
636 SET_FLAG(arg_mount_settings
, MOUNT_APPLY_APIVFS_NETNS
, false);
/* Reads the $SYSTEMD_NSPAWN_* (and $SYSTEMD_SUPPRESS_SYNC) environment variables and applies them to
 * the global arg_* settings.
 *
 * In order: the namespace sharing switches (via parse_share_ns_env()), the mount settings (via
 * parse_mount_settings_env()), cgroup namespace use, the container service name, the provided MAC
 * address and sync suppression. Finishes by returning the result of
 * detect_unified_cgroup_hierarchy_from_environment(). */
642 static int parse_environment(void) {
646 r
= parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC
);
649 r
= parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID
);
652 r
= parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS
);
/* $SYSTEMD_NSPAWN_SHARE_SYSTEM covers all three namespace types at once. */
655 r
= parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
);
659 r
= parse_mount_settings_env();
663 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
664 * even if it is supported. If not supported, it has no effect. */
665 if (!cg_ns_supported())
666 arg_use_cgns
= false;
668 r
= getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
671 return log_error_errno(r
, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
675 arg_use_cgns
= r
> 0;
676 arg_settings_mask
|= SETTING_USE_CGNS
;
680 e
= getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
/* Stores the getenv() pointer itself, not a copy. */
682 arg_container_service_name
= e
;
684 e
= getenv("SYSTEMD_NSPAWN_NETWORK_MAC");
686 r
= parse_ether_addr(e
, &arg_network_provided_mac
);
688 return log_error_errno(r
, "Failed to parse provided MAC address via environment variable");
691 r
= getenv_bool("SYSTEMD_SUPPRESS_SYNC");
693 arg_suppress_sync
= r
;
/* A malformed $SYSTEMD_SUPPRESS_SYNC is only logged at debug level and otherwise ignored. */
694 else if (r
!= -ENXIO
)
695 log_debug_errno(r
, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
697 return detect_unified_cgroup_hierarchy_from_environment();
700 static int parse_argv(int argc
, char *argv
[]) {
707 ARG_AMBIENT_CAPABILITY
,
719 ARG_NETWORK_INTERFACE
,
724 ARG_NETWORK_VETH_EXTRA
,
725 ARG_NETWORK_NAMESPACE_PATH
,
735 ARG_PRIVATE_USERS_CHOWN
,
736 ARG_PRIVATE_USERS_OWNERSHIP
,
741 ARG_SYSTEM_CALL_FILTER
,
744 ARG_NO_NEW_PRIVILEGES
,
745 ARG_OOM_SCORE_ADJUST
,
761 static const struct option options
[] = {
762 { "help", no_argument
, NULL
, 'h' },
763 { "version", no_argument
, NULL
, ARG_VERSION
},
764 { "directory", required_argument
, NULL
, 'D' },
765 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
766 { "ephemeral", no_argument
, NULL
, 'x' },
767 { "user", required_argument
, NULL
, 'u' },
768 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
769 { "as-pid2", no_argument
, NULL
, 'a' },
770 { "boot", no_argument
, NULL
, 'b' },
771 { "uuid", required_argument
, NULL
, ARG_UUID
},
772 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
773 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
774 { "ambient-capability", required_argument
, NULL
, ARG_AMBIENT_CAPABILITY
},
775 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
776 { "no-new-privileges", required_argument
, NULL
, ARG_NO_NEW_PRIVILEGES
},
777 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
778 { "bind", required_argument
, NULL
, ARG_BIND
},
779 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
780 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
781 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
782 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
783 { "inaccessible", required_argument
, NULL
, ARG_INACCESSIBLE
},
784 { "machine", required_argument
, NULL
, 'M' },
785 { "hostname", required_argument
, NULL
, ARG_HOSTNAME
},
786 { "slice", required_argument
, NULL
, 'S' },
787 { "setenv", required_argument
, NULL
, 'E' },
788 { "selinux-context", required_argument
, NULL
, 'Z' },
789 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
790 { "quiet", no_argument
, NULL
, 'q' },
791 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
}, /* not documented */
792 { "register", required_argument
, NULL
, ARG_REGISTER
},
793 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
794 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
795 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
796 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
797 { "network-veth", no_argument
, NULL
, 'n' },
798 { "network-veth-extra", required_argument
, NULL
, ARG_NETWORK_VETH_EXTRA
},
799 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
800 { "network-zone", required_argument
, NULL
, ARG_NETWORK_ZONE
},
801 { "network-namespace-path", required_argument
, NULL
, ARG_NETWORK_NAMESPACE_PATH
},
802 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
803 { "image", required_argument
, NULL
, 'i' },
804 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
805 { "port", required_argument
, NULL
, 'p' },
806 { "property", required_argument
, NULL
, ARG_PROPERTY
},
807 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
808 { "private-users-chown", optional_argument
, NULL
, ARG_PRIVATE_USERS_CHOWN
}, /* obsolete */
809 { "private-users-ownership",required_argument
, NULL
, ARG_PRIVATE_USERS_OWNERSHIP
},
810 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
811 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
812 { "chdir", required_argument
, NULL
, ARG_CHDIR
},
813 { "pivot-root", required_argument
, NULL
, ARG_PIVOT_ROOT
},
814 { "notify-ready", required_argument
, NULL
, ARG_NOTIFY_READY
},
815 { "root-hash", required_argument
, NULL
, ARG_ROOT_HASH
},
816 { "root-hash-sig", required_argument
, NULL
, ARG_ROOT_HASH_SIG
},
817 { "verity-data", required_argument
, NULL
, ARG_VERITY_DATA
},
818 { "system-call-filter", required_argument
, NULL
, ARG_SYSTEM_CALL_FILTER
},
819 { "rlimit", required_argument
, NULL
, ARG_RLIMIT
},
820 { "oom-score-adjust", required_argument
, NULL
, ARG_OOM_SCORE_ADJUST
},
821 { "cpu-affinity", required_argument
, NULL
, ARG_CPU_AFFINITY
},
822 { "resolv-conf", required_argument
, NULL
, ARG_RESOLV_CONF
},
823 { "timezone", required_argument
, NULL
, ARG_TIMEZONE
},
824 { "console", required_argument
, NULL
, ARG_CONSOLE
},
825 { "pipe", no_argument
, NULL
, ARG_PIPE
},
826 { "oci-bundle", required_argument
, NULL
, ARG_OCI_BUNDLE
},
827 { "no-pager", no_argument
, NULL
, ARG_NO_PAGER
},
828 { "set-credential", required_argument
, NULL
, ARG_SET_CREDENTIAL
},
829 { "load-credential", required_argument
, NULL
, ARG_LOAD_CREDENTIAL
},
830 { "bind-user", required_argument
, NULL
, ARG_BIND_USER
},
831 { "suppress-sync", required_argument
, NULL
, ARG_SUPPRESS_SYNC
},
832 { "image-policy", required_argument
, NULL
, ARG_IMAGE_POLICY
},
833 { "background", required_argument
, NULL
, ARG_BACKGROUND
},
838 uint64_t plus
= 0, minus
= 0;
839 bool mask_all_settings
= false, mask_no_settings
= false;
844 /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long()
845 * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */
847 while ((c
= getopt_long(argc
, argv
, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options
, NULL
)) >= 0)
857 r
= parse_path_argument(optarg
, false, &arg_directory
);
861 arg_settings_mask
|= SETTING_DIRECTORY
;
865 r
= parse_path_argument(optarg
, false, &arg_template
);
869 arg_settings_mask
|= SETTING_DIRECTORY
;
873 r
= parse_path_argument(optarg
, false, &arg_image
);
877 arg_settings_mask
|= SETTING_DIRECTORY
;
881 r
= parse_path_argument(optarg
, false, &arg_oci_bundle
);
888 arg_ephemeral
= true;
889 arg_settings_mask
|= SETTING_EPHEMERAL
;
893 r
= free_and_strdup(&arg_user
, optarg
);
897 arg_settings_mask
|= SETTING_USER
;
900 case ARG_NETWORK_ZONE
: {
901 _cleanup_free_
char *j
= NULL
;
903 j
= strjoin("vz-", optarg
);
907 if (!ifname_valid(j
))
908 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
909 "Network zone name not valid: %s", j
);
911 free_and_replace(arg_network_zone
, j
);
913 arg_network_veth
= true;
914 arg_private_network
= true;
915 arg_settings_mask
|= SETTING_NETWORK
;
919 case ARG_NETWORK_BRIDGE
:
921 if (!ifname_valid(optarg
))
922 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
923 "Bridge interface name not valid: %s", optarg
);
925 r
= free_and_strdup(&arg_network_bridge
, optarg
);
931 arg_network_veth
= true;
932 arg_private_network
= true;
933 arg_settings_mask
|= SETTING_NETWORK
;
936 case ARG_NETWORK_VETH_EXTRA
:
937 r
= veth_extra_parse(&arg_network_veth_extra
, optarg
);
939 return log_error_errno(r
, "Failed to parse --network-veth-extra= parameter: %s", optarg
);
941 arg_private_network
= true;
942 arg_settings_mask
|= SETTING_NETWORK
;
945 case ARG_NETWORK_INTERFACE
:
946 r
= interface_pair_parse(&arg_network_interfaces
, optarg
);
950 arg_private_network
= true;
951 arg_settings_mask
|= SETTING_NETWORK
;
954 case ARG_NETWORK_MACVLAN
:
955 r
= macvlan_pair_parse(&arg_network_macvlan
, optarg
);
959 arg_private_network
= true;
960 arg_settings_mask
|= SETTING_NETWORK
;
963 case ARG_NETWORK_IPVLAN
:
964 r
= ipvlan_pair_parse(&arg_network_ipvlan
, optarg
);
969 case ARG_PRIVATE_NETWORK
:
970 arg_private_network
= true;
971 arg_settings_mask
|= SETTING_NETWORK
;
974 case ARG_NETWORK_NAMESPACE_PATH
:
975 r
= parse_path_argument(optarg
, false, &arg_network_namespace_path
);
979 arg_settings_mask
|= SETTING_NETWORK
;
983 if (arg_start_mode
== START_PID2
)
984 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
985 "--boot and --as-pid2 may not be combined.");
987 arg_start_mode
= START_BOOT
;
988 arg_settings_mask
|= SETTING_START_MODE
;
992 if (arg_start_mode
== START_BOOT
)
993 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
994 "--boot and --as-pid2 may not be combined.");
996 arg_start_mode
= START_PID2
;
997 arg_settings_mask
|= SETTING_START_MODE
;
1001 r
= id128_from_string_nonzero(optarg
, &arg_uuid
);
1003 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1004 "Machine UUID may not be all zeroes.");
1006 return log_error_errno(r
, "Invalid UUID: %s", optarg
);
1008 arg_settings_mask
|= SETTING_MACHINE_ID
;
1012 _cleanup_free_
char *mangled
= NULL
;
1014 r
= unit_name_mangle_with_suffix(optarg
, NULL
, UNIT_NAME_MANGLE_WARN
, ".slice", &mangled
);
1018 free_and_replace(arg_slice
, mangled
);
1019 arg_settings_mask
|= SETTING_SLICE
;
1024 if (isempty(optarg
))
1025 arg_machine
= mfree(arg_machine
);
1027 if (!hostname_is_valid(optarg
, 0))
1028 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1029 "Invalid machine name: %s", optarg
);
1031 r
= free_and_strdup(&arg_machine
, optarg
);
1038 if (isempty(optarg
))
1039 arg_hostname
= mfree(arg_hostname
);
1041 if (!hostname_is_valid(optarg
, 0))
1042 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1043 "Invalid hostname: %s", optarg
);
1045 r
= free_and_strdup(&arg_hostname
, optarg
);
1050 arg_settings_mask
|= SETTING_HOSTNAME
;
1054 arg_selinux_context
= optarg
;
1058 arg_selinux_apifs_context
= optarg
;
1062 arg_read_only
= true;
1063 arg_settings_mask
|= SETTING_READ_ONLY
;
1066 case ARG_AMBIENT_CAPABILITY
: {
1068 r
= parse_capability_spec(optarg
, &m
);
1071 arg_caps_ambient
|= m
;
1072 arg_settings_mask
|= SETTING_CAPABILITY
;
1075 case ARG_CAPABILITY
:
1076 case ARG_DROP_CAPABILITY
: {
1078 r
= parse_capability_spec(optarg
, &m
);
1082 if (c
== ARG_CAPABILITY
)
1086 arg_settings_mask
|= SETTING_CAPABILITY
;
1089 case ARG_NO_NEW_PRIVILEGES
:
1090 r
= parse_boolean(optarg
);
1092 return log_error_errno(r
, "Failed to parse --no-new-privileges= argument: %s", optarg
);
1094 arg_no_new_privileges
= r
;
1095 arg_settings_mask
|= SETTING_NO_NEW_PRIVILEGES
;
1099 arg_link_journal
= LINK_GUEST
;
1100 arg_link_journal_try
= true;
1101 arg_settings_mask
|= SETTING_LINK_JOURNAL
;
1104 case ARG_LINK_JOURNAL
:
1105 r
= parse_link_journal(optarg
, &arg_link_journal
, &arg_link_journal_try
);
1107 return log_error_errno(r
, "Failed to parse link journal mode %s", optarg
);
1109 arg_settings_mask
|= SETTING_LINK_JOURNAL
;
1114 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
1116 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
1118 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
1122 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
1124 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
1126 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
1130 case ARG_OVERLAY_RO
:
1131 r
= overlay_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_OVERLAY_RO
);
1132 if (r
== -EADDRNOTAVAIL
)
1133 return log_error_errno(r
, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1135 return log_error_errno(r
, "Failed to parse --overlay(-ro)= argument %s: %m", optarg
);
1137 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
1140 case ARG_INACCESSIBLE
:
1141 r
= inaccessible_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
1143 return log_error_errno(r
, "Failed to parse --inaccessible= argument %s: %m", optarg
);
1145 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
1149 r
= strv_env_replace_strdup_passthrough(&arg_setenv
, optarg
);
1151 return log_error_errno(r
, "Cannot assign environment variable %s: %m", optarg
);
1153 arg_settings_mask
|= SETTING_ENVIRONMENT
;
1160 case ARG_SHARE_SYSTEM
:
1161 /* We don't officially support this anymore, except for compat reasons. People should use the
1162 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1163 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1164 arg_clone_ns_flags
= 0;
1168 r
= parse_boolean(optarg
);
1170 log_error("Failed to parse --register= argument: %s", optarg
);
1178 arg_keep_unit
= true;
1181 case ARG_PERSONALITY
:
1183 arg_personality
= personality_from_string(optarg
);
1184 if (arg_personality
== PERSONALITY_INVALID
)
1185 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1186 "Unknown or unsupported personality '%s'.", optarg
);
1188 arg_settings_mask
|= SETTING_PERSONALITY
;
1194 arg_volatile_mode
= VOLATILE_YES
;
1195 else if (streq(optarg
, "help")) {
1196 DUMP_STRING_TABLE(volatile_mode
, VolatileMode
, _VOLATILE_MODE_MAX
);
1201 m
= volatile_mode_from_string(optarg
);
1203 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1204 "Failed to parse --volatile= argument: %s", optarg
);
1206 arg_volatile_mode
= m
;
1209 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
1213 r
= expose_port_parse(&arg_expose_ports
, optarg
);
1215 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
1217 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
1219 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
1223 if (strv_extend(&arg_property
, optarg
) < 0)
1228 case ARG_PRIVATE_USERS
: {
1233 else if (!in_charset(optarg
, DIGITS
))
1234 /* do *not* parse numbers as booleans */
1235 boolean
= parse_boolean(optarg
);
1240 /* no: User namespacing off */
1241 arg_userns_mode
= USER_NAMESPACE_NO
;
1242 arg_uid_shift
= UID_INVALID
;
1243 arg_uid_range
= UINT32_C(0x10000);
1244 } else if (boolean
> 0) {
1245 /* yes: User namespacing on, UID range is read from root dir */
1246 arg_userns_mode
= USER_NAMESPACE_FIXED
;
1247 arg_uid_shift
= UID_INVALID
;
1248 arg_uid_range
= UINT32_C(0x10000);
1249 } else if (streq(optarg
, "pick")) {
1250 /* pick: User namespacing on, UID range is picked randomly */
1251 arg_userns_mode
= USER_NAMESPACE_PICK
; /* Note that arg_userns_ownership is
1252 * implied by USER_NAMESPACE_PICK
1254 arg_uid_shift
= UID_INVALID
;
1255 arg_uid_range
= UINT32_C(0x10000);
1257 } else if (streq(optarg
, "identity")) {
1258 /* identity: User namespaces on, UID range is map the 0…0xFFFF range to
1259 * itself, i.e. we don't actually map anything, but do take benefit of
1260 * isolation of capability sets. */
1261 arg_userns_mode
= USER_NAMESPACE_FIXED
;
1263 arg_uid_range
= UINT32_C(0x10000);
1265 /* anything else: User namespacing on, UID range is explicitly configured */
1266 r
= parse_userns_uid_range(optarg
, &arg_uid_shift
, &arg_uid_range
);
1269 arg_userns_mode
= USER_NAMESPACE_FIXED
;
1272 arg_settings_mask
|= SETTING_USERNS
;
1277 if (userns_supported()) {
1278 arg_userns_mode
= USER_NAMESPACE_PICK
; /* Note that arg_userns_ownership is
1279 * implied by USER_NAMESPACE_PICK
1281 arg_uid_shift
= UID_INVALID
;
1282 arg_uid_range
= UINT32_C(0x10000);
1284 arg_settings_mask
|= SETTING_USERNS
;
1289 case ARG_PRIVATE_USERS_CHOWN
:
1290 arg_userns_ownership
= USER_NAMESPACE_OWNERSHIP_CHOWN
;
1292 arg_settings_mask
|= SETTING_USERNS
;
1295 case ARG_PRIVATE_USERS_OWNERSHIP
:
1296 if (streq(optarg
, "help")) {
1297 DUMP_STRING_TABLE(user_namespace_ownership
, UserNamespaceOwnership
, _USER_NAMESPACE_OWNERSHIP_MAX
);
1301 arg_userns_ownership
= user_namespace_ownership_from_string(optarg
);
1302 if (arg_userns_ownership
< 0)
1303 return log_error_errno(arg_userns_ownership
, "Cannot parse --user-namespace-ownership= value: %s", optarg
);
1305 arg_settings_mask
|= SETTING_USERNS
;
1308 case ARG_KILL_SIGNAL
:
1309 if (streq(optarg
, "help")) {
1310 DUMP_STRING_TABLE(signal
, int, _NSIG
);
1314 arg_kill_signal
= signal_from_string(optarg
);
1315 if (arg_kill_signal
< 0)
1316 return log_error_errno(arg_kill_signal
, "Cannot parse signal: %s", optarg
);
1318 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
1323 /* no → do not read files
1324 * yes → read files, do not override cmdline, trust only subset
1325 * override → read files, override cmdline, trust only subset
1326 * trusted → read files, do not override cmdline, trust all
1329 r
= parse_boolean(optarg
);
1331 if (streq(optarg
, "trusted")) {
1332 mask_all_settings
= false;
1333 mask_no_settings
= false;
1334 arg_settings_trusted
= true;
1336 } else if (streq(optarg
, "override")) {
1337 mask_all_settings
= false;
1338 mask_no_settings
= true;
1339 arg_settings_trusted
= -1;
1341 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
1344 mask_all_settings
= false;
1345 mask_no_settings
= false;
1346 arg_settings_trusted
= -1;
1349 mask_all_settings
= true;
1350 mask_no_settings
= false;
1351 arg_settings_trusted
= false;
1357 _cleanup_free_
char *wd
= NULL
;
1359 if (!path_is_absolute(optarg
))
1360 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1361 "Working directory %s is not an absolute path.", optarg
);
1363 r
= path_simplify_alloc(optarg
, &wd
);
1365 return log_error_errno(r
, "Failed to simplify path %s: %m", optarg
);
1367 if (!path_is_normalized(wd
))
1368 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Working directory path is not normalized: %s", wd
);
1370 if (path_below_api_vfs(wd
))
1371 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Working directory is below API VFS, refusing: %s", wd
);
1373 free_and_replace(arg_chdir
, wd
);
1374 arg_settings_mask
|= SETTING_WORKING_DIRECTORY
;
1378 case ARG_PIVOT_ROOT
:
1379 r
= pivot_root_parse(&arg_pivot_root_new
, &arg_pivot_root_old
, optarg
);
1381 return log_error_errno(r
, "Failed to parse --pivot-root= argument %s: %m", optarg
);
1383 arg_settings_mask
|= SETTING_PIVOT_ROOT
;
1386 case ARG_NOTIFY_READY
:
1387 r
= parse_boolean(optarg
);
1389 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1390 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg
);
1391 arg_notify_ready
= r
;
1392 arg_settings_mask
|= SETTING_NOTIFY_READY
;
1395 case ARG_ROOT_HASH
: {
1396 _cleanup_free_
void *k
= NULL
;
1399 r
= unhexmem(optarg
, &k
, &l
);
1401 return log_error_errno(r
, "Failed to parse root hash: %s", optarg
);
1402 if (l
< sizeof(sd_id128_t
))
1403 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Root hash must be at least 128-bit long: %s", optarg
);
1405 free_and_replace(arg_verity_settings
.root_hash
, k
);
1406 arg_verity_settings
.root_hash_size
= l
;
1410 case ARG_ROOT_HASH_SIG
: {
1415 if ((value
= startswith(optarg
, "base64:"))) {
1416 r
= unbase64mem(value
, &p
, &l
);
1418 return log_error_errno(r
, "Failed to parse root hash signature '%s': %m", optarg
);
1421 r
= read_full_file(optarg
, (char**) &p
, &l
);
1423 return log_error_errno(r
, "Failed parse root hash signature file '%s': %m", optarg
);
1426 free_and_replace(arg_verity_settings
.root_hash_sig
, p
);
1427 arg_verity_settings
.root_hash_sig_size
= l
;
1431 case ARG_VERITY_DATA
:
1432 r
= parse_path_argument(optarg
, false, &arg_verity_settings
.data_path
);
1437 case ARG_SYSTEM_CALL_FILTER
: {
1441 negative
= optarg
[0] == '~';
1442 items
= negative
? optarg
+ 1 : optarg
;
1445 _cleanup_free_
char *word
= NULL
;
1447 r
= extract_first_word(&items
, &word
, NULL
, 0);
1453 return log_error_errno(r
, "Failed to parse system call filter: %m");
1456 r
= strv_extend(&arg_syscall_deny_list
, word
);
1458 r
= strv_extend(&arg_syscall_allow_list
, word
);
1463 arg_settings_mask
|= SETTING_SYSCALL_FILTER
;
1469 _cleanup_free_
char *name
= NULL
;
1472 if (streq(optarg
, "help")) {
1473 DUMP_STRING_TABLE(rlimit
, int, _RLIMIT_MAX
);
1477 eq
= strchr(optarg
, '=');
1479 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1480 "--rlimit= expects an '=' assignment.");
1482 name
= strndup(optarg
, eq
- optarg
);
1486 rl
= rlimit_from_string_harder(name
);
1488 return log_error_errno(rl
, "Unknown resource limit: %s", name
);
1490 if (!arg_rlimit
[rl
]) {
1491 arg_rlimit
[rl
] = new0(struct rlimit
, 1);
1492 if (!arg_rlimit
[rl
])
1496 r
= rlimit_parse(rl
, eq
+ 1, arg_rlimit
[rl
]);
1498 return log_error_errno(r
, "Failed to parse resource limit: %s", eq
+ 1);
1500 arg_settings_mask
|= SETTING_RLIMIT_FIRST
<< rl
;
1504 case ARG_OOM_SCORE_ADJUST
:
1505 r
= parse_oom_score_adjust(optarg
, &arg_oom_score_adjust
);
1507 return log_error_errno(r
, "Failed to parse --oom-score-adjust= parameter: %s", optarg
);
1509 arg_oom_score_adjust_set
= true;
1510 arg_settings_mask
|= SETTING_OOM_SCORE_ADJUST
;
1513 case ARG_CPU_AFFINITY
: {
1516 r
= parse_cpu_set(optarg
, &cpuset
);
1518 return log_error_errno(r
, "Failed to parse CPU affinity mask %s: %m", optarg
);
1520 cpu_set_reset(&arg_cpu_set
);
1521 arg_cpu_set
= cpuset
;
1522 arg_settings_mask
|= SETTING_CPU_AFFINITY
;
1526 case ARG_RESOLV_CONF
:
1527 if (streq(optarg
, "help")) {
1528 DUMP_STRING_TABLE(resolv_conf_mode
, ResolvConfMode
, _RESOLV_CONF_MODE_MAX
);
1532 arg_resolv_conf
= resolv_conf_mode_from_string(optarg
);
1533 if (arg_resolv_conf
< 0)
1534 return log_error_errno(arg_resolv_conf
,
1535 "Failed to parse /etc/resolv.conf mode: %s", optarg
);
1537 arg_settings_mask
|= SETTING_RESOLV_CONF
;
1541 if (streq(optarg
, "help")) {
1542 DUMP_STRING_TABLE(timezone_mode
, TimezoneMode
, _TIMEZONE_MODE_MAX
);
1546 arg_timezone
= timezone_mode_from_string(optarg
);
1547 if (arg_timezone
< 0)
1548 return log_error_errno(arg_timezone
,
1549 "Failed to parse /etc/localtime mode: %s", optarg
);
1551 arg_settings_mask
|= SETTING_TIMEZONE
;
1555 r
= handle_arg_console(optarg
);
1562 r
= handle_arg_console("pipe");
1568 arg_pager_flags
|= PAGER_DISABLE
;
1571 case ARG_SET_CREDENTIAL
:
1572 r
= machine_credential_set(&arg_credentials
, optarg
);
1576 arg_settings_mask
|= SETTING_CREDENTIALS
;
1579 case ARG_LOAD_CREDENTIAL
:
1580 r
= machine_credential_load(&arg_credentials
, optarg
);
1584 arg_settings_mask
|= SETTING_CREDENTIALS
;
1588 if (!valid_user_group_name(optarg
, 0))
1589 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Invalid user name to bind: %s", optarg
);
1591 if (strv_extend(&arg_bind_user
, optarg
) < 0)
1594 arg_settings_mask
|= SETTING_BIND_USER
;
1597 case ARG_SUPPRESS_SYNC
:
1598 r
= parse_boolean_argument("--suppress-sync=", optarg
, &arg_suppress_sync
);
1602 arg_settings_mask
|= SETTING_SUPPRESS_SYNC
;
1605 case ARG_IMAGE_POLICY
:
1606 r
= parse_image_policy_argument(optarg
, &arg_image_policy
);
1611 case ARG_BACKGROUND
:
1612 r
= free_and_strdup_warn(&arg_background
, optarg
);
1621 assert_not_reached();
1624 if (argc
> optind
) {
1625 strv_free(arg_parameters
);
1626 arg_parameters
= strv_copy(argv
+ optind
);
1627 if (!arg_parameters
)
1630 arg_settings_mask
|= SETTING_START_MODE
;
1633 if (arg_ephemeral
&& arg_template
&& !arg_directory
)
1634 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1635 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1636 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1638 arg_directory
= TAKE_PTR(arg_template
);
1640 arg_caps_retain
|= plus
;
1641 arg_caps_retain
|= arg_private_network
? UINT64_C(1) << CAP_NET_ADMIN
: 0;
1642 arg_caps_retain
&= ~minus
;
1644 /* Make sure to parse environment before we reset the settings mask below */
1645 r
= parse_environment();
1649 /* Load all settings from .nspawn files */
1650 if (mask_no_settings
)
1651 arg_settings_mask
= 0;
1653 /* Don't load any settings from .nspawn files */
1654 if (mask_all_settings
)
1655 arg_settings_mask
= _SETTINGS_MASK_ALL
;
1660 static int verify_arguments(void) {
1663 SET_FLAG(arg_mount_settings
, MOUNT_PRIVILEGED
, arg_privileged
);
1665 if (!arg_privileged
) {
1666 /* machined is not accessible to unpriv clients */
1668 log_notice("Automatically implying --register=no, since machined is not accessible to unprivileged clients.");
1669 arg_register
= false;
1672 if (!arg_private_network
) {
1673 log_notice("Automatically implying --private-network, since mounting /sys/ in an unprivileged user namespaces requires network namespacing.");
1674 arg_private_network
= true;
1678 if (arg_start_mode
== START_PID2
&& arg_unified_cgroup_hierarchy
== CGROUP_UNIFIED_UNKNOWN
) {
1679 /* If we are running the stub init in the container, we don't need to look at what the init
1680 * in the container supports, because we are not using it. Let's immediately pick the right
1681 * setting based on the host system configuration.
1683 * We only do this, if the user didn't use an environment variable to override the detection.
1686 r
= cg_all_unified();
1688 return log_error_errno(r
, "Failed to determine whether we are in all unified mode.");
1690 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_ALL
;
1691 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
) > 0)
1692 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_SYSTEMD
;
1694 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
1697 if (arg_userns_mode
!= USER_NAMESPACE_NO
)
1698 arg_mount_settings
|= MOUNT_USE_USERNS
;
1700 if (arg_private_network
)
1701 arg_mount_settings
|= MOUNT_APPLY_APIVFS_NETNS
;
1703 if (!(arg_clone_ns_flags
& CLONE_NEWPID
) ||
1704 !(arg_clone_ns_flags
& CLONE_NEWUTS
)) {
1705 arg_register
= false;
1706 if (arg_start_mode
!= START_PID1
)
1707 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--boot cannot be used without namespacing.");
1710 if (arg_userns_ownership
< 0)
1711 arg_userns_ownership
=
1712 arg_userns_mode
== USER_NAMESPACE_PICK
? USER_NAMESPACE_OWNERSHIP_AUTO
:
1713 USER_NAMESPACE_OWNERSHIP_OFF
;
1715 if (arg_start_mode
== START_BOOT
&& arg_kill_signal
<= 0)
1716 arg_kill_signal
= SIGRTMIN
+3;
1718 if (arg_volatile_mode
!= VOLATILE_NO
) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1719 arg_read_only
= true;
1721 if (has_custom_root_mount(arg_custom_mounts
, arg_n_custom_mounts
))
1722 arg_read_only
= true;
1724 if (arg_keep_unit
&& arg_register
&& cg_pid_get_owner_uid(0, NULL
) >= 0)
1725 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1726 * The latter is not technically a user session, but we don't need to labour the point. */
1727 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--keep-unit --register=yes may not be used when invoked from a user session.");
1729 if (arg_directory
&& arg_image
)
1730 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--directory= and --image= may not be combined.");
1732 if (arg_template
&& arg_image
)
1733 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--template= and --image= may not be combined.");
1735 if (arg_template
&& !(arg_directory
|| arg_machine
))
1736 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--template= needs --directory= or --machine=.");
1738 if (arg_ephemeral
&& arg_template
)
1739 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--ephemeral and --template= may not be combined.");
1741 /* Permit --ephemeral with --link-journal=try-* to satisfy principle of the least astonishment
1742 * (by common sense, "try" means "do not fail if not possible") */
1743 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
) && !arg_link_journal_try
)
1744 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--ephemeral and --link-journal={host,guest} may not be combined.");
1746 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& !userns_supported())
1747 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
), "--private-users= is not supported, kernel compiled without user namespace support.");
1749 if (arg_userns_ownership
== USER_NAMESPACE_OWNERSHIP_CHOWN
&& arg_read_only
)
1750 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1751 "--read-only and --private-users-ownership=chown may not be combined.");
1753 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1754 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1755 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1756 if (arg_userns_ownership
== USER_NAMESPACE_OWNERSHIP_CHOWN
&& arg_volatile_mode
!= VOLATILE_NO
)
1757 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--volatile= and --private-users-ownership=chown may not be combined.");
1759 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1760 * we need to error out, to avoid conflicts between different network options. */
1761 if (arg_network_namespace_path
&&
1762 (arg_network_interfaces
|| arg_network_macvlan
||
1763 arg_network_ipvlan
|| arg_network_veth_extra
||
1764 arg_network_bridge
|| arg_network_zone
||
1766 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--network-namespace-path= cannot be combined with other network options.");
1768 if (arg_network_bridge
&& arg_network_zone
)
1769 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1770 "--network-bridge= and --network-zone= may not be combined.");
1772 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& (arg_mount_settings
& MOUNT_APPLY_APIVFS_NETNS
) && !arg_private_network
)
1773 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1775 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& !(arg_mount_settings
& MOUNT_APPLY_APIVFS_RO
))
1776 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Cannot combine --private-users with read-write mounts.");
1778 if (arg_expose_ports
&& !arg_private_network
)
1779 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Cannot use --port= without private networking.");
1781 if (arg_caps_ambient
) {
1782 if (arg_caps_ambient
== UINT64_MAX
)
1783 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "AmbientCapability= does not support the value all.");
1785 if ((arg_caps_ambient
& arg_caps_retain
) != arg_caps_ambient
)
1786 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "AmbientCapability= setting is not fully covered by Capability= setting.");
1788 if (arg_start_mode
== START_BOOT
)
1789 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "AmbientCapability= setting is not useful for boot mode.");
1792 if (arg_userns_mode
== USER_NAMESPACE_NO
&& !strv_isempty(arg_bind_user
))
1793 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--bind-user= requires --private-users");
1795 /* Drop duplicate --bind-user= entries */
1796 strv_uniq(arg_bind_user
);
1798 r
= custom_mount_check_all();
1805 static int verify_network_interfaces_initialized(void) {
1807 r
= test_network_interfaces_initialized(arg_network_interfaces
);
1811 r
= test_network_interfaces_initialized(arg_network_macvlan
);
1815 r
= test_network_interfaces_initialized(arg_network_ipvlan
);
1822 int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
1825 if (arg_userns_mode
== USER_NAMESPACE_NO
)
1828 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
1831 if (uid
!= UID_INVALID
) {
1832 uid
+= arg_uid_shift
;
1834 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1838 if (gid
!= GID_INVALID
) {
1839 gid
+= (gid_t
) arg_uid_shift
;
1841 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1845 return RET_NERRNO(lchown(p
, uid
, gid
));
/* Creates 'path' below 'root' with the given mode and hands it to userns_lchown() for the requested
 * owner. An already existing directory is left untouched and reported as success (0); any other
 * mkdir() failure is returned as negative errno. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = RET_NERRNO(mkdir(full, mode));
        if (r == -EEXIST)
                return 0; /* Already there: do not touch ownership */
        if (r < 0)
                return r;

        return userns_lchown(full, uid, gid);
}
/* Matches 'path' against the two zoneinfo directory prefixes (relative "../usr/share/zoneinfo/" and
 * absolute "/usr/share/zoneinfo/") and returns whatever PATH_STARTSWITH_SET() yields for it. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1869 static bool etc_writable(void) {
1870 return !arg_read_only
|| IN_SET(arg_volatile_mode
, VOLATILE_YES
, VOLATILE_OVERLAY
);
1873 static int setup_timezone(const char *dest
) {
1874 _cleanup_free_
char *p
= NULL
, *etc
= NULL
;
1875 const char *where
, *check
;
1881 if (IN_SET(arg_timezone
, TIMEZONE_AUTO
, TIMEZONE_SYMLINK
)) {
1882 r
= readlink_malloc("/etc/localtime", &p
);
1883 if (r
== -ENOENT
&& arg_timezone
== TIMEZONE_AUTO
)
1884 m
= etc_writable() ? TIMEZONE_DELETE
: TIMEZONE_OFF
;
1885 else if (r
== -EINVAL
&& arg_timezone
== TIMEZONE_AUTO
) /* regular file? */
1886 m
= etc_writable() ? TIMEZONE_COPY
: TIMEZONE_BIND
;
1888 log_warning_errno(r
, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1889 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1893 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1896 } else if (arg_timezone
== TIMEZONE_AUTO
)
1897 m
= etc_writable() ? TIMEZONE_SYMLINK
: TIMEZONE_BIND
;
1903 if (m
== TIMEZONE_OFF
)
1906 r
= chase("/etc", dest
, CHASE_PREFIX_ROOT
, &etc
, NULL
);
1908 log_warning_errno(r
, "Failed to resolve /etc path in container, ignoring: %m");
1912 where
= strjoina(etc
, "/localtime");
1916 case TIMEZONE_DELETE
:
1917 if (unlink(where
) < 0)
1918 log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
, "Failed to remove '%s', ignoring: %m", where
);
1922 case TIMEZONE_SYMLINK
: {
1923 _cleanup_free_
char *q
= NULL
;
1924 const char *z
, *what
;
1926 z
= timezone_from_path(p
);
1928 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1932 r
= readlink_malloc(where
, &q
);
1933 if (r
>= 0 && streq_ptr(timezone_from_path(q
), z
))
1934 return 0; /* Already pointing to the right place? Then do nothing .. */
1936 check
= strjoina(dest
, "/usr/share/zoneinfo/", z
);
1937 r
= chase(check
, dest
, 0, NULL
, NULL
);
1939 log_debug_errno(r
, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z
);
1941 if (unlink(where
) < 0 && errno
!= ENOENT
) {
1942 log_full_errno(IN_SET(errno
, EROFS
, EACCES
, EPERM
) ? LOG_DEBUG
: LOG_WARNING
, /* Don't complain on read-only images */
1943 errno
, "Failed to remove existing timezone info %s in container, ignoring: %m", where
);
1947 what
= strjoina("../usr/share/zoneinfo/", z
);
1948 if (symlink(what
, where
) < 0) {
1949 log_full_errno(IN_SET(errno
, EROFS
, EACCES
, EPERM
) ? LOG_DEBUG
: LOG_WARNING
,
1950 errno
, "Failed to correct timezone of container, ignoring: %m");
1960 case TIMEZONE_BIND
: {
1961 _cleanup_free_
char *resolved
= NULL
;
1964 found
= chase(where
, dest
, CHASE_NONEXISTENT
, &resolved
, NULL
);
1966 log_warning_errno(found
, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1970 if (found
== 0) /* missing? */
1971 (void) touch(resolved
);
1973 r
= mount_nofollow_verbose(LOG_WARNING
, "/etc/localtime", resolved
, NULL
, MS_BIND
, NULL
);
1975 return mount_nofollow_verbose(LOG_ERR
, NULL
, resolved
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
);
1981 /* If mounting failed, try to copy */
1982 r
= copy_file_atomic("/etc/localtime", where
, 0644, COPY_REFLINK
|COPY_REPLACE
);
1984 log_full_errno(IN_SET(r
, -EROFS
, -EACCES
, -EPERM
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1985 "Failed to copy /etc/localtime to %s, ignoring: %m", where
);
1992 assert_not_reached();
1995 /* Fix permissions of the symlink or file copy we just created */
1996 r
= userns_lchown(where
, 0, 0);
1998 log_warning_errno(r
, "Failed to chown /etc/localtime, ignoring: %m");
/* Checks with access(F_OK) whether 'path' exists. Returns 1 if it does, 0 if it is missing (ENOENT), and
 * a negative errno (logged at debug level) for any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2016 static int resolved_listening(void) {
2017 _cleanup_(sd_bus_error_free
) sd_bus_error error
= SD_BUS_ERROR_NULL
;
2018 _cleanup_(sd_bus_flush_close_unrefp
) sd_bus
*bus
= NULL
;
2019 _cleanup_free_
char *dns_stub_listener_mode
= NULL
;
2022 /* Check if resolved is listening */
2024 r
= sd_bus_open_system(&bus
);
2026 return log_debug_errno(r
, "Failed to open system bus: %m");
2028 r
= bus_name_has_owner(bus
, "org.freedesktop.resolve1", NULL
);
2030 return log_debug_errno(r
, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2034 r
= bus_get_property_string(bus
, bus_resolve_mgr
, "DNSStubListener", &error
, &dns_stub_listener_mode
);
2036 return log_debug_errno(r
, "Failed to query DNSStubListener property: %s", bus_error_message(&error
, r
));
2038 return STR_IN_SET(dns_stub_listener_mode
, "udp", "yes");
2041 static int setup_resolv_conf(const char *dest
) {
2042 _cleanup_free_
char *etc
= NULL
;
2043 const char *where
, *what
;
2049 if (arg_resolv_conf
== RESOLV_CONF_AUTO
) {
2050 if (arg_private_network
)
2051 m
= RESOLV_CONF_OFF
;
2052 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF
) > 0 && resolved_listening() > 0)
2053 m
= etc_writable() ? RESOLV_CONF_COPY_STUB
: RESOLV_CONF_BIND_STUB
;
2054 else if (have_resolv_conf("/etc/resolv.conf") > 0)
2055 m
= etc_writable() ? RESOLV_CONF_COPY_HOST
: RESOLV_CONF_BIND_HOST
;
2057 m
= etc_writable() ? RESOLV_CONF_DELETE
: RESOLV_CONF_OFF
;
2060 m
= arg_resolv_conf
;
2062 if (m
== RESOLV_CONF_OFF
)
2065 r
= chase("/etc", dest
, CHASE_PREFIX_ROOT
, &etc
, NULL
);
2067 log_warning_errno(r
, "Failed to resolve /etc path in container, ignoring: %m");
2071 where
= strjoina(etc
, "/resolv.conf");
2073 if (m
== RESOLV_CONF_DELETE
) {
2074 if (unlink(where
) < 0)
2075 log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
, "Failed to remove '%s', ignoring: %m", where
);
2080 if (IN_SET(m
, RESOLV_CONF_BIND_STATIC
, RESOLV_CONF_REPLACE_STATIC
, RESOLV_CONF_COPY_STATIC
))
2081 what
= PRIVATE_STATIC_RESOLV_CONF
;
2082 else if (IN_SET(m
, RESOLV_CONF_BIND_UPLINK
, RESOLV_CONF_REPLACE_UPLINK
, RESOLV_CONF_COPY_UPLINK
))
2083 what
= PRIVATE_UPLINK_RESOLV_CONF
;
2084 else if (IN_SET(m
, RESOLV_CONF_BIND_STUB
, RESOLV_CONF_REPLACE_STUB
, RESOLV_CONF_COPY_STUB
))
2085 what
= PRIVATE_STUB_RESOLV_CONF
;
2087 what
= "/etc/resolv.conf";
2089 if (IN_SET(m
, RESOLV_CONF_BIND_HOST
, RESOLV_CONF_BIND_STATIC
, RESOLV_CONF_BIND_UPLINK
, RESOLV_CONF_BIND_STUB
)) {
2090 _cleanup_free_
char *resolved
= NULL
;
2093 found
= chase(where
, dest
, CHASE_NONEXISTENT
|CHASE_NOFOLLOW
, &resolved
, NULL
);
2095 log_warning_errno(found
, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2099 if (found
== 0) /* missing? */
2100 (void) touch(resolved
);
2102 r
= mount_nofollow_verbose(LOG_WARNING
, what
, resolved
, NULL
, MS_BIND
, NULL
);
2104 return mount_nofollow_verbose(LOG_ERR
, NULL
, resolved
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
);
2106 /* If that didn't work, let's copy the file */
2109 if (IN_SET(m
, RESOLV_CONF_REPLACE_HOST
, RESOLV_CONF_REPLACE_STATIC
, RESOLV_CONF_REPLACE_UPLINK
, RESOLV_CONF_REPLACE_STUB
))
2110 r
= copy_file_atomic(what
, where
, 0644, COPY_REFLINK
|COPY_REPLACE
);
2112 r
= copy_file(what
, where
, O_TRUNC
|O_NOFOLLOW
, 0644, COPY_REFLINK
);
2114 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2115 * resolved or something similar runs inside and the symlink points there.
2117 * If the disk image is read-only, there's also no point in complaining.
2119 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST
, RESOLV_CONF_COPY_STATIC
, RESOLV_CONF_COPY_UPLINK
, RESOLV_CONF_COPY_STUB
) &&
2120 IN_SET(r
, -ELOOP
, -EROFS
, -EACCES
, -EPERM
) ? LOG_DEBUG
: LOG_WARNING
, r
,
2121 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where
);
2125 r
= userns_lchown(where
, 0, 0);
2127 log_warning_errno(r
, "Failed to chown /etc/resolv.conf, ignoring: %m");
2132 static int setup_boot_id(void) {
2133 _cleanup_(unlink_and_freep
) char *from
= NULL
;
2134 _cleanup_free_
char *path
= NULL
;
2135 sd_id128_t rnd
= SD_ID128_NULL
;
2139 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
2141 r
= tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path
);
2143 return log_error_errno(r
, "Failed to generate random boot ID path: %m");
2145 r
= sd_id128_randomize(&rnd
);
2147 return log_error_errno(r
, "Failed to generate random boot id: %m");
2149 r
= id128_write(path
, ID128_FORMAT_UUID
, rnd
);
2151 return log_error_errno(r
, "Failed to write boot id: %m");
2153 from
= TAKE_PTR(path
);
2154 to
= "/proc/sys/kernel/random/boot_id";
2156 r
= mount_nofollow_verbose(LOG_ERR
, from
, to
, NULL
, MS_BIND
, NULL
);
2160 return mount_nofollow_verbose(LOG_ERR
, NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
2163 static int copy_devnodes(const char *dest
) {
2164 static const char devnodes
[] =
2177 BLOCK_WITH_UMASK(0000);
2179 /* Create /dev/net, so that we can create /dev/net/tun in it */
2180 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
2181 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
2183 NULSTR_FOREACH(d
, devnodes
) {
2184 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
2187 from
= path_join("/dev/", d
);
2191 to
= path_join(dest
, from
);
2195 if (stat(from
, &st
) < 0) {
2197 if (errno
!= ENOENT
)
2198 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
2200 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
))
2201 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
2202 "%s is not a char or block device, cannot copy.", from
);
2204 _cleanup_free_
char *sl
= NULL
, *prefixed
= NULL
, *dn
= NULL
, *t
= NULL
;
2206 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
2207 /* Explicitly warn the user when /dev is already populated. */
2208 if (errno
== EEXIST
)
2209 log_notice("%s/dev/ is pre-mounted and pre-populated. If a pre-mounted /dev/ is provided it needs to be an unpopulated file system.", dest
);
2211 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
2213 /* Some systems abusively restrict mknod but allow bind mounts. */
2216 return log_error_errno(r
, "touch (%s) failed: %m", to
);
2217 r
= mount_nofollow_verbose(LOG_DEBUG
, from
, to
, NULL
, MS_BIND
, NULL
);
2219 return log_error_errno(r
, "Both mknod and bind mount (%s) failed: %m", to
);
2222 r
= userns_lchown(to
, 0, 0);
2224 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
2226 dn
= path_join("/dev", S_ISCHR(st
.st_mode
) ? "char" : "block");
2230 r
= userns_mkdir(dest
, dn
, 0755, 0, 0);
2232 return log_error_errno(r
, "Failed to create '%s': %m", dn
);
2234 if (asprintf(&sl
, "%s/%u:%u", dn
, major(st
.st_rdev
), minor(st
.st_rdev
)) < 0)
2237 prefixed
= path_join(dest
, sl
);
2241 t
= path_join("..", d
);
2245 if (symlink(t
, prefixed
) < 0)
2246 log_debug_errno(errno
, "Failed to symlink '%s' to '%s': %m", t
, prefixed
);
2253 static int make_extra_nodes(const char *dest
) {
2257 BLOCK_WITH_UMASK(0000);
2259 for (i
= 0; i
< arg_n_extra_nodes
; i
++) {
2260 _cleanup_free_
char *path
= NULL
;
2261 DeviceNode
*n
= arg_extra_nodes
+ i
;
2263 path
= path_join(dest
, n
->path
);
2267 if (mknod(path
, n
->mode
, S_ISCHR(n
->mode
) || S_ISBLK(n
->mode
) ? makedev(n
->major
, n
->minor
) : 0) < 0)
2268 return log_error_errno(errno
, "Failed to create device node '%s': %m", path
);
2270 r
= chmod_and_chown(path
, n
->mode
, n
->uid
, n
->gid
);
2272 return log_error_errno(r
, "Failed to adjust device node ownership of '%s': %m", path
);
2278 static int setup_pts(const char *dest
) {
2279 _cleanup_free_
char *options
= NULL
;
2284 if (arg_selinux_apifs_context
)
2285 (void) asprintf(&options
,
2286 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
2287 arg_uid_shift
+ TTY_GID
,
2288 arg_selinux_apifs_context
);
2291 (void) asprintf(&options
,
2292 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
2293 arg_uid_shift
+ TTY_GID
);
2298 /* Mount /dev/pts itself */
2299 p
= prefix_roota(dest
, "/dev/pts");
2300 r
= RET_NERRNO(mkdir(p
, 0755));
2302 return log_error_errno(r
, "Failed to create /dev/pts: %m");
2304 r
= mount_nofollow_verbose(LOG_ERR
, "devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
);
2307 r
= userns_lchown(p
, 0, 0);
2309 return log_error_errno(r
, "Failed to chown /dev/pts: %m");
2311 /* Create /dev/ptmx symlink */
2312 p
= prefix_roota(dest
, "/dev/ptmx");
2313 if (symlink("pts/ptmx", p
) < 0)
2314 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
2315 r
= userns_lchown(p
, 0, 0);
2317 return log_error_errno(r
, "Failed to chown /dev/ptmx: %m");
2319 /* And fix /dev/pts/ptmx ownership */
2320 p
= prefix_roota(dest
, "/dev/pts/ptmx");
2321 r
= userns_lchown(p
, 0, 0);
2323 return log_error_errno(r
, "Failed to chown /dev/pts/ptmx: %m");
2328 static int setup_stdio_as_dev_console(void) {
2329 _cleanup_close_
int terminal
= -EBADF
;
2332 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2333 * explicitly, if we are configured to. */
2334 terminal
= open_terminal("/dev/console", O_RDWR
|O_NOCTTY
);
2336 return log_error_errno(terminal
, "Failed to open console: %m");
2338 /* Make sure we can continue logging to the original stderr, even if
2339 * stderr points elsewhere now */
2340 r
= log_dup_console();
2342 return log_error_errno(r
, "Failed to duplicate stderr: %m");
2344 /* invalidates 'terminal' on success and failure */
2345 r
= rearrange_stdio(terminal
, terminal
, terminal
);
2348 return log_error_errno(r
, "Failed to move console to stdin/stdout/stderr: %m");
2353 static int setup_dev_console(const char *console
) {
2354 _cleanup_free_
char *p
= NULL
;
2357 /* Create /dev/console symlink */
2358 r
= path_make_relative("/dev", console
, &p
);
2360 return log_error_errno(r
, "Failed to create relative path: %m");
2362 if (symlink(p
, "/dev/console") < 0)
2363 return log_error_errno(errno
, "Failed to create /dev/console symlink: %m");
2368 static int setup_keyring(void) {
2369 key_serial_t keyring
;
2371 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2372 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2373 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2374 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2375 * into the container. */
2377 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
2378 if (keyring
== -1) {
2379 if (errno
== ENOSYS
)
2380 log_debug_errno(errno
, "Kernel keyring not supported, ignoring.");
2381 else if (ERRNO_IS_PRIVILEGE(errno
))
2382 log_debug_errno(errno
, "Kernel keyring access prohibited, ignoring.");
2384 return log_error_errno(errno
, "Setting up kernel keyring failed: %m");
/* Creates the /run/host directory below 'root' (mode 0755, owned by the container's root user).
 * Returns 0 on success, a negative errno-style value on failure. */
int make_run_host(const char *root) {
        int r;

        assert(root);

        r = userns_mkdir(root, "/run/host", 0755, 0, 0);
        return r < 0 ? log_error_errno(r, "Failed to create /run/host/: %m") : 0;
}
2402 static int setup_credentials(const char *root
) {
2403 bool world_readable
= false;
2407 if (arg_credentials
.n_credentials
== 0)
2410 /* If starting a single-process container as a non-root user, the uid will only be resolved after we
2411 * are inside the inner child, when credential directories and files are already read-only, so they
2412 * are unusable as the single process won't have access to them. We also don't have access to the
2413 * uid that will actually be used from here, as we are setting credentials up from the outer child.
2414 * In order to make them usable as requested by the configuration, make them world readable in that
2415 * case, as by definition there are no other processes in that case besides the one being started,
2416 * which is being configured to be able to access credentials, and any of its children which will
2417 * inherit its privileges anyway. To ensure this, also enforce (and document) that
2418 * --no-new-privileges is necessary for this combination to work. */
2419 if (arg_no_new_privileges
&& !isempty(arg_user
) && !STR_IN_SET(arg_user
, "root", "0") &&
2420 arg_start_mode
== START_PID1
)
2421 world_readable
= true;
2423 r
= make_run_host(root
);
2427 r
= userns_mkdir(root
, "/run/host/credentials", world_readable
? 0777 : 0700, 0, 0);
2429 return log_error_errno(r
, "Failed to create /run/host/credentials: %m");
2431 q
= prefix_roota(root
, "/run/host/credentials");
2432 r
= mount_nofollow_verbose(LOG_ERR
, NULL
, q
, "ramfs", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "mode=0700");
2436 FOREACH_ARRAY(cred
, arg_credentials
.credentials
, arg_credentials
.n_credentials
) {
2437 _cleanup_free_
char *j
= NULL
;
2438 _cleanup_close_
int fd
= -EBADF
;
2440 j
= path_join(q
, cred
->id
);
2444 fd
= open(j
, O_CREAT
|O_EXCL
|O_WRONLY
|O_CLOEXEC
|O_NOFOLLOW
, world_readable
? 0666 : 0600);
2446 return log_error_errno(errno
, "Failed to create credential file %s: %m", j
);
2448 r
= loop_write(fd
, cred
->data
, cred
->size
);
2450 return log_error_errno(r
, "Failed to write credential to file %s: %m", j
);
2452 if (fchmod(fd
, world_readable
? 0444 : 0400) < 0)
2453 return log_error_errno(errno
, "Failed to adjust access mode of %s: %m", j
);
2455 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
2456 if (fchown(fd
, arg_uid_shift
, arg_uid_shift
) < 0)
2457 return log_error_errno(errno
, "Failed to adjust ownership of %s: %m", j
);
2461 if (chmod(q
, world_readable
? 0555 : 0500) < 0)
2462 return log_error_errno(errno
, "Failed to adjust access mode of %s: %m", q
);
2464 r
= userns_lchown(q
, 0, 0);
2468 /* Make both mount and superblock read-only now */
2469 r
= mount_nofollow_verbose(LOG_ERR
, NULL
, q
, NULL
, MS_REMOUNT
|MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
2473 return mount_nofollow_verbose(LOG_ERR
, NULL
, q
, NULL
, MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "mode=0500");
2476 static int setup_kmsg(int fd_inner_socket
) {
2477 _cleanup_(unlink_and_freep
) char *from
= NULL
;
2478 _cleanup_free_
char *fifo
= NULL
;
2479 _cleanup_close_
int fd
= -EBADF
;
2482 assert(fd_inner_socket
>= 0);
2484 BLOCK_WITH_UMASK(0000);
2486 /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
2487 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2488 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2489 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2491 r
= tempfn_random_child("/run", "proc-kmsg", &fifo
);
2493 return log_error_errno(r
, "Failed to generate kmsg path: %m");
2495 if (mkfifo(fifo
, 0600) < 0)
2496 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
2498 from
= TAKE_PTR(fifo
);
2500 r
= mount_nofollow_verbose(LOG_ERR
, from
, "/proc/kmsg", NULL
, MS_BIND
, NULL
);
2504 fd
= open(from
, O_RDWR
|O_NONBLOCK
|O_CLOEXEC
);
2506 return log_error_errno(errno
, "Failed to open fifo: %m");
2508 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2509 r
= send_one_fd(fd_inner_socket
, fd
, 0);
2511 return log_error_errno(r
, "Failed to send FIFO fd: %m");
2517 union in_addr_union address4
;
2518 union in_addr_union address6
;
2519 struct FirewallContext
*fw_ctx
;
2522 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
2523 struct ExposeArgs
*args
= ASSERT_PTR(userdata
);
2528 (void) expose_port_execute(rtnl
, &args
->fw_ctx
, arg_expose_ports
, AF_INET
, &args
->address4
);
2529 (void) expose_port_execute(rtnl
, &args
->fw_ctx
, arg_expose_ports
, AF_INET6
, &args
->address6
);
2533 static int setup_hostname(void) {
2536 if ((arg_clone_ns_flags
& CLONE_NEWUTS
) == 0)
2539 r
= sethostname_idempotent(arg_hostname
?: arg_machine
);
2541 return log_error_errno(r
, "Failed to set hostname: %m");
2546 static int setup_journal(const char *directory
) {
2547 _cleanup_free_
char *d
= NULL
;
2553 /* Don't link journals in ephemeral mode */
2557 if (arg_link_journal
== LINK_NO
)
2560 try = arg_link_journal_try
|| arg_link_journal
== LINK_AUTO
;
2562 r
= sd_id128_get_machine(&this_id
);
2564 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
2566 if (sd_id128_equal(arg_uuid
, this_id
)) {
2567 log_full(try ? LOG_WARNING
: LOG_ERR
,
2568 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid
));
2574 FOREACH_STRING(dirname
, "/var", "/var/log", "/var/log/journal") {
2575 r
= userns_mkdir(directory
, dirname
, 0755, 0, 0);
2577 bool ignore
= r
== -EROFS
&& try;
2578 log_full_errno(ignore
? LOG_DEBUG
: LOG_ERR
, r
,
2579 "Failed to create %s%s: %m", dirname
, ignore
? ", ignoring" : "");
2580 return ignore
? 0 : r
;
2584 p
= strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid
));
2585 q
= prefix_roota(directory
, p
);
2587 if (path_is_mount_point(p
) > 0) {
2591 return log_error_errno(SYNTHETIC_ERRNO(EEXIST
),
2592 "%s: already a mount point, refusing to use for journal", p
);
2595 if (path_is_mount_point(q
) > 0) {
2599 return log_error_errno(SYNTHETIC_ERRNO(EEXIST
),
2600 "%s: already a mount point, refusing to use for journal", q
);
2603 r
= readlink_and_make_absolute(p
, &d
);
2605 if (IN_SET(arg_link_journal
, LINK_GUEST
, LINK_AUTO
) &&
2608 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2610 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
2615 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
2616 } else if (r
== -EINVAL
) {
2618 if (arg_link_journal
== LINK_GUEST
&&
2621 if (errno
== ENOTDIR
) {
2622 log_error("%s already exists and is neither a symlink nor a directory", p
);
2625 return log_error_errno(errno
, "Failed to remove %s: %m", p
);
2627 } else if (r
!= -ENOENT
)
2628 return log_error_errno(r
, "readlink(%s) failed: %m", p
);
2630 if (arg_link_journal
== LINK_GUEST
) {
2632 if (symlink(q
, p
) < 0) {
2634 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
2637 return log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
2640 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2642 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
2646 if (arg_link_journal
== LINK_HOST
) {
2647 /* don't create parents here — if the host doesn't have
2648 * permanent journal set up, don't force it here */
2650 r
= RET_NERRNO(mkdir(p
, 0755));
2651 if (r
< 0 && r
!= -EEXIST
) {
2653 log_debug_errno(r
, "Failed to create %s, skipping journal setup: %m", p
);
2656 return log_error_errno(r
, "Failed to create %s: %m", p
);
2659 } else if (access(p
, F_OK
) < 0)
2662 if (dir_is_empty(q
, /* ignore_hidden_or_backup= */ false) == 0)
2663 log_warning("%s is not empty, proceeding anyway.", q
);
2665 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2667 return log_error_errno(r
, "Failed to create %s: %m", q
);
2669 r
= mount_nofollow_verbose(LOG_DEBUG
, p
, q
, NULL
, MS_BIND
, NULL
);
2671 return log_error_errno(r
, "Failed to bind mount journal from host into guest: %m");
2676 static int drop_capabilities(uid_t uid
) {
2677 CapabilityQuintet q
;
2679 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2680 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2681 * arg_caps_retain. */
2683 if (capability_quintet_is_set(&arg_full_capabilities
)) {
2684 q
= arg_full_capabilities
;
2686 if (q
.bounding
== UINT64_MAX
)
2687 q
.bounding
= uid
== 0 ? arg_caps_retain
: 0;
2689 if (q
.effective
== UINT64_MAX
)
2690 q
.effective
= uid
== 0 ? q
.bounding
: 0;
2692 if (q
.inheritable
== UINT64_MAX
)
2693 q
.inheritable
= uid
== 0 ? q
.bounding
: arg_caps_ambient
;
2695 if (q
.permitted
== UINT64_MAX
)
2696 q
.permitted
= uid
== 0 ? q
.bounding
: arg_caps_ambient
;
2698 if (q
.ambient
== UINT64_MAX
&& ambient_capabilities_supported())
2699 q
.ambient
= arg_caps_ambient
;
2701 if (capability_quintet_mangle(&q
))
2702 return log_error_errno(SYNTHETIC_ERRNO(EPERM
), "Cannot set capabilities that are not in the current bounding set.");
2705 q
= (CapabilityQuintet
) {
2706 .bounding
= arg_caps_retain
,
2707 .effective
= uid
== 0 ? arg_caps_retain
: 0,
2708 .inheritable
= uid
== 0 ? arg_caps_retain
: arg_caps_ambient
,
2709 .permitted
= uid
== 0 ? arg_caps_retain
: arg_caps_ambient
,
2710 .ambient
= ambient_capabilities_supported() ? arg_caps_ambient
: UINT64_MAX
,
2713 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2714 * in order to maintain the same behavior as systemd < 242. */
2715 if (capability_quintet_mangle(&q
))
2716 log_full(arg_quiet
? LOG_DEBUG
: LOG_WARNING
,
2717 "Some capabilities will not be set because they are not in the current bounding set.");
2721 return capability_quintet_enforce(&q
);
2724 static int reset_audit_loginuid(void) {
2725 _cleanup_free_
char *p
= NULL
;
2728 if ((arg_clone_ns_flags
& CLONE_NEWPID
) == 0)
2731 if (!arg_privileged
)
2734 r
= read_one_line_file("/proc/self/loginuid", &p
);
2738 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
2740 /* Already reset? */
2741 if (streq(p
, "4294967295"))
2744 r
= write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER
);
2747 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2748 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2749 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2750 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2751 "using systemd-nspawn. Sleeping for 5s... (%m)");
2759 static int mount_tunnel_dig(const char *root
) {
2763 if (!arg_privileged
) {
2764 log_debug("Not digging mount tunnel, because running unprivileged.");
2768 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2769 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2770 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
2771 (void) mkdir_p(p
, 0600);
2773 r
= make_run_host(root
);
2777 r
= userns_mkdir(root
, NSPAWN_MOUNT_TUNNEL
, 0600, 0, 0);
2779 return log_error_errno(r
, "Failed to create "NSPAWN_MOUNT_TUNNEL
": %m");
2781 q
= prefix_roota(root
, NSPAWN_MOUNT_TUNNEL
);
2782 r
= mount_nofollow_verbose(LOG_ERR
, p
, q
, NULL
, MS_BIND
, NULL
);
2786 r
= mount_nofollow_verbose(LOG_ERR
, NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
);
2793 static int mount_tunnel_open(void) {
2796 if (!arg_privileged
) {
2797 log_debug("Not opening up mount tunnel, because running unprivileged.");
2801 r
= mount_follow_verbose(LOG_ERR
, NULL
, NSPAWN_MOUNT_TUNNEL
, NULL
, MS_SLAVE
, NULL
);
2808 static int setup_machine_id(const char *directory
) {
2811 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2812 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2813 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2814 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2815 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2816 * container behaves nicely). */
2818 r
= id128_get_machine(directory
, &arg_uuid
);
2819 if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r
)) {
2820 /* If the file is missing, empty, or uninitialized, we don't mind */
2821 if (sd_id128_is_null(arg_uuid
)) {
2822 r
= sd_id128_randomize(&arg_uuid
);
2824 return log_error_errno(r
, "Failed to acquire randomized machine UUID: %m");
2827 return log_error_errno(r
, "Failed to read machine ID from container image: %m");
2832 static int recursive_chown(const char *directory
, uid_t shift
, uid_t range
) {
2837 if (arg_userns_mode
== USER_NAMESPACE_NO
|| arg_userns_ownership
!= USER_NAMESPACE_OWNERSHIP_CHOWN
)
2840 r
= path_patch_uid(directory
, arg_uid_shift
, arg_uid_range
);
2841 if (r
== -EOPNOTSUPP
)
2842 return log_error_errno(r
, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2844 return log_error_errno(r
, "Upper 16 bits of root directory UID and GID do not match.");
2846 return log_error_errno(r
, "Failed to adjust UID/GID shift of OS tree: %m");
2848 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2850 log_debug("Patched directory tree to match UID/GID range.");
2857 * < 0 : wait_for_terminate() failed to get the state of the
2858 * container, the container was terminated by a signal, or
2859 * failed for an unknown reason. No change is made to the
2860 * container argument.
2861 * > 0 : The program executed in the container terminated with an
2862 * error. The exit code of the program executed in the
2863 * container is returned. The container argument has been set
2864 * to CONTAINER_TERMINATED.
2865 * 0 : The container is being rebooted, has been shut down or exited
2866 * successfully. The container argument has been set to either
2867 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2869 * That is, success is indicated by a return value of zero, and an
2870 * error is indicated by a non-zero value.
2872 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2876 r
= wait_for_terminate(pid
, &status
);
2878 return log_warning_errno(r
, "Failed to wait for container: %m");
2880 switch (status
.si_code
) {
2883 if (status
.si_status
== 0)
2884 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2886 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2888 *container
= CONTAINER_TERMINATED
;
2889 return status
.si_status
;
2892 if (status
.si_status
== SIGINT
) {
2893 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2894 *container
= CONTAINER_TERMINATED
;
2897 } else if (status
.si_status
== SIGHUP
) {
2898 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2899 *container
= CONTAINER_REBOOTED
;
2905 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
2906 "Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2909 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
2910 "Container %s failed due to unknown reason.", arg_machine
);
2914 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2917 pid
= PTR_TO_PID(userdata
);
2919 if (kill(pid
, arg_kill_signal
) >= 0) {
2920 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2921 sd_event_source_set_userdata(s
, NULL
);
2926 sd_event_exit(sd_event_source_get_event(s
), 0);
2930 static int on_sigchld(sd_event_source
*s
, const struct signalfd_siginfo
*ssi
, void *userdata
) {
2936 pid
= PTR_TO_PID(userdata
);
2941 if (waitid(P_ALL
, 0, &si
, WNOHANG
|WNOWAIT
|WEXITED
) < 0)
2942 return log_error_errno(errno
, "Failed to waitid(): %m");
2943 if (si
.si_pid
== 0) /* No pending children. */
2945 if (si
.si_pid
== pid
) {
2946 /* The main process we care for has exited. Return from
2947 * signal handler but leave the zombie. */
2948 sd_event_exit(sd_event_source_get_event(s
), 0);
2952 /* Reap all other children. */
2953 (void) waitid(P_PID
, si
.si_pid
, &si
, WNOHANG
|WEXITED
);
2959 static int on_request_stop(sd_bus_message
*m
, void *userdata
, sd_bus_error
*error
) {
2964 pid
= PTR_TO_PID(userdata
);
2966 if (arg_kill_signal
> 0) {
2967 log_info("Container termination requested. Attempting to halt container.");
2968 (void) kill(pid
, arg_kill_signal
);
2970 log_info("Container termination requested. Exiting.");
2971 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m
)), 0);
2977 static int pick_paths(void) {
2980 if (arg_directory
) {
2981 _cleanup_(pick_result_done
) PickResult result
= PICK_RESULT_NULL
;
2982 PickFilter filter
= pick_filter_image_dir
;
2984 filter
.architecture
= arg_architecture
;
2986 r
= path_pick_update_warn(
2989 PICK_ARCHITECTURE
|PICK_TRIES
,
2992 /* Accept ENOENT here so that the --template= logic can work */
2996 arg_architecture
= result
.architecture
;
3000 _cleanup_(pick_result_done
) PickResult result
= PICK_RESULT_NULL
;
3001 PickFilter filter
= pick_filter_image_raw
;
3003 filter
.architecture
= arg_architecture
;
3005 r
= path_pick_update_warn(
3008 PICK_ARCHITECTURE
|PICK_TRIES
,
3013 arg_architecture
= result
.architecture
;
3017 _cleanup_(pick_result_done
) PickResult result
= PICK_RESULT_NULL
;
3018 PickFilter filter
= pick_filter_image_dir
;
3020 filter
.architecture
= arg_architecture
;
3022 r
= path_pick_update_warn(
3030 arg_architecture
= result
.architecture
;
3036 static int determine_names(void) {
3039 if (arg_template
&& !arg_directory
&& arg_machine
) {
3041 /* If --template= was specified then we should not search for a machine, but instead create a
3042 * new one in /var/lib/machine. */
3044 arg_directory
= path_join("/var/lib/machines", arg_machine
);
3049 if (!arg_image
&& !arg_directory
) {
3051 _cleanup_(image_unrefp
) Image
*i
= NULL
;
3053 r
= image_find(IMAGE_MACHINE
, arg_machine
, NULL
, &i
);
3055 return log_error_errno(r
, "No image for machine '%s'.", arg_machine
);
3057 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
3059 if (IN_SET(i
->type
, IMAGE_RAW
, IMAGE_BLOCK
))
3060 r
= free_and_strdup(&arg_image
, i
->path
);
3062 r
= free_and_strdup(&arg_directory
, i
->path
);
3067 arg_read_only
= arg_read_only
|| i
->read_only
;
3069 r
= safe_getcwd(&arg_directory
);
3071 return log_error_errno(r
, "Failed to determine current directory: %m");
3074 if (!arg_directory
&& !arg_image
)
3075 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Failed to determine path, please use -D or -i.");
3079 if (arg_directory
&& path_equal(arg_directory
, "/")) {
3080 arg_machine
= gethostname_malloc();
3083 } else if (arg_image
) {
3086 r
= path_extract_filename(arg_image
, &arg_machine
);
3088 return log_error_errno(r
, "Failed to extract file name from '%s': %m", arg_image
);
3090 /* Truncate suffix if there is one */
3091 e
= endswith(arg_machine
, ".raw");
3095 r
= path_extract_filename(arg_directory
, &arg_machine
);
3097 return log_error_errno(r
, "Failed to extract file name from '%s': %m", arg_directory
);
3100 hostname_cleanup(arg_machine
);
3101 if (!hostname_is_valid(arg_machine
, 0))
3102 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Failed to determine machine name automatically, please use -M.");
3104 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3105 * to match fixed config file names. */
3106 arg_settings_filename
= strjoin(arg_machine
, ".nspawn");
3107 if (!arg_settings_filename
)
3110 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3111 * instances at once without manually having to specify -M each time. */
3113 if (strextendf(&arg_machine
, "-%016" PRIx64
, random_u64()) < 0)
3116 arg_settings_filename
= strjoin(arg_machine
, ".nspawn");
3117 if (!arg_settings_filename
)
3124 static int chase_and_update(char **p
, unsigned flags
) {
3133 r
= chase(*p
, NULL
, flags
, &chased
, NULL
);
3135 return log_error_errno(r
, "Failed to resolve path %s: %m", *p
);
3137 return free_and_replace(*p
, chased
);
3140 static int determine_uid_shift(const char *directory
) {
3142 if (arg_userns_mode
== USER_NAMESPACE_NO
) {
3147 if (arg_uid_shift
== UID_INVALID
) {
3150 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3152 if (stat(directory
, &st
) < 0)
3153 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
3155 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
3157 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000)))
3158 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
3159 "UID and GID base of %s don't match.", directory
);
3161 arg_uid_range
= UINT32_C(0x10000);
3163 if (arg_uid_shift
!= 0) {
3164 /* If the image is shifted already, then we'll fall back to classic chowning, for
3165 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3167 if (arg_userns_ownership
== USER_NAMESPACE_OWNERSHIP_AUTO
) {
3168 log_debug("UID base of %s is non-zero, not using UID mapping.", directory
);
3169 arg_userns_ownership
= USER_NAMESPACE_OWNERSHIP_CHOWN
;
3170 } else if (arg_userns_ownership
== USER_NAMESPACE_OWNERSHIP_MAP
)
3171 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
3172 "UID base of %s is not zero, UID mapping not supported.", directory
);
3176 if (!userns_shift_range_valid(arg_uid_shift
, arg_uid_range
))
3177 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "UID base too high for UID range.");
3182 static unsigned long effective_clone_ns_flags(void) {
3183 unsigned long flags
= arg_clone_ns_flags
;
3185 if (arg_private_network
)
3186 flags
|= CLONE_NEWNET
;
3188 flags
|= CLONE_NEWCGROUP
;
3189 if (arg_userns_mode
!= USER_NAMESPACE_NO
)
3190 flags
|= CLONE_NEWUSER
;
3195 static int patch_sysctl(void) {
3197 /* This table is inspired by runc's sysctl() function */
3198 static const struct {
3201 unsigned long clone_flags
;
3203 { "kernel.hostname", false, CLONE_NEWUTS
},
3204 { "kernel.domainname", false, CLONE_NEWUTS
},
3205 { "kernel.msgmax", false, CLONE_NEWIPC
},
3206 { "kernel.msgmnb", false, CLONE_NEWIPC
},
3207 { "kernel.msgmni", false, CLONE_NEWIPC
},
3208 { "kernel.sem", false, CLONE_NEWIPC
},
3209 { "kernel.shmall", false, CLONE_NEWIPC
},
3210 { "kernel.shmmax", false, CLONE_NEWIPC
},
3211 { "kernel.shmmni", false, CLONE_NEWIPC
},
3212 { "fs.mqueue.", true, CLONE_NEWIPC
},
3213 { "net.", true, CLONE_NEWNET
},
3216 unsigned long flags
;
3219 flags
= effective_clone_ns_flags();
3221 STRV_FOREACH_PAIR(k
, v
, arg_sysctl
) {
3225 for (i
= 0; i
< ELEMENTSOF(safe_sysctl
); i
++) {
3227 if (!FLAGS_SET(flags
, safe_sysctl
[i
].clone_flags
))
3230 if (safe_sysctl
[i
].prefix
)
3231 good
= startswith(*k
, safe_sysctl
[i
].key
);
3233 good
= streq(*k
, safe_sysctl
[i
].key
);
3240 return log_error_errno(SYNTHETIC_ERRNO(EPERM
), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k
);
3242 r
= sysctl_write(*k
, *v
);
3244 return log_error_errno(r
, "Failed to write sysctl '%s': %m", *k
);
3250 static int inner_child(
3252 int fd_inner_socket
,
3254 char **os_release_pairs
) {
3256 _cleanup_free_
char *home
= NULL
;
3259 (char*) "PATH=" DEFAULT_PATH_COMPAT
,
3260 NULL
, /* container */
3265 NULL
, /* container_uuid */
3266 NULL
, /* LISTEN_FDS */
3267 NULL
, /* LISTEN_PID */
3268 NULL
, /* NOTIFY_SOCKET */
3269 NULL
, /* CREDENTIALS_DIRECTORY */
3273 const char *exec_target
;
3274 _cleanup_strv_free_
char **env_use
= NULL
;
3275 int r
, which_failed
;
3277 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3278 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3279 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3280 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3281 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3282 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3285 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3286 * unshare(). See below. */
3289 assert(fd_inner_socket
>= 0);
3291 log_debug("Inner child is initializing.");
3293 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
3294 /* Tell the parent, that it now can write the UID map. */
3295 (void) barrier_place(barrier
); /* #1 */
3297 /* Wait until the parent wrote the UID map */
3298 if (!barrier_place_and_sync(barrier
)) /* #2 */
3299 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Parent died too early");
3301 /* Become the new root user inside our namespace */
3302 r
= reset_uid_gid();
3304 return log_error_errno(r
, "Couldn't become new root: %m");
3306 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3307 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3308 * propagation, but simply create new peer groups for all our mounts). */
3309 r
= mount_follow_verbose(LOG_ERR
, NULL
, "/", NULL
, MS_SHARED
|MS_REC
, NULL
);
3315 arg_mount_settings
| MOUNT_IN_USERNS
,
3317 arg_selinux_apifs_context
);
3321 if (!arg_network_namespace_path
&& arg_private_network
) {
3322 _cleanup_close_
int netns_fd
= -EBADF
;
3324 if (arg_privileged
) {
3325 if (unshare(CLONE_NEWNET
) < 0)
3326 return log_error_errno(errno
, "Failed to unshare network namespace: %m");
3329 netns_fd
= namespace_open_by_type(NAMESPACE_NET
);
3331 return log_error_errno(netns_fd
, "Failed to open newly allocate network namespace: %m");
3333 r
= send_one_fd(fd_inner_socket
, netns_fd
, 0);
3335 return log_error_errno(r
, "Failed to send network namespace to supervisor: %m");
3337 /* Tell the parent that it can setup network interfaces. */
3338 (void) barrier_place(barrier
); /* #3 */
3341 if (arg_privileged
) {
3342 r
= mount_sysfs(NULL
, arg_mount_settings
);
3347 /* Wait until we are cgroup-ified, so that we can mount the right cgroup path writable */
3348 if (!barrier_place_and_sync(barrier
)) /* #4 */
3349 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
),
3350 "Parent died too early");
3353 r
= unshare(CLONE_NEWCGROUP
);
3355 return log_error_errno(errno
, "Failed to unshare cgroup namespace: %m");
3358 arg_unified_cgroup_hierarchy
,
3359 arg_userns_mode
!= USER_NAMESPACE_NO
,
3362 arg_selinux_apifs_context
,
3365 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
3369 r
= setup_boot_id();
3373 r
= setup_kmsg(fd_inner_socket
);
3380 arg_n_custom_mounts
,
3383 arg_selinux_apifs_context
,
3384 MOUNT_NON_ROOT_ONLY
| MOUNT_IN_USERNS
);
3389 return log_error_errno(errno
, "setsid() failed: %m");
3391 if (arg_private_network
)
3392 (void) loopback_setup();
3394 if (arg_expose_ports
) {
3395 r
= expose_port_send_rtnl(fd_inner_socket
);
3400 if (arg_console_mode
!= CONSOLE_PIPE
) {
3401 _cleanup_close_
int master
= -EBADF
;
3402 _cleanup_free_
char *console
= NULL
;
3404 /* Allocate a pty and make it available as /dev/console. */
3405 master
= openpt_allocate(O_RDWR
|O_NONBLOCK
, &console
);
3407 return log_error_errno(master
, "Failed to allocate a pty: %m");
3409 r
= setup_dev_console(console
);
3411 return log_error_errno(r
, "Failed to set up /dev/console: %m");
3413 r
= send_one_fd(fd_inner_socket
, master
, 0);
3415 return log_error_errno(r
, "Failed to send master fd: %m");
3417 r
= setup_stdio_as_dev_console();
3426 if (arg_oom_score_adjust_set
) {
3427 r
= set_oom_score_adjust(arg_oom_score_adjust
);
3429 return log_error_errno(r
, "Failed to adjust OOM score: %m");
3432 if (arg_cpu_set
.set
)
3433 if (sched_setaffinity(0, arg_cpu_set
.allocated
, arg_cpu_set
.set
) < 0)
3434 return log_error_errno(errno
, "Failed to set CPU affinity: %m");
3436 (void) setup_hostname();
3438 if (arg_personality
!= PERSONALITY_INVALID
) {
3439 r
= safe_personality(arg_personality
);
3441 return log_error_errno(r
, "personality() failed: %m");
3442 #ifdef ARCHITECTURE_SECONDARY
3443 } else if (arg_architecture
== ARCHITECTURE_SECONDARY
) {
3444 r
= safe_personality(PER_LINUX32
);
3446 return log_error_errno(r
, "personality() failed: %m");
3448 } else if (!arg_quiet
&& arg_architecture
>= 0 && arg_architecture
!= native_architecture())
3449 log_notice("Selected architecture '%s' not supported natively on the local CPU, assuming "
3450 "invocation with qemu userspace emulator (or equivalent) in effect.",
3451 architecture_to_string(arg_architecture
));
3453 r
= setrlimit_closest_all((const struct rlimit
*const*) arg_rlimit
, &which_failed
);
3455 return log_error_errno(r
, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed
));
3460 if (is_seccomp_available()) {
3461 r
= seccomp_load(arg_seccomp
);
3462 if (ERRNO_IS_NEG_SECCOMP_FATAL(r
))
3463 return log_error_errno(r
, "Failed to install seccomp filter: %m");
3465 log_debug_errno(r
, "Failed to install seccomp filter: %m");
3470 r
= setup_seccomp(arg_caps_retain
, arg_syscall_allow_list
, arg_syscall_deny_list
);
3475 if (arg_suppress_sync
) {
3477 r
= seccomp_suppress_sync();
3479 log_debug_errno(r
, "Failed to install sync() suppression seccomp filter, ignoring: %m");
3481 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
3486 if (arg_selinux_context
)
3487 if (setexeccon(arg_selinux_context
) < 0)
3488 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
3491 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3492 * if we need to later on. */
3493 if (prctl(PR_SET_KEEPCAPS
, 1) < 0)
3494 return log_error_errno(errno
, "Failed to set PR_SET_KEEPCAPS: %m");
3496 if (uid_is_valid(arg_uid
) || gid_is_valid(arg_gid
))
3497 r
= change_uid_gid_raw(arg_uid
, arg_gid
, arg_supplementary_gids
, arg_n_supplementary_gids
, arg_console_mode
!= CONSOLE_PIPE
);
3499 r
= change_uid_gid(arg_user
, arg_console_mode
!= CONSOLE_PIPE
, &home
);
3503 r
= drop_capabilities(getuid());
3505 return log_error_errno(r
, "Dropping capabilities failed: %m");
3507 if (arg_no_new_privileges
)
3508 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0)
3509 return log_error_errno(errno
, "Failed to disable new privileges: %m");
3511 /* LXC sets container=lxc, so follow the scheme here */
3512 envp
[n_env
++] = strjoina("container=", arg_container_service_name
);
3514 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
3518 if (home
|| !uid_is_valid(arg_uid
) || arg_uid
== 0)
3519 if (asprintf(envp
+ n_env
++, "HOME=%s", home
?: "/root") < 0)
3522 if (arg_user
|| !uid_is_valid(arg_uid
) || arg_uid
== 0)
3523 if (asprintf(envp
+ n_env
++, "USER=%s", arg_user
?: "root") < 0 ||
3524 asprintf(envp
+ n_env
++, "LOGNAME=%s", arg_user
?: "root") < 0)
3527 assert(!sd_id128_is_null(arg_uuid
));
3529 if (asprintf(envp
+ n_env
++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid
)) < 0)
3532 if (!fdset_isempty(fds
)) {
3533 r
= fdset_cloexec(fds
, false);
3535 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
3537 if ((asprintf(envp
+ n_env
++, "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
3538 (asprintf(envp
+ n_env
++, "LISTEN_PID=1") < 0))
3541 if (asprintf(envp
+ n_env
++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH
) < 0)
3544 if (arg_credentials
.n_credentials
> 0) {
3545 envp
[n_env
] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3551 if (arg_start_mode
!= START_BOOT
) {
3552 envp
[n_env
] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE
);
3558 env_use
= strv_env_merge(envp
, os_release_pairs
, arg_setenv
);
3562 /* Let the parent know that we are ready and wait until the parent is ready with the setup, too... */
3563 if (!barrier_place_and_sync(barrier
)) /* #5 */
3564 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Parent died too early");
3566 /* Note, this should be done this late (💣 and not moved earlier! 💣), so that all namespacing
3567 * changes are already in effect by now, so that any resolved paths here definitely reference
3568 * resources inside the container, and not outside of them. */
3570 if (chdir(arg_chdir
) < 0)
3571 return log_error_errno(errno
, "Failed to change to specified working directory %s: %m", arg_chdir
);
3573 if (arg_start_mode
== START_PID2
) {
3574 r
= stub_pid1(arg_uuid
);
3579 if (arg_console_mode
!= CONSOLE_PIPE
) {
3580 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3581 * are configured for that. Acquire it as controlling tty. */
3582 if (ioctl(STDIN_FILENO
, TIOCSCTTY
) < 0)
3583 return log_error_errno(errno
, "Failed to acquire controlling TTY: %m");
3586 log_debug("Inner child completed, invoking payload.");
3588 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3589 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3590 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3592 log_set_open_when_needed(true);
3593 log_settle_target();
3595 (void) fdset_close_others(fds
);
3597 if (arg_start_mode
== START_BOOT
) {
3601 /* Automatically search for the init system */
3603 m
= strv_length(arg_parameters
);
3604 a
= newa(char*, m
+ 2);
3605 memcpy_safe(a
+ 1, arg_parameters
, m
* sizeof(char*));
3608 FOREACH_STRING(init
,
3609 "/usr/lib/systemd/systemd",
3610 "/lib/systemd/systemd",
3612 a
[0] = (char*) init
;
3613 execve(a
[0], a
, env_use
);
3616 exec_target
= "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3617 } else if (!strv_isempty(arg_parameters
)) {
3618 const char *dollar_path
;
3620 exec_target
= arg_parameters
[0];
3622 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3624 dollar_path
= strv_env_get(env_use
, "PATH");
3626 if (setenv("PATH", dollar_path
, 1) < 0)
3627 return log_error_errno(errno
, "Failed to update $PATH: %m");
3630 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
3633 /* If we cannot change the directory, we'll end up in /, that is expected. */
3634 (void) chdir(home
?: "/root");
3636 execle(DEFAULT_USER_SHELL
, "-" DEFAULT_USER_SHELL_NAME
, NULL
, env_use
);
3637 if (!streq(DEFAULT_USER_SHELL
, "/bin/bash"))
3638 execle("/bin/bash", "-bash", NULL
, env_use
);
3639 if (!streq(DEFAULT_USER_SHELL
, "/bin/sh"))
3640 execle("/bin/sh", "-sh", NULL
, env_use
);
3642 exec_target
= DEFAULT_USER_SHELL
", /bin/bash, /bin/sh";
3645 return log_error_errno(errno
, "execv(%s) failed: %m", exec_target
);
/* Sets up the notification socket that the container payload reports to.
 *
 * Visible steps: allocate an AF_UNIX/SOCK_DGRAM socket (close-on-exec, non-blocking), build the
 * bind path by joining directory with NSPAWN_NOTIFY_SOCKET_PATH (the bare
 * NSPAWN_NOTIFY_SOCKET_PATH is used whenever j is NULL), create the parent directories, call
 * sockaddr_un_unlink() on the address, bind under a restrictive umask, chown the bound path to
 * 0:0 through userns_lchown() and finally enable SO_PASSCRED on the socket.
 *
 * directory: prefix for the socket path; outer_child() passes either NULL or its directory.
 *
 * Failures are reported through log_error_errno() and its result is returned.
 * NOTE(review): the success return is not part of this extract. Callers store the result in an
 * fd variable, so presumably the bound socket fd is handed back to the caller -- confirm. */
3648 static int setup_notify_child(const void *directory
) {
3649 _cleanup_close_
int fd
= -EBADF
;
3650 _cleanup_free_
char *j
= NULL
;
3651 union sockaddr_union sa
= {
3652 .un
.sun_family
= AF_UNIX
,
3656 fd
= socket(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
|SOCK_NONBLOCK
, 0);
3658 return log_error_errno(errno
, "Failed to allocate notification socket: %m");
/* Path of the socket as seen from this process, i.e. below directory. */
3661 j
= path_join(directory
, NSPAWN_NOTIFY_SOCKET_PATH
);
3666 r
= sockaddr_un_set_path(&sa
.un
, j
?: NSPAWN_NOTIFY_SOCKET_PATH
);
3668 return log_error_errno(r
, "Failed to set AF_UNIX path to %s: %m", j
?: NSPAWN_NOTIFY_SOCKET_PATH
);
/* Best effort: both results are deliberately ignored, a real problem surfaces in bind() below. */
3670 (void) mkdir_parents(sa
.un
.sun_path
, 0755);
3671 (void) sockaddr_un_unlink(&sa
.un
);
3673 WITH_UMASK(0577) { /* only set "w" bit, which is all that's necessary for connecting from the container */
3674 r
= bind(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
));
3676 return log_error_errno(errno
, "bind(" NSPAWN_NOTIFY_SOCKET_PATH
") failed: %m");
3679 r
= userns_lchown(sa
.un
.sun_path
, 0, 0);
3681 return log_error_errno(r
, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH
": %m");
/* Ask the kernel to attach sender credentials to every datagram; nspawn_dispatch_notify_fd()
 * looks for SCM_CREDENTIALS and compares the sender PID against the inner child. */
3683 r
= setsockopt_int(fd
, SOL_SOCKET
, SO_PASSCRED
, true);
3685 return log_error_errno(r
, "SO_PASSCRED failed: %m");
/* Prepares the host-side directory /run/systemd/nspawn/unix-export/<arg_machine>.
 *
 * Visible steps: bail out early (with a debug message) when not running privileged, refuse to
 * continue if the path already is a mount point, create the directory, mount a small file system
 * on it (size=4M, nr_inodes=64, mode=0755) and then remount the bind mount read-only.
 *
 * Ownership of the path string moves p -> q -> w via TAKE_PTR() so that the cleanup handler in
 * effect always matches the current state: plain free first, then rmdir_and_freep once the
 * directory exists, then umount_and_rmdir_and_freep once something is mounted on it.
 *
 * ret: output parameter. NOTE(review): the assignment to *ret and the final return are not part
 * of this extract; presumably the path is handed to the caller on success -- confirm. */
3690 static int setup_unix_export_dir_outside(char **ret
) {
3695 if (!arg_privileged
) {
3696 log_debug("Not digging socket tunnel, because running unprivileged.");
3700 _cleanup_free_
char *p
= NULL
;
3701 p
= path_join("/run/systemd/nspawn/unix-export", arg_machine
);
3705 r
= path_is_mount_point(p
);
3707 return log_error_errno(SYNTHETIC_ERRNO(EEXIST
), "Mount point '%s' exists already, refusing.", p
);
/* -ENOENT is tolerated here: the directory is created right below. */
3708 if (r
< 0 && r
!= -ENOENT
)
3709 return log_error_errno(r
, "Failed to detect if '%s' is a mount point: %m", p
);
3711 r
= mkdir_p(p
, 0755);
3713 return log_error_errno(r
, "Failed to create '%s': %m", p
);
/* From here on a failure must also remove the directory again. */
3715 _cleanup_(rmdir_and_freep
) char *q
= TAKE_PTR(p
);
3717 /* Mount the "unix export" directory really tiny, just 64 inodes. We mark the superblock writable
3718 * (since the container shall bind sockets into it). */
3719 r
= mount_nofollow_verbose(
3724 MS_NODEV
|MS_NOEXEC
|MS_NOSUID
|ms_nosymfollow_supported(),
3725 "size=4M,nr_inodes=64,mode=0755");
/* From here on a failure must additionally unmount what was just mounted. */
3729 _cleanup_(umount_and_rmdir_and_freep
) char *w
= TAKE_PTR(q
);
3731 /* After creating the superblock we change the bind mount to be read-only. This means that the fs
3732 * itself is writable, but not through the mount accessible from the host. */
3733 r
= mount_nofollow_verbose(
3738 MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NODEV
|MS_NOEXEC
|MS_NOSUID
|ms_nosymfollow_supported(),
3739 /* options= */ NULL
);
/* Container-side counterpart of setup_unix_export_dir_outside().
 *
 * Visible steps: do nothing special when not privileged, make sure the run/host hierarchy exists
 * below directory (make_run_host()), create <directory>/run/host/unix-export, issue two
 * mount_nofollow_verbose() calls for it (the second one is a bind remount with
 * nodev/noexec/nosuid and, where supported, nosymfollow) and chown the result to 0:0 through
 * userns_lchown().
 *
 * directory:        root of the container tree being prepared.
 * unix_export_path: must be non-NULL in the privileged case (asserted). NOTE(review): its use as
 *                   mount source is not part of this extract -- presumably it is what the first
 *                   mount call binds onto p; confirm.
 *
 * Failures are reported through log_error_errno() and its result is returned. */
3747 static int setup_unix_export_host_inside(const char *directory
, const char *unix_export_path
) {
3752 if (!arg_privileged
)
3755 assert(unix_export_path
);
3757 r
= make_run_host(directory
);
3761 _cleanup_free_
char *p
= path_join(directory
, "run/host/unix-export");
/* Plain mkdir(), not mkdir_p(): an already existing directory is treated as an error here. */
3765 if (mkdir(p
, 0755) < 0)
3766 return log_error_errno(errno
, "Failed to create '%s': %m", p
);
3768 r
= mount_nofollow_verbose(
3774 /* options= */ NULL
);
/* Unlike the host-side remount this one does not pass MS_RDONLY. */
3778 r
= mount_nofollow_verbose(
3783 MS_BIND
|MS_REMOUNT
|MS_NODEV
|MS_NOEXEC
|MS_NOSUID
|ms_nosymfollow_supported(),
3784 /* options= */ NULL
);
3788 r
= userns_lchown(p
, 0, 0);
3790 return log_error_errno(r
, "Failed to chown '%s': %m", p
);
/* Composes the DissectImageFlags mask shared by the image mount calls in outer_child().
 *
 * A fixed set of flags is always included. Two parts depend on configuration:
 *  - arg_read_only selects DISSECT_IMAGE_READ_ONLY, otherwise DISSECT_IMAGE_FSCK and
 *    DISSECT_IMAGE_GROWFS are used instead;
 *  - DISSECT_IMAGE_ALLOW_INTERACTIVE_AUTH is only added when arg_console_mode is
 *    CONSOLE_INTERACTIVE.
 *
 * Callers OR further, call-specific flags onto the result. */
3795 static DissectImageFlags
determine_dissect_image_flags(void) {
3797 DISSECT_IMAGE_GENERIC_ROOT
|
3798 DISSECT_IMAGE_REQUIRE_ROOT
|
3799 DISSECT_IMAGE_RELAX_VAR_CHECK
|
3800 DISSECT_IMAGE_USR_NO_ROOT
|
3801 DISSECT_IMAGE_DISCARD_ON_LOOP
|
3802 DISSECT_IMAGE_ADD_PARTITION_DEVICES
|
3803 DISSECT_IMAGE_PIN_PARTITION_DEVICES
|
3804 (arg_read_only
? DISSECT_IMAGE_READ_ONLY
: DISSECT_IMAGE_FSCK
|DISSECT_IMAGE_GROWFS
) |
3805 DISSECT_IMAGE_ALLOW_USERSPACE_VERITY
|
3806 (arg_console_mode
== CONSOLE_INTERACTIVE
? DISSECT_IMAGE_ALLOW_INTERACTIVE_AUTH
: 0);
/* Body of the "outer" child process: prepares the container file system tree and forks the
 * "inner" child that eventually becomes the payload.
 *
 * Rough order of the steps visible here:
 *  1. read the host os-release pairs (prefixed "container_host_"), set PR_SET_PDEATHSIG, reset
 *     the audit loginuid and mark the whole mount tree MS_SLAVE;
 *  2. for disk images mount the root partition only, then determine the UID shift and exchange
 *     it (plus a pinned mount namespace fd) with the parent over fd_outer_socket;
 *  3. turn directory into a private mount point, apply pivot root, volatile mode, bind-user and
 *     custom mount setup, and set up ID mapping or recursive chown for user namespaces;
 *  4. populate the tree (base file system, device nodes, pts, credentials, timezone, resolv.conf,
 *     machine id, journal, /run/host/ metadata files, ...);
 *  5. fork the inner child via raw_clone() with the requested namespace flags, then send its PID,
 *     the machine ID and the notify socket fd to the parent and close the sockets.
 *
 * directory:        container root; replaced by /run/systemd/nspawn-root if it equals "/".
 * dissected_image:  may be NULL when not operating on a disk image.
 * fd_outer_socket:  socket to the parent process (asserted valid).
 * fd_inner_socket:  socket handed on to inner_child() (asserted valid).
 * unix_export_path: forwarded to setup_unix_export_host_inside().
 *
 * NOTE(review): this extract omits several original lines (parts of the parameter list, local
 * declarations, most plain error checks and closing braces), so names such as barrier, fds,
 * netns_fd, pid, l, p, idmap and dirs are used below without a visible declaration. */
3809 static int outer_child(
3811 const char *directory
,
3812 DissectedImage
*dissected_image
,
3813 int fd_outer_socket
,
3814 int fd_inner_socket
,
3817 const char *unix_export_path
) {
3819 _cleanup_(bind_user_context_freep
) BindUserContext
*bind_user_context
= NULL
;
3820 _cleanup_strv_free_
char **os_release_pairs
= NULL
;
3821 _cleanup_close_
int fd
= -EBADF
, mntns_fd
= -EBADF
;
3828 /* This is the "outer" child process, i.e. the one forked off by the container manager itself. It
3829 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3830 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3831 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3832 * forked off it, and it exits. */
3836 assert(fd_outer_socket
>= 0);
3837 assert(fd_inner_socket
>= 0);
3839 log_debug("Outer child is initializing.");
3841 r
= load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs
);
3843 log_debug_errno(r
, "Failed to read os-release from host for container, ignoring: %m");
3845 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
3846 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
3848 r
= reset_audit_loginuid();
3852 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3853 * mounts to the real root. */
3854 r
= mount_follow_verbose(LOG_ERR
, NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
);
3858 if (dissected_image
) {
3859 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3860 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3861 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3862 * right place right away. This makes sure ESP partitions and userns are compatible. */
3864 r
= dissected_image_mount_and_warn(
3869 /* userns_fd= */ -EBADF
,
3870 determine_dissect_image_flags()|
3871 DISSECT_IMAGE_MOUNT_ROOT_ONLY
|
3872 (arg_start_mode
== START_BOOT
? DISSECT_IMAGE_VALIDATE_OS
: 0));
3877 r
= determine_uid_shift(directory
);
3881 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
3882 r
= namespace_open(0,
3883 /* ret_pidns_fd = */ NULL
,
3885 /* ret_netns_fd = */ NULL
,
3886 /* ret_userns_fd = */ NULL
,
3887 /* ret_root_fd = */ NULL
);
3889 return log_error_errno(r
, "Failed to pin outer mount namespace: %m");
3891 l
= send_one_fd(fd_outer_socket
, mntns_fd
, 0);
3893 return log_error_errno(l
, "Failed to send outer mount namespace fd: %m");
3894 mntns_fd
= safe_close(mntns_fd
);
3896 /* Let the parent know which UID shift we read from the image */
3897 l
= send(fd_outer_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
3899 return log_error_errno(errno
, "Failed to send UID shift: %m");
3900 if (l
!= sizeof(arg_uid_shift
))
3901 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
3902 "Short write while sending UID shift.");
3904 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
3905 /* When we are supposed to pick the UID shift, the parent will check now whether the
3906 * UID shift we just read from the image is available. If yes, it will send the UID
3907 * shift back to us, if not it will pick a different one, and send it back to us. */
3909 l
= recv(fd_outer_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3911 return log_error_errno(errno
, "Failed to recv UID shift: %m");
3912 if (l
!= sizeof(arg_uid_shift
))
3913 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
3914 "Short read while receiving UID shift.");
3917 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
,
3918 "Selected user namespace base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
3921 if (path_equal(directory
, "/")) {
3922 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3923 * place, so that we can make changes to its mount structure (for example, to implement
3924 * --volatile=) without this interfering with our ability to access files such as
3925 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3926 * (instead of a temporary directory, since we are living in our own mount namespace here
3927 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
3928 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3930 r
= mount_nofollow_verbose(LOG_ERR
, "/", "/run/systemd/nspawn-root", NULL
, MS_BIND
|MS_REC
, NULL
);
3934 directory
= "/run/systemd/nspawn-root";
3937 /* Make sure we always have a mount that we can move to root later on. */
3938 r
= make_mount_point(directory
);
3942 /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
3943 * mount namespace. For the directory we are going to run our container let's turn this off, so that
3944 * we'll live in our own little world from now on, and propagation from the host may only happen via
3945 * the mount tunnel dir, or not at all. */
3946 r
= mount_follow_verbose(LOG_ERR
, NULL
, directory
, NULL
, MS_PRIVATE
|MS_REC
, NULL
);
3950 r
= setup_pivot_root(
3953 arg_pivot_root_old
);
3957 r
= setup_volatile_mode(
3961 arg_selinux_apifs_context
);
3965 r
= bind_user_prepare(
3970 &arg_custom_mounts
, &arg_n_custom_mounts
,
3971 &bind_user_context
);
3975 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& bind_user_context
) {
3976 /* Send the user maps we determined to the parent, so that it installs it in our user
3977 * namespace UID map table */
3979 for (size_t i
= 0; i
< bind_user_context
->n_data
; i
++) {
3981 bind_user_context
->data
[i
].payload_user
->uid
,
3982 bind_user_context
->data
[i
].host_user
->uid
,
3983 (uid_t
) bind_user_context
->data
[i
].payload_group
->gid
,
3984 (uid_t
) bind_user_context
->data
[i
].host_group
->gid
,
3987 l
= send(fd_outer_socket
, map
, sizeof(map
), MSG_NOSIGNAL
);
3989 return log_error_errno(errno
, "Failed to send user UID map: %m");
3990 if (l
!= sizeof(map
))
3991 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
3992 "Short write while sending user UID map.");
3999 arg_n_custom_mounts
,
4002 arg_selinux_apifs_context
,
/* ID-mapped mounts are only attempted with a user namespace, ownership mode "map" or "auto" and
 * a non-zero UID shift. In "auto" mode a refusal (EINVAL or a not-supported error) downgrades
 * arg_userns_ownership to recursive chown()ing instead of failing. */
4007 if (arg_userns_mode
!= USER_NAMESPACE_NO
&&
4008 IN_SET(arg_userns_ownership
, USER_NAMESPACE_OWNERSHIP_MAP
, USER_NAMESPACE_OWNERSHIP_AUTO
) &&
4009 arg_uid_shift
!= 0) {
4010 _cleanup_free_
char *usr_subtree
= NULL
;
4014 dirs
[i
++] = (char*) directory
;
4016 if (dissected_image
&& dissected_image
->partitions
[PARTITION_USR
].found
) {
4017 usr_subtree
= path_join(directory
, "/usr");
4021 dirs
[i
++] = usr_subtree
;
4026 r
= remount_idmap(dirs
, arg_uid_shift
, arg_uid_range
, UID_INVALID
, UID_INVALID
, REMOUNT_IDMAPPING_HOST_ROOT
);
4027 if (r
== -EINVAL
|| ERRNO_IS_NEG_NOT_SUPPORTED(r
)) {
4028 /* This might fail because the kernel or file system doesn't support idmapping. We
4029 * can't really distinguish this nicely, nor do we have any guarantees about the
4030 * error codes we see, could be EOPNOTSUPP or EINVAL. */
4031 if (arg_userns_ownership
!= USER_NAMESPACE_OWNERSHIP_AUTO
)
4032 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
),
4033 "ID mapped mounts are apparently not available, sorry.");
4035 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
4036 arg_userns_ownership
= USER_NAMESPACE_OWNERSHIP_CHOWN
;
4038 return log_error_errno(r
, "Failed to set up ID mapped mounts: %m");
4040 log_debug("ID mapped mounts available, making use of them.");
4045 if (dissected_image
) {
4046 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
4047 r
= dissected_image_mount_and_warn(
4052 /* userns_fd= */ -EBADF
,
4053 determine_dissect_image_flags()|
4054 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY
|
4055 (idmap
? DISSECT_IMAGE_MOUNT_IDMAPPED
: 0));
4060 if (arg_unified_cgroup_hierarchy
== CGROUP_UNIFIED_UNKNOWN
) {
4061 /* OK, we don't know which cgroup mode to use yet. Let's figure it out, and tell the parent. */
4063 r
= detect_unified_cgroup_hierarchy_from_image(directory
);
4067 l
= send(fd_outer_socket
, &arg_unified_cgroup_hierarchy
, sizeof(arg_unified_cgroup_hierarchy
), MSG_NOSIGNAL
);
4069 return log_error_errno(errno
, "Failed to send cgroup mode: %m");
4070 if (l
!= sizeof(arg_unified_cgroup_hierarchy
))
4071 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
4072 "Short write while sending cgroup mode.");
4075 r
= recursive_chown(directory
, arg_uid_shift
, arg_uid_range
);
4079 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
4083 if (arg_read_only
&& arg_volatile_mode
== VOLATILE_NO
&&
4084 !has_custom_root_mount(arg_custom_mounts
, arg_n_custom_mounts
)) {
4085 r
= bind_remount_recursive(directory
, MS_RDONLY
, MS_RDONLY
, NULL
);
4087 return log_error_errno(r
, "Failed to make tree read-only: %m");
4090 r
= mount_all(directory
,
4093 arg_selinux_apifs_context
);
4097 r
= copy_devnodes(directory
);
4101 r
= make_extra_nodes(directory
);
4105 (void) dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
4107 p
= prefix_roota(directory
, "/run/host");
4108 (void) make_inaccessible_nodes(p
, arg_uid_shift
, arg_uid_shift
);
4110 r
= setup_unix_export_host_inside(directory
, unix_export_path
);
4114 r
= setup_pts(directory
);
4118 r
= mount_tunnel_dig(directory
);
4122 r
= setup_keyring();
4126 r
= setup_credentials(directory
);
4130 r
= bind_user_setup(bind_user_context
, directory
);
4137 arg_n_custom_mounts
,
4140 arg_selinux_apifs_context
,
4141 MOUNT_NON_ROOT_ONLY
);
4145 r
= setup_timezone(directory
);
4149 r
= setup_resolv_conf(directory
);
4153 r
= setup_machine_id(directory
);
4157 r
= setup_journal(directory
);
4161 /* The same stuff as the $container env var, but nicely readable for the entire payload */
4162 p
= prefix_roota(directory
, "/run/host/container-manager");
4163 (void) write_string_file(p
, arg_container_service_name
, WRITE_STRING_FILE_CREATE
|WRITE_STRING_FILE_MODE_0444
);
4165 /* The same stuff as the $container_uuid env var */
4166 p
= prefix_roota(directory
, "/run/host/container-uuid");
4167 (void) write_string_filef(p
, WRITE_STRING_FILE_CREATE
|WRITE_STRING_FILE_MODE_0444
, SD_ID128_UUID_FORMAT_STR
, SD_ID128_FORMAT_VAL(arg_uuid
));
4169 if (!arg_use_cgns
) {
4172 arg_unified_cgroup_hierarchy
,
4173 arg_userns_mode
!= USER_NAMESPACE_NO
,
4176 arg_selinux_apifs_context
,
4182 /* We have different codepaths here for privileged and non-privileged mode. In privileged mode we'll
4183 * now switch into the target directory, and then do the final setup from there. If a user namespace
4184 * is then allocated for the container, the root mount and everything else will be out of reach for
4185 * it. For unprivileged containers we cannot do that however, since we couldn't mount a sysfs and
4186 * procfs then anymore, since that only works if there's an unobstructed instance currently
4187 * visible. Hence there we do it the other way round: we first allocate a new set of namespaces
4188 * (and fork for it) for which we then mount sysfs/procfs, and only then switch root. */
/* NOTE(review): several comment terminators in the stretch that follows (after the issue URL,
 * after "propagated from the host into" and after "the outer child needs to do") are missing
 * from this extract, so a C lexer would swallow the statements in between up to the next
 * terminator. Restore the terminators from the pristine source before building this. */
4190 if (arg_privileged
) {
4191 /* Mark everything as shared so our mounts get propagated down. This is required to make new
4192 * bind mounts available in systemd services inside the container that create a new mount
4193 * namespace. See https://github.com/systemd/systemd/issues/3860 Further submounts (such as
4194 * /dev/) done after this will inherit the shared propagation mode.
4196 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
4197 * directory mount to root later on.
4198 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
4200 r
= mount_switch_root(directory
, MS_SHARED
);
4202 return log_error_errno(r
, "Failed to move root directory: %m");
4204 /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
4205 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
4207 r
= mount_tunnel_open();
4211 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
4212 /* In order to mount procfs and sysfs in an unprivileged container the kernel
4213 * requires that a fully visible instance is already present in the target mount
4214 * namespace. Mount one here so the inner child can mount its own instances. Later
4215 * we umount the temporary instances created here before we actually exec the
4216 * payload. Since the rootfs is shared the umount will propagate into the container.
4217 * Note, the inner child wouldn't be able to unmount the instances on its own since
4218 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
4220 r
= pin_fully_visible_fs();
4225 fd
= setup_notify_child(NULL
);
4227 fd
= setup_notify_child(directory
);
4231 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
4232 arg_clone_ns_flags
|
4233 (arg_userns_mode
!= USER_NAMESPACE_NO
? CLONE_NEWUSER
: 0) |
4234 ((arg_private_network
&& !arg_privileged
) ? CLONE_NEWNET
: 0));
4236 return log_error_errno(errno
, "Failed to fork inner child: %m");
4238 fd_outer_socket
= safe_close(fd_outer_socket
);
4240 /* The inner child has all namespaces that are requested, so that we all are owned by the
4241 * user if user namespaces are turned on. */
4243 if (arg_network_namespace_path
) {
4244 r
= namespace_enter(/* pidns_fd = */ -EBADF
,
4245 /* mntns_fd = */ -EBADF
,
4247 /* userns_fd = */ -EBADF
,
4248 /* root_fd = */ -EBADF
);
4250 return log_error_errno(r
, "Failed to join network namespace: %m");
4253 if (!arg_privileged
) {
4254 /* In unprivileged operation, sysfs + procfs are special, we'll have to mount them
4255 * inside the inner namespaces, but before we switch root. Hence do so here. */
4256 _cleanup_free_
char *j
= path_join(directory
, "/proc");
4260 r
= mount_follow_verbose(LOG_ERR
, "proc", j
, "proc", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
4264 r
= mount_sysfs(directory
, arg_mount_settings
);
4268 r
= mount_switch_root(directory
, MS_SHARED
);
4270 return log_error_errno(r
, "Failed to move root directory: %m");
4273 r
= inner_child(barrier
, fd_inner_socket
, fds
, os_release_pairs
);
4275 _exit(EXIT_FAILURE
);
4277 _exit(EXIT_SUCCESS
);
/* Report to the parent over fd_outer_socket, in this order: the inner child PID, the machine ID
 * and the notify socket fd. Each plain send() is checked for both failure and short writes. */
4280 l
= send(fd_outer_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
4282 return log_error_errno(errno
, "Failed to send PID: %m");
4283 if (l
!= sizeof(pid
))
4284 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
4285 "Short write while sending PID.");
4287 l
= send(fd_outer_socket
, &arg_uuid
, sizeof(arg_uuid
), MSG_NOSIGNAL
);
4289 return log_error_errno(errno
, "Failed to send machine ID: %m");
4290 if (l
!= sizeof(arg_uuid
))
4291 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
4292 "Short write while sending machine ID.");
4294 l
= send_one_fd(fd_outer_socket
, fd
, 0);
4296 return log_error_errno(l
, "Failed to send notify fd: %m");
/* Nothing more to exchange: drop our references to the sockets and the network namespace fd. */
4298 fd_outer_socket
= safe_close(fd_outer_socket
);
4299 fd_inner_socket
= safe_close(fd_inner_socket
);
4300 netns_fd
= safe_close(netns_fd
);
/* Picks a free 64K UID/GID range for USER_NAMESPACE_PICK mode and locks it.
 *
 * Only valid when arg_userns_mode is USER_NAMESPACE_PICK and arg_uid_range is 0x10000 (both
 * asserted). A candidate base is checked as follows:
 *  - it has to lie within CONTAINER_UID_BASE_MIN..CONTAINER_UID_BASE_MAX and have its lower
 *    16 bits cleared;
 *  - a lock file /run/systemd/nspawn-uid/<candidate> is taken with LOCK_EX|LOCK_NB, where
 *    -EBUSY means another nspawn instance already holds that range;
 *  - neither the first (candidate) nor the last-but-one (candidate + 0xFFFE) ID of the range
 *    may resolve through getpwuid_malloc() or getgrgid_malloc().
 * New candidates come from a siphash24 of arg_machine (tried once, if a machine name is set)
 * and from random_bytes() afterwards; both are folded into the allowed window and aligned down
 * to a 64K boundary.
 *
 * shift:         in/out UID base. NOTE(review): how the initial candidate is taken from *shift
 *                and how the result is stored back is not part of this extract -- confirm.
 * ret_lock_file: receives the lock on the chosen range; the local lf is reset to
 *                LOCK_FILE_INIT afterwards so that its cleanup handler does not release it.
 *
 * NOTE(review): the loop construct, the statements taken when a check rejects a candidate and
 * the use of the n_tries budget (initialised to 100) are not visible here. */
4305 static int uid_shift_pick(uid_t
*shift
, LockFile
*ret_lock_file
) {
4306 bool tried_hashed
= false;
4307 unsigned n_tries
= 100;
4312 assert(ret_lock_file
);
4313 assert(arg_userns_mode
== USER_NAMESPACE_PICK
);
4314 assert(arg_uid_range
== 0x10000U
);
/* Result ignored on purpose: the directory may already exist. */
4318 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4321 char lock_path
[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t
) + 1];
4322 _cleanup_(release_lock_file
) LockFile lf
= LOCK_FILE_INIT
;
4327 if (candidate
< CONTAINER_UID_BASE_MIN
|| candidate
> CONTAINER_UID_BASE_MAX
)
4329 if ((candidate
& UINT32_C(0xFFFF)) != 0)
4332 xsprintf(lock_path
, "/run/systemd/nspawn-uid/" UID_FMT
, candidate
);
4333 r
= make_lock_file(lock_path
, LOCK_EX
|LOCK_NB
, &lf
);
4334 if (r
== -EBUSY
) /* Range already taken by another nspawn instance */
4339 /* Make some superficial checks whether the range is currently known in the user database */
4340 if (getpwuid_malloc(candidate
, /* ret= */ NULL
) >= 0)
4342 if (getpwuid_malloc(candidate
+ UINT32_C(0xFFFE), /* ret= */ NULL
) >= 0)
4344 if (getgrgid_malloc(candidate
, /* ret= */ NULL
) >= 0)
4346 if (getgrgid_malloc(candidate
+ UINT32_C(0xFFFE), /* ret= */ NULL
) >= 0)
/* Success: transfer ownership of the lock to the caller. */
4349 *ret_lock_file
= lf
;
4350 lf
= (struct LockFile
) LOCK_FILE_INIT
;
/* Derive the next candidate: the name hash is used at most once, random values otherwise. */
4355 if (arg_machine
&& !tried_hashed
) {
4356 /* Try to hash the base from the container name */
4358 static const uint8_t hash_key
[] = {
4359 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4360 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4363 candidate
= (uid_t
) siphash24(arg_machine
, strlen(arg_machine
), hash_key
);
4365 tried_hashed
= true;
4367 random_bytes(&candidate
, sizeof(candidate
));
/* Fold into the permitted window, then clear the lower 16 bits to align to a 64K boundary. */
4369 candidate
= (candidate
% (CONTAINER_UID_BASE_MAX
- CONTAINER_UID_BASE_MIN
)) + CONTAINER_UID_BASE_MIN
;
4370 candidate
&= (uid_t
) UINT32_C(0xFFFF0000);
/* Appends one line of the form "<container_uid> <host_uid> <range>" plus newline to the string
 * referenced by p, using strextendf(), and returns that call's result.
 *
 * The three numbers are: first ID inside the namespace, first ID outside of it, and the number
 * of consecutive IDs covered.
 *
 * NOTE(review): only container_uid is visible in the parameter list of this extract; p, host_uid
 * and range are presumably the remaining parameters -- confirm against the full source. */
4374 static int add_one_uid_map(
4376 uid_t container_uid
,
4380 return strextendf(p
,
4381 UID_FMT
" " UID_FMT
" " UID_FMT
"\n",
4382 container_uid
, host_uid
, range
);
/* Builds the text of a UID or GID map covering container IDs 0..arg_uid_range-1.
 *
 * Every ID is mapped to arg_uid_shift + ID on the host, except for the --bind-user= entries,
 * which each get a one-ID line mapping the payload ID to the matching host ID. The result is a
 * sequence of add_one_uid_map() lines: gap range, bind user, gap range, ..., trailing range.
 *
 * bind_user_uid:   flat array of quadruplets; for entry i the payload ID is read from index
 *                  i*4+offset and the host ID from i*4+offset+1. May be NULL only if
 *                  n_bind_user_uid is 0 (asserted).
 * n_bind_user_uid: number of quadruplets.
 * offset:          0 selects the UID pair, 2 the GID pair (asserted to be one of the two).
 *
 * Entries have to be sorted by ascending payload ID and stay below arg_uid_range; both
 * conditions are asserted rather than handled.
 *
 * NOTE(review): the offset parameter, the output parameter and the final hand-over of s are
 * not visible in this extract; setup_uid_map() passes &s as fourth argument. */
4385 static int make_uid_map_string(
4386 const uid_t bind_user_uid
[],
4387 size_t n_bind_user_uid
,
4391 _cleanup_free_
char *s
= NULL
;
/* First container ID that is not covered by a map line yet. */
4392 uid_t previous_uid
= 0;
4395 assert(n_bind_user_uid
== 0 || bind_user_uid
);
4396 assert(IN_SET(offset
, 0, 2)); /* used to switch between UID and GID map */
4399 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4400 * quadruplet, consisting of host and container UID + GID. */
4402 for (size_t i
= 0; i
< n_bind_user_uid
; i
++) {
4403 uid_t payload_uid
= bind_user_uid
[i
*4+offset
],
4404 host_uid
= bind_user_uid
[i
*4+offset
+1];
4406 assert(previous_uid
<= payload_uid
);
4407 assert(payload_uid
< arg_uid_range
);
4409 /* Add a range to close the gap to previous entry */
4410 if (payload_uid
> previous_uid
) {
4411 r
= add_one_uid_map(&s
, previous_uid
, arg_uid_shift
+ previous_uid
, payload_uid
- previous_uid
);
4416 /* Map this specific user */
4417 r
= add_one_uid_map(&s
, payload_uid
, host_uid
, 1);
4421 previous_uid
= payload_uid
+ 1;
4424 /* And add a range to close the gap to finish the range */
4425 if (arg_uid_range
> previous_uid
) {
4426 r
= add_one_uid_map(&s
, previous_uid
, arg_uid_shift
+ previous_uid
, arg_uid_range
- previous_uid
);
/* Writes the UID and GID maps of the process identified by pid.
 *
 * The UID map is generated with make_uid_map_string() at offset 0 and written to
 * /proc/<pid>/uid_map; the GID map is generated at offset 2 and written to /proc/<pid>/gid_map.
 * Both writes use WRITE_STRING_FILE_DISABLE_BUFFER. The uid_map path buffer is reused for the
 * gid_map path, which has the same length.
 *
 * bind_user_uid / n_bind_user_uid: --bind-user= quadruplets, passed through unchanged.
 *
 * Write failures are reported through log_error_errno() and its result is returned.
 * NOTE(review): the pid parameter, the handling of a make_uid_map_string() failure and whatever
 * resets s between the two passes are not visible in this extract. */
4437 static int setup_uid_map(
4439 const uid_t bind_user_uid
[],
4440 size_t n_bind_user_uid
) {
4442 char uid_map
[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1];
4443 _cleanup_free_
char *s
= NULL
;
4448 /* Build the UID map string */
4449 if (make_uid_map_string(bind_user_uid
, n_bind_user_uid
, 0, &s
) < 0) /* offset=0 contains the UID pair */
4452 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
4453 r
= write_string_file(uid_map
, s
, WRITE_STRING_FILE_DISABLE_BUFFER
);
4455 return log_error_errno(r
, "Failed to write UID map: %m");
4457 /* And now build the GID map string */
4459 if (make_uid_map_string(bind_user_uid
, n_bind_user_uid
, 2, &s
) < 0) /* offset=2 contains the GID pair */
4462 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
4463 r
= write_string_file(uid_map
, s
, WRITE_STRING_FILE_DISABLE_BUFFER
);
4465 return log_error_errno(r
, "Failed to write GID map: %m");
/* I/O callback for the notification socket (registered in setup_notify_parent()).
 *
 * 'userdata' carries a PID encoded as a pointer; it is decoded with PTR_TO_PID() and used to
 * filter senders. One datagram is received per invocation. It is ignored, with a log message, if
 * the poll event is not exactly EPOLLIN, if the control data was truncated (-EXFULL), if the
 * SCM_CREDENTIALS PID is missing or differs from the expected PID, or if the payload fills the
 * whole buffer.
 *
 * An accepted message is split into tags on "\n" / "\r". "READY=1" is passed on through
 * sd_notify(), and a "STATUS=" tag is passed on through sd_notifyf() with the prefix
 * "Container running: ".
 *
 * NOTE(review): the return statements of the early exits, the declarations of n, r and p, the
 * remaining initializers of iovec/msghdr and the termination of buf are not visible in this
 * view. */
4470 static int nspawn_dispatch_notify_fd(sd_event_source
*source
, int fd
, uint32_t revents
, void *userdata
) {
4471 char buf
[NOTIFY_BUFFER_MAX
+1];
4473 struct iovec iovec
= {
4475 .iov_len
= sizeof(buf
)-1,
/* Control buffer sized for one struct ucred plus up to NOTIFY_FD_MAX file descriptors. */
4477 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred
)) +
4478 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX
)) control
;
4479 struct msghdr msghdr
= {
4482 .msg_control
= &control
,
4483 .msg_controllen
= sizeof(control
),
4485 struct ucred
*ucred
;
4487 pid_t inner_child_pid
;
4488 _cleanup_strv_free_
char **tags
= NULL
;
4493 inner_child_pid
= PTR_TO_PID(userdata
);
4495 if (revents
!= EPOLLIN
) {
4496 log_warning("Got unexpected poll event for notify fd.");
4500 n
= recvmsg_safe(fd
, &msghdr
, MSG_DONTWAIT
|MSG_CMSG_CLOEXEC
);
4501 if (ERRNO_IS_NEG_TRANSIENT(n
))
4503 else if (n
== -EXFULL
) {
4504 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4507 return log_warning_errno(n
, "Couldn't read notification socket: %m");
/* presumably closes any file descriptors that arrived with the message; nothing below uses
 * them — confirm against cmsg_close_all() */
4509 cmsg_close_all(&msghdr
);
4511 ucred
= CMSG_FIND_DATA(&msghdr
, SOL_SOCKET
, SCM_CREDENTIALS
, struct ucred
);
4512 if (!ucred
|| ucred
->pid
!= inner_child_pid
) {
4513 log_debug("Received notify message without valid credentials. Ignoring.");
/* iov_len is sizeof(buf)-1, so a payload this long means the buffer was filled completely. */
4517 if ((size_t) n
>= sizeof(buf
)) {
4518 log_warning("Received notify message exceeded maximum size. Ignoring.");
4523 tags
= strv_split(buf
, "\n\r");
/* Debug output only: join the tags with spaces and escape the result before logging it. */
4527 if (DEBUG_LOGGING
) {
4528 _cleanup_free_
char *joined
= strv_join(tags
, " ");
4531 _cleanup_free_
char *j
= cescape(joined
);
4532 free_and_replace(joined
, j
);
4535 log_debug("Got sd_notify() message: %s", strnull(joined
));
4538 if (strv_contains(tags
, "READY=1")) {
4539 r
= sd_notify(false, "READY=1\n");
4541 log_warning_errno(r
, "Failed to send readiness notification, ignoring: %m");
4544 p
= strv_find_startswith(tags
, "STATUS=");
4546 (void) sd_notifyf(false, "STATUS=Container running: %s", p
);
/* Registers 'fd' on 'event' as an EPOLLIN I/O source handled by nspawn_dispatch_notify_fd(),
 * stores the new source in *notify_event_source and sets its description to "nspawn-notify".
 *
 * 'inner_child_pid' is passed through unchanged as the callback's userdata. The callback decodes
 * its userdata with PTR_TO_PID(), so despite the pid_t* type this argument is expected to be a
 * PID_TO_PTR()-encoded PID rather than the address of a pid_t.
 *
 * NOTE(review): the declaration of r, the check on r and the final return are not visible in
 * this view. */
4551 static int setup_notify_parent(sd_event
*event
, int fd
, pid_t
*inner_child_pid
, sd_event_source
**notify_event_source
) {
4557 r
= sd_event_add_io(event
, notify_event_source
, fd
, EPOLLIN
, nspawn_dispatch_notify_fd
, inner_child_pid
);
4559 return log_error_errno(r
, "Failed to allocate notify event source: %m");
4561 (void) sd_event_source_set_description(*notify_event_source
, "nspawn-notify");
/* Sets the title on PTY forwarder 'f'.
 *
 * Two title formats are used: "<prefix>Container <machine> on <hostname>" and
 * "<prefix>Container <machine>". The prefix is the blue-circle glyph followed by a space when
 * emoji_enabled() is true, and empty otherwise. The same prefix string is also passed to
 * pty_forward_set_title_prefix(). All calls are best-effort; their results are discarded.
 *
 * NOTE(review): the conditions that choose between the two formats and that guard the prefix
 * call are not visible in this view — presumably they test hn and dot; confirm. */
4566 static void set_window_title(PTYForward
*f
) {
4567 _cleanup_free_
char *hn
= NULL
, *dot
= NULL
;
4571 (void) gethostname_strict(&hn
);
4573 if (emoji_enabled())
4574 dot
= strjoin(special_glyph(SPECIAL_GLYPH_BLUE_CIRCLE
), " ");
4577 (void) pty_forward_set_titlef(f
, "%sContainer %s on %s", strempty(dot
), arg_machine
, hn
);
4579 (void) pty_forward_set_titlef(f
, "%sContainer %s", strempty(dot
), arg_machine
);
4582 (void) pty_forward_set_title_prefix(f
, dot
);
/* Applies the values held in *settings to the global arg_* configuration. 'path' is only used in
 * log messages to name the file the settings came from.
 *
 * A group of settings is only looked at when its SETTING_* bit is clear in arg_settings_mask, and
 * when the corresponding field in *settings carries a value (non-NULL, >= 0, not the *_INVALID
 * marker, ...). Pointer-valued fields are moved out of *settings, so *settings is modified.
 *
 * For many groups a warning of the form "Ignoring ..., file ... is not trusted." is logged when
 * arg_settings_trusted is false.
 * NOTE(review): the branch paired with those warnings is not visible in this view — presumably
 * the value is applied only when the file is trusted; confirm. Several conditions, the
 * declarations of r, rl and ambient, and the final return are not visible either. */
4585 static int merge_settings(Settings
*settings
, const char *path
) {
4591 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4592 * that this steals the fields of the Settings* structure, and hence modifies it. */
4594 if ((arg_settings_mask
& SETTING_START_MODE
) == 0 &&
4595 settings
->start_mode
>= 0) {
4596 arg_start_mode
= settings
->start_mode
;
4597 strv_free_and_replace(arg_parameters
, settings
->parameters
);
4600 if ((arg_settings_mask
& SETTING_EPHEMERAL
) == 0 &&
4601 settings
->ephemeral
>= 0)
4602 arg_ephemeral
= settings
->ephemeral
;
4604 if ((arg_settings_mask
& SETTING_DIRECTORY
) == 0 &&
4607 if (!arg_settings_trusted
)
4608 log_warning("Ignoring root directory setting, file %s is not trusted.", path
);
4610 free_and_replace(arg_directory
, settings
->root
);
4613 if ((arg_settings_mask
& SETTING_PIVOT_ROOT
) == 0 &&
4614 settings
->pivot_root_new
) {
4615 free_and_replace(arg_pivot_root_new
, settings
->pivot_root_new
);
4616 free_and_replace(arg_pivot_root_old
, settings
->pivot_root_old
);
4619 if ((arg_settings_mask
& SETTING_WORKING_DIRECTORY
) == 0 &&
4620 settings
->working_directory
)
4621 free_and_replace(arg_chdir
, settings
->working_directory
);
4623 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
4624 settings
->environment
)
4625 strv_free_and_replace(arg_setenv
, settings
->environment
);
4627 if ((arg_settings_mask
& SETTING_USER
) == 0) {
4630 free_and_replace(arg_user
, settings
->user
);
4632 if (uid_is_valid(settings
->uid
))
4633 arg_uid
= settings
->uid
;
4634 if (gid_is_valid(settings
->gid
))
4635 arg_gid
= settings
->gid
;
4636 if (settings
->n_supplementary_gids
> 0) {
4637 free_and_replace(arg_supplementary_gids
, settings
->supplementary_gids
);
4638 arg_n_supplementary_gids
= settings
->n_supplementary_gids
;
/* Capabilities: 'plus' and 'minus' start from the file's capability / drop_capability masks.
 * CAP_NET_ADMIN is added either to 'plus' or to 'network_minus' depending on the network
 * settings in the file. 'minus' is always removed from arg_caps_retain. */
4642 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
4643 uint64_t plus
, minus
;
4644 uint64_t network_minus
= 0;
4647 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4648 * Settings structure */
4650 plus
= settings
->capability
;
4651 minus
= settings
->drop_capability
;
4653 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
4654 settings_network_configured(settings
)) {
4655 if (settings_private_network(settings
))
4656 plus
|= UINT64_C(1) << CAP_NET_ADMIN
;
4658 network_minus
|= UINT64_C(1) << CAP_NET_ADMIN
;
4661 if (!arg_settings_trusted
&& plus
!= 0) {
4662 if (settings
->capability
!= 0)
4663 log_warning("Ignoring Capability= setting, file %s is not trusted.", path
);
4665 arg_caps_retain
&= ~network_minus
;
4666 arg_caps_retain
|= plus
;
4669 arg_caps_retain
&= ~minus
;
4671 /* Copy the full capabilities over too */
4672 if (capability_quintet_is_set(&settings
->full_capabilities
)) {
4673 if (!arg_settings_trusted
)
4674 log_warning("Ignoring capability settings, file %s is not trusted.", path
);
4676 arg_full_capabilities
= settings
->full_capabilities
;
4679 ambient
= settings
->ambient_capability
;
4680 if (!arg_settings_trusted
&& ambient
!= 0)
4681 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path
);
4683 arg_caps_ambient
|= ambient
;
4686 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
4687 settings
->kill_signal
> 0)
4688 arg_kill_signal
= settings
->kill_signal
;
4690 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
4691 settings
->personality
!= PERSONALITY_INVALID
)
4692 arg_personality
= settings
->personality
;
4694 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
4695 !sd_id128_is_null(settings
->machine_id
)) {
4697 if (!arg_settings_trusted
)
4698 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path
);
4700 arg_uuid
= settings
->machine_id
;
4703 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
4704 settings
->read_only
>= 0)
4705 arg_read_only
= settings
->read_only
;
4707 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
4708 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
4709 arg_volatile_mode
= settings
->volatile_mode
;
4711 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
4712 settings
->n_custom_mounts
> 0) {
4714 if (!arg_settings_trusted
)
4715 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path
);
/* Replace the current mount list: free the old array, take over the array from *settings and
 * zero the count left behind in *settings. */
4717 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
4718 arg_custom_mounts
= TAKE_PTR(settings
->custom_mounts
);
4719 arg_n_custom_mounts
= settings
->n_custom_mounts
;
4720 settings
->n_custom_mounts
= 0;
4724 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
4725 settings_network_configured(settings
)) {
4727 if (!arg_settings_trusted
)
4728 log_warning("Ignoring network settings, file %s is not trusted.", path
);
4730 arg_network_veth
= settings_network_veth(settings
);
4731 arg_private_network
= settings_private_network(settings
);
4733 strv_free_and_replace(arg_network_interfaces
, settings
->network_interfaces
);
4734 strv_free_and_replace(arg_network_macvlan
, settings
->network_macvlan
);
4735 strv_free_and_replace(arg_network_ipvlan
, settings
->network_ipvlan
);
4736 strv_free_and_replace(arg_network_veth_extra
, settings
->network_veth_extra
);
4738 free_and_replace(arg_network_bridge
, settings
->network_bridge
);
4739 free_and_replace(arg_network_zone
, settings
->network_zone
);
4741 free_and_replace(arg_network_namespace_path
, settings
->network_namespace_path
);
4745 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
4746 settings
->expose_ports
) {
4748 if (!arg_settings_trusted
)
4749 log_warning("Ignoring Port= setting, file %s is not trusted.", path
);
4751 expose_port_free_all(arg_expose_ports
);
4752 arg_expose_ports
= TAKE_PTR(settings
->expose_ports
);
4756 if ((arg_settings_mask
& SETTING_USERNS
) == 0 &&
4757 settings
->userns_mode
!= _USER_NAMESPACE_MODE_INVALID
) {
4759 if (!arg_settings_trusted
)
4760 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path
);
4762 arg_userns_mode
= settings
->userns_mode
;
4763 arg_uid_shift
= settings
->uid_shift
;
4764 arg_uid_range
= settings
->uid_range
;
4765 arg_userns_ownership
= settings
->userns_ownership
;
4769 if ((arg_settings_mask
& SETTING_BIND_USER
) == 0 &&
4770 !strv_isempty(settings
->bind_user
))
4771 strv_free_and_replace(arg_bind_user
, settings
->bind_user
);
4773 if ((arg_settings_mask
& SETTING_NOTIFY_READY
) == 0 &&
4774 settings
->notify_ready
>= 0)
4775 arg_notify_ready
= settings
->notify_ready
;
4777 if ((arg_settings_mask
& SETTING_SYSCALL_FILTER
) == 0) {
4779 if (!strv_isempty(settings
->syscall_allow_list
) || !strv_isempty(settings
->syscall_deny_list
)) {
/* The "not trusted" warning is tied to a non-empty allow list only. */
4780 if (!arg_settings_trusted
&& !strv_isempty(settings
->syscall_allow_list
))
4781 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path
);
4783 strv_free_and_replace(arg_syscall_allow_list
, settings
->syscall_allow_list
);
4784 strv_free_and_replace(arg_syscall_deny_list
, settings
->syscall_deny_list
);
4789 if (settings
->seccomp
) {
4790 if (!arg_settings_trusted
)
4791 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path
);
4793 seccomp_release(arg_seccomp
);
4794 arg_seccomp
= TAKE_PTR(settings
->seccomp
);
/* Resource limits: each limit has its own mask bit, SETTING_RLIMIT_FIRST shifted left by the
 * limit index. */
4800 for (rl
= 0; rl
< _RLIMIT_MAX
; rl
++) {
4801 if ((arg_settings_mask
& (SETTING_RLIMIT_FIRST
<< rl
)))
4804 if (!settings
->rlimit
[rl
])
4807 if (!arg_settings_trusted
) {
4808 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl
), path
);
4812 free_and_replace(arg_rlimit
[rl
], settings
->rlimit
[rl
]);
4815 if ((arg_settings_mask
& SETTING_HOSTNAME
) == 0 &&
4817 free_and_replace(arg_hostname
, settings
->hostname
);
4819 if ((arg_settings_mask
& SETTING_NO_NEW_PRIVILEGES
) == 0 &&
4820 settings
->no_new_privileges
>= 0)
4821 arg_no_new_privileges
= settings
->no_new_privileges
;
4823 if ((arg_settings_mask
& SETTING_OOM_SCORE_ADJUST
) == 0 &&
4824 settings
->oom_score_adjust_set
) {
4826 if (!arg_settings_trusted
)
4827 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path
);
4829 arg_oom_score_adjust
= settings
->oom_score_adjust
;
4830 arg_oom_score_adjust_set
= true;
4834 if ((arg_settings_mask
& SETTING_CPU_AFFINITY
) == 0 &&
4835 settings
->cpu_set
.set
) {
4837 if (!arg_settings_trusted
)
4838 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path
);
4840 cpu_set_reset(&arg_cpu_set
);
4841 arg_cpu_set
= TAKE_STRUCT(settings
->cpu_set
);
4845 if ((arg_settings_mask
& SETTING_RESOLV_CONF
) == 0 &&
4846 settings
->resolv_conf
!= _RESOLV_CONF_MODE_INVALID
)
4847 arg_resolv_conf
= settings
->resolv_conf
;
4849 if ((arg_settings_mask
& SETTING_LINK_JOURNAL
) == 0 &&
4850 settings
->link_journal
!= _LINK_JOURNAL_INVALID
) {
4852 if (!arg_settings_trusted
)
4853 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path
);
4855 arg_link_journal
= settings
->link_journal
;
4856 arg_link_journal_try
= settings
->link_journal_try
;
4860 if ((arg_settings_mask
& SETTING_TIMEZONE
) == 0 &&
4861 settings
->timezone
!= _TIMEZONE_MODE_INVALID
)
4862 arg_timezone
= settings
->timezone
;
4864 if ((arg_settings_mask
& SETTING_SLICE
) == 0 &&
4867 if (!arg_settings_trusted
)
4868 log_warning("Ignoring slice setting, file '%s' is not trusted.", path
);
4870 free_and_replace(arg_slice
, settings
->slice
);
4873 if ((arg_settings_mask
& SETTING_USE_CGNS
) == 0 &&
4874 settings
->use_cgns
>= 0) {
4876 if (!arg_settings_trusted
)
4877 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path
);
4879 arg_use_cgns
= settings
->use_cgns
;
4882 if ((arg_settings_mask
& SETTING_CLONE_NS_FLAGS
) == 0 &&
4883 settings
->clone_ns_flags
!= ULONG_MAX
) {
4885 if (!arg_settings_trusted
)
4886 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path
);
4888 arg_clone_ns_flags
= settings
->clone_ns_flags
;
4891 if ((arg_settings_mask
& SETTING_CONSOLE_MODE
) == 0 &&
4892 settings
->console_mode
>= 0) {
4894 if (!arg_settings_trusted
)
4895 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path
);
4897 arg_console_mode
= settings
->console_mode
;
4900 if ((arg_settings_mask
& SETTING_SUPPRESS_SYNC
) == 0 &&
4901 settings
->suppress_sync
>= 0)
4902 arg_suppress_sync
= settings
->suppress_sync
;
4904 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4905 * don't consult arg_settings_mask for them. */
4907 sd_bus_message_unref(arg_property_message
);
4908 arg_property_message
= TAKE_PTR(settings
->properties
);
4910 arg_console_width
= settings
->console_width
;
4911 arg_console_height
= settings
->console_height
;
/* Same hand-over as for the custom mounts: free the old array, take the new one and zero the
 * count left behind in *settings. */
4913 device_node_array_free(arg_extra_nodes
, arg_n_extra_nodes
);
4914 arg_extra_nodes
= TAKE_PTR(settings
->extra_nodes
);
4915 arg_n_extra_nodes
= settings
->n_extra_nodes
;
4916 settings
->n_extra_nodes
= 0;
/* Locates the settings file named arg_settings_filename, loads it with settings_load() and
 * applies it with merge_settings().
 *
 * Search order, as far as it is visible here:
 *   1. When arg_privileged is set: "/etc/systemd/nspawn", then "/run/systemd/nspawn". If
 *      arg_settings_trusted is still undecided (< 0) at that point it is set to true.
 *   2. A file next to arg_image or arg_directory, as computed by file_in_same_dir(). If
 *      arg_settings_trusted is still undecided at that point it is set to false.
 * An open failure other than ENOENT is logged and returned as an error.
 *
 * NOTE(review): the fopen() calls, the loop exit, the early returns (for example when all
 * settings are masked) and the declaration of r are not visible in this view. */
4921 static int load_settings(void) {
4922 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
4923 _cleanup_fclose_
FILE *f
= NULL
;
4924 _cleanup_free_
char *p
= NULL
;
4930 /* If all settings are masked, there's no point in looking for
4931 * the settings file */
4932 if (FLAGS_SET(arg_settings_mask
, _SETTINGS_MASK_ALL
))
4935 /* We first look in the admin's directories in /etc and /run */
4936 if (arg_privileged
) {
4937 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4938 _cleanup_free_
char *j
= NULL
;
4940 j
= path_join(i
, arg_settings_filename
);
4948 /* By default, we trust configuration from /etc and /run */
4949 if (arg_settings_trusted
< 0)
4950 arg_settings_trusted
= true;
4955 if (errno
!= ENOENT
)
4956 return log_error_errno(errno
, "Failed to open %s: %m", j
);
4961 /* After that, let's look for a file next to the
4962 * actual image we shall boot. */
4965 r
= file_in_same_dir(arg_image
, arg_settings_filename
, &p
);
4967 return log_error_errno(r
, "Failed to generate settings path from image path: %m");
4968 } else if (arg_directory
) {
4969 r
= file_in_same_dir(arg_directory
, arg_settings_filename
, &p
);
4970 if (r
< 0 && r
!= -EADDRNOTAVAIL
) /* if directory is root fs, don't complain */
4971 return log_error_errno(r
, "Failed to generate settings path from directory path: %m");
4976 if (!f
&& errno
!= ENOENT
)
4977 return log_error_errno(errno
, "Failed to open %s: %m", p
);
4979 /* By default, we do not trust configuration from /var/lib/machines */
4980 if (arg_settings_trusted
< 0)
4981 arg_settings_trusted
= false;
4988 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
4990 r
= settings_load(f
, p
, &settings
);
/* p is passed as the 'path' argument, which merge_settings() uses in its warnings. */
4994 return merge_settings(settings
, p
);
/* Loads the OCI bundle named by arg_oci_bundle with oci_load() and applies the result with
 * merge_settings(), passing the bundle path as the name used in log messages.
 *
 * If arg_settings_trusted is still undecided (< 0) it is set to true first, so OCI bundles are
 * trusted unless something set it earlier.
 *
 * NOTE(review): the statement taken when arg_oci_bundle is unset, the declaration of r and the
 * check on oci_load()'s result are not visible in this view. */
4997 static int load_oci_bundle(void) {
4998 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
5001 if (!arg_oci_bundle
)
5004 /* By default let's trust OCI bundles */
5005 if (arg_settings_trusted
< 0)
5006 arg_settings_trusted
= true;
5008 r
= oci_load(NULL
, arg_oci_bundle
, &settings
);
5012 return merge_settings(settings
, arg_oci_bundle
);
5015 static int run_container(
5016 DissectedImage
*dissected_image
,
5019 char veth_name
[IFNAMSIZ
],
5021 struct ExposeArgs
*expose_args
,
5026 static const struct sigaction sa
= {
5027 .sa_handler
= nop_signal_handler
,
5028 .sa_flags
= SA_NOCLDSTOP
|SA_RESTART
,
5031 _cleanup_(release_lock_file
) LockFile uid_shift_lock
= LOCK_FILE_INIT
;
5032 _cleanup_close_
int etc_passwd_lock
= -EBADF
;
5033 _cleanup_close_pair_
int
5034 fd_inner_socket_pair
[2] = EBADF_PAIR
,
5035 fd_outer_socket_pair
[2] = EBADF_PAIR
;
5037 _cleanup_close_
int notify_socket
= -EBADF
, mntns_fd
= -EBADF
, fd_kmsg_fifo
= -EBADF
;
5038 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
5039 _cleanup_(sd_event_source_unrefp
) sd_event_source
*notify_event_source
= NULL
;
5040 _cleanup_(umount_and_rmdir_and_freep
) char *unix_export_host_dir
= NULL
;
5041 _cleanup_(sd_event_unrefp
) sd_event
*event
= NULL
;
5042 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
5043 _cleanup_(sd_netlink_unrefp
) sd_netlink
*rtnl
= NULL
;
5044 _cleanup_(sd_bus_flush_close_unrefp
) sd_bus
*bus
= NULL
;
5045 _cleanup_free_ uid_t
*bind_user_uid
= NULL
;
5046 size_t n_bind_user_uid
= 0;
5047 ContainerStatus container_status
= 0;
5051 _cleanup_close_
int child_netns_fd
= -EBADF
;
5053 assert_se(sigemptyset(&mask_chld
) == 0);
5054 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
5056 /* Set up the unix export host directory on the host first */
5057 r
= setup_unix_export_dir_outside(&unix_export_host_dir
);
5061 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
5062 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
5063 * check with getpwuid() if the specific user already exists. Note that /etc might be
5064 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
5065 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
5066 * really just an extra safety net. We kinda assume that the UID range we allocate from is
5069 etc_passwd_lock
= take_etc_passwd_lock(NULL
);
5070 if (etc_passwd_lock
< 0 && etc_passwd_lock
!= -EROFS
)
5071 return log_error_errno(etc_passwd_lock
, "Failed to take /etc/passwd lock: %m");
5074 r
= barrier_create(&barrier
);
5076 return log_error_errno(r
, "Cannot initialize IPC barrier: %m");
5078 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, fd_inner_socket_pair
) < 0)
5079 return log_error_errno(errno
, "Failed to create inner socket pair: %m");
5081 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, fd_outer_socket_pair
) < 0)
5082 return log_error_errno(errno
, "Failed to create outer socket pair: %m");
5084 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
5085 * parent's blocking calls and give it a chance to call wait() and terminate. */
5086 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
5088 return log_error_errno(errno
, "Failed to change the signal mask: %m");
5090 r
= sigaction(SIGCHLD
, &sa
, NULL
);
5092 return log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
5094 if (arg_network_namespace_path
) {
5095 child_netns_fd
= open(arg_network_namespace_path
, O_RDONLY
|O_NOCTTY
|O_CLOEXEC
);
5096 if (child_netns_fd
< 0)
5097 return log_error_errno(errno
, "Cannot open file %s: %m", arg_network_namespace_path
);
5099 r
= fd_is_ns(child_netns_fd
, CLONE_NEWNET
);
5101 log_debug_errno(r
, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path
);
5103 return log_error_errno(r
, "Failed to check %s fs type: %m", arg_network_namespace_path
);
5105 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
5106 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path
);
5109 if (arg_privileged
) {
5110 assert(userns_fd
< 0);
5112 /* If we have no user namespace then we'll clone and create a new mount namespace right-away. */
5114 *pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
);
5116 return log_error_errno(errno
, "clone() failed%s: %m",
5118 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
5120 assert(userns_fd
>= 0);
5122 /* If we have a user namespace then we'll clone() first, and then join the user namespace,
5123 * and then open the mount namespace, so that it is owned by the user namespace */
5125 *pid
= raw_clone(SIGCHLD
);
5127 return log_error_errno(errno
, "clone() failed: %m");
5130 if (setns(userns_fd
, CLONE_NEWUSER
) < 0) {
5131 log_error_errno(errno
, "Failed to join allocate user namespace: %m");
5132 _exit(EXIT_FAILURE
);
5135 r
= reset_uid_gid();
5137 log_error_errno(r
, "Failed to reset UID/GID to root: %m");
5138 _exit(EXIT_FAILURE
);
5141 if (unshare(CLONE_NEWNS
) < 0) {
5142 log_error_errno(errno
, "Failed to unshare file system namespace: %m");
5143 _exit(EXIT_FAILURE
);
5149 /* The outer child only has a file system namespace. */
5150 barrier_set_role(&barrier
, BARRIER_CHILD
);
5152 fd_inner_socket_pair
[0] = safe_close(fd_inner_socket_pair
[0]);
5153 fd_outer_socket_pair
[0] = safe_close(fd_outer_socket_pair
[0]);
5155 (void) reset_all_signal_handlers();
5156 (void) reset_signal_mask();
5158 r
= outer_child(&barrier
,
5161 fd_outer_socket_pair
[1],
5162 fd_inner_socket_pair
[1],
5165 unix_export_host_dir
);
5167 _exit(EXIT_FAILURE
);
5169 _exit(EXIT_SUCCESS
);
5172 barrier_set_role(&barrier
, BARRIER_PARENT
);
5176 fd_inner_socket_pair
[1] = safe_close(fd_inner_socket_pair
[1]);
5177 fd_outer_socket_pair
[1] = safe_close(fd_outer_socket_pair
[1]);
5179 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
5180 mntns_fd
= receive_one_fd(fd_outer_socket_pair
[0], 0);
5182 return log_error_errno(mntns_fd
, "Failed to receive mount namespace fd from outer child: %m");
5184 /* The child just let us know the UID shift it might have read from the image. */
5185 l
= recv(fd_outer_socket_pair
[0], &arg_uid_shift
, sizeof arg_uid_shift
, 0);
5187 return log_error_errno(errno
, "Failed to read UID shift: %m");
5188 if (l
!= sizeof arg_uid_shift
)
5189 return log_error_errno(SYNTHETIC_ERRNO(EIO
), "Short read while reading UID shift.");
5191 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
5192 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
5193 * image, but if that's already in use, pick a new one, and report back to the child,
5194 * which one we now picked. */
5196 r
= uid_shift_pick(&arg_uid_shift
, &uid_shift_lock
);
5198 return log_error_errno(r
, "Failed to pick suitable UID/GID range: %m");
5200 l
= send(fd_outer_socket_pair
[0], &arg_uid_shift
, sizeof arg_uid_shift
, MSG_NOSIGNAL
);
5202 return log_error_errno(errno
, "Failed to send UID shift: %m");
5203 if (l
!= sizeof arg_uid_shift
)
5204 return log_error_errno(SYNTHETIC_ERRNO(EIO
), "Short write while writing UID shift.");
5207 n_bind_user_uid
= strv_length(arg_bind_user
);
5208 if (n_bind_user_uid
> 0) {
5209 /* Right after the UID shift, we'll receive the list of UID mappings for the
5210 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
5212 bind_user_uid
= new(uid_t
, n_bind_user_uid
*4);
5216 for (size_t i
= 0; i
< n_bind_user_uid
; i
++) {
5217 l
= recv(fd_outer_socket_pair
[0], bind_user_uid
+ i
*4, sizeof(uid_t
)*4, 0);
5219 return log_error_errno(errno
, "Failed to read user UID map pair: %m");
5220 if (l
!= sizeof(uid_t
)*4)
5221 return log_full_errno(l
== 0 ? LOG_DEBUG
: LOG_WARNING
,
5222 SYNTHETIC_ERRNO(EIO
),
5223 "Short read while reading bind user UID pairs.");
5228 if (arg_unified_cgroup_hierarchy
== CGROUP_UNIFIED_UNKNOWN
) {
5229 /* The child let us know the support cgroup mode it might have read from the image. */
5230 l
= recv(fd_outer_socket_pair
[0], &arg_unified_cgroup_hierarchy
, sizeof(arg_unified_cgroup_hierarchy
), 0);
5232 return log_error_errno(errno
, "Failed to read cgroup mode: %m");
5233 if (l
!= sizeof(arg_unified_cgroup_hierarchy
))
5234 return log_error_errno(SYNTHETIC_ERRNO(EIO
), "Short read while reading cgroup mode (%zi bytes).%s",
5235 l
, l
== 0 ? " The child is most likely dead." : "");
5238 /* Wait for the outer child. */
5239 r
= wait_for_terminate_and_check("(sd-namespace)", *pid
, WAIT_LOG_ABNORMAL
);
5242 if (r
!= EXIT_SUCCESS
)
5245 /* And now retrieve the PID of the inner child. */
5246 l
= recv(fd_outer_socket_pair
[0], pid
, sizeof *pid
, 0);
5248 return log_error_errno(errno
, "Failed to read inner child PID: %m");
5249 if (l
!= sizeof *pid
)
5250 return log_error_errno(SYNTHETIC_ERRNO(EIO
), "Short read while reading inner child PID.");
5252 /* We also retrieve container UUID in case it was generated by outer child */
5253 l
= recv(fd_outer_socket_pair
[0], &arg_uuid
, sizeof arg_uuid
, 0);
5255 return log_error_errno(errno
, "Failed to read container machine ID: %m");
5256 if (l
!= sizeof(arg_uuid
))
5257 return log_error_errno(SYNTHETIC_ERRNO(EIO
), "Short read while reading container machined ID.");
5259 /* We also retrieve the socket used for notifications generated by outer child */
5260 notify_socket
= receive_one_fd(fd_outer_socket_pair
[0], 0);
5261 if (notify_socket
< 0)
5262 return log_error_errno(notify_socket
,
5263 "Failed to receive notification socket from the outer child: %m");
5265 log_debug("Init process invoked as PID "PID_FMT
, *pid
);
5267 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
5268 if (!barrier_place_and_sync(&barrier
)) /* #1 */
5269 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Child died too early.");
5271 r
= setup_uid_map(*pid
, bind_user_uid
, n_bind_user_uid
);
5275 (void) barrier_place(&barrier
); /* #2 */
5278 if (arg_private_network
) {
5279 if (!arg_network_namespace_path
) {
5280 /* Wait until the child has unshared its network namespace. */
5281 if (!barrier_place_and_sync(&barrier
)) /* #3 */
5282 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Child died too early");
5284 /* Make sure we have an open file descriptor to the child's network namespace so it
5285 * stays alive even if the child exits. */
5286 assert(child_netns_fd
< 0);
5287 child_netns_fd
= receive_one_fd(fd_inner_socket_pair
[0], 0);
5288 if (child_netns_fd
< 0)
5289 return log_error_errno(r
, "Failed to receive child network namespace: %m");
5292 r
= move_network_interfaces(child_netns_fd
, arg_network_interfaces
);
5296 if (arg_network_veth
) {
5297 if (arg_privileged
) {
5298 r
= setup_veth(arg_machine
, *pid
, veth_name
,
5299 arg_network_bridge
|| arg_network_zone
, &arg_network_provided_mac
);
5305 _cleanup_free_
char *host_ifname
= NULL
;
5307 r
= nsresource_add_netif(userns_fd
, child_netns_fd
, /* namespace_ifname= */ NULL
, &host_ifname
, /* ret_namespace_ifname= */ NULL
);
5309 return log_error_errno(r
, "Failed to add network interface to container: %m");
5311 ifi
= if_nametoindex(host_ifname
);
5313 return log_error_errno(errno
, "Failed to resolve interface '%s': %m", host_ifname
);
5315 if (strlen(host_ifname
) >= IFNAMSIZ
)
5316 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Host interface name too long?");
5318 strcpy(veth_name
, host_ifname
);
5321 if (arg_network_bridge
) {
5322 /* Add the interface to a bridge */
5323 r
= setup_bridge(veth_name
, arg_network_bridge
, false);
5328 } else if (arg_network_zone
) {
5329 /* Add the interface to a bridge, possibly creating it */
5330 r
= setup_bridge(veth_name
, arg_network_zone
, true);
5338 r
= setup_veth_extra(arg_machine
, *pid
, arg_network_veth_extra
);
5342 /* We created the primary and extra veth links now; let's remember this, so that we know to
5343 remove them later on. Note that we don't bother with removing veth links that were created
5344 here when their setup failed half-way, because in that case the kernel should be able to
5345 remove them on its own, since they cannot be referenced by anything yet. */
5346 *veth_created
= true;
5348 r
= setup_macvlan(arg_machine
, *pid
, arg_network_macvlan
);
5352 r
= setup_ipvlan(arg_machine
, *pid
, arg_network_ipvlan
);
5357 if (arg_register
|| !arg_keep_unit
) {
5359 r
= sd_bus_default_system(&bus
);
5361 r
= sd_bus_default_user(&bus
);
5363 return log_error_errno(r
, "Failed to open bus: %m");
5365 r
= sd_bus_set_close_on_exit(bus
, false);
5367 return log_error_errno(r
, "Failed to disable close-on-exit behaviour: %m");
5370 if (!arg_keep_unit
) {
5371 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5372 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5373 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5375 r
= sd_bus_match_signal_async(
5378 "org.freedesktop.systemd1",
5380 "org.freedesktop.systemd1.Scope",
5382 on_request_stop
, NULL
, PID_TO_PTR(*pid
));
5384 return log_error_errno(r
, "Failed to request RequestStop match: %m");
5388 r
= register_machine(
5396 arg_custom_mounts
, arg_n_custom_mounts
,
5399 arg_property_message
,
5401 arg_container_service_name
,
5406 } else if (!arg_keep_unit
) {
5412 arg_custom_mounts
, arg_n_custom_mounts
,
5415 arg_property_message
,
5416 /* allow_pidfds= */ true,
5421 } else if (arg_slice
|| arg_property
)
5422 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
5424 r
= create_subcgroup(
5427 arg_unified_cgroup_hierarchy
,
5434 r
= sync_cgroup(*pid
, arg_unified_cgroup_hierarchy
, arg_uid_shift
);
5438 /* Notify the child that the parent is ready with all its setup (including cgroup-ification), and
5439 * that the child can now hand over control to the code to run inside the container. */
5440 (void) barrier_place(&barrier
); /* #4 */
5442 /* Block SIGCHLD here, before notifying child.
5443 * process_pty() will handle it with the other signals. */
5444 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
5446 /* Reset signal to default */
5447 r
= default_signals(SIGCHLD
);
5449 return log_error_errno(r
, "Failed to reset SIGCHLD: %m");
5451 r
= sd_event_new(&event
);
5453 return log_error_errno(r
, "Failed to get default event source: %m");
5455 (void) sd_event_set_watchdog(event
, true);
5458 r
= sd_bus_attach_event(bus
, event
, 0);
5460 return log_error_errno(r
, "Failed to attach bus to event loop: %m");
5463 r
= setup_notify_parent(event
, notify_socket
, PID_TO_PTR(*pid
, &notify_event_source
);
5467 /* Wait that the child is completely ready now, and has mounted their own copies of procfs and so on,
5468 * before we take the fully visible instances away. */
5469 if (!barrier_sync(&barrier
)) /* #5.1 */
5470 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Child died too early.");
5472 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
5473 r
= wipe_fully_visible_fs(mntns_fd
);
5476 mntns_fd
= safe_close(mntns_fd
);
5479 /* And now let the child know that we completed removing the procfs instances, and it can start the
5481 if (!barrier_place(&barrier
)) /* #5.2 */
5482 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Child died too early.");
5484 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
5485 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5486 etc_passwd_lock
= safe_close(etc_passwd_lock
);
5488 (void) sd_notifyf(false,
5489 "STATUS=Container running.\n"
5490 "X_NSPAWN_LEADER_PID=" PID_FMT
, *pid
);
5491 if (!arg_notify_ready
) {
5492 r
= sd_notify(false, "READY=1\n");
5494 log_warning_errno(r
, "Failed to send readiness notification, ignoring: %m");
5497 if (arg_kill_signal
> 0) {
5498 /* Try to kill the init system on SIGINT or SIGTERM */
5499 (void) sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, PID_TO_PTR(*pid
));
5500 (void) sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, PID_TO_PTR(*pid
));
5502 /* Immediately exit */
5503 (void) sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
5504 (void) sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
5507 (void) sd_event_add_signal(event
, NULL
, SIGRTMIN
+18, sigrtmin18_handler
, NULL
);
5509 r
= sd_event_add_memory_pressure(event
, NULL
, NULL
, NULL
);
5511 log_debug_errno(r
, "Failed allocate memory pressure event source, ignoring: %m");
5513 /* Exit when the child exits */
5514 (void) sd_event_add_signal(event
, NULL
, SIGCHLD
, on_sigchld
, PID_TO_PTR(*pid
));
5516 /* Retrieve the kmsg fifo allocated by inner child */
5517 fd_kmsg_fifo
= receive_one_fd(fd_inner_socket_pair
[0], 0);
5518 if (fd_kmsg_fifo
< 0)
5519 return log_error_errno(fd_kmsg_fifo
, "Failed to receive kmsg fifo from inner child: %m");
5521 if (arg_expose_ports
) {
5522 r
= expose_port_watch_rtnl(event
, fd_inner_socket_pair
[0], on_address_change
, expose_args
, &rtnl
);
5526 (void) expose_port_execute(rtnl
, &expose_args
->fw_ctx
, arg_expose_ports
, AF_INET
, &expose_args
->address4
);
5527 (void) expose_port_execute(rtnl
, &expose_args
->fw_ctx
, arg_expose_ports
, AF_INET6
, &expose_args
->address6
);
5530 if (arg_console_mode
!= CONSOLE_PIPE
) {
5531 _cleanup_close_
int fd
= -EBADF
;
5532 PTYForwardFlags flags
= 0;
5534 /* Retrieve the master pty allocated by inner child */
5535 fd
= receive_one_fd(fd_inner_socket_pair
[0], 0);
5537 return log_error_errno(fd
, "Failed to receive master pty from the inner child: %m");
5539 switch (arg_console_mode
) {
5541 case CONSOLE_READ_ONLY
:
5542 flags
|= PTY_FORWARD_READ_ONLY
;
5546 case CONSOLE_INTERACTIVE
:
5547 flags
|= PTY_FORWARD_IGNORE_VHANGUP
;
5549 r
= pty_forward_new(event
, fd
, flags
, &forward
);
5551 return log_error_errno(r
, "Failed to create PTY forwarder: %m");
5553 if (arg_console_width
!= UINT_MAX
|| arg_console_height
!= UINT_MAX
)
5554 (void) pty_forward_set_width_height(
5557 arg_console_height
);
5559 if (!arg_background
&& shall_tint_background()) {
5560 _cleanup_free_
char *bg
= NULL
;
5562 r
= terminal_tint_color(220 /* blue */, &bg
);
5564 log_debug_errno(r
, "Failed to determine terminal background color, not tinting.");
5566 (void) pty_forward_set_background_color(forward
, bg
);
5567 } else if (!isempty(arg_background
))
5568 (void) pty_forward_set_background_color(forward
, arg_background
);
5570 set_window_title(forward
);
5574 assert(arg_console_mode
== CONSOLE_PASSIVE
);
5577 *master
= TAKE_FD(fd
);
5580 fd_inner_socket_pair
[0] = safe_close(fd_inner_socket_pair
[0]);
5582 r
= sd_event_loop(event
);
5584 return log_error_errno(r
, "Failed to run event loop: %m");
5589 (void) pty_forward_get_last_char(forward
, &last_char
);
5590 forward
= pty_forward_free(forward
);
5592 if (!arg_quiet
&& last_char
!= '\n')
5596 /* Kill if it is not dead yet anyway */
5597 if (!arg_register
&& !arg_keep_unit
&& bus
)
5598 terminate_scope(bus
, arg_machine
);
5600 /* Normally redundant, but better safe than sorry */
5601 (void) kill(*pid
, SIGKILL
);
5603 fd_kmsg_fifo
= safe_close(fd_kmsg_fifo
);
5605 if (arg_private_network
&& arg_privileged
) {
5606 r
= move_back_network_interfaces(child_netns_fd
, arg_network_interfaces
);
5611 r
= wait_for_container(TAKE_PID(*pid
), &container_status
);
5613 /* Tell machined that we are gone. */
5615 (void) unregister_machine(bus
, arg_machine
);
5618 /* We failed to wait for the container, or the container exited abnormally. */
5620 if (r
> 0 || container_status
== CONTAINER_TERMINATED
) {
5621 /* r > 0 → The container exited with a non-zero status.
5622 * As a special case, we need to replace 133 with a different value,
5623 * because 133 is special-cased in the service file to reboot the container.
5624 * otherwise → The container exited with zero status and a reboot was not requested.
5626 if (r
== EXIT_FORCE_RESTART
)
5627 r
= EXIT_FAILURE
; /* replace 133 with the general failure code */
5629 return 0; /* finito */
5632 /* CONTAINER_REBOOTED, loop again */
5634 if (arg_keep_unit
) {
5635 /* Special handling if we are running as a service: instead of simply
5636 * restarting the machine we want to restart the entire service, so let's
5637 * inform systemd about this with the special exit code 133. The service
5638 * file uses RestartForceExitStatus=133 so that this results in a full
5639 * nspawn restart. This is necessary since we might have cgroup parameters
5640 * set we want to have flushed out. */
5641 *ret
= EXIT_FORCE_RESTART
;
5642 return 0; /* finito */
5645 expose_port_flush(&expose_args
->fw_ctx
, arg_expose_ports
, AF_INET
, &expose_args
->address4
);
5646 expose_port_flush(&expose_args
->fw_ctx
, arg_expose_ports
, AF_INET6
, &expose_args
->address6
);
5648 (void) remove_veth_links(veth_name
, arg_network_veth_extra
);
5649 *veth_created
= false;
5650 return 1; /* loop again */
/* Fills in arg_rlimit[] for every resource limit the user has not configured explicitly, i.e. whose bit
 * (SETTING_RLIMIT_FIRST << rl) is not set in arg_settings_mask:
 *
 *   - RLIMIT_NPROC and RLIMIT_SIGPENDING are queried from PID 1 via pid_getrlimit();
 *   - RLIMIT_NOFILE starts from the table entry, with rlim_max replaced by
 *     MIN(read_nr_open(), HIGH_RLIMIT_NOFILE);
 *   - every other limit is taken from the static kernel_defaults[] table.
 *
 * The chosen value is stored with newdup() (presumably a heap-allocated copy, NULL on allocation failure;
 * confirm in alloc-util.h). If DEBUG_LOGGING is on, the limit is formatted with rlimit_format() and logged.
 * A failure to read the limits of PID 1 is logged and returned to the caller. */
5653 static int initialize_rlimits(void) {
5654 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
5655 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5656 * container execution environments. */
/* Table indexed by RLIMIT_* constant. The initializers are positional; on Linux/glibc struct rlimit is
 * { rlim_cur, rlim_max }, so each entry reads { soft limit, hard limit }. */
5658 static const struct rlimit kernel_defaults
[_RLIMIT_MAX
] = {
5659 [RLIMIT_AS
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5660 [RLIMIT_CORE
] = { 0, RLIM_INFINITY
},
5661 [RLIMIT_CPU
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5662 [RLIMIT_DATA
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5663 [RLIMIT_FSIZE
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5664 [RLIMIT_LOCKS
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5665 [RLIMIT_MEMLOCK
] = { DEFAULT_RLIMIT_MEMLOCK
, DEFAULT_RLIMIT_MEMLOCK
},
5666 [RLIMIT_MSGQUEUE
] = { 819200, 819200 },
5667 [RLIMIT_NICE
] = { 0, 0 },
5668 [RLIMIT_NOFILE
] = { 1024, 4096 },
5669 [RLIMIT_RSS
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5670 [RLIMIT_RTPRIO
] = { 0, 0 },
5671 [RLIMIT_RTTIME
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5672 [RLIMIT_STACK
] = { 8388608, RLIM_INFINITY
},
5674 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5675 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5676 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5677 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5678 * that PID 1 changes a number of other resource limits during early initialization which is why we
5679 * don't read the other limits from PID 1 but prefer the static table above. */
/* Walk all resource limit indexes, 0 up to (but not including) _RLIMIT_MAX. */
5684 for (rl
= 0; rl
< _RLIMIT_MAX
; rl
++) {
5685 /* Let's only fill in what the user hasn't explicitly configured anyway */
5686 if ((arg_settings_mask
& (SETTING_RLIMIT_FIRST
<< rl
)) == 0) {
5687 const struct rlimit
*v
;
5688 struct rlimit buffer
;
5690 if (IN_SET(rl
, RLIMIT_NPROC
, RLIMIT_SIGPENDING
)) {
5691 /* For these two let's read the limits off PID 1. See above for an explanation. */
5693 r
= pid_getrlimit(1, rl
, &buffer
);
5695 return log_error_errno(r
, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl
));
5698 } else if (rl
== RLIMIT_NOFILE
) {
5699 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5700 * userspace. Given that nspawn containers are often run without our PID 1,
5701 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5702 * so that container userspace gets similar resources as host userspace
5704 buffer
= kernel_defaults
[rl
];
5705 buffer
.rlim_max
= MIN((rlim_t
) read_nr_open(), (rlim_t
) HIGH_RLIMIT_NOFILE
);
5708 v
= kernel_defaults
+ rl
;
5710 arg_rlimit
[rl
] = newdup(struct rlimit
, v
, 1);
5711 if (!arg_rlimit
[rl
])
5715 if (DEBUG_LOGGING
) {
5716 _cleanup_free_
char *k
= NULL
;
5718 (void) rlimit_format(arg_rlimit
[rl
], &k
);
5719 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl
), k
);
/* Sanity check for --image=: refuses to continue when we apparently cannot reach udev, or are not in the
 * same network namespace as udev.
 *
 * Opens an AF_UNIX/SOCK_SEQPACKET socket, connects it to /run/udev/control, looks up the peer's
 * credentials with getpeercred() and compares the network namespace of that PID with in_same_namespace().
 *
 * Errors are logged. -ENOENT or a disconnect-type connect error, as well as the final namespace mismatch
 * message, are reported as EOPNOTSUPP; a privilege error on connect is only logged at debug level
 * ("assuming we are in same netns"). */
5726 static int cant_be_in_netns(void) {
5727 _cleanup_close_
int fd
= -EBADF
;
5731 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5732 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5735 if (!arg_image
) /* only matters if --image= is used, i.e. we actually need to use loopback devices */
5738 fd
= socket(AF_UNIX
, SOCK_SEQPACKET
|SOCK_NONBLOCK
|SOCK_CLOEXEC
, 0);
5740 return log_error_errno(errno
, "Failed to allocate udev control socket: %m");
/* Try to reach udev's control socket; the way this fails tells us whether udev is reachable at all. */
5742 r
= connect_unix_path(fd
, AT_FDCWD
, "/run/udev/control");
5743 if (r
== -ENOENT
|| ERRNO_IS_NEG_DISCONNECT(r
))
5744 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
),
5745 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5746 if (ERRNO_IS_NEG_PRIVILEGE(r
)) {
5747 log_debug_errno(r
, "Can't connect to udev control socket, assuming we are in same netns.");
5751 return log_error_errno(r
, "Failed to connect socket to udev control socket: %m");
/* Find out which process sits on the other end of the socket (i.e. udev) ... */
5753 r
= getpeercred(fd
, &ucred
);
5755 return log_error_errno(r
, "Failed to determine peer of udev control socket: %m");
/* ... and compare its network namespace with another process' one (PID argument 0 presumably means
 * "ourselves"; confirm in namespace-util.h). */
5757 r
= in_same_namespace(ucred
.pid
, 0, NAMESPACE_NET
);
5759 return log_error_errno(r
, "Failed to determine network namespace of udev: %m");
5761 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
),
5762 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
/* Program entry point; main() is generated from it by DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE().
 *
 * Rough flow:
 *   1. Parse the command line, run early checks (cant_be_in_netns(), verify_arguments(), ...), set up
 *      default resource limits, load the OCI bundle and the settings.
 *   2. Prepare the container root. This is either a directory tree (arg_directory), optionally an
 *      ephemeral or template-based snapshot created with btrfs_subvol_snapshot_at(), or a disk image
 *      (arg_image), optionally an ephemeral copy created with copy_file_full(). For images a temporary
 *      directory created with mkdtemp() becomes arg_directory and the image is dissected.
 *   3. Run the container.
 *   4. Clean up: flush the PTY, remove ephemeral snapshots/copies and the temporary root directory,
 *      flush exposed ports, remove network links, free the argument arrays.
 *
 * remove_directory, remove_image and remove_tmprootdir record which of these resources were created
 * here, so that the cleanup code only removes what this invocation set up. */
5766 static int run(int argc
, char *argv
[]) {
5767 bool remove_directory
= false, remove_image
= false, veth_created
= false, remove_tmprootdir
= false;
5768 _cleanup_close_
int master
= -EBADF
, userns_fd
= -EBADF
;
5769 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
5770 int r
, n_fd_passed
, ret
= EXIT_SUCCESS
;
5771 char veth_name
[IFNAMSIZ
] = "";
5772 struct ExposeArgs expose_args
= {};
5773 _cleanup_(release_lock_file
) LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
5774 char tmprootdir
[] = "/tmp/nspawn-root-XXXXXX";
5775 _cleanup_(loop_device_unrefp
) LoopDevice
*loop
= NULL
;
5776 _cleanup_(dissected_image_unrefp
) DissectedImage
*dissected_image
= NULL
;
5777 _cleanup_(fw_ctx_freep
) FirewallContext
*fw_ctx
= NULL
;
/* Whether we run as root selects between the privileged and the unprivileged code paths below. */
5782 arg_privileged
= getuid() == 0;
5784 r
= parse_argv(argc
, argv
);
5788 r
= cant_be_in_netns();
5792 r
= initialize_rlimits();
5796 r
= load_oci_bundle();
5804 r
= determine_names();
5808 r
= load_settings();
5812 /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have
5813 * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to
5815 if (!arg_private_network
&& arg_userns_mode
!= USER_NAMESPACE_NO
&& arg_uid_shift
> 0)
5816 arg_caps_retain
&= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE
);
5818 r
= cg_unified(); /* initialize cache early */
5820 log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5824 r
= verify_arguments();
5828 r
= resolve_network_interface_names(arg_network_interfaces
);
5832 r
= verify_network_interfaces_initialized();
5836 /* Reapply environment settings. */
5837 (void) detect_unified_cgroup_hierarchy_from_environment();
5839 if (!arg_privileged
) {
5840 r
= cg_all_unified();
5842 log_error_errno(r
, "Failed to determine if we are in unified cgroupv2 mode: %m");
5846 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
), "Unprivileged operation only supported in unified cgroupv2 mode.");
5849 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5850 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5851 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5852 (void) ignore_signals(SIGPIPE
);
/* Pick up file descriptors handed to us via the sd_listen_fds() protocol, if there are any. */
5854 n_fd_passed
= sd_listen_fds(false);
5855 if (n_fd_passed
> 0) {
5856 r
= fdset_new_listen_fds(&fds
, false);
5858 log_error_errno(r
, "Failed to collect file descriptors: %m");
5863 /* The "default" umask. This is appropriate for most file and directory
5864 * operations performed by nspawn, and is the umask that will be used for
5865 * the child. Functions like copy_devnodes() change the umask temporarily. */
5868 if (arg_console_mode
< 0)
5869 arg_console_mode
= isatty(STDIN_FILENO
) && isatty(STDOUT_FILENO
) ?
5870 CONSOLE_INTERACTIVE
: CONSOLE_READ_ONLY
;
5872 if (arg_console_mode
== CONSOLE_PIPE
) /* if we pass STDERR on to the container, don't add our own logs into it too */
/* Root preparation, variant 1: the container root is a directory tree (arg_directory is set). */
5875 if (arg_directory
) {
5878 if (!arg_privileged
) {
5879 r
= log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
), "Invoking container from plain directory tree is currently not supported if called without privileges.");
5883 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5884 * /var from the host will propagate into container dynamically (because bad things happen if
5885 * two systems write to the same /var). Let's allow it for the special cases where /var is
5886 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5887 if (path_equal(arg_directory
, "/") && !(arg_ephemeral
|| IN_SET(arg_volatile_mode
, VOLATILE_YES
, VOLATILE_STATE
))) {
5888 r
= log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
5889 "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
5893 if (arg_ephemeral
) {
5894 _cleanup_free_
char *np
= NULL
;
5896 r
= chase_and_update(&arg_directory
, 0);
5900 /* If the specified path is a mount point we generate the new snapshot immediately
5901 * inside it under a random name. However if the specified is not a mount point we
5902 * create the new snapshot in the parent directory, just next to it. */
5903 r
= path_is_mount_point(arg_directory
);
5905 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
5909 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
5911 r
= tempfn_random(arg_directory
, "machine.", &np
);
5913 log_error_errno(r
, "Failed to generate name for directory snapshot: %m");
5917 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
5918 * only owned by us and no one else. */
5919 r
= image_path_lock(
5922 arg_privileged
? &tree_global_lock
: NULL
,
5925 log_error_errno(r
, "Failed to lock %s: %m", np
);
5930 BLOCK_SIGNALS(SIGINT
);
5931 r
= btrfs_subvol_snapshot_at(AT_FDCWD
, arg_directory
, AT_FDCWD
, np
,
5932 (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) |
5933 BTRFS_SNAPSHOT_FALLBACK_COPY
|
5934 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
5935 BTRFS_SNAPSHOT_RECURSIVE
|
5936 BTRFS_SNAPSHOT_QUOTA
|
5937 BTRFS_SNAPSHOT_SIGINT
);
5940 log_error_errno(r
, "Interrupted while copying file system tree to %s, removed again.", np
);
5944 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
5948 free_and_replace(arg_directory
, np
);
5949 remove_directory
= true;
5951 r
= chase_and_update(&arg_directory
, arg_template
? CHASE_NONEXISTENT
: 0);
5955 r
= image_path_lock(
5957 (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
,
5958 arg_privileged
? &tree_global_lock
: NULL
,
5961 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
5965 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
5970 r
= chase_and_update(&arg_template
, 0);
5975 BLOCK_SIGNALS(SIGINT
);
5976 r
= btrfs_subvol_snapshot_at(AT_FDCWD
, arg_template
, AT_FDCWD
, arg_directory
,
5977 (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) |
5978 BTRFS_SNAPSHOT_FALLBACK_COPY
|
5979 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
5980 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE
|
5981 BTRFS_SNAPSHOT_RECURSIVE
|
5982 BTRFS_SNAPSHOT_QUOTA
|
5983 BTRFS_SNAPSHOT_SIGINT
);
5986 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
,
5987 "Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
5988 else if (r
== -EINTR
) {
5989 log_error_errno(r
, "Interrupted while copying file system tree to %s, removed again.", arg_directory
);
5992 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
5995 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
,
5996 "Populated %s from template %s.", arg_directory
, arg_template
);
6000 if (arg_start_mode
== START_BOOT
) {
6001 _cleanup_free_
char *b
= NULL
;
6003 int check_os_release
, is_os_tree
;
6005 if (arg_pivot_root_new
) {
6006 b
= path_join(arg_directory
, arg_pivot_root_new
);
6016 check_os_release
= getenv_bool("SYSTEMD_NSPAWN_CHECK_OS_RELEASE");
6017 if (check_os_release
< 0 && check_os_release
!= -ENXIO
) {
6018 r
= log_error_errno(check_os_release
, "Failed to parse $SYSTEMD_NSPAWN_CHECK_OS_RELEASE: %m");
6022 is_os_tree
= path_is_os_tree(p
);
6023 if (is_os_tree
== 0 && check_os_release
== 0)
6024 log_debug("Directory %s is missing an os-release file, continuing anyway.", p
);
6025 else if (is_os_tree
<= 0) {
6026 r
= log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
6027 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p
);
6031 _cleanup_free_
char *p
= NULL
;
6033 if (arg_pivot_root_new
)
6034 p
= path_join(arg_directory
, arg_pivot_root_new
, "/usr/");
6036 p
= path_join(arg_directory
, "/usr/");
6042 if (laccess(p
, F_OK
) < 0) {
6043 r
= log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
6044 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory
);
/* Root preparation, variant 2: disk image handling (arg_image). */
6050 DissectImageFlags dissect_image_flags
=
6051 determine_dissect_image_flags();
6054 assert(!arg_template
);
6057 r
= chase_and_update(&arg_image
, 0);
6061 if (arg_ephemeral
) {
6062 _cleanup_free_
char *np
= NULL
;
6064 r
= tempfn_random(arg_image
, "machine.", &np
);
6066 log_error_errno(r
, "Failed to generate name for image snapshot: %m");
6070 /* Always take an exclusive lock on our own ephemeral copy. */
6071 r
= image_path_lock(
6074 arg_privileged
? &tree_global_lock
: NULL
,
6077 log_error_errno(r
, "Failed to create image lock: %m");
6082 BLOCK_SIGNALS(SIGINT
);
6083 r
= copy_file_full(arg_image
, np
, O_EXCL
, arg_read_only
? 0400 : 0600,
6084 FS_NOCOW_FL
, FS_NOCOW_FL
,
6085 COPY_REFLINK
|COPY_CRTIME
|COPY_SIGINT
,
6089 log_error_errno(r
, "Interrupted while copying image file to %s, removed again.", np
);
6093 r
= log_error_errno(r
, "Failed to copy image file: %m");
6097 free_and_replace(arg_image
, np
);
6098 remove_image
= true;
6100 r
= image_path_lock(
6102 (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
,
6103 arg_privileged
? &tree_global_lock
: NULL
,
6106 log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
6110 log_error_errno(r
, "Failed to create image lock: %m");
6114 r
= verity_settings_load(
6115 &arg_verity_settings
,
6116 arg_image
, NULL
, NULL
);
6118 log_error_errno(r
, "Failed to read verity artefacts for %s: %m", arg_image
);
6122 if (arg_verity_settings
.data_path
)
6123 dissect_image_flags
|= DISSECT_IMAGE_NO_PARTITION_TABLE
;
6126 if (!mkdtemp(tmprootdir
)) {
6127 r
= log_error_errno(errno
, "Failed to create temporary directory: %m");
6131 remove_tmprootdir
= true;
6133 arg_directory
= strdup(tmprootdir
);
6134 if (!arg_directory
) {
/* With privileges we attach the image to a loop device and dissect it ourselves. The unprivileged
 * path further down goes through nsresource_allocate_userns() and mountfsd_mount_image() instead. */
6139 if (arg_privileged
) {
6140 r
= loop_device_make_by_path(
6142 arg_read_only
? O_RDONLY
: O_RDWR
,
6143 /* sector_size= */ UINT32_MAX
,
6144 FLAGS_SET(dissect_image_flags
, DISSECT_IMAGE_NO_PARTITION_TABLE
) ? 0 : LO_FLAGS_PARTSCAN
,
6148 log_error_errno(r
, "Failed to set up loopback block device: %m");
6152 r
= dissect_loop_device_and_warn(
6154 &arg_verity_settings
,
6155 /* mount_options=*/ NULL
,
6156 arg_image_policy
?: &image_policy_container
,
6157 dissect_image_flags
,
6160 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
6161 log_notice("Note that the disk image needs to\n"
6162 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
6163 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
6164 " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
6165 " d) or contain a file system without a partition table\n"
6166 "in order to be bootable with systemd-nspawn.");
6172 r
= dissected_image_load_verity_sig_partition(
6175 &arg_verity_settings
);
6179 if (dissected_image
->has_verity
&& !arg_verity_settings
.root_hash
&& !dissected_image
->has_verity_sig
)
6180 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
6181 "root hash signature found! Proceeding without integrity checking.", arg_image
);
6183 r
= dissected_image_decrypt_interactively(
6186 &arg_verity_settings
,
6187 dissect_image_flags
);
6191 _cleanup_free_
char *userns_name
= strjoin("nspawn-", arg_machine
);
6197 /* if we are unprivileged, let's allocate a 64K userns first */
6198 userns_fd
= nsresource_allocate_userns(userns_name
, UINT64_C(0x10000));
6199 if (userns_fd
< 0) {
6200 r
= log_error_errno(userns_fd
, "Failed to allocate user namespace with 64K users: %m");
6204 r
= mountfsd_mount_image(
6208 dissect_image_flags
,
6214 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
6215 if (remove_image
&& unlink(arg_image
) >= 0)
6216 remove_image
= false;
6218 if (arg_architecture
< 0)
6219 arg_architecture
= dissected_image_architecture(dissected_image
);
6222 r
= custom_mount_prepare_all(arg_directory
, arg_custom_mounts
, arg_n_custom_mounts
);
6227 const char *t
= arg_image
?: arg_directory
;
6228 _cleanup_free_
char *u
= NULL
;
6229 (void) terminal_urlify_path(t
, t
, &u
);
6231 log_info("%s %sSpawning container %s on %s.%s",
6232 special_glyph(SPECIAL_GLYPH_LIGHT_SHADE
), ansi_grey(), arg_machine
, u
?: t
, ansi_normal());
6234 if (arg_console_mode
== CONSOLE_INTERACTIVE
)
6235 log_info("%s %sPress %sCtrl-]%s three times within 1s to kill container.%s",
6236 special_glyph(SPECIAL_GLYPH_LIGHT_SHADE
), ansi_grey(), ansi_highlight(), ansi_grey(), ansi_normal());
/* Block these signals. sd_event_add_signal(), which this file uses for SIGINT, SIGTERM, SIGCHLD and
 * SIGRTMIN+18, requires the signals it watches to be blocked. */
6239 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, SIGRTMIN
+18) >= 0);
6241 r
= make_reaper_process(true);
6243 log_error_errno(r
, "Failed to become subreaper: %m");
6247 if (arg_expose_ports
) {
6248 r
= fw_ctx_new(&fw_ctx
);
6250 log_error_errno(r
, "Cannot expose configured ports, firewall initialization failed: %m");
6253 expose_args
.fw_ctx
= fw_ctx
;
6260 veth_name
, &veth_created
,
6261 &expose_args
, &master
,
/* Report that we are shutting down. The status text says "Restarting..." only if r == 0 and ret is
 * EXIT_FORCE_RESTART, and "Terminating..." otherwise. */
6268 (void) sd_notify(false,
6269 r
== 0 && ret
== EXIT_FORCE_RESTART
? "STOPPING=1\nSTATUS=Restarting..." :
6270 "STOPPING=1\nSTATUS=Terminating...");
6273 (void) kill(pid
, SIGKILL
);
6275 /* Try to flush whatever is still queued in the pty */
6277 (void) copy_bytes(master
, STDOUT_FILENO
, UINT64_MAX
, 0);
6278 master
= safe_close(master
);
6282 (void) wait_for_terminate(pid
, NULL
);
/* Remove the ephemeral resources set up earlier; failures here are only logged. */
6286 if (remove_directory
&& arg_directory
) {
6289 k
= rm_rf(arg_directory
, REMOVE_ROOT
|REMOVE_PHYSICAL
|REMOVE_SUBVOLUME
);
6291 log_warning_errno(k
, "Cannot remove '%s', ignoring: %m", arg_directory
);
6294 if (remove_image
&& arg_image
) {
6295 if (unlink(arg_image
) < 0)
6296 log_warning_errno(errno
, "Can't remove image file '%s', ignoring: %m", arg_image
);
6299 if (remove_tmprootdir
) {
6300 if (rmdir(tmprootdir
) < 0)
6301 log_debug_errno(errno
, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir
);
6304 if (arg_machine
&& arg_privileged
) {
6307 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
6308 (void) rm_rf(p
, REMOVE_ROOT
);
6310 p
= strjoina("/run/systemd/nspawn/unix-export/", arg_machine
);
6311 (void) umount2(p
, MNT_DETACH
|UMOUNT_NOFOLLOW
);
6315 expose_port_flush(&fw_ctx
, arg_expose_ports
, AF_INET
, &expose_args
.address4
);
6316 expose_port_flush(&fw_ctx
, arg_expose_ports
, AF_INET6
, &expose_args
.address6
);
6318 if (arg_privileged
) {
6320 (void) remove_veth_links(veth_name
, arg_network_veth_extra
);
6321 (void) remove_bridge(arg_network_zone
);
6324 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
6325 expose_port_free_all(arg_expose_ports
);
6326 rlimit_free_all(arg_rlimit
);
6327 device_node_array_free(arg_extra_nodes
, arg_n_extra_nodes
);
/* Generates the program's main() around run(). Going by the macro name, a positive return value of run()
 * is presumably passed on as the process exit status; confirm in main-func.h. */
6335 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run
);