1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
8 #include <linux/loop.h>
10 #include <selinux/selinux.h>
14 #include <sys/ioctl.h>
15 #include <sys/personality.h>
16 #include <sys/prctl.h>
17 #include <sys/types.h>
23 #include "sd-daemon.h"
26 #include "alloc-util.h"
28 #include "base-filesystem.h"
29 #include "blkid-util.h"
30 #include "btrfs-util.h"
32 #include "bus-error.h"
35 #include "capability-util.h"
36 #include "cgroup-util.h"
38 #include "common-signal.h"
40 #include "cpu-set-util.h"
41 #include "creds-util.h"
42 #include "dev-setup.h"
43 #include "discover-image.h"
44 #include "dissect-image.h"
50 #include "format-util.h"
53 #include "hexdecoct.h"
54 #include "hostname-setup.h"
55 #include "hostname-util.h"
56 #include "id128-util.h"
59 #include "loop-util.h"
60 #include "loopback-setup.h"
62 #include "main-func.h"
63 #include "missing_sched.h"
65 #include "mount-util.h"
66 #include "mountpoint-util.h"
67 #include "namespace-util.h"
68 #include "netlink-util.h"
69 #include "nspawn-bind-user.h"
70 #include "nspawn-cgroup.h"
71 #include "nspawn-creds.h"
72 #include "nspawn-def.h"
73 #include "nspawn-expose-ports.h"
74 #include "nspawn-mount.h"
75 #include "nspawn-network.h"
76 #include "nspawn-oci.h"
77 #include "nspawn-patch-uid.h"
78 #include "nspawn-register.h"
79 #include "nspawn-seccomp.h"
80 #include "nspawn-settings.h"
81 #include "nspawn-setuid.h"
82 #include "nspawn-stub-pid1.h"
83 #include "nspawn-util.h"
85 #include "nulstr-util.h"
88 #include "parse-argument.h"
89 #include "parse-util.h"
90 #include "pretty-print.h"
91 #include "process-util.h"
93 #include "random-util.h"
94 #include "raw-clone.h"
95 #include "resolve-util.h"
96 #include "rlimit-util.h"
99 #include "seccomp-util.h"
101 #include "selinux-util.h"
102 #include "signal-util.h"
103 #include "socket-util.h"
104 #include "stat-util.h"
105 #include "stdio-util.h"
106 #include "string-table.h"
107 #include "string-util.h"
109 #include "sysctl-util.h"
110 #include "terminal-util.h"
111 #include "tmpfile-util.h"
112 #include "umask-util.h"
113 #include "unit-name.h"
114 #include "user-util.h"
116 /* Path of the notify socket inside the container, which the container can use to talk to nspawn via the sd_notify(3) protocol */
117 #define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
118 #define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"
120 #define EXIT_FORCE_RESTART 133
/* Enumeration of container status values; CONTAINER_TERMINATED is its first enumerator. */
122 typedef enum ContainerStatus
{
123 CONTAINER_TERMINATED
,
/* File-scope option state with its built-in defaults. These variables are
 * assigned by the option/environment parsers further down in this file
 * (e.g. parse_argv() stores into arg_directory, arg_template, arg_user,
 * arg_uuid, arg_machine, arg_hostname, arg_slice, ...). */
127 static char *arg_directory
= NULL
;
128 static char *arg_template
= NULL
;
129 static char *arg_chdir
= NULL
;
130 static char *arg_pivot_root_new
= NULL
;
131 static char *arg_pivot_root_old
= NULL
;
132 static char *arg_user
= NULL
;
133 static uid_t arg_uid
= UID_INVALID
;
134 static gid_t arg_gid
= GID_INVALID
;
/* Array of supplementary GIDs and its element count. */
135 static gid_t
* arg_supplementary_gids
= NULL
;
136 static size_t arg_n_supplementary_gids
= 0;
137 static sd_id128_t arg_uuid
= {};
138 static char *arg_machine
= NULL
; /* The name used by the host to refer to this */
139 static char *arg_hostname
= NULL
; /* The name the payload sees by default */
/* The two SELinux context strings are const pointers: parse_argv() points
 * them directly at optarg rather than at a copy. */
140 static const char *arg_selinux_context
= NULL
;
141 static const char *arg_selinux_apifs_context
= NULL
;
142 static char *arg_slice
= NULL
;
143 static bool arg_private_network
= false;
144 static bool arg_read_only
= false;
145 static StartMode arg_start_mode
= START_PID1
;
146 static bool arg_ephemeral
= false;
147 static LinkJournal arg_link_journal
= LINK_AUTO
;
148 static bool arg_link_journal_try
= false;
/* Default capability bitmask: bit N is set for capability number N, built by
 * OR-ing (1ULL << CAP_*) for each capability listed below. The usage text
 * describes --capability= as retaining capabilities in addition to the
 * default and --drop-capability= as dropping from the default set. */
149 static uint64_t arg_caps_retain
=
150 (1ULL << CAP_AUDIT_CONTROL
) |
151 (1ULL << CAP_AUDIT_WRITE
) |
152 (1ULL << CAP_CHOWN
) |
153 (1ULL << CAP_DAC_OVERRIDE
) |
154 (1ULL << CAP_DAC_READ_SEARCH
) |
155 (1ULL << CAP_FOWNER
) |
156 (1ULL << CAP_FSETID
) |
157 (1ULL << CAP_IPC_OWNER
) |
159 (1ULL << CAP_LEASE
) |
160 (1ULL << CAP_LINUX_IMMUTABLE
) |
161 (1ULL << CAP_MKNOD
) |
162 (1ULL << CAP_NET_BIND_SERVICE
) |
163 (1ULL << CAP_NET_BROADCAST
) |
164 (1ULL << CAP_NET_RAW
) |
165 (1ULL << CAP_SETFCAP
) |
166 (1ULL << CAP_SETGID
) |
167 (1ULL << CAP_SETPCAP
) |
168 (1ULL << CAP_SETUID
) |
169 (1ULL << CAP_SYS_ADMIN
) |
170 (1ULL << CAP_SYS_BOOT
) |
171 (1ULL << CAP_SYS_CHROOT
) |
172 (1ULL << CAP_SYS_NICE
) |
173 (1ULL << CAP_SYS_PTRACE
) |
174 (1ULL << CAP_SYS_RESOURCE
) |
175 (1ULL << CAP_SYS_TTY_CONFIG
);
/* Remaining file-scope option state with built-in defaults. Pointer/array
 * settings start out NULL with a zero element count; booleans carry the
 * default behaviour (e.g. arg_register and arg_use_cgns default to true). */
176 static uint64_t arg_caps_ambient
= 0;
177 static CapabilityQuintet arg_full_capabilities
= CAPABILITY_QUINTET_NULL
;
/* Array of custom mounts and its element count. */
178 static CustomMount
*arg_custom_mounts
= NULL
;
179 static size_t arg_n_custom_mounts
= 0;
180 static char **arg_setenv
= NULL
;
181 static bool arg_quiet
= false;
182 static bool arg_register
= true;
183 static bool arg_keep_unit
= false;
184 static char **arg_network_interfaces
= NULL
;
185 static char **arg_network_macvlan
= NULL
;
186 static char **arg_network_ipvlan
= NULL
;
187 static bool arg_network_veth
= false;
188 static char **arg_network_veth_extra
= NULL
;
189 static char *arg_network_bridge
= NULL
;
190 static char *arg_network_zone
= NULL
;
191 static char *arg_network_namespace_path
= NULL
;
192 static PagerFlags arg_pager_flags
= 0;
193 static unsigned long arg_personality
= PERSONALITY_INVALID
;
194 static char *arg_image
= NULL
;
195 static char *arg_oci_bundle
= NULL
;
196 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
197 static ExposePort
*arg_expose_ports
= NULL
;
198 static char **arg_property
= NULL
;
199 static sd_bus_message
*arg_property_message
= NULL
;
200 static UserNamespaceMode arg_userns_mode
= USER_NAMESPACE_NO
;
/* UID shift is unset by default; the default range is 0x10000 (65536) IDs. */
201 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
202 static UserNamespaceOwnership arg_userns_ownership
= _USER_NAMESPACE_OWNERSHIP_INVALID
;
203 static int arg_kill_signal
= 0;
204 static CGroupUnified arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_UNKNOWN
;
/* Bitmask of SETTING_* flags; the parsers OR a flag in whenever they set the
 * corresponding option explicitly. */
205 static SettingsMask arg_settings_mask
= 0;
/* NOTE(review): the -1 default presumably means "not decided yet" — confirm. */
206 static int arg_settings_trusted
= -1;
207 static char **arg_parameters
= NULL
;
/* May be overridden from $SYSTEMD_NSPAWN_CONTAINER_SERVICE in parse_environment(). */
208 static const char *arg_container_service_name
= "systemd-nspawn";
209 static bool arg_notify_ready
= false;
210 static bool arg_use_cgns
= true;
/* By default the IPC, PID and UTS namespace flags are set. */
211 static unsigned long arg_clone_ns_flags
= CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
;
212 static MountSettingsMask arg_mount_settings
= MOUNT_APPLY_APIVFS_RO
|MOUNT_APPLY_TMPFS_TMP
;
213 static VeritySettings arg_verity_settings
= VERITY_SETTINGS_DEFAULT
;
214 static char **arg_syscall_allow_list
= NULL
;
215 static char **arg_syscall_deny_list
= NULL
;
217 static scmp_filter_ctx arg_seccomp
= NULL
;
/* One optional rlimit pointer per resource index, all NULL initially. */
219 static struct rlimit
*arg_rlimit
[_RLIMIT_MAX
] = {};
220 static bool arg_no_new_privileges
= false;
221 static int arg_oom_score_adjust
= 0;
222 static bool arg_oom_score_adjust_set
= false;
223 static CPUSet arg_cpu_set
= {};
224 static ResolvConfMode arg_resolv_conf
= RESOLV_CONF_AUTO
;
225 static TimezoneMode arg_timezone
= TIMEZONE_AUTO
;
226 static unsigned arg_console_width
= UINT_MAX
, arg_console_height
= UINT_MAX
;
227 static DeviceNode
* arg_extra_nodes
= NULL
;
228 static size_t arg_n_extra_nodes
= 0;
229 static char **arg_sysctl
= NULL
;
/* Starts out invalid; handle_arg_console() assigns a concrete mode. */
230 static ConsoleMode arg_console_mode
= _CONSOLE_MODE_INVALID
;
231 static Credential
*arg_credentials
= NULL
;
232 static size_t arg_n_credentials
= 0;
233 static char **arg_bind_user
= NULL
;
234 static bool arg_suppress_sync
= false;
235 static char *arg_settings_filename
= NULL
;
236 static Architecture arg_architecture
= _ARCHITECTURE_INVALID
;
/* Pairs each owning global with the function used to release it: freep for
 * single allocations, strv_freep for string vectors, and type-specific
 * helpers (sd_bus_message_unrefp, verity_settings_done, seccomp_releasep,
 * cpu_set_reset) for the rest.
 * NOTE(review): when the registered destructors actually run is defined by
 * the STATIC_DESTRUCTOR_REGISTER macro, which is not visible here. */
238 STATIC_DESTRUCTOR_REGISTER(arg_directory
, freep
);
239 STATIC_DESTRUCTOR_REGISTER(arg_template
, freep
);
240 STATIC_DESTRUCTOR_REGISTER(arg_chdir
, freep
);
241 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new
, freep
);
242 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old
, freep
);
243 STATIC_DESTRUCTOR_REGISTER(arg_user
, freep
);
244 STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids
, freep
);
245 STATIC_DESTRUCTOR_REGISTER(arg_machine
, freep
);
246 STATIC_DESTRUCTOR_REGISTER(arg_hostname
, freep
);
247 STATIC_DESTRUCTOR_REGISTER(arg_slice
, freep
);
248 STATIC_DESTRUCTOR_REGISTER(arg_setenv
, strv_freep
);
249 STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces
, strv_freep
);
250 STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan
, strv_freep
);
251 STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan
, strv_freep
);
252 STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra
, strv_freep
);
253 STATIC_DESTRUCTOR_REGISTER(arg_network_bridge
, freep
);
254 STATIC_DESTRUCTOR_REGISTER(arg_network_zone
, freep
);
255 STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path
, freep
);
256 STATIC_DESTRUCTOR_REGISTER(arg_image
, freep
);
257 STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle
, freep
);
258 STATIC_DESTRUCTOR_REGISTER(arg_property
, strv_freep
);
259 STATIC_DESTRUCTOR_REGISTER(arg_property_message
, sd_bus_message_unrefp
);
260 STATIC_DESTRUCTOR_REGISTER(arg_parameters
, strv_freep
);
261 STATIC_DESTRUCTOR_REGISTER(arg_verity_settings
, verity_settings_done
);
262 STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list
, strv_freep
);
263 STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list
, strv_freep
);
265 STATIC_DESTRUCTOR_REGISTER(arg_seccomp
, seccomp_releasep
);
267 STATIC_DESTRUCTOR_REGISTER(arg_cpu_set
, cpu_set_reset
);
268 STATIC_DESTRUCTOR_REGISTER(arg_sysctl
, strv_freep
);
269 STATIC_DESTRUCTOR_REGISTER(arg_bind_user
, strv_freep
);
270 STATIC_DESTRUCTOR_REGISTER(arg_settings_filename
, freep
);
/* Parses a console mode name and stores the result in arg_console_mode.
 *
 * arg: mode name. The values handled here are "help", "interactive",
 *      "read-only", "passive", "pipe" and "autopipe".
 *
 * For "pipe", a message is logged (LOG_DEBUG if arg_quiet is set, LOG_NOTICE
 * otherwise) when both stdin and stdout are TTYs, and CONSOLE_PIPE is
 * selected anyway. For "autopipe", CONSOLE_INTERACTIVE is selected when both
 * stdin and stdout are TTYs; the CONSOLE_PIPE assignment that follows is
 * presumably the else branch. Any other value is reported as an EINVAL error
 * via log_error_errno(). On success SETTING_CONSOLE_MODE is OR-ed into
 * arg_settings_mask.
 *
 * The error message reports the function's own argument instead of the global
 * optarg, so the diagnostic is correct no matter where the string came from. */
272 static int handle_arg_console(const char *arg
) {
273 if (streq(arg
, "help")) {
282 if (streq(arg
, "interactive"))
283 arg_console_mode
= CONSOLE_INTERACTIVE
;
284 else if (streq(arg
, "read-only"))
285 arg_console_mode
= CONSOLE_READ_ONLY
;
286 else if (streq(arg
, "passive"))
287 arg_console_mode
= CONSOLE_PASSIVE
;
288 else if (streq(arg
, "pipe")) {
289 if (isatty(STDIN_FILENO
) > 0 && isatty(STDOUT_FILENO
) > 0)
290 log_full(arg_quiet
? LOG_DEBUG
: LOG_NOTICE
,
291 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
292 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
293 "Proceeding anyway.");
295 arg_console_mode
= CONSOLE_PIPE
;
296 } else if (streq(arg
, "autopipe")) {
297 if (isatty(STDIN_FILENO
) > 0 && isatty(STDOUT_FILENO
) > 0)
298 arg_console_mode
= CONSOLE_INTERACTIVE
;
300 arg_console_mode
= CONSOLE_PIPE
;
302 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Unknown console mode: %s", arg
);
304 arg_settings_mask
|= SETTING_CONSOLE_MODE
;
/* Prints the usage text for the program.
 *
 * Opens the pager according to arg_pager_flags, asks terminal_urlify_man()
 * for a link to the systemd-nspawn(1) manual page, then prints the option
 * summary. The format string uses positional arguments: %1$s is the program
 * name and %2$s is placed in the "See the ... for details" line; %3$s..%6$s
 * bracket the section headings and the tagline.
 *
 * Text fixes relative to the previous version: removed the duplicated "an"
 * in the --network-zone= description and closed the unbalanced bracket in
 * the --bind-ro= synopsis so it matches --bind=. */
308 static int help(void) {
309 _cleanup_free_
char *link
= NULL
;
312 pager_open(arg_pager_flags
);
314 r
= terminal_urlify_man("systemd-nspawn", "1", &link
);
318 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
319 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
320 " -h --help Show this help\n"
321 " --version Print version string\n"
322 " -q --quiet Do not show status information\n"
323 " --no-pager Do not pipe output into a pager\n"
324 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
326 " -D --directory=PATH Root directory for the container\n"
327 " --template=PATH Initialize root directory from template directory,\n"
329 " -x --ephemeral Run container with snapshot of root directory, and\n"
330 " remove it after exit\n"
331 " -i --image=PATH Root file system disk image (or device node) for\n"
333 " --oci-bundle=PATH OCI bundle directory\n"
334 " --read-only Mount the root directory read-only\n"
335 " --volatile[=MODE] Run the system in volatile mode\n"
336 " --root-hash=HASH Specify verity root hash for root disk image\n"
337 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
338 " as a DER encoded PKCS7, either as a path to a file\n"
339 " or as an ASCII base64 encoded string prefixed by\n"
341 " --verity-data=PATH Specify hash device for verity\n"
342 " --pivot-root=PATH[:PATH]\n"
343 " Pivot root to given directory in the container\n\n"
344 "%3$sExecution:%4$s\n"
345 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
346 " -b --boot Boot up full system (i.e. invoke init)\n"
347 " --chdir=PATH Set working directory in the container\n"
348 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
349 " -u --user=USER Run the command under specified user or UID\n"
350 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
351 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
352 " --suppress-sync=BOOLEAN\n"
353 " Suppress any form of disk data synchronization\n\n"
354 "%3$sSystem Identity:%4$s\n"
355 " -M --machine=NAME Set the machine name for the container\n"
356 " --hostname=NAME Override the hostname for the container\n"
357 " --uuid=UUID Set a specific machine UUID for the container\n\n"
358 "%3$sProperties:%4$s\n"
359 " -S --slice=SLICE Place the container in the specified slice\n"
360 " --property=NAME=VALUE Set scope unit property\n"
361 " --register=BOOLEAN Register container as machine\n"
362 " --keep-unit Do not register a scope for the machine, reuse\n"
363 " the service unit nspawn is running in\n\n"
364 "%3$sUser Namespacing:%4$s\n"
365 " --private-users=no Run without user namespacing\n"
366 " --private-users=yes|pick|identity\n"
367 " Run within user namespace, autoselect UID/GID range\n"
368 " --private-users=UIDBASE[:NUIDS]\n"
369 " Similar, but with user configured UID/GID range\n"
370 " --private-users-ownership=MODE\n"
371 " Adjust ('chown') or map ('map') OS tree ownership\n"
372 " to private UID/GID range\n"
373 " -U Equivalent to --private-users=pick and\n"
374 " --private-users-ownership=auto\n\n"
375 "%3$sNetworking:%4$s\n"
376 " --private-network Disable network in container\n"
377 " --network-interface=INTERFACE\n"
378 " Assign an existing network interface to the\n"
380 " --network-macvlan=INTERFACE\n"
381 " Create a macvlan network interface based on an\n"
382 " existing network interface to the container\n"
383 " --network-ipvlan=INTERFACE\n"
384 " Create an ipvlan network interface based on an\n"
385 " existing network interface to the container\n"
386 " -n --network-veth Add a virtual Ethernet connection between host\n"
388 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
389 " Add an additional virtual Ethernet link between\n"
390 " host and container\n"
391 " --network-bridge=INTERFACE\n"
392 " Add a virtual Ethernet connection to the container\n"
393 " and attach it to an existing bridge on the host\n"
394 " --network-zone=NAME Similar, but attach the new interface to an\n"
395 " automatically managed bridge interface\n"
396 " --network-namespace-path=PATH\n"
397 " Set network namespace to the one represented by\n"
398 " the specified kernel namespace file node\n"
399 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
400 " Expose a container IP port on the host\n\n"
401 "%3$sSecurity:%4$s\n"
402 " --capability=CAP In addition to the default, retain specified\n"
404 " --drop-capability=CAP Drop the specified capability from the default set\n"
405 " --ambient-capability=CAP\n"
406 " Sets the specified capability for the started\n"
407 " process. Not useful if booting a machine.\n"
408 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
409 " --system-call-filter=LIST|~LIST\n"
410 " Permit/prohibit specific system calls\n"
411 " -Z --selinux-context=SECLABEL\n"
412 " Set the SELinux security context to be used by\n"
413 " processes in the container\n"
414 " -L --selinux-apifs-context=SECLABEL\n"
415 " Set the SELinux security context to be used by\n"
416 " API/tmpfs file systems in the container\n\n"
417 "%3$sResources:%4$s\n"
418 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
419 " --oom-score-adjust=VALUE\n"
420 " Adjust the OOM score value for the payload\n"
421 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
422 " --personality=ARCH Pick personality for this container\n\n"
423 "%3$sIntegration:%4$s\n"
424 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
425 " --timezone=MODE Select mode of /etc/localtime initialization\n"
426 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
427 " host, try-guest, try-host\n"
428 " -j Equivalent to --link-journal=try-guest\n\n"
430 " --bind=PATH[:PATH[:OPTIONS]]\n"
431 " Bind mount a file or directory from the host into\n"
433 " --bind-ro=PATH[:PATH[:OPTIONS]]\n"
434 " Similar, but creates a read-only bind mount\n"
435 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
437 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
438 " --overlay=PATH[:PATH...]:PATH\n"
439 " Create an overlay mount from the host to \n"
441 " --overlay-ro=PATH[:PATH...]:PATH\n"
442 " Similar, but creates a read-only overlay mount\n"
443 " --bind-user=NAME Bind user from host to container\n\n"
444 "%3$sInput/Output:%4$s\n"
445 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
446 " set up for the container.\n"
447 " -P --pipe Equivalent to --console=pipe\n\n"
448 "%3$sCredentials:%4$s\n"
449 " --set-credential=ID:VALUE\n"
450 " Pass a credential with literal value to container.\n"
451 " --load-credential=ID:PATH\n"
452 " Load credential to pass to container from file or\n"
453 " AF_UNIX stream socket.\n"
454 "\nSee the %2$s for details.\n",
455 program_invocation_short_name
,
/* Validates the configured custom mounts against the user-namespace settings.
 *
 * Walks arg_custom_mounts[0 .. arg_n_custom_mounts). For a mount whose
 * destination is "/" while arg_userns_mode is not USER_NAMESPACE_NO, two
 * combinations are rejected with an EINVAL error logged via log_error_errno():
 *   - arg_userns_ownership is anything other than USER_NAMESPACE_OWNERSHIP_OFF
 *   - arg_uid_shift is still UID_INVALID (no explicit UID shift chosen)
 *
 * NOTE(review): the first message names only "--private-users-ownership=own",
 * while the condition rejects every ownership mode except "off" — confirm the
 * wording is intended. */
465 static int custom_mount_check_all(void) {
468 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
469 CustomMount
*m
= &arg_custom_mounts
[i
];
471 if (path_equal(m
->destination
, "/") && arg_userns_mode
!= USER_NAMESPACE_NO
) {
472 if (arg_userns_ownership
!= USER_NAMESPACE_OWNERSHIP_OFF
)
473 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
474 "--private-users-ownership=own may not be combined with custom root mounts.");
475 if (arg_uid_shift
== UID_INVALID
)
476 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
477 "--private-users with automatic UID shift may not be combined with custom root mounts.");
/* Lets an environment variable choose the cgroup hierarchy for the container.
 *
 * The variable name starts out as "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY" and is
 * switched to the older name "UNIFIED_CGROUP_HIERARCHY" further down
 * (presumably as a fallback when the new name is unset — the lookup itself is
 * not visible here). The value is parsed with parse_boolean(); a parse
 * failure is logged with the name of the variable that was read.
 * arg_unified_cgroup_hierarchy is then set to CGROUP_UNIFIED_ALL or
 * CGROUP_UNIFIED_NONE (presumably for true and false respectively). */
484 static int detect_unified_cgroup_hierarchy_from_environment(void) {
485 const char *e
, *var
= "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
488 /* Allow the user to control whether the unified hierarchy is used */
492 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
493 var
= "UNIFIED_CGROUP_HIERARCHY";
498 r
= parse_boolean(e
);
500 return log_error_errno(r
, "Failed to parse $%s: %m", var
);
502 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_ALL
;
504 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
/* Chooses the cgroup hierarchy mode for the container from the host's mode
 * and the systemd version found in the image.
 *
 * directory: root of the container tree, passed to
 *            systemd_installation_has_version().
 *
 * Uses cg_all_unified() and cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER)
 * to look at the host, and version checks against "230" (unified) and "233"
 * (mixed/hybrid) to decide whether the image supports the host's mode. The
 * result is stored in arg_unified_cgroup_hierarchy as CGROUP_UNIFIED_ALL,
 * CGROUP_UNIFIED_SYSTEMD or CGROUP_UNIFIED_NONE and logged at debug level as
 * "unified", "hybrid" or "legacy".
 *
 * The cg_all_unified() failure message now carries %m, like the other two
 * error messages in this function, so the underlying error is not lost. */
510 static int detect_unified_cgroup_hierarchy_from_image(const char *directory
) {
513 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
514 * in the image actually supports. */
515 r
= cg_all_unified();
517 return log_error_errno(r
, "Failed to determine whether we are in all unified mode: %m");
519 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
520 * routine only detects 231, so we'll have a false negative here for 230. */
521 r
= systemd_installation_has_version(directory
, "230");
523 return log_error_errno(r
, "Failed to determine systemd version in container: %m");
525 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_ALL
;
527 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
528 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
) > 0) {
529 /* Mixed cgroup hierarchy support was added in 233 */
530 r
= systemd_installation_has_version(directory
, "233");
532 return log_error_errno(r
, "Failed to determine systemd version in container: %m");
534 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_SYSTEMD
;
536 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
538 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
540 log_debug("Using %s hierarchy for container.",
541 arg_unified_cgroup_hierarchy
== CGROUP_UNIFIED_NONE
? "legacy" :
542 arg_unified_cgroup_hierarchy
== CGROUP_UNIFIED_SYSTEMD
? "hybrid" : "unified");
/* Parses a comma-separated capability specification.
 *
 * spec:     input string; consumed word by word via extract_first_word()
 *           with "," as the separator.
 * ret_mask: output parameter for the resulting capability mask.
 *
 * The word "help" triggers a listing loop over capability numbers
 * 0 .. capability_list_length()-1 using capability_to_name(). Any other word
 * is converted with capability_from_name(); a failure there is logged
 * together with the offending word. The visible return value 1 is annotated
 * "continue" by the original author.
 *
 * NOTE(review): in the extract_first_word() error path the message formats
 * `t` with %s; if the helper leaves `t` unset on failure this passes NULL to
 * %s — confirm against extract_first_word()'s contract. */
547 static int parse_capability_spec(const char *spec
, uint64_t *ret_mask
) {
552 _cleanup_free_
char *t
= NULL
;
554 r
= extract_first_word(&spec
, &t
, ",", 0);
556 return log_error_errno(r
, "Failed to parse capability %s.", t
);
560 if (streq(t
, "help")) {
561 for (int i
= 0; i
< capability_list_length(); i
++) {
564 name
= capability_to_name(i
);
575 r
= capability_from_name(t
);
577 return log_error_errno(r
, "Failed to parse capability %s.", t
);
584 return 1; /* continue */
/* Applies one boolean "share namespace" environment variable to
 * arg_clone_ns_flags.
 *
 * name:    environment variable to read with getenv_bool().
 * ns_flag: CLONE_NEW* bit(s) controlled by that variable.
 *
 * A parse failure is logged with the variable name. Otherwise the bits in
 * ns_flag are first cleared from arg_clone_ns_flags and then set again unless
 * the variable is true (r > 0): "share" = true means the namespace flag is
 * dropped. SETTING_CLONE_NS_FLAGS is recorded in arg_settings_mask. */
587 static int parse_share_ns_env(const char *name
, unsigned long ns_flag
) {
590 r
= getenv_bool(name
);
594 return log_error_errno(r
, "Failed to parse $%s: %m", name
);
596 arg_clone_ns_flags
= (arg_clone_ns_flags
& ~ns_flag
) | (r
> 0 ? 0 : ns_flag
);
597 arg_settings_mask
|= SETTING_CLONE_NS_FLAGS
;
/* Adjusts arg_mount_settings from two environment variables.
 *
 * $SYSTEMD_NSPAWN_TMPFS_TMP is read with getenv_bool(); any error other than
 * -ENXIO is logged and returned. MOUNT_APPLY_TMPFS_TMP is then updated with
 * SET_FLAG(..., r > 0).
 *
 * $SYSTEMD_NSPAWN_API_VFS_WRITABLE: the literal value "network" turns on both
 * MOUNT_APPLY_APIVFS_RO and MOUNT_APPLY_APIVFS_NETNS. Otherwise the value is
 * parsed as a boolean (a parse failure is logged and returned), and
 * MOUNT_APPLY_APIVFS_RO is set exactly when the value is false (r == 0) while
 * MOUNT_APPLY_APIVFS_NETNS is cleared. */
601 static int parse_mount_settings_env(void) {
605 r
= getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
606 if (r
< 0 && r
!= -ENXIO
)
607 return log_error_errno(r
, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
609 SET_FLAG(arg_mount_settings
, MOUNT_APPLY_TMPFS_TMP
, r
> 0);
611 e
= getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
612 if (streq_ptr(e
, "network"))
613 arg_mount_settings
|= MOUNT_APPLY_APIVFS_RO
|MOUNT_APPLY_APIVFS_NETNS
;
616 r
= parse_boolean(e
);
618 return log_error_errno(r
, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
620 SET_FLAG(arg_mount_settings
, MOUNT_APPLY_APIVFS_RO
, r
== 0);
621 SET_FLAG(arg_mount_settings
, MOUNT_APPLY_APIVFS_NETNS
, false);
/* Applies all environment-variable overrides to the option globals.
 *
 * In order:
 *   - $SYSTEMD_NSPAWN_SHARE_NS_IPC / _PID / _UTS and $SYSTEMD_NSPAWN_SHARE_SYSTEM
 *     (the latter covering all three CLONE_NEW* flags) via parse_share_ns_env()
 *   - mount settings via parse_mount_settings_env()
 *   - $SYSTEMD_NSPAWN_USE_CGNS, after first forcing arg_use_cgns to false when
 *     cg_ns_supported() reports no support
 *   - $SYSTEMD_NSPAWN_CONTAINER_SERVICE, stored in arg_container_service_name
 *     (the pointer returned by getenv() is kept as is, not copied)
 *   - $SYSTEMD_SUPPRESS_SYNC; a parse error other than -ENXIO is only logged
 *     at debug level and ignored
 * Finally returns the result of
 * detect_unified_cgroup_hierarchy_from_environment(). */
627 static int parse_environment(void) {
631 r
= parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC
);
634 r
= parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID
);
637 r
= parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS
);
640 r
= parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
);
644 r
= parse_mount_settings_env();
648 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
649 * even if it is supported. If not supported, it has no effect. */
650 if (!cg_ns_supported())
651 arg_use_cgns
= false;
653 r
= getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
656 return log_error_errno(r
, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
660 arg_use_cgns
= r
> 0;
661 arg_settings_mask
|= SETTING_USE_CGNS
;
665 e
= getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
667 arg_container_service_name
= e
;
669 r
= getenv_bool("SYSTEMD_SUPPRESS_SYNC");
671 arg_suppress_sync
= r
;
672 else if (r
!= -ENXIO
)
673 log_debug_errno(r
, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
675 return detect_unified_cgroup_hierarchy_from_environment();
678 static int parse_argv(int argc
, char *argv
[]) {
685 ARG_AMBIENT_CAPABILITY
,
697 ARG_NETWORK_INTERFACE
,
702 ARG_NETWORK_VETH_EXTRA
,
703 ARG_NETWORK_NAMESPACE_PATH
,
713 ARG_PRIVATE_USERS_CHOWN
,
714 ARG_PRIVATE_USERS_OWNERSHIP
,
719 ARG_SYSTEM_CALL_FILTER
,
722 ARG_NO_NEW_PRIVILEGES
,
723 ARG_OOM_SCORE_ADJUST
,
737 static const struct option options
[] = {
738 { "help", no_argument
, NULL
, 'h' },
739 { "version", no_argument
, NULL
, ARG_VERSION
},
740 { "directory", required_argument
, NULL
, 'D' },
741 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
742 { "ephemeral", no_argument
, NULL
, 'x' },
743 { "user", required_argument
, NULL
, 'u' },
744 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
745 { "as-pid2", no_argument
, NULL
, 'a' },
746 { "boot", no_argument
, NULL
, 'b' },
747 { "uuid", required_argument
, NULL
, ARG_UUID
},
748 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
749 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
750 { "ambient-capability", required_argument
, NULL
, ARG_AMBIENT_CAPABILITY
},
751 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
752 { "no-new-privileges", required_argument
, NULL
, ARG_NO_NEW_PRIVILEGES
},
753 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
754 { "bind", required_argument
, NULL
, ARG_BIND
},
755 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
756 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
757 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
758 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
759 { "inaccessible", required_argument
, NULL
, ARG_INACCESSIBLE
},
760 { "machine", required_argument
, NULL
, 'M' },
761 { "hostname", required_argument
, NULL
, ARG_HOSTNAME
},
762 { "slice", required_argument
, NULL
, 'S' },
763 { "setenv", required_argument
, NULL
, 'E' },
764 { "selinux-context", required_argument
, NULL
, 'Z' },
765 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
766 { "quiet", no_argument
, NULL
, 'q' },
767 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
}, /* not documented */
768 { "register", required_argument
, NULL
, ARG_REGISTER
},
769 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
770 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
771 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
772 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
773 { "network-veth", no_argument
, NULL
, 'n' },
774 { "network-veth-extra", required_argument
, NULL
, ARG_NETWORK_VETH_EXTRA
},
775 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
776 { "network-zone", required_argument
, NULL
, ARG_NETWORK_ZONE
},
777 { "network-namespace-path", required_argument
, NULL
, ARG_NETWORK_NAMESPACE_PATH
},
778 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
779 { "image", required_argument
, NULL
, 'i' },
780 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
781 { "port", required_argument
, NULL
, 'p' },
782 { "property", required_argument
, NULL
, ARG_PROPERTY
},
783 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
784 { "private-users-chown", optional_argument
, NULL
, ARG_PRIVATE_USERS_CHOWN
}, /* obsolete */
785 { "private-users-ownership",required_argument
, NULL
, ARG_PRIVATE_USERS_OWNERSHIP
},
786 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
787 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
788 { "chdir", required_argument
, NULL
, ARG_CHDIR
},
789 { "pivot-root", required_argument
, NULL
, ARG_PIVOT_ROOT
},
790 { "notify-ready", required_argument
, NULL
, ARG_NOTIFY_READY
},
791 { "root-hash", required_argument
, NULL
, ARG_ROOT_HASH
},
792 { "root-hash-sig", required_argument
, NULL
, ARG_ROOT_HASH_SIG
},
793 { "verity-data", required_argument
, NULL
, ARG_VERITY_DATA
},
794 { "system-call-filter", required_argument
, NULL
, ARG_SYSTEM_CALL_FILTER
},
795 { "rlimit", required_argument
, NULL
, ARG_RLIMIT
},
796 { "oom-score-adjust", required_argument
, NULL
, ARG_OOM_SCORE_ADJUST
},
797 { "cpu-affinity", required_argument
, NULL
, ARG_CPU_AFFINITY
},
798 { "resolv-conf", required_argument
, NULL
, ARG_RESOLV_CONF
},
799 { "timezone", required_argument
, NULL
, ARG_TIMEZONE
},
800 { "console", required_argument
, NULL
, ARG_CONSOLE
},
801 { "pipe", no_argument
, NULL
, ARG_PIPE
},
802 { "oci-bundle", required_argument
, NULL
, ARG_OCI_BUNDLE
},
803 { "no-pager", no_argument
, NULL
, ARG_NO_PAGER
},
804 { "set-credential", required_argument
, NULL
, ARG_SET_CREDENTIAL
},
805 { "load-credential", required_argument
, NULL
, ARG_LOAD_CREDENTIAL
},
806 { "bind-user", required_argument
, NULL
, ARG_BIND_USER
},
807 { "suppress-sync", required_argument
, NULL
, ARG_SUPPRESS_SYNC
},
812 uint64_t plus
= 0, minus
= 0;
813 bool mask_all_settings
= false, mask_no_settings
= false;
818 while ((c
= getopt_long(argc
, argv
, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options
, NULL
)) >= 0)
828 r
= parse_path_argument(optarg
, false, &arg_directory
);
832 arg_settings_mask
|= SETTING_DIRECTORY
;
836 r
= parse_path_argument(optarg
, false, &arg_template
);
840 arg_settings_mask
|= SETTING_DIRECTORY
;
844 r
= parse_path_argument(optarg
, false, &arg_image
);
848 arg_settings_mask
|= SETTING_DIRECTORY
;
852 r
= parse_path_argument(optarg
, false, &arg_oci_bundle
);
859 arg_ephemeral
= true;
860 arg_settings_mask
|= SETTING_EPHEMERAL
;
864 r
= free_and_strdup(&arg_user
, optarg
);
868 arg_settings_mask
|= SETTING_USER
;
871 case ARG_NETWORK_ZONE
: {
874 j
= strjoin("vz-", optarg
);
878 if (!ifname_valid(j
)) {
879 log_error("Network zone name not valid: %s", j
);
884 free_and_replace(arg_network_zone
, j
);
886 arg_network_veth
= true;
887 arg_private_network
= true;
888 arg_settings_mask
|= SETTING_NETWORK
;
892 case ARG_NETWORK_BRIDGE
:
894 if (!ifname_valid(optarg
))
895 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
896 "Bridge interface name not valid: %s", optarg
);
898 r
= free_and_strdup(&arg_network_bridge
, optarg
);
904 arg_network_veth
= true;
905 arg_private_network
= true;
906 arg_settings_mask
|= SETTING_NETWORK
;
909 case ARG_NETWORK_VETH_EXTRA
:
910 r
= veth_extra_parse(&arg_network_veth_extra
, optarg
);
912 return log_error_errno(r
, "Failed to parse --network-veth-extra= parameter: %s", optarg
);
914 arg_private_network
= true;
915 arg_settings_mask
|= SETTING_NETWORK
;
918 case ARG_NETWORK_INTERFACE
:
919 if (!ifname_valid(optarg
))
920 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
921 "Network interface name not valid: %s", optarg
);
923 r
= test_network_interface_initialized(optarg
);
927 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
930 arg_private_network
= true;
931 arg_settings_mask
|= SETTING_NETWORK
;
934 case ARG_NETWORK_MACVLAN
:
936 if (!ifname_valid(optarg
))
937 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
938 "MACVLAN network interface name not valid: %s", optarg
);
940 r
= test_network_interface_initialized(optarg
);
944 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
947 arg_private_network
= true;
948 arg_settings_mask
|= SETTING_NETWORK
;
951 case ARG_NETWORK_IPVLAN
:
953 if (!ifname_valid(optarg
))
954 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
955 "IPVLAN network interface name not valid: %s", optarg
);
957 r
= test_network_interface_initialized(optarg
);
961 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
965 case ARG_PRIVATE_NETWORK
:
966 arg_private_network
= true;
967 arg_settings_mask
|= SETTING_NETWORK
;
970 case ARG_NETWORK_NAMESPACE_PATH
:
971 r
= parse_path_argument(optarg
, false, &arg_network_namespace_path
);
975 arg_settings_mask
|= SETTING_NETWORK
;
979 if (arg_start_mode
== START_PID2
)
980 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
981 "--boot and --as-pid2 may not be combined.");
983 arg_start_mode
= START_BOOT
;
984 arg_settings_mask
|= SETTING_START_MODE
;
988 if (arg_start_mode
== START_BOOT
)
989 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
990 "--boot and --as-pid2 may not be combined.");
992 arg_start_mode
= START_PID2
;
993 arg_settings_mask
|= SETTING_START_MODE
;
997 r
= sd_id128_from_string(optarg
, &arg_uuid
);
999 return log_error_errno(r
, "Invalid UUID: %s", optarg
);
1001 if (sd_id128_is_null(arg_uuid
))
1002 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1003 "Machine UUID may not be all zeroes.");
1005 arg_settings_mask
|= SETTING_MACHINE_ID
;
1009 _cleanup_free_
char *mangled
= NULL
;
1011 r
= unit_name_mangle_with_suffix(optarg
, NULL
, UNIT_NAME_MANGLE_WARN
, ".slice", &mangled
);
1015 free_and_replace(arg_slice
, mangled
);
1016 arg_settings_mask
|= SETTING_SLICE
;
1021 if (isempty(optarg
))
1022 arg_machine
= mfree(arg_machine
);
1024 if (!hostname_is_valid(optarg
, 0))
1025 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1026 "Invalid machine name: %s", optarg
);
1028 r
= free_and_strdup(&arg_machine
, optarg
);
1035 if (isempty(optarg
))
1036 arg_hostname
= mfree(arg_hostname
);
1038 if (!hostname_is_valid(optarg
, 0))
1039 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1040 "Invalid hostname: %s", optarg
);
1042 r
= free_and_strdup(&arg_hostname
, optarg
);
1047 arg_settings_mask
|= SETTING_HOSTNAME
;
1051 arg_selinux_context
= optarg
;
1055 arg_selinux_apifs_context
= optarg
;
1059 arg_read_only
= true;
1060 arg_settings_mask
|= SETTING_READ_ONLY
;
1063 case ARG_AMBIENT_CAPABILITY
: {
1065 r
= parse_capability_spec(optarg
, &m
);
1068 arg_caps_ambient
|= m
;
1069 arg_settings_mask
|= SETTING_CAPABILITY
;
1072 case ARG_CAPABILITY
:
1073 case ARG_DROP_CAPABILITY
: {
1075 r
= parse_capability_spec(optarg
, &m
);
1079 if (c
== ARG_CAPABILITY
)
1083 arg_settings_mask
|= SETTING_CAPABILITY
;
1086 case ARG_NO_NEW_PRIVILEGES
:
1087 r
= parse_boolean(optarg
);
1089 return log_error_errno(r
, "Failed to parse --no-new-privileges= argument: %s", optarg
);
1091 arg_no_new_privileges
= r
;
1092 arg_settings_mask
|= SETTING_NO_NEW_PRIVILEGES
;
1096 arg_link_journal
= LINK_GUEST
;
1097 arg_link_journal_try
= true;
1098 arg_settings_mask
|= SETTING_LINK_JOURNAL
;
1101 case ARG_LINK_JOURNAL
:
1102 r
= parse_link_journal(optarg
, &arg_link_journal
, &arg_link_journal_try
);
1104 return log_error_errno(r
, "Failed to parse link journal mode %s", optarg
);
1106 arg_settings_mask
|= SETTING_LINK_JOURNAL
;
1111 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
1113 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
1115 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
1119 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
1121 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
1123 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
1127 case ARG_OVERLAY_RO
:
1128 r
= overlay_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_OVERLAY_RO
);
1129 if (r
== -EADDRNOTAVAIL
)
1130 return log_error_errno(r
, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1132 return log_error_errno(r
, "Failed to parse --overlay(-ro)= argument %s: %m", optarg
);
1134 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
1137 case ARG_INACCESSIBLE
:
1138 r
= inaccessible_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
1140 return log_error_errno(r
, "Failed to parse --inaccessible= argument %s: %m", optarg
);
1142 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
1146 r
= strv_env_replace_strdup_passthrough(&arg_setenv
, optarg
);
1148 return log_error_errno(r
, "Cannot assign environment variable %s: %m", optarg
);
1150 arg_settings_mask
|= SETTING_ENVIRONMENT
;
1157 case ARG_SHARE_SYSTEM
:
1158 /* We don't officially support this anymore, except for compat reasons. People should use the
1159 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1160 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1161 arg_clone_ns_flags
= 0;
1165 r
= parse_boolean(optarg
);
1167 log_error("Failed to parse --register= argument: %s", optarg
);
1175 arg_keep_unit
= true;
1178 case ARG_PERSONALITY
:
1180 arg_personality
= personality_from_string(optarg
);
1181 if (arg_personality
== PERSONALITY_INVALID
)
1182 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1183 "Unknown or unsupported personality '%s'.", optarg
);
1185 arg_settings_mask
|= SETTING_PERSONALITY
;
1191 arg_volatile_mode
= VOLATILE_YES
;
1192 else if (streq(optarg
, "help")) {
1193 DUMP_STRING_TABLE(volatile_mode
, VolatileMode
, _VOLATILE_MODE_MAX
);
1198 m
= volatile_mode_from_string(optarg
);
1200 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1201 "Failed to parse --volatile= argument: %s", optarg
);
1203 arg_volatile_mode
= m
;
1206 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
1210 r
= expose_port_parse(&arg_expose_ports
, optarg
);
1212 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
1214 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
1216 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
1220 if (strv_extend(&arg_property
, optarg
) < 0)
1225 case ARG_PRIVATE_USERS
: {
1230 else if (!in_charset(optarg
, DIGITS
))
1231 /* do *not* parse numbers as booleans */
1232 boolean
= parse_boolean(optarg
);
1237 /* no: User namespacing off */
1238 arg_userns_mode
= USER_NAMESPACE_NO
;
1239 arg_uid_shift
= UID_INVALID
;
1240 arg_uid_range
= UINT32_C(0x10000);
1241 } else if (boolean
> 0) {
1242 /* yes: User namespacing on, UID range is read from root dir */
1243 arg_userns_mode
= USER_NAMESPACE_FIXED
;
1244 arg_uid_shift
= UID_INVALID
;
1245 arg_uid_range
= UINT32_C(0x10000);
1246 } else if (streq(optarg
, "pick")) {
1247 /* pick: User namespacing on, UID range is picked randomly */
1248 arg_userns_mode
= USER_NAMESPACE_PICK
; /* Note that arg_userns_ownership is
1249 * implied by USER_NAMESPACE_PICK
1251 arg_uid_shift
= UID_INVALID
;
1252 arg_uid_range
= UINT32_C(0x10000);
1254 } else if (streq(optarg
, "identity")) {
1255 /* identity: User namespaces on, UID range is map the 0…0xFFFF range to
1256 * itself, i.e. we don't actually map anything, but do take benefit of
1257 * isolation of capability sets. */
1258 arg_userns_mode
= USER_NAMESPACE_FIXED
;
1260 arg_uid_range
= UINT32_C(0x10000);
1262 _cleanup_free_
char *buffer
= NULL
;
1263 const char *range
, *shift
;
1265 /* anything else: User namespacing on, UID range is explicitly configured */
1267 range
= strchr(optarg
, ':');
1269 buffer
= strndup(optarg
, range
- optarg
);
1275 r
= safe_atou32(range
, &arg_uid_range
);
1277 return log_error_errno(r
, "Failed to parse UID range \"%s\": %m", range
);
1281 r
= parse_uid(shift
, &arg_uid_shift
);
1283 return log_error_errno(r
, "Failed to parse UID \"%s\": %m", optarg
);
1285 arg_userns_mode
= USER_NAMESPACE_FIXED
;
1287 if (!userns_shift_range_valid(arg_uid_shift
, arg_uid_range
))
1288 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "UID range cannot be empty or go beyond " UID_FMT
".", UID_INVALID
);
1291 arg_settings_mask
|= SETTING_USERNS
;
1296 if (userns_supported()) {
1297 arg_userns_mode
= USER_NAMESPACE_PICK
; /* Note that arg_userns_ownership is
1298 * implied by USER_NAMESPACE_PICK
1300 arg_uid_shift
= UID_INVALID
;
1301 arg_uid_range
= UINT32_C(0x10000);
1303 arg_settings_mask
|= SETTING_USERNS
;
1308 case ARG_PRIVATE_USERS_CHOWN
:
1309 arg_userns_ownership
= USER_NAMESPACE_OWNERSHIP_CHOWN
;
1311 arg_settings_mask
|= SETTING_USERNS
;
1314 case ARG_PRIVATE_USERS_OWNERSHIP
:
1315 if (streq(optarg
, "help")) {
1316 DUMP_STRING_TABLE(user_namespace_ownership
, UserNamespaceOwnership
, _USER_NAMESPACE_OWNERSHIP_MAX
);
1320 arg_userns_ownership
= user_namespace_ownership_from_string(optarg
);
1321 if (arg_userns_ownership
< 0)
1322 return log_error_errno(arg_userns_ownership
, "Cannot parse --user-namespace-ownership= value: %s", optarg
);
1324 arg_settings_mask
|= SETTING_USERNS
;
1327 case ARG_KILL_SIGNAL
:
1328 if (streq(optarg
, "help")) {
1329 DUMP_STRING_TABLE(signal
, int, _NSIG
);
1333 arg_kill_signal
= signal_from_string(optarg
);
1334 if (arg_kill_signal
< 0)
1335 return log_error_errno(arg_kill_signal
, "Cannot parse signal: %s", optarg
);
1337 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
1342 /* no → do not read files
1343 * yes → read files, do not override cmdline, trust only subset
1344 * override → read files, override cmdline, trust only subset
1345 * trusted → read files, do not override cmdline, trust all
1348 r
= parse_boolean(optarg
);
1350 if (streq(optarg
, "trusted")) {
1351 mask_all_settings
= false;
1352 mask_no_settings
= false;
1353 arg_settings_trusted
= true;
1355 } else if (streq(optarg
, "override")) {
1356 mask_all_settings
= false;
1357 mask_no_settings
= true;
1358 arg_settings_trusted
= -1;
1360 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
1363 mask_all_settings
= false;
1364 mask_no_settings
= false;
1365 arg_settings_trusted
= -1;
1368 mask_all_settings
= true;
1369 mask_no_settings
= false;
1370 arg_settings_trusted
= false;
1376 if (!path_is_absolute(optarg
))
1377 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1378 "Working directory %s is not an absolute path.", optarg
);
1380 r
= free_and_strdup(&arg_chdir
, optarg
);
1384 arg_settings_mask
|= SETTING_WORKING_DIRECTORY
;
1387 case ARG_PIVOT_ROOT
:
1388 r
= pivot_root_parse(&arg_pivot_root_new
, &arg_pivot_root_old
, optarg
);
1390 return log_error_errno(r
, "Failed to parse --pivot-root= argument %s: %m", optarg
);
1392 arg_settings_mask
|= SETTING_PIVOT_ROOT
;
1395 case ARG_NOTIFY_READY
:
1396 r
= parse_boolean(optarg
);
1398 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1399 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg
);
1400 arg_notify_ready
= r
;
1401 arg_settings_mask
|= SETTING_NOTIFY_READY
;
1404 case ARG_ROOT_HASH
: {
1405 _cleanup_free_
void *k
= NULL
;
1408 r
= unhexmem(optarg
, strlen(optarg
), &k
, &l
);
1410 return log_error_errno(r
, "Failed to parse root hash: %s", optarg
);
1411 if (l
< sizeof(sd_id128_t
))
1412 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Root hash must be at least 128bit long: %s", optarg
);
1414 free_and_replace(arg_verity_settings
.root_hash
, k
);
1415 arg_verity_settings
.root_hash_size
= l
;
1419 case ARG_ROOT_HASH_SIG
: {
1424 if ((value
= startswith(optarg
, "base64:"))) {
1425 r
= unbase64mem(value
, strlen(value
), &p
, &l
);
1427 return log_error_errno(r
, "Failed to parse root hash signature '%s': %m", optarg
);
1430 r
= read_full_file(optarg
, (char**) &p
, &l
);
1432 return log_error_errno(r
, "Failed parse root hash signature file '%s': %m", optarg
);
1435 free_and_replace(arg_verity_settings
.root_hash_sig
, p
);
1436 arg_verity_settings
.root_hash_sig_size
= l
;
1440 case ARG_VERITY_DATA
:
1441 r
= parse_path_argument(optarg
, false, &arg_verity_settings
.data_path
);
1446 case ARG_SYSTEM_CALL_FILTER
: {
1450 negative
= optarg
[0] == '~';
1451 items
= negative
? optarg
+ 1 : optarg
;
1454 _cleanup_free_
char *word
= NULL
;
1456 r
= extract_first_word(&items
, &word
, NULL
, 0);
1462 return log_error_errno(r
, "Failed to parse system call filter: %m");
1465 r
= strv_extend(&arg_syscall_deny_list
, word
);
1467 r
= strv_extend(&arg_syscall_allow_list
, word
);
1472 arg_settings_mask
|= SETTING_SYSCALL_FILTER
;
1478 _cleanup_free_
char *name
= NULL
;
1481 if (streq(optarg
, "help")) {
1482 DUMP_STRING_TABLE(rlimit
, int, _RLIMIT_MAX
);
1486 eq
= strchr(optarg
, '=');
1488 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1489 "--rlimit= expects an '=' assignment.");
1491 name
= strndup(optarg
, eq
- optarg
);
1495 rl
= rlimit_from_string_harder(name
);
1497 return log_error_errno(rl
, "Unknown resource limit: %s", name
);
1499 if (!arg_rlimit
[rl
]) {
1500 arg_rlimit
[rl
] = new0(struct rlimit
, 1);
1501 if (!arg_rlimit
[rl
])
1505 r
= rlimit_parse(rl
, eq
+ 1, arg_rlimit
[rl
]);
1507 return log_error_errno(r
, "Failed to parse resource limit: %s", eq
+ 1);
1509 arg_settings_mask
|= SETTING_RLIMIT_FIRST
<< rl
;
1513 case ARG_OOM_SCORE_ADJUST
:
1514 r
= parse_oom_score_adjust(optarg
, &arg_oom_score_adjust
);
1516 return log_error_errno(r
, "Failed to parse --oom-score-adjust= parameter: %s", optarg
);
1518 arg_oom_score_adjust_set
= true;
1519 arg_settings_mask
|= SETTING_OOM_SCORE_ADJUST
;
1522 case ARG_CPU_AFFINITY
: {
1525 r
= parse_cpu_set(optarg
, &cpuset
);
1527 return log_error_errno(r
, "Failed to parse CPU affinity mask %s: %m", optarg
);
1529 cpu_set_reset(&arg_cpu_set
);
1530 arg_cpu_set
= cpuset
;
1531 arg_settings_mask
|= SETTING_CPU_AFFINITY
;
1535 case ARG_RESOLV_CONF
:
1536 if (streq(optarg
, "help")) {
1537 DUMP_STRING_TABLE(resolv_conf_mode
, ResolvConfMode
, _RESOLV_CONF_MODE_MAX
);
1541 arg_resolv_conf
= resolv_conf_mode_from_string(optarg
);
1542 if (arg_resolv_conf
< 0)
1543 return log_error_errno(arg_resolv_conf
,
1544 "Failed to parse /etc/resolv.conf mode: %s", optarg
);
1546 arg_settings_mask
|= SETTING_RESOLV_CONF
;
1550 if (streq(optarg
, "help")) {
1551 DUMP_STRING_TABLE(timezone_mode
, TimezoneMode
, _TIMEZONE_MODE_MAX
);
1555 arg_timezone
= timezone_mode_from_string(optarg
);
1556 if (arg_timezone
< 0)
1557 return log_error_errno(arg_timezone
,
1558 "Failed to parse /etc/localtime mode: %s", optarg
);
1560 arg_settings_mask
|= SETTING_TIMEZONE
;
1564 r
= handle_arg_console(optarg
);
1571 r
= handle_arg_console("pipe");
1577 arg_pager_flags
|= PAGER_DISABLE
;
1580 case ARG_SET_CREDENTIAL
: {
1581 _cleanup_free_
char *word
= NULL
, *data
= NULL
;
1582 const char *p
= optarg
;
1586 r
= extract_first_word(&p
, &word
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
1590 return log_error_errno(r
, "Failed to parse --set-credential= parameter: %m");
1592 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Missing value for --set-credential=: %s", optarg
);
1594 if (!credential_name_valid(word
))
1595 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Credential name is not valid: %s", word
);
1597 for (size_t i
= 0; i
< arg_n_credentials
; i
++)
1598 if (streq(arg_credentials
[i
].id
, word
))
1599 return log_error_errno(SYNTHETIC_ERRNO(EEXIST
), "Duplicate credential '%s', refusing.", word
);
1601 l
= cunescape(p
, UNESCAPE_ACCEPT_NUL
, &data
);
1603 return log_error_errno(l
, "Failed to unescape credential data: %s", p
);
1605 a
= reallocarray(arg_credentials
, arg_n_credentials
+ 1, sizeof(Credential
));
1609 a
[arg_n_credentials
++] = (Credential
) {
1610 .id
= TAKE_PTR(word
),
1611 .data
= TAKE_PTR(data
),
1615 arg_credentials
= a
;
1617 arg_settings_mask
|= SETTING_CREDENTIALS
;
1621 case ARG_LOAD_CREDENTIAL
: {
1622 ReadFullFileFlags flags
= READ_FULL_FILE_SECURE
;
1623 _cleanup_(erase_and_freep
) char *data
= NULL
;
1624 _cleanup_free_
char *word
= NULL
, *j
= NULL
;
1625 const char *p
= optarg
;
1629 r
= extract_first_word(&p
, &word
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
1633 return log_error_errno(r
, "Failed to parse --load-credential= parameter: %m");
1635 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Missing value for --load-credential=: %s", optarg
);
1637 if (!credential_name_valid(word
))
1638 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Credential name is not valid: %s", word
);
1640 for (i
= 0; i
< arg_n_credentials
; i
++)
1641 if (streq(arg_credentials
[i
].id
, word
))
1642 return log_error_errno(SYNTHETIC_ERRNO(EEXIST
), "Duplicate credential '%s', refusing.", word
);
1644 if (path_is_absolute(p
))
1645 flags
|= READ_FULL_FILE_CONNECT_SOCKET
;
1649 r
= get_credentials_dir(&e
);
1651 return log_error_errno(r
, "Credential not available (no credentials passed at all): %s", word
);
1653 j
= path_join(e
, p
);
1658 r
= read_full_file_full(AT_FDCWD
, j
?: p
, UINT64_MAX
, SIZE_MAX
,
1663 return log_error_errno(r
, "Failed to read credential '%s': %m", j
?: p
);
1665 a
= reallocarray(arg_credentials
, arg_n_credentials
+ 1, sizeof(Credential
));
1669 a
[arg_n_credentials
++] = (Credential
) {
1670 .id
= TAKE_PTR(word
),
1671 .data
= TAKE_PTR(data
),
1675 arg_credentials
= a
;
1677 arg_settings_mask
|= SETTING_CREDENTIALS
;
1682 if (!valid_user_group_name(optarg
, 0))
1683 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Invalid user name to bind: %s", optarg
);
1685 if (strv_extend(&arg_bind_user
, optarg
) < 0)
1688 arg_settings_mask
|= SETTING_BIND_USER
;
1691 case ARG_SUPPRESS_SYNC
:
1692 r
= parse_boolean_argument("--suppress-sync=", optarg
, &arg_suppress_sync
);
1696 arg_settings_mask
|= SETTING_SUPPRESS_SYNC
;
1703 assert_not_reached();
1706 if (argc
> optind
) {
1707 strv_free(arg_parameters
);
1708 arg_parameters
= strv_copy(argv
+ optind
);
1709 if (!arg_parameters
)
1712 arg_settings_mask
|= SETTING_START_MODE
;
1715 if (arg_ephemeral
&& arg_template
&& !arg_directory
)
1716 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1717 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1718 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1720 arg_directory
= TAKE_PTR(arg_template
);
1722 arg_caps_retain
|= plus
;
1723 arg_caps_retain
|= arg_private_network
? UINT64_C(1) << CAP_NET_ADMIN
: 0;
1725 /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have
1726 * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to
1728 if (!arg_private_network
&& arg_userns_mode
!= USER_NAMESPACE_NO
&& arg_uid_shift
> 0)
1729 arg_caps_retain
&= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE
);
1731 arg_caps_retain
&= ~minus
;
1733 /* Make sure to parse environment before we reset the settings mask below */
1734 r
= parse_environment();
1738 /* Load all settings from .nspawn files */
1739 if (mask_no_settings
)
1740 arg_settings_mask
= 0;
1742 /* Don't load any settings from .nspawn files */
1743 if (mask_all_settings
)
1744 arg_settings_mask
= _SETTINGS_MASK_ALL
;
/* verify_arguments(): cross-checks the parsed arg_* settings, fills in derived defaults and rejects option
 * combinations that cannot work together.
 *
 * Effects visible in this function:
 *  - for START_PID2 with CGROUP_UNIFIED_UNKNOWN, arg_unified_cgroup_hierarchy is settled to one of
 *    CGROUP_UNIFIED_ALL, CGROUP_UNIFIED_SYSTEMD or CGROUP_UNIFIED_NONE based on cg_all_unified() and
 *    cg_unified_controller()
 *  - MOUNT_USE_USERNS and MOUNT_APPLY_APIVFS_NETNS are set in arg_mount_settings when user namespacing,
 *    respectively private networking, is enabled
 *  - arg_register is cleared when CLONE_NEWPID or CLONE_NEWUTS is missing from arg_clone_ns_flags
 *  - arg_userns_ownership defaults to AUTO for USER_NAMESPACE_PICK and to OFF otherwise
 *  - arg_kill_signal defaults to SIGRTMIN+3 for START_BOOT
 *  - arg_read_only is forced on for any volatile mode and when has_custom_root_mount() is true
 *  - duplicate arg_bind_user entries are dropped
 *
 * Rejected combinations are reported through log_error_errno() with a synthetic EINVAL (EOPNOTSUPP when
 * userns_supported() is false). */
1749 static int verify_arguments(void) {
1752 if (arg_start_mode
== START_PID2
&& arg_unified_cgroup_hierarchy
== CGROUP_UNIFIED_UNKNOWN
) {
1753 /* If we are running the stub init in the container, we don't need to look at what the init
1754 * in the container supports, because we are not using it. Let's immediately pick the right
1755 * setting based on the host system configuration.
1757 * We only do this, if the user didn't use an environment variable to override the detection.
1760 r
= cg_all_unified();
1762 return log_error_errno(r
, "Failed to determine whether we are in all unified mode.");
1764 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_ALL
;
1765 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
) > 0)
1766 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_SYSTEMD
;
1768 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
1771 if (arg_userns_mode
!= USER_NAMESPACE_NO
)
1772 arg_mount_settings
|= MOUNT_USE_USERNS
;
1774 if (arg_private_network
)
1775 arg_mount_settings
|= MOUNT_APPLY_APIVFS_NETNS
;
1777 if (!(arg_clone_ns_flags
& CLONE_NEWPID
) ||
1778 !(arg_clone_ns_flags
& CLONE_NEWUTS
)) {
1779 arg_register
= false;
1780 if (arg_start_mode
!= START_PID1
)
1781 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--boot cannot be used without namespacing.");
1784 if (arg_userns_ownership
< 0)
1785 arg_userns_ownership
=
1786 arg_userns_mode
== USER_NAMESPACE_PICK
? USER_NAMESPACE_OWNERSHIP_AUTO
:
1787 USER_NAMESPACE_OWNERSHIP_OFF
;
1789 if (arg_start_mode
== START_BOOT
&& arg_kill_signal
<= 0)
1790 arg_kill_signal
= SIGRTMIN
+3;
1792 if (arg_volatile_mode
!= VOLATILE_NO
) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1793 arg_read_only
= true;
1795 if (has_custom_root_mount(arg_custom_mounts
, arg_n_custom_mounts
))
1796 arg_read_only
= true;
1798 if (arg_keep_unit
&& arg_register
&& cg_pid_get_owner_uid(0, NULL
) >= 0)
1799 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1800 * The latter is not technically a user session, but we don't need to labour the point. */
1801 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--keep-unit --register=yes may not be used when invoked from a user session.");
1803 if (arg_directory
&& arg_image
)
1804 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--directory= and --image= may not be combined.");
1806 if (arg_template
&& arg_image
)
1807 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--template= and --image= may not be combined.");
1809 if (arg_template
&& !(arg_directory
|| arg_machine
))
1810 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--template= needs --directory= or --machine=.");
1812 if (arg_ephemeral
&& arg_template
)
1813 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--ephemeral and --template= may not be combined.");
1815 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
))
1816 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--ephemeral and --link-journal= may not be combined.");
/* User namespacing was requested, but userns_supported() reports that it is unavailable. */
1818 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& !userns_supported())
1819 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
), "--private-users= is not supported, kernel compiled without user namespace support.");
1821 if (arg_userns_ownership
== USER_NAMESPACE_OWNERSHIP_CHOWN
&& arg_read_only
)
1822 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1823 "--read-only and --private-users-ownership=chown may not be combined.");
1825 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1826 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1827 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1828 if (arg_userns_ownership
== USER_NAMESPACE_OWNERSHIP_CHOWN
&& arg_volatile_mode
!= VOLATILE_NO
)
1829 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--volatile= and --private-users-ownership=chown may not be combined.");
1831 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1832 * we need to error out, to avoid conflicts between different network options. */
1833 if (arg_network_namespace_path
&&
1834 (arg_network_interfaces
|| arg_network_macvlan
||
1835 arg_network_ipvlan
|| arg_network_veth_extra
||
1836 arg_network_bridge
|| arg_network_zone
||
1838 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--network-namespace-path= cannot be combined with other network options.");
1840 if (arg_network_bridge
&& arg_network_zone
)
1841 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1842 "--network-bridge= and --network-zone= may not be combined.");
1844 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& (arg_mount_settings
& MOUNT_APPLY_APIVFS_NETNS
) && !arg_private_network
)
1845 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1847 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& !(arg_mount_settings
& MOUNT_APPLY_APIVFS_RO
))
1848 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Cannot combine --private-users with read-write mounts.");
1850 if (arg_expose_ports
&& !arg_private_network
)
1851 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Cannot use --port= without private networking.");
/* Ambient capabilities: the all-bits value is rejected, the set has to be a subset of arg_caps_retain, and
 * it may not be combined with START_BOOT. */
1853 if (arg_caps_ambient
) {
1854 if (arg_caps_ambient
== UINT64_MAX
)
1855 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "AmbientCapability= does not support the value all.");
1857 if ((arg_caps_ambient
& arg_caps_retain
) != arg_caps_ambient
)
1858 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "AmbientCapability= setting is not fully covered by Capability= setting.");
1860 if (arg_start_mode
== START_BOOT
)
1861 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "AmbientCapability= setting is not useful for boot mode.");
1864 if (arg_userns_mode
== USER_NAMESPACE_NO
&& !strv_isempty(arg_bind_user
))
1865 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--bind-user= requires --private-users");
1867 /* Drop duplicate --bind-user= entries */
1868 strv_uniq(arg_bind_user
);
/* Last visible step: run custom_mount_check_all() and keep its result in r. */
1870 r
= custom_mount_check_all();
/* userns_lchown(): lchown() wrapper that applies the configured user namespace ID shift.
 *
 * p   - path handed to lchown()
 * uid - UID to set; UID_INVALID skips the UID shift
 * gid - GID to set; GID_INVALID skips the GID shift
 *
 * A valid ID has arg_uid_shift added to it and is then checked against the window
 * [arg_uid_shift, arg_uid_shift + arg_uid_range). The final result is RET_NERRNO(lchown(p, uid, gid)).
 *
 * NOTE(review): the values returned by the early exits (user namespacing off, both IDs invalid, shifted ID
 * outside the window) are not visible here; confirm them against the full file. */
1877 int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
1880 if (arg_userns_mode
== USER_NAMESPACE_NO
)
1883 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
1886 if (uid
!= UID_INVALID
) {
1887 uid
+= arg_uid_shift
;
/* The lower-bound test also catches the addition above wrapping around (presumably uid_t is unsigned). */
1889 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1893 if (gid
!= GID_INVALID
) {
1894 gid
+= (gid_t
) arg_uid_shift
;
1896 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1900 return RET_NERRNO(lchown(p
, uid
, gid
));
/* userns_mkdir(): creates a directory below root and passes it on to userns_lchown().
 *
 * The target path q is prefix_roota(root, path). mkdir(q, mode) is wrapped in RET_NERRNO(), and the function
 * ends by returning userns_lchown(q, uid, gid).
 *
 * NOTE(review): how the mkdir() result in r is evaluated (for example for an already existing directory) is
 * not visible here; confirm against the full file. */
1903 int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
1907 q
= prefix_roota(root
, path
);
1908 r
= RET_NERRNO(mkdir(q
, mode
));
1914 return userns_lchown(q
, uid
, gid
);
/* timezone_from_path(): returns the result of PATH_STARTSWITH_SET() for the two zoneinfo prefixes
 * "../usr/share/zoneinfo/" and "/usr/share/zoneinfo/".
 *
 * setup_timezone() uses the result as a zone name: it appends it to ".../usr/share/zoneinfo/" and treats a
 * missing result as "does not point into the zoneinfo directory". */
1917 static const char *timezone_from_path(const char *path
) {
1918 return PATH_STARTSWITH_SET(
1920 "../usr/share/zoneinfo/",
1921 "/usr/share/zoneinfo/");
/* etc_writable(): true unless arg_read_only is set. A read-only container still counts as writable when
 * arg_volatile_mode is VOLATILE_YES or VOLATILE_OVERLAY. */
1924 static bool etc_writable(void) {
1925 return !arg_read_only
|| IN_SET(arg_volatile_mode
, VOLATILE_YES
, VOLATILE_OVERLAY
);
/* setup_timezone(): makes the container's /etc/localtime follow the host, as selected by arg_timezone.
 *
 * dest - container root; "/etc" is resolved below it with chase() and CHASE_PREFIX_ROOT.
 *
 * For TIMEZONE_AUTO the effective mode m depends on the host's /etc/localtime and on etc_writable():
 *   readlink fails with -ENOENT  -> TIMEZONE_DELETE if writable, else TIMEZONE_OFF
 *   readlink fails with -EINVAL  -> TIMEZONE_COPY if writable, else TIMEZONE_BIND (regular file)
 *   otherwise                    -> TIMEZONE_SYMLINK if writable, else TIMEZONE_BIND
 *
 * Mode handling visible below:
 *   TIMEZONE_DELETE   unlinks the container's file
 *   TIMEZONE_SYMLINK  replaces it with a symlink to ../usr/share/zoneinfo/<zone>
 *   TIMEZONE_BIND     bind-mounts the host file and remounts it read-only; a copy is tried if mounting fails
 *   copy path         copy_file_atomic() with COPY_REFLINK|COPY_REPLACE
 * Afterwards ownership is adjusted with userns_lchown(where, 0, 0).
 *
 * NOTE(review): most failure paths log a message that says "ignoring" or "not updating"; the matching return
 * statements are not visible here, so confirm what they return. */
1928 static int setup_timezone(const char *dest
) {
1929 _cleanup_free_
char *p
= NULL
, *etc
= NULL
;
1930 const char *where
, *check
;
/* Only the AUTO and SYMLINK modes need to know where the host's /etc/localtime points. */
1936 if (IN_SET(arg_timezone
, TIMEZONE_AUTO
, TIMEZONE_SYMLINK
)) {
1937 r
= readlink_malloc("/etc/localtime", &p
);
1938 if (r
== -ENOENT
&& arg_timezone
== TIMEZONE_AUTO
)
1939 m
= etc_writable() ? TIMEZONE_DELETE
: TIMEZONE_OFF
;
1940 else if (r
== -EINVAL
&& arg_timezone
== TIMEZONE_AUTO
) /* regular file? */
1941 m
= etc_writable() ? TIMEZONE_COPY
: TIMEZONE_BIND
;
1943 log_warning_errno(r
, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1944 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1948 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1951 } else if (arg_timezone
== TIMEZONE_AUTO
)
1952 m
= etc_writable() ? TIMEZONE_SYMLINK
: TIMEZONE_BIND
;
1958 if (m
== TIMEZONE_OFF
)
1961 r
= chase("/etc", dest
, CHASE_PREFIX_ROOT
, &etc
, NULL
);
1963 log_warning_errno(r
, "Failed to resolve /etc path in container, ignoring: %m");
1967 where
= strjoina(etc
, "/localtime");
1971 case TIMEZONE_DELETE
:
1972 if (unlink(where
) < 0)
1973 log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
, "Failed to remove '%s', ignoring: %m", where
);
1977 case TIMEZONE_SYMLINK
: {
1978 _cleanup_free_
char *q
= NULL
;
1979 const char *z
, *what
;
1981 z
= timezone_from_path(p
);
1983 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1987 r
= readlink_malloc(where
, &q
);
1988 if (r
>= 0 && streq_ptr(timezone_from_path(q
), z
))
1989 return 0; /* Already pointing to the right place? Then do nothing .. */
/* The symlink is only created if the zone file can be resolved inside the container tree. */
1991 check
= strjoina(dest
, "/usr/share/zoneinfo/", z
);
1992 r
= chase(check
, dest
, 0, NULL
, NULL
);
1994 log_debug_errno(r
, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z
);
1996 if (unlink(where
) < 0 && errno
!= ENOENT
) {
1997 log_full_errno(IN_SET(errno
, EROFS
, EACCES
, EPERM
) ? LOG_DEBUG
: LOG_WARNING
, /* Don't complain on read-only images */
1998 errno
, "Failed to remove existing timezone info %s in container, ignoring: %m", where
);
2002 what
= strjoina("../usr/share/zoneinfo/", z
);
2003 if (symlink(what
, where
) < 0) {
2004 log_full_errno(IN_SET(errno
, EROFS
, EACCES
, EPERM
) ? LOG_DEBUG
: LOG_WARNING
,
2005 errno
, "Failed to correct timezone of container, ignoring: %m");
2015 case TIMEZONE_BIND
: {
2016 _cleanup_free_
char *resolved
= NULL
;
2019 found
= chase(where
, dest
, CHASE_NONEXISTENT
, &resolved
, NULL
);
2021 log_warning_errno(found
, "Failed to resolve /etc/localtime path in container, ignoring: %m");
2025 if (found
== 0) /* missing? */
2026 (void) touch(resolved
);
2028 r
= mount_nofollow_verbose(LOG_WARNING
, "/etc/localtime", resolved
, NULL
, MS_BIND
, NULL
);
2030 return mount_nofollow_verbose(LOG_ERR
, NULL
, resolved
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
);
2036 /* If mounting failed, try to copy */
2037 r
= copy_file_atomic("/etc/localtime", where
, 0644, COPY_REFLINK
|COPY_REPLACE
);
2039 log_full_errno(IN_SET(r
, -EROFS
, -EACCES
, -EPERM
) ? LOG_DEBUG
: LOG_WARNING
, r
,
2040 "Failed to copy /etc/localtime to %s, ignoring: %m", where
);
2047 assert_not_reached();
2050 /* Fix permissions of the symlink or file copy we just created */
2051 r
= userns_lchown(where
, 0, 0);
2053 log_warning_errno(r
, "Failed to chown /etc/localtime, ignoring: %m");
/* have_resolv_conf(): probes path with access(path, F_OK).
 *
 * A failure with ENOENT is handled separately from other errors, which are logged at debug level and
 * returned through log_debug_errno(). setup_resolv_conf() tests the result with "> 0".
 *
 * NOTE(review): the values returned for the "missing" and "present" cases are not visible here. */
2058 static int have_resolv_conf(const char *path
) {
2061 if (access(path
, F_OK
) < 0) {
2062 if (errno
== ENOENT
)
2065 return log_debug_errno(errno
, "Failed to determine whether '%s' is available: %m", path
);
/* resolved_listening(): asks over the system bus whether resolved's DNS stub listener is enabled.
 *
 * Steps: open the system bus, check whether the name org.freedesktop.resolve1 has an owner, then read a
 * string property of the org.freedesktop.resolve1.Manager interface into dns_stub_listener_mode (the error
 * message names it DNSStubListener).
 *
 * The final result is whether that string is "udp" or "yes". Bus failures are logged at debug level and
 * returned through log_debug_errno(). */
2071 static int resolved_listening(void) {
2072 _cleanup_(sd_bus_error_free
) sd_bus_error error
= SD_BUS_ERROR_NULL
;
2073 _cleanup_(sd_bus_flush_close_unrefp
) sd_bus
*bus
= NULL
;
2074 _cleanup_free_
char *dns_stub_listener_mode
= NULL
;
2077 /* Check if resolved is listening */
2079 r
= sd_bus_open_system(&bus
);
2081 return log_debug_errno(r
, "Failed to open system bus: %m");
2083 r
= bus_name_has_owner(bus
, "org.freedesktop.resolve1", NULL
);
2085 return log_debug_errno(r
, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2089 r
= sd_bus_get_property_string(bus
,
2090 "org.freedesktop.resolve1",
2091 "/org/freedesktop/resolve1",
2092 "org.freedesktop.resolve1.Manager",
2095 &dns_stub_listener_mode
);
2097 return log_debug_errno(r
, "Failed to query DNSStubListener property: %s", bus_error_message(&error
, r
));
2099 return STR_IN_SET(dns_stub_listener_mode
, "udp", "yes");
/* setup_resolv_conf(): provides the container's /etc/resolv.conf as selected by arg_resolv_conf.
 *
 * dest - container root; "/etc" is resolved below it with chase() and CHASE_PREFIX_ROOT.
 *
 * RESOLV_CONF_AUTO is mapped to a concrete mode m:
 *   arg_private_network set                          -> RESOLV_CONF_OFF
 *   stub file exists and resolved_listening() > 0    -> COPY_STUB if etc_writable(), else BIND_STUB
 *   host /etc/resolv.conf exists                     -> COPY_HOST if etc_writable(), else BIND_HOST
 *   otherwise                                        -> DELETE if etc_writable(), else OFF
 * Any other arg_resolv_conf value is used as m directly.
 *
 * RESOLV_CONF_DELETE unlinks the container's file. For the remaining modes the source file "what" follows
 * the mode family (STATIC, UPLINK, STUB, else the host's /etc/resolv.conf). The BIND modes bind-mount it and
 * remount it read-only, and fall through to copying if that did not work. Afterwards ownership is adjusted
 * with userns_lchown(where, 0, 0). */
2102 static int setup_resolv_conf(const char *dest
) {
2103 _cleanup_free_
char *etc
= NULL
;
2104 const char *where
, *what
;
2110 if (arg_resolv_conf
== RESOLV_CONF_AUTO
) {
2111 if (arg_private_network
)
2112 m
= RESOLV_CONF_OFF
;
2113 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF
) > 0 && resolved_listening() > 0)
2114 m
= etc_writable() ? RESOLV_CONF_COPY_STUB
: RESOLV_CONF_BIND_STUB
;
2115 else if (have_resolv_conf("/etc/resolv.conf") > 0)
2116 m
= etc_writable() ? RESOLV_CONF_COPY_HOST
: RESOLV_CONF_BIND_HOST
;
2118 m
= etc_writable() ? RESOLV_CONF_DELETE
: RESOLV_CONF_OFF
;
2121 m
= arg_resolv_conf
;
2123 if (m
== RESOLV_CONF_OFF
)
2126 r
= chase("/etc", dest
, CHASE_PREFIX_ROOT
, &etc
, NULL
);
2128 log_warning_errno(r
, "Failed to resolve /etc path in container, ignoring: %m");
2132 where
= strjoina(etc
, "/resolv.conf");
2134 if (m
== RESOLV_CONF_DELETE
) {
2135 if (unlink(where
) < 0)
2136 log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
, "Failed to remove '%s', ignoring: %m", where
);
/* Pick the source file that belongs to the selected mode family. */
2141 if (IN_SET(m
, RESOLV_CONF_BIND_STATIC
, RESOLV_CONF_REPLACE_STATIC
, RESOLV_CONF_COPY_STATIC
))
2142 what
= PRIVATE_STATIC_RESOLV_CONF
;
2143 else if (IN_SET(m
, RESOLV_CONF_BIND_UPLINK
, RESOLV_CONF_REPLACE_UPLINK
, RESOLV_CONF_COPY_UPLINK
))
2144 what
= PRIVATE_UPLINK_RESOLV_CONF
;
2145 else if (IN_SET(m
, RESOLV_CONF_BIND_STUB
, RESOLV_CONF_REPLACE_STUB
, RESOLV_CONF_COPY_STUB
))
2146 what
= PRIVATE_STUB_RESOLV_CONF
;
2148 what
= "/etc/resolv.conf";
2150 if (IN_SET(m
, RESOLV_CONF_BIND_HOST
, RESOLV_CONF_BIND_STATIC
, RESOLV_CONF_BIND_UPLINK
, RESOLV_CONF_BIND_STUB
)) {
2151 _cleanup_free_
char *resolved
= NULL
;
2154 found
= chase(where
, dest
, CHASE_NONEXISTENT
, &resolved
, NULL
);
2156 log_warning_errno(found
, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2160 if (found
== 0) /* missing? */
2161 (void) touch(resolved
);
2163 r
= mount_nofollow_verbose(LOG_WARNING
, what
, resolved
, NULL
, MS_BIND
, NULL
);
2165 return mount_nofollow_verbose(LOG_ERR
, NULL
, resolved
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
);
2167 /* If that didn't work, let's copy the file */
/* The REPLACE modes replace the target through copy_file_atomic() with COPY_REPLACE; every other mode that
 * gets here writes through copy_file() with O_TRUNC|O_NOFOLLOW. */
2170 if (IN_SET(m
, RESOLV_CONF_REPLACE_HOST
, RESOLV_CONF_REPLACE_STATIC
, RESOLV_CONF_REPLACE_UPLINK
, RESOLV_CONF_REPLACE_STUB
))
2171 r
= copy_file_atomic(what
, where
, 0644, COPY_REFLINK
|COPY_REPLACE
);
2173 r
= copy_file(what
, where
, O_TRUNC
|O_NOFOLLOW
, 0644, COPY_REFLINK
);
/* NOTE(review): in the log_full_errno() call below, the first IN_SET() is given only RESOLV_CONF_COPY
 * constants and never the mode m, so that test does not depend on the selected mode. It looks like "m" was
 * meant to be the first argument; confirm the intended log level before changing it. */
2175 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2176 * resolved or something similar runs inside and the symlink points there.
2178 * If the disk image is read-only, there's also no point in complaining.
2180 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST
, RESOLV_CONF_COPY_STATIC
, RESOLV_CONF_COPY_UPLINK
, RESOLV_CONF_COPY_STUB
) &&
2181 IN_SET(r
, -ELOOP
, -EROFS
, -EACCES
, -EPERM
) ? LOG_DEBUG
: LOG_WARNING
, r
,
2182 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where
);
2186 r
= userns_lchown(where
, 0, 0);
2188 log_warning_errno(r
, "Failed to chown /etc/resolv.conf, ignoring: %m");
2193 static int setup_boot_id(void) {
2194 _cleanup_(unlink_and_freep
) char *from
= NULL
;
2195 _cleanup_free_
char *path
= NULL
;
2196 sd_id128_t rnd
= SD_ID128_NULL
;
2200 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
/* Overview of setup_boot_id(): a random child path below /run is generated, a freshly randomized ID is
 * written there in ID128_FORMAT_UUID form, and that file is bind-mounted over
 * /proc/sys/kernel/random/boot_id. The mount is then remounted with
 * MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, and the result of that remount is what the function returns.
 *
 * "from" is declared with the unlink_and_freep cleanup handler and takes over the temporary path through
 * TAKE_PTR() once the ID has been written. Each failing step before the mounts is reported with
 * log_error_errno(). */
2202 r
= tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path
);
2204 return log_error_errno(r
, "Failed to generate random boot ID path: %m");
2206 r
= sd_id128_randomize(&rnd
);
2208 return log_error_errno(r
, "Failed to generate random boot id: %m");
2210 r
= id128_write(path
, ID128_FORMAT_UUID
, rnd
);
2212 return log_error_errno(r
, "Failed to write boot id: %m");
2214 from
= TAKE_PTR(path
);
2215 to
= "/proc/sys/kernel/random/boot_id";
2217 r
= mount_nofollow_verbose(LOG_ERR
, from
, to
, NULL
, MS_BIND
, NULL
);
2221 return mount_nofollow_verbose(LOG_ERR
, NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
2224 static int copy_devnodes(const char *dest
) {
2225 static const char devnodes
[] =
2238 BLOCK_WITH_UMASK(0000);
2240 /* Create /dev/net, so that we can create /dev/net/tun in it */
2241 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
2242 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
2244 NULSTR_FOREACH(d
, devnodes
) {
2245 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
2248 from
= path_join("/dev/", d
);
2252 to
= path_join(dest
, from
);
2256 if (stat(from
, &st
) < 0) {
2258 if (errno
!= ENOENT
)
2259 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
2261 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
))
2262 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
2263 "%s is not a char or block device, cannot copy.", from
);
2265 _cleanup_free_
char *sl
= NULL
, *prefixed
= NULL
, *dn
= NULL
, *t
= NULL
;
2267 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
2268 /* Explicitly warn the user when /dev is already populated. */
2269 if (errno
== EEXIST
)
2270 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest
);
2272 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
2274 /* Some systems abusively restrict mknod but allow bind mounts. */
2277 return log_error_errno(r
, "touch (%s) failed: %m", to
);
2278 r
= mount_nofollow_verbose(LOG_DEBUG
, from
, to
, NULL
, MS_BIND
, NULL
);
2280 return log_error_errno(r
, "Both mknod and bind mount (%s) failed: %m", to
);
2283 r
= userns_lchown(to
, 0, 0);
2285 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
2287 dn
= path_join("/dev", S_ISCHR(st
.st_mode
) ? "char" : "block");
2291 r
= userns_mkdir(dest
, dn
, 0755, 0, 0);
2293 return log_error_errno(r
, "Failed to create '%s': %m", dn
);
2295 if (asprintf(&sl
, "%s/%u:%u", dn
, major(st
.st_rdev
), minor(st
.st_rdev
)) < 0)
2298 prefixed
= path_join(dest
, sl
);
2302 t
= path_join("..", d
);
2306 if (symlink(t
, prefixed
) < 0)
2307 log_debug_errno(errno
, "Failed to symlink '%s' to '%s': %m", t
, prefixed
);
2314 static int make_extra_nodes(const char *dest
) {
2318 BLOCK_WITH_UMASK(0000);
2320 for (i
= 0; i
< arg_n_extra_nodes
; i
++) {
2321 _cleanup_free_
char *path
= NULL
;
2322 DeviceNode
*n
= arg_extra_nodes
+ i
;
2324 path
= path_join(dest
, n
->path
);
2328 if (mknod(path
, n
->mode
, S_ISCHR(n
->mode
) || S_ISBLK(n
->mode
) ? makedev(n
->major
, n
->minor
) : 0) < 0)
2329 return log_error_errno(errno
, "Failed to create device node '%s': %m", path
);
2331 r
= chmod_and_chown(path
, n
->mode
, n
->uid
, n
->gid
);
2333 return log_error_errno(r
, "Failed to adjust device node ownership of '%s': %m", path
);
2339 static int setup_pts(const char *dest
) {
2340 _cleanup_free_
char *options
= NULL
;
2345 if (arg_selinux_apifs_context
)
2346 (void) asprintf(&options
,
2347 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
2348 arg_uid_shift
+ TTY_GID
,
2349 arg_selinux_apifs_context
);
2352 (void) asprintf(&options
,
2353 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
2354 arg_uid_shift
+ TTY_GID
);
2359 /* Mount /dev/pts itself */
2360 p
= prefix_roota(dest
, "/dev/pts");
2361 r
= RET_NERRNO(mkdir(p
, 0755));
2363 return log_error_errno(r
, "Failed to create /dev/pts: %m");
2365 r
= mount_nofollow_verbose(LOG_ERR
, "devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
);
2368 r
= userns_lchown(p
, 0, 0);
2370 return log_error_errno(r
, "Failed to chown /dev/pts: %m");
2372 /* Create /dev/ptmx symlink */
2373 p
= prefix_roota(dest
, "/dev/ptmx");
2374 if (symlink("pts/ptmx", p
) < 0)
2375 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
2376 r
= userns_lchown(p
, 0, 0);
2378 return log_error_errno(r
, "Failed to chown /dev/ptmx: %m");
2380 /* And fix /dev/pts/ptmx ownership */
2381 p
= prefix_roota(dest
, "/dev/pts/ptmx");
2382 r
= userns_lchown(p
, 0, 0);
2384 return log_error_errno(r
, "Failed to chown /dev/pts/ptmx: %m");
2389 static int setup_stdio_as_dev_console(void) {
2390 _cleanup_close_
int terminal
= -EBADF
;
2393 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2394 * explicitly, if we are configured to. */
2395 terminal
= open_terminal("/dev/console", O_RDWR
|O_NOCTTY
);
2397 return log_error_errno(terminal
, "Failed to open console: %m");
2399 /* Make sure we can continue logging to the original stderr, even if
2400 * stderr points elsewhere now */
2401 r
= log_dup_console();
2403 return log_error_errno(r
, "Failed to duplicate stderr: %m");
2405 /* invalidates 'terminal' on success and failure */
2406 r
= rearrange_stdio(terminal
, terminal
, terminal
);
2409 return log_error_errno(r
, "Failed to move console to stdin/stdout/stderr: %m");
2414 static int setup_dev_console(const char *console
) {
2415 _cleanup_free_
char *p
= NULL
;
2418 /* Create /dev/console symlink */
2419 r
= path_make_relative("/dev", console
, &p
);
2421 return log_error_errno(r
, "Failed to create relative path: %m");
2423 if (symlink(p
, "/dev/console") < 0)
2424 return log_error_errno(errno
, "Failed to create /dev/console symlink: %m");
2429 static int setup_keyring(void) {
2430 key_serial_t keyring
;
2432 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2433 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2434 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2435 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2436 * into the container. */
2438 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
2439 if (keyring
== -1) {
2440 if (errno
== ENOSYS
)
2441 log_debug_errno(errno
, "Kernel keyring not supported, ignoring.");
2442 else if (ERRNO_IS_PRIVILEGE(errno
))
2443 log_debug_errno(errno
, "Kernel keyring access prohibited, ignoring.");
2445 return log_error_errno(errno
, "Setting up kernel keyring failed: %m");
2451 static int setup_credentials(const char *root
) {
2455 if (arg_n_credentials
<= 0)
2458 r
= userns_mkdir(root
, "/run/host", 0755, 0, 0);
2460 return log_error_errno(r
, "Failed to create /run/host: %m");
2462 r
= userns_mkdir(root
, "/run/host/credentials", 0700, 0, 0);
2464 return log_error_errno(r
, "Failed to create /run/host/credentials: %m");
2466 q
= prefix_roota(root
, "/run/host/credentials");
2467 r
= mount_nofollow_verbose(LOG_ERR
, NULL
, q
, "ramfs", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "mode=0700");
2471 for (size_t i
= 0; i
< arg_n_credentials
; i
++) {
2472 _cleanup_free_
char *j
= NULL
;
2473 _cleanup_close_
int fd
= -EBADF
;
2475 j
= path_join(q
, arg_credentials
[i
].id
);
2479 fd
= open(j
, O_CREAT
|O_EXCL
|O_WRONLY
|O_CLOEXEC
|O_NOFOLLOW
, 0600);
2481 return log_error_errno(errno
, "Failed to create credential file %s: %m", j
);
2483 r
= loop_write(fd
, arg_credentials
[i
].data
, arg_credentials
[i
].size
, /* do_poll= */ false);
2485 return log_error_errno(r
, "Failed to write credential to file %s: %m", j
);
2487 if (fchmod(fd
, 0400) < 0)
2488 return log_error_errno(errno
, "Failed to adjust access mode of %s: %m", j
);
2490 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
2491 if (fchown(fd
, arg_uid_shift
, arg_uid_shift
) < 0)
2492 return log_error_errno(errno
, "Failed to adjust ownership of %s: %m", j
);
2496 if (chmod(q
, 0500) < 0)
2497 return log_error_errno(errno
, "Failed to adjust access mode of %s: %m", q
);
2499 r
= userns_lchown(q
, 0, 0);
2503 /* Make both mount and superblock read-only now */
2504 r
= mount_nofollow_verbose(LOG_ERR
, NULL
, q
, NULL
, MS_REMOUNT
|MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
2508 return mount_nofollow_verbose(LOG_ERR
, NULL
, q
, NULL
, MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "mode=0500");
2511 static int setup_kmsg(int fd_inner_socket
) {
2512 _cleanup_(unlink_and_freep
) char *from
= NULL
;
2513 _cleanup_free_
char *fifo
= NULL
;
2514 _cleanup_close_
int fd
= -EBADF
;
2517 assert(fd_inner_socket
>= 0);
2519 BLOCK_WITH_UMASK(0000);
2521 /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
2522 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2523 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2524 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2526 r
= tempfn_random_child("/run", "proc-kmsg", &fifo
);
2528 return log_error_errno(r
, "Failed to generate kmsg path: %m");
2530 if (mkfifo(fifo
, 0600) < 0)
2531 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
2533 from
= TAKE_PTR(fifo
);
2535 r
= mount_nofollow_verbose(LOG_ERR
, from
, "/proc/kmsg", NULL
, MS_BIND
, NULL
);
2539 fd
= open(from
, O_RDWR
|O_NONBLOCK
|O_CLOEXEC
);
2541 return log_error_errno(errno
, "Failed to open fifo: %m");
2543 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2544 r
= send_one_fd(fd_inner_socket
, fd
, 0);
2546 return log_error_errno(r
, "Failed to send FIFO fd: %m");
2552 union in_addr_union address4
;
2553 union in_addr_union address6
;
2554 struct FirewallContext
*fw_ctx
;
2557 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
2558 struct ExposeArgs
*args
= ASSERT_PTR(userdata
);
2563 (void) expose_port_execute(rtnl
, &args
->fw_ctx
, arg_expose_ports
, AF_INET
, &args
->address4
);
2564 (void) expose_port_execute(rtnl
, &args
->fw_ctx
, arg_expose_ports
, AF_INET6
, &args
->address6
);
2568 static int setup_hostname(void) {
2571 if ((arg_clone_ns_flags
& CLONE_NEWUTS
) == 0)
2574 r
= sethostname_idempotent(arg_hostname
?: arg_machine
);
2576 return log_error_errno(r
, "Failed to set hostname: %m");
2581 static int setup_journal(const char *directory
) {
2582 _cleanup_free_
char *d
= NULL
;
2588 /* Don't link journals in ephemeral mode */
2592 if (arg_link_journal
== LINK_NO
)
2595 try = arg_link_journal_try
|| arg_link_journal
== LINK_AUTO
;
2597 r
= sd_id128_get_machine(&this_id
);
2599 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
2601 if (sd_id128_equal(arg_uuid
, this_id
)) {
2602 log_full(try ? LOG_WARNING
: LOG_ERR
,
2603 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid
));
2609 FOREACH_STRING(dirname
, "/var", "/var/log", "/var/log/journal") {
2610 r
= userns_mkdir(directory
, dirname
, 0755, 0, 0);
2612 bool ignore
= r
== -EROFS
&& try;
2613 log_full_errno(ignore
? LOG_DEBUG
: LOG_ERR
, r
,
2614 "Failed to create %s%s: %m", dirname
, ignore
? ", ignoring" : "");
2615 return ignore
? 0 : r
;
2619 p
= strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid
));
2620 q
= prefix_roota(directory
, p
);
2622 if (path_is_mount_point(p
, NULL
, 0) > 0) {
2626 return log_error_errno(SYNTHETIC_ERRNO(EEXIST
),
2627 "%s: already a mount point, refusing to use for journal", p
);
2630 if (path_is_mount_point(q
, NULL
, 0) > 0) {
2634 return log_error_errno(SYNTHETIC_ERRNO(EEXIST
),
2635 "%s: already a mount point, refusing to use for journal", q
);
2638 r
= readlink_and_make_absolute(p
, &d
);
2640 if (IN_SET(arg_link_journal
, LINK_GUEST
, LINK_AUTO
) &&
2643 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2645 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
2650 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
2651 } else if (r
== -EINVAL
) {
2653 if (arg_link_journal
== LINK_GUEST
&&
2656 if (errno
== ENOTDIR
) {
2657 log_error("%s already exists and is neither a symlink nor a directory", p
);
2660 return log_error_errno(errno
, "Failed to remove %s: %m", p
);
2662 } else if (r
!= -ENOENT
)
2663 return log_error_errno(r
, "readlink(%s) failed: %m", p
);
2665 if (arg_link_journal
== LINK_GUEST
) {
2667 if (symlink(q
, p
) < 0) {
2669 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
2672 return log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
2675 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2677 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
2681 if (arg_link_journal
== LINK_HOST
) {
2682 /* don't create parents here — if the host doesn't have
2683 * permanent journal set up, don't force it here */
2685 r
= RET_NERRNO(mkdir(p
, 0755));
2686 if (r
< 0 && r
!= -EEXIST
) {
2688 log_debug_errno(r
, "Failed to create %s, skipping journal setup: %m", p
);
2691 return log_error_errno(r
, "Failed to create %s: %m", p
);
2694 } else if (access(p
, F_OK
) < 0)
2697 if (dir_is_empty(q
, /* ignore_hidden_or_backup= */ false) == 0)
2698 log_warning("%s is not empty, proceeding anyway.", q
);
2700 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2702 return log_error_errno(r
, "Failed to create %s: %m", q
);
2704 r
= mount_nofollow_verbose(LOG_DEBUG
, p
, q
, NULL
, MS_BIND
, NULL
);
2706 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
2711 static int drop_capabilities(uid_t uid
) {
2712 CapabilityQuintet q
;
2714 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2715 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2716 * arg_caps_retain. */
2718 if (capability_quintet_is_set(&arg_full_capabilities
)) {
2719 q
= arg_full_capabilities
;
2721 if (q
.bounding
== UINT64_MAX
)
2722 q
.bounding
= uid
== 0 ? arg_caps_retain
: 0;
2724 if (q
.effective
== UINT64_MAX
)
2725 q
.effective
= uid
== 0 ? q
.bounding
: 0;
2727 if (q
.inheritable
== UINT64_MAX
)
2728 q
.inheritable
= uid
== 0 ? q
.bounding
: arg_caps_ambient
;
2730 if (q
.permitted
== UINT64_MAX
)
2731 q
.permitted
= uid
== 0 ? q
.bounding
: arg_caps_ambient
;
2733 if (q
.ambient
== UINT64_MAX
&& ambient_capabilities_supported())
2734 q
.ambient
= arg_caps_ambient
;
2736 if (capability_quintet_mangle(&q
))
2737 return log_error_errno(SYNTHETIC_ERRNO(EPERM
), "Cannot set capabilities that are not in the current bounding set.");
2740 q
= (CapabilityQuintet
) {
2741 .bounding
= arg_caps_retain
,
2742 .effective
= uid
== 0 ? arg_caps_retain
: 0,
2743 .inheritable
= uid
== 0 ? arg_caps_retain
: arg_caps_ambient
,
2744 .permitted
= uid
== 0 ? arg_caps_retain
: arg_caps_ambient
,
2745 .ambient
= ambient_capabilities_supported() ? arg_caps_ambient
: UINT64_MAX
,
2748 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2749 * in order to maintain the same behavior as systemd < 242. */
2750 if (capability_quintet_mangle(&q
))
2751 log_full(arg_quiet
? LOG_DEBUG
: LOG_WARNING
,
2752 "Some capabilities will not be set because they are not in the current bounding set.");
2756 return capability_quintet_enforce(&q
);
2759 static int reset_audit_loginuid(void) {
2760 _cleanup_free_
char *p
= NULL
;
2763 if ((arg_clone_ns_flags
& CLONE_NEWPID
) == 0)
2766 r
= read_one_line_file("/proc/self/loginuid", &p
);
2770 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
2772 /* Already reset? */
2773 if (streq(p
, "4294967295"))
2776 r
= write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER
);
2779 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2780 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2781 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2782 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2783 "using systemd-nspawn. Sleeping for 5s... (%m)");
2791 static int mount_tunnel_dig(const char *root
) {
2795 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2796 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2797 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
2798 (void) mkdir_p(p
, 0600);
2800 r
= userns_mkdir(root
, "/run/host", 0755, 0, 0);
2802 return log_error_errno(r
, "Failed to create /run/host: %m");
2804 r
= userns_mkdir(root
, NSPAWN_MOUNT_TUNNEL
, 0600, 0, 0);
2806 return log_error_errno(r
, "Failed to create "NSPAWN_MOUNT_TUNNEL
": %m");
2808 q
= prefix_roota(root
, NSPAWN_MOUNT_TUNNEL
);
2809 r
= mount_nofollow_verbose(LOG_ERR
, p
, q
, NULL
, MS_BIND
, NULL
);
2813 r
= mount_nofollow_verbose(LOG_ERR
, NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
);
2820 static int mount_tunnel_open(void) {
2823 r
= mount_follow_verbose(LOG_ERR
, NULL
, NSPAWN_MOUNT_TUNNEL
, NULL
, MS_SLAVE
, NULL
);
2830 static int setup_machine_id(const char *directory
) {
2833 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2834 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2835 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2836 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2837 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2838 * container behaves nicely). */
2840 r
= id128_get_machine(directory
, &arg_uuid
);
2842 if (!ERRNO_IS_MACHINE_ID_UNSET(r
)) /* If the file is missing, empty, or uninitialized, we don't mind */
2843 return log_error_errno(r
, "Failed to read machine ID from container image: %m");
2845 if (sd_id128_is_null(arg_uuid
)) {
2846 r
= sd_id128_randomize(&arg_uuid
);
2848 return log_error_errno(r
, "Failed to acquire randomized machine UUID: %m");
2855 static int recursive_chown(const char *directory
, uid_t shift
, uid_t range
) {
2860 if (arg_userns_mode
== USER_NAMESPACE_NO
|| arg_userns_ownership
!= USER_NAMESPACE_OWNERSHIP_CHOWN
)
2863 r
= path_patch_uid(directory
, arg_uid_shift
, arg_uid_range
);
2864 if (r
== -EOPNOTSUPP
)
2865 return log_error_errno(r
, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2867 return log_error_errno(r
, "Upper 16 bits of root directory UID and GID do not match.");
2869 return log_error_errno(r
, "Failed to adjust UID/GID shift of OS tree: %m");
2871 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2873 log_debug("Patched directory tree to match UID/GID range.");
2880 * < 0 : wait_for_terminate() failed to get the state of the
2881 * container, the container was terminated by a signal, or
2882 * failed for an unknown reason. No change is made to the
2883 * container argument.
2884 * > 0 : The program executed in the container terminated with an
2885 * error. The exit code of the program executed in the
2886 * container is returned. The container argument has been set
2887 * to CONTAINER_TERMINATED.
2888 * 0 : The container is being rebooted, has been shut down or exited
2889 * successfully. The container argument has been set to either
2890 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2892 * That is, success is indicated by a return value of zero, and an
2893 * error is indicated by a non-zero value.
2895 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2899 r
= wait_for_terminate(pid
, &status
);
2901 return log_warning_errno(r
, "Failed to wait for container: %m");
2903 switch (status
.si_code
) {
2906 if (status
.si_status
== 0)
2907 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2909 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2911 *container
= CONTAINER_TERMINATED
;
2912 return status
.si_status
;
2915 if (status
.si_status
== SIGINT
) {
2916 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2917 *container
= CONTAINER_TERMINATED
;
2920 } else if (status
.si_status
== SIGHUP
) {
2921 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2922 *container
= CONTAINER_REBOOTED
;
2928 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
2929 "Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2932 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
2933 "Container %s failed due to unknown reason.", arg_machine
);
2937 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2940 pid
= PTR_TO_PID(userdata
);
2942 if (kill(pid
, arg_kill_signal
) >= 0) {
2943 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2944 sd_event_source_set_userdata(s
, NULL
);
2949 sd_event_exit(sd_event_source_get_event(s
), 0);
2953 static int on_sigchld(sd_event_source
*s
, const struct signalfd_siginfo
*ssi
, void *userdata
) {
2959 pid
= PTR_TO_PID(userdata
);
2964 if (waitid(P_ALL
, 0, &si
, WNOHANG
|WNOWAIT
|WEXITED
) < 0)
2965 return log_error_errno(errno
, "Failed to waitid(): %m");
2966 if (si
.si_pid
== 0) /* No pending children. */
2968 if (si
.si_pid
== pid
) {
2969 /* The main process we care for has exited. Return from
2970 * signal handler but leave the zombie. */
2971 sd_event_exit(sd_event_source_get_event(s
), 0);
2975 /* Reap all other children. */
2976 (void) waitid(P_PID
, si
.si_pid
, &si
, WNOHANG
|WEXITED
);
2982 static int on_request_stop(sd_bus_message
*m
, void *userdata
, sd_bus_error
*error
) {
2987 pid
= PTR_TO_PID(userdata
);
2989 if (arg_kill_signal
> 0) {
2990 log_info("Container termination requested. Attempting to halt container.");
2991 (void) kill(pid
, arg_kill_signal
);
2993 log_info("Container termination requested. Exiting.");
2994 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m
)), 0);
3000 static int determine_names(void) {
3003 if (arg_template
&& !arg_directory
&& arg_machine
) {
3005 /* If --template= was specified then we should not
3006 * search for a machine, but instead create a new one
3007 * in /var/lib/machine. */
3009 arg_directory
= path_join("/var/lib/machines", arg_machine
);
3014 if (!arg_image
&& !arg_directory
) {
3016 _cleanup_(image_unrefp
) Image
*i
= NULL
;
3018 r
= image_find(IMAGE_MACHINE
, arg_machine
, NULL
, &i
);
3020 return log_error_errno(r
, "No image for machine '%s'.", arg_machine
);
3022 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
3024 if (IN_SET(i
->type
, IMAGE_RAW
, IMAGE_BLOCK
))
3025 r
= free_and_strdup(&arg_image
, i
->path
);
3027 r
= free_and_strdup(&arg_directory
, i
->path
);
3032 arg_read_only
= arg_read_only
|| i
->read_only
;
3034 r
= safe_getcwd(&arg_directory
);
3036 return log_error_errno(r
, "Failed to determine current directory: %m");
3039 if (!arg_directory
&& !arg_image
)
3040 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Failed to determine path, please use -D or -i.");
3044 if (arg_directory
&& path_equal(arg_directory
, "/"))
3045 arg_machine
= gethostname_malloc();
3046 else if (arg_image
) {
3049 r
= path_extract_filename(arg_image
, &arg_machine
);
3051 return log_error_errno(r
, "Failed to extract file name from '%s': %m", arg_image
);
3053 /* Truncate suffix if there is one */
3054 e
= endswith(arg_machine
, ".raw");
3058 r
= path_extract_filename(arg_directory
, &arg_machine
);
3060 return log_error_errno(r
, "Failed to extract file name from '%s': %m", arg_directory
);
3063 hostname_cleanup(arg_machine
);
3064 if (!hostname_is_valid(arg_machine
, 0))
3065 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Failed to determine machine name automatically, please use -M.");
3067 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3068 * to match fixed config file names. */
3069 arg_settings_filename
= strjoin(arg_machine
, ".nspawn");
3070 if (!arg_settings_filename
)
3073 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3074 * instances at once without manually having to specify -M each time. */
3076 if (strextendf(&arg_machine
, "-%016" PRIx64
, random_u64()) < 0)
3079 arg_settings_filename
= strjoin(arg_machine
, ".nspawn");
3080 if (!arg_settings_filename
)
/* Replaces the path stored in *p by what chase() resolves it to, using the specified chase flags.
 * A NULL path is left alone. Returns a negative errno-style value if resolving fails, otherwise the
 * result of free_and_replace(). */
static int chase_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        /* Nothing configured, nothing to resolve */
        if (*p == NULL)
                return 0;

        r = chase(*p, NULL, flags, &resolved, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        /* Drop the old string and take over the resolved one */
        return free_and_replace(*p, resolved);
}
3103 static int determine_uid_shift(const char *directory
) {
3105 if (arg_userns_mode
== USER_NAMESPACE_NO
) {
3110 if (arg_uid_shift
== UID_INVALID
) {
3113 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3115 if (stat(directory
, &st
) < 0)
3116 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
3118 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
3120 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000)))
3121 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
3122 "UID and GID base of %s don't match.", directory
);
3124 arg_uid_range
= UINT32_C(0x10000);
3126 if (arg_uid_shift
!= 0) {
3127 /* If the image is shifted already, then we'll fall back to classic chowning, for
3128 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3130 if (arg_userns_ownership
== USER_NAMESPACE_OWNERSHIP_AUTO
) {
3131 log_debug("UID base of %s is non-zero, not using UID mapping.", directory
);
3132 arg_userns_ownership
= USER_NAMESPACE_OWNERSHIP_CHOWN
;
3133 } else if (arg_userns_ownership
== USER_NAMESPACE_OWNERSHIP_MAP
)
3134 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
3135 "UID base of %s is not zero, UID mapping not supported.", directory
);
3139 if (!userns_shift_range_valid(arg_uid_shift
, arg_uid_range
))
3140 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "UID base too high for UID range.");
3145 static unsigned long effective_clone_ns_flags(void) {
3146 unsigned long flags
= arg_clone_ns_flags
;
3148 if (arg_private_network
)
3149 flags
|= CLONE_NEWNET
;
3151 flags
|= CLONE_NEWCGROUP
;
3152 if (arg_userns_mode
!= USER_NAMESPACE_NO
)
3153 flags
|= CLONE_NEWUSER
;
3158 static int patch_sysctl(void) {
3160 /* This table is inspired by runc's sysctl() function */
3161 static const struct {
3164 unsigned long clone_flags
;
3166 { "kernel.hostname", false, CLONE_NEWUTS
},
3167 { "kernel.domainname", false, CLONE_NEWUTS
},
3168 { "kernel.msgmax", false, CLONE_NEWIPC
},
3169 { "kernel.msgmnb", false, CLONE_NEWIPC
},
3170 { "kernel.msgmni", false, CLONE_NEWIPC
},
3171 { "kernel.sem", false, CLONE_NEWIPC
},
3172 { "kernel.shmall", false, CLONE_NEWIPC
},
3173 { "kernel.shmmax", false, CLONE_NEWIPC
},
3174 { "kernel.shmmni", false, CLONE_NEWIPC
},
3175 { "fs.mqueue.", true, CLONE_NEWIPC
},
3176 { "net.", true, CLONE_NEWNET
},
3179 unsigned long flags
;
3182 flags
= effective_clone_ns_flags();
3184 STRV_FOREACH_PAIR(k
, v
, arg_sysctl
) {
3188 for (i
= 0; i
< ELEMENTSOF(safe_sysctl
); i
++) {
3190 if (!FLAGS_SET(flags
, safe_sysctl
[i
].clone_flags
))
3193 if (safe_sysctl
[i
].prefix
)
3194 good
= startswith(*k
, safe_sysctl
[i
].key
);
3196 good
= streq(*k
, safe_sysctl
[i
].key
);
3203 return log_error_errno(SYNTHETIC_ERRNO(EPERM
), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k
);
3205 r
= sysctl_write(*k
, *v
);
3207 return log_error_errno(r
, "Failed to write sysctl '%s': %m", *k
);
3213 static int inner_child(
3215 int fd_inner_socket
,
3217 char **os_release_pairs
) {
3219 _cleanup_free_
char *home
= NULL
;
3222 (char*) "PATH=" DEFAULT_PATH_COMPAT
,
3223 NULL
, /* container */
3228 NULL
, /* container_uuid */
3229 NULL
, /* LISTEN_FDS */
3230 NULL
, /* LISTEN_PID */
3231 NULL
, /* NOTIFY_SOCKET */
3232 NULL
, /* CREDENTIALS_DIRECTORY */
3236 const char *exec_target
;
3237 _cleanup_strv_free_
char **env_use
= NULL
;
3238 int r
, which_failed
;
3240 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3241 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3242 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3243 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3244 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3245 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3248 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3249 * unshare(). See below. */
3252 assert(fd_inner_socket
>= 0);
3254 log_debug("Inner child is initializing.");
3256 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
3257 /* Tell the parent, that it now can write the UID map. */
3258 (void) barrier_place(barrier
); /* #1 */
3260 /* Wait until the parent wrote the UID map */
3261 if (!barrier_place_and_sync(barrier
)) /* #2 */
3262 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Parent died too early");
3264 /* Become the new root user inside our namespace */
3265 r
= reset_uid_gid();
3267 return log_error_errno(r
, "Couldn't become new root: %m");
3269 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3270 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3271 * propagation, but simply create new peer groups for all our mounts). */
3272 r
= mount_follow_verbose(LOG_ERR
, NULL
, "/", NULL
, MS_SHARED
|MS_REC
, NULL
);
3278 arg_mount_settings
| MOUNT_IN_USERNS
,
3280 arg_selinux_apifs_context
);
3284 if (!arg_network_namespace_path
&& arg_private_network
) {
3285 r
= unshare(CLONE_NEWNET
);
3287 return log_error_errno(errno
, "Failed to unshare network namespace: %m");
3289 /* Tell the parent that it can setup network interfaces. */
3290 (void) barrier_place(barrier
); /* #3 */
3293 r
= mount_sysfs(NULL
, arg_mount_settings
);
3297 /* Wait until we are cgroup-ified, so that we
3298 * can mount the right cgroup path writable */
3299 if (!barrier_place_and_sync(barrier
)) /* #4 */
3300 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
),
3301 "Parent died too early");
3304 r
= unshare(CLONE_NEWCGROUP
);
3306 return log_error_errno(errno
, "Failed to unshare cgroup namespace: %m");
3309 arg_unified_cgroup_hierarchy
,
3310 arg_userns_mode
!= USER_NAMESPACE_NO
,
3313 arg_selinux_apifs_context
,
3316 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
3320 r
= setup_boot_id();
3324 r
= setup_kmsg(fd_inner_socket
);
3331 arg_n_custom_mounts
,
3334 arg_selinux_apifs_context
,
3335 MOUNT_NON_ROOT_ONLY
| MOUNT_IN_USERNS
);
3340 return log_error_errno(errno
, "setsid() failed: %m");
3342 if (arg_private_network
)
3343 (void) loopback_setup();
3345 if (arg_expose_ports
) {
3346 r
= expose_port_send_rtnl(fd_inner_socket
);
3351 if (arg_console_mode
!= CONSOLE_PIPE
) {
3352 _cleanup_close_
int master
= -EBADF
;
3353 _cleanup_free_
char *console
= NULL
;
3355 /* Allocate a pty and make it available as /dev/console. */
3356 master
= openpt_allocate(O_RDWR
|O_NONBLOCK
, &console
);
3358 return log_error_errno(master
, "Failed to allocate a pty: %m");
3360 r
= setup_dev_console(console
);
3362 return log_error_errno(r
, "Failed to set up /dev/console: %m");
3364 r
= send_one_fd(fd_inner_socket
, master
, 0);
3366 return log_error_errno(r
, "Failed to send master fd: %m");
3368 r
= setup_stdio_as_dev_console();
3377 if (arg_oom_score_adjust_set
) {
3378 r
= set_oom_score_adjust(arg_oom_score_adjust
);
3380 return log_error_errno(r
, "Failed to adjust OOM score: %m");
3383 if (arg_cpu_set
.set
)
3384 if (sched_setaffinity(0, arg_cpu_set
.allocated
, arg_cpu_set
.set
) < 0)
3385 return log_error_errno(errno
, "Failed to set CPU affinity: %m");
3387 (void) setup_hostname();
3389 if (arg_personality
!= PERSONALITY_INVALID
) {
3390 r
= safe_personality(arg_personality
);
3392 return log_error_errno(r
, "personality() failed: %m");
3393 #ifdef ARCHITECTURE_SECONDARY
3394 } else if (arg_architecture
== ARCHITECTURE_SECONDARY
) {
3395 r
= safe_personality(PER_LINUX32
);
3397 return log_error_errno(r
, "personality() failed: %m");
3399 } else if (arg_architecture
>= 0 && arg_architecture
!= native_architecture())
3400 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
),
3401 "Selected architecture '%s' not supported locally, refusing.",
3402 architecture_to_string(arg_architecture
));
3404 r
= setrlimit_closest_all((const struct rlimit
*const*) arg_rlimit
, &which_failed
);
3406 return log_error_errno(r
, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed
));
3411 if (is_seccomp_available()) {
3413 r
= seccomp_load(arg_seccomp
);
3414 if (ERRNO_IS_SECCOMP_FATAL(r
))
3415 return log_error_errno(r
, "Failed to install seccomp filter: %m");
3417 log_debug_errno(r
, "Failed to install seccomp filter: %m");
3422 r
= setup_seccomp(arg_caps_retain
, arg_syscall_allow_list
, arg_syscall_deny_list
);
3427 if (arg_suppress_sync
) {
3429 r
= seccomp_suppress_sync();
3431 log_debug_errno(r
, "Failed to install sync() suppression seccomp filter, ignoring: %m");
3433 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
3438 if (arg_selinux_context
)
3439 if (setexeccon(arg_selinux_context
) < 0)
3440 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
3443 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3444 * if we need to later on. */
3445 if (prctl(PR_SET_KEEPCAPS
, 1) < 0)
3446 return log_error_errno(errno
, "Failed to set PR_SET_KEEPCAPS: %m");
3448 if (uid_is_valid(arg_uid
) || gid_is_valid(arg_gid
))
3449 r
= change_uid_gid_raw(arg_uid
, arg_gid
, arg_supplementary_gids
, arg_n_supplementary_gids
, arg_console_mode
!= CONSOLE_PIPE
);
3451 r
= change_uid_gid(arg_user
, arg_console_mode
!= CONSOLE_PIPE
, &home
);
3455 r
= drop_capabilities(getuid());
3457 return log_error_errno(r
, "Dropping capabilities failed: %m");
3459 if (arg_no_new_privileges
)
3460 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0)
3461 return log_error_errno(errno
, "Failed to disable new privileges: %m");
3463 /* LXC sets container=lxc, so follow the scheme here */
3464 envp
[n_env
++] = strjoina("container=", arg_container_service_name
);
3466 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
3470 if (home
|| !uid_is_valid(arg_uid
) || arg_uid
== 0)
3471 if (asprintf(envp
+ n_env
++, "HOME=%s", home
?: "/root") < 0)
3474 if (arg_user
|| !uid_is_valid(arg_uid
) || arg_uid
== 0)
3475 if (asprintf(envp
+ n_env
++, "USER=%s", arg_user
?: "root") < 0 ||
3476 asprintf(envp
+ n_env
++, "LOGNAME=%s", arg_user
?: "root") < 0)
3479 assert(!sd_id128_is_null(arg_uuid
));
3481 if (asprintf(envp
+ n_env
++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid
)) < 0)
3484 if (fdset_size(fds
) > 0) {
3485 r
= fdset_cloexec(fds
, false);
3487 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
3489 if ((asprintf(envp
+ n_env
++, "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
3490 (asprintf(envp
+ n_env
++, "LISTEN_PID=1") < 0))
3493 if (asprintf(envp
+ n_env
++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH
) < 0)
3496 if (arg_n_credentials
> 0) {
3497 envp
[n_env
] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3503 if (arg_start_mode
!= START_BOOT
) {
3504 envp
[n_env
] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE
);
3510 env_use
= strv_env_merge(envp
, os_release_pairs
, arg_setenv
);
3514 /* Let the parent know that we are ready and
3515 * wait until the parent is ready with the
3517 if (!barrier_place_and_sync(barrier
)) /* #5 */
3518 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Parent died too early");
3521 if (chdir(arg_chdir
) < 0)
3522 return log_error_errno(errno
, "Failed to change to specified working directory %s: %m", arg_chdir
);
3524 if (arg_start_mode
== START_PID2
) {
3525 r
= stub_pid1(arg_uuid
);
3530 if (arg_console_mode
!= CONSOLE_PIPE
) {
3531 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3532 * are configured for that. Acquire it as controlling tty. */
3533 if (ioctl(STDIN_FILENO
, TIOCSCTTY
) < 0)
3534 return log_error_errno(errno
, "Failed to acquire controlling TTY: %m");
3537 log_debug("Inner child completed, invoking payload.");
3539 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3540 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3541 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3543 log_set_open_when_needed(true);
3544 log_settle_target();
3546 (void) fdset_close_others(fds
);
3548 if (arg_start_mode
== START_BOOT
) {
3552 /* Automatically search for the init system */
3554 m
= strv_length(arg_parameters
);
3555 a
= newa(char*, m
+ 2);
3556 memcpy_safe(a
+ 1, arg_parameters
, m
* sizeof(char*));
3559 FOREACH_STRING(init
,
3560 "/usr/lib/systemd/systemd",
3561 "/lib/systemd/systemd",
3563 a
[0] = (char*) init
;
3564 execve(a
[0], a
, env_use
);
3567 exec_target
= "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3568 } else if (!strv_isempty(arg_parameters
)) {
3569 const char *dollar_path
;
3571 exec_target
= arg_parameters
[0];
3573 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3575 dollar_path
= strv_env_get(env_use
, "PATH");
3577 if (setenv("PATH", dollar_path
, 1) < 0)
3578 return log_error_errno(errno
, "Failed to update $PATH: %m");
3581 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
3584 /* If we cannot change the directory, we'll end up in /, that is expected. */
3585 (void) chdir(home
?: "/root");
3587 execle(DEFAULT_USER_SHELL
, "-" DEFAULT_USER_SHELL_NAME
, NULL
, env_use
);
3588 if (!streq(DEFAULT_USER_SHELL
, "/bin/bash"))
3589 execle("/bin/bash", "-bash", NULL
, env_use
);
3590 if (!streq(DEFAULT_USER_SHELL
, "/bin/sh"))
3591 execle("/bin/sh", "-sh", NULL
, env_use
);
3593 exec_target
= DEFAULT_USER_SHELL
", /bin/bash, /bin/sh";
3596 return log_error_errno(errno
, "execv(%s) failed: %m", exec_target
);
/* Sets up the datagram socket bound at NSPAWN_NOTIFY_SOCKET_PATH.
 *
 * Visible steps: allocate an AF_UNIX/SOCK_DGRAM socket (close-on-exec, non-blocking), create the parent
 * directories of the socket path, unlink whatever is at the socket address, bind(), chown the path
 * via userns_lchown(path, 0, 0) and finally enable SO_PASSCRED on the socket.
 *
 * Every failure that is checked here is logged with log_error_errno() and returned to the caller.
 *
 * NOTE(review): this listing is missing lines (e.g. the declaration of 'r', the conditions guarding the
 * error returns and the final return statement). outer_child() assigns the result of this function to
 * an fd variable, so presumably the socket fd is returned on success -- confirm. */
3599 static int setup_notify_child(void) {
3600 _cleanup_close_
int fd
= -EBADF
;
3601 static const union sockaddr_union sa
= {
3602 .un
.sun_family
= AF_UNIX
,
3603 .un
.sun_path
= NSPAWN_NOTIFY_SOCKET_PATH
,
3607 fd
= socket(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
|SOCK_NONBLOCK
, 0);
3609 return log_error_errno(errno
, "Failed to allocate notification socket: %m");
/* Both calls are best-effort: their results are explicitly discarded. */
3611 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH
, 0755);
3612 (void) sockaddr_un_unlink(&sa
.un
);
3614 r
= bind(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
));
3616 return log_error_errno(errno
, "bind(" NSPAWN_NOTIFY_SOCKET_PATH
") failed: %m");
3618 r
= userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH
, 0, 0);
3620 return log_error_errno(r
, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH
": %m");
/* nspawn_dispatch_notify_fd() looks for SCM_CREDENTIALS on every datagram it receives, hence ask for
 * credentials to be attached to incoming messages. */
3622 r
= setsockopt_int(fd
, SOL_SOCKET
, SO_PASSCRED
, true);
3624 return log_error_errno(r
, "SO_PASSCRED failed: %m");
/* Entry point of the "outer" child process.
 *
 * Visible responsibilities, in order: collect host os-release data, arrange to be killed with the
 * parent, fix up mount propagation, mount the image/directory, negotiate the UID shift and cgroup mode
 * with the parent over fd_outer_socket, populate the container tree (devices, /run/host, timezone,
 * resolv.conf, machine ID, journal, ...), switch root, create the notification socket, fork the inner
 * child via raw_clone() and report the inner child's PID, the machine ID and the notification fd back
 * to the parent.
 *
 * fd_outer_socket is the channel used below for all send()/recv()/send_one_fd() exchanges with the
 * parent; fd_inner_socket is passed on to inner_child().
 *
 * NOTE(review): this listing is missing lines (part of the parameter list, local declarations, most
 * "if (r < 0) return r;" style checks and the closing lines of some block comments); the comments
 * added here describe only what is visible. */
3629 static int outer_child(
3631 const char *directory
,
3632 DissectedImage
*dissected_image
,
3633 int fd_outer_socket
,
3634 int fd_inner_socket
,
3638 _cleanup_(bind_user_context_freep
) BindUserContext
*bind_user_context
= NULL
;
3639 _cleanup_strv_free_
char **os_release_pairs
= NULL
;
3640 _cleanup_close_
int fd
= -EBADF
, mntns_fd
= -EBADF
;
3647 /* This is the "outer" child process, i.e. the one forked off by the container manager itself. It
3648 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3649 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3650 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3651 * forked off it, and it exits. */
3655 assert(fd_outer_socket
>= 0);
3656 assert(fd_inner_socket
>= 0);
3658 log_debug("Outer child is initializing.");
/* Read the host's os-release fields, prefixed with "container_host_"; the result is later handed to
 * inner_child(). A failure here is only logged at debug level. */
3660 r
= load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs
);
3662 log_debug_errno(r
, "Failed to read os-release from host for container, ignoring: %m");
/* Request SIGKILL as parent-death signal for this process. */
3664 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
3665 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
3667 r
= reset_audit_loginuid();
3671 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3672 * mounts to the real root. */
3673 r
= mount_follow_verbose(LOG_ERR
, NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
);
3677 if (dissected_image
) {
3678 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3679 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3680 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3681 * right place right away. This makes sure ESP partitions and userns are compatible. */
3683 r
= dissected_image_mount_and_warn(
3688 DISSECT_IMAGE_MOUNT_ROOT_ONLY
|
3689 DISSECT_IMAGE_DISCARD_ON_LOOP
|
3690 DISSECT_IMAGE_USR_NO_ROOT
|
3691 (arg_read_only
? DISSECT_IMAGE_READ_ONLY
: DISSECT_IMAGE_FSCK
|DISSECT_IMAGE_GROWFS
)|
3692 (arg_start_mode
== START_BOOT
? DISSECT_IMAGE_VALIDATE_OS
: 0));
3697 r
= determine_uid_shift(directory
);
/* With user namespacing enabled: pin our mount namespace and pass the fd to the parent, then send the
 * UID shift we determined and, in "pick" mode, read back the shift the parent settled on. */
3701 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
3702 r
= namespace_open(0, NULL
, &mntns_fd
, NULL
, NULL
, NULL
);
3704 return log_error_errno(r
, "Failed to pin outer mount namespace: %m");
3706 l
= send_one_fd(fd_outer_socket
, mntns_fd
, 0);
3708 return log_error_errno(l
, "Failed to send outer mount namespace fd: %m");
3709 mntns_fd
= safe_close(mntns_fd
);
3711 /* Let the parent know which UID shift we read from the image */
3712 l
= send(fd_outer_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
3714 return log_error_errno(errno
, "Failed to send UID shift: %m");
3715 if (l
!= sizeof(arg_uid_shift
))
3716 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
3717 "Short write while sending UID shift.");
3719 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
3720 /* When we are supposed to pick the UID shift, the parent will check now whether the
3721 * UID shift we just read from the image is available. If yes, it will send the UID
3722 * shift back to us, if not it will pick a different one, and send it back to us. */
3724 l
= recv(fd_outer_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3726 return log_error_errno(errno
, "Failed to recv UID shift: %m");
3727 if (l
!= sizeof(arg_uid_shift
))
3728 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
3729 "Short read while receiving UID shift.");
3732 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
,
3733 "Selected user namespace base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
3736 if (path_equal(directory
, "/")) {
3737 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3738 * place, so that we can make changes to its mount structure (for example, to implement
3739 * --volatile=) without this interfering with our ability to access files such as
3740 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3741 * (instead of a temporary directory, since we are living in our own mount namespace here
3742 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
3743 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3745 r
= mount_nofollow_verbose(LOG_ERR
, "/", "/run/systemd/nspawn-root", NULL
, MS_BIND
|MS_REC
, NULL
);
3749 directory
= "/run/systemd/nspawn-root";
3752 /* Make sure we always have a mount that we can move to root later on. */
3753 r
= make_mount_point(directory
);
3757 /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
3758 * mount namespace. For the directory we are going to run our container let's turn this off, so that
3759 * we'll live in our own little world from now on, and propagation from the host may only happen via
3760 * the mount tunnel dir, or not at all. */
3761 r
= mount_follow_verbose(LOG_ERR
, NULL
, directory
, NULL
, MS_PRIVATE
|MS_REC
, NULL
);
3765 r
= setup_pivot_root(
3768 arg_pivot_root_old
);
3772 r
= setup_volatile_mode(
3776 arg_selinux_apifs_context
);
3780 r
= bind_user_prepare(
3785 &arg_custom_mounts
, &arg_n_custom_mounts
,
3786 &bind_user_context
);
3790 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& bind_user_context
) {
3791 /* Send the user maps we determined to the parent, so that it installs it in our user
3792 * namespace UID map table */
3794 for (size_t i
= 0; i
< bind_user_context
->n_data
; i
++) {
3796 bind_user_context
->data
[i
].payload_user
->uid
,
3797 bind_user_context
->data
[i
].host_user
->uid
,
3798 (uid_t
) bind_user_context
->data
[i
].payload_group
->gid
,
3799 (uid_t
) bind_user_context
->data
[i
].host_group
->gid
,
3802 l
= send(fd_outer_socket
, map
, sizeof(map
), MSG_NOSIGNAL
);
3804 return log_error_errno(errno
, "Failed to send user UID map: %m");
3805 if (l
!= sizeof(map
))
3806 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
3807 "Short write while sending user UID map.");
3814 arg_n_custom_mounts
,
3817 arg_selinux_apifs_context
,
/* For the "map" and "auto" ownership modes with a non-zero shift, try to set up an ID-mapped mount of
 * the tree. If that is unsupported, "auto" downgrades to recursive chown()ing while any other mode
 * fails with EOPNOTSUPP. */
3822 if (arg_userns_mode
!= USER_NAMESPACE_NO
&&
3823 IN_SET(arg_userns_ownership
, USER_NAMESPACE_OWNERSHIP_MAP
, USER_NAMESPACE_OWNERSHIP_AUTO
) &&
3824 arg_uid_shift
!= 0) {
3826 r
= remount_idmap(directory
, arg_uid_shift
, arg_uid_range
, UID_INVALID
, REMOUNT_IDMAPPING_HOST_ROOT
);
3827 if (r
== -EINVAL
|| ERRNO_IS_NOT_SUPPORTED(r
)) {
3828 /* This might fail because the kernel or file system doesn't support idmapping. We
3829 * can't really distinguish this nicely, nor do we have any guarantees about the
3830 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3831 if (arg_userns_ownership
!= USER_NAMESPACE_OWNERSHIP_AUTO
)
3832 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
),
3833 "ID mapped mounts are apparently not available, sorry.");
3835 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3836 arg_userns_ownership
= USER_NAMESPACE_OWNERSHIP_CHOWN
;
3838 return log_error_errno(r
, "Failed to set up ID mapped mounts: %m");
3840 log_debug("ID mapped mounts available, making use of them.");
3845 if (dissected_image
) {
3846 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3847 r
= dissected_image_mount(
3852 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY
|
3853 DISSECT_IMAGE_DISCARD_ON_LOOP
|
3854 DISSECT_IMAGE_USR_NO_ROOT
|
3855 (arg_read_only
? DISSECT_IMAGE_READ_ONLY
: DISSECT_IMAGE_FSCK
|DISSECT_IMAGE_GROWFS
)|
3856 (idmap
? DISSECT_IMAGE_MOUNT_IDMAPPED
: 0));
3858 return log_error_errno(r
, "File system check for image failed: %m");
3860 return log_error_errno(r
, "Failed to mount image file system: %m");
3863 if (arg_unified_cgroup_hierarchy
== CGROUP_UNIFIED_UNKNOWN
) {
3864 /* OK, we don't know yet which cgroup mode to use. Let's figure it out, and tell the parent. */
3866 r
= detect_unified_cgroup_hierarchy_from_image(directory
);
3870 l
= send(fd_outer_socket
, &arg_unified_cgroup_hierarchy
, sizeof(arg_unified_cgroup_hierarchy
), MSG_NOSIGNAL
);
3872 return log_error_errno(errno
, "Failed to send cgroup mode: %m");
3873 if (l
!= sizeof(arg_unified_cgroup_hierarchy
))
3874 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
3875 "Short write while sending cgroup mode.");
3878 r
= recursive_chown(directory
, arg_uid_shift
, arg_uid_range
);
3882 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
3886 if (arg_read_only
&& arg_volatile_mode
== VOLATILE_NO
&&
3887 !has_custom_root_mount(arg_custom_mounts
, arg_n_custom_mounts
)) {
3888 r
= bind_remount_recursive(directory
, MS_RDONLY
, MS_RDONLY
, NULL
);
3890 return log_error_errno(r
, "Failed to make tree read-only: %m");
3893 r
= mount_all(directory
,
3896 arg_selinux_apifs_context
);
3900 r
= copy_devnodes(directory
);
3904 r
= make_extra_nodes(directory
);
3908 (void) dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
3910 p
= prefix_roota(directory
, "/run/host");
3911 (void) make_inaccessible_nodes(p
, arg_uid_shift
, arg_uid_shift
);
3913 r
= setup_pts(directory
);
3917 r
= mount_tunnel_dig(directory
);
3921 r
= setup_keyring();
3925 r
= setup_credentials(directory
);
3929 r
= bind_user_setup(bind_user_context
, directory
);
3936 arg_n_custom_mounts
,
3939 arg_selinux_apifs_context
,
3940 MOUNT_NON_ROOT_ONLY
);
3944 r
= setup_timezone(directory
);
3948 r
= setup_resolv_conf(directory
);
3952 r
= setup_machine_id(directory
);
3956 r
= setup_journal(directory
);
3960 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3961 p
= prefix_roota(directory
, "/run/host/container-manager");
3962 (void) write_string_file(p
, arg_container_service_name
, WRITE_STRING_FILE_CREATE
);
3964 /* The same stuff as the $container_uuid env var */
3965 p
= prefix_roota(directory
, "/run/host/container-uuid");
3966 (void) write_string_filef(p
, WRITE_STRING_FILE_CREATE
, SD_ID128_UUID_FORMAT_STR
, SD_ID128_FORMAT_VAL(arg_uuid
));
3968 if (!arg_use_cgns
) {
3971 arg_unified_cgroup_hierarchy
,
3972 arg_userns_mode
!= USER_NAMESPACE_NO
,
3975 arg_selinux_apifs_context
,
3981 /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
3982 * mounts available in systemd services inside the container that create a new mount namespace. See
3983 * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
3984 * will inherit the shared propagation mode.
3986 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
3987 * directory mount to root later on.
3988 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3990 r
= mount_switch_root(directory
, MS_SHARED
);
3992 return log_error_errno(r
, "Failed to move root directory: %m");
3994 /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
3995 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
3997 r
= mount_tunnel_open();
4001 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
4002 /* In order to mount procfs and sysfs in an unprivileged container the kernel
4003 * requires that a fully visible instance is already present in the target mount
4004 * namespace. Mount one here so the inner child can mount its own instances. Later
4005 * we umount the temporary instances created here before we actually exec the
4006 * payload. Since the rootfs is shared the umount will propagate into the container.
4007 * Note, the inner child wouldn't be able to unmount the instances on its own since
4008 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
4010 r
= pin_fully_visible_fs();
4015 fd
= setup_notify_child();
4019 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
4020 arg_clone_ns_flags
|
4021 (arg_userns_mode
!= USER_NAMESPACE_NO
? CLONE_NEWUSER
: 0));
4023 return log_error_errno(errno
, "Failed to fork inner child: %m");
4025 fd_outer_socket
= safe_close(fd_outer_socket
);
4027 /* The inner child has all namespaces that are requested, so that we all are owned by the
4028 * user if user namespaces are turned on. */
4030 if (arg_network_namespace_path
) {
4031 r
= namespace_enter(-1, -1, netns_fd
, -1, -1);
4033 return log_error_errno(r
, "Failed to join network namespace: %m");
4036 r
= inner_child(barrier
, fd_inner_socket
, fds
, os_release_pairs
);
4038 _exit(EXIT_FAILURE
);
4040 _exit(EXIT_SUCCESS
);
/* Report to the parent over fd_outer_socket, in this order: the inner child's PID, the machine ID
 * (arg_uuid) and the notification socket fd. Short writes are treated as EIO. */
4043 l
= send(fd_outer_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
4045 return log_error_errno(errno
, "Failed to send PID: %m");
4046 if (l
!= sizeof(pid
))
4047 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
4048 "Short write while sending PID.");
4050 l
= send(fd_outer_socket
, &arg_uuid
, sizeof(arg_uuid
), MSG_NOSIGNAL
);
4052 return log_error_errno(errno
, "Failed to send machine ID: %m");
4053 if (l
!= sizeof(arg_uuid
))
4054 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
4055 "Short write while sending machine ID.");
4057 l
= send_one_fd(fd_outer_socket
, fd
, 0);
4059 return log_error_errno(l
, "Failed to send notify fd: %m");
/* Nothing further is sent from here on; drop the fds explicitly. */
4061 fd_outer_socket
= safe_close(fd_outer_socket
);
4062 fd_inner_socket
= safe_close(fd_inner_socket
);
4063 netns_fd
= safe_close(netns_fd
);
/* Searches for a UID base ("candidate") usable for a user-namespaced container and takes a lock file
 * for it.
 *
 * Only valid in USER_NAMESPACE_PICK mode with a UID range of exactly 0x10000 (both asserted below).
 * On success the lock file is moved into *ret_lock_file and the local copy is reset, so that the
 * cleanup handler does not release the lock that was just handed out.
 *
 * NOTE(review): this listing is missing lines (the loop header, the declaration and initial value of
 * 'candidate', the jumps taken when a check fails and the return statements). 'shift' is presumably
 * both the first candidate and the output for the chosen base, and 'n_tries' presumably bounds the
 * number of attempts -- confirm against the full source. */
4068 static int uid_shift_pick(uid_t
*shift
, LockFile
*ret_lock_file
) {
4069 bool tried_hashed
= false;
4070 unsigned n_tries
= 100;
4075 assert(ret_lock_file
);
4076 assert(arg_userns_mode
== USER_NAMESPACE_PICK
);
4077 assert(arg_uid_range
== 0x10000U
);
/* Directory holding one lock file per UID base; the mkdir() result is deliberately ignored. */
4081 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4084 char lock_path
[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t
) + 1];
4085 _cleanup_(release_lock_file
) LockFile lf
= LOCK_FILE_INIT
;
/* Candidate sanity checks: it is tested against the permitted base window and for its lower 16 bits
 * being zero. */
4090 if (candidate
< CONTAINER_UID_BASE_MIN
|| candidate
> CONTAINER_UID_BASE_MAX
)
4092 if ((candidate
& UINT32_C(0xFFFF)) != 0)
/* Try to take an exclusive, non-blocking lock on the per-base lock file. */
4095 xsprintf(lock_path
, "/run/systemd/nspawn-uid/" UID_FMT
, candidate
);
4096 r
= make_lock_file(lock_path
, LOCK_EX
|LOCK_NB
, &lf
);
4097 if (r
== -EBUSY
) /* Range already taken by another nspawn instance */
4102 /* Make some superficial checks whether the range is currently known in the user database */
4103 if (getpwuid(candidate
))
4105 if (getpwuid(candidate
+ UINT32_C(0xFFFE)))
4107 if (getgrgid(candidate
))
4109 if (getgrgid(candidate
+ UINT32_C(0xFFFE)))
/* Hand the lock over to the caller and neutralize our own copy. */
4112 *ret_lock_file
= lf
;
4113 lf
= (struct LockFile
) LOCK_FILE_INIT
;
/* Candidate generation: the first time a machine name is available, derive the candidate from a keyed
 * hash of the name; random_bytes() is the other source visible here. */
4118 if (arg_machine
&& !tried_hashed
) {
4119 /* Try to hash the base from the container name */
4121 static const uint8_t hash_key
[] = {
4122 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4123 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4126 candidate
= (uid_t
) siphash24(arg_machine
, strlen(arg_machine
), hash_key
);
4128 tried_hashed
= true;
4130 random_bytes(&candidate
, sizeof(candidate
));
/* Fold the raw value into the base window, then clear the lower 16 bits. Clearing bits rounds down, so
 * the result can end up below CONTAINER_UID_BASE_MIN; the window check above tests for that. */
4132 candidate
= (candidate
% (CONTAINER_UID_BASE_MAX
- CONTAINER_UID_BASE_MIN
)) + CONTAINER_UID_BASE_MIN
;
4133 candidate
&= (uid_t
) UINT32_C(0xFFFF0000);
/* Appends one "<container_uid> <host_uid> <range>\n" line to the string referenced by 'p' and returns
 * whatever strextendf() returns.
 *
 * NOTE(review): most of the parameter list is missing from this listing; 'p', 'host_uid' and 'range'
 * are only visible through their use below. */
4137 static int add_one_uid_map(
4139 uid_t container_uid
,
4143 return strextendf(p
,
4144 UID_FMT
" " UID_FMT
" " UID_FMT
"\n",
4145 container_uid
, host_uid
, range
);
/* Builds the text of an ID map in the string 's': bind-user entries are mapped one-to-one to their
 * host IDs, and every gap between them (and after the last one, up to arg_uid_range) is mapped to
 * arg_uid_shift plus the container ID.
 *
 * bind_user_uid[] holds four values per entry; 'offset' selects which pair of each quadruplet is used
 * (0 or 2, asserted below). Entries must be sorted by ascending payload ID and lie below arg_uid_range
 * (both asserted in the loop).
 *
 * NOTE(review): this listing is missing lines (the 'offset' and output parameters, the declaration of
 * 'r', the error checks after add_one_uid_map() and the return path). */
4148 static int make_uid_map_string(
4149 const uid_t bind_user_uid
[],
4150 size_t n_bind_user_uid
,
4154 _cleanup_free_
char *s
= NULL
;
4155 uid_t previous_uid
= 0;
4158 assert(n_bind_user_uid
== 0 || bind_user_uid
);
4159 assert(IN_SET(offset
, 0, 2)); /* used to switch between UID and GID map */
4162 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4163 * quadruplet, consisting of host and container UID + GID. */
4165 for (size_t i
= 0; i
< n_bind_user_uid
; i
++) {
/* Within the selected pair the first value is the payload (container) ID, the second the host ID. */
4166 uid_t payload_uid
= bind_user_uid
[i
*4+offset
],
4167 host_uid
= bind_user_uid
[i
*4+offset
+1];
4169 assert(previous_uid
<= payload_uid
);
4170 assert(payload_uid
< arg_uid_range
);
4172 /* Add a range to close the gap to previous entry */
4173 if (payload_uid
> previous_uid
) {
4174 r
= add_one_uid_map(&s
, previous_uid
, arg_uid_shift
+ previous_uid
, payload_uid
- previous_uid
);
4179 /* Map this specific user */
4180 r
= add_one_uid_map(&s
, payload_uid
, host_uid
, 1);
/* The next gap starts right after the ID that was just mapped. */
4184 previous_uid
= payload_uid
+ 1;
4187 /* And add a range to close the gap to finish the range */
4188 if (arg_uid_range
> previous_uid
) {
4189 r
= add_one_uid_map(&s
, previous_uid
, arg_uid_shift
+ previous_uid
, arg_uid_range
- previous_uid
);
/* Writes the UID map and then the GID map of process 'pid' via /proc/<pid>/uid_map and
 * /proc/<pid>/gid_map.
 *
 * Both map strings come from make_uid_map_string(), using offset 0 for the UID pairs and offset 2 for
 * the GID pairs of bind_user_uid[]. The files are written with WRITE_STRING_FILE_DISABLE_BUFFER.
 * Write failures are logged and returned.
 *
 * NOTE(review): this listing is missing lines (the 'pid' parameter, the declaration of 'r', the
 * handling of make_uid_map_string() failures, whatever happens to 's' between the two map builds, and
 * the final return). */
4200 static int setup_uid_map(
4202 const uid_t bind_user_uid
[],
4203 size_t n_bind_user_uid
) {
/* The buffer is shared by both paths below; "uid_map" and "gid_map" have the same length. */
4205 char uid_map
[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1];
4206 _cleanup_free_
char *s
= NULL
;
4211 /* Build the UID map string */
4212 if (make_uid_map_string(bind_user_uid
, n_bind_user_uid
, 0, &s
) < 0) /* offset=0 contains the UID pair */
4215 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
4216 r
= write_string_file(uid_map
, s
, WRITE_STRING_FILE_DISABLE_BUFFER
);
4218 return log_error_errno(r
, "Failed to write UID map: %m");
4220 /* And now build the GID map string */
4222 if (make_uid_map_string(bind_user_uid
, n_bind_user_uid
, 2, &s
) < 0) /* offset=2 contains the GID pair */
4225 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
4226 r
= write_string_file(uid_map
, s
, WRITE_STRING_FILE_DISABLE_BUFFER
);
4228 return log_error_errno(r
, "Failed to write GID map: %m");
/* sd-event I/O callback for the notification socket (registered by setup_notify_parent()).
 *
 * Receives one datagram, closes any file descriptors that came along with it, and ignores the message
 * unless it carries SCM_CREDENTIALS whose PID equals the inner child PID encoded in 'userdata'. The
 * payload is split into lines; "READY=1" is forwarded via sd_notify() and a "STATUS=" value is
 * forwarded as "STATUS=Container running: <value>".
 *
 * NOTE(review): this listing is missing lines (declarations of 'n', 'r' and 'p', the iovec base, the
 * NUL termination of 'buf', several conditions and all return statements), so the exact return values
 * of the individual branches cannot be stated from here. */
4233 static int nspawn_dispatch_notify_fd(sd_event_source
*source
, int fd
, uint32_t revents
, void *userdata
) {
/* One extra byte beyond NOTIFY_BUFFER_MAX; the iovec below only exposes sizeof(buf)-1 bytes. */
4234 char buf
[NOTIFY_BUFFER_MAX
+1];
4236 struct iovec iovec
= {
4238 .iov_len
= sizeof(buf
)-1,
/* Control buffer sized for one struct ucred plus up to NOTIFY_FD_MAX file descriptors. */
4240 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred
)) +
4241 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX
)) control
;
4242 struct msghdr msghdr
= {
4245 .msg_control
= &control
,
4246 .msg_controllen
= sizeof(control
),
4248 struct ucred
*ucred
;
4250 pid_t inner_child_pid
;
4251 _cleanup_strv_free_
char **tags
= NULL
;
/* The PID is stored directly in the userdata pointer value. */
4256 inner_child_pid
= PTR_TO_PID(userdata
);
4258 if (revents
!= EPOLLIN
) {
4259 log_warning("Got unexpected poll event for notify fd.");
4263 n
= recvmsg_safe(fd
, &msghdr
, MSG_DONTWAIT
|MSG_CMSG_CLOEXEC
);
4265 if (ERRNO_IS_TRANSIENT(n
))
4268 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4271 return log_warning_errno(n
, "Couldn't read notification socket: %m");
/* Received fds are not used by the code visible here; close them all. */
4274 cmsg_close_all(&msghdr
);
4276 ucred
= CMSG_FIND_DATA(&msghdr
, SOL_SOCKET
, SCM_CREDENTIALS
, struct ucred
);
4277 if (!ucred
|| ucred
->pid
!= inner_child_pid
) {
4278 log_debug("Received notify message without valid credentials. Ignoring.");
4282 if ((size_t) n
>= sizeof(buf
)) {
4283 log_warning("Received notify message exceeded maximum size. Ignoring.");
4288 tags
= strv_split(buf
, "\n\r");
4292 if (strv_contains(tags
, "READY=1")) {
4293 r
= sd_notify(false, "READY=1\n");
4295 log_warning_errno(r
, "Failed to send readiness notification, ignoring: %m");
4298 p
= strv_find_startswith(tags
, "STATUS=");
4300 (void) sd_notifyf(false, "STATUS=Container running: %s", p
);
/* Registers 'fd' with 'event' as an EPOLLIN I/O source handled by nspawn_dispatch_notify_fd(), stores
 * the new source in *notify_event_source and labels it "nspawn-notify".
 *
 * NOTE(review): 'inner_child_pid' is passed through unchanged as the callback's userdata, and the
 * callback decodes userdata with PTR_TO_PID(). The argument therefore appears to carry a
 * pointer-encoded PID rather than the address of a pid_t -- confirm against the caller.
 *
 * NOTE(review): the declaration of 'r', the condition guarding the error return and the final return
 * are missing from this listing. */
4305 static int setup_notify_parent(sd_event
*event
, int fd
, pid_t
*inner_child_pid
, sd_event_source
**notify_event_source
) {
4308 r
= sd_event_add_io(event
, notify_event_source
, fd
, EPOLLIN
, nspawn_dispatch_notify_fd
, inner_child_pid
);
4310 return log_error_errno(r
, "Failed to allocate notify event source: %m");
/* Best-effort: failing to set the description is ignored. */
4312 (void) sd_event_source_set_description(*notify_event_source
, "nspawn-notify");
4317 static int merge_settings(Settings
*settings
, const char *path
) {
4323 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4324 * that this steals the fields of the Settings* structure, and hence modifies it. */
4326 if ((arg_settings_mask
& SETTING_START_MODE
) == 0 &&
4327 settings
->start_mode
>= 0) {
4328 arg_start_mode
= settings
->start_mode
;
4329 strv_free_and_replace(arg_parameters
, settings
->parameters
);
4332 if ((arg_settings_mask
& SETTING_EPHEMERAL
) == 0 &&
4333 settings
->ephemeral
>= 0)
4334 arg_ephemeral
= settings
->ephemeral
;
4336 if ((arg_settings_mask
& SETTING_DIRECTORY
) == 0 &&
4339 if (!arg_settings_trusted
)
4340 log_warning("Ignoring root directory setting, file %s is not trusted.", path
);
4342 free_and_replace(arg_directory
, settings
->root
);
4345 if ((arg_settings_mask
& SETTING_PIVOT_ROOT
) == 0 &&
4346 settings
->pivot_root_new
) {
4347 free_and_replace(arg_pivot_root_new
, settings
->pivot_root_new
);
4348 free_and_replace(arg_pivot_root_old
, settings
->pivot_root_old
);
4351 if ((arg_settings_mask
& SETTING_WORKING_DIRECTORY
) == 0 &&
4352 settings
->working_directory
)
4353 free_and_replace(arg_chdir
, settings
->working_directory
);
4355 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
4356 settings
->environment
)
4357 strv_free_and_replace(arg_setenv
, settings
->environment
);
4359 if ((arg_settings_mask
& SETTING_USER
) == 0) {
4362 free_and_replace(arg_user
, settings
->user
);
4364 if (uid_is_valid(settings
->uid
))
4365 arg_uid
= settings
->uid
;
4366 if (gid_is_valid(settings
->gid
))
4367 arg_gid
= settings
->gid
;
4368 if (settings
->n_supplementary_gids
> 0) {
4369 free_and_replace(arg_supplementary_gids
, settings
->supplementary_gids
);
4370 arg_n_supplementary_gids
= settings
->n_supplementary_gids
;
4374 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
4375 uint64_t plus
, minus
;
4376 uint64_t network_minus
= 0;
4379 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4380 * Settings structure */
4382 plus
= settings
->capability
;
4383 minus
= settings
->drop_capability
;
4385 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
4386 settings_network_configured(settings
)) {
4387 if (settings_private_network(settings
))
4388 plus
|= UINT64_C(1) << CAP_NET_ADMIN
;
4390 network_minus
|= UINT64_C(1) << CAP_NET_ADMIN
;
4393 if (!arg_settings_trusted
&& plus
!= 0) {
4394 if (settings
->capability
!= 0)
4395 log_warning("Ignoring Capability= setting, file %s is not trusted.", path
);
4397 arg_caps_retain
&= ~network_minus
;
4398 arg_caps_retain
|= plus
;
4401 arg_caps_retain
&= ~minus
;
4403 /* Copy the full capabilities over too */
4404 if (capability_quintet_is_set(&settings
->full_capabilities
)) {
4405 if (!arg_settings_trusted
)
4406 log_warning("Ignoring capability settings, file %s is not trusted.", path
);
4408 arg_full_capabilities
= settings
->full_capabilities
;
4411 ambient
= settings
->ambient_capability
;
4412 if (!arg_settings_trusted
&& ambient
!= 0)
4413 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path
);
4415 arg_caps_ambient
|= ambient
;
4418 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
4419 settings
->kill_signal
> 0)
4420 arg_kill_signal
= settings
->kill_signal
;
4422 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
4423 settings
->personality
!= PERSONALITY_INVALID
)
4424 arg_personality
= settings
->personality
;
4426 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
4427 !sd_id128_is_null(settings
->machine_id
)) {
4429 if (!arg_settings_trusted
)
4430 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path
);
4432 arg_uuid
= settings
->machine_id
;
4435 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
4436 settings
->read_only
>= 0)
4437 arg_read_only
= settings
->read_only
;
4439 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
4440 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
4441 arg_volatile_mode
= settings
->volatile_mode
;
4443 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
4444 settings
->n_custom_mounts
> 0) {
4446 if (!arg_settings_trusted
)
4447 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path
);
4449 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
4450 arg_custom_mounts
= TAKE_PTR(settings
->custom_mounts
);
4451 arg_n_custom_mounts
= settings
->n_custom_mounts
;
4452 settings
->n_custom_mounts
= 0;
4456 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
4457 settings_network_configured(settings
)) {
4459 if (!arg_settings_trusted
)
4460 log_warning("Ignoring network settings, file %s is not trusted.", path
);
4462 arg_network_veth
= settings_network_veth(settings
);
4463 arg_private_network
= settings_private_network(settings
);
4465 strv_free_and_replace(arg_network_interfaces
, settings
->network_interfaces
);
4466 strv_free_and_replace(arg_network_macvlan
, settings
->network_macvlan
);
4467 strv_free_and_replace(arg_network_ipvlan
, settings
->network_ipvlan
);
4468 strv_free_and_replace(arg_network_veth_extra
, settings
->network_veth_extra
);
4470 free_and_replace(arg_network_bridge
, settings
->network_bridge
);
4471 free_and_replace(arg_network_zone
, settings
->network_zone
);
4473 free_and_replace(arg_network_namespace_path
, settings
->network_namespace_path
);
4477 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
4478 settings
->expose_ports
) {
4480 if (!arg_settings_trusted
)
4481 log_warning("Ignoring Port= setting, file %s is not trusted.", path
);
4483 expose_port_free_all(arg_expose_ports
);
4484 arg_expose_ports
= TAKE_PTR(settings
->expose_ports
);
4488 if ((arg_settings_mask
& SETTING_USERNS
) == 0 &&
4489 settings
->userns_mode
!= _USER_NAMESPACE_MODE_INVALID
) {
4491 if (!arg_settings_trusted
)
4492 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path
);
4494 arg_userns_mode
= settings
->userns_mode
;
4495 arg_uid_shift
= settings
->uid_shift
;
4496 arg_uid_range
= settings
->uid_range
;
4497 arg_userns_ownership
= settings
->userns_ownership
;
4501 if ((arg_settings_mask
& SETTING_BIND_USER
) == 0 &&
4502 !strv_isempty(settings
->bind_user
))
4503 strv_free_and_replace(arg_bind_user
, settings
->bind_user
);
4505 if ((arg_settings_mask
& SETTING_NOTIFY_READY
) == 0 &&
4506 settings
->notify_ready
>= 0)
4507 arg_notify_ready
= settings
->notify_ready
;
4509 if ((arg_settings_mask
& SETTING_SYSCALL_FILTER
) == 0) {
4511 if (!strv_isempty(settings
->syscall_allow_list
) || !strv_isempty(settings
->syscall_deny_list
)) {
4512 if (!arg_settings_trusted
&& !strv_isempty(settings
->syscall_allow_list
))
4513 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path
);
4515 strv_free_and_replace(arg_syscall_allow_list
, settings
->syscall_allow_list
);
4516 strv_free_and_replace(arg_syscall_deny_list
, settings
->syscall_deny_list
);
4521 if (settings
->seccomp
) {
4522 if (!arg_settings_trusted
)
4523 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path
);
4525 seccomp_release(arg_seccomp
);
4526 arg_seccomp
= TAKE_PTR(settings
->seccomp
);
4532 for (rl
= 0; rl
< _RLIMIT_MAX
; rl
++) {
4533 if ((arg_settings_mask
& (SETTING_RLIMIT_FIRST
<< rl
)))
4536 if (!settings
->rlimit
[rl
])
4539 if (!arg_settings_trusted
) {
4540 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl
), path
);
4544 free_and_replace(arg_rlimit
[rl
], settings
->rlimit
[rl
]);
4547 if ((arg_settings_mask
& SETTING_HOSTNAME
) == 0 &&
4549 free_and_replace(arg_hostname
, settings
->hostname
);
4551 if ((arg_settings_mask
& SETTING_NO_NEW_PRIVILEGES
) == 0 &&
4552 settings
->no_new_privileges
>= 0)
4553 arg_no_new_privileges
= settings
->no_new_privileges
;
4555 if ((arg_settings_mask
& SETTING_OOM_SCORE_ADJUST
) == 0 &&
4556 settings
->oom_score_adjust_set
) {
4558 if (!arg_settings_trusted
)
4559 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path
);
4561 arg_oom_score_adjust
= settings
->oom_score_adjust
;
4562 arg_oom_score_adjust_set
= true;
4566 if ((arg_settings_mask
& SETTING_CPU_AFFINITY
) == 0 &&
4567 settings
->cpu_set
.set
) {
4569 if (!arg_settings_trusted
)
4570 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path
);
4572 cpu_set_reset(&arg_cpu_set
);
4573 arg_cpu_set
= settings
->cpu_set
;
4574 settings
->cpu_set
= (CPUSet
) {};
4578 if ((arg_settings_mask
& SETTING_RESOLV_CONF
) == 0 &&
4579 settings
->resolv_conf
!= _RESOLV_CONF_MODE_INVALID
)
4580 arg_resolv_conf
= settings
->resolv_conf
;
4582 if ((arg_settings_mask
& SETTING_LINK_JOURNAL
) == 0 &&
4583 settings
->link_journal
!= _LINK_JOURNAL_INVALID
) {
4585 if (!arg_settings_trusted
)
4586 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path
);
4588 arg_link_journal
= settings
->link_journal
;
4589 arg_link_journal_try
= settings
->link_journal_try
;
4593 if ((arg_settings_mask
& SETTING_TIMEZONE
) == 0 &&
4594 settings
->timezone
!= _TIMEZONE_MODE_INVALID
)
4595 arg_timezone
= settings
->timezone
;
4597 if ((arg_settings_mask
& SETTING_SLICE
) == 0 &&
4600 if (!arg_settings_trusted
)
4601 log_warning("Ignoring slice setting, file '%s' is not trusted.", path
);
4603 free_and_replace(arg_slice
, settings
->slice
);
4606 if ((arg_settings_mask
& SETTING_USE_CGNS
) == 0 &&
4607 settings
->use_cgns
>= 0) {
4609 if (!arg_settings_trusted
)
4610 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path
);
4612 arg_use_cgns
= settings
->use_cgns
;
4615 if ((arg_settings_mask
& SETTING_CLONE_NS_FLAGS
) == 0 &&
4616 settings
->clone_ns_flags
!= ULONG_MAX
) {
4618 if (!arg_settings_trusted
)
4619 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path
);
4621 arg_clone_ns_flags
= settings
->clone_ns_flags
;
4624 if ((arg_settings_mask
& SETTING_CONSOLE_MODE
) == 0 &&
4625 settings
->console_mode
>= 0) {
4627 if (!arg_settings_trusted
)
4628 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path
);
4630 arg_console_mode
= settings
->console_mode
;
4633 if ((arg_settings_mask
& SETTING_SUPPRESS_SYNC
) == 0 &&
4634 settings
->suppress_sync
>= 0)
4635 arg_suppress_sync
= settings
->suppress_sync
;
4637 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4638 * don't consult arg_settings_mask for them. */
4640 sd_bus_message_unref(arg_property_message
);
4641 arg_property_message
= TAKE_PTR(settings
->properties
);
4643 arg_console_width
= settings
->console_width
;
4644 arg_console_height
= settings
->console_height
;
4646 device_node_array_free(arg_extra_nodes
, arg_n_extra_nodes
);
4647 arg_extra_nodes
= TAKE_PTR(settings
->extra_nodes
);
4648 arg_n_extra_nodes
= settings
->n_extra_nodes
;
/* Locates a settings file named arg_settings_filename and merges it into the arg_* configuration by
 * calling merge_settings(), whose result is returned.
 *
 * Search order visible here: the admin directories /etc/systemd/nspawn and /run/systemd/nspawn first,
 * then a file next to arg_image or arg_directory (path built with file_in_same_dir()).
 *
 * If arg_settings_trusted is still negative, it is set to true for the admin directories and to false
 * for the file found next to the image/directory.
 *
 * Visible error paths log via log_error_errno() and return its value.
 *
 * NOTE(review): this listing omits several lines of the function (e.g. the calls that open the file,
 * some error checks and closing braces); the description above covers only what is visible. */
4653 static int load_settings(void) {
4654 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
4655 _cleanup_fclose_
FILE *f
= NULL
;
4656 _cleanup_free_
char *p
= NULL
;
4662 /* If all settings are masked, there's no point in looking for
4663 * the settings file */
4664 if (FLAGS_SET(arg_settings_mask
, _SETTINGS_MASK_ALL
))
4667 /* We first look in the admin's directories in /etc and /run */
4668 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4669 _cleanup_free_
char *j
= NULL
;
/* Candidate path: the current directory joined with arg_settings_filename. */
4671 j
= path_join(i
, arg_settings_filename
);
4679 /* By default, we trust configuration from /etc and /run */
4680 if (arg_settings_trusted
< 0)
4681 arg_settings_trusted
= true;
/* ENOENT is tolerated here; any other errno is logged and returned. */
4686 if (errno
!= ENOENT
)
4687 return log_error_errno(errno
, "Failed to open %s: %m", j
);
4691 /* After that, let's look for a file next to the
4692 * actual image we shall boot. */
4695 r
= file_in_same_dir(arg_image
, arg_settings_filename
, &p
);
4697 return log_error_errno(r
, "Failed to generate settings path from image path: %m");
4698 } else if (arg_directory
) {
4699 r
= file_in_same_dir(arg_directory
, arg_settings_filename
, &p
);
4700 if (r
< 0 && r
!= -EADDRNOTAVAIL
) /* if directory is root fs, don't complain */
4701 return log_error_errno(r
, "Failed to generate settings path from directory path: %m");
4706 if (!f
&& errno
!= ENOENT
)
4707 return log_error_errno(errno
, "Failed to open %s: %m", p
);
4709 /* By default, we do not trust configuration from /var/lib/machines */
4710 if (arg_settings_trusted
< 0)
4711 arg_settings_trusted
= false;
4718 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
/* Load the settings from f (path p) into 'settings'; they are merged into the arg_* variables below. */
4720 r
= settings_load(f
, p
, &settings
);
4724 return merge_settings(settings
, p
);
/* Loads the OCI bundle referenced by arg_oci_bundle with oci_load() and merges the resulting Settings
 * object into the arg_* configuration through merge_settings(), whose result is returned.
 *
 * If arg_settings_trusted is still negative, it is set to true before loading.
 *
 * NOTE(review): the statements following the "!arg_oci_bundle" test and the oci_load() call are not
 * visible in this listing; presumably an early return and an error check, confirm against the full file. */
4727 static int load_oci_bundle(void) {
4728 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
4731 if (!arg_oci_bundle
)
4734 /* By default let's trust OCI bundles */
4735 if (arg_settings_trusted
< 0)
4736 arg_settings_trusted
= true;
/* The bundle path is passed to merge_settings() below as the 'path' used in its log messages. */
4738 r
= oci_load(NULL
, arg_oci_bundle
, &settings
);
4742 return merge_settings(settings
, arg_oci_bundle
);
/* Performs one run of the container and supervises it from the parent side.
 *
 * Visible flow:
 *   1. Create the IPC barrier and the inner/outer socket pairs, unblock SIGCHLD and install a no-op
 *      handler for it, optionally open and validate arg_network_namespace_path.
 *   2. raw_clone() the outer child, which calls outer_child() and then _exit()s.
 *   3. In the parent, read what the outer child sends over fd_outer_socket_pair[0]: mount namespace fd,
 *      UID shift, bind-user UID quadruplets, cgroup mode, inner child PID, machine ID and the
 *      notification socket.
 *   4. Set up the UID map, network interfaces, bus connection/machine registration and cgroups,
 *      synchronising with the child through the numbered barrier points (#1..#5).
 *   5. Run an sd-event loop with signal, notify, port-exposure and pty-forwarding sources.
 *   6. Tear down: terminate the scope, kill the child, move network interfaces back, wait for the
 *      container and unregister the machine.
 *
 * Outputs through pointers: *pid receives the raw_clone() result and later the inner child PID;
 * *master receives the pty fd taken from the inner child (TAKE_FD); *veth_created is set to true once
 * the veth links were created and back to false after remove_veth_links(); *ret is set to
 * EXIT_FORCE_RESTART on the arg_keep_unit reboot path.
 *
 * Return values visible here: the value of log_error_errno()/log_full_errno() on failure, 0 when the
 * run is finished, 1 when the caller should loop again.
 *
 * NOTE(review): this listing omits many lines of the function (most "if (r < 0)" checks, closing
 * braces, several call arguments and local declarations); the description covers only what is visible. */
4745 static int run_container(
4746 DissectedImage
*dissected_image
,
4748 char veth_name
[IFNAMSIZ
], bool *veth_created
,
4749 struct ExposeArgs
*expose_args
,
4750 int *master
, pid_t
*pid
, int *ret
) {
4752 static const struct sigaction sa
= {
4753 .sa_handler
= nop_signal_handler
,
4754 .sa_flags
= SA_NOCLDSTOP
|SA_RESTART
,
4757 _cleanup_(release_lock_file
) LockFile uid_shift_lock
= LOCK_FILE_INIT
;
4758 _cleanup_close_
int etc_passwd_lock
= -EBADF
;
4759 _cleanup_close_pair_
int
4760 fd_inner_socket_pair
[2] = PIPE_EBADF
,
4761 fd_outer_socket_pair
[2] = PIPE_EBADF
;
4763 _cleanup_close_
int notify_socket
= -EBADF
, mntns_fd
= -EBADF
, fd_kmsg_fifo
= -EBADF
;
4764 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
4765 _cleanup_(sd_event_source_unrefp
) sd_event_source
*notify_event_source
= NULL
;
4766 _cleanup_(sd_event_unrefp
) sd_event
*event
= NULL
;
4767 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
4768 _cleanup_(sd_netlink_unrefp
) sd_netlink
*rtnl
= NULL
;
4769 _cleanup_(sd_bus_flush_close_unrefp
) sd_bus
*bus
= NULL
;
4770 _cleanup_free_ uid_t
*bind_user_uid
= NULL
;
4771 size_t n_bind_user_uid
= 0;
4772 ContainerStatus container_status
= 0;
4776 _cleanup_close_
int child_netns_fd
= -EBADF
;
4778 assert_se(sigemptyset(&mask_chld
) == 0);
4779 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
4781 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
4782 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4783 * check with getpwuid() if the specific user already exists. Note that /etc might be
4784 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4785 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4786 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4789 etc_passwd_lock
= take_etc_passwd_lock(NULL
);
4790 if (etc_passwd_lock
< 0 && etc_passwd_lock
!= -EROFS
)
4791 return log_error_errno(etc_passwd_lock
, "Failed to take /etc/passwd lock: %m");
4794 r
= barrier_create(&barrier
);
4796 return log_error_errno(r
, "Cannot initialize IPC barrier: %m");
4798 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, fd_inner_socket_pair
) < 0)
4799 return log_error_errno(errno
, "Failed to create inner socket pair: %m");
4801 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, fd_outer_socket_pair
) < 0)
4802 return log_error_errno(errno
, "Failed to create outer socket pair: %m");
4804 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4805 * parent's blocking calls and give it a chance to call wait() and terminate. */
4806 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
4808 return log_error_errno(errno
, "Failed to change the signal mask: %m");
4810 r
= sigaction(SIGCHLD
, &sa
, NULL
);
4812 return log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
4814 if (arg_network_namespace_path
) {
4815 child_netns_fd
= open(arg_network_namespace_path
, O_RDONLY
|O_NOCTTY
|O_CLOEXEC
);
4816 if (child_netns_fd
< 0)
4817 return log_error_errno(errno
, "Cannot open file %s: %m", arg_network_namespace_path
);
4819 r
= fd_is_ns(child_netns_fd
, CLONE_NEWNET
);
4821 log_debug_errno(r
, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path
);
4823 return log_error_errno(r
, "Failed to check %s fs type: %m", arg_network_namespace_path
);
4825 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
4826 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path
);
/* Create the outer child with raw_clone(); the only namespace flag passed here is CLONE_NEWNS. */
4829 *pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
);
4831 return log_error_errno(errno
, "clone() failed%s: %m",
4833 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4836 /* The outer child only has a file system namespace. */
4837 barrier_set_role(&barrier
, BARRIER_CHILD
);
4839 fd_inner_socket_pair
[0] = safe_close(fd_inner_socket_pair
[0]);
4840 fd_outer_socket_pair
[0] = safe_close(fd_outer_socket_pair
[0]);
4842 (void) reset_all_signal_handlers();
4843 (void) reset_signal_mask();
4845 r
= outer_child(&barrier
,
4848 fd_outer_socket_pair
[1],
4849 fd_inner_socket_pair
[1],
4853 _exit(EXIT_FAILURE
);
4855 _exit(EXIT_SUCCESS
);
/* Parent side from here on: it keeps index 0 of both socket pairs and closes index 1, the ends that
 * were handed to outer_child() above. */
4858 barrier_set_role(&barrier
, BARRIER_PARENT
);
4862 fd_inner_socket_pair
[1] = safe_close(fd_inner_socket_pair
[1]);
4863 fd_outer_socket_pair
[1] = safe_close(fd_outer_socket_pair
[1]);
4865 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
4866 mntns_fd
= receive_one_fd(fd_outer_socket_pair
[0], 0);
4868 return log_error_errno(mntns_fd
, "Failed to receive mount namespace fd from outer child: %m");
4870 /* The child just let us know the UID shift it might have read from the image. */
4871 l
= recv(fd_outer_socket_pair
[0], &arg_uid_shift
, sizeof arg_uid_shift
, 0);
4873 return log_error_errno(errno
, "Failed to read UID shift: %m");
4874 if (l
!= sizeof arg_uid_shift
)
4875 return log_error_errno(SYNTHETIC_ERRNO(EIO
), "Short read while reading UID shift.");
4877 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
4878 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4879 * image, but if that's already in use, pick a new one, and report back to the child,
4880 * which one we now picked. */
4882 r
= uid_shift_pick(&arg_uid_shift
, &uid_shift_lock
);
4884 return log_error_errno(r
, "Failed to pick suitable UID/GID range: %m");
4886 l
= send(fd_outer_socket_pair
[0], &arg_uid_shift
, sizeof arg_uid_shift
, MSG_NOSIGNAL
);
4888 return log_error_errno(errno
, "Failed to send UID shift: %m");
4889 if (l
!= sizeof arg_uid_shift
)
4890 return log_error_errno(SYNTHETIC_ERRNO(EIO
), "Short write while writing UID shift.");
4893 n_bind_user_uid
= strv_length(arg_bind_user
);
4894 if (n_bind_user_uid
> 0) {
4895 /* Right after the UID shift, we'll receive the list of UID mappings for the
4896 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4898 bind_user_uid
= new(uid_t
, n_bind_user_uid
*4);
4902 for (size_t i
= 0; i
< n_bind_user_uid
; i
++) {
4903 l
= recv(fd_outer_socket_pair
[0], bind_user_uid
+ i
*4, sizeof(uid_t
)*4, 0);
4905 return log_error_errno(errno
, "Failed to read user UID map pair: %m");
4906 if (l
!= sizeof(uid_t
)*4)
4907 return log_full_errno(l
== 0 ? LOG_DEBUG
: LOG_WARNING
,
4908 SYNTHETIC_ERRNO(EIO
),
4909 "Short read while reading bind user UID pairs.");
4914 if (arg_unified_cgroup_hierarchy
== CGROUP_UNIFIED_UNKNOWN
) {
4915 /* The child let us know the supported cgroup mode it might have read from the image. */
4916 l
= recv(fd_outer_socket_pair
[0], &arg_unified_cgroup_hierarchy
, sizeof(arg_unified_cgroup_hierarchy
), 0);
4918 return log_error_errno(errno
, "Failed to read cgroup mode: %m");
4919 if (l
!= sizeof(arg_unified_cgroup_hierarchy
))
4920 return log_error_errno(SYNTHETIC_ERRNO(EIO
), "Short read while reading cgroup mode (%zi bytes).%s",
4921 l
, l
== 0 ? " The child is most likely dead." : "");
4924 /* Wait for the outer child. */
4925 r
= wait_for_terminate_and_check("(sd-namespace)", *pid
, WAIT_LOG_ABNORMAL
);
4928 if (r
!= EXIT_SUCCESS
)
4931 /* And now retrieve the PID of the inner child. */
4932 l
= recv(fd_outer_socket_pair
[0], pid
, sizeof *pid
, 0);
4934 return log_error_errno(errno
, "Failed to read inner child PID: %m");
4935 if (l
!= sizeof *pid
)
4936 return log_error_errno(SYNTHETIC_ERRNO(EIO
), "Short read while reading inner child PID.");
4938 /* We also retrieve container UUID in case it was generated by outer child */
4939 l
= recv(fd_outer_socket_pair
[0], &arg_uuid
, sizeof arg_uuid
, 0);
4941 return log_error_errno(errno
, "Failed to read container machine ID: %m");
4942 if (l
!= sizeof(arg_uuid
))
4943 return log_error_errno(SYNTHETIC_ERRNO(EIO
), "Short read while reading container machined ID.");
4945 /* We also retrieve the socket used for notifications generated by outer child */
4946 notify_socket
= receive_one_fd(fd_outer_socket_pair
[0], 0);
4947 if (notify_socket
< 0)
4948 return log_error_errno(notify_socket
,
4949 "Failed to receive notification socket from the outer child: %m");
4951 log_debug("Init process invoked as PID "PID_FMT
, *pid
);
4953 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
4954 if (!barrier_place_and_sync(&barrier
)) /* #1 */
4955 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Child died too early.");
4957 r
= setup_uid_map(*pid
, bind_user_uid
, n_bind_user_uid
);
4961 (void) barrier_place(&barrier
); /* #2 */
4964 if (arg_private_network
) {
4965 if (!arg_network_namespace_path
) {
4966 /* Wait until the child has unshared its network namespace. */
4967 if (!barrier_place_and_sync(&barrier
)) /* #3 */
4968 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Child died too early");
4971 if (child_netns_fd
< 0) {
4972 /* Make sure we have an open file descriptor to the child's network
4973 * namespace so it stays alive even if the child exits. */
4974 r
= namespace_open(*pid
, NULL
, NULL
, &child_netns_fd
, NULL
, NULL
);
4976 return log_error_errno(r
, "Failed to open child network namespace: %m");
4979 r
= move_network_interfaces(child_netns_fd
, arg_network_interfaces
);
4983 if (arg_network_veth
) {
4984 r
= setup_veth(arg_machine
, *pid
, veth_name
,
4985 arg_network_bridge
|| arg_network_zone
);
4991 if (arg_network_bridge
) {
4992 /* Add the interface to a bridge */
4993 r
= setup_bridge(veth_name
, arg_network_bridge
, false);
4998 } else if (arg_network_zone
) {
4999 /* Add the interface to a bridge, possibly creating it */
5000 r
= setup_bridge(veth_name
, arg_network_zone
, true);
5008 r
= setup_veth_extra(arg_machine
, *pid
, arg_network_veth_extra
);
5012 /* We created the primary and extra veth links now; let's remember this, so that we know to
5013 remove them later on. Note that we don't bother with removing veth links that were created
5014 here when their setup failed half-way, because in that case the kernel should be able to
5015 remove them on its own, since they cannot be referenced by anything yet. */
5016 *veth_created
= true;
5018 r
= setup_macvlan(arg_machine
, *pid
, arg_network_macvlan
);
5022 r
= setup_ipvlan(arg_machine
, *pid
, arg_network_ipvlan
);
5027 if (arg_register
|| !arg_keep_unit
) {
5028 r
= sd_bus_default_system(&bus
);
5030 return log_error_errno(r
, "Failed to open system bus: %m");
5032 r
= sd_bus_set_close_on_exit(bus
, false);
5034 return log_error_errno(r
, "Failed to disable close-on-exit behaviour: %m");
5037 if (!arg_keep_unit
) {
5038 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5039 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5040 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5042 r
= sd_bus_match_signal_async(
5045 "org.freedesktop.systemd1",
5047 "org.freedesktop.systemd1.Scope",
5049 on_request_stop
, NULL
, PID_TO_PTR(*pid
));
5051 return log_error_errno(r
, "Failed to request RequestStop match: %m");
5055 r
= register_machine(
5063 arg_custom_mounts
, arg_n_custom_mounts
,
5066 arg_property_message
,
5068 arg_container_service_name
);
5072 } else if (!arg_keep_unit
) {
5078 arg_custom_mounts
, arg_n_custom_mounts
,
5081 arg_property_message
);
5085 } else if (arg_slice
|| arg_property
)
5086 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
5088 r
= create_subcgroup(*pid
, arg_keep_unit
, arg_unified_cgroup_hierarchy
);
5092 r
= sync_cgroup(*pid
, arg_unified_cgroup_hierarchy
, arg_uid_shift
);
5096 r
= chown_cgroup(*pid
, arg_unified_cgroup_hierarchy
, arg_uid_shift
);
5100 /* Notify the child that the parent is ready with all
5101 * its setup (including cgroup-ification), and that
5102 * the child can now hand over control to the code to
5103 * run inside the container. */
5104 (void) barrier_place(&barrier
); /* #4 */
5106 /* Block SIGCHLD here, before notifying child.
5107 * process_pty() will handle it with the other signals. */
5108 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
5110 /* Reset signal to default */
5111 r
= default_signals(SIGCHLD
);
5113 return log_error_errno(r
, "Failed to reset SIGCHLD: %m");
5115 r
= sd_event_new(&event
);
5117 return log_error_errno(r
, "Failed to get default event source: %m");
5119 (void) sd_event_set_watchdog(event
, true);
5122 r
= sd_bus_attach_event(bus
, event
, 0);
5124 return log_error_errno(r
, "Failed to attach bus to event loop: %m");
5127 r
= setup_notify_parent(event
, notify_socket
, PID_TO_PTR(*pid
), ¬ify_event_source
);
5131 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
5132 r
= wipe_fully_visible_fs(mntns_fd
);
5135 mntns_fd
= safe_close(mntns_fd
);
5138 /* Let the child know that we are ready and wait that the child is completely ready now. */
5139 if (!barrier_place_and_sync(&barrier
)) /* #5 */
5140 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Child died too early.");
5142 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
5143 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5144 etc_passwd_lock
= safe_close(etc_passwd_lock
);
5146 (void) sd_notifyf(false,
5147 "STATUS=Container running.\n"
5148 "X_NSPAWN_LEADER_PID=" PID_FMT
, *pid
);
5149 if (!arg_notify_ready
) {
5150 r
= sd_notify(false, "READY=1\n");
5152 log_warning_errno(r
, "Failed to send readiness notification, ignoring: %m");
5155 if (arg_kill_signal
> 0) {
5156 /* Try to kill the init system on SIGINT or SIGTERM */
5157 (void) sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, PID_TO_PTR(*pid
));
5158 (void) sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, PID_TO_PTR(*pid
));
5160 /* Immediately exit */
5161 (void) sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
5162 (void) sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
5165 (void) sd_event_add_signal(event
, NULL
, SIGRTMIN
+18, sigrtmin18_handler
, NULL
);
5167 r
= sd_event_add_memory_pressure(event
, NULL
, NULL
, NULL
);
5169 log_debug_errno(r
, "Failed allocate memory pressure event source, ignoring: %m");
5171 /* Exit when the child exits */
5172 (void) sd_event_add_signal(event
, NULL
, SIGCHLD
, on_sigchld
, PID_TO_PTR(*pid
));
5174 /* Retrieve the kmsg fifo allocated by inner child */
5175 fd_kmsg_fifo
= receive_one_fd(fd_inner_socket_pair
[0], 0);
5176 if (fd_kmsg_fifo
< 0)
5177 return log_error_errno(fd_kmsg_fifo
, "Failed to receive kmsg fifo from inner child: %m");
5179 if (arg_expose_ports
) {
5180 r
= expose_port_watch_rtnl(event
, fd_inner_socket_pair
[0], on_address_change
, expose_args
, &rtnl
);
5184 (void) expose_port_execute(rtnl
, &expose_args
->fw_ctx
, arg_expose_ports
, AF_INET
, &expose_args
->address4
);
5185 (void) expose_port_execute(rtnl
, &expose_args
->fw_ctx
, arg_expose_ports
, AF_INET6
, &expose_args
->address6
);
5188 if (arg_console_mode
!= CONSOLE_PIPE
) {
5189 _cleanup_close_
int fd
= -EBADF
;
5190 PTYForwardFlags flags
= 0;
5192 /* Retrieve the master pty allocated by inner child */
5193 fd
= receive_one_fd(fd_inner_socket_pair
[0], 0);
5195 return log_error_errno(fd
, "Failed to receive master pty from the inner child: %m");
5197 switch (arg_console_mode
) {
5199 case CONSOLE_READ_ONLY
:
5200 flags
|= PTY_FORWARD_READ_ONLY
;
5204 case CONSOLE_INTERACTIVE
:
5205 flags
|= PTY_FORWARD_IGNORE_VHANGUP
;
5207 r
= pty_forward_new(event
, fd
, flags
, &forward
);
5209 return log_error_errno(r
, "Failed to create PTY forwarder: %m");
5211 if (arg_console_width
!= UINT_MAX
|| arg_console_height
!= UINT_MAX
)
5212 (void) pty_forward_set_width_height(forward
,
5214 arg_console_height
);
5218 assert(arg_console_mode
== CONSOLE_PASSIVE
);
5221 *master
= TAKE_FD(fd
);
5224 fd_inner_socket_pair
[0] = safe_close(fd_inner_socket_pair
[0]);
/* Hand control to the event loop; everything below is teardown after it returns. */
5226 r
= sd_event_loop(event
);
5228 return log_error_errno(r
, "Failed to run event loop: %m");
5233 (void) pty_forward_get_last_char(forward
, &last_char
);
5234 forward
= pty_forward_free(forward
);
5236 if (!arg_quiet
&& last_char
!= '\n')
5240 /* Kill if it is not dead yet anyway */
5241 if (!arg_register
&& !arg_keep_unit
&& bus
)
5242 terminate_scope(bus
, arg_machine
);
5244 /* Normally redundant, but better safe than sorry */
5245 (void) kill(*pid
, SIGKILL
);
5247 fd_kmsg_fifo
= safe_close(fd_kmsg_fifo
);
5249 if (arg_private_network
) {
5250 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5251 * to avoid having to move the parent to the child network namespace. */
5252 r
= safe_fork(NULL
, FORK_RESET_SIGNALS
|FORK_DEATHSIG
|FORK_WAIT
|FORK_LOG
, NULL
);
5257 _cleanup_close_
int parent_netns_fd
= -EBADF
;
5259 r
= namespace_open(getpid(), NULL
, NULL
, &parent_netns_fd
, NULL
, NULL
);
5261 log_error_errno(r
, "Failed to open parent network namespace: %m");
5262 _exit(EXIT_FAILURE
);
5265 r
= namespace_enter(-1, -1, child_netns_fd
, -1, -1);
5267 log_error_errno(r
, "Failed to enter child network namespace: %m");
5268 _exit(EXIT_FAILURE
);
5271 r
= move_network_interfaces(parent_netns_fd
, arg_network_interfaces
);
5273 log_error_errno(r
, "Failed to move network interfaces back to parent network namespace: %m");
5275 _exit(r
< 0 ? EXIT_FAILURE
: EXIT_SUCCESS
);
/* Wait for the container; container_status is what the code below uses to tell a plain termination
 * apart from a requested reboot. */
5279 r
= wait_for_container(TAKE_PID(*pid
), &container_status
);
5281 /* Tell machined that we are gone. */
5283 (void) unregister_machine(bus
, arg_machine
);
5286 /* We failed to wait for the container, or the container exited abnormally. */
5288 if (r
> 0 || container_status
== CONTAINER_TERMINATED
) {
5289 /* r > 0 → The container exited with a non-zero status.
5290 * As a special case, we need to replace 133 with a different value,
5291 * because 133 is special-cased in the service file to reboot the container.
5292 * otherwise → The container exited with zero status and a reboot was not requested.
5294 if (r
== EXIT_FORCE_RESTART
)
5295 r
= EXIT_FAILURE
; /* replace 133 with the general failure code */
5297 return 0; /* finito */
5300 /* CONTAINER_REBOOTED, loop again */
5302 if (arg_keep_unit
) {
5303 /* Special handling if we are running as a service: instead of simply
5304 * restarting the machine we want to restart the entire service, so let's
5305 * inform systemd about this with the special exit code 133. The service
5306 * file uses RestartForceExitStatus=133 so that this results in a full
5307 * nspawn restart. This is necessary since we might have cgroup parameters
5308 * set we want to have flushed out. */
5309 *ret
= EXIT_FORCE_RESTART
;
5310 return 0; /* finito */
5313 expose_port_flush(&expose_args
->fw_ctx
, arg_expose_ports
, AF_INET
, &expose_args
->address4
);
5314 expose_port_flush(&expose_args
->fw_ctx
, arg_expose_ports
, AF_INET6
, &expose_args
->address6
);
5316 (void) remove_veth_links(veth_name
, arg_network_veth_extra
);
5317 *veth_created
= false;
5318 return 1; /* loop again */
/* Fills arg_rlimit[] with default resource limits for every limit rl whose (SETTING_RLIMIT_FIRST << rl)
 * bit is not set in arg_settings_mask.
 *
 * Sources of the defaults, as visible here:
 *   - RLIMIT_NPROC and RLIMIT_SIGPENDING: read from PID 1 with prlimit().
 *   - RLIMIT_NOFILE: the kernel_defaults entry with rlim_max replaced by
 *     MIN(read_nr_open(), HIGH_RLIMIT_NOFILE).
 *   - all others: the static kernel_defaults table.
 * The chosen value is duplicated into arg_rlimit[rl] with newdup() and, when debug logging is on,
 * logged in formatted form.
 *
 * A prlimit() failure is logged and returned through log_error_errno().
 *
 * NOTE(review): some lines of this function (closing braces, the out-of-memory handling after
 * newdup(), the final return) are not visible in this listing. */
5321 static int initialize_rlimits(void) {
5322 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
5323 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5324 * container execution environments. */
5326 static const struct rlimit kernel_defaults
[_RLIMIT_MAX
] = {
5327 [RLIMIT_AS
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5328 [RLIMIT_CORE
] = { 0, RLIM_INFINITY
},
5329 [RLIMIT_CPU
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5330 [RLIMIT_DATA
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5331 [RLIMIT_FSIZE
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5332 [RLIMIT_LOCKS
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5333 [RLIMIT_MEMLOCK
] = { DEFAULT_RLIMIT_MEMLOCK
, DEFAULT_RLIMIT_MEMLOCK
},
5334 [RLIMIT_MSGQUEUE
] = { 819200, 819200 },
5335 [RLIMIT_NICE
] = { 0, 0 },
5336 [RLIMIT_NOFILE
] = { 1024, 4096 },
5337 [RLIMIT_RSS
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5338 [RLIMIT_RTPRIO
] = { 0, 0 },
5339 [RLIMIT_RTTIME
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5340 [RLIMIT_STACK
] = { 8388608, RLIM_INFINITY
},
5342 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5343 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5344 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5345 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5346 * that PID 1 changes a number of other resource limits during early initialization which is why we
5347 * don't read the other limits from PID 1 but prefer the static table above. */
5352 for (rl
= 0; rl
< _RLIMIT_MAX
; rl
++) {
5353 /* Let's only fill in what the user hasn't explicitly configured anyway */
5354 if ((arg_settings_mask
& (SETTING_RLIMIT_FIRST
<< rl
)) == 0) {
5355 const struct rlimit
*v
;
5356 struct rlimit buffer
;
/* Pick where the default for this limit comes from: PID 1, the adjusted NOFILE value, or the table. */
5358 if (IN_SET(rl
, RLIMIT_NPROC
, RLIMIT_SIGPENDING
)) {
5359 /* For these two let's read the limits off PID 1. See above for an explanation. */
5361 if (prlimit(1, rl
, NULL
, &buffer
) < 0)
5362 return log_error_errno(errno
, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl
));
5365 } else if (rl
== RLIMIT_NOFILE
) {
5366 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5367 * userspace. Given that nspawn containers are often run without our PID 1,
5368 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5369 * so that container userspace gets similar resources as host userspace
5371 buffer
= kernel_defaults
[rl
];
5372 buffer
.rlim_max
= MIN((rlim_t
) read_nr_open(), (rlim_t
) HIGH_RLIMIT_NOFILE
);
5375 v
= kernel_defaults
+ rl
;
5377 arg_rlimit
[rl
] = newdup(struct rlimit
, v
, 1);
5378 if (!arg_rlimit
[rl
])
5382 if (DEBUG_LOGGING
) {
5383 _cleanup_free_
char *k
= NULL
;
5385 (void) rlimit_format(arg_rlimit
[rl
], &k
);
5386 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl
), k
);
/* Checks that this process is in the same network namespace as udev. The check is only relevant
 * when arg_image is set.
 *
 * Visible steps: create an AF_UNIX SOCK_SEQPACKET socket, connect it to /run/udev/control, obtain the
 * peer credentials with getpeercred(), and compare network namespaces with
 * in_same_namespace(ucred.pid, 0, NAMESPACE_NET).
 *
 * Visible failure paths log an error and return the log_error_errno() value. When the connect fails
 * with -ENOENT or a disconnect error, the error is reported as EOPNOTSUPP; the final EOPNOTSUPP
 * return presumably covers the "different namespace" result, confirm against the full file.
 *
 * NOTE(review): several lines of this function (local declarations, most "if (r < 0)" checks, the
 * success return) are not visible in this listing. */
5393 static int cant_be_in_netns(void) {
5394 _cleanup_close_
int fd
= -EBADF
;
5398 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5399 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5402 if (!arg_image
) /* only matters if --image= is used, i.e. we actually need to use loopback devices */
5405 fd
= socket(AF_UNIX
, SOCK_SEQPACKET
|SOCK_NONBLOCK
|SOCK_CLOEXEC
, 0);
5407 return log_error_errno(errno
, "Failed to allocate udev control socket: %m");
5409 r
= connect_unix_path(fd
, AT_FDCWD
, "/run/udev/control");
5411 if (r
== -ENOENT
|| ERRNO_IS_DISCONNECT(r
))
5412 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
),
5413 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5415 return log_error_errno(r
, "Failed to connect socket to udev control socket: %m");
/* Identify the process on the other end of the control socket; its PID is used for the namespace check. */
5418 r
= getpeercred(fd
, &ucred
);
5420 return log_error_errno(r
, "Failed to determine peer of udev control socket: %m");
5422 r
= in_same_namespace(ucred
.pid
, 0, NAMESPACE_NET
);
5424 return log_error_errno(r
, "Failed to determine network namespace of udev: %m");
5426 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
),
5427 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
/* Top-level program logic. Visible steps: parse the command line, perform the early checks and settings
 * loading, prepare the container root (either a directory tree, optionally an ephemeral or
 * template-populated snapshot, or a disk image attached to a loopback device and dissected), call
 * run_container(), then undo temporary state (ephemeral tree or image copy, temporary root directory,
 * propagation directory, exposed ports, veth links, bridge) and free the argument data.
 * NOTE(review): this extract omits many lines of the function (most result checks, jumps and closing
 * braces among them), so the comments below only describe statements that are actually visible. */
5431 static int run(int argc
, char *argv
[]) {
/* Flags recording which temporary objects were created: the remove_* flags are consulted by the teardown
 * part at the end, veth_created is passed by pointer to run_container(). */
5432 bool remove_directory
= false, remove_image
= false, veth_created
= false, remove_tmprootdir
= false;
5433 _cleanup_close_
int master
= -EBADF
;
5434 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
5435 int r
, n_fd_passed
, ret
= EXIT_SUCCESS
;
5436 char veth_name
[IFNAMSIZ
] = "";
5437 struct ExposeArgs expose_args
= {};
5438 _cleanup_(release_lock_file
) LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
/* Template for mkdtemp(), used further down as the temporary root directory. */
5439 char tmprootdir
[] = "/tmp/nspawn-root-XXXXXX";
5440 _cleanup_(loop_device_unrefp
) LoopDevice
*loop
= NULL
;
5441 _cleanup_(dissected_image_unrefp
) DissectedImage
*dissected_image
= NULL
;
5442 _cleanup_(fw_ctx_freep
) FirewallContext
*fw_ctx
= NULL
;
5445 log_parse_environment();
/* Parse the command line. */
5448 r
= parse_argv(argc
, argv
);
/* Effective UID 0 is expected; otherwise an EPERM warning is logged, with an extra hint when no
 * arguments were given. */
5452 if (geteuid() != 0) {
5453 r
= log_warning_errno(SYNTHETIC_ERRNO(EPERM
),
5454 argc
>= 2 ? "Need to be root." :
5455 "Need to be root (and some arguments are usually required).\nHint: try --help");
/* Early udev and network namespace sanity check. */
5459 r
= cant_be_in_netns();
/* Remaining preparation helpers, called in this order. */
5463 r
= initialize_rlimits();
5467 r
= load_oci_bundle();
5471 r
= determine_names();
5475 r
= load_settings();
5481 log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5485 r
= verify_arguments();
5489 /* Reapply environment settings. */
5490 (void) detect_unified_cgroup_hierarchy_from_environment();
5492 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5493 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5494 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5495 (void) ignore_signals(SIGPIPE
);
/* Pick up file descriptors handed to us through the sd_listen_fds() protocol and keep them in fds. */
5497 n_fd_passed
= sd_listen_fds(false);
5498 if (n_fd_passed
> 0) {
5499 r
= fdset_new_listen_fds(&fds
, false);
5501 log_error_errno(r
, "Failed to collect file descriptors: %m");
5506 /* The "default" umask. This is appropriate for most file and directory
5507 * operations performed by nspawn, and is the umask that will be used for
5508 * the child. Functions like copy_devnodes() change the umask temporarily. */
/* Root given as a directory tree. */
5511 if (arg_directory
) {
5514 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5515 * /var from the host will propagate into container dynamically (because bad things happen if
5516 * two systems write to the same /var). Let's allow it for the special cases where /var is
5517 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5518 if (path_equal(arg_directory
, "/") && !(arg_ephemeral
|| IN_SET(arg_volatile_mode
, VOLATILE_YES
, VOLATILE_STATE
))) {
5519 r
= log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
5520 "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
/* Ephemeral mode: operate on a freshly created snapshot or copy of the tree. */
5524 if (arg_ephemeral
) {
5525 _cleanup_free_
char *np
= NULL
;
5527 r
= chase_and_update(&arg_directory
, 0);
5531 /* If the specified path is a mount point we generate the new snapshot immediately
5532 * inside it under a random name. However, if the specified path is not a mount point, we
5533 * create the new snapshot in the parent directory, just next to it. */
5534 r
= path_is_mount_point(arg_directory
, NULL
, 0);
5536 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
/* Pick a random name for the snapshot; presumably tempfn_random_child() yields a name inside the
 * directory and tempfn_random() one next to it, matching the two cases described above. */
5540 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
5542 r
= tempfn_random(arg_directory
, "machine.", &np
);
5544 log_error_errno(r
, "Failed to generate name for directory snapshot: %m");
5548 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
5549 * only owned by us and no one else. */
5550 r
= image_path_lock(np
, LOCK_EX
|LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
5552 log_error_errno(r
, "Failed to lock %s: %m", np
);
/* Create the snapshot with SIGINT blocked; per the flag names, fallbacks to a plain copy or directory,
 * recursion, quota handling and SIGINT-interruptibility are requested. */
5557 BLOCK_SIGNALS(SIGINT
);
5558 r
= btrfs_subvol_snapshot(arg_directory
, np
,
5559 (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) |
5560 BTRFS_SNAPSHOT_FALLBACK_COPY
|
5561 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
5562 BTRFS_SNAPSHOT_RECURSIVE
|
5563 BTRFS_SNAPSHOT_QUOTA
|
5564 BTRFS_SNAPSHOT_SIGINT
);
5567 log_error_errno(r
, "Interrupted while copying file system tree to %s, removed again.", np
);
5571 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
/* Continue with the snapshot as arg_directory and mark it for removal on exit. */
5575 free_and_replace(arg_directory
, np
);
5576 remove_directory
= true;
/* Non-ephemeral mode: resolve arg_directory (allowed to be nonexistent when a template is given) and
 * lock it without blocking, shared if read-only and exclusive otherwise. */
5578 r
= chase_and_update(&arg_directory
, arg_template
? CHASE_NONEXISTENT
: 0);
5582 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
5584 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
5588 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
/* Populate arg_directory from the template tree. */
5593 r
= chase_and_update(&arg_template
, 0);
5598 BLOCK_SIGNALS(SIGINT
);
5599 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
,
5600 (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) |
5601 BTRFS_SNAPSHOT_FALLBACK_COPY
|
5602 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
5603 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE
|
5604 BTRFS_SNAPSHOT_RECURSIVE
|
5605 BTRFS_SNAPSHOT_QUOTA
|
5606 BTRFS_SNAPSHOT_SIGINT
);
5609 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
,
5610 "Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
5611 else if (r
== -EINTR
) {
5612 log_error_errno(r
, "Interrupted while copying file system tree to %s, removed again.", arg_directory
);
5615 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
5618 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
,
5619 "Populated %s from template %s.", arg_directory
, arg_template
);
/* Plausibility checks on the tree: an OS tree check when booting, a /usr/ existence check in the other
 * visible branch. */
5623 if (arg_start_mode
== START_BOOT
) {
5624 _cleanup_free_
char *b
= NULL
;
5627 if (arg_pivot_root_new
) {
5628 b
= path_join(arg_directory
, arg_pivot_root_new
);
5636 if (path_is_os_tree(p
) <= 0) {
5637 r
= log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
5638 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p
);
5642 _cleanup_free_
char *p
= NULL
;
5644 if (arg_pivot_root_new
)
5645 p
= path_join(arg_directory
, arg_pivot_root_new
, "/usr/");
5647 p
= path_join(arg_directory
, "/usr/");
5651 if (laccess(p
, F_OK
) < 0) {
5652 r
= log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
5653 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory
);
/* Root given as a disk image. */
5659 DissectImageFlags dissect_image_flags
=
5660 DISSECT_IMAGE_GENERIC_ROOT
|
5661 DISSECT_IMAGE_REQUIRE_ROOT
|
5662 DISSECT_IMAGE_RELAX_VAR_CHECK
|
5663 DISSECT_IMAGE_USR_NO_ROOT
|
5664 DISSECT_IMAGE_ADD_PARTITION_DEVICES
|
5665 DISSECT_IMAGE_PIN_PARTITION_DEVICES
;
5667 assert(!arg_template
);
5669 r
= chase_and_update(&arg_image
, 0);
/* Ephemeral mode: copy the image file under a random temporary name and use the copy. */
5673 if (arg_ephemeral
) {
5674 _cleanup_free_
char *np
= NULL
;
5676 r
= tempfn_random(arg_image
, "machine.", &np
);
5678 log_error_errno(r
, "Failed to generate name for image snapshot: %m");
5682 /* Always take an exclusive lock on our own ephemeral copy. */
5683 r
= image_path_lock(np
, LOCK_EX
|LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
5685 r
= log_error_errno(r
, "Failed to create image lock: %m");
/* Copy with SIGINT blocked; the copy gets mode 0400 in read-only mode and 0600 otherwise. */
5690 BLOCK_SIGNALS(SIGINT
);
5691 r
= copy_file_full(arg_image
, np
, O_EXCL
, arg_read_only
? 0400 : 0600,
5692 FS_NOCOW_FL
, FS_NOCOW_FL
,
5693 COPY_REFLINK
|COPY_CRTIME
|COPY_SIGINT
,
5697 log_error_errno(r
, "Interrupted while copying image file to %s, removed again.", np
);
5701 r
= log_error_errno(r
, "Failed to copy image file: %m");
/* Continue with the copy as arg_image and mark it for removal on exit. */
5705 free_and_replace(arg_image
, np
);
5706 remove_image
= true;
/* Non-ephemeral mode: lock the image itself without blocking, shared if read-only and exclusive
 * otherwise. */
5708 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
5710 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
5714 r
= log_error_errno(r
, "Failed to create image lock: %m");
/* Load the verity settings for the image; when a verity data path is set, the image is dissected
 * with the no-partition-table flag. */
5718 r
= verity_settings_load(
5719 &arg_verity_settings
,
5720 arg_image
, NULL
, NULL
);
5722 log_error_errno(r
, "Failed to read verity artefacts for %s: %m", arg_image
);
5726 if (arg_verity_settings
.data_path
)
5727 dissect_image_flags
|= DISSECT_IMAGE_NO_PARTITION_TABLE
;
/* Create a temporary directory, remember to remove it again, and use a copy of its path as
 * arg_directory. */
5730 if (!mkdtemp(tmprootdir
)) {
5731 r
= log_error_errno(errno
, "Failed to create temporary directory: %m");
5735 remove_tmprootdir
= true;
5737 arg_directory
= strdup(tmprootdir
);
5738 if (!arg_directory
) {
/* Set up a loopback block device, opened read-only or read-write as requested, with partition
 * scanning unless the no-partition-table flag is set. */
5743 r
= loop_device_make_by_path(
5745 arg_read_only
? O_RDONLY
: O_RDWR
,
5746 /* sector_size= */ UINT32_MAX
,
5747 FLAGS_SET(dissect_image_flags
, DISSECT_IMAGE_NO_PARTITION_TABLE
) ? 0 : LO_FLAGS_PARTSCAN
,
5751 log_error_errno(r
, "Failed to set up loopback block device: %m");
/* Dissect the image on the loop device. */
5755 r
= dissect_loop_device_and_warn(
5757 &arg_verity_settings
,
5759 dissect_image_flags
,
5762 /* dissect_loop_device_and_warn() already printed a brief error message. Extend on that with more details */
5763 log_notice("Note that the disk image needs to\n"
5764 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5765 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
5766 " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
5767 " d) or contain a file system without a partition table\n"
5768 "in order to be bootable with systemd-nspawn.");
5774 r
= dissected_image_load_verity_sig_partition(
5777 &arg_verity_settings
);
/* Verity data without a root hash and without an embedded signature only triggers a notice. */
5781 if (dissected_image
->has_verity
&& !arg_verity_settings
.root_hash
&& !dissected_image
->has_verity_sig
)
5782 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5783 "root hash signature found! Proceeding without integrity checking.", arg_image
);
5785 r
= dissected_image_decrypt_interactively(
5788 &arg_verity_settings
,
5793 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5794 if (remove_image
&& unlink(arg_image
) >= 0)
5795 remove_image
= false;
/* No architecture configured (negative value): take the one reported for the dissected image. */
5797 if (arg_architecture
< 0)
5798 arg_architecture
= dissected_image_architecture(dissected_image
);
/* Prepare the configured custom mounts for arg_directory. */
5801 r
= custom_mount_prepare_all(arg_directory
, arg_custom_mounts
, arg_n_custom_mounts
);
/* Console mode not set (negative value): the visible expression selects interactive mode when stdin
 * and stdout are both TTYs and read-only mode otherwise (the assignment target line is not visible in
 * this extract). */
5805 if (arg_console_mode
< 0)
5807 isatty(STDIN_FILENO
) > 0 &&
5808 isatty(STDOUT_FILENO
) > 0 ? CONSOLE_INTERACTIVE
: CONSOLE_READ_ONLY
;
5810 if (arg_console_mode
== CONSOLE_PIPE
) /* if we pass STDERR on to the container, don't add our own logs into it too */
5814 log_info("Spawning container %s on %s.\nPress Ctrl-] three times within 1s to kill container.",
5815 arg_machine
, arg_image
?: arg_directory
);
/* Block SIGCHLD, SIGWINCH, SIGTERM, SIGINT and SIGRTMIN+18 for this process. */
5817 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, SIGRTMIN
+18, -1) >= 0);
/* Become a child subreaper (see prctl(2), PR_SET_CHILD_SUBREAPER). */
5819 if (prctl(PR_SET_CHILD_SUBREAPER
, 1, 0, 0, 0) < 0) {
5820 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* Port exposure needs a firewall context; it is also stored in the expose arguments. */
5824 if (arg_expose_ports
) {
5825 r
= fw_ctx_new(&fw_ctx
);
5827 log_error_errno(r
, "Cannot expose configured ports, firewall initialization failed: %m");
5830 expose_args
.fw_ctx
= fw_ctx
;
/* Run the container. */
5833 r
= run_container(dissected_image
,
5835 veth_name
, &veth_created
,
5836 &expose_args
, &master
,
/* Report that we are stopping; the status text distinguishes a forced restart from termination. */
5843 (void) sd_notify(false,
5844 r
== 0 && ret
== EXIT_FORCE_RESTART
? "STOPPING=1\nSTATUS=Restarting..." :
5845 "STOPPING=1\nSTATUS=Terminating...");
/* NOTE(review): pid is not declared in the visible lines; presumably it is the container process,
 * which is killed here and reaped further down. */
5848 (void) kill(pid
, SIGKILL
);
5850 /* Try to flush whatever is still queued in the pty */
5852 (void) copy_bytes(master
, STDOUT_FILENO
, UINT64_MAX
, 0);
5853 master
= safe_close(master
);
5857 (void) wait_for_terminate(pid
, NULL
);
/* Teardown of temporary objects; failures here are only logged. */
5861 if (remove_directory
&& arg_directory
) {
5864 k
= rm_rf(arg_directory
, REMOVE_ROOT
|REMOVE_PHYSICAL
|REMOVE_SUBVOLUME
);
5866 log_warning_errno(k
, "Cannot remove '%s', ignoring: %m", arg_directory
);
5869 if (remove_image
&& arg_image
) {
5870 if (unlink(arg_image
) < 0)
5871 log_warning_errno(errno
, "Can't remove image file '%s', ignoring: %m", arg_image
);
5874 if (remove_tmprootdir
) {
5875 if (rmdir(tmprootdir
) < 0)
5876 log_debug_errno(errno
, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir
);
/* Remove the propagation directory of this machine. */
5882 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
5883 (void) rm_rf(p
, REMOVE_ROOT
);
/* Flush the exposed ports for both address families. */
5886 expose_port_flush(&fw_ctx
, arg_expose_ports
, AF_INET
, &expose_args
.address4
);
5887 expose_port_flush(&fw_ctx
, arg_expose_ports
, AF_INET6
, &expose_args
.address6
);
/* Remove veth links and the zone bridge; results are ignored. */
5890 (void) remove_veth_links(veth_name
, arg_network_veth_extra
);
5891 (void) remove_bridge(arg_network_zone
);
/* Free the parsed argument data. */
5893 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
5894 expose_port_free_all(arg_expose_ports
);
5895 rlimit_free_all(arg_rlimit
);
5896 device_node_array_free(arg_extra_nodes
, arg_n_extra_nodes
);
5897 credential_free_all(arg_credentials
, arg_n_credentials
);
/* Hands run() to the main-function helper macro. NOTE(review): presumably this expands to the main()
 * of the program and maps the result of run() to the process exit status (see main-func.h); confirm
 * there. */
5905 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run
);