1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
4 #include <linux/ioprio.h>
5 #include <linux/prctl.h>
6 #include <linux/sched.h>
7 #include <linux/securebits.h>
9 #include <sys/eventfd.h>
10 #include <sys/ioctl.h>
11 #include <sys/mount.h>
12 #include <sys/prctl.h>
16 #include <security/pam_appl.h>
19 #include "sd-messages.h"
21 #include "apparmor-util.h"
22 #include "argv-util.h"
23 #include "ask-password-api.h"
26 #include "bpf-dlopen.h"
27 #include "bpf-restrict-fs.h"
28 #include "btrfs-util.h"
29 #include "capability-util.h"
30 #include "cgroup-setup.h"
33 #include "chown-recursive.h"
34 #include "constants.h"
36 #include "coredump-util.h"
37 #include "dissect-image.h"
38 #include "dynamic-user.h"
41 #include "exec-credential.h"
42 #include "exec-invoke.h"
44 #include "exit-status.h"
47 #include "hexdecoct.h"
48 #include "hostname-setup.h"
49 #include "image-policy.h"
51 #include "iovec-util.h"
52 #include "journal-send.h"
54 #include "memfd-util.h"
55 #include "missing_sched.h"
56 #include "missing_syscall.h"
57 #include "mkdir-label.h"
58 #include "mount-util.h"
59 #include "namespace-util.h"
61 #include "open-file.h"
62 #include "osc-context.h"
63 #include "path-util.h"
65 #include "proc-cmdline.h"
66 #include "process-util.h"
68 #include "rlimit-util.h"
69 #include "seccomp-util.h"
70 #include "selinux-util.h"
72 #include "signal-util.h"
73 #include "smack-util.h"
74 #include "socket-util.h"
75 #include "stat-util.h"
76 #include "string-table.h"
78 #include "terminal-util.h"
79 #include "user-util.h"
80 #include "utmp-wtmp.h"
83 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
84 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
86 #define SNDBUF_SIZE (8*1024*1024)
96 assert(fds
|| n_fds
== 0);
98 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
99 * O_NONBLOCK only applies to socket activation though. */
101 for (size_t i
= 0; i
< n_fds
; i
++) {
103 if (i
< n_socket_fds
) {
104 r
= fd_nonblock(fds
[i
], nonblock
);
109 /* We unconditionally drop FD_CLOEXEC from the fds,
110 * since after all we want to pass these fds to our
113 r
= fd_cloexec(fds
[i
], false);
121 static bool is_terminal_input(ExecInput i
) {
124 EXEC_INPUT_TTY_FORCE
,
125 EXEC_INPUT_TTY_FAIL
);
128 static bool is_terminal_output(ExecOutput o
) {
131 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
132 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
135 static bool is_kmsg_output(ExecOutput o
) {
138 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
141 static int open_null_as(int flags
, int nfd
) {
146 fd
= open("/dev/null", flags
|O_NOCTTY
);
150 return move_fd(fd
, nfd
, false);
153 static int connect_journal_socket(
155 const char *log_namespace
,
159 uid_t olduid
= UID_INVALID
;
160 gid_t oldgid
= GID_INVALID
;
166 j
= journal_stream_path(log_namespace
);
170 if (gid_is_valid(gid
)) {
173 if (setegid(gid
) < 0)
177 if (uid_is_valid(uid
)) {
180 if (seteuid(uid
) < 0) {
186 r
= connect_unix_path(fd
, AT_FDCWD
, j
);
188 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
189 an LSM interferes. */
191 if (uid_is_valid(uid
))
192 (void) seteuid(olduid
);
195 if (gid_is_valid(gid
))
196 (void) setegid(oldgid
);
201 static int connect_logger_as(
202 const ExecContext
*context
,
203 const ExecParameters
*params
,
210 _cleanup_close_
int fd
= -EBADF
;
215 assert(output
< _EXEC_OUTPUT_MAX
);
219 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
223 r
= connect_journal_socket(fd
, context
->log_namespace
, uid
, gid
);
227 if (shutdown(fd
, SHUT_RD
) < 0)
230 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
240 context
->syslog_identifier
?: ident
,
241 params
->flags
& EXEC_PASS_LOG_UNIT
? params
->unit_id
: "",
242 context
->syslog_priority
,
243 !!context
->syslog_level_prefix
,
245 is_kmsg_output(output
),
246 is_terminal_output(output
)) < 0)
249 return move_fd(TAKE_FD(fd
), nfd
, false);
252 static int open_terminal_as(const char *path
, int flags
, int nfd
) {
258 fd
= open_terminal(path
, flags
| O_NOCTTY
);
262 return move_fd(fd
, nfd
, false);
265 static int acquire_path(const char *path
, int flags
, mode_t mode
) {
266 _cleanup_close_
int fd
= -EBADF
;
271 if (IN_SET(flags
& O_ACCMODE_STRICT
, O_WRONLY
, O_RDWR
))
274 fd
= open(path
, flags
|O_NOCTTY
, mode
);
278 if (errno
!= ENXIO
) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
281 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
283 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
287 r
= connect_unix_path(fd
, AT_FDCWD
, path
);
288 if (IN_SET(r
, -ENOTSOCK
, -EINVAL
))
289 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
290 * wasn't an AF_UNIX socket after all */
295 if ((flags
& O_ACCMODE_STRICT
) == O_RDONLY
)
296 r
= shutdown(fd
, SHUT_WR
);
297 else if ((flags
& O_ACCMODE_STRICT
) == O_WRONLY
)
298 r
= shutdown(fd
, SHUT_RD
);
307 static int fixup_input(
308 const ExecContext
*context
,
310 bool apply_tty_stdin
) {
316 std_input
= context
->std_input
;
318 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
319 return EXEC_INPUT_NULL
;
321 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
322 return EXEC_INPUT_NULL
;
324 if (std_input
== EXEC_INPUT_DATA
&& context
->stdin_data_size
== 0)
325 return EXEC_INPUT_NULL
;
330 static int fixup_output(ExecOutput output
, int socket_fd
) {
332 if (output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
333 return EXEC_OUTPUT_INHERIT
;
338 static int setup_input(
339 const ExecContext
*context
,
340 const ExecParameters
*params
,
342 const int named_iofds
[static 3]) {
351 if (params
->stdin_fd
>= 0) {
352 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
355 /* Try to make this our controlling tty, if it is a tty */
356 if (isatty_safe(STDIN_FILENO
) && ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
) < 0)
357 log_debug_errno(errno
, "Failed to make standard input TTY our controlling terminal: %m");
362 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
366 case EXEC_INPUT_NULL
:
367 return open_null_as(O_RDONLY
, STDIN_FILENO
);
370 case EXEC_INPUT_TTY_FORCE
:
371 case EXEC_INPUT_TTY_FAIL
: {
372 _cleanup_close_
int tty_fd
= -EBADF
;
373 _cleanup_free_
char *resolved
= NULL
;
374 const char *tty_path
;
376 tty_path
= ASSERT_PTR(exec_context_tty_path(context
));
378 if (tty_is_console(tty_path
)) {
379 r
= resolve_dev_console(&resolved
);
381 log_debug_errno(r
, "Failed to resolve /dev/console, ignoring: %m");
383 log_debug("Resolved /dev/console to %s", resolved
);
388 tty_fd
= acquire_terminal(tty_path
,
389 i
== EXEC_INPUT_TTY_FAIL
? ACQUIRE_TERMINAL_TRY
:
390 i
== EXEC_INPUT_TTY_FORCE
? ACQUIRE_TERMINAL_FORCE
:
391 ACQUIRE_TERMINAL_WAIT
,
396 r
= move_fd(tty_fd
, STDIN_FILENO
, /* cloexec= */ false);
404 case EXEC_INPUT_SOCKET
:
405 assert(socket_fd
>= 0);
407 return RET_NERRNO(dup2(socket_fd
, STDIN_FILENO
));
409 case EXEC_INPUT_NAMED_FD
:
410 assert(named_iofds
[STDIN_FILENO
] >= 0);
412 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
413 return RET_NERRNO(dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
));
415 case EXEC_INPUT_DATA
: {
418 fd
= memfd_new_and_seal("exec-input", context
->stdin_data
, context
->stdin_data_size
);
422 return move_fd(fd
, STDIN_FILENO
, false);
425 case EXEC_INPUT_FILE
: {
429 assert(context
->stdio_file
[STDIN_FILENO
]);
431 rw
= (context
->std_output
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDOUT_FILENO
])) ||
432 (context
->std_error
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDERR_FILENO
]));
434 fd
= acquire_path(context
->stdio_file
[STDIN_FILENO
], rw
? O_RDWR
: O_RDONLY
, 0666 & ~context
->umask
);
438 return move_fd(fd
, STDIN_FILENO
, false);
442 assert_not_reached();
446 static bool can_inherit_stderr_from_stdout(
447 const ExecContext
*context
,
453 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
456 if (e
== EXEC_OUTPUT_INHERIT
)
461 if (e
== EXEC_OUTPUT_NAMED_FD
)
462 return streq_ptr(context
->stdio_fdname
[STDOUT_FILENO
], context
->stdio_fdname
[STDERR_FILENO
]);
464 if (IN_SET(e
, EXEC_OUTPUT_FILE
, EXEC_OUTPUT_FILE_APPEND
, EXEC_OUTPUT_FILE_TRUNCATE
))
465 return streq_ptr(context
->stdio_file
[STDOUT_FILENO
], context
->stdio_file
[STDERR_FILENO
]);
470 static int setup_output(
471 const ExecContext
*context
,
472 const ExecParameters
*params
,
475 const int named_iofds
[static 3],
479 dev_t
*journal_stream_dev
,
480 ino_t
*journal_stream_ino
) {
489 assert(journal_stream_dev
);
490 assert(journal_stream_ino
);
492 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
494 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
497 return STDOUT_FILENO
;
500 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
501 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
504 return STDERR_FILENO
;
507 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
508 o
= fixup_output(context
->std_output
, socket_fd
);
510 // FIXME: we probably should spend some time here to verify that if we inherit an fd from stdin
511 // (possibly indirect via inheritance from stdout) it is actually opened for write!
513 if (fileno
== STDERR_FILENO
) {
515 e
= fixup_output(context
->std_error
, socket_fd
);
517 /* This expects the input and output are already set up */
519 /* Don't change the stderr file descriptor if we inherit all
520 * the way and are not on a tty */
521 if (e
== EXEC_OUTPUT_INHERIT
&&
522 o
== EXEC_OUTPUT_INHERIT
&&
523 i
== EXEC_INPUT_NULL
&&
524 !is_terminal_input(context
->std_input
) &&
528 /* Duplicate from stdout if possible */
529 if (can_inherit_stderr_from_stdout(context
, o
, e
))
530 return RET_NERRNO(dup2(STDOUT_FILENO
, fileno
));
534 } else if (o
== EXEC_OUTPUT_INHERIT
) {
535 /* If input got downgraded, inherit the original value */
536 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
537 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
539 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
540 if (!IN_SET(i
, EXEC_INPUT_NULL
, EXEC_INPUT_DATA
))
541 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
543 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
547 /* We need to open /dev/null here anew, to get the right access mode. */
548 return open_null_as(O_WRONLY
, fileno
);
553 case EXEC_OUTPUT_NULL
:
554 return open_null_as(O_WRONLY
, fileno
);
556 case EXEC_OUTPUT_TTY
:
557 if (is_terminal_input(i
))
558 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
560 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
562 case EXEC_OUTPUT_KMSG
:
563 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
564 case EXEC_OUTPUT_JOURNAL
:
565 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
566 r
= connect_logger_as(context
, params
, o
, ident
, fileno
, uid
, gid
);
568 log_warning_errno(r
, "Failed to connect %s to the journal socket, ignoring: %m",
569 fileno
== STDOUT_FILENO
? "stdout" : "stderr");
570 r
= open_null_as(O_WRONLY
, fileno
);
574 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
575 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
576 * services to detect whether they are connected to the journal or not.
578 * If both stdout and stderr are connected to a stream then let's make sure to store the data
579 * about STDERR as that's usually the best way to do logging. */
581 if (fstat(fileno
, &st
) >= 0 &&
582 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
583 *journal_stream_dev
= st
.st_dev
;
584 *journal_stream_ino
= st
.st_ino
;
589 case EXEC_OUTPUT_SOCKET
:
590 assert(socket_fd
>= 0);
592 return RET_NERRNO(dup2(socket_fd
, fileno
));
594 case EXEC_OUTPUT_NAMED_FD
:
595 assert(named_iofds
[fileno
] >= 0);
597 (void) fd_nonblock(named_iofds
[fileno
], false);
598 return RET_NERRNO(dup2(named_iofds
[fileno
], fileno
));
600 case EXEC_OUTPUT_FILE
:
601 case EXEC_OUTPUT_FILE_APPEND
:
602 case EXEC_OUTPUT_FILE_TRUNCATE
: {
606 assert(context
->stdio_file
[fileno
]);
608 rw
= context
->std_input
== EXEC_INPUT_FILE
&&
609 streq_ptr(context
->stdio_file
[fileno
], context
->stdio_file
[STDIN_FILENO
]);
612 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
615 if (o
== EXEC_OUTPUT_FILE_APPEND
)
617 else if (o
== EXEC_OUTPUT_FILE_TRUNCATE
)
620 fd
= acquire_path(context
->stdio_file
[fileno
], flags
, 0666 & ~context
->umask
);
624 return move_fd(fd
, fileno
, 0);
628 assert_not_reached();
632 static int chown_terminal(int fd
, uid_t uid
) {
637 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
638 if (!isatty_safe(fd
))
641 /* This might fail. What matters are the results. */
642 r
= fchmod_and_chown(fd
, TTY_MODE
, uid
, GID_INVALID
);
649 static int setup_confirm_stdio(
650 const ExecContext
*context
,
652 int *ret_saved_stdin
,
653 int *ret_saved_stdout
) {
655 _cleanup_close_
int fd
= -EBADF
, saved_stdin
= -EBADF
, saved_stdout
= -EBADF
;
659 assert(ret_saved_stdin
);
660 assert(ret_saved_stdout
);
662 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD_CLOEXEC
, 3);
666 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD_CLOEXEC
, 3);
667 if (saved_stdout
< 0)
670 fd
= acquire_terminal(vc
, ACQUIRE_TERMINAL_WAIT
, DEFAULT_CONFIRM_USEC
);
674 _cleanup_close_
int lock_fd
= lock_dev_console();
676 log_debug_errno(lock_fd
, "Failed to lock /dev/console, ignoring: %m");
678 r
= chown_terminal(fd
, getuid());
682 r
= terminal_reset_defensive(fd
, TERMINAL_RESET_SWITCH_TO_TEXT
);
686 r
= exec_context_apply_tty_size(context
, fd
, fd
, vc
);
690 r
= rearrange_stdio(fd
, fd
, STDERR_FILENO
); /* Invalidates 'fd' also on failure */
695 *ret_saved_stdin
= TAKE_FD(saved_stdin
);
696 *ret_saved_stdout
= TAKE_FD(saved_stdout
);
700 static void write_confirm_error_fd(int err
, int fd
, const char *unit_id
) {
707 if (errno
== ETIMEDOUT
)
708 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", unit_id
);
710 dprintf(fd
, "Couldn't ask confirmation for %s, assuming positive response: %m\n", unit_id
);
713 static void write_confirm_error(int err
, const char *vc
, const char *unit_id
) {
714 _cleanup_close_
int fd
= -EBADF
;
718 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
722 write_confirm_error_fd(err
, fd
, unit_id
);
725 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
729 assert(saved_stdout
);
733 if (*saved_stdin
>= 0)
734 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
737 if (*saved_stdout
>= 0)
738 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
741 *saved_stdin
= safe_close(*saved_stdin
);
742 *saved_stdout
= safe_close(*saved_stdout
);
748 CONFIRM_PRETEND_FAILURE
= -1,
749 CONFIRM_PRETEND_SUCCESS
= 0,
753 static bool confirm_spawn_disabled(void) {
754 return access("/run/systemd/confirm_spawn_disabled", F_OK
) >= 0;
757 static int ask_for_confirmation(const ExecContext
*context
, const ExecParameters
*params
, const char *cmdline
) {
758 int saved_stdout
= -EBADF
, saved_stdin
= -EBADF
, r
;
759 _cleanup_free_
char *e
= NULL
;
765 /* For any internal errors, assume a positive response. */
766 r
= setup_confirm_stdio(context
, params
->confirm_spawn
, &saved_stdin
, &saved_stdout
);
768 write_confirm_error(r
, params
->confirm_spawn
, params
->unit_id
);
769 return CONFIRM_EXECUTE
;
772 /* confirm_spawn might have been disabled while we were sleeping. */
773 if (!params
->confirm_spawn
|| confirm_spawn_disabled()) {
778 e
= ellipsize(cmdline
, 60, 100);
786 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
788 write_confirm_error_fd(r
, STDOUT_FILENO
, params
->unit_id
);
795 printf("Resuming normal execution.\n");
796 manager_disable_confirm_spawn();
800 printf(" Unit: %s\n",
802 exec_context_dump(context
, stdout
, " ");
803 exec_params_dump(params
, stdout
, " ");
804 continue; /* ask again */
806 printf("Failing execution.\n");
807 r
= CONFIRM_PRETEND_FAILURE
;
810 printf(" c - continue, proceed without asking anymore\n"
811 " D - dump, show the state of the unit\n"
812 " f - fail, don't execute the command and pretend it failed\n"
814 " i - info, show a short summary of the unit\n"
815 " j - jobs, show jobs that are in progress\n"
816 " s - skip, don't execute the command and pretend it succeeded\n"
817 " y - yes, execute the command\n");
818 continue; /* ask again */
822 params
->unit_id
, cmdline
);
823 continue; /* ask again */
825 if (sigqueue(getppid(),
827 (const union sigval
) { .sival_int
= MANAGER_SIGNAL_COMMAND_DUMP_JOBS
}) < 0)
830 continue; /* ask again */
832 /* 'n' was removed in favor of 'f'. */
833 printf("Didn't understand 'n', did you mean 'f'?\n");
834 continue; /* ask again */
836 printf("Skipping execution.\n");
837 r
= CONFIRM_PRETEND_SUCCESS
;
843 assert_not_reached();
849 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
853 static int get_fixed_user(
854 const char *user_or_uid
,
856 const char **ret_username
,
859 const char **ret_home
,
860 const char **ret_shell
) {
865 assert(ret_username
);
867 r
= get_user_creds(&user_or_uid
, ret_uid
, ret_gid
, ret_home
, ret_shell
,
868 USER_CREDS_CLEAN
|(prefer_nss
? USER_CREDS_PREFER_NSS
: 0));
872 /* user_or_uid is normalized by get_user_creds to username */
873 *ret_username
= user_or_uid
;
878 static int get_fixed_group(
879 const char *group_or_gid
,
880 const char **ret_groupname
,
885 assert(group_or_gid
);
886 assert(ret_groupname
);
888 r
= get_group_creds(&group_or_gid
, ret_gid
, /* flags = */ 0);
892 /* group_or_gid is normalized by get_group_creds to groupname */
893 *ret_groupname
= group_or_gid
;
898 static int get_supplementary_groups(
899 const ExecContext
*c
,
910 * If user is given, then look up the GID and supplementary groups list.
911 * We avoid NSS lookups for gid=0. Also we have to initialize groups
912 * here and as early as possible so we keep the list of supplementary
913 * groups of the caller.
915 bool keep_groups
= false;
916 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
917 /* First step, initialize groups from /etc/group */
918 if (initgroups(user
, gid
) < 0)
924 if (strv_isempty(c
->supplementary_groups
)) {
930 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
931 * be positive, otherwise fail.
934 int ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
935 if (ngroups_max
<= 0)
936 return errno_or_else(EOPNOTSUPP
);
938 _cleanup_free_ gid_t
*l_gids
= new(gid_t
, ngroups_max
);
945 * Look up the list of groups that the user belongs to; we
946 * avoid NSS lookups here too for gid=0.
949 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
953 STRV_FOREACH(i
, c
->supplementary_groups
) {
954 if (k
>= ngroups_max
)
958 r
= get_group_creds(&g
, l_gids
+ k
, /* flags = */ 0);
970 /* Otherwise get the final list of supplementary groups */
971 gid_t
*groups
= newdup(gid_t
, l_gids
, k
);
979 static int enforce_groups(gid_t gid
, const gid_t
*supplementary_gids
, int ngids
) {
982 /* Handle SupplementaryGroups= if it is not empty */
984 r
= maybe_setgroups(ngids
, supplementary_gids
);
989 if (gid_is_valid(gid
)) {
990 /* Then set our gids */
991 if (setresgid(gid
, gid
, gid
) < 0)
998 static int set_securebits(unsigned bits
, unsigned mask
) {
1002 current
= prctl(PR_GET_SECUREBITS
);
1006 /* Clear all securebits defined in mask and set bits */
1007 applied
= ((unsigned) current
& ~mask
) | bits
;
1008 if ((unsigned) current
== applied
)
1011 if (prctl(PR_SET_SECUREBITS
, applied
) < 0)
1017 static int enforce_user(
1018 const ExecContext
*context
,
1020 uint64_t capability_ambient_set
) {
1026 if (!uid_is_valid(uid
))
1029 /* Sets (but doesn't look up) the UID and makes sure we keep the capabilities while doing so. For
1030 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1033 if ((capability_ambient_set
!= 0 || context
->secure_bits
!= 0) && uid
!= 0) {
1035 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1036 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1037 r
= set_securebits(1U << SECURE_KEEP_CAPS
, 0);
1042 /* Second step: actually set the uids */
1043 if (setresuid(uid
, uid
, uid
) < 0)
1046 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1047 * the caps might have been corrupted by the setresuid(), so we need to clean them up later. This is done
1048 * outside of this call. */
1054 static void pam_response_free_array(struct pam_response
*responses
, size_t n_responses
) {
1055 assert(responses
|| n_responses
== 0);
1057 FOREACH_ARRAY(resp
, responses
, n_responses
)
1058 erase_and_free(resp
->resp
);
1063 typedef struct AskPasswordConvData
{
1064 const ExecContext
*context
;
1065 const ExecParameters
*params
;
1066 } AskPasswordConvData
;
1068 static int ask_password_conv(
1070 const struct pam_message
*msg
[],
1071 struct pam_response
**ret
,
1074 AskPasswordConvData
*data
= ASSERT_PTR(userdata
);
1075 bool set_credential_env_var
= false;
1078 assert(num_msg
>= 0);
1080 assert(data
->context
);
1081 assert(data
->params
);
1084 struct pam_response
*responses
= new0(struct pam_response
, n
);
1087 CLEANUP_ARRAY(responses
, n
, pam_response_free_array
);
1089 for (size_t i
= 0; i
< n
; i
++) {
1090 const struct pam_message
*mi
= *msg
+ i
;
1092 switch (mi
->msg_style
) {
1094 case PAM_PROMPT_ECHO_ON
:
1095 case PAM_PROMPT_ECHO_OFF
: {
1097 /* Locally set the $CREDENTIALS_DIRECTORY to the credentials directory we just populated */
1098 if (!set_credential_env_var
) {
1099 _cleanup_free_
char *creds_dir
= NULL
;
1100 r
= exec_context_get_credential_directory(data
->context
, data
->params
, data
->params
->unit_id
, &creds_dir
);
1102 return log_error_errno(r
, "Failed to determine credentials directory: %m");
1105 if (setenv("CREDENTIALS_DIRECTORY", creds_dir
, /* overwrite= */ true) < 0)
1106 return log_error_errno(r
, "Failed to set $CREDENTIALS_DIRECTORY: %m");
1108 (void) unsetenv("CREDENTIALS_DIRECTORY");
1110 set_credential_env_var
= true;
1113 _cleanup_free_
char *credential_name
= strjoin("pam.authtok.", data
->context
->pam_name
);
1114 if (!credential_name
)
1117 AskPasswordRequest req
= {
1119 .credential
= credential_name
,
1122 .until
= usec_add(now(CLOCK_MONOTONIC
), 15 * USEC_PER_SEC
),
1125 _cleanup_strv_free_erase_
char **acquired
= NULL
;
1126 r
= ask_password_auto(
1128 ASK_PASSWORD_ACCEPT_CACHED
|
1129 ASK_PASSWORD_NO_TTY
|
1130 (mi
->msg_style
== PAM_PROMPT_ECHO_ON
? ASK_PASSWORD_ECHO
: 0),
1133 log_error_errno(r
, "Failed to query for password: %m");
1134 return PAM_CONV_ERR
;
1137 responses
[i
].resp
= strdup(ASSERT_PTR(acquired
[0]));
1138 if (!responses
[i
].resp
) {
1146 log_error("PAM: %s", mi
->msg
);
1150 log_info("PAM: %s", mi
->msg
);
1154 return PAM_CONV_ERR
;
1158 *ret
= TAKE_PTR(responses
);
1164 static int pam_close_session_and_delete_credentials(pam_handle_t
*handle
, int flags
) {
1169 r
= pam_close_session(handle
, flags
);
1170 if (r
!= PAM_SUCCESS
)
1171 log_debug("pam_close_session() failed: %s", pam_strerror(handle
, r
));
1173 s
= pam_setcred(handle
, PAM_DELETE_CRED
| flags
);
1174 if (s
!= PAM_SUCCESS
)
1175 log_debug("pam_setcred(PAM_DELETE_CRED) failed: %s", pam_strerror(handle
, s
));
1177 return r
!= PAM_SUCCESS
? r
: s
;
1181 static int attach_to_subcgroup(
1182 const ExecContext
*context
,
1183 const CGroupContext
*cgroup_context
,
1184 const ExecParameters
*params
,
1185 const char *prefix
) {
1187 _cleanup_free_
char *subgroup
= NULL
;
1191 assert(cgroup_context
);
1194 /* If we're a control process that needs a subgroup, we've already been spawned into it as otherwise
1195 * we'd violate the "no inner processes" rule, so no need to do anything. */
1196 if (exec_params_needs_control_subcgroup(params
))
1199 r
= exec_params_get_cgroup_path(params
, cgroup_context
, prefix
, &subgroup
);
1201 return log_error_errno(r
, "Failed to acquire cgroup path: %m");
1202 /* No subgroup required? Then there's nothing to do. */
1206 r
= cg_attach(subgroup
, 0);
1208 return log_error_errno(r
,
1209 "Failed to attach process " PID_FMT
" to cgroup '%s', "
1210 "because the cgroup or one of its parents or "
1211 "siblings is in the threaded mode.",
1212 getpid_cached(), subgroup
);
1214 return log_error_errno(r
,
1215 "Failed to attach process " PID_FMT
" to cgroup %s: %m",
1216 getpid_cached(), subgroup
);
1221 static int setup_pam(
1222 const ExecContext
*context
,
1223 const CGroupContext
*cgroup_context
,
1224 ExecParameters
*params
,
1228 char ***env
, /* updated on success */
1229 const int fds
[], size_t n_fds
,
1230 bool needs_sandboxing
,
1234 AskPasswordConvData conv_data
= {
1239 const struct pam_conv conv
= {
1240 .conv
= ask_password_conv
,
1241 .appdata_ptr
= &conv_data
,
1244 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1245 _cleanup_strv_free_
char **e
= NULL
;
1246 _cleanup_free_
char *tty
= NULL
;
1247 pam_handle_t
*handle
= NULL
;
1249 int pam_code
= PAM_SUCCESS
, r
;
1250 bool close_session
= false;
1257 assert(uid_is_valid(uid
));
1258 assert(gid_is_valid(gid
));
1259 assert(fds
|| n_fds
== 0);
1262 /* We set up PAM in the parent process, then fork. The child
1263 * will then stay around until killed via PR_SET_PDEATHSIG or
1264 * systemd via the cgroup logic. It will then remove the PAM
1265 * session again. The parent process will exec() the actual
1266 * daemon. We do things this way to ensure that the main PID
1267 * of the daemon is the one we initially fork()ed. */
1269 r
= barrier_create(&barrier
);
1273 if (log_get_max_level() < LOG_DEBUG
)
1274 flags
|= PAM_SILENT
;
1276 pam_code
= pam_start(context
->pam_name
, user
, &conv
, &handle
);
1277 if (pam_code
!= PAM_SUCCESS
) {
1282 if (getttyname_malloc(STDIN_FILENO
, &tty
) >= 0) {
1283 _cleanup_free_
char *q
= path_join("/dev", tty
);
1289 free_and_replace(tty
, q
);
1293 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1294 if (pam_code
!= PAM_SUCCESS
)
1298 STRV_FOREACH(nv
, *env
) {
1299 pam_code
= pam_putenv(handle
, *nv
);
1300 if (pam_code
!= PAM_SUCCESS
)
1304 pam_code
= pam_acct_mgmt(handle
, flags
);
1305 if (pam_code
!= PAM_SUCCESS
)
1308 pam_code
= pam_setcred(handle
, PAM_ESTABLISH_CRED
| flags
);
1309 if (pam_code
!= PAM_SUCCESS
)
1310 log_debug("pam_setcred(PAM_ESTABLISH_CRED) failed, ignoring: %s", pam_strerror(handle
, pam_code
));
1312 pam_code
= pam_open_session(handle
, flags
);
1313 if (pam_code
!= PAM_SUCCESS
)
1316 close_session
= true;
1318 e
= pam_getenvlist(handle
);
1320 pam_code
= PAM_BUF_ERR
;
1324 /* Block SIGTERM, so that we know that it won't get lost in the child */
1326 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
) >= 0);
1328 parent_pid
= getpid_cached();
1330 r
= safe_fork("(sd-pam)", 0, NULL
);
1336 if (needs_sandboxing
&& exec_needs_cgroup_namespace(context
) && params
->cgroup_path
) {
1337 /* Move PAM process into subgroup immediately if the main process hasn't been moved
1338 * into the subgroup yet (when cgroup namespacing is enabled) and a subgroup is
1340 r
= attach_to_subcgroup(context
, cgroup_context
, params
, params
->cgroup_path
);
1345 /* The child's job is to reset the PAM session on termination */
1346 barrier_set_role(&barrier
, BARRIER_CHILD
);
1348 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1349 * those fds are open here that have been opened by PAM. */
1350 (void) close_many(fds
, n_fds
);
1352 /* Also close the 'exec_fd' in the child, since the service manager waits for the EOF induced
1353 * by the execve() to wait for completion, and if we'd keep the fd open here in the child
1354 * we'd never signal completion. */
1355 exec_fd
= safe_close(exec_fd
);
1357 /* Drop privileges - we don't need any to pam_close_session and this will make
1358 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1359 * threads to fail to exit normally */
1361 r
= fully_set_uid_gid(uid
, gid
, /* supplementary_gids= */ NULL
, /* n_supplementary_gids= */ 0);
1363 log_warning_errno(r
, "Failed to drop privileges in sd-pam: %m");
1365 (void) ignore_signals(SIGPIPE
);
1367 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1368 * otherwise the kernel will not allow unprivileged parents to kill their privileged children
1369 * this way. We rely on the control groups kill logic to do the rest for us. */
1370 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1373 /* Tell the parent that our setup is done. This is especially important regarding dropping
1374 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1376 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1377 (void) barrier_place(&barrier
);
1379 /* Check if our parent process might already have died? */
1380 if (getppid() == parent_pid
) {
1384 assert_se(sigemptyset(&ss
) >= 0);
1385 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1387 assert_se(sigwait(&ss
, &sig
) == 0);
1388 assert(sig
== SIGTERM
);
1391 /* If our parent died we'll end the session */
1392 if (getppid() != parent_pid
) {
1393 pam_code
= pam_close_session_and_delete_credentials(handle
, flags
);
1394 if (pam_code
!= PAM_SUCCESS
)
1401 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1402 * know about this. See pam_end(3) */
1403 (void) pam_end(handle
, pam_code
| flags
| PAM_DATA_SILENT
);
1407 barrier_set_role(&barrier
, BARRIER_PARENT
);
1409 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1413 /* Unblock SIGTERM again in the parent */
1414 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1416 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1417 * this fd around. */
1420 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1421 * recover. However, warn loudly if it happens. */
1422 if (!barrier_place_and_sync(&barrier
))
1423 log_error("PAM initialization failed");
1425 return strv_free_and_replace(*env
, e
);
1428 if (pam_code
!= PAM_SUCCESS
) {
1429 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1430 r
= -EPERM
; /* PAM errors do not map to errno */
1432 log_error_errno(r
, "PAM failed: %m");
1436 pam_code
= pam_close_session_and_delete_credentials(handle
, flags
);
1438 (void) pam_end(handle
, pam_code
| flags
);
1448 static void rename_process_from_path(const char *path
) {
1449 _cleanup_free_
char *buf
= NULL
;
1454 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1457 if (path_extract_filename(path
, &buf
) < 0) {
1458 rename_process("(...)");
1462 size_t l
= strlen(buf
);
1464 /* The end of the process name is usually more interesting, since the first bit might just be
1471 char process_name
[11];
1472 process_name
[0] = '(';
1473 memcpy(process_name
+1, p
, l
);
1474 process_name
[1+l
] = ')';
1475 process_name
[1+l
+1] = 0;
1477 (void) rename_process(process_name
);
1480 static bool context_has_address_families(const ExecContext
*c
) {
1483 return c
->address_families_allow_list
||
1484 !set_isempty(c
->address_families
);
1487 static bool context_has_syscall_filters(const ExecContext
*c
) {
1490 return c
->syscall_allow_list
||
1491 !hashmap_isempty(c
->syscall_filter
);
1494 static bool context_has_syscall_logs(const ExecContext
*c
) {
1497 return c
->syscall_log_allow_list
||
1498 !hashmap_isempty(c
->syscall_log
);
1501 static bool context_has_seccomp(const ExecContext
*c
) {
1504 /* We need NNP if we have any form of seccomp and are unprivileged */
1505 return c
->lock_personality
||
1506 c
->memory_deny_write_execute
||
1507 c
->private_devices
||
1509 c
->protect_hostname
== PROTECT_HOSTNAME_YES
||
1510 c
->protect_kernel_tunables
||
1511 c
->protect_kernel_modules
||
1512 c
->protect_kernel_logs
||
1513 context_has_address_families(c
) ||
1514 exec_context_restrict_namespaces_set(c
) ||
1515 c
->restrict_realtime
||
1516 c
->restrict_suid_sgid
||
1517 !set_isempty(c
->syscall_archs
) ||
1518 context_has_syscall_filters(c
) ||
1519 context_has_syscall_logs(c
);
1522 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1525 if (c
->no_new_privileges
)
1528 if (have_effective_cap(CAP_SYS_ADMIN
) > 0) /* if we are privileged, we don't need NNP */
1531 return context_has_seccomp(c
);
/* Checks whether the configured system call filter still permits the calls looked for here: capget,
 * capset and prctl.
 *
 * The keys of c->syscall_filter are decoded as PTR_TO_INT(id) - 1 and resolved to a name for the native
 * architecture. With an allow-list all three names must be present in the filter; otherwise (deny-list)
 * none of them may be present. */
1536 static bool seccomp_allows_drop_privileges(const ExecContext
*c
) {
1538 bool have_capget
= false, have_capset
= false, have_prctl
= false;
1542 /* No syscall filter, we are allowed to drop privileges */
1543 if (hashmap_isempty(c
->syscall_filter
))
1546 HASHMAP_FOREACH_KEY(val
, id
, c
->syscall_filter
) {
1547 _cleanup_free_
char *name
= NULL
;
1549 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
1551 if (streq(name
, "capget"))
1553 else if (streq(name
, "capset"))
1555 else if (streq(name
, "prctl"))
1559 if (c
->syscall_allow_list
)
1560 return have_capget
&& have_capset
&& have_prctl
;
1562 return !(have_capget
|| have_capset
|| have_prctl
);
/* Helper used by the apply_*() functions as a guard before installing a seccomp filter. Queries
 * is_seccomp_available(); when seccomp is not detected, a debug message naming the skipped setting (msg)
 * is logged. NOTE(review): the return statements are not visible in this excerpt; callers treat the
 * result as the "skip" condition of an if (). */
1565 static bool skip_seccomp_unavailable(const char *msg
) {
1568 if (is_seccomp_available())
1571 log_debug("SECCOMP features not detected in the kernel, skipping %s", msg
);
/* Installs the system call filter of the context (the setting is reported as "SystemCallFilter=" when
 * seccomp is unavailable).
 *
 * The "negative" action is kill-process if syscall_errno is SECCOMP_ERROR_NUMBER_KILL, otherwise
 * SCMP_ACT_ERRNO(syscall_errno). With an allow-list the negative action is the default and listed calls
 * are allowed; otherwise everything is allowed by default and listed calls get the negative action. The
 * filter is finally loaded with seccomp_load_syscall_filter_set_raw(). */
1575 static int apply_syscall_filter(const ExecContext
*c
, const ExecParameters
*p
) {
1576 uint32_t negative_action
, default_action
, action
;
1582 if (!context_has_syscall_filters(c
))
1585 if (skip_seccomp_unavailable("SystemCallFilter="))
1588 negative_action
= c
->syscall_errno
== SECCOMP_ERROR_NUMBER_KILL
? scmp_act_kill_process() : SCMP_ACT_ERRNO(c
->syscall_errno
);
1590 if (c
->syscall_allow_list
) {
1591 default_action
= negative_action
;
1592 action
= SCMP_ACT_ALLOW
;
1594 default_action
= SCMP_ACT_ALLOW
;
1595 action
= negative_action
;
1598 /* Sending over exec_fd or handoff_timestamp_fd requires write() syscall. */
1599 if (p
->exec_fd
>= 0 || p
->handoff_timestamp_fd
>= 0) {
1600 r
= seccomp_filter_set_add_by_name(c
->syscall_filter
, c
->syscall_allow_list
, "write");
1605 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
, false);
/* Installs the system call logging filter of the context (reported as "SystemCallLog=" when seccomp is
 * unavailable). With a log allow-list, calls are allowed by default and listed ones get SCMP_ACT_LOG;
 * otherwise SCMP_ACT_LOG is the default and listed ones are plainly allowed.
 * NOTE(review): the trailing "old libseccomp" debug message is presumably the branch compiled when
 * SCMP_ACT_LOG is not available — the preprocessor lines are not visible here, confirm. */
1608 static int apply_syscall_log(const ExecContext
*c
, const ExecParameters
*p
) {
1610 uint32_t default_action
, action
;
1616 if (!context_has_syscall_logs(c
))
1620 if (skip_seccomp_unavailable("SystemCallLog="))
1623 if (c
->syscall_log_allow_list
) {
1624 /* Log nothing but the ones listed */
1625 default_action
= SCMP_ACT_ALLOW
;
1626 action
= SCMP_ACT_LOG
;
1628 /* Log everything but the ones listed */
1629 default_action
= SCMP_ACT_LOG
;
1630 action
= SCMP_ACT_ALLOW
;
1633 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_log
, action
, false);
1635 /* old libseccomp */
1636 log_debug( "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
/* Restricts the permitted system call architectures to the set in c->syscall_archs via
 * seccomp_restrict_archs(). The empty-set and seccomp-unavailable checks come first. */
1641 static int apply_syscall_archs(const ExecContext
*c
, const ExecParameters
*p
) {
1645 if (set_isempty(c
->syscall_archs
))
1648 if (skip_seccomp_unavailable("SystemCallArchitectures="))
1651 return seccomp_restrict_archs(c
->syscall_archs
);
/* Applies the address family restriction of the context: passes the address_families set and the
 * allow-list flag to seccomp_restrict_address_families(). Checked first: whether any restriction is
 * configured and whether seccomp is available. */
1654 static int apply_address_families(const ExecContext
*c
, const ExecParameters
*p
) {
1658 if (!context_has_address_families(c
))
1661 if (skip_seccomp_unavailable("RestrictAddressFamilies="))
1664 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_allow_list
);
/* Enables MemoryDenyWriteExecute= when c->memory_deny_write_execute is set. The prctl(PR_SET_MDWE)
 * interface is tried first; a failure with an errno other than EINVAL is returned as error, while EINVAL
 * is taken to mean the kernel lacks PR_SET_MDWE and leads to the seccomp based fallback
 * seccomp_memory_deny_write_execute(). */
1667 static int apply_memory_deny_write_execute(const ExecContext
*c
, const ExecParameters
*p
) {
1673 if (!c
->memory_deny_write_execute
)
1676 /* use prctl() if kernel supports it (6.3) */
1677 r
= prctl(PR_SET_MDWE
, PR_MDWE_REFUSE_EXEC_GAIN
, 0, 0, 0);
1679 log_debug("Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1682 if (r
< 0 && errno
!= EINVAL
)
1683 return log_debug_errno(errno
, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1684 /* else use seccomp */
1685 log_debug("Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1687 if (skip_seccomp_unavailable("MemoryDenyWriteExecute="))
1690 return seccomp_memory_deny_write_execute();
/* Calls seccomp_restrict_realtime() when c->restrict_realtime is set; the unset and
 * seccomp-unavailable checks come first. */
1693 static int apply_restrict_realtime(const ExecContext
*c
, const ExecParameters
*p
) {
1697 if (!c
->restrict_realtime
)
1700 if (skip_seccomp_unavailable("RestrictRealtime="))
1703 return seccomp_restrict_realtime();
/* Calls seccomp_restrict_suid_sgid() when c->restrict_suid_sgid is set; the unset and
 * seccomp-unavailable checks come first. */
1706 static int apply_restrict_suid_sgid(const ExecContext
*c
, const ExecParameters
*p
) {
1710 if (!c
->restrict_suid_sgid
)
1713 if (skip_seccomp_unavailable("RestrictSUIDSGID="))
1716 return seccomp_restrict_suid_sgid();
/* Calls seccomp_protect_sysctl() when c->protect_kernel_tunables is set; the unset and
 * seccomp-unavailable checks come first. */
1719 static int apply_protect_sysctl(const ExecContext
*c
, const ExecParameters
*p
) {
1723 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1724 * let's protect even those systems where this is left on in the kernel. */
1726 if (!c
->protect_kernel_tunables
)
1729 if (skip_seccomp_unavailable("ProtectKernelTunables="))
1732 return seccomp_protect_sysctl();
/* When c->protect_kernel_modules is set, loads the SYSCALL_FILTER_SET_MODULE filter set with default
 * action "allow" and SCMP_ACT_ERRNO(EPERM) for the calls in the set. */
1735 static int apply_protect_kernel_modules(const ExecContext
*c
, const ExecParameters
*p
) {
1739 /* Turn off module syscalls on ProtectKernelModules=yes */
1741 if (!c
->protect_kernel_modules
)
1744 if (skip_seccomp_unavailable("ProtectKernelModules="))
1747 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
), false);
/* Calls seccomp_protect_syslog() when c->protect_kernel_logs is set; the unset and
 * seccomp-unavailable checks come first. */
1750 static int apply_protect_kernel_logs(const ExecContext
*c
, const ExecParameters
*p
) {
1754 if (!c
->protect_kernel_logs
)
1757 if (skip_seccomp_unavailable("ProtectKernelLogs="))
1760 return seccomp_protect_syslog();
/* When c->protect_clock is set, loads the SYSCALL_FILTER_SET_CLOCK filter set with default action
 * "allow" and SCMP_ACT_ERRNO(EPERM) for the calls in the set. */
1763 static int apply_protect_clock(const ExecContext
*c
, const ExecParameters
*p
) {
1767 if (!c
->protect_clock
)
1770 if (skip_seccomp_unavailable("ProtectClock="))
1773 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_CLOCK
, SCMP_ACT_ERRNO(EPERM
), false);
/* When c->private_devices is set, loads the SYSCALL_FILTER_SET_RAW_IO filter set with default action
 * "allow" and SCMP_ACT_ERRNO(EPERM) for the calls in the set. */
1776 static int apply_private_devices(const ExecContext
*c
, const ExecParameters
*p
) {
1780 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1782 if (!c
->private_devices
)
1785 if (skip_seccomp_unavailable("PrivateDevices="))
1788 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
), false);
/* Passes c->restrict_namespaces to seccomp_restrict_namespaces() when
 * exec_context_restrict_namespaces_set() reports that a restriction is configured. */
1791 static int apply_restrict_namespaces(const ExecContext
*c
, const ExecParameters
*p
) {
1795 if (!exec_context_restrict_namespaces_set(c
))
1798 if (skip_seccomp_unavailable("RestrictNamespaces="))
1801 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
/* Locks the personality when c->lock_personality is set. The value locked is c->personality, or, if
 * that is PERSONALITY_INVALID, whatever opinionated_personality() yields; it is handed to
 * seccomp_lock_personality(). */
1804 static int apply_lock_personality(const ExecContext
*c
, const ExecParameters
*p
) {
1805 unsigned long personality
;
1811 if (!c
->lock_personality
)
1814 if (skip_seccomp_unavailable("LockPersonality="))
1817 personality
= c
->personality
;
1819 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1820 if (personality
== PERSONALITY_INVALID
) {
1822 r
= opinionated_personality(&personality
);
1827 return seccomp_lock_personality(personality
);
/* Applies the file system restriction of the context through the BPF map passed in
 * p->bpf_restrict_fs_map_fd. A negative map fd is logged at debug level and the setting is skipped;
 * otherwise the restrict_filesystems set, the cgroup id, the map fd and the allow-list flag are handed
 * to bpf_restrict_fs_update(). */
1833 static int apply_restrict_filesystems(const ExecContext
*c
, const ExecParameters
*p
) {
1839 if (!exec_context_restrict_filesystems_set(c
))
1842 if (p
->bpf_restrict_fs_map_fd
< 0) {
1843 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1844 log_debug("LSM BPF not supported, skipping RestrictFileSystems=");
1848 /* We are in a new binary, so dl-open again */
1853 return bpf_restrict_fs_update(c
->restrict_filesystems
, p
->cgroup_id
, p
->bpf_restrict_fs_map_fd
, c
->restrict_filesystems_allow_list
);
/* Implements ProtectHostname= for the context; nothing is done for PROTECT_HOSTNAME_NO.
 *
 * If UTS namespaces are supported, a new one is unshared. An unshare() failure that is neither a
 * "not supported" nor a privilege error is fatal (EXIT_NAMESPACE); the tolerated failures only produce a
 * warning. When unshare() succeeded and c->private_hostname is set, that hostname is installed with
 * sethostname_idempotent(), whose failure is reported with EXIT_NAMESPACE. Without UTS namespace
 * support a warning is logged instead.
 *
 * For PROTECT_HOSTNAME_YES the seccomp hostname protection is installed in addition; its failure is
 * reported with EXIT_SECCOMP.
 *
 * ret_exit_status must be non-NULL; it is written on the error paths shown below. */
1857 static int apply_protect_hostname(const ExecContext
*c
, const ExecParameters
*p
, int *ret_exit_status
) {
1862 assert(ret_exit_status
);
1864 if (c
->protect_hostname
== PROTECT_HOSTNAME_NO
)
1867 if (namespace_type_supported(NAMESPACE_UTS
)) {
1868 if (unshare(CLONE_NEWUTS
) < 0) {
1869 if (!ERRNO_IS_NOT_SUPPORTED(errno
) && !ERRNO_IS_PRIVILEGE(errno
)) {
1870 *ret_exit_status
= EXIT_NAMESPACE
;
1871 return log_error_errno(errno
, "Failed to set up UTS namespacing: %m");
1874 log_warning("ProtectHostname=%s is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.",
1875 protect_hostname_to_string(c
->protect_hostname
));
1877 } else if (c
->private_hostname
) {
1878 r
= sethostname_idempotent(c
->private_hostname
);
1880 *ret_exit_status
= EXIT_NAMESPACE
;
1881 return log_error_errno(r
, "Failed to set private hostname '%s': %m", c
->private_hostname
);
1885 log_warning("ProtectHostname=%s is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.",
1886 protect_hostname_to_string(c
->protect_hostname
));
1889 if (c
->protect_hostname
== PROTECT_HOSTNAME_YES
) {
1890 if (skip_seccomp_unavailable("ProtectHostname="))
1893 r
= seccomp_protect_hostname();
1895 *ret_exit_status
= EXIT_SECCOMP
;
1896 return log_error_errno(r
, "Failed to apply hostname restrictions: %m");
/* Runs the idle pipe handshake on the four descriptors in idle_pipe[].
 *
 * Entries 1 and 2 are closed right away. If entry 0 is open, we wait on it for POLLHUP for up to
 * IDLE_TIMEOUT_USEC; if that wait times out and entry 3 is open, a single byte is written to entry 3 and
 * we wait on entry 0 once more for up to IDLE_TIMEOUT2_USEC. Entries 0 and 3 are closed afterwards.
 * Each closed slot is overwritten with the return value of safe_close(). */
1904 static void do_idle_pipe_dance(int idle_pipe
[static 4]) {
1907 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1908 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1910 if (idle_pipe
[0] >= 0) {
1913 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1915 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1918 /* Signal systemd that we are bored and want to continue. */
1919 n
= write(idle_pipe
[3], "x", 1);
1921 /* Wait for systemd to react to the signal above. */
1922 (void) fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1925 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1929 idle_pipe
[3] = safe_close(idle_pipe
[3]);
/* Forward declaration of the lookup function that DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING()
 * generates from the table below. */
1932 static const char *exec_directory_env_name_to_string(ExecDirectoryType t
);
1934 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
1935 * the service payload in. */
1936 static const char* const exec_directory_env_name_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
1937 [EXEC_DIRECTORY_RUNTIME
] = "RUNTIME_DIRECTORY",
1938 [EXEC_DIRECTORY_STATE
] = "STATE_DIRECTORY",
1939 [EXEC_DIRECTORY_CACHE
] = "CACHE_DIRECTORY",
1940 [EXEC_DIRECTORY_LOGS
] = "LOGS_DIRECTORY",
1941 [EXEC_DIRECTORY_CONFIGURATION
] = "CONFIGURATION_DIRECTORY",
1944 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name
, ExecDirectoryType
);
/* Assembles the environment variables generated for the payload and hands the resulting string array
 * to the caller through *ret (ownership is transferred with TAKE_PTR()).
 *
 * The array is allocated up front with room for N_ENV_VARS fixed entries, one entry per
 * ExecDirectoryType and a terminating NULL; the assert() near the end verifies that the number of
 * appended entries stayed within that bound, so N_ENV_VARS must be raised whenever a new fixed variable
 * is added here.
 *
 * Variables appended below, each only under the condition visible next to it: LISTEN_PID, LISTEN_FDS,
 * LISTEN_FDNAMES, WATCHDOG_PID, WATCHDOG_USEC, SYSTEMD_NSS_DYNAMIC_BYPASS, USER, LOGNAME, HOME, SHELL,
 * INVOCATION_ID, JOURNAL_STREAM, LOG_NAMESPACE, one variable per configured directory type (names from
 * exec_directory_env_name_table, value is the colon-separated list of prefixed paths),
 * CREDENTIALS_DIRECTORY, SYSTEMD_EXEC_PID, MEMORY_PRESSURE_WATCH, MEMORY_PRESSURE_WRITE, NOTIFY_SOCKET
 * and TMPDIR. */
1946 static int build_environment(
1947 const ExecContext
*c
,
1948 const ExecParameters
*p
,
1949 const CGroupContext
*cgroup_context
,
1952 const char *username
,
1954 dev_t journal_stream_dev
,
1955 ino_t journal_stream_ino
,
1956 const char *memory_pressure_path
,
1957 bool needs_sandboxing
,
1960 _cleanup_strv_free_
char **our_env
= NULL
;
1967 assert(cgroup_context
);
1970 #define N_ENV_VARS 19
1971 our_env
= new0(char*, N_ENV_VARS
+ _EXEC_DIRECTORY_TYPE_MAX
+ 1);
1976 _cleanup_free_
char *joined
= NULL
;
1978 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1980 our_env
[n_env
++] = x
;
1982 if (asprintf(&x
, "LISTEN_FDS=%zu", n_fds
) < 0)
1984 our_env
[n_env
++] = x
;
1986 joined
= strv_join(p
->fd_names
, ":");
1990 x
= strjoin("LISTEN_FDNAMES=", joined
);
1993 our_env
[n_env
++] = x
;
1996 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1997 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1999 our_env
[n_env
++] = x
;
2001 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
2003 our_env
[n_env
++] = x
;
2006 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
2007 * Varlink calls back to us to look up dynamic users in PID 1. Break the deadlock between D-Bus and
2008 * PID 1 by disabling use of PID 1's NSS interface for looking up dynamic users. */
2009 if (p
->flags
& EXEC_NSS_DYNAMIC_BYPASS
) {
2010 x
= strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
2013 our_env
[n_env
++] = x
;
2016 /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
2017 * could cause problems for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
2018 * really make much sense since we're not logged in. Hence we conditionalize the three based on
2019 * SetLoginEnvironment= switch. */
2020 if (!username
&& !c
->dynamic_user
&& p
->runtime_scope
== RUNTIME_SCOPE_SYSTEM
) {
2023 r
= get_fixed_user("root", /* prefer_nss = */ false, &username
, NULL
, NULL
, &home
, &shell
);
2025 return log_debug_errno(r
, "Failed to determine user credentials for root: %m");
2028 bool set_user_login_env
= exec_context_get_set_login_environment(c
);
2031 x
= strjoin("USER=", username
);
2034 our_env
[n_env
++] = x
;
2036 if (set_user_login_env
) {
2037 x
= strjoin("LOGNAME=", username
);
2040 our_env
[n_env
++] = x
;
2044 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
2045 * (i.e. are "/" or "/bin/nologin"). */
2047 if (home
&& set_user_login_env
&& !empty_or_root(home
)) {
2048 x
= strjoin("HOME=", home
);
2052 path_simplify(x
+ 5);
2053 our_env
[n_env
++] = x
;
2056 if (shell
&& set_user_login_env
&& !shell_is_placeholder(shell
)) {
2057 x
= strjoin("SHELL=", shell
);
2061 path_simplify(x
+ 6);
2062 our_env
[n_env
++] = x
;
2065 if (!sd_id128_is_null(p
->invocation_id
)) {
2066 assert(p
->invocation_id_string
);
2068 x
= strjoin("INVOCATION_ID=", p
->invocation_id_string
);
2072 our_env
[n_env
++] = x
;
2075 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
2076 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
2079 our_env
[n_env
++] = x
;
2082 if (c
->log_namespace
) {
2083 x
= strjoin("LOG_NAMESPACE=", c
->log_namespace
);
2087 our_env
[n_env
++] = x
;
2090 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2091 _cleanup_free_
char *joined
= NULL
;
2097 if (c
->directories
[t
].n_items
== 0)
2100 n
= exec_directory_env_name_to_string(t
);
2104 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
2105 _cleanup_free_
char *prefixed
= NULL
;
2107 prefixed
= path_join(p
->prefix
[t
], c
->directories
[t
].items
[i
].path
);
2111 if (!strextend_with_separator(&joined
, ":", prefixed
))
2115 x
= strjoin(n
, "=", joined
);
2119 our_env
[n_env
++] = x
;
2122 _cleanup_free_
char *creds_dir
= NULL
;
2123 r
= exec_context_get_credential_directory(c
, p
, p
->unit_id
, &creds_dir
);
2127 x
= strjoin("CREDENTIALS_DIRECTORY=", creds_dir
);
2131 our_env
[n_env
++] = x
;
2134 if (asprintf(&x
, "SYSTEMD_EXEC_PID=" PID_FMT
, getpid_cached()) < 0)
2137 our_env
[n_env
++] = x
;
2139 if (memory_pressure_path
) {
2140 x
= strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path
);
2144 our_env
[n_env
++] = x
;
2146 if (!path_equal(memory_pressure_path
, "/dev/null")) {
2147 _cleanup_free_
char *b
= NULL
, *e
= NULL
;
2149 if (asprintf(&b
, "%s " USEC_FMT
" " USEC_FMT
,
2150 MEMORY_PRESSURE_DEFAULT_TYPE
,
2151 cgroup_context
->memory_pressure_threshold_usec
== USEC_INFINITY
? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC
:
2152 CLAMP(cgroup_context
->memory_pressure_threshold_usec
, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC
),
2153 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC
) < 0)
2156 if (base64mem(b
, strlen(b
) + 1, &e
) < 0)
2159 x
= strjoin("MEMORY_PRESSURE_WRITE=", e
);
2163 our_env
[n_env
++] = x
;
2167 if (p
->notify_socket
) {
2168 x
= strjoin("NOTIFY_SOCKET=", exec_get_private_notify_socket_path(c
, p
, needs_sandboxing
) ?: p
->notify_socket
);
2172 our_env
[n_env
++] = x
;
2175 assert(c
->private_var_tmp
>= 0 && c
->private_var_tmp
< _PRIVATE_TMP_MAX
);
2176 if (needs_sandboxing
&& c
->private_tmp
!= c
->private_var_tmp
) {
2177 assert(c
->private_tmp
== PRIVATE_TMP_DISCONNECTED
);
2178 assert(c
->private_var_tmp
== PRIVATE_TMP_NO
);
2180 /* When private tmpfs is enabled only on /tmp/, then explicitly set $TMPDIR to suggest the
2181 * service to use /tmp/. */
2183 x
= strdup("TMPDIR=/tmp");
2187 our_env
[n_env
++] = x
;
2190 assert(n_env
<= N_ENV_VARS
+ _EXEC_DIRECTORY_TYPE_MAX
);
2193 *ret
= TAKE_PTR(our_env
);
/* Builds the list of pass-through environment assignments for the names in c->pass_environment.
 *
 * For every name a "NAME=value" string is created and appended to a NULL-terminated string array that
 * is grown with GREEDY_REALLOC() (room for the new entry plus the terminator). The finished array is
 * handed to the caller through *ret.
 * NOTE(review): where the value v comes from is not visible in this excerpt — presumably the current
 * process environment; confirm against the full file. */
2198 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
2199 _cleanup_strv_free_
char **pass_env
= NULL
;
2205 STRV_FOREACH(i
, c
->pass_environment
) {
2206 _cleanup_free_
char *x
= NULL
;
2212 x
= strjoin(*i
, "=", v
);
2216 if (!GREEDY_REALLOC(pass_env
, n_env
+ 2))
2219 pass_env
[n_env
++] = TAKE_PTR(x
);
2220 pass_env
[n_env
] = NULL
;
2223 *ret
= TAKE_PTR(pass_env
);
/* Moves the calling process into a new user namespace and installs the UID/GID maps for it.
 *
 * private_users selects the mapping: PRIVATE_USERS_NO is checked first, PRIVATE_USERS_IDENTITY maps
 * "0 0 65536", PRIVATE_USERS_FULL maps the whole range using two extents, and the remaining case maps
 * ouid/ogid to themselves plus, when CAP_SETUID/CAP_SETGID is held and uid/gid is valid and differs,
 * uid/gid to themselves as well.
 *
 * A helper child "(sd-userns)" is forked before unshare(CLONE_NEWUSER). It blocks on an eventfd until
 * the parent has unshared, then writes the parent's setgroups file ("allow" or "deny" depending on
 * allow_setgroups), gid_map and uid_map files in procfs, reporting failures through errno_pipe. The
 * parent reads the error code from the pipe and finally waits for the child. */
2227 static int setup_private_users(PrivateUsers private_users
, uid_t ouid
, gid_t ogid
, uid_t uid
, gid_t gid
, bool allow_setgroups
) {
2228 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
2229 _cleanup_close_pair_
int errno_pipe
[2] = EBADF_PAIR
;
2230 _cleanup_close_
int unshare_ready_fd
= -EBADF
;
2231 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
2236 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2237 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2238 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2239 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2240 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2241 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2242 * continues execution normally.
2243 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2244 * does not need CAP_SETUID to write the single line mapping to itself. */
2246 if (private_users
== PRIVATE_USERS_NO
)
2249 if (private_users
== PRIVATE_USERS_IDENTITY
) {
2250 uid_map
= strdup("0 0 65536\n");
2253 } else if (private_users
== PRIVATE_USERS_FULL
) {
2254 /* Map all UID/GID from original to new user namespace. We can't use `0 0 UINT32_MAX` because
2255 * this is the same UID/GID map as the init user namespace and systemd's running_in_userns()
2256 * checks whether it's in a user namespace by comparing uid_map/gid_map to `0 0 UINT32_MAX`.
2257 * Thus, we still map all UIDs/GIDs but do it using two extents to differentiate the new user
2258 * namespace from the init namespace:
2260 * 1 1 UINT32_MAX - 1
2262 * systemd will remove the heuristic in running_in_userns() and use namespace inodes in version 258
2263 * (PR #35382). But some users may be running a container image with older systemd < 258 so we keep
2264 * this uid_map/gid_map hack until version 259 for version N-1 compatibility.
2266 * TODO: Switch to `0 0 UINT32_MAX` in systemd v259.
2268 * Note the kernel defines the UID range between 0 and UINT32_MAX so we map all UIDs even though
2269 * the UID range beyond INT32_MAX (i.e. the range above the signed 32-bit range) is
2270 * icky. For example, setfsuid() returns the old UID as signed integer. But units can decide to
2271 * use these UIDs/GIDs so we need to map them. */
2272 r
= asprintf(&uid_map
, "0 0 1\n"
2273 "1 1 " UID_FMT
"\n", (uid_t
) (UINT32_MAX
- 1));
2276 /* Can only set up multiple mappings with CAP_SETUID. */
2277 } else if (have_effective_cap(CAP_SETUID
) > 0 && uid
!= ouid
&& uid_is_valid(uid
)) {
2278 r
= asprintf(&uid_map
,
2279 UID_FMT
" " UID_FMT
" 1\n" /* Map $OUID → $OUID */
2280 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
2281 ouid
, ouid
, uid
, uid
);
2285 r
= asprintf(&uid_map
,
2286 UID_FMT
" " UID_FMT
" 1\n", /* Map $OUID → $OUID */
2292 if (private_users
== PRIVATE_USERS_IDENTITY
) {
2293 gid_map
= strdup("0 0 65536\n");
2296 } else if (private_users
== PRIVATE_USERS_FULL
) {
2297 r
= asprintf(&gid_map
, "0 0 1\n"
2298 "1 1 " GID_FMT
"\n", (gid_t
) (UINT32_MAX
- 1));
2301 /* Can only set up multiple mappings with CAP_SETGID. */
2302 } else if (have_effective_cap(CAP_SETGID
) > 0 && gid
!= ogid
&& gid_is_valid(gid
)) {
2303 r
= asprintf(&gid_map
,
2304 GID_FMT
" " GID_FMT
" 1\n" /* Map $OGID → $OGID */
2305 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
2306 ogid
, ogid
, gid
, gid
);
2310 r
= asprintf(&gid_map
,
2311 GID_FMT
" " GID_FMT
" 1\n", /* Map $OGID -> $OGID */
2317 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2319 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
2320 if (unshare_ready_fd
< 0)
2323 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2325 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
2328 r
= safe_fork("(sd-userns)", FORK_RESET_SIGNALS
|FORK_DEATHSIG_SIGKILL
, &pid
);
2332 _cleanup_close_
int fd
= -EBADF
;
2336 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2337 * here, after the parent opened its own user namespace. */
2340 errno_pipe
[0] = safe_close(errno_pipe
[0]);
2342 /* Wait until the parent unshared the user namespace */
2343 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
2344 report_errno_and_exit(errno_pipe
[1], -errno
);
2346 /* Disable the setgroups() system call in the child user namespace, for good, unless PrivateUsers=full
2347 * and using the system service manager. */
2348 a
= procfs_file_alloca(ppid
, "setgroups");
2349 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2351 if (errno
!= ENOENT
) {
2352 r
= log_debug_errno(errno
, "Failed to open %s: %m", a
);
2353 report_errno_and_exit(errno_pipe
[1], r
);
2356 /* If the file is missing the kernel is too old, let's continue anyway. */
2358 const char *setgroups
= allow_setgroups
? "allow\n" : "deny\n";
2359 if (write(fd
, setgroups
, strlen(setgroups
)) < 0) {
2360 r
= log_debug_errno(errno
, "Failed to write '%s' to %s: %m", setgroups
, a
);
2361 report_errno_and_exit(errno_pipe
[1], r
);
2364 fd
= safe_close(fd
);
2367 /* First write the GID map */
2368 a
= procfs_file_alloca(ppid
, "gid_map");
2369 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2371 r
= log_debug_errno(errno
, "Failed to open %s: %m", a
);
2372 report_errno_and_exit(errno_pipe
[1], r
);
2375 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
2376 r
= log_debug_errno(errno
, "Failed to write GID map to %s: %m", a
);
2377 report_errno_and_exit(errno_pipe
[1], r
);
2380 fd
= safe_close(fd
);
2382 /* Then write the UID map */
2383 a
= procfs_file_alloca(ppid
, "uid_map");
2384 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2386 r
= log_debug_errno(errno
, "Failed to open %s: %m", a
);
2387 report_errno_and_exit(errno_pipe
[1], r
);
2390 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
2391 r
= log_debug_errno(errno
, "Failed to write UID map to %s: %m", a
);
2392 report_errno_and_exit(errno_pipe
[1], r
);
2395 _exit(EXIT_SUCCESS
);
2398 errno_pipe
[1] = safe_close(errno_pipe
[1]);
2400 if (unshare(CLONE_NEWUSER
) < 0)
2401 return log_debug_errno(errno
, "Failed to unshare user namespace: %m");
2403 /* Let the child know that the namespace is ready now */
2404 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
2407 /* Try to read an error code from the child */
2408 n
= read(errno_pipe
[0], &r
, sizeof(r
));
2411 if (n
== sizeof(r
)) { /* an error code was sent to us */
2416 if (n
!= 0) /* on success we should have read 0 bytes */
2419 r
= wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid
), 0);
2422 if (r
!= EXIT_SUCCESS
) /* If something strange happened with the child, let's consider this fatal, too */
/* Probes whether a fresh procfs instance can be mounted from the current context.
 *
 * A child "(sd-proc-check)" is forked into its own mount and PID namespace and attempts to mount proc on
 * /dev/shm/. The child sends the mount result through errno_pipe and exits; the parent reads it, treats a
 * privilege error as the expected "cannot mount" outcome, and then waits for the child, considering any
 * exit status other than EXIT_SUCCESS an -EIO failure.
 * NOTE(review): the concrete values returned for the "can mount" and "cannot mount" outcomes are not
 * visible in this excerpt — confirm against the full file. */
2428 static int can_mount_proc(void) {
2429 _cleanup_close_pair_
int errno_pipe
[2] = EBADF_PAIR
;
2430 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
2434 /* If running via unprivileged user manager and /proc/ is masked (e.g. /proc/kmsg is over-mounted with tmpfs
2435 * like systemd-nspawn does), then mounting /proc/ will fail with EPERM. This is due to a kernel restriction
2436 * where unprivileged user namespaces cannot mount a less restrictive instance of /proc. */
2438 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2440 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
2441 return log_debug_errno(errno
, "Failed to create pipe for communicating with child process (sd-proc-check): %m");
2443 /* Fork a child process into its own mount and PID namespace. Note safe_fork() already remounts / as SLAVE
2444 * with FORK_MOUNTNS_SLAVE. */
2445 r
= safe_fork("(sd-proc-check)",
2446 FORK_RESET_SIGNALS
|FORK_DEATHSIG_SIGKILL
|FORK_NEW_MOUNTNS
|FORK_MOUNTNS_SLAVE
|FORK_NEW_PIDNS
, &pid
);
2448 return log_debug_errno(r
, "Failed to fork child process (sd-proc-check): %m");
2450 errno_pipe
[0] = safe_close(errno_pipe
[0]);
2452 /* Try mounting /proc on /dev/shm/. No need to clean up the mount since the mount
2453 * namespace will be cleaned up once the process exits. */
2454 r
= mount_follow_verbose(LOG_DEBUG
, "proc", "/dev/shm/", "proc", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
2456 (void) write(errno_pipe
[1], &r
, sizeof(r
));
2457 _exit(EXIT_FAILURE
);
2460 _exit(EXIT_SUCCESS
);
2463 errno_pipe
[1] = safe_close(errno_pipe
[1]);
2465 /* Try to read an error code from the child */
2466 n
= read(errno_pipe
[0], &r
, sizeof(r
));
2468 return log_debug_errno(errno
, "Failed to read errno from pipe with child process (sd-proc-check): %m");
2469 if (n
== sizeof(r
)) { /* an error code was sent to us */
2470 /* This is the expected case where proc cannot be mounted due to permissions. */
2471 if (ERRNO_IS_NEG_PRIVILEGE(r
))
2478 if (n
!= 0) /* on success we should have read 0 bytes */
2481 r
= wait_for_terminate_and_check("(sd-proc-check)", TAKE_PID(pid
), 0 /* flags= */);
2483 return log_debug_errno(r
, "Failed to wait for (sd-proc-check) child process to terminate: %m");
2484 if (r
!= EXIT_SUCCESS
) /* If something strange happened with the child, let's consider this fatal, too */
2485 return log_debug_errno(SYNTHETIC_ERRNO(EIO
), "Child process (sd-proc-check) exited with unexpected exit status '%d'.", r
);
/* Forks the process into a new PID namespace and continues execution in the child only.
 *
 * Requires p->pidref_transport_fd to be a valid descriptor. The fork uses FORK_NEW_PIDNS|FORK_DETACH.
 * The parent sends the child's PID over p->pidref_transport_fd with send_one_fd_iov(), writes the result
 * code of that send to errno_pipe and then _exit()s. The child closes its copy of the transport fd,
 * reads the code from the pipe and fails if the read errors out, is short, or carries an error code. */
2490 static int setup_private_pids(const ExecContext
*c
, ExecParameters
*p
) {
2491 _cleanup_(pidref_done
) PidRef pidref
= PIDREF_NULL
;
2492 _cleanup_close_pair_
int errno_pipe
[2] = EBADF_PAIR
;
2498 assert(p
->pidref_transport_fd
>= 0);
2500 /* The first process created after unsharing a pid namespace becomes PID 1 in the pid namespace, so
2501 * we have to fork after unsharing the pid namespace to become PID 1. The parent sends the child
2502 * pidref to the manager and exits while the child process continues with the rest of exec_invoke()
2503 * and finally executes the actual payload. */
2505 /* Create a communication channel so that the parent can tell the child a proper error code in case it
2506 * failed to send child pidref to the manager. */
2507 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
2508 return log_debug_errno(errno
, "Failed to create pipe for communicating with parent process: %m");
2510 /* Set FORK_DETACH to immediately re-parent the child process to the invoking manager process. */
2511 r
= pidref_safe_fork("(sd-pidns-child)", FORK_NEW_PIDNS
|FORK_DETACH
, &pidref
);
2513 return log_debug_errno(r
, "Failed to fork child into new pid namespace: %m");
2515 errno_pipe
[0] = safe_close(errno_pipe
[0]);
2517 /* In the parent process, we send the child pidref to the manager and exit.
2518 * If PIDFD is not supported, only the child PID is sent. The server then
2519 * uses the child PID to set the new exec main process. */
2520 q
= send_one_fd_iov(
2521 p
->pidref_transport_fd
,
2523 &IOVEC_MAKE(&pidref
.pid
, sizeof(pidref
.pid
)),
2526 /* Send error code to child process. */
2527 (void) write(errno_pipe
[1], &q
, sizeof(q
));
2528 /* Exit here so we only go through the destructors in exec_invoke only once - in the child - as
2529 * some destructors have external effects. The main codepaths continue in the child process. */
2530 _exit(q
< 0 ? EXIT_FAILURE
: EXIT_SUCCESS
);
2533 errno_pipe
[1] = safe_close(errno_pipe
[1]);
2534 p
->pidref_transport_fd
= safe_close(p
->pidref_transport_fd
);
2536 /* Try to read an error code from the parent. Note a child process cannot wait for the parent so we always
2537 * receive an errno even on success. */
2538 n
= read(errno_pipe
[0], &r
, sizeof(r
));
2540 return log_debug_errno(errno
, "Failed to read errno from pipe with parent process: %m");
2542 return log_debug_errno(SYNTHETIC_ERRNO(EIO
), "Failed to read enough bytes from pipe with parent process");
2544 return log_debug_errno(r
, "Failed to send child pidref to manager: %m");
2546 /* NOTE! This function returns in the child process only. */
/* Creates one symlink per entry of the string array symlinks, all pointing at the same source.
 *
 * Both the source and every link path are joined onto root. For each link the parent directories are
 * created with mode 0755 (mkdir_parents_label()) and the link itself is made with
 * symlink_idempotent(), passing true as its third argument. */
2550 static int create_many_symlinks(const char *root
, const char *source
, char **symlinks
) {
2551 _cleanup_free_
char *src_abs
= NULL
;
2556 src_abs
= path_join(root
, source
);
2560 STRV_FOREACH(dst
, symlinks
) {
2561 _cleanup_free_
char *dst_abs
= NULL
;
2563 dst_abs
= path_join(root
, *dst
);
2567 r
= mkdir_parents_label(dst_abs
, 0755);
2571 r
= symlink_idempotent(src_abs
, dst_abs
, true);
2579 static int setup_exec_directory(
2580 const ExecContext
*context
,
2581 const ExecParameters
*params
,
2584 ExecDirectoryType type
,
2585 bool needs_mount_namespace
,
2588 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
2589 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
2590 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
2591 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
2592 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
2593 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
2599 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
2600 assert(exit_status
);
2602 if (!params
->prefix
[type
])
2605 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
2606 if (!uid_is_valid(uid
))
2608 if (!gid_is_valid(gid
))
2612 FOREACH_ARRAY(i
, context
->directories
[type
].items
, context
->directories
[type
].n_items
) {
2613 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
2615 p
= path_join(params
->prefix
[type
], i
->path
);
2621 r
= mkdir_parents_label(p
, 0755);
2625 if (IN_SET(type
, EXEC_DIRECTORY_STATE
, EXEC_DIRECTORY_LOGS
) && params
->runtime_scope
== RUNTIME_SCOPE_USER
) {
2627 /* If we are in user mode, and a configuration directory exists but a state directory
2628 * doesn't exist, then we likely are upgrading from an older systemd version that
2629 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2630 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2631 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
2632 * separated. If a service has both dirs configured but only the configuration dir
2633 * exists and the state dir does not, we assume we are looking at an update
2634 * situation. Hence, create a compatibility symlink, so that all expectations are
2637 * (We also do something similar with the log directory, which still doesn't exist in
2638 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2640 /* this assumes the state dir is always created before the configuration dir */
2641 assert_cc(EXEC_DIRECTORY_STATE
< EXEC_DIRECTORY_LOGS
);
2642 assert_cc(EXEC_DIRECTORY_LOGS
< EXEC_DIRECTORY_CONFIGURATION
);
2644 r
= access_nofollow(p
, F_OK
);
2646 _cleanup_free_
char *q
= NULL
;
2648 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2649 * under the configuration hierarchy. */
2651 if (type
== EXEC_DIRECTORY_STATE
)
2652 q
= path_join(params
->prefix
[EXEC_DIRECTORY_CONFIGURATION
], i
->path
);
2653 else if (type
== EXEC_DIRECTORY_LOGS
)
2654 q
= path_join(params
->prefix
[EXEC_DIRECTORY_CONFIGURATION
], "log", i
->path
);
2656 assert_not_reached();
2662 r
= access_nofollow(q
, F_OK
);
2664 /* It does exist! This hence looks like an update. Symlink the
2665 * configuration directory into the state directory. */
2667 r
= symlink_idempotent(q
, p
, /* make_relative= */ true);
2671 log_notice("Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p
, q
);
2673 } else if (r
!= -ENOENT
)
2674 log_warning_errno(r
, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q
);
2677 log_warning_errno(r
, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p
);
2680 if (exec_directory_is_private(context
, type
)) {
2681 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2682 * case we want to avoid leaving a directory around fully accessible that is owned by
2683 * a dynamic user whose UID is later on reused. To lock this down we use the same
2684 * trick used by container managers to prohibit host users to get access to files of
2685 * the same UID in containers: we place everything inside a directory that has an
2686 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2687 * for unprivileged host code. We then use fs namespacing to make this directory
2688 * permeable for the service itself.
2690 * Specifically: for a service which wants a special directory "foo/" we first create
2691 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2692 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2693 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2694 * unprivileged host users can't look into it. Inside of the namespace of the unit
2695 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2696 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2697 * for the service and making sure it only gets access to the dirs it needs but no
2698 * others. Tricky? Yes, absolutely, but it works!
2700 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2701 * to be owned by the service itself.
2703 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2704 * for sharing files or sockets with other services. */
2706 pp
= path_join(params
->prefix
[type
], "private");
2712 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2713 r
= mkdir_safe_label(pp
, 0700, 0, 0, MKDIR_WARN_MODE
);
2717 if (!path_extend(&pp
, i
->path
)) {
2722 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2723 r
= mkdir_parents_label(pp
, 0755);
2727 if (is_dir(p
, false) > 0 &&
2728 (access_nofollow(pp
, F_OK
) == -ENOENT
)) {
2730 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2731 * it over. Most likely the service has been upgraded from one that didn't use
2732 * DynamicUser=1, to one that does. */
2734 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2735 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2736 exec_directory_type_to_string(type
), p
, pp
);
2738 r
= RET_NERRNO(rename(p
, pp
));
2742 /* Otherwise, create the actual directory for the service */
2744 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
2745 if (r
< 0 && r
!= -EEXIST
)
2749 if (!FLAGS_SET(i
->flags
, EXEC_DIRECTORY_ONLY_CREATE
)) {
2750 /* And link it up from the original place.
2752 * 1) If a mount namespace is going to be used, then this symlink remains on
2753 * the host, and a new one for the child namespace will be created later.
2754 * 2) It is not necessary to create this symlink when one of its parent
2755 * directories is specified and already created. E.g.
2756 * StateDirectory=foo foo/bar
2757 * In that case, the inode points to pp and p for "foo/bar" are the same:
2758 * pp = "/var/lib/private/foo/bar"
2759 * p = "/var/lib/foo/bar"
2760 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2761 * we do not need to create the symlink, but we cannot create the symlink.
2762 * See issue #24783. */
2763 r
= symlink_idempotent(pp
, p
, true);
2769 _cleanup_free_
char *target
= NULL
;
2771 if (EXEC_DIRECTORY_TYPE_SHALL_CHOWN(type
) &&
2772 readlink_and_make_absolute(p
, &target
) >= 0) {
2773 _cleanup_free_
char *q
= NULL
, *q_resolved
= NULL
, *target_resolved
= NULL
;
2775 /* This already exists and is a symlink? Interesting. Maybe it's one created
2776 * by DynamicUser=1 (see above)?
2778 * We do this for all directory types except for ConfigurationDirectory=,
2779 * since they all support the private/ symlink logic at least in some
2780 * configurations, see above. */
2782 r
= chase(target
, NULL
, 0, &target_resolved
, NULL
);
2786 q
= path_join(params
->prefix
[type
], "private", i
->path
);
2792 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2793 r
= chase(q
, NULL
, CHASE_NONEXISTENT
, &q_resolved
, NULL
);
2797 if (path_equal(q_resolved
, target_resolved
)) {
2799 /* Hmm, apparently DynamicUser= was once turned on for this service,
2800 * but is no longer. Let's move the directory back up. */
2802 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2803 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2804 exec_directory_type_to_string(type
), q
, p
);
2806 r
= RET_NERRNO(unlink(p
));
2810 r
= RET_NERRNO(rename(q
, p
));
2816 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2821 if (!EXEC_DIRECTORY_TYPE_SHALL_CHOWN(type
)) {
2824 /* Don't change the owner/access mode of the configuration directory,
2825 * as in the common case it is not written to by a service, and shall
2826 * not be writable. */
2828 r
= RET_NERRNO(stat(p
, &st
));
2832 /* Still complain if the access mode doesn't match */
2833 if (((st
.st_mode
^ context
->directories
[type
].mode
) & 07777) != 0)
2834 log_warning("%s \'%s\' already exists but the mode is different. "
2835 "(File system: %o %sMode: %o)",
2836 exec_directory_type_to_string(type
), i
->path
,
2837 st
.st_mode
& 07777, exec_directory_type_to_string(type
), context
->directories
[type
].mode
& 07777);
2844 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2845 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2846 * current UID/GID ownership.) */
2847 const char *target_dir
= pp
?: p
;
2848 r
= chmod_and_chown(target_dir
, context
->directories
[type
].mode
, UID_INVALID
, GID_INVALID
);
2852 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2853 * available to user code anyway */
2854 if (params
->runtime_scope
!= RUNTIME_SCOPE_SYSTEM
)
2857 int idmapping_supported
= is_idmapping_supported(target_dir
);
2858 if (idmapping_supported
< 0) {
2859 r
= log_debug_errno(idmapping_supported
, "Unable to determine if ID mapping is supported on mount '%s': %m", target_dir
);
2863 log_debug("ID-mapping is%ssupported for exec directory %s", idmapping_supported
? " " : " not ", target_dir
);
2865 /* Change the ownership of the whole tree, if necessary. When dynamic users are used we
2866 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2867 * assignments to exist. */
2868 uid_t chown_uid
= uid
;
2869 gid_t chown_gid
= gid
;
2870 bool do_chown
= false;
2872 if (uid
== 0 || gid
== 0 || !idmapping_supported
) {
2874 i
->idmapped
= false;
2876 /* Use 'nobody' uid/gid for exec directories if ID-mapping is supported. For backward compatibility,
2877 * continue doing chmod/chown if the directory was chmod/chowned before (if uid/gid is not 'nobody') */
2879 r
= RET_NERRNO(stat(target_dir
, &st
));
2883 if (st
.st_uid
== UID_NOBODY
&& st
.st_gid
== GID_NOBODY
) {
2886 } else if (exec_directory_is_private(context
, type
) && st
.st_uid
== 0 && st
.st_gid
== 0) {
2887 chown_uid
= UID_NOBODY
;
2888 chown_gid
= GID_NOBODY
;
2893 i
->idmapped
= false;
2898 r
= path_chown_recursive(target_dir
, chown_uid
, chown_gid
, context
->dynamic_user
? 01777 : 07777, AT_SYMLINK_FOLLOW
);
2904 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2905 * they are set up later, to allow configuring empty var/run/etc. */
2906 if (!needs_mount_namespace
)
2907 FOREACH_ARRAY(i
, context
->directories
[type
].items
, context
->directories
[type
].n_items
) {
2908 r
= create_many_symlinks(params
->prefix
[type
], i
->path
, i
->symlinks
);
2916 *exit_status
= exit_status_table
[type
];
/* Applies the SMACK process label for the process that is about to be executed.
 *
 * context:       execution context; its smack_process_label, if set, is applied directly.
 * params:        execution parameters; fallback_smack_process_label is consulted only when no explicit
 *                label is configured in the context.
 * executable_fd: open file descriptor of the executable (asserted to be >= 0); its SMACK_ATTR_EXEC label
 *                is read when the fallback path is taken.
 *
 * NOTE(review): this excerpt does not show the local declarations, the error checks after each call or the
 * return statements, so the exact return contract cannot be stated from here. */
2921 static int setup_smack(
2922 const ExecContext
*context
,
2923 const ExecParameters
*params
,
2924 int executable_fd
) {
2929 assert(executable_fd
>= 0);
/* An explicitly configured process label takes precedence over everything else. */
2931 if (context
->smack_process_label
) {
2932 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
2935 } else if (params
->fallback_smack_process_label
) {
2936 _cleanup_free_
char *exec_label
= NULL
;
/* Try to read the exec label stored on the executable itself. An error that merely indicates the
 * attribute is absent is not treated like other read errors (see the condition below). */
2938 r
= mac_smack_read_fd(executable_fd
, SMACK_ATTR_EXEC
, &exec_label
);
2939 if (r
< 0 && !ERRNO_IS_XATTR_ABSENT(r
))
/* Prefer the label read from the executable; if none was read, use the configured fallback label. */
2942 r
= mac_smack_apply_pid(0, exec_label
?: params
->fallback_smack_process_label
);
/* Builds the combined list of bind mounts for the unit: the explicitly configured context->bind_mounts
 * plus one entry per configured exec directory item that needs a mount. Also collects the "private"
 * directories that are to be passed on as empty directories.
 *
 * context:                execution context supplying bind_mounts and directories[].
 * params:                 execution parameters; params->prefix[t] is the base path for directory type t,
 *                         and types without a prefix are not processed.
 * exec_directory_uid/gid: stored into every exec-directory BindMount entry (.uid/.gid).
 * ret_bind_mounts:        receives the allocated BindMount array.
 * ret_n_bind_mounts:      receives the number of entries.
 * ret_empty_directories:  receives the string list of "<prefix>/private" directories.
 *
 * NOTE(review): this excerpt omits several statements (local declarations, OOM/error checks, 'continue'
 * and 'return' statements), so the comments below describe only what the visible statements do. */
2951 static int compile_bind_mounts(
2952 const ExecContext
*context
,
2953 const ExecParameters
*params
,
2954 uid_t exec_directory_uid
, /* only used for id-mapped mounts Exec directories */
2955 gid_t exec_directory_gid
, /* only used for id-mapped mounts Exec directories */
2956 BindMount
**ret_bind_mounts
,
2957 size_t *ret_n_bind_mounts
,
2958 char ***ret_empty_directories
) {
2960 _cleanup_strv_free_
char **empty_directories
= NULL
;
2961 BindMount
*bind_mounts
= NULL
;
2967 assert(ret_bind_mounts
);
2968 assert(ret_n_bind_mounts
);
2969 assert(ret_empty_directories
);
/* Register cleanup for the array under construction; 'h' is the fill counter that is advanced as
 * entries are added further down. */
2971 CLEANUP_ARRAY(bind_mounts
, h
, bind_mount_free_many
);
/* First pass: count how many entries are needed. Start with the explicitly configured bind mounts, then
 * add one for every exec directory item that is not "only create", or that is read-only. */
2973 n
= context
->n_bind_mounts
;
2974 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2975 if (!params
->prefix
[t
])
2978 FOREACH_ARRAY(i
, context
->directories
[t
].items
, context
->directories
[t
].n_items
)
2979 n
+= !FLAGS_SET(i
->flags
, EXEC_DIRECTORY_ONLY_CREATE
) || FLAGS_SET(i
->flags
, EXEC_DIRECTORY_READ_ONLY
);
/* NOTE(review): the condition guarding the following three assignments is not visible here; presumably
 * this is the early exit taken when nothing needs to be mounted -- confirm. */
2983 *ret_bind_mounts
= NULL
;
2984 *ret_n_bind_mounts
= 0;
2985 *ret_empty_directories
= NULL
;
2989 bind_mounts
= new(BindMount
, n
);
/* Second pass, part one: copy over the explicitly configured bind mounts. */
2993 FOREACH_ARRAY(item
, context
->bind_mounts
, context
->n_bind_mounts
) {
2994 r
= bind_mount_add(&bind_mounts
, &h
, item
);
/* Second pass, part two: add the entries for the exec directories. */
2999 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
3000 if (!params
->prefix
[t
])
3003 if (context
->directories
[t
].n_items
== 0)
3006 if (exec_directory_is_private(context
, t
) &&
3007 !exec_context_with_rootfs(context
)) {
3010 /* So this is for a dynamic user, and we need to make sure the process can access its own
3011 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3012 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3014 private_root
= path_join(params
->prefix
[t
], "private");
3018 r
= strv_consume(&empty_directories
, private_root
);
3023 FOREACH_ARRAY(i
, context
->directories
[t
].items
, context
->directories
[t
].n_items
) {
3024 _cleanup_free_
char *s
= NULL
, *d
= NULL
;
3026 /* When one of the parent directories is in the list, we cannot create the symlink
3027 * for the child directory. See also the comments in setup_exec_directory().
3028 * But if it needs to be read only, then we have to create a bind mount anyway to
3030 if (FLAGS_SET(i
->flags
, EXEC_DIRECTORY_ONLY_CREATE
) && !FLAGS_SET(i
->flags
, EXEC_DIRECTORY_READ_ONLY
))
3033 if (exec_directory_is_private(context
, t
))
3034 s
= path_join(params
->prefix
[t
], "private", i
->path
);
3036 s
= path_join(params
->prefix
[t
], i
->path
);
3040 if (exec_directory_is_private(context
, t
) &&
3041 exec_context_with_rootfs(context
))
3042 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3043 * directory is not created on the root directory. So, let's bind-mount the directory
3044 * on the 'non-private' place. */
3045 d
= path_join(params
->prefix
[t
], i
->path
);
/* Append the entry; ownership of the source and destination strings moves into the array element. */
3051 bind_mounts
[h
++] = (BindMount
) {
3052 .source
= TAKE_PTR(s
),
3053 .destination
= TAKE_PTR(d
),
3054 .nosuid
= context
->dynamic_user
, /* don't allow suid/sgid when DynamicUser= is on */
3056 .read_only
= FLAGS_SET(i
->flags
, EXEC_DIRECTORY_READ_ONLY
),
3057 .idmapped
= i
->idmapped
,
3058 .uid
= exec_directory_uid
,
3059 .gid
= exec_directory_gid
,
/* Success: hand the array, its size and the list of empty directories over to the caller. */
3066 *ret_bind_mounts
= TAKE_PTR(bind_mounts
);
3067 *ret_n_bind_mounts
= n
;
3068 *ret_empty_directories
= TAKE_PTR(empty_directories
);
3073 /* ret_symlinks will contain a list of pairs src:dest that describes
3074 * the symlinks to create later on. For example, the symlinks needed
3075 * to safely give private directories to DynamicUser=1 users. */
/* Parameters:
 *   context:                  execution context; its directories[] items (path, symlinks, flags) are read.
 *   params:                   execution parameters; params->prefix[dt] is the base path for directory type dt.
 *   setup_os_release_symlink: if true, the pair for the host's os-release symlink is appended as well.
 *   ret_symlinks:             receives the resulting string list, stored as consecutive (source, destination)
 *                             entries.
 *
 * NOTE(review): this excerpt omits local declarations, the error/OOM checks and the return statements. */
3076 static int compile_symlinks(
3077 const ExecContext
*context
,
3078 const ExecParameters
*params
,
3079 bool setup_os_release_symlink
,
3080 char ***ret_symlinks
) {
3082 _cleanup_strv_free_
char **symlinks
= NULL
;
3087 assert(ret_symlinks
);
3089 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++)
3090 FOREACH_ARRAY(i
, context
->directories
[dt
].items
, context
->directories
[dt
].n_items
) {
3091 _cleanup_free_
char *private_path
= NULL
, *path
= NULL
;
/* First the symlinks explicitly configured for this directory item: each one is placed below the same
 * prefix and points at the directory itself. */
3093 STRV_FOREACH(symlink
, i
->symlinks
) {
3094 _cleanup_free_
char *src_abs
= NULL
, *dst_abs
= NULL
;
3096 src_abs
= path_join(params
->prefix
[dt
], i
->path
);
3097 dst_abs
= path_join(params
->prefix
[dt
], *symlink
);
3098 if (!src_abs
|| !dst_abs
)
3101 r
= strv_consume_pair(&symlinks
, TAKE_PTR(src_abs
), TAKE_PTR(dst_abs
));
/* The pair added further down (private path -> public path) is relevant only for private directories
 * without RootDirectory=/RootImage= whose item is not marked "only create".
 * NOTE(review): the statement controlled by this condition is not visible here; presumably the item is
 * skipped -- confirm. */
3106 if (!exec_directory_is_private(context
, dt
) ||
3107 exec_context_with_rootfs(context
) ||
3108 FLAGS_SET(i
->flags
, EXEC_DIRECTORY_ONLY_CREATE
))
3111 private_path
= path_join(params
->prefix
[dt
], "private", i
->path
);
3115 path
= path_join(params
->prefix
[dt
], i
->path
);
3119 r
= strv_consume_pair(&symlinks
, TAKE_PTR(private_path
), TAKE_PTR(path
));
3124 /* We make the host's os-release available via a symlink, so that we can copy it atomically
3125 * and readers will never get a half-written version. Note that, while the paths specified here are
3126 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
3127 * 'os-release -> .os-release-stage/os-release' is what will be created. */
3128 if (setup_os_release_symlink
) {
3129 r
= strv_extend_many(
3131 "/run/host/.os-release-stage/os-release",
3132 "/run/host/os-release");
/* Success: pass ownership of the list to the caller. */
3137 *ret_symlinks
= TAKE_PTR(symlinks
);
/* Decides whether file system namespacing is mandatory for this unit (see the comment inside the function
 * for the rationale).
 *
 * The visible checks look at: configured temporary file systems, a root directory or root image, mount
 * images, DynamicUser=, extension images/directories, bind mounts whose source and destination differ,
 * and a configured log namespace.
 *
 * context:                   execution context whose settings are inspected.
 * root_dir / root_image:     effective root directory/image, either may be NULL.
 * bind_mounts/n_bind_mounts: compiled bind mount list; the array may be NULL only if the count is 0.
 *
 * NOTE(review): the statements controlled by each condition (and the final return) are not visible in
 * this excerpt; presumably each match returns true and the fall-through returns false -- confirm. */
3142 static bool insist_on_sandboxing(
3143 const ExecContext
*context
,
3144 const char *root_dir
,
3145 const char *root_image
,
3146 const BindMount
*bind_mounts
,
3147 size_t n_bind_mounts
) {
3150 assert(n_bind_mounts
== 0 || bind_mounts
);
3152 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3153 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3154 * rearrange stuff in a way we cannot ignore gracefully. */
3156 if (context
->n_temporary_filesystems
> 0)
3159 if (root_dir
|| root_image
)
3162 if (context
->n_mount_images
> 0)
3165 if (context
->dynamic_user
)
3168 if (context
->n_extension_images
> 0 || !strv_isempty(context
->extension_directories
))
3171 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3173 FOREACH_ARRAY(i
, bind_mounts
, n_bind_mounts
)
3174 if (!path_equal(i
->source
, i
->destination
))
3177 if (context
->log_namespace
)
/* Prepares the ephemeral copy of the root image or root directory, and redirects *root_image or
 * *root_directory to that copy.
 *
 * context:        execution context.
 * runtime:        supplies ephemeral_copy (target path of the copy) and ephemeral_storage_socket[], a
 *                 socket pair used both as lock and as storage for the fd of an already prepared copy.
 * root_image:     in/out; replaced by the path of the copy when an image is used.
 * root_directory: in/out; replaced by the path of the snapshot when a directory is used.
 * reterr_path:    on copy/snapshot failure, receives a newly allocated copy of the offending source path.
 *
 * Visible flow: nothing is done if neither a root image nor a root directory is set, or if
 * runtime->ephemeral_copy is unset. Otherwise the socket is locked exclusively and peeked for a queued
 * fd; if none is there yet the copy (copy_file()) or snapshot (btrfs_subvol_snapshot_at()) is made and
 * its fd is queued on the socket.
 *
 * NOTE(review): several conditions, 'return' statements and else-branches are missing from this excerpt,
 * so the branch structure is inferred from the surviving statements. */
3183 static int setup_ephemeral(
3184 const ExecContext
*context
,
3185 ExecRuntime
*runtime
,
3186 char **root_image
, /* both input and output! modified if ephemeral logic enabled */
3187 char **root_directory
, /* ditto */
3188 char **reterr_path
) {
3190 _cleanup_close_
int fd
= -EBADF
;
3191 _cleanup_free_
char *new_root
= NULL
;
3197 assert(root_directory
);
3199 if (!*root_image
&& !*root_directory
)
3202 if (!runtime
->ephemeral_copy
)
3205 assert(runtime
->ephemeral_storage_socket
[0] >= 0);
3206 assert(runtime
->ephemeral_storage_socket
[1] >= 0);
3208 new_root
= strdup(runtime
->ephemeral_copy
);
3210 return log_oom_debug();
/* Take an exclusive lock on the storage socket for the remainder of this function; CLEANUP_POSIX_UNLOCK()
 * below registers the matching unlock. */
3212 r
= posix_lock(runtime
->ephemeral_storage_socket
[0], LOCK_EX
);
3214 return log_debug_errno(r
, "Failed to lock ephemeral storage socket: %m");
3216 CLEANUP_POSIX_UNLOCK(runtime
->ephemeral_storage_socket
[0]);
/* Peek (without dequeuing and without blocking) for an fd that was queued earlier. */
3218 fd
= receive_one_fd(runtime
->ephemeral_storage_socket
[0], MSG_PEEK
|MSG_DONTWAIT
);
3220 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3223 return log_debug_errno(fd
, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
/* Root image case: make a copy of the image file at the new location. */
3226 log_debug("Making ephemeral copy of %s to %s", *root_image
, new_root
);
3228 fd
= copy_file(*root_image
, new_root
, O_EXCL
, 0600,
3229 COPY_LOCK_BSD
|COPY_REFLINK
|COPY_CRTIME
|COPY_NOCOW_AFTER
);
3231 *reterr_path
= strdup(*root_image
);
3232 return log_debug_errno(fd
, "Failed to copy image %s to %s: %m",
3233 *root_image
, new_root
);
/* Root directory case: snapshot the directory tree; the flags request fallbacks for when a real
 * snapshot is not possible. */
3236 assert(*root_directory
);
3238 log_debug("Making ephemeral snapshot of %s to %s", *root_directory
, new_root
);
3240 fd
= btrfs_subvol_snapshot_at(
3241 AT_FDCWD
, *root_directory
,
3243 BTRFS_SNAPSHOT_FALLBACK_COPY
|
3244 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
3245 BTRFS_SNAPSHOT_RECURSIVE
|
3246 BTRFS_SNAPSHOT_LOCK_BSD
);
3248 *reterr_path
= strdup(*root_directory
);
3249 return log_debug_errno(fd
, "Failed to snapshot directory %s to %s: %m",
3250 *root_directory
, new_root
);
/* Queue the fd of the fresh copy on the socket, so that the peek above finds it next time. */
3254 r
= send_one_fd(runtime
->ephemeral_storage_socket
[1], fd
, MSG_DONTWAIT
);
3256 return log_debug_errno(r
, "Failed to queue file descriptor on ephemeral storage socket: %m");
/* Finally, point the caller's root image/directory at the ephemeral copy. */
3259 free_and_replace(*root_image
, new_root
);
3261 assert(*root_directory
);
3262 free_and_replace(*root_directory
, new_root
);
/* Fills in a VeritySettings structure from the individual root hash settings.
 *
 * verity:                            structure to populate; previously set root_hash/root_hash_sig buffers
 *                                    are replaced.
 * root_image:                        path of the root image.
 * root_hash/root_hash_size:          in-memory root hash; duplicated into verity->root_hash.
 * root_hash_path:                    path of a file carrying the root hash.
 * root_hash_sig/root_hash_sig_size:  in-memory root hash signature; duplicated into verity->root_hash_sig.
 * root_hash_sig_path:                path of a file carrying the signature; passed to verity_settings_load().
 * verity_data_path:                  path of the verity data; duplicated into verity->data_path.
 *
 * Whenever an in-memory hash or signature is stored, the designator is set to PARTITION_ROOT.
 *
 * NOTE(review): local declarations, the guard around the root_hash part, OOM checks, most arguments of
 * verity_settings_load() and the return statements are not visible in this excerpt. */
3268 static int verity_settings_prepare(
3269 VeritySettings
*verity
,
3270 const char *root_image
,
3271 const void *root_hash
,
3272 size_t root_hash_size
,
3273 const char *root_hash_path
,
3274 const void *root_hash_sig
,
3275 size_t root_hash_sig_size
,
3276 const char *root_hash_sig_path
,
3277 const char *verity_data_path
) {
/* Copy the root hash so that 'verity' owns its own buffer. */
3286 d
= memdup(root_hash
, root_hash_size
);
3290 free_and_replace(verity
->root_hash
, d
);
3291 verity
->root_hash_size
= root_hash_size
;
3292 verity
->designator
= PARTITION_ROOT
;
3295 if (root_hash_sig
) {
/* Same for the signature of the root hash. */
3298 d
= memdup(root_hash_sig
, root_hash_sig_size
);
3302 free_and_replace(verity
->root_hash_sig
, d
);
3303 verity
->root_hash_sig_size
= root_hash_sig_size
;
3304 verity
->designator
= PARTITION_ROOT
;
3307 if (verity_data_path
) {
3308 r
= free_and_strdup(&verity
->data_path
, verity_data_path
);
3313 r
= verity_settings_load(
3317 root_hash_sig_path
);
3319 return log_debug_errno(r
, "Failed to load root hash: %m");
/* Resolves the configured root image or root directory through path_pick() (which handles versioned
 * ".v/" directories, as the error messages below indicate) and returns the picked path.
 *
 * context:            execution context; root_image is checked first, then root_directory.
 * params:             execution parameters.
 * ret_root_image:     receives the picked image path, or NULL if no root image is configured.
 * ret_root_directory: receives the picked directory path, or NULL if no root directory is configured.
 * reterr_path:        on failure, receives a newly allocated copy of the configured path that could not
 *                     be resolved.
 *
 * If neither setting is configured both outputs are set to NULL. If path_pick() finds no matching entry
 * a synthetic ENOENT error is logged and returned.
 *
 * NOTE(review): the error checks after path_pick(), the trailing arguments of both path_pick() calls and
 * the success returns are not visible in this excerpt. */
3324 static int pick_versions(
3325 const ExecContext
*context
,
3326 const ExecParameters
*params
,
3327 char **ret_root_image
,
3328 char **ret_root_directory
,
3329 char **reterr_path
) {
3335 assert(ret_root_image
);
3336 assert(ret_root_directory
);
/* Case 1: a root image is configured; pick among raw image candidates. */
3338 if (context
->root_image
) {
3339 _cleanup_(pick_result_done
) PickResult result
= PICK_RESULT_NULL
;
3341 r
= path_pick(/* toplevel_path= */ NULL
,
3342 /* toplevel_fd= */ AT_FDCWD
,
3343 context
->root_image
,
3344 &pick_filter_image_raw
,
3345 PICK_ARCHITECTURE
|PICK_TRIES
|PICK_RESOLVE
,
3348 *reterr_path
= strdup(context
->root_image
);
3353 *reterr_path
= strdup(context
->root_image
);
3354 return log_debug_errno(SYNTHETIC_ERRNO(ENOENT
), "No matching entry in .v/ directory %s found.", context
->root_image
);
3357 *ret_root_image
= TAKE_PTR(result
.path
);
3358 *ret_root_directory
= NULL
;
/* Case 2: a root directory is configured; pick among directory candidates. */
3362 if (context
->root_directory
) {
3363 _cleanup_(pick_result_done
) PickResult result
= PICK_RESULT_NULL
;
3365 r
= path_pick(/* toplevel_path= */ NULL
,
3366 /* toplevel_fd= */ AT_FDCWD
,
3367 context
->root_directory
,
3368 &pick_filter_image_dir
,
3369 PICK_ARCHITECTURE
|PICK_TRIES
|PICK_RESOLVE
,
3372 *reterr_path
= strdup(context
->root_directory
);
3377 *reterr_path
= strdup(context
->root_directory
);
3378 return log_debug_errno(SYNTHETIC_ERRNO(ENOENT
), "No matching entry in .v/ directory %s found.", context
->root_directory
);
3381 *ret_root_image
= NULL
;
3382 *ret_root_directory
= TAKE_PTR(result
.path
);
/* Case 3: neither is configured. */
3386 *ret_root_image
= *ret_root_directory
= NULL
;
/* Collects all mount-namespace related settings of the unit into a NamespaceParameters structure and
 * calls setup_namespace() with it.
 *
 * command_flags:          flags of the command being executed.
 * context:                execution context; the source of nearly all namespace settings.
 * params:                 execution parameters (flags, runtime scope, unit id, notify socket, ...).
 * runtime:                runtime data; supplies the shared private tmp directories.
 * memory_pressure_path:   if set, not "/dev/null", and the cgroup mount is read-only, this path is added
 *                         to the read-write paths.
 * needs_sandboxing:       if false, most protection settings are passed in their "off" value rather than
 *                         being taken from the context.
 * exec_directory_uid/gid: forwarded to compile_bind_mounts().
 *
 * Visible steps, in order: optional ephemeral root setup (under EXEC_APPLY_CHROOT), compile the bind
 * mounts, work out the read-write paths, work out the private /tmp and /var/tmp directories, compile the
 * symlinks, determine the credential directory, choose the propagate/incoming/private-namespace/
 * os-release staging directories by runtime scope, prepare the verity settings, then fill in the
 * parameters and call setup_namespace(). If setup_namespace() fails in the special way described near
 * the end, the failure is ignored unless insist_on_sandboxing() says the settings are essential.
 *
 * NOTE(review): one parameter line of the signature, many error checks, else-branches, return statements
 * and a few initializer fields are not visible in this excerpt. */
3390 static int apply_mount_namespace(
3391 ExecCommandFlags command_flags
,
3392 const ExecContext
*context
,
3393 const ExecParameters
*params
,
3394 ExecRuntime
*runtime
,
3395 const char *memory_pressure_path
,
3396 bool needs_sandboxing
,
3398 uid_t exec_directory_uid
,
3399 gid_t exec_directory_gid
) {
3401 _cleanup_(verity_settings_done
) VeritySettings verity
= VERITY_SETTINGS_DEFAULT
;
3402 _cleanup_strv_free_
char **empty_directories
= NULL
, **symlinks
= NULL
,
3403 **read_write_paths_cleanup
= NULL
;
3404 _cleanup_free_
char *creds_path
= NULL
, *incoming_dir
= NULL
, *propagate_dir
= NULL
,
3405 *private_namespace_dir
= NULL
, *host_os_release_stage
= NULL
, *root_image
= NULL
, *root_dir
= NULL
;
3406 const char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
3407 char **read_write_paths
;
3408 bool setup_os_release_symlink
;
3409 BindMount
*bind_mounts
= NULL
;
3410 size_t n_bind_mounts
= 0;
3417 CLEANUP_ARRAY(bind_mounts
, n_bind_mounts
, bind_mount_free_many
);
3419 if (params
->flags
& EXEC_APPLY_CHROOT
) {
3429 r
= setup_ephemeral(
3439 r
= compile_bind_mounts(context
, params
, exec_directory_uid
, exec_directory_gid
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
3443 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3444 * service will need to write to it in order to start the notifications. */
3445 if (exec_is_cgroup_mount_read_only(context
) && memory_pressure_path
&& !streq(memory_pressure_path
, "/dev/null")) {
/* Work on a private copy of the list, so that the context's own list is not modified. */
3446 read_write_paths_cleanup
= strv_copy(context
->read_write_paths
);
3447 if (!read_write_paths_cleanup
)
3450 r
= strv_extend(&read_write_paths_cleanup
, memory_pressure_path
);
3454 read_write_paths
= read_write_paths_cleanup
;
3456 read_write_paths
= context
->read_write_paths
;
3458 if (needs_sandboxing
) {
3459 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3460 * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3461 * use here. This does not apply when we are using /run/systemd/empty as fallback. */
3463 if (context
->private_tmp
== PRIVATE_TMP_CONNECTED
&& runtime
->shared
) {
3464 if (streq_ptr(runtime
->shared
->tmp_dir
, RUN_SYSTEMD_EMPTY
))
3465 tmp_dir
= runtime
->shared
->tmp_dir
;
3466 else if (runtime
->shared
->tmp_dir
)
3467 tmp_dir
= strjoina(runtime
->shared
->tmp_dir
, "/tmp");
3469 if (streq_ptr(runtime
->shared
->var_tmp_dir
, RUN_SYSTEMD_EMPTY
))
3470 var_tmp_dir
= runtime
->shared
->var_tmp_dir
;
3471 else if (runtime
->shared
->var_tmp_dir
)
3472 var_tmp_dir
= strjoina(runtime
->shared
->var_tmp_dir
, "/tmp");
3476 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3477 setup_os_release_symlink
= needs_sandboxing
&& exec_context_get_effective_mount_apivfs(context
) && (root_dir
|| root_image
);
3478 r
= compile_symlinks(context
, params
, setup_os_release_symlink
, &symlinks
);
3482 if (context
->mount_propagation_flag
== MS_SHARED
)
3483 log_debug("shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3485 r
= exec_context_get_credential_directory(context
, params
, params
->unit_id
, &creds_path
);
/* The helper directories live in different places for the system manager and for user managers. */
3489 if (params
->runtime_scope
== RUNTIME_SCOPE_SYSTEM
) {
3490 propagate_dir
= path_join("/run/systemd/propagate/", params
->unit_id
);
3494 incoming_dir
= strdup("/run/systemd/incoming");
3498 private_namespace_dir
= strdup("/run/systemd");
3499 if (!private_namespace_dir
)
3502 /* If running under a different root filesystem, propagate the host's os-release. We make a
3503 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3504 if (setup_os_release_symlink
) {
3505 host_os_release_stage
= strdup("/run/systemd/propagate/.os-release-stage");
3506 if (!host_os_release_stage
)
3510 assert(params
->runtime_scope
== RUNTIME_SCOPE_USER
);
3512 if (asprintf(&private_namespace_dir
, "/run/user/" UID_FMT
"/systemd", geteuid()) < 0)
3515 if (setup_os_release_symlink
) {
3516 if (asprintf(&host_os_release_stage
,
3517 "/run/user/" UID_FMT
"/systemd/propagate/.os-release-stage",
3524 r
= verity_settings_prepare(
3527 context
->root_hash
, context
->root_hash_size
, context
->root_hash_path
,
3528 context
->root_hash_sig
, context
->root_hash_sig_size
, context
->root_hash_sig_path
,
3529 context
->root_verity
);
/* Assemble everything into the parameter block for setup_namespace(). Settings that only restrict
 * access are passed through only if sandboxing is requested. */
3534 NamespaceParameters parameters
= {
3535 .runtime_scope
= params
->runtime_scope
,
3537 .root_directory
= root_dir
,
3538 .root_image
= root_image
,
3539 .root_image_options
= context
->root_image_options
,
3540 .root_image_policy
= context
->root_image_policy
?: &image_policy_service
,
3542 .read_write_paths
= read_write_paths
,
3543 .read_only_paths
= needs_sandboxing
? context
->read_only_paths
: NULL
,
3544 .inaccessible_paths
= needs_sandboxing
? context
->inaccessible_paths
: NULL
,
3546 .exec_paths
= needs_sandboxing
? context
->exec_paths
: NULL
,
3547 .no_exec_paths
= needs_sandboxing
? context
->no_exec_paths
: NULL
,
3549 .empty_directories
= empty_directories
,
3550 .symlinks
= symlinks
,
3552 .bind_mounts
= bind_mounts
,
3553 .n_bind_mounts
= n_bind_mounts
,
3555 .temporary_filesystems
= context
->temporary_filesystems
,
3556 .n_temporary_filesystems
= context
->n_temporary_filesystems
,
3558 .mount_images
= context
->mount_images
,
3559 .n_mount_images
= context
->n_mount_images
,
3560 .mount_image_policy
= context
->mount_image_policy
?: &image_policy_service
,
3563 .var_tmp_dir
= var_tmp_dir
,
3565 .creds_path
= creds_path
,
3566 .log_namespace
= context
->log_namespace
,
3567 .mount_propagation_flag
= context
->mount_propagation_flag
,
3571 .extension_images
= context
->extension_images
,
3572 .n_extension_images
= context
->n_extension_images
,
3573 .extension_image_policy
= context
->extension_image_policy
?: &image_policy_sysext
,
3574 .extension_directories
= context
->extension_directories
,
3576 .propagate_dir
= propagate_dir
,
3577 .incoming_dir
= incoming_dir
,
3578 .private_namespace_dir
= private_namespace_dir
,
3579 .host_notify_socket
= params
->notify_socket
,
3580 .notify_socket_path
= exec_get_private_notify_socket_path(context
, params
, needs_sandboxing
),
3581 .host_os_release_stage
= host_os_release_stage
,
3583 /* If DynamicUser=no and RootDirectory= is set then let's pass a relaxed sandbox info,
3584 * otherwise enforce it, don't ignore protected paths and fail if we are unable to apply the
3585 * sandbox inside the mount namespace. */
3586 .ignore_protect_paths
= !needs_sandboxing
&& !context
->dynamic_user
&& root_dir
,
3588 .protect_control_groups
= needs_sandboxing
? exec_get_protect_control_groups(context
) : PROTECT_CONTROL_GROUPS_NO
,
3589 .protect_kernel_tunables
= needs_sandboxing
&& context
->protect_kernel_tunables
,
3590 .protect_kernel_modules
= needs_sandboxing
&& context
->protect_kernel_modules
,
3591 .protect_kernel_logs
= needs_sandboxing
&& context
->protect_kernel_logs
,
3593 .private_dev
= needs_sandboxing
&& context
->private_devices
,
3594 .private_network
= needs_sandboxing
&& exec_needs_network_namespace(context
),
3595 .private_ipc
= needs_sandboxing
&& exec_needs_ipc_namespace(context
),
3596 .private_pids
= needs_sandboxing
&& exec_needs_pid_namespace(context
, params
) ? context
->private_pids
: PRIVATE_PIDS_NO
,
3597 .private_tmp
= needs_sandboxing
? context
->private_tmp
: PRIVATE_TMP_NO
,
3598 .private_var_tmp
= needs_sandboxing
? context
->private_var_tmp
: PRIVATE_TMP_NO
,
3600 .mount_apivfs
= needs_sandboxing
&& exec_context_get_effective_mount_apivfs(context
),
3601 .bind_log_sockets
= needs_sandboxing
&& exec_context_get_effective_bind_log_sockets(context
),
3603 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3604 .mount_nosuid
= needs_sandboxing
&& context
->no_new_privileges
&& !mac_selinux_use(),
3606 .protect_home
= needs_sandboxing
? context
->protect_home
: PROTECT_HOME_NO
,
3607 .protect_hostname
= needs_sandboxing
? context
->protect_hostname
: PROTECT_HOSTNAME_NO
,
3608 .protect_system
= needs_sandboxing
? context
->protect_system
: PROTECT_SYSTEM_NO
,
3609 .protect_proc
= needs_sandboxing
? context
->protect_proc
: PROTECT_PROC_DEFAULT
,
3610 .proc_subset
= needs_sandboxing
? context
->proc_subset
: PROC_SUBSET_ALL
,
3613 r
= setup_namespace(¶meters
, reterr_path
);
3614 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3615 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3616 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3617 * completely different execution environment. */
3619 if (insist_on_sandboxing(
3621 root_dir
, root_image
,
3624 return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
),
3625 "Failed to set up namespace, and refusing to continue since "
3626 "the selected namespacing options alter mount environment non-trivially.\n"
3627 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3629 context
->n_temporary_filesystems
,
3632 yes_no(context
->dynamic_user
));
3634 log_debug("Failed to set up namespace, assuming containerized execution and ignoring.");
/* Changes the current working directory of the process to the one configured for the unit.
 *
 * context:    execution context; working_directory_home selects the home directory, otherwise
 *             context->working_directory is used (mapped through empty_to_root()).
 * params:     execution parameters; the EXEC_APPLY_CHROOT flag selects a plain chdir().
 * runtime:    supplies ephemeral_copy, which takes precedence over context->root_directory as root for
 *             the lookup in the other branch.
 * pwent_home: home directory from the user database, used if $HOME is not available.
 * env:        environment block that is searched for $HOME.
 *
 * Returns 0 if context->working_directory_missing_ok is set, and otherwise the result of the directory
 * change (see the final return statement).
 *
 * NOTE(review): local declarations, the pwent_home fallback, the call that opens 'dfd' and the condition
 * guarding the final return are not visible in this excerpt. */
3641 static int apply_working_directory(
3642 const ExecContext
*context
,
3643 const ExecParameters
*params
,
3644 ExecRuntime
*runtime
,
3645 const char *pwent_home
,
3646 char * const *env
) {
3655 if (context
->working_directory_home
) {
3656 /* Preferably use the data from $HOME, in case it was updated by a PAM module */
3657 wd
= strv_env_get(env
, "HOME");
3659 /* If that's not available, use the data from the struct passwd entry: */
3666 wd
= empty_to_root(context
->working_directory
);
/* With EXEC_APPLY_CHROOT set, change directory directly by path. */
3668 if (params
->flags
& EXEC_APPLY_CHROOT
)
3669 r
= RET_NERRNO(chdir(wd
));
/* Otherwise a directory fd is obtained relative to the ephemeral copy or the configured root directory,
 * and the directory is entered via that fd. */
3671 _cleanup_close_
int dfd
= -EBADF
;
3674 runtime
->ephemeral_copy
?: context
->root_directory
,
3675 CHASE_PREFIX_ROOT
|CHASE_AT_RESOLVE_IN_ROOT
,
3676 /* ret_path= */ NULL
,
3679 r
= RET_NERRNO(fchdir(dfd
));
3681 return context
->working_directory_missing_ok
? 0 : r
;
/* Applies the configured root directory with chroot().
 *
 * chroot() is only attempted when all three visible conditions hold: EXEC_APPLY_CHROOT is set in
 * params->flags, no mount namespace is needed (needs_mount_ns is false) and context->root_directory
 * is set. The chroot target is runtime->ephemeral_copy, falling back to context->root_directory.
 * If chroot() fails, *exit_status is set to EXIT_CHROOT.
 *
 * NOTE(review): the remaining parameters and the return statements are not visible in this listing. */
3684 static int apply_root_directory(
3685 const ExecContext
*context
,
3686 const ExecParameters
*params
,
3687 ExecRuntime
*runtime
,
3688 const bool needs_mount_ns
,
3694 assert(exit_status
);
3696 if (params
->flags
& EXEC_APPLY_CHROOT
)
3697 if (!needs_mount_ns
&& context
->root_directory
)
3698 if (chroot(runtime
->ephemeral_copy
?: context
->root_directory
) < 0) {
3699 *exit_status
= EXIT_CHROOT
;
/* Sets up a per-service session kernel keyring.
 *
 * Visible sequence:
 *   1. context->keyring_mode is compared against EXEC_KEYRING_INHERIT (the action taken in that
 *      branch is not visible in this listing).
 *   2. The current real UID/GID are saved, and the process temporarily switches to 'gid'/'uid' with
 *      setregid()/setreuid() when they are valid and differ from the saved values.
 *   3. A new session keyring is joined via keyctl(KEYCTL_JOIN_SESSION_KEYRING). ENOSYS, privilege
 *      errors and EDQUOT are only logged at debug level ("ignoring"); other errors are logged as
 *      errors and stored in r.
 *   4. For EXEC_KEYRING_SHARED the user keyring is linked into the session keyring.
 *   5. UID/GID are switched back, then the invocation ID (if not null) is added as a "user" key to
 *      the session keyring and its permissions are restricted with KEYCTL_SETPERM.
 *   6. At the end UID/GID are reverted once more on a best-effort basis (return values ignored).
 *
 * NOTE(review): the uid/gid parameters, local declarations, gotos/labels and return statements are
 * not visible in this listing; confirm the error paths against the full file. */
3706 static int setup_keyring(
3707 const ExecContext
*context
,
3708 const ExecParameters
*p
,
3712 key_serial_t keyring
;
3720 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3721 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3722 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3723 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3724 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3725 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3727 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
3730 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3731 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3732 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3733 * & group is just as nasty as acquiring a reference to the user keyring. */
3735 saved_uid
= getuid();
3736 saved_gid
= getgid();
3738 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
3739 if (setregid(gid
, -1) < 0)
3740 return log_error_errno(errno
, "Failed to change GID for user keyring: %m");
3743 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
3744 if (setreuid(uid
, -1) < 0) {
3745 r
= log_error_errno(errno
, "Failed to change UID for user keyring: %m");
3750 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
3751 if (keyring
== -1) {
3752 if (errno
== ENOSYS
)
3753 log_debug_errno(errno
, "Kernel keyring not supported, ignoring.");
3754 else if (ERRNO_IS_PRIVILEGE(errno
))
3755 log_debug_errno(errno
, "Kernel keyring access prohibited, ignoring.");
3756 else if (errno
== EDQUOT
)
3757 log_debug_errno(errno
, "Out of kernel keyrings to allocate, ignoring.");
3759 r
= log_error_errno(errno
, "Setting up kernel keyring failed: %m");
3764 /* When requested link the user keyring into the session keyring. */
3765 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
3767 if (keyctl(KEYCTL_LINK
,
3768 KEY_SPEC_USER_KEYRING
,
3769 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
3770 r
= log_error_errno(errno
, "Failed to link user keyring into session keyring: %m");
3775 /* Restore uid/gid back */
3776 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
3777 if (setreuid(saved_uid
, -1) < 0) {
3778 r
= log_error_errno(errno
, "Failed to change UID back for user keyring: %m");
3783 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
3784 if (setregid(saved_gid
, -1) < 0)
3785 return log_error_errno(errno
, "Failed to change GID back for user keyring: %m");
3788 /* Populate the keyring with the invocation ID by default, as original saved_uid. */
3789 if (!sd_id128_is_null(p
->invocation_id
)) {
3792 key
= add_key("user",
3795 sizeof(p
->invocation_id
),
3796 KEY_SPEC_SESSION_KEYRING
);
3798 log_debug_errno(errno
, "Failed to add invocation ID to keyring, ignoring: %m");
3800 if (keyctl(KEYCTL_SETPERM
, key
,
3801 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
3802 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
3803 r
= log_error_errno(errno
, "Failed to restrict invocation ID permission: %m");
3808 /* Revert back uid & gid for the last time, and exit */
3809 /* no extra logging, as only the first already reported error matters */
3810 if (getuid() != saved_uid
)
3811 (void) setreuid(saved_uid
, -1);
3813 if (getgid() != saved_gid
)
3814 (void) setregid(saved_gid
, -1);
/* Appends the file descriptors of a two-element socket pair to 'array', post-incrementing *n for
 * each element stored. The caller must provide enough room in 'array'; no bound is checked in the
 * visible code.
 *
 * NOTE(review): the lines between the visible statements are missing from this listing, so any
 * guard conditions on pair[0]/pair[1] cannot be confirmed from here. */
3819 static void append_socket_pair(int *array
, size_t *n
, const int pair
[static 2]) {
3825 array
[(*n
)++] = pair
[0];
3827 array
[(*n
)++] = pair
[1];
/* Closes all file descriptors except an explicit keep list, by handing that list to close_all_fds().
 *
 * The keep list 'dont_close' is assembled from:
 *   - params->stdin_fd / stdout_fd / stderr_fd (each only if >= 0),
 *   - socket_fd,
 *   - the caller-supplied 'fds' array (n_fds entries, copied with memcpy),
 *   - the socket pairs stored in 'runtime' (ephemeral storage, netns/ipcns storage of the shared
 *     runtime, and the user/group storage sockets of the dynamic credentials),
 *   - params->user_lookup_fd, handoff_timestamp_fd and pidref_transport_fd (each only if >= 0).
 *
 * Returns the result of close_all_fds().
 *
 * NOTE(review): the remaining parameters and some guard conditions are not visible in this listing. */
3830 static int close_remaining_fds(
3831 const ExecParameters
*params
,
3832 const ExecRuntime
*runtime
,
3837 size_t n_dont_close
= 0;
/* The extra 17 slots cover the fixed entries appended below: 3 stdio fds + socket_fd
 * + 5 socket pairs (10 fds) + 3 further params fds. The assert before the final call
 * verifies that this bound was not exceeded. */
3838 int dont_close
[n_fds
+ 17];
3843 if (params
->stdin_fd
>= 0)
3844 dont_close
[n_dont_close
++] = params
->stdin_fd
;
3845 if (params
->stdout_fd
>= 0)
3846 dont_close
[n_dont_close
++] = params
->stdout_fd
;
3847 if (params
->stderr_fd
>= 0)
3848 dont_close
[n_dont_close
++] = params
->stderr_fd
;
3851 dont_close
[n_dont_close
++] = socket_fd
;
3853 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
3854 n_dont_close
+= n_fds
;
3857 append_socket_pair(dont_close
, &n_dont_close
, runtime
->ephemeral_storage_socket
);
3859 if (runtime
->shared
) {
3860 append_socket_pair(dont_close
, &n_dont_close
, runtime
->shared
->netns_storage_socket
);
3861 append_socket_pair(dont_close
, &n_dont_close
, runtime
->shared
->ipcns_storage_socket
);
3864 if (runtime
->dynamic_creds
) {
3865 if (runtime
->dynamic_creds
->user
)
3866 append_socket_pair(dont_close
, &n_dont_close
, runtime
->dynamic_creds
->user
->storage_socket
);
3867 if (runtime
->dynamic_creds
->group
)
3868 append_socket_pair(dont_close
, &n_dont_close
, runtime
->dynamic_creds
->group
->storage_socket
);
3871 if (params
->user_lookup_fd
>= 0)
3872 dont_close
[n_dont_close
++] = params
->user_lookup_fd
;
3874 if (params
->handoff_timestamp_fd
>= 0)
3875 dont_close
[n_dont_close
++] = params
->handoff_timestamp_fd
;
3877 if (params
->pidref_transport_fd
>= 0)
3878 dont_close
[n_dont_close
++] = params
->pidref_transport_fd
;
3880 assert(n_dont_close
<= ELEMENTSOF(dont_close
));
3882 return close_all_fds(dont_close
, n_dont_close
);
/* Reports the resolved UID/GID together with the unit name over user_lookup_fd.
 *
 * The message is written with a single writev() of three iovecs: the raw uid, the raw gid and the
 * unit_id string. The visible checks test for user_lookup_fd < 0 and for the case where neither
 * uid nor gid is valid.
 *
 * NOTE(review): the remaining parameters, the statements executed by those checks and the return
 * values are not visible in this listing. */
3885 static int send_user_lookup(
3886 const char *unit_id
,
3893 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3894 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3897 if (user_lookup_fd
< 0)
3900 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
3903 if (writev(user_lookup_fd
,
3905 IOVEC_MAKE(&uid
, sizeof(uid
)),
3906 IOVEC_MAKE(&gid
, sizeof(gid
)),
3907 IOVEC_MAKE_STRING(unit_id
) }, 3) < 0)
/* Tries to determine a home directory for use with WorkingDirectory=~.
 *
 * Visible checks, in order: whether *home is already set, whether c->working_directory_home is
 * unset, and whether the unit uses a dynamic user or a configured user for which is_this_me()
 * does not return a positive value -- in that last case -EADDRNOTAVAIL is returned. Otherwise
 * get_home_dir() is called with ret_buf as its output argument.
 *
 * NOTE(review): the statements executed by the first two checks and the handling of the
 * get_home_dir() result are not visible in this listing. */
3913 static int acquire_home(const ExecContext
*c
, const char **home
, char **ret_buf
) {
3920 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3922 if (*home
) /* Already acquired from get_fixed_user()? */
3925 if (!c
->working_directory_home
)
3928 if (c
->dynamic_user
|| (c
->user
&& is_this_me(c
->user
) <= 0))
3929 return -EADDRNOTAVAIL
;
3931 r
= get_home_dir(ret_buf
);
/* Builds a string list of directory paths for a unit with a dynamic user (asserted).
 *
 * For every ExecDirectoryType the loop consults EXEC_DIRECTORY_TYPE_SHALL_CHOWN(t), then iterates
 * the configured items of that type. Each item contributes one path: p->prefix[t] joined with
 * "private" and the item path when exec_directory_is_private() is true for the type, or
 * p->prefix[t] joined with the item path in the other visible assignment. Paths are appended
 * with strv_consume(), and ownership of the finished list is passed to *ret via TAKE_PTR().
 *
 * NOTE(review): error checks, the branch taken when SHALL_CHOWN is false and the return statements
 * are not visible in this listing. */
3939 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
3940 _cleanup_strv_free_
char ** list
= NULL
;
3947 assert(c
->dynamic_user
);
3949 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3950 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3953 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
3955 if (!EXEC_DIRECTORY_TYPE_SHALL_CHOWN(t
))
3961 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
3964 if (exec_directory_is_private(c
, t
))
3965 e
= path_join(p
->prefix
[t
], "private", c
->directories
[t
].items
[i
].path
);
3967 e
= path_join(p
->prefix
[t
], c
->directories
[t
].items
[i
].path
);
3971 r
= strv_consume(&list
, e
);
3977 *ret
= TAKE_PTR(list
);
/* Derives a CPU set from the NUMA policy stored in the exec context.
 *
 * If c->numa_policy.nodes.set is NULL a debug message is logged. Otherwise numa_to_cpu_set()
 * fills a local CPUSet (released automatically via cpu_set_done unless handed over), and the
 * result is moved into *ret with TAKE_STRUCT().
 *
 * NOTE(review): return values and the error check after numa_to_cpu_set() are not visible in
 * this listing. */
3982 static int exec_context_cpu_affinity_from_numa(const ExecContext
*c
, CPUSet
*ret
) {
3988 if (!c
->numa_policy
.nodes
.set
) {
3989 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3994 _cleanup_(cpu_set_done
) CPUSet s
= {};
3995 r
= numa_to_cpu_set(&c
->numa_policy
, &s
);
3999 *ret
= TAKE_STRUCT(s
);
/* Appends *fd to the 'fds' array (capacity fds_size, current length *n_fds).
 *
 * If *fd is numerically below 3 + *n_fds it is first duplicated with F_DUPFD_CLOEXEC to a
 * descriptor number >= 3 + *n_fds, and *fd is replaced by the duplicate via close_and_replace().
 * The assert requires that there is still room in the array.
 *
 * NOTE(review): early-exit conditions, the fcntl() error check and the return statements are not
 * visible in this listing. */
4003 static int add_shifted_fd(int *fds
, size_t fds_size
, size_t *n_fds
, int *fd
) {
4008 assert(*n_fds
< fds_size
);
4014 if (*fd
< 3 + (int) *n_fds
) {
4015 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4016 * the fds we pass to the process (or which are closed only during execve). */
4018 r
= fcntl(*fd
, F_DUPFD_CLOEXEC
, 3 + (int) *n_fds
);
4022 close_and_replace(*fd
, r
);
4025 fds
[(*n_fds
)++] = *fd
;
/* Connects to the AF_UNIX socket referenced by the already opened fd 'ofd'.
 *
 * The socket address is built from the /proc fd path of 'ofd' (FORMAT_PROC_FD_PATH). The function
 * then iterates over SOCK_DGRAM, SOCK_STREAM and SOCK_SEQPACKET, creating a SOCK_CLOEXEC socket
 * of each type and calling connect(). A connect() failure other than -EPROTOTYPE is logged at
 * debug level and returned. If the loop finishes without a usable type, a synthetic -EPROTOTYPE
 * is returned. of->path is only used in log messages here.
 *
 * NOTE(review): the success path (what is returned after a successful connect()) and several
 * error checks are not visible in this listing. */
4029 static int connect_unix_harder(const OpenFile
*of
, int ofd
) {
4030 static const int socket_types
[] = { SOCK_DGRAM
, SOCK_STREAM
, SOCK_SEQPACKET
};
4032 union sockaddr_union addr
= {
4033 .un
.sun_family
= AF_UNIX
,
4041 r
= sockaddr_un_set_path(&addr
.un
, FORMAT_PROC_FD_PATH(ofd
));
4043 return log_debug_errno(r
, "Failed to set sockaddr for '%s': %m", of
->path
);
4046 FOREACH_ELEMENT(i
, socket_types
) {
4047 _cleanup_close_
int fd
= -EBADF
;
4049 fd
= socket(AF_UNIX
, *i
|SOCK_CLOEXEC
, 0);
4051 return log_debug_errno(errno
, "Failed to create socket for '%s': %m", of
->path
);
4053 r
= RET_NERRNO(connect(fd
, &addr
.sa
, sa_len
));
4056 if (r
!= -EPROTOTYPE
)
4057 return log_debug_errno(r
, "Failed to connect to socket for '%s': %m", of
->path
);
4060 return log_debug_errno(SYNTHETIC_ERRNO(EPROTOTYPE
), "No suitable socket type to connect to socket '%s'.", of
->path
);
/* Opens the object described by an OpenFile entry and produces a usable file descriptor.
 *
 * The path is first opened with O_PATH|O_CLOEXEC and inspected with fstat(). For sockets the fd
 * is obtained from connect_unix_harder(); with OPENFILE_READ_ONLY the sending side is shut down
 * via shutdown(fd, SHUT_WR). For other file types the O_PATH fd is reopened with fd_reopen(),
 * using O_RDONLY or O_RDWR depending on OPENFILE_READ_ONLY, plus O_NOCTTY|O_CLOEXEC;
 * OPENFILE_APPEND and OPENFILE_TRUNCATE are tested to adjust the flags. Failures are logged at
 * debug level.
 *
 * NOTE(review): the flag adjustments for append/truncate, several error checks and the final
 * return are not visible in this listing. */
4063 static int get_open_file_fd(const OpenFile
*of
) {
4064 _cleanup_close_
int fd
= -EBADF
, ofd
= -EBADF
;
4069 ofd
= open(of
->path
, O_PATH
| O_CLOEXEC
);
4071 return log_debug_errno(errno
, "Failed to open '%s' as O_PATH: %m", of
->path
);
4073 if (fstat(ofd
, &st
) < 0)
4074 return log_debug_errno( errno
, "Failed to stat '%s': %m", of
->path
);
4076 if (S_ISSOCK(st
.st_mode
)) {
4077 fd
= connect_unix_harder(of
, ofd
);
4081 if (FLAGS_SET(of
->flags
, OPENFILE_READ_ONLY
) && shutdown(fd
, SHUT_WR
) < 0)
4082 return log_debug_errno(errno
, "Failed to shutdown send for socket '%s': %m", of
->path
);
4084 log_debug("Opened socket '%s' as fd %d.", of
->path
, fd
);
4086 int flags
= FLAGS_SET(of
->flags
, OPENFILE_READ_ONLY
) ? O_RDONLY
: O_RDWR
;
4087 if (FLAGS_SET(of
->flags
, OPENFILE_APPEND
))
4089 else if (FLAGS_SET(of
->flags
, OPENFILE_TRUNCATE
))
4092 fd
= fd_reopen(ofd
, flags
|O_NOCTTY
|O_CLOEXEC
);
4094 return log_debug_errno(fd
, "Failed to reopen file '%s': %m", of
->path
);
4096 log_debug("Opened file '%s' as fd %d.", of
->path
, fd
);
/* Opens every entry of p->open_files and appends the resulting fds to p->fds.
 *
 * For each entry get_open_file_fd() is called. On failure, entries flagged OPENFILE_GRACEFUL are
 * only logged ("ignoring") -- at debug level for -ENOENT or privilege errors, at warning level
 * otherwise -- while other entries cause an error return. On success p->fds is grown with
 * GREEDY_REALLOC(), the entry's fdname is appended to p->fd_names, and the fd is stored at index
 * *n_fds, which is then incremented.
 *
 * NOTE(review): the exact error-check conditions, the out-of-memory handling and the final return
 * are not visible in this listing. */
4102 static int collect_open_file_fds(ExecParameters
*p
, size_t *n_fds
) {
4106 LIST_FOREACH(open_files
, of
, p
->open_files
) {
4107 _cleanup_close_
int fd
= -EBADF
;
4109 fd
= get_open_file_fd(of
);
4111 if (FLAGS_SET(of
->flags
, OPENFILE_GRACEFUL
)) {
4112 log_full_errno(fd
== -ENOENT
|| ERRNO_IS_NEG_PRIVILEGE(fd
) ? LOG_DEBUG
: LOG_WARNING
,
4114 "Failed to get OpenFile= file descriptor for '%s', ignoring: %m",
4119 return log_error_errno(fd
, "Failed to get OpenFile= file descriptor for '%s': %m", of
->path
);
4122 if (!GREEDY_REALLOC(p
->fds
, *n_fds
+ 1))
4125 if (strv_extend(&p
->fd_names
, of
->fdname
) < 0)
4128 p
->fds
[(*n_fds
)++] = TAKE_FD(fd
);
/* Emits a structured debug log record describing a command line.
 *
 * The argv vector is rendered with quote_command_line(..., SHELL_ESCAPE_EMPTY); the record carries
 * EXECUTABLE=<executable>, a message of the form "<msg>: <cmdline>" (strnull() guards against a
 * failed quoting), and the invocation ID item from LOG_EXEC_INVOCATION_ID(params).
 *
 * NOTE(review): the remaining parameters and any early-exit checks are not visible in this listing. */
4134 static void log_command_line(
4135 const ExecContext
*context
,
4136 const ExecParameters
*params
,
4138 const char *executable
,
4149 _cleanup_free_
char *cmdline
= quote_command_line(argv
, SHELL_ESCAPE_EMPTY
);
4151 log_struct(LOG_DEBUG
,
4152 LOG_ITEM("EXECUTABLE=%s", executable
),
4153 LOG_EXEC_MESSAGE(params
, "%s: %s", msg
, strnull(cmdline
)),
4154 LOG_EXEC_INVOCATION_ID(params
));
/* Returns true if any of the listed sandboxing/namespacing settings is enabled in the context.
 *
 * The result is a plain logical OR over the settings below (private users/tmp/devices/network/IPC,
 * namespace paths, mounts, bind mounts, temporary filesystems, root directory, extension
 * directories, the Protect*= family, PID namespace and cgroup mount requirements, the various
 * path lists, and a non-initial delegate_namespaces mask). The function name indicates these are
 * the settings that need CAP_SYS_ADMIN. */
4157 static bool exec_needs_cap_sys_admin(const ExecContext
*context
, const ExecParameters
*params
) {
4160 return context
->private_users
!= PRIVATE_USERS_NO
||
4161 context
->private_tmp
!= PRIVATE_TMP_NO
||
4162 context
->private_devices
||
4163 context
->private_network
||
4164 context
->network_namespace_path
||
4165 context
->private_ipc
||
4166 context
->ipc_namespace_path
||
4167 context
->private_mounts
> 0 ||
4168 context
->mount_apivfs
> 0 ||
4169 context
->bind_log_sockets
> 0 ||
4170 context
->n_bind_mounts
> 0 ||
4171 context
->n_temporary_filesystems
> 0 ||
4172 context
->root_directory
||
4173 !strv_isempty(context
->extension_directories
) ||
4174 context
->protect_system
!= PROTECT_SYSTEM_NO
||
4175 context
->protect_home
!= PROTECT_HOME_NO
||
4176 exec_needs_pid_namespace(context
, params
) ||
4177 context
->protect_kernel_tunables
||
4178 context
->protect_kernel_modules
||
4179 context
->protect_kernel_logs
||
4180 exec_needs_cgroup_mount(context
) ||
4181 context
->protect_clock
||
4182 context
->protect_hostname
!= PROTECT_HOSTNAME_NO
||
4183 !strv_isempty(context
->read_write_paths
) ||
4184 !strv_isempty(context
->read_only_paths
) ||
4185 !strv_isempty(context
->inaccessible_paths
) ||
4186 !strv_isempty(context
->exec_paths
) ||
4187 !strv_isempty(context
->no_exec_paths
) ||
4188 context
->delegate_namespaces
!= NAMESPACE_FLAGS_INITIAL
;
/* Computes the PrivateUsers mode that is actually in effect.
 *
 * An explicitly configured mode (anything other than PRIVATE_USERS_NO) is returned as-is.
 * Otherwise PRIVATE_USERS_SELF is returned when delegate_namespaces differs from
 * NAMESPACE_FLAGS_INITIAL, and PRIVATE_USERS_NO is the final fallback.
 *
 * NOTE(review): lines between the visible statements (e.g. asserts or checks involving 'params')
 * are missing from this listing. */
4191 static PrivateUsers
exec_context_get_effective_private_users(
4192 const ExecContext
*context
,
4193 const ExecParameters
*params
) {
4198 if (context
->private_users
!= PRIVATE_USERS_NO
)
4199 return context
->private_users
;
4201 /* If any namespace is delegated with DelegateNamespaces=, always set up a user namespace. */
4202 if (context
->delegate_namespaces
!= NAMESPACE_FLAGS_INITIAL
)
4203 return PRIVATE_USERS_SELF
;
4205 return PRIVATE_USERS_NO
;
/* Decides whether the given namespace type (a CLONE_NEW* flag other than CLONE_NEWUSER, as
 * asserted) is to be treated as delegated.
 *
 * Visible checks, in order:
 *   - lacking CAP_SYS_ADMIN while exec_needs_cap_sys_admin() is true (see the comment below),
 *   - delegate_namespaces == NAMESPACE_FLAGS_INITIAL: result is whether the runtime scope is
 *     RUNTIME_SCOPE_USER,
 *   - the namespace flag being set in delegate_namespaces,
 *   - for CLONE_NEWNS: result is whether any of CLONE_NEWPID, CLONE_NEWCGROUP or CLONE_NEWNET is
 *     delegated.
 *
 * NOTE(review): the return statements of the first and third checks and the final return are not
 * visible in this listing. */
4208 static bool exec_namespace_is_delegated(
4209 const ExecContext
*context
,
4210 const ExecParameters
*params
,
4211 bool have_cap_sys_admin
,
4212 unsigned long namespace) {
4216 assert(namespace != CLONE_NEWUSER
);
4218 /* If we need unprivileged private users, we've already unshared a user namespace by the time we call
4219 * setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
4220 * unsharing in the first call to setup_delegated_namespaces() by returning false here. */
4221 if (!have_cap_sys_admin
&& exec_needs_cap_sys_admin(context
, params
))
4224 if (context
->delegate_namespaces
== NAMESPACE_FLAGS_INITIAL
)
4225 return params
->runtime_scope
== RUNTIME_SCOPE_USER
;
4227 if (FLAGS_SET(context
->delegate_namespaces
, namespace))
4230 /* Various namespaces imply mountns for private procfs/sysfs/cgroupfs instances, which means when
4231 * those are delegated mountns must be deferred too.
4233 * The list should stay in sync with exec_needs_mount_namespace(). */
4234 if (namespace == CLONE_NEWNS
)
4235 return context
->delegate_namespaces
& (CLONE_NEWPID
|CLONE_NEWCGROUP
|CLONE_NEWNET
);
/* Sets up the network, IPC, cgroup, PID, mount and UTS namespaces for one of the two passes
 * described in the comment below.
 *
 * Each namespace section is gated by comparing exec_namespace_is_delegated(...) for the respective
 * CLONE_NEW* flag against 'delegate', so a given namespace is handled in exactly one of the passes.
 * On failures that are reported as errors, *reterr_exit_status is set (EXIT_NETWORK for network
 * namespaces, EXIT_NAMESPACE for the others visible here) before the error is returned.
 * Network and IPC namespace setup tolerate privilege errors and missing kernel support with a
 * notice/warning, except when an explicit namespace path is configured, which is refused with
 * EOPNOTSUPP.
 *
 * NOTE(review): the 'delegate' parameter itself, several other parameters, error-check conditions
 * and the final return are not visible in this listing. */
4240 static int setup_delegated_namespaces(
4241 const ExecContext
*context
,
4242 ExecParameters
*params
,
4243 ExecRuntime
*runtime
,
4245 const char *memory_pressure_path
,
4248 const ExecCommand
*command
,
4249 bool needs_sandboxing
,
4250 bool have_cap_sys_admin
,
4251 int *reterr_exit_status
) {
4255 /* This function is called twice, once before unsharing the user namespace, and once after unsharing
4256 * the user namespace. When called before unsharing the user namespace, "delegate" is set to "false".
4257 * When called after unsharing the user namespace, "delegate" is set to "true". The net effect is
4258 * that all namespaces that should not be delegated are unshared when this function is called the
4259 * first time and all namespaces that should be delegated are unshared when this function is called
4260 * the second time. */
4265 assert(reterr_exit_status
);
4267 if (exec_needs_network_namespace(context
) &&
4268 exec_namespace_is_delegated(context
, params
, have_cap_sys_admin
, CLONE_NEWNET
) == delegate
&&
4269 runtime
->shared
&& runtime
->shared
->netns_storage_socket
[0] >= 0) {
4271 /* Try to enable network namespacing if network namespacing is available and we have
4272 * CAP_NET_ADMIN in the current user namespace (either the system manager one or the unit's
4273 * own user namespace). We need CAP_NET_ADMIN to be able to configure the loopback device in
4274 * the new network namespace. And if we don't have that, then we could only create a network
4275 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4276 if (namespace_type_supported(NAMESPACE_NET
) && have_effective_cap(CAP_NET_ADMIN
) > 0) {
4277 r
= setup_shareable_ns(runtime
->shared
->netns_storage_socket
, CLONE_NEWNET
);
4278 if (ERRNO_IS_NEG_PRIVILEGE(r
))
4279 log_notice_errno(r
, "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4281 *reterr_exit_status
= EXIT_NETWORK
;
4282 return log_error_errno(r
, "Failed to set up network namespacing: %m");
4284 log_debug("Set up %snetwork namespace", delegate
? "delegated " : "");
4285 } else if (context
->network_namespace_path
) {
4286 *reterr_exit_status
= EXIT_NETWORK
;
4287 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
), "NetworkNamespacePath= is not supported, refusing.");
4289 log_notice("PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4292 if (exec_needs_ipc_namespace(context
) &&
4293 exec_namespace_is_delegated(context
, params
, have_cap_sys_admin
, CLONE_NEWIPC
) == delegate
&&
4294 runtime
->shared
&& runtime
->shared
->ipcns_storage_socket
[0] >= 0) {
4296 if (namespace_type_supported(NAMESPACE_IPC
)) {
4297 r
= setup_shareable_ns(runtime
->shared
->ipcns_storage_socket
, CLONE_NEWIPC
);
4298 if (ERRNO_IS_NEG_PRIVILEGE(r
))
4299 log_warning_errno(r
, "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4301 *reterr_exit_status
= EXIT_NAMESPACE
;
4302 return log_error_errno(r
, "Failed to set up IPC namespacing: %m");
4304 log_debug("Set up %sIPC namespace", delegate
? "delegated " : "");
4305 } else if (context
->ipc_namespace_path
) {
4306 *reterr_exit_status
= EXIT_NAMESPACE
;
4307 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
), "IPCNamespacePath= is not supported, refusing.");
4309 log_warning("PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4312 if (needs_sandboxing
&& exec_needs_cgroup_namespace(context
) &&
4313 exec_namespace_is_delegated(context
, params
, have_cap_sys_admin
, CLONE_NEWCGROUP
) == delegate
) {
4314 if (unshare(CLONE_NEWCGROUP
) < 0) {
4315 *reterr_exit_status
= EXIT_NAMESPACE
;
4316 return log_error_errno(errno
, "Failed to set up cgroup namespacing: %m");
4319 log_debug("Set up %scgroup namespace", delegate
? "delegated " : "");
4322 /* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
4323 * Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
4324 if (needs_sandboxing
&& exec_needs_pid_namespace(context
, params
) &&
4325 exec_namespace_is_delegated(context
, params
, have_cap_sys_admin
, CLONE_NEWPID
) == delegate
) {
4326 if (params
->pidref_transport_fd
< 0) {
4327 *reterr_exit_status
= EXIT_NAMESPACE
;
4328 return log_error_errno(SYNTHETIC_ERRNO(ENOTCONN
), "PidRef socket is not set up: %m");
4331 /* If we had CAP_SYS_ADMIN prior to joining the user namespace, then we are privileged and don't need
4332 * to check if we can mount /proc/.
4334 * We need to check prior to entering the user namespace because if we're running unprivileged or in a
4335 * system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not
4336 * once we unshare a mount namespace. */
4337 if (!have_cap_sys_admin
|| delegate
) {
4338 r
= can_mount_proc();
4340 *reterr_exit_status
= EXIT_NAMESPACE
;
4341 return log_error_errno(r
, "Failed to detect if /proc/ can be remounted: %m");
4344 *reterr_exit_status
= EXIT_NAMESPACE
;
4345 return log_error_errno(SYNTHETIC_ERRNO(EPERM
),
4346 "PrivatePIDs=yes is configured, but /proc/ cannot be re-mounted due to lack of privileges, refusing.");
4350 r
= setup_private_pids(context
, params
);
4352 *reterr_exit_status
= EXIT_NAMESPACE
;
4353 return log_error_errno(r
, "Failed to set up pid namespace: %m");
4356 log_debug("Set up %spid namespace", delegate
? "delegated " : "");
4359 /* If PrivatePIDs= yes is configured, we're now running as pid 1 in a pid namespace! */
4361 if (exec_needs_mount_namespace(context
, params
, runtime
) &&
4362 exec_namespace_is_delegated(context
, params
, have_cap_sys_admin
, CLONE_NEWNS
) == delegate
) {
4363 _cleanup_free_
char *error_path
= NULL
;
4365 r
= apply_mount_namespace(command
->flags
,
4369 memory_pressure_path
,
4375 *reterr_exit_status
= EXIT_NAMESPACE
;
4376 return log_error_errno(r
, "Failed to set up mount namespacing%s%s: %m",
4377 error_path
? ": " : "", strempty(error_path
));
4380 log_debug("Set up %smount namespace", delegate
? "delegated " : "");
4383 if (needs_sandboxing
&&
4384 exec_namespace_is_delegated(context
, params
, have_cap_sys_admin
, CLONE_NEWUTS
) == delegate
) {
4385 r
= apply_protect_hostname(context
, params
, reterr_exit_status
);
4389 log_debug("Set up %sUTS namespace", delegate
? "delegated " : "");
/* Tells whether the confirm-spawn question applies to this exec context.
 *
 * confirm_spawn_disabled() is consulted first; the final return is true only when the unit does
 * not stay in the same process group (context->same_pgrp is false).
 *
 * NOTE(review): the statement executed when confirm_spawn_disabled() is true is not visible in
 * this listing. */
4395 static bool exec_context_shall_confirm_spawn(const ExecContext
*context
) {
4398 if (confirm_spawn_disabled())
4401 /* For some reason, units remaining in the same process group
4402 * as PID 1 fail to acquire the console even if it's not used
4403 * by any process. So skip the confirmation question for them. */
4404 return !context
->same_pgrp
;
/* Resolves stdin/stdout/stderr that are configured as named file descriptors.
 *
 * 'targets' starts as the number of standard streams configured as EXEC_INPUT_NAMED_FD /
 * EXEC_OUTPUT_NAMED_FD. The expected fd name of each stream comes from exec_context_fdname().
 * The loop walks the passed fds (n_storage_fds + n_socket_fds + n_extra_fds entries) while
 * targets > 0 and, for a stream that is still unresolved (named_iofds[x] < 0) and whose name
 * equals p->fd_names[i], stores p->fds[i] into named_iofds[x]. The if/else-if chain means one fd
 * entry resolves at most one stream per iteration.
 *
 * Returns 0 when targets reached 0, -ENOENT otherwise.
 *
 * NOTE(review): the statements following each assignment (presumably decrementing 'targets') and
 * the local declarations are not visible in this listing -- confirm against the full file. */
4407 static int exec_context_named_iofds(
4408 const ExecContext
*c
,
4409 const ExecParameters
*p
,
4410 int named_iofds
[static 3]) {
4413 const char* stdio_fdname
[3];
4418 assert(named_iofds
);
4420 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
4421 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
4422 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
4424 for (size_t i
= 0; i
< 3; i
++)
4425 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
4427 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
+ p
->n_extra_fds
;
4429 for (size_t i
= 0; i
< n_fds
&& targets
> 0; i
++)
4430 if (named_iofds
[STDIN_FILENO
] < 0 &&
4431 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
4432 stdio_fdname
[STDIN_FILENO
] &&
4433 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
4435 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
4438 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
4439 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
4440 stdio_fdname
[STDOUT_FILENO
] &&
4441 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
4443 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
4446 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
4447 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
4448 stdio_fdname
[STDERR_FILENO
] &&
4449 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
4451 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
4455 return targets
== 0 ? 0 : -ENOENT
;
/* Closes both storage socket pairs (network and IPC namespace) of a shared runtime object using
 * safe_close_pair().
 *
 * NOTE(review): lines preceding the visible calls (e.g. a NULL check on 'shared') are missing
 * from this listing. */
4458 static void exec_shared_runtime_close(ExecSharedRuntime
*shared
) {
4462 safe_close_pair(shared
->netns_storage_socket
);
4463 safe_close_pair(shared
->ipcns_storage_socket
);
/* Closes the file descriptors held by an ExecRuntime: its ephemeral storage socket pair, then the
 * sockets of the attached shared runtime (exec_shared_runtime_close()) and of the dynamic
 * credentials (dynamic_creds_close()).
 *
 * NOTE(review): lines preceding the visible calls (e.g. a NULL check on 'rt') are missing from
 * this listing. */
4466 static void exec_runtime_close(ExecRuntime
*rt
) {
4470 safe_close_pair(rt
->ephemeral_storage_socket
);
4472 exec_shared_runtime_close(rt
->shared
);
4473 dynamic_creds_close(rt
->dynamic_creds
);
/* Closes the stdin/stdout/stderr fds stored in the parameters. Each field is overwritten with the
 * value returned by safe_close(), so the struct no longer refers to the closed descriptor.
 *
 * NOTE(review): lines preceding the visible statements (e.g. a NULL check on 'p') are missing
 * from this listing. */
4476 static void exec_params_close(ExecParameters
*p
) {
4480 p
->stdin_fd
= safe_close(p
->stdin_fd
);
4481 p
->stdout_fd
= safe_close(p
->stdout_fd
);
4482 p
->stderr_fd
= safe_close(p
->stderr_fd
);
/* Writes the value 'x' to p->exec_fd to mark it as "hot" or "cold" (the wording used in the
 * error message, selected by 'hot').
 *
 * If the write() fails, *reterr_exit_status is set to EXIT_EXEC when the pointer is non-NULL,
 * and the logged errno-based error is returned.
 *
 * NOTE(review): the 'p' and 'hot' parameters, the definition of 'x', any early-exit checks and
 * the success return are not visible in this listing. */
4485 static int exec_fd_mark_hot(
4486 const ExecContext
*c
,
4489 int *reterr_exit_status
) {
4499 if (write(p
->exec_fd
, &x
, sizeof(x
)) < 0) {
4500 if (reterr_exit_status
)
4501 *reterr_exit_status
= EXIT_EXEC
;
4502 return log_error_errno(errno
, "Failed to mark exec_fd as %s: %m", hot
? "hot" : "cold");
/* Sends the current time over p->handoff_timestamp_fd.
 *
 * The payload is a two-element usec_t array { realtime, monotonic } taken from
 * dual_timestamp_now(), written with a single write(). A negative handoff_timestamp_fd is
 * checked first. If the write() fails, *reterr_exit_status is set to EXIT_EXEC when the pointer
 * is non-NULL, and the logged errno-based error is returned.
 *
 * NOTE(review): the 'p' parameter, the statement executed for a negative fd and the success
 * return are not visible in this listing. */
4508 static int send_handoff_timestamp(
4509 const ExecContext
*c
,
4511 int *reterr_exit_status
) {
4516 if (p
->handoff_timestamp_fd
< 0)
4520 dual_timestamp_now(&dt
);
4522 if (write(p
->handoff_timestamp_fd
, (const usec_t
[2]) { dt
.realtime
, dt
.monotonic
}, sizeof(usec_t
) * 2) < 0) {
4523 if (reterr_exit_status
)
4524 *reterr_exit_status
= EXIT_EXEC
;
4525 return log_error_errno(errno
, "Failed to send handoff timestamp: %m");
/* Prepares the terminal on stdout for the upcoming invocation.
 *
 * The work is only done when the output configuration may refer to a TTY (see the condition
 * below). With context->tty_reset set, lock_dev_console() is called first and
 * terminal_reset_defensive() is then run on STDOUT_FILENO, forcing or avoiding ANSI sequences
 * according to exec_context_shall_ansi_seq_reset(). Afterwards the configured TTY size is applied
 * and an OSC service context is opened for the unit/invocation ID. All of these calls are
 * best-effort: their results are discarded or only logged at debug level.
 *
 * NOTE(review): part of the TTY condition and the block structure are not visible in this listing. */
4531 static void prepare_terminal(
4532 const ExecContext
*context
,
4533 ExecParameters
*p
) {
4535 _cleanup_close_
int lock_fd
= -EBADF
;
4537 /* This is the "constructive" reset, i.e. is about preparing things for our invocation rather than
4538 * cleaning up things from older invocations. */
4543 /* We only try to reset things if there's the chance our stdout points to a TTY */
4544 if (!(is_terminal_output(context
->std_output
) ||
4545 (context
->std_output
== EXEC_OUTPUT_INHERIT
&& is_terminal_input(context
->std_input
)) ||
4546 context
->std_output
== EXEC_OUTPUT_NAMED_FD
||
4550 /* Let's explicitly determine whether to reset via ANSI sequences or not, taking our ExecContext
4551 * information into account */
4552 bool use_ansi
= exec_context_shall_ansi_seq_reset(context
);
4554 if (context
->tty_reset
) {
4555 /* When we are resetting the TTY, then let's create a lock first, to synchronize access. This
4556 * in particular matters as concurrent resets and the TTY size ANSI DSR logic done by the
4557 * exec_context_apply_tty_size() below might interfere */
4558 lock_fd
= lock_dev_console();
4560 log_debug_errno(lock_fd
, "Failed to lock /dev/console, ignoring: %m");
4562 /* We explicitly control whether to send ansi sequences or not here, since we want to consult
4563 * the env vars explicitly configured in the ExecContext, rather than our own environment
4565 (void) terminal_reset_defensive(STDOUT_FILENO
, use_ansi
? TERMINAL_RESET_FORCE_ANSI_SEQ
: TERMINAL_RESET_AVOID_ANSI_SEQ
);
4568 (void) exec_context_apply_tty_size(context
, STDIN_FILENO
, STDOUT_FILENO
, /* tty_path= */ NULL
);
4571 (void) osc_context_open_service(p
->unit_id
, p
->invocation_id
, /* ret_seq= */ NULL
);
/* Makes sure a suitable $TERM is present in the environment block '*env'.
 *
 * Visible steps, in order:
 *   1. Check whether TERM is already present in *env.
 *   2. Check whether any of stdin/stdout/stderr is connected to a terminal at all.
 *   3. If the TTY is the console and the parent is PID 1, copy TERM= (and COLORTERM=/NO_COLOR=)
 *      from this process's own environment.
 *   4. If the TTY name is purely alphanumerical, look up "systemd.tty.term.<tty>" on the kernel
 *      command line and use its value as TERM.
 *   5. Query the terminal itself via query_term_for_tty().
 *   6. As last resort set COLORTERM=truecolor and TERM=FALLBACK_TERM.
 *
 * NOTE(review): the return statements and error checks between these steps are not visible in
 * this listing; confirm which steps short-circuit against the full file. */
4574 static int setup_term_environment(const ExecContext
*context
, char ***env
) {
4580 /* Already specified by user? */
4581 if (strv_env_get(*env
, "TERM"))
4584 /* Do we need $TERM at all? */
4585 if (!is_terminal_input(context
->std_input
) &&
4586 !is_terminal_output(context
->std_output
) &&
4587 !is_terminal_output(context
->std_error
) &&
4591 const char *tty_path
= exec_context_tty_path(context
);
4593 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
4594 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
4595 * container manager passes to PID 1 ends up all the way in the console login shown.
4597 * Note that if this doesn't work out we won't bother with querying systemd.tty.term.console
4598 * kernel cmdline option or DCS anymore either, because pid1 also imports $TERM based on those
4599 * and it should have showed up as our $TERM if there were anything. */
4600 if (tty_is_console(tty_path
) && getppid() == 1) {
4601 const char *term
= strv_find_prefix(environ
, "TERM=");
4603 r
= strv_env_replace_strdup(env
, term
);
4607 FOREACH_STRING(i
, "COLORTERM=", "NO_COLOR=") {
4608 const char *s
= strv_find_prefix(environ
, i
);
4612 r
= strv_env_replace_strdup(env
, s
);
4621 if (in_charset(skip_dev_prefix(tty_path
), ALPHANUMERICAL
)) {
4622 _cleanup_free_
char *key
= NULL
, *cmdline
= NULL
;
4624 key
= strjoin("systemd.tty.term.", skip_dev_prefix(tty_path
));
4628 r
= proc_cmdline_get_key(key
, /* flags = */ 0, &cmdline
);
4630 return strv_env_assign(env
, "TERM", cmdline
);
4632 log_debug_errno(r
, "Failed to read '%s' from kernel cmdline, ignoring: %m", key
);
4635 /* This handles real virtual terminals (returning "linux") and
4636 * any terminals which support the DCS +q query sequence. */
4637 _cleanup_free_
char *dcs_term
= NULL
;
4638 r
= query_term_for_tty(tty_path
, &dcs_term
);
4640 return strv_env_assign(env
, "TERM", dcs_term
);
4644 /* If $TERM is not known and we pick a fallback default, then let's also set
4645 * $COLORTERM=truecolor. That's because our fallback default is vt220, which is
4646 * generally a safe bet (as it supports PageUp/PageDown unlike vt100, and is quite
4647 * universally available in terminfo/termcap), except for the fact that real DEC
4648 * vt220 gear never actually supported color. Most tools these days generate color on
4649 * vt220 anyway, ignoring the physical capabilities of the real hardware, but some
4650 * tools actually believe in the historical truth. Which is unfortunate since *we*
4651 * *don't* care about the historical truth, we just want sane defaults if nothing
4652 * better is explicitly configured. It's 2025 after all, at the time of writing,
4653 * pretty much all terminal emulators actually *do* support color, hence if we don't
4654 * know any better let's explicitly claim color support via $COLORTERM. Or in other
4655 * words: we now explicitly claim to be connected to a franken-vt220 with true color
4657 r
= strv_env_replace_strdup(env
, "COLORTERM=truecolor");
4661 return strv_env_replace_strdup(env
, "TERM=" FALLBACK_TERM
);
4665 const ExecCommand
*command
,
4666 const ExecContext
*context
,
4667 ExecParameters
*params
,
4668 ExecRuntime
*runtime
,
4669 const CGroupContext
*cgroup_context
,
4672 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **joined_exec_search_path
= NULL
, **accum_env
= NULL
;
4674 const char *username
= NULL
, *groupname
= NULL
;
4675 _cleanup_free_
char *home_buffer
= NULL
, *memory_pressure_path
= NULL
, *own_user
= NULL
;
4676 const char *pwent_home
= NULL
, *shell
= NULL
;
4677 dev_t journal_stream_dev
= 0;
4678 ino_t journal_stream_ino
= 0;
4679 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4680 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
4681 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
4683 userns_set_up
= false,
4684 keep_seccomp_privileges
= false;
4686 _cleanup_free_
char *mac_selinux_context_net
= NULL
;
4687 bool use_selinux
= false;
4690 bool use_smack
= false;
4693 bool use_apparmor
= false;
4696 uint64_t saved_bset
= 0;
4698 uid_t saved_uid
= getuid();
4699 gid_t saved_gid
= getgid();
4700 uid_t uid
= UID_INVALID
;
4701 gid_t gid
= GID_INVALID
;
4702 size_t n_fds
, /* fds to pass to the child */
4703 n_keep_fds
; /* total number of fds not to close */
4705 _cleanup_free_ gid_t
*gids
= NULL
, *gids_after_pam
= NULL
;
4706 int ngids
= 0, ngids_after_pam
= 0;
4707 int socket_fd
= -EBADF
, named_iofds
[3] = EBADF_TRIPLET
;
4708 size_t n_storage_fds
, n_socket_fds
, n_extra_fds
;
4714 assert(cgroup_context
);
4715 assert(exit_status
);
4717 LOG_CONTEXT_PUSH_EXEC(context
, params
);
4719 /* Explicitly test for CVE-2021-4034 inspired invocations */
4720 if (!command
->path
|| strv_isempty(command
->argv
)) {
4721 *exit_status
= EXIT_EXEC
;
4722 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Invalid command line arguments.");
4725 if (context
->std_input
== EXEC_INPUT_SOCKET
||
4726 context
->std_output
== EXEC_OUTPUT_SOCKET
||
4727 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
4729 if (params
->n_socket_fds
> 1)
4730 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Got more than one socket.");
4732 if (params
->n_socket_fds
== 0)
4733 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Got no socket.");
4735 socket_fd
= params
->fds
[0];
4736 n_storage_fds
= n_socket_fds
= n_extra_fds
= 0;
4738 n_socket_fds
= params
->n_socket_fds
;
4739 n_storage_fds
= params
->n_storage_fds
;
4740 n_extra_fds
= params
->n_extra_fds
;
4742 n_fds
= n_socket_fds
+ n_storage_fds
+ n_extra_fds
;
4744 r
= exec_context_named_iofds(context
, params
, named_iofds
);
4746 return log_error_errno(r
, "Failed to load a named file descriptor: %m");
4748 rename_process_from_path(command
->path
);
4750 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4751 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4752 * both of which will be demoted to SIG_DFL. */
4753 (void) default_signals(SIGNALS_CRASH_HANDLER
,
4756 if (context
->ignore_sigpipe
)
4757 (void) ignore_signals(SIGPIPE
);
4759 r
= reset_signal_mask();
4761 *exit_status
= EXIT_SIGNAL_MASK
;
4762 return log_error_errno(r
, "Failed to set process signal mask: %m");
4765 if (params
->idle_pipe
)
4766 do_idle_pipe_dance(params
->idle_pipe
);
4768 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4769 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4770 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4771 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4774 log_set_open_when_needed(true);
4775 log_settle_target();
4777 /* In case anything used libc syslog(), close this here, too */
4780 r
= collect_open_file_fds(params
, &n_fds
);
4782 *exit_status
= EXIT_FDS
;
4783 return log_error_errno(r
, "Failed to get OpenFile= file descriptors: %m");
4786 int keep_fds
[n_fds
+ 4];
4787 memcpy_safe(keep_fds
, params
->fds
, n_fds
* sizeof(int));
4790 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, ¶ms
->exec_fd
);
4792 *exit_status
= EXIT_FDS
;
4793 return log_error_errno(r
, "Failed to collect shifted fd: %m");
4796 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, ¶ms
->handoff_timestamp_fd
);
4798 *exit_status
= EXIT_FDS
;
4799 return log_error_errno(r
, "Failed to collect shifted fd: %m");
4803 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, ¶ms
->bpf_restrict_fs_map_fd
);
4805 *exit_status
= EXIT_FDS
;
4806 return log_error_errno(r
, "Failed to collect shifted fd: %m");
4810 r
= close_remaining_fds(params
, runtime
, socket_fd
, keep_fds
, n_keep_fds
);
4812 *exit_status
= EXIT_FDS
;
4813 return log_error_errno(r
, "Failed to close unwanted file descriptors: %m");
4816 if (!context
->same_pgrp
&&
4818 *exit_status
= EXIT_SETSID
;
4819 return log_error_errno(errno
, "Failed to create new process session: %m");
4822 /* Now, reset the TTY associated to this service "destructively" (i.e. possibly even hang up or
4823 * disallocate the VT), to get rid of any prior uses of the device. Note that we do not keep any fd
4824 * open here, hence some of the settings made here might vanish again, depending on the TTY driver
4825 * used. A 2nd ("constructive") initialization after we opened the input/output fds we actually want
4826 * will fix this. Note that we pass a NULL invocation ID here – as exec_context_tty_reset() expects
4827 * the invocation ID associated with the OSC 3008 context ID to close. But we don't want to close any
4828 * OSC 3008 context here, and opening a fresh OSC 3008 context happens a bit further down. */
4829 exec_context_tty_reset(context
, params
, /* invocation_id= */ SD_ID128_NULL
);
4831 if (params
->shall_confirm_spawn
&& exec_context_shall_confirm_spawn(context
)) {
4832 _cleanup_free_
char *cmdline
= NULL
;
4834 cmdline
= quote_command_line(command
->argv
, SHELL_ESCAPE_EMPTY
);
4836 *exit_status
= EXIT_MEMORY
;
4840 r
= ask_for_confirmation(context
, params
, cmdline
);
4841 if (r
!= CONFIRM_EXECUTE
) {
4842 if (r
== CONFIRM_PRETEND_SUCCESS
) {
4843 *exit_status
= EXIT_SUCCESS
;
4847 *exit_status
= EXIT_CONFIRM
;
4848 return log_error_errno(SYNTHETIC_ERRNO(ECANCELED
), "Execution cancelled by the user.");
4852 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4853 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4854 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4855 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4856 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4857 if (setenv("SYSTEMD_ACTIVATION_UNIT", params
->unit_id
, true) != 0 ||
4858 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params
->runtime_scope
), true) != 0) {
4859 *exit_status
= EXIT_MEMORY
;
4860 return log_error_errno(errno
, "Failed to update environment: %m");
4863 if (context
->dynamic_user
&& runtime
->dynamic_creds
) {
4864 _cleanup_strv_free_
char **suggested_paths
= NULL
;
4866 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4867 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4868 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4869 *exit_status
= EXIT_USER
;
4870 return log_error_errno(errno
, "Failed to update environment: %m");
4873 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
4875 *exit_status
= EXIT_MEMORY
;
4879 r
= dynamic_creds_realize(runtime
->dynamic_creds
, suggested_paths
, &uid
, &gid
);
4881 *exit_status
= EXIT_USER
;
4883 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
),
4884 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4885 return log_error_errno(r
, "Failed to update dynamic user credentials: %m");
4888 if (!uid_is_valid(uid
)) {
4889 *exit_status
= EXIT_USER
;
4890 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "UID validation failed for \""UID_FMT
"\".", uid
);
4893 if (!gid_is_valid(gid
)) {
4894 *exit_status
= EXIT_USER
;
4895 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "GID validation failed for \""GID_FMT
"\".", gid
);
4898 if (runtime
->dynamic_creds
->user
)
4899 username
= runtime
->dynamic_creds
->user
->name
;
4906 else if (context
->pam_name
|| FLAGS_SET(command
->flags
, EXEC_COMMAND_VIA_SHELL
)) {
4907 /* If PAM is enabled but no user name is explicitly selected, then use our own one. */
4908 own_user
= getusername_malloc();
4910 *exit_status
= EXIT_USER
;
4911 return log_error_errno(r
, "Failed to determine my own user ID: %m");
4918 /* We can't use nss unconditionally for root without risking deadlocks if some IPC services
4919 * will be started by pid1 and are ordered after us. But if SetLoginEnvironment= is
4920 * enabled *explicitly* (i.e. no exec_context_get_set_login_environment() here),
4921 * or PAM shall be invoked, let's consult NSS even for root, so that the user
4922 * gets accurate $SHELL in session(-like) contexts. */
4923 r
= get_fixed_user(u
,
4924 /* prefer_nss = */ context
->set_login_environment
> 0 || context
->pam_name
,
4925 &username
, &uid
, &gid
, &pwent_home
, &shell
);
4927 *exit_status
= EXIT_USER
;
4928 return log_error_errno(r
, "Failed to determine user credentials: %m");
4932 if (context
->group
) {
4933 r
= get_fixed_group(context
->group
, &groupname
, &gid
);
4935 *exit_status
= EXIT_GROUP
;
4936 return log_error_errno(r
, "Failed to determine group credentials: %m");
4941 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4942 ngids
= get_supplementary_groups(context
, username
, gid
, &gids
);
4944 *exit_status
= EXIT_GROUP
;
4945 return log_error_errno(ngids
, "Failed to determine supplementary groups: %m");
4948 r
= send_user_lookup(params
->unit_id
, params
->user_lookup_fd
, uid
, gid
);
4950 *exit_status
= EXIT_USER
;
4951 return log_error_errno(r
, "Failed to send user credentials to PID1: %m");
4954 params
->user_lookup_fd
= safe_close(params
->user_lookup_fd
);
4956 r
= acquire_home(context
, &pwent_home
, &home_buffer
);
4958 *exit_status
= EXIT_CHDIR
;
4959 return log_error_errno(r
, "Failed to determine $HOME for the invoking user: %m");
4962 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4964 (void) fd_nonblock(socket_fd
, false);
4966 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4968 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
4970 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4971 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4972 if (params
->cgroup_path
) {
4973 _cleanup_free_
char *subcgroup
= NULL
;
4975 r
= exec_params_get_cgroup_path(params
, cgroup_context
, params
->cgroup_path
, &subcgroup
);
4977 *exit_status
= EXIT_CGROUP
;
4978 return log_error_errno(r
, "Failed to acquire cgroup path: %m");
4981 /* If there is a subcgroup required, let's make sure to create it now. */
4982 r
= cg_create(subcgroup
);
4984 return log_error_errno(r
, "Failed to create subcgroup '%s': %m", subcgroup
);
4987 /* If we need a cgroup namespace, we cannot yet move the service to its configured subgroup,
4988 * as unsharing the cgroup namespace later on makes the current cgroup the root of the
4989 * namespace and we want the root of the namespace to be the main service cgroup and not the
4990 * subgroup. One edge case is if we're a control process that needs to be spawned in a
4991 * subgroup, in this case, we have no choice as moving into the main service cgroup might
4992 * violate the no inner processes rule of cgroupv2. */
4993 const char *cgtarget
= needs_sandboxing
&& exec_needs_cgroup_namespace(context
) &&
4994 !exec_params_needs_control_subcgroup(params
)
4995 ? params
->cgroup_path
: subcgroup
;
4997 r
= cg_attach(cgtarget
, 0);
4998 if (r
== -EUCLEAN
) {
4999 *exit_status
= EXIT_CGROUP
;
5000 return log_error_errno(r
,
5001 "Failed to attach process to cgroup '%s', "
5002 "because the cgroup or one of its parents or "
5003 "siblings is in the threaded mode.", cgtarget
);
5006 *exit_status
= EXIT_CGROUP
;
5007 return log_error_errno(r
, "Failed to attach to cgroup %s: %m", cgtarget
);
5011 if (context
->network_namespace_path
&& runtime
->shared
&& runtime
->shared
->netns_storage_socket
[0] >= 0) {
5012 r
= open_shareable_ns_path(runtime
->shared
->netns_storage_socket
, context
->network_namespace_path
, CLONE_NEWNET
);
5014 *exit_status
= EXIT_NETWORK
;
5015 return log_error_errno(r
, "Failed to open network namespace path %s: %m", context
->network_namespace_path
);
5019 if (context
->ipc_namespace_path
&& runtime
->shared
&& runtime
->shared
->ipcns_storage_socket
[0] >= 0) {
5020 r
= open_shareable_ns_path(runtime
->shared
->ipcns_storage_socket
, context
->ipc_namespace_path
, CLONE_NEWIPC
);
5022 *exit_status
= EXIT_NAMESPACE
;
5023 return log_error_errno(r
, "Failed to open IPC namespace path %s: %m", context
->ipc_namespace_path
);
5027 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
5029 *exit_status
= EXIT_STDIN
;
5030 return log_error_errno(r
, "Failed to set up standard input: %m");
5033 _cleanup_free_
char *fname
= NULL
;
5034 r
= path_extract_filename(command
->path
, &fname
);
5036 *exit_status
= EXIT_STDOUT
;
5037 return log_error_errno(r
, "Failed to extract filename from path %s: %m", command
->path
);
5040 r
= setup_output(context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, fname
, uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
5042 *exit_status
= EXIT_STDOUT
;
5043 return log_error_errno(r
, "Failed to set up standard output: %m");
5046 r
= setup_output(context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, fname
, uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
5048 *exit_status
= EXIT_STDERR
;
5049 return log_error_errno(r
, "Failed to set up standard error output: %m");
5052 /* Now that stdin/stdout are definitely opened, properly initialize it with our desired
5053 * settings. Note: this is a "constructive" reset, it prepares things for us to use. This is
5054 * different from the "destructive" TTY reset further up. Also note: we apply this on stdin/stdout in
5055 * case this is a tty, regardless if we opened it ourselves or got it passed in pre-opened. */
5056 prepare_terminal(context
, params
);
5058 if (context
->oom_score_adjust_set
) {
5059 /* When we can't make this change due to EPERM, then let's silently skip over it. User
5060 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
5061 r
= set_oom_score_adjust(context
->oom_score_adjust
);
5062 if (ERRNO_IS_NEG_PRIVILEGE(r
))
5063 log_debug_errno(r
, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
5065 *exit_status
= EXIT_OOM_ADJUST
;
5066 return log_error_errno(r
, "Failed to adjust OOM setting: %m");
5070 if (context
->coredump_filter_set
) {
5071 r
= set_coredump_filter(context
->coredump_filter
);
5072 if (ERRNO_IS_NEG_PRIVILEGE(r
))
5073 log_debug_errno(r
, "Failed to adjust coredump_filter, ignoring: %m");
5075 *exit_status
= EXIT_LIMITS
;
5076 return log_error_errno(r
, "Failed to adjust coredump_filter: %m");
5080 if (context
->cpu_sched_set
) {
5081 struct sched_attr attr
= {
5082 .size
= sizeof(attr
),
5083 .sched_policy
= context
->cpu_sched_policy
,
5084 .sched_priority
= context
->cpu_sched_priority
,
5085 .sched_flags
= context
->cpu_sched_reset_on_fork
? SCHED_FLAG_RESET_ON_FORK
: 0,
5088 r
= sched_setattr(/* pid= */ 0, &attr
, /* flags= */ 0);
5090 *exit_status
= EXIT_SETSCHEDULER
;
5091 return log_error_errno(errno
, "Failed to set up CPU scheduling: %m");
5096 * Set nice value _after_ the call to sched_setattr() because struct sched_attr includes sched_nice
5097 * which we do not set, thus it will clobber any previously set nice value. Scheduling policy might
5098 * be reasonably set together with nice value e.g. in case of SCHED_BATCH (see sched(7)).
5099 * It would be ideal to set both with the same call, but we cannot easily do so because of all the
5100 * extra logic in setpriority_closest().
5102 if (context
->nice_set
) {
5103 r
= setpriority_closest(context
->nice
);
5105 *exit_status
= EXIT_NICE
;
5106 return log_error_errno(r
, "Failed to set up process scheduling priority (nice level): %m");
5110 if (context
->cpu_affinity_from_numa
|| context
->cpu_set
.set
) {
5111 _cleanup_(cpu_set_done
) CPUSet converted_cpu_set
= {};
5112 const CPUSet
*cpu_set
;
5114 if (context
->cpu_affinity_from_numa
) {
5115 r
= exec_context_cpu_affinity_from_numa(context
, &converted_cpu_set
);
5117 *exit_status
= EXIT_CPUAFFINITY
;
5118 return log_error_errno(r
, "Failed to derive CPU affinity mask from NUMA mask: %m");
5121 cpu_set
= &converted_cpu_set
;
5123 cpu_set
= &context
->cpu_set
;
5125 if (sched_setaffinity(0, cpu_set
->allocated
, cpu_set
->set
) < 0) {
5126 *exit_status
= EXIT_CPUAFFINITY
;
5127 return log_error_errno(errno
, "Failed to set up CPU affinity: %m");
5131 if (mpol_is_valid(numa_policy_get_type(&context
->numa_policy
))) {
5132 r
= apply_numa_policy(&context
->numa_policy
);
5133 if (ERRNO_IS_NEG_NOT_SUPPORTED(r
))
5134 log_debug_errno(r
, "NUMA support not available, ignoring.");
5136 *exit_status
= EXIT_NUMA_POLICY
;
5137 return log_error_errno(r
, "Failed to set NUMA memory policy: %m");
5141 if (context
->ioprio_set
)
5142 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
5143 *exit_status
= EXIT_IOPRIO
;
5144 return log_error_errno(errno
, "Failed to set up IO scheduling priority: %m");
5147 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
5148 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
5149 *exit_status
= EXIT_TIMERSLACK
;
5150 return log_error_errno(errno
, "Failed to set up timer slack: %m");
5153 if (context
->personality
!= PERSONALITY_INVALID
) {
5154 r
= safe_personality(context
->personality
);
5156 *exit_status
= EXIT_PERSONALITY
;
5157 return log_error_errno(r
, "Failed to set up execution domain (personality): %m");
5161 if (context
->memory_ksm
>= 0)
5162 if (prctl(PR_SET_MEMORY_MERGE
, context
->memory_ksm
, 0, 0, 0) < 0) {
5163 if (ERRNO_IS_NOT_SUPPORTED(errno
))
5164 log_debug_errno(errno
, "KSM support not available, ignoring.");
5166 *exit_status
= EXIT_KSM
;
5167 return log_error_errno(errno
, "Failed to set KSM: %m");
5172 if (context
->utmp_id
) {
5173 _cleanup_free_
char *username_alloc
= NULL
;
5175 if (!username
&& context
->utmp_mode
== EXEC_UTMP_USER
) {
5176 username_alloc
= uid_to_name(uid_is_valid(uid
) ? uid
: saved_uid
);
5177 if (!username_alloc
) {
5178 *exit_status
= EXIT_USER
;
5183 const char *line
= context
->tty_path
?
5184 (path_startswith(context
->tty_path
, "/dev/") ?: context
->tty_path
) :
5186 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
5188 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
5189 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
5191 username
?: username_alloc
);
5195 if (uid_is_valid(uid
)) {
5196 r
= chown_terminal(STDIN_FILENO
, uid
);
5198 *exit_status
= EXIT_STDIN
;
5199 return log_error_errno(r
, "Failed to change ownership of terminal: %m");
5203 if (params
->cgroup_path
) {
5204 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
5205 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
5206 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
5207 * touch a single hierarchy too. */
5209 if (params
->flags
& EXEC_CGROUP_DELEGATE
) {
5210 _cleanup_free_
char *p
= NULL
;
5212 r
= cg_set_access(params
->cgroup_path
, uid
, gid
);
5214 *exit_status
= EXIT_CGROUP
;
5215 return log_error_errno(r
, "Failed to adjust control group access: %m");
5218 r
= exec_params_get_cgroup_path(params
, cgroup_context
, params
->cgroup_path
, &p
);
5220 *exit_status
= EXIT_CGROUP
;
5221 return log_error_errno(r
, "Failed to acquire cgroup path: %m");
5224 r
= cg_set_access_recursive(p
, uid
, gid
);
5226 *exit_status
= EXIT_CGROUP
;
5227 return log_error_errno(r
, "Failed to adjust control subgroup access: %m");
5232 if (is_pressure_supported() > 0) {
5233 if (cgroup_context_want_memory_pressure(cgroup_context
)) {
5234 r
= cg_get_path("memory", params
->cgroup_path
, "memory.pressure", &memory_pressure_path
);
5236 *exit_status
= EXIT_MEMORY
;
5240 r
= chmod_and_chown(memory_pressure_path
, 0644, uid
, gid
);
5242 log_full_errno(r
== -ENOENT
|| ERRNO_IS_PRIVILEGE(r
) ? LOG_DEBUG
: LOG_WARNING
, r
,
5243 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path
);
5244 memory_pressure_path
= mfree(memory_pressure_path
);
5246 /* First we use the current cgroup path to chmod and chown the memory pressure path, then pass the path relative
5247 * to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
5248 * pressure path environment variable or read-write mount to the unit. This is why we check if
5249 * memory_pressure_path != NULL in the conditional below. */
5250 if (memory_pressure_path
&& needs_sandboxing
&& exec_needs_cgroup_namespace(context
)) {
5251 memory_pressure_path
= mfree(memory_pressure_path
);
5252 r
= cg_get_path("memory", "", "memory.pressure", &memory_pressure_path
);
5254 *exit_status
= EXIT_MEMORY
;
5258 } else if (cgroup_context
->memory_pressure_watch
== CGROUP_PRESSURE_WATCH_NO
) {
5259 memory_pressure_path
= strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
5260 if (!memory_pressure_path
) {
5261 *exit_status
= EXIT_MEMORY
;
5268 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
5270 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
5271 r
= setup_exec_directory(context
, params
, uid
, gid
, dt
, needs_mount_namespace
, exit_status
);
5273 return log_error_errno(r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
5276 r
= exec_setup_credentials(context
, cgroup_context
, params
, params
->unit_id
, uid
, gid
);
5278 *exit_status
= EXIT_CREDENTIALS
;
5279 return log_error_errno(r
, "Failed to set up credentials: %m");
5282 r
= build_environment(
5292 memory_pressure_path
,
5296 *exit_status
= EXIT_MEMORY
;
5300 r
= build_pass_environment(context
, &pass_env
);
5302 *exit_status
= EXIT_MEMORY
;
5306 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
5307 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
5308 * not specify PATH but the unit has ExecSearchPath. */
5309 if (!strv_isempty(context
->exec_search_path
)) {
5310 _cleanup_free_
char *joined
= NULL
;
5312 joined
= strv_join(context
->exec_search_path
, ":");
5314 *exit_status
= EXIT_MEMORY
;
5318 r
= strv_env_assign(&joined_exec_search_path
, "PATH", joined
);
5320 *exit_status
= EXIT_MEMORY
;
5325 accum_env
= strv_env_merge(params
->environment
,
5327 joined_exec_search_path
,
5329 context
->environment
,
5332 *exit_status
= EXIT_MEMORY
;
5335 strv_env_clean(accum_env
);
5337 (void) umask(context
->umask
);
5339 r
= setup_term_environment(context
, &accum_env
);
5341 *exit_status
= EXIT_MEMORY
;
5342 return log_error_errno(r
, "Failed to construct $TERM: %m");
5345 r
= setup_keyring(context
, params
, uid
, gid
);
5347 *exit_status
= EXIT_KEYRING
;
5348 return log_error_errno(r
, "Failed to set up kernel keyring: %m");
5351 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
5352 * excepted from either whole sandboxing or just setresuid() itself. */
5353 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
5355 uint64_t capability_ambient_set
= context
->capability_ambient_set
;
5357 /* Check CAP_SYS_ADMIN before we enter user namespace to see if we can mount /proc even though it's masked. */
5358 have_cap_sys_admin
= have_effective_cap(CAP_SYS_ADMIN
) > 0;
5360 if (needs_sandboxing
) {
5361 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
5362 * /sys being present. The actual MAC context application will happen later, as late as
5363 * possible, to avoid impacting our own code paths. */
5366 use_selinux
= mac_selinux_use();
5369 use_smack
= mac_smack_use();
5372 if (mac_apparmor_use()) {
5373 r
= dlopen_libapparmor();
5374 if (r
< 0 && !ERRNO_IS_NEG_NOT_SUPPORTED(r
))
5375 log_warning_errno(r
, "Failed to load libapparmor, ignoring: %m");
5376 use_apparmor
= r
>= 0;
5381 if (needs_sandboxing
) {
5384 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
5385 * is set here. (See below.) */
5387 r
= setrlimit_closest_all((const struct rlimit
* const *) context
->rlimit
, &which_failed
);
5389 *exit_status
= EXIT_LIMITS
;
5390 return log_error_errno(r
, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed
));
5394 if (needs_setuid
&& context
->pam_name
&& username
) {
5395 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
5396 * wins here. (See above.) */
5398 /* All fds passed in the fds array will be closed in the pam child process. */
5399 r
= setup_pam(context
, cgroup_context
, params
, username
, uid
, gid
, &accum_env
,
5400 params
->fds
, n_fds
, needs_sandboxing
, params
->exec_fd
);
5402 *exit_status
= EXIT_PAM
;
5403 return log_error_errno(r
, "Failed to set up PAM session: %m");
5406 /* PAM modules might have set some ambient caps. Query them here and merge them into
5407 * the caps we want to set in the end, so that we don't end up unsetting them. */
5408 uint64_t ambient_after_pam
;
5409 r
= capability_get_ambient(&ambient_after_pam
);
5411 *exit_status
= EXIT_CAPABILITIES
;
5412 return log_error_errno(r
, "Failed to query ambient caps: %m");
5415 capability_ambient_set
|= ambient_after_pam
;
5417 ngids_after_pam
= getgroups_alloc(&gids_after_pam
);
5418 if (ngids_after_pam
< 0) {
5419 *exit_status
= EXIT_GROUP
;
5420 return log_error_errno(ngids_after_pam
, "Failed to obtain groups after setting up PAM: %m");
5424 if (needs_sandboxing
&& !have_cap_sys_admin
&& exec_needs_cap_sys_admin(context
, params
)) {
5425 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
5426 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
5427 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
5428 PrivateUsers pu
= exec_context_get_effective_private_users(context
, params
);
5429 if (pu
== PRIVATE_USERS_NO
)
5430 pu
= PRIVATE_USERS_SELF
;
5432 /* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in
5433 * unprivileged user namespaces. */
5434 r
= setup_private_users(pu
, saved_uid
, saved_gid
, uid
, gid
, /* allow_setgroups= */ false);
5435 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
5436 * the actual requested operations fail (or silently continue). */
5437 if (r
< 0 && context
->private_users
!= PRIVATE_USERS_NO
) {
5438 *exit_status
= EXIT_USER
;
5439 return log_error_errno(r
, "Failed to set up user namespacing for unprivileged user: %m");
5442 log_info_errno(r
, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
5445 userns_set_up
= true;
5446 log_debug("Set up unprivileged user namespace");
5450 /* Call setup_delegated_namespaces() the first time to unshare all non-delegated namespaces. */
5451 r
= setup_delegated_namespaces(
5455 /* delegate= */ false,
5456 memory_pressure_path
,
5466 /* Drop groups as early as possible.
5467 * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
5468 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
5470 _cleanup_free_ gid_t
*gids_to_enforce
= NULL
;
5471 int ngids_to_enforce
;
5473 ngids_to_enforce
= merge_gid_lists(gids
,
5478 if (ngids_to_enforce
< 0) {
5479 *exit_status
= EXIT_GROUP
;
5480 return log_error_errno(ngids_to_enforce
, "Failed to merge group lists. Group membership might be incorrect: %m");
5483 r
= enforce_groups(gid
, gids_to_enforce
, ngids_to_enforce
);
5485 *exit_status
= EXIT_GROUP
;
5486 return log_error_errno(r
, "Changing group credentials failed: %m");
5490 /* If the user namespace was not set up above, try to do it now.
5491 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
5492 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5493 * case of mount namespaces being less privileged when the mount point list is copied from a
5494 * different user namespace). */
5496 if (needs_sandboxing
&& !userns_set_up
) {
5497 PrivateUsers pu
= exec_context_get_effective_private_users(context
, params
);
5499 r
= setup_private_users(pu
, saved_uid
, saved_gid
, uid
, gid
,
5500 /* allow_setgroups= */ pu
== PRIVATE_USERS_FULL
);
5502 *exit_status
= EXIT_USER
;
5503 return log_error_errno(r
, "Failed to set up user namespacing: %m");
5506 log_debug("Set up privileged user namespace");
5509 /* Call setup_delegated_namespaces() the second time to unshare all delegated namespaces. */
5510 r
= setup_delegated_namespaces(
5514 /* delegate= */ true,
5515 memory_pressure_path
,
5525 if (needs_sandboxing
&& exec_needs_cgroup_namespace(context
) && params
->cgroup_path
) {
5526 /* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which
5527 * ensures the root of the cgroup namespace is the top level service cgroup and not the
5528 * subcgroup. Adjust the prefix accordingly since we're in a cgroup namespace now. */
5529 r
= attach_to_subcgroup(context
, cgroup_context
, params
, /* prefix= */ NULL
);
5531 *exit_status
= EXIT_CGROUP
;
5536 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
5539 const char *path
= command
->path
;
5541 if (FLAGS_SET(command
->flags
, EXEC_COMMAND_VIA_SHELL
)) {
5542 if (shell_is_placeholder(shell
)) {
5543 log_debug("Shell prefixing requested for user without default shell, using /bin/sh: %s",
5545 assert(streq(path
, _PATH_BSHELL
));
5550 _cleanup_free_
char *executable
= NULL
;
5551 _cleanup_close_
int executable_fd
= -EBADF
;
5552 r
= find_executable_full(path
, /* root= */ NULL
, context
->exec_search_path
, false, &executable
, &executable_fd
);
5554 *exit_status
= EXIT_EXEC
;
5555 log_struct_errno(LOG_NOTICE
, r
,
5556 LOG_MESSAGE_ID(SD_MESSAGE_SPAWN_FAILED_STR
),
5557 LOG_EXEC_MESSAGE(params
, "Unable to locate executable '%s': %m", path
),
5558 LOG_ITEM("EXECUTABLE=%s", path
));
5559 /* If the error will be ignored by manager, tune down the log level here. Missing executable
5560 * is very much expected in this case. */
5561 return r
!= -ENOMEM
&& FLAGS_SET(command
->flags
, EXEC_COMMAND_IGNORE_FAILURE
) ? 1 : r
;
5564 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, &executable_fd
);
5566 *exit_status
= EXIT_FDS
;
5567 return log_error_errno(r
, "Failed to collect shifted fd: %m");
5571 if (needs_sandboxing
&& use_selinux
&& params
->selinux_context_net
) {
5576 else if (params
->n_socket_fds
== 1)
5577 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
5578 * use context from that fd to compute the label. */
5579 fd
= params
->fds
[0];
5582 r
= mac_selinux_get_child_mls_label(fd
, executable
, context
->selinux_context
, &mac_selinux_context_net
);
5584 if (!context
->selinux_context_ignore
) {
5585 *exit_status
= EXIT_SELINUX_CONTEXT
;
5586 return log_error_errno(r
, "Failed to determine SELinux context: %m");
5588 log_debug_errno(r
, "Failed to determine SELinux context, ignoring: %m");
5594 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
5595 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
5596 * more. We do keep exec_fd and handoff_timestamp_fd however, if we have it, since we need to keep
5597 * them open until the final execve(). But first, close the remaining sockets in the context
5600 exec_runtime_close(runtime
);
5601 exec_params_close(params
);
5603 r
= close_all_fds(keep_fds
, n_keep_fds
);
5605 r
= pack_fds(params
->fds
, n_fds
);
5607 r
= flag_fds(params
->fds
, n_socket_fds
, n_fds
, context
->non_blocking
);
5609 *exit_status
= EXIT_FDS
;
5610 return log_error_errno(r
, "Failed to adjust passed file descriptors: %m");
5613 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5614 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
5615 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5618 secure_bits
= context
->secure_bits
;
5620 if (needs_sandboxing
) {
5623 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
5624 * (Note this is placed after the general resource limit initialization, see above, in order
5625 * to take precedence.) */
5626 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
5627 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
5628 *exit_status
= EXIT_LIMITS
;
5629 return log_error_errno(errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
5634 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5635 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5637 r
= setup_smack(context
, params
, executable_fd
);
5638 if (r
< 0 && !context
->smack_process_label_ignore
) {
5639 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
5640 return log_error_errno(r
, "Failed to set SMACK process label: %m");
5645 bset
= context
->capability_bounding_set
;
5648 /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
5649 * keep the needed privileges to apply it even if we're not root. */
5651 uid_is_valid(uid
) &&
5652 context_has_seccomp(context
) &&
5653 seccomp_allows_drop_privileges(context
)) {
5654 keep_seccomp_privileges
= true;
5656 if (prctl(PR_SET_KEEPCAPS
, 1) < 0) {
5657 *exit_status
= EXIT_USER
;
5658 return log_error_errno(errno
, "Failed to enable keep capabilities flag: %m");
5661 /* Save the current bounding set so we can restore it after applying the seccomp
5664 bset
|= (UINT64_C(1) << CAP_SYS_ADMIN
) |
5665 (UINT64_C(1) << CAP_SETPCAP
);
5669 if (!cap_test_all(bset
)) {
5670 r
= capability_bounding_set_drop(bset
, /* right_now= */ false);
5672 *exit_status
= EXIT_CAPABILITIES
;
5673 return log_error_errno(r
, "Failed to drop capabilities: %m");
5677 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5680 * To be able to raise the ambient capabilities after setresuid() they have to be added to
5681 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
5682 * the ambient capabilities can be raised as they are present in the permitted and
5683 * inheritable set. However it is possible that someone wants to set ambient capabilities
5684 * without changing the user, so we also set the ambient capabilities here.
5686 * The requested ambient capabilities are raised in the inheritable set if the second
5687 * argument is true. */
5688 if (capability_ambient_set
!= 0) {
5689 r
= capability_ambient_set_apply(capability_ambient_set
, /* also_inherit= */ true);
5691 *exit_status
= EXIT_CAPABILITIES
;
5692 return log_error_errno(r
, "Failed to apply ambient capabilities (before UID change): %m");
5697 /* chroot to root directory first, before we lose the ability to chroot */
5698 r
= apply_root_directory(context
, params
, runtime
, needs_mount_namespace
, exit_status
);
5700 return log_error_errno(r
, "Chrooting to the requested root directory failed: %m");
5703 if (uid_is_valid(uid
)) {
5704 r
= enforce_user(context
, uid
, capability_ambient_set
);
5706 *exit_status
= EXIT_USER
;
5707 return log_error_errno(r
, "Failed to change UID to " UID_FMT
": %m", uid
);
5710 if (keep_seccomp_privileges
) {
5711 if (!BIT_SET(capability_ambient_set
, CAP_SETUID
)) {
5712 r
= drop_capability(CAP_SETUID
);
5714 *exit_status
= EXIT_USER
;
5715 return log_error_errno(r
, "Failed to drop CAP_SETUID: %m");
5719 r
= keep_capability(CAP_SYS_ADMIN
);
5721 *exit_status
= EXIT_USER
;
5722 return log_error_errno(r
, "Failed to keep CAP_SYS_ADMIN: %m");
5725 r
= keep_capability(CAP_SETPCAP
);
5727 *exit_status
= EXIT_USER
;
5728 return log_error_errno(r
, "Failed to keep CAP_SETPCAP: %m");
5732 if (capability_ambient_set
!= 0) {
5734 /* Raise the ambient capabilities after user change. */
5735 r
= capability_ambient_set_apply(capability_ambient_set
, /* also_inherit= */ false);
5737 *exit_status
= EXIT_CAPABILITIES
;
5738 return log_error_errno(r
, "Failed to apply ambient capabilities (after UID change): %m");
5744 /* Apply working directory here, because the working directory might be on NFS and only the user
5745 * running this service might have the correct privilege to change to the working directory. Also, it
5746 * is absolutely 💣 crucial 💣 we applied all mount namespacing rearrangements before this, so that
5747 * the cwd cannot be used to pin directories outside of the sandbox. */
5748 r
= apply_working_directory(context
, params
, runtime
, pwent_home
, accum_env
);
5750 *exit_status
= EXIT_CHDIR
;
5751 return log_error_errno(r
, "Changing to the requested working directory failed: %m");
5754 if (needs_sandboxing
) {
5755 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5756 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5757 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5758 * are restricted. */
5762 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
5765 r
= setexeccon(exec_context
);
5767 if (!context
->selinux_context_ignore
) {
5768 *exit_status
= EXIT_SELINUX_CONTEXT
;
5769 return log_error_errno(r
, "Failed to change SELinux context to %s: %m", exec_context
);
5771 log_debug_errno(r
, "Failed to change SELinux context to %s, ignoring: %m", exec_context
);
5778 if (use_apparmor
&& context
->apparmor_profile
) {
5779 r
= ASSERT_PTR(sym_aa_change_onexec
)(context
->apparmor_profile
);
5780 if (r
< 0 && !context
->apparmor_profile_ignore
) {
5781 *exit_status
= EXIT_APPARMOR_PROFILE
;
5782 return log_error_errno(errno
, "Failed to prepare AppArmor profile change to %s: %m",
5783 context
->apparmor_profile
);
5788 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5789 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5790 * requires CAP_SETPCAP. */
5791 if (prctl(PR_GET_SECUREBITS
) != secure_bits
) {
5792 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5793 * effective set here.
5795 * The effective set is overwritten during execve() with the following values:
5797 * - ambient set (for non-root processes)
5799 * - (inheritable | bounding) set (for root processes)
5801 * Hence there is no security impact to raise it in the effective set before execve
5803 r
= capability_gain_cap_setpcap(/* ret_before_caps = */ NULL
);
5805 *exit_status
= EXIT_CAPABILITIES
;
5806 return log_error_errno(r
, "Failed to gain CAP_SETPCAP for setting secure bits");
5808 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
5809 *exit_status
= EXIT_SECUREBITS
;
5810 return log_error_errno(errno
, "Failed to set process secure bits: %m");
5814 if (context_has_no_new_privileges(context
))
5815 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
5816 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
5817 return log_error_errno(errno
, "Failed to disable new privileges: %m");
5821 r
= apply_address_families(context
, params
);
5823 *exit_status
= EXIT_ADDRESS_FAMILIES
;
5824 return log_error_errno(r
, "Failed to restrict address families: %m");
5827 r
= apply_memory_deny_write_execute(context
, params
);
5829 *exit_status
= EXIT_SECCOMP
;
5830 return log_error_errno(r
, "Failed to disable writing to executable memory: %m");
5833 r
= apply_restrict_realtime(context
, params
);
5835 *exit_status
= EXIT_SECCOMP
;
5836 return log_error_errno(r
, "Failed to apply realtime restrictions: %m");
5839 r
= apply_restrict_suid_sgid(context
, params
);
5841 *exit_status
= EXIT_SECCOMP
;
5842 return log_error_errno(r
, "Failed to apply SUID/SGID restrictions: %m");
5845 r
= apply_restrict_namespaces(context
, params
);
5847 *exit_status
= EXIT_SECCOMP
;
5848 return log_error_errno(r
, "Failed to apply namespace restrictions: %m");
5851 r
= apply_protect_sysctl(context
, params
);
5853 *exit_status
= EXIT_SECCOMP
;
5854 return log_error_errno(r
, "Failed to apply sysctl restrictions: %m");
5857 r
= apply_protect_kernel_modules(context
, params
);
5859 *exit_status
= EXIT_SECCOMP
;
5860 return log_error_errno(r
, "Failed to apply module loading restrictions: %m");
5863 r
= apply_protect_kernel_logs(context
, params
);
5865 *exit_status
= EXIT_SECCOMP
;
5866 return log_error_errno(r
, "Failed to apply kernel log restrictions: %m");
5869 r
= apply_protect_clock(context
, params
);
5871 *exit_status
= EXIT_SECCOMP
;
5872 return log_error_errno(r
, "Failed to apply clock restrictions: %m");
5875 r
= apply_private_devices(context
, params
);
5877 *exit_status
= EXIT_SECCOMP
;
5878 return log_error_errno(r
, "Failed to set up private devices: %m");
5881 r
= apply_syscall_archs(context
, params
);
5883 *exit_status
= EXIT_SECCOMP
;
5884 return log_error_errno(r
, "Failed to apply syscall architecture restrictions: %m");
5887 r
= apply_lock_personality(context
, params
);
5889 *exit_status
= EXIT_SECCOMP
;
5890 return log_error_errno(r
, "Failed to lock personalities: %m");
5893 r
= apply_syscall_log(context
, params
);
5895 *exit_status
= EXIT_SECCOMP
;
5896 return log_error_errno(r
, "Failed to apply system call log filters: %m");
5901 r
= apply_restrict_filesystems(context
, params
);
5903 *exit_status
= EXIT_BPF
;
5904 return log_error_errno(r
, "Failed to restrict filesystems: %m");
5909 /* This really should remain as close to the execve() as possible, to make sure our own code is affected
5910 * by the filter as little as possible. */
5911 r
= apply_syscall_filter(context
, params
);
5913 *exit_status
= EXIT_SECCOMP
;
5914 return log_error_errno(r
, "Failed to apply system call filters: %m");
5917 if (keep_seccomp_privileges
) {
5918 /* Restore the capability bounding set with what's expected from the service + the
5919 * ambient capabilities hack */
5920 if (!cap_test_all(saved_bset
)) {
5921 r
= capability_bounding_set_drop(saved_bset
, /* right_now= */ false);
5923 *exit_status
= EXIT_CAPABILITIES
;
5924 return log_error_errno(r
, "Failed to drop bset capabilities: %m");
5928 /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
5929 * applications that use it. */
5930 if (!BIT_SET(saved_bset
, CAP_SYS_ADMIN
)) {
5931 r
= drop_capability(CAP_SYS_ADMIN
);
5933 *exit_status
= EXIT_USER
;
5934 return log_error_errno(r
, "Failed to drop CAP_SYS_ADMIN: %m");
5938 /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
5939 * applications that use it. */
5940 if (!BIT_SET(saved_bset
, CAP_SETPCAP
)) {
5941 r
= drop_capability(CAP_SETPCAP
);
5943 *exit_status
= EXIT_USER
;
5944 return log_error_errno(r
, "Failed to drop CAP_SETPCAP: %m");
5948 if (prctl(PR_SET_KEEPCAPS
, 0) < 0) {
5949 *exit_status
= EXIT_USER
;
5950 return log_error_errno(errno
, "Failed to drop keep capabilities flag: %m");
5957 if (!strv_isempty(context
->unset_environment
)) {
5960 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
5962 *exit_status
= EXIT_MEMORY
;
5966 strv_free_and_replace(accum_env
, ee
);
5969 _cleanup_strv_free_
char **replaced_argv
= NULL
, **argv_via_shell
= NULL
;
5970 char **final_argv
= FLAGS_SET(command
->flags
, EXEC_COMMAND_VIA_SHELL
) ? strv_skip(command
->argv
, 1) : command
->argv
;
5972 if (final_argv
&& !FLAGS_SET(command
->flags
, EXEC_COMMAND_NO_ENV_EXPAND
)) {
5973 _cleanup_strv_free_
char **unset_variables
= NULL
, **bad_variables
= NULL
;
5975 r
= replace_env_argv(final_argv
, accum_env
, &replaced_argv
, &unset_variables
, &bad_variables
);
5977 *exit_status
= EXIT_MEMORY
;
5978 return log_error_errno(r
, "Failed to replace environment variables: %m");
5980 final_argv
= replaced_argv
;
5982 if (!strv_isempty(unset_variables
)) {
5983 _cleanup_free_
char *ju
= strv_join(unset_variables
, ", ");
5984 log_warning("Referenced but unset environment variable evaluates to an empty string: %s", strna(ju
));
5987 if (!strv_isempty(bad_variables
)) {
5988 _cleanup_free_
char *jb
= strv_join(bad_variables
, ", ");
5989 log_warning("Invalid environment variable name evaluates to an empty string: %s", strna(jb
));
5993 if (FLAGS_SET(command
->flags
, EXEC_COMMAND_VIA_SHELL
)) {
5994 r
= strv_extendf(&argv_via_shell
, "%s%s", command
->argv
[0][0] == '-' ? "-" : "", path
);
5996 *exit_status
= EXIT_MEMORY
;
6000 if (!strv_isempty(final_argv
)) {
6001 _cleanup_free_
char *cmdline_joined
= NULL
;
6003 cmdline_joined
= strv_join(final_argv
, " ");
6004 if (!cmdline_joined
) {
6005 *exit_status
= EXIT_MEMORY
;
6009 r
= strv_extend_many(&argv_via_shell
, "-c", cmdline_joined
);
6011 *exit_status
= EXIT_MEMORY
;
6016 final_argv
= argv_via_shell
;
6019 log_command_line(context
, params
, "Executing", executable
, final_argv
);
6021 /* We have finished with all our initializations. Let's now let the manager know that. From this
6022 * point on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
6024 r
= exec_fd_mark_hot(context
, params
, /* hot= */ true, exit_status
);
6028 /* As last thing before the execve(), let's send the handoff timestamp */
6029 r
= send_handoff_timestamp(context
, params
, exit_status
);
6031 /* If this handoff timestamp failed, let's undo the marking as hot */
6032 (void) exec_fd_mark_hot(context
, params
, /* hot= */ false, /* reterr_exit_status= */ NULL
);
6036 /* NB: we leave executable_fd, exec_fd, handoff_timestamp_fd open here. This is safe, because they
6037 * have O_CLOEXEC set, and the execve() below will thus automatically close them. In fact, for
6038 * exec_fd this is pretty much the whole raison d'etre. */
6040 r
= fexecve_or_execve(executable_fd
, executable
, final_argv
, accum_env
);
6042 /* The execve() failed, let's undo the marking as hot */
6043 (void) exec_fd_mark_hot(context
, params
, /* hot= */ false, /* reterr_exit_status= */ NULL
);
6045 *exit_status
= EXIT_EXEC
;
6046 return log_error_errno(r
, "Failed to execute %s: %m", executable
);