1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
3 #include <sys/eventfd.h>
9 #include <security/pam_appl.h>
10 #include <security/pam_misc.h>
14 #include <sys/apparmor.h>
17 #include "sd-messages.h"
20 #include "apparmor-util.h"
22 #include "argv-util.h"
24 #include "bpf-dlopen.h"
25 #include "bpf-restrict-fs.h"
26 #include "btrfs-util.h"
27 #include "capability-util.h"
28 #include "cgroup-setup.h"
30 #include "chattr-util.h"
31 #include "chown-recursive.h"
33 #include "data-fd-util.h"
36 #include "exec-credential.h"
37 #include "exec-invoke.h"
39 #include "exit-status.h"
41 #include "hexdecoct.h"
43 #include "iovec-util.h"
44 #include "journal-send.h"
45 #include "missing_ioprio.h"
46 #include "missing_prctl.h"
47 #include "missing_securebits.h"
48 #include "missing_syscall.h"
49 #include "mkdir-label.h"
50 #include "proc-cmdline.h"
51 #include "process-util.h"
53 #include "rlimit-util.h"
54 #include "seccomp-util.h"
55 #include "selinux-util.h"
56 #include "signal-util.h"
57 #include "smack-util.h"
58 #include "socket-util.h"
59 #include "string-table.h"
61 #include "terminal-util.h"
62 #include "utmp-wtmp.h"
65 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
66 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
68 #define SNDBUF_SIZE (8*1024*1024)
78 assert(fds
|| n_fds
== 0);
80 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
81 * O_NONBLOCK only applies to socket activation though. */
83 for (size_t i
= 0; i
< n_fds
; i
++) {
85 if (i
< n_socket_fds
) {
86 r
= fd_nonblock(fds
[i
], nonblock
);
91 /* We unconditionally drop FD_CLOEXEC from the fds,
92 * since after all we want to pass these fds to our
95 r
= fd_cloexec(fds
[i
], false);
103 static bool is_terminal_input(ExecInput i
) {
106 EXEC_INPUT_TTY_FORCE
,
107 EXEC_INPUT_TTY_FAIL
);
110 static bool is_terminal_output(ExecOutput o
) {
113 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
114 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
117 static bool is_kmsg_output(ExecOutput o
) {
120 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
123 static bool exec_context_needs_term(const ExecContext
*c
) {
126 /* Return true if the execution context suggests we should set $TERM to something useful. */
128 if (is_terminal_input(c
->std_input
))
131 if (is_terminal_output(c
->std_output
))
134 if (is_terminal_output(c
->std_error
))
137 return !!c
->tty_path
;
140 static int open_null_as(int flags
, int nfd
) {
145 fd
= open("/dev/null", flags
|O_NOCTTY
);
149 return move_fd(fd
, nfd
, false);
152 static int connect_journal_socket(
154 const char *log_namespace
,
158 uid_t olduid
= UID_INVALID
;
159 gid_t oldgid
= GID_INVALID
;
165 j
= journal_stream_path(log_namespace
);
169 if (gid_is_valid(gid
)) {
172 if (setegid(gid
) < 0)
176 if (uid_is_valid(uid
)) {
179 if (seteuid(uid
) < 0) {
185 r
= connect_unix_path(fd
, AT_FDCWD
, j
);
187 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
188 an LSM interferes. */
190 if (uid_is_valid(uid
))
191 (void) seteuid(olduid
);
194 if (gid_is_valid(gid
))
195 (void) setegid(oldgid
);
200 static int connect_logger_as(
201 const ExecContext
*context
,
202 const ExecParameters
*params
,
209 _cleanup_close_
int fd
= -EBADF
;
214 assert(output
< _EXEC_OUTPUT_MAX
);
218 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
222 r
= connect_journal_socket(fd
, context
->log_namespace
, uid
, gid
);
226 if (shutdown(fd
, SHUT_RD
) < 0)
229 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
239 context
->syslog_identifier
?: ident
,
240 params
->flags
& EXEC_PASS_LOG_UNIT
? params
->unit_id
: "",
241 context
->syslog_priority
,
242 !!context
->syslog_level_prefix
,
244 is_kmsg_output(output
),
245 is_terminal_output(output
)) < 0)
248 return move_fd(TAKE_FD(fd
), nfd
, false);
251 static int open_terminal_as(const char *path
, int flags
, int nfd
) {
257 fd
= open_terminal(path
, flags
| O_NOCTTY
);
261 return move_fd(fd
, nfd
, false);
264 static int acquire_path(const char *path
, int flags
, mode_t mode
) {
265 _cleanup_close_
int fd
= -EBADF
;
270 if (IN_SET(flags
& O_ACCMODE
, O_WRONLY
, O_RDWR
))
273 fd
= open(path
, flags
|O_NOCTTY
, mode
);
277 if (errno
!= ENXIO
) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
280 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
282 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
286 r
= connect_unix_path(fd
, AT_FDCWD
, path
);
287 if (IN_SET(r
, -ENOTSOCK
, -EINVAL
))
288 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
289 * wasn't an AF_UNIX socket after all */
294 if ((flags
& O_ACCMODE
) == O_RDONLY
)
295 r
= shutdown(fd
, SHUT_WR
);
296 else if ((flags
& O_ACCMODE
) == O_WRONLY
)
297 r
= shutdown(fd
, SHUT_RD
);
306 static int fixup_input(
307 const ExecContext
*context
,
309 bool apply_tty_stdin
) {
315 std_input
= context
->std_input
;
317 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
318 return EXEC_INPUT_NULL
;
320 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
321 return EXEC_INPUT_NULL
;
323 if (std_input
== EXEC_INPUT_DATA
&& context
->stdin_data_size
== 0)
324 return EXEC_INPUT_NULL
;
329 static int fixup_output(ExecOutput output
, int socket_fd
) {
331 if (output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
332 return EXEC_OUTPUT_INHERIT
;
337 static int setup_input(
338 const ExecContext
*context
,
339 const ExecParameters
*params
,
341 const int named_iofds
[static 3]) {
350 if (params
->stdin_fd
>= 0) {
351 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
354 /* Try to make this the controlling tty, if it is a tty, and reset it */
355 if (isatty(STDIN_FILENO
)) {
356 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
358 if (context
->tty_reset
)
359 (void) reset_terminal_fd(STDIN_FILENO
, /* switch_to_text= */ true);
361 (void) exec_context_apply_tty_size(context
, STDIN_FILENO
, /* tty_path= */ NULL
);
367 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
371 case EXEC_INPUT_NULL
:
372 return open_null_as(O_RDONLY
, STDIN_FILENO
);
375 case EXEC_INPUT_TTY_FORCE
:
376 case EXEC_INPUT_TTY_FAIL
: {
377 _cleanup_close_
int tty_fd
= -EBADF
;
378 const char *tty_path
;
380 tty_path
= ASSERT_PTR(exec_context_tty_path(context
));
382 tty_fd
= acquire_terminal(tty_path
,
383 i
== EXEC_INPUT_TTY_FAIL
? ACQUIRE_TERMINAL_TRY
:
384 i
== EXEC_INPUT_TTY_FORCE
? ACQUIRE_TERMINAL_FORCE
:
385 ACQUIRE_TERMINAL_WAIT
,
390 r
= exec_context_apply_tty_size(context
, tty_fd
, tty_path
);
394 r
= move_fd(tty_fd
, STDIN_FILENO
, /* cloexec= */ false);
402 case EXEC_INPUT_SOCKET
:
403 assert(socket_fd
>= 0);
405 return RET_NERRNO(dup2(socket_fd
, STDIN_FILENO
));
407 case EXEC_INPUT_NAMED_FD
:
408 assert(named_iofds
[STDIN_FILENO
] >= 0);
410 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
411 return RET_NERRNO(dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
));
413 case EXEC_INPUT_DATA
: {
416 fd
= acquire_data_fd_full(context
->stdin_data
, context
->stdin_data_size
, /* flags = */ 0);
420 return move_fd(fd
, STDIN_FILENO
, false);
423 case EXEC_INPUT_FILE
: {
427 assert(context
->stdio_file
[STDIN_FILENO
]);
429 rw
= (context
->std_output
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDOUT_FILENO
])) ||
430 (context
->std_error
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDERR_FILENO
]));
432 fd
= acquire_path(context
->stdio_file
[STDIN_FILENO
], rw
? O_RDWR
: O_RDONLY
, 0666 & ~context
->umask
);
436 return move_fd(fd
, STDIN_FILENO
, false);
440 assert_not_reached();
444 static bool can_inherit_stderr_from_stdout(
445 const ExecContext
*context
,
451 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
454 if (e
== EXEC_OUTPUT_INHERIT
)
459 if (e
== EXEC_OUTPUT_NAMED_FD
)
460 return streq_ptr(context
->stdio_fdname
[STDOUT_FILENO
], context
->stdio_fdname
[STDERR_FILENO
]);
462 if (IN_SET(e
, EXEC_OUTPUT_FILE
, EXEC_OUTPUT_FILE_APPEND
, EXEC_OUTPUT_FILE_TRUNCATE
))
463 return streq_ptr(context
->stdio_file
[STDOUT_FILENO
], context
->stdio_file
[STDERR_FILENO
]);
468 static int setup_output(
469 const ExecContext
*context
,
470 const ExecParameters
*params
,
473 const int named_iofds
[static 3],
477 dev_t
*journal_stream_dev
,
478 ino_t
*journal_stream_ino
) {
487 assert(journal_stream_dev
);
488 assert(journal_stream_ino
);
490 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
492 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
495 return STDOUT_FILENO
;
498 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
499 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
502 return STDERR_FILENO
;
505 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
506 o
= fixup_output(context
->std_output
, socket_fd
);
508 if (fileno
== STDERR_FILENO
) {
510 e
= fixup_output(context
->std_error
, socket_fd
);
512 /* This expects the input and output are already set up */
514 /* Don't change the stderr file descriptor if we inherit all
515 * the way and are not on a tty */
516 if (e
== EXEC_OUTPUT_INHERIT
&&
517 o
== EXEC_OUTPUT_INHERIT
&&
518 i
== EXEC_INPUT_NULL
&&
519 !is_terminal_input(context
->std_input
) &&
523 /* Duplicate from stdout if possible */
524 if (can_inherit_stderr_from_stdout(context
, o
, e
))
525 return RET_NERRNO(dup2(STDOUT_FILENO
, fileno
));
529 } else if (o
== EXEC_OUTPUT_INHERIT
) {
530 /* If input got downgraded, inherit the original value */
531 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
532 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
534 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
535 if (!IN_SET(i
, EXEC_INPUT_NULL
, EXEC_INPUT_DATA
))
536 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
538 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
542 /* We need to open /dev/null here anew, to get the right access mode. */
543 return open_null_as(O_WRONLY
, fileno
);
548 case EXEC_OUTPUT_NULL
:
549 return open_null_as(O_WRONLY
, fileno
);
551 case EXEC_OUTPUT_TTY
:
552 if (is_terminal_input(i
))
553 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
555 /* We don't reset the terminal if this is just about output */
556 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
558 case EXEC_OUTPUT_KMSG
:
559 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
560 case EXEC_OUTPUT_JOURNAL
:
561 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
562 r
= connect_logger_as(context
, params
, o
, ident
, fileno
, uid
, gid
);
564 log_exec_warning_errno(context
,
567 "Failed to connect %s to the journal socket, ignoring: %m",
568 fileno
== STDOUT_FILENO
? "stdout" : "stderr");
569 r
= open_null_as(O_WRONLY
, fileno
);
573 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
574 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
575 * services to detect whether they are connected to the journal or not.
577 * If both stdout and stderr are connected to a stream then let's make sure to store the data
578 * about STDERR as that's usually the best way to do logging. */
580 if (fstat(fileno
, &st
) >= 0 &&
581 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
582 *journal_stream_dev
= st
.st_dev
;
583 *journal_stream_ino
= st
.st_ino
;
588 case EXEC_OUTPUT_SOCKET
:
589 assert(socket_fd
>= 0);
591 return RET_NERRNO(dup2(socket_fd
, fileno
));
593 case EXEC_OUTPUT_NAMED_FD
:
594 assert(named_iofds
[fileno
] >= 0);
596 (void) fd_nonblock(named_iofds
[fileno
], false);
597 return RET_NERRNO(dup2(named_iofds
[fileno
], fileno
));
599 case EXEC_OUTPUT_FILE
:
600 case EXEC_OUTPUT_FILE_APPEND
:
601 case EXEC_OUTPUT_FILE_TRUNCATE
: {
605 assert(context
->stdio_file
[fileno
]);
607 rw
= context
->std_input
== EXEC_INPUT_FILE
&&
608 streq_ptr(context
->stdio_file
[fileno
], context
->stdio_file
[STDIN_FILENO
]);
611 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
614 if (o
== EXEC_OUTPUT_FILE_APPEND
)
616 else if (o
== EXEC_OUTPUT_FILE_TRUNCATE
)
619 fd
= acquire_path(context
->stdio_file
[fileno
], flags
, 0666 & ~context
->umask
);
623 return move_fd(fd
, fileno
, 0);
627 assert_not_reached();
631 static int chown_terminal(int fd
, uid_t uid
) {
636 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
637 if (!isatty_safe(fd
))
640 /* This might fail. What matters are the results. */
641 r
= fchmod_and_chown(fd
, TTY_MODE
, uid
, GID_INVALID
);
648 static int setup_confirm_stdio(
649 const ExecContext
*context
,
651 int *ret_saved_stdin
,
652 int *ret_saved_stdout
) {
654 _cleanup_close_
int fd
= -EBADF
, saved_stdin
= -EBADF
, saved_stdout
= -EBADF
;
657 assert(ret_saved_stdin
);
658 assert(ret_saved_stdout
);
660 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
664 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
665 if (saved_stdout
< 0)
668 fd
= acquire_terminal(vc
, ACQUIRE_TERMINAL_WAIT
, DEFAULT_CONFIRM_USEC
);
672 r
= chown_terminal(fd
, getuid());
676 r
= reset_terminal_fd(fd
, /* switch_to_text= */ true);
680 r
= exec_context_apply_tty_size(context
, fd
, vc
);
684 r
= rearrange_stdio(fd
, fd
, STDERR_FILENO
); /* Invalidates 'fd' also on failure */
689 *ret_saved_stdin
= TAKE_FD(saved_stdin
);
690 *ret_saved_stdout
= TAKE_FD(saved_stdout
);
694 static void write_confirm_error_fd(int err
, int fd
, const char *unit_id
) {
698 if (err
== -ETIMEDOUT
)
699 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", unit_id
);
702 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", unit_id
);
706 static void write_confirm_error(int err
, const char *vc
, const char *unit_id
) {
707 _cleanup_close_
int fd
= -EBADF
;
711 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
715 write_confirm_error_fd(err
, fd
, unit_id
);
718 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
722 assert(saved_stdout
);
726 if (*saved_stdin
>= 0)
727 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
730 if (*saved_stdout
>= 0)
731 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
734 *saved_stdin
= safe_close(*saved_stdin
);
735 *saved_stdout
= safe_close(*saved_stdout
);
741 CONFIRM_PRETEND_FAILURE
= -1,
742 CONFIRM_PRETEND_SUCCESS
= 0,
746 static bool confirm_spawn_disabled(void) {
747 return access("/run/systemd/confirm_spawn_disabled", F_OK
) >= 0;
750 static int ask_for_confirmation(const ExecContext
*context
, const ExecParameters
*params
, const char *cmdline
) {
751 int saved_stdout
= -1, saved_stdin
= -1, r
;
752 _cleanup_free_
char *e
= NULL
;
758 /* For any internal errors, assume a positive response. */
759 r
= setup_confirm_stdio(context
, params
->confirm_spawn
, &saved_stdin
, &saved_stdout
);
761 write_confirm_error(r
, params
->confirm_spawn
, params
->unit_id
);
762 return CONFIRM_EXECUTE
;
765 /* confirm_spawn might have been disabled while we were sleeping. */
766 if (!params
->confirm_spawn
|| confirm_spawn_disabled()) {
771 e
= ellipsize(cmdline
, 60, 100);
779 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
781 write_confirm_error_fd(r
, STDOUT_FILENO
, params
->unit_id
);
788 printf("Resuming normal execution.\n");
789 manager_disable_confirm_spawn();
793 printf(" Unit: %s\n",
795 exec_context_dump(context
, stdout
, " ");
796 exec_params_dump(params
, stdout
, " ");
797 continue; /* ask again */
799 printf("Failing execution.\n");
800 r
= CONFIRM_PRETEND_FAILURE
;
803 printf(" c - continue, proceed without asking anymore\n"
804 " D - dump, show the state of the unit\n"
805 " f - fail, don't execute the command and pretend it failed\n"
807 " i - info, show a short summary of the unit\n"
808 " j - jobs, show jobs that are in progress\n"
809 " s - skip, don't execute the command and pretend it succeeded\n"
810 " y - yes, execute the command\n");
811 continue; /* ask again */
815 params
->unit_id
, cmdline
);
816 continue; /* ask again */
818 if (sigqueue(getppid(),
820 (const union sigval
) { .sival_int
= MANAGER_SIGNAL_COMMAND_DUMP_JOBS
}) < 0)
823 continue; /* ask again */
825 /* 'n' was removed in favor of 'f'. */
826 printf("Didn't understand 'n', did you mean 'f'?\n");
827 continue; /* ask again */
829 printf("Skipping execution.\n");
830 r
= CONFIRM_PRETEND_SUCCESS
;
836 assert_not_reached();
842 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
846 static int get_fixed_user(
847 const char *user_or_uid
,
848 const char **ret_username
,
851 const char **ret_home
,
852 const char **ret_shell
) {
857 assert(ret_username
);
859 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
860 * (i.e. are "/" or "/bin/nologin"). */
862 r
= get_user_creds(&user_or_uid
, ret_uid
, ret_gid
, ret_home
, ret_shell
, USER_CREDS_CLEAN
);
866 /* user_or_uid is normalized by get_user_creds to username */
867 *ret_username
= user_or_uid
;
872 static int get_fixed_group(
873 const char *group_or_gid
,
874 const char **ret_groupname
,
879 assert(group_or_gid
);
880 assert(ret_groupname
);
882 r
= get_group_creds(&group_or_gid
, ret_gid
, /* flags = */ 0);
886 /* group_or_gid is normalized by get_group_creds to groupname */
887 *ret_groupname
= group_or_gid
;
892 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
893 const char *group
, gid_t gid
,
894 gid_t
**supplementary_gids
, int *ngids
) {
897 bool keep_groups
= false;
898 gid_t
*groups
= NULL
;
899 _cleanup_free_ gid_t
*l_gids
= NULL
;
904 * If user is given, then lookup GID and supplementary groups list.
905 * We avoid NSS lookups for gid=0. Also we have to initialize groups
906 * here and as early as possible so we keep the list of supplementary
907 * groups of the caller.
909 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
910 /* First step, initialize groups from /etc/group */
911 if (initgroups(user
, gid
) < 0)
917 if (strv_isempty(c
->supplementary_groups
))
921 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
922 * be positive, otherwise fail.
925 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
926 if (ngroups_max
<= 0)
927 return errno_or_else(EOPNOTSUPP
);
929 l_gids
= new(gid_t
, ngroups_max
);
935 * Lookup the list of groups that the user belongs to, we
936 * avoid NSS lookups here too for gid=0.
939 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
944 STRV_FOREACH(i
, c
->supplementary_groups
) {
947 if (k
>= ngroups_max
)
951 r
= get_group_creds(&g
, l_gids
+k
, 0);
959 * Sets ngids to zero to drop all supplementary groups, happens
960 * when we are under root and SupplementaryGroups= is empty.
967 /* Otherwise get the final list of supplementary groups */
968 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
972 *supplementary_gids
= groups
;
980 static int enforce_groups(gid_t gid
, const gid_t
*supplementary_gids
, int ngids
) {
983 /* Handle SupplementaryGroups= if it is not empty */
985 r
= maybe_setgroups(ngids
, supplementary_gids
);
990 if (gid_is_valid(gid
)) {
991 /* Then set our gids */
992 if (setresgid(gid
, gid
, gid
) < 0)
999 static int set_securebits(unsigned bits
, unsigned mask
) {
1003 current
= prctl(PR_GET_SECUREBITS
);
1007 /* Clear all securebits defined in mask and set bits */
1008 applied
= ((unsigned) current
& ~mask
) | bits
;
1009 if ((unsigned) current
== applied
)
1012 if (prctl(PR_SET_SECUREBITS
, applied
) < 0)
1018 static int enforce_user(
1019 const ExecContext
*context
,
1021 uint64_t capability_ambient_set
) {
1025 if (!uid_is_valid(uid
))
1028 /* Sets (but doesn't look up) the UIDs and makes sure we keep the capabilities while doing so. For
1029 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1032 if ((capability_ambient_set
!= 0 || context
->secure_bits
!= 0) && uid
!= 0) {
1034 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1035 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1036 r
= set_securebits(1U << SECURE_KEEP_CAPS
, 0);
1041 /* Second step: actually set the uids */
1042 if (setresuid(uid
, uid
, uid
) < 0)
1045 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1046 * the caps might have been corrupted by the setresuid(), so we need to clean them up later. This is done
1047 * outside of this call. */
1053 static int null_conv(
1055 const struct pam_message
**msg
,
1056 struct pam_response
**resp
,
1057 void *appdata_ptr
) {
1059 /* We don't support conversations */
1061 return PAM_CONV_ERR
;
1064 static int pam_close_session_and_delete_credentials(pam_handle_t
*handle
, int flags
) {
1069 r
= pam_close_session(handle
, flags
);
1070 if (r
!= PAM_SUCCESS
)
1071 log_debug("pam_close_session() failed: %s", pam_strerror(handle
, r
));
1073 s
= pam_setcred(handle
, PAM_DELETE_CRED
| flags
);
1074 if (s
!= PAM_SUCCESS
)
1075 log_debug("pam_setcred(PAM_DELETE_CRED) failed: %s", pam_strerror(handle
, s
));
1077 return r
!= PAM_SUCCESS
? r
: s
;
1082 static int setup_pam(
1088 char ***env
, /* updated on success */
1089 const int fds
[], size_t n_fds
,
1094 static const struct pam_conv conv
= {
1099 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1100 _cleanup_strv_free_
char **e
= NULL
;
1101 pam_handle_t
*handle
= NULL
;
1103 int pam_code
= PAM_SUCCESS
, r
;
1104 bool close_session
= false;
1112 /* We set up PAM in the parent process, then fork. The child
1113 * will then stay around until killed via PR_SET_PDEATHSIG or
1114 * systemd via the cgroup logic. It will then remove the PAM
1115 * session again. The parent process will exec() the actual
1116 * daemon. We do things this way to ensure that the main PID
1117 * of the daemon is the one we initially fork()ed. */
1119 r
= barrier_create(&barrier
);
1123 if (log_get_max_level() < LOG_DEBUG
)
1124 flags
|= PAM_SILENT
;
1126 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1127 if (pam_code
!= PAM_SUCCESS
) {
1133 _cleanup_free_
char *q
= NULL
;
1135 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1136 * out if that's the case, and read the TTY off it. */
1138 if (getttyname_malloc(STDIN_FILENO
, &q
) >= 0)
1139 tty
= strjoina("/dev/", q
);
1143 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1144 if (pam_code
!= PAM_SUCCESS
)
1148 STRV_FOREACH(nv
, *env
) {
1149 pam_code
= pam_putenv(handle
, *nv
);
1150 if (pam_code
!= PAM_SUCCESS
)
1154 pam_code
= pam_acct_mgmt(handle
, flags
);
1155 if (pam_code
!= PAM_SUCCESS
)
1158 pam_code
= pam_setcred(handle
, PAM_ESTABLISH_CRED
| flags
);
1159 if (pam_code
!= PAM_SUCCESS
)
1160 log_debug("pam_setcred(PAM_ESTABLISH_CRED) failed, ignoring: %s", pam_strerror(handle
, pam_code
));
1162 pam_code
= pam_open_session(handle
, flags
);
1163 if (pam_code
!= PAM_SUCCESS
)
1166 close_session
= true;
1168 e
= pam_getenvlist(handle
);
1170 pam_code
= PAM_BUF_ERR
;
1174 /* Block SIGTERM, so that we know that it won't get lost in the child */
1176 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
) >= 0);
1178 parent_pid
= getpid_cached();
1180 r
= safe_fork("(sd-pam)", 0, NULL
);
1186 /* The child's job is to reset the PAM session on termination */
1187 barrier_set_role(&barrier
, BARRIER_CHILD
);
1189 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1190 * those fds are open here that have been opened by PAM. */
1191 (void) close_many(fds
, n_fds
);
1193 /* Also close the 'exec_fd' in the child, since the service manager waits for the EOF induced
1194 * by the execve() to wait for completion, and if we'd keep the fd open here in the child
1195 * we'd never signal completion. */
1196 exec_fd
= safe_close(exec_fd
);
1198 /* Drop privileges - we don't need any to pam_close_session and this will make
1199 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1200 * threads to fail to exit normally */
1202 r
= fully_set_uid_gid(uid
, gid
, /* supplementary_gids= */ NULL
, /* n_supplementary_gids= */ 0);
1204 log_warning_errno(r
, "Failed to drop privileges in sd-pam: %m");
1206 (void) ignore_signals(SIGPIPE
);
1208 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1209 * otherwise the kernel will not allow unprivileged parents to kill their privileged children
1210 * this way. We rely on the control groups kill logic to do the rest for us. */
1211 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1214 /* Tell the parent that our setup is done. This is especially important regarding dropping
1215 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1217 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1218 (void) barrier_place(&barrier
);
1220 /* Check if our parent process might already have died? */
1221 if (getppid() == parent_pid
) {
1225 assert_se(sigemptyset(&ss
) >= 0);
1226 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1228 assert_se(sigwait(&ss
, &sig
) == 0);
1229 assert(sig
== SIGTERM
);
1232 /* If our parent died we'll end the session */
1233 if (getppid() != parent_pid
) {
1234 pam_code
= pam_close_session_and_delete_credentials(handle
, flags
);
1235 if (pam_code
!= PAM_SUCCESS
)
1242 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1243 * know about this. See pam_end(3) */
1244 (void) pam_end(handle
, pam_code
| flags
| PAM_DATA_SILENT
);
1248 barrier_set_role(&barrier
, BARRIER_PARENT
);
1250 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1254 /* Unblock SIGTERM again in the parent */
1255 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1257 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1258 * this fd around. */
1261 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1262 * recover. However, warn loudly if it happens. */
1263 if (!barrier_place_and_sync(&barrier
))
1264 log_error("PAM initialization failed");
1266 return strv_free_and_replace(*env
, e
);
1269 if (pam_code
!= PAM_SUCCESS
) {
1270 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1271 r
= -EPERM
; /* PAM errors do not map to errno */
1273 log_error_errno(r
, "PAM failed: %m");
1277 pam_code
= pam_close_session_and_delete_credentials(handle
, flags
);
1279 (void) pam_end(handle
, pam_code
| flags
);
1289 static void rename_process_from_path(const char *path
) {
1290 _cleanup_free_
char *buf
= NULL
;
1295 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1298 if (path_extract_filename(path
, &buf
) < 0) {
1299 rename_process("(...)");
1303 size_t l
= strlen(buf
);
1305 /* The end of the process name is usually more interesting, since the first bit might just be
1312 char process_name
[11];
1313 process_name
[0] = '(';
1314 memcpy(process_name
+1, p
, l
);
1315 process_name
[1+l
] = ')';
1316 process_name
[1+l
+1] = 0;
1318 (void) rename_process(process_name
);
1321 static bool context_has_address_families(const ExecContext
*c
) {
1324 return c
->address_families_allow_list
||
1325 !set_isempty(c
->address_families
);
1328 static bool context_has_syscall_filters(const ExecContext
*c
) {
1331 return c
->syscall_allow_list
||
1332 !hashmap_isempty(c
->syscall_filter
);
1335 static bool context_has_syscall_logs(const ExecContext
*c
) {
1338 return c
->syscall_log_allow_list
||
1339 !hashmap_isempty(c
->syscall_log
);
1342 static bool context_has_seccomp(const ExecContext
*c
) {
1343 /* We need NNP if we have any form of seccomp and are unprivileged */
1344 return c
->lock_personality
||
1345 c
->memory_deny_write_execute
||
1346 c
->private_devices
||
1348 c
->protect_hostname
||
1349 c
->protect_kernel_tunables
||
1350 c
->protect_kernel_modules
||
1351 c
->protect_kernel_logs
||
1352 context_has_address_families(c
) ||
1353 exec_context_restrict_namespaces_set(c
) ||
1354 c
->restrict_realtime
||
1355 c
->restrict_suid_sgid
||
1356 !set_isempty(c
->syscall_archs
) ||
1357 context_has_syscall_filters(c
) ||
1358 context_has_syscall_logs(c
);
1361 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1364 if (c
->no_new_privileges
)
1367 if (have_effective_cap(CAP_SYS_ADMIN
) > 0) /* if we are privileged, we don't need NNP */
1370 return context_has_seccomp(c
);
1375 static bool seccomp_allows_drop_privileges(const ExecContext
*c
) {
1377 bool has_capget
= false, has_capset
= false, has_prctl
= false;
1381 /* No syscall filter, we are allowed to drop privileges */
1382 if (hashmap_isempty(c
->syscall_filter
))
1385 HASHMAP_FOREACH_KEY(val
, id
, c
->syscall_filter
) {
1386 _cleanup_free_
char *name
= NULL
;
1388 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
1390 if (streq(name
, "capget"))
1392 else if (streq(name
, "capset"))
1394 else if (streq(name
, "prctl"))
1398 if (c
->syscall_allow_list
)
1399 return has_capget
&& has_capset
&& has_prctl
;
1401 return !(has_capget
|| has_capset
|| has_prctl
);
1404 static bool skip_seccomp_unavailable(const ExecContext
*c
, const ExecParameters
*p
, const char* msg
) {
1406 if (is_seccomp_available())
1409 log_exec_debug(c
, p
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1413 static int apply_syscall_filter(const ExecContext
*c
, const ExecParameters
*p
, bool needs_ambient_hack
) {
1414 uint32_t negative_action
, default_action
, action
;
1420 if (!context_has_syscall_filters(c
))
1423 if (skip_seccomp_unavailable(c
, p
, "SystemCallFilter="))
1426 negative_action
= c
->syscall_errno
== SECCOMP_ERROR_NUMBER_KILL
? scmp_act_kill_process() : SCMP_ACT_ERRNO(c
->syscall_errno
);
1428 if (c
->syscall_allow_list
) {
1429 default_action
= negative_action
;
1430 action
= SCMP_ACT_ALLOW
;
1432 default_action
= SCMP_ACT_ALLOW
;
1433 action
= negative_action
;
1436 if (needs_ambient_hack
) {
1437 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_allow_list
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1442 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
, false);
1445 static int apply_syscall_log(const ExecContext
*c
, const ExecParameters
*p
) {
1447 uint32_t default_action
, action
;
1453 if (!context_has_syscall_logs(c
))
1457 if (skip_seccomp_unavailable(c
, p
, "SystemCallLog="))
1460 if (c
->syscall_log_allow_list
) {
1461 /* Log nothing but the ones listed */
1462 default_action
= SCMP_ACT_ALLOW
;
1463 action
= SCMP_ACT_LOG
;
1465 /* Log everything but the ones listed */
1466 default_action
= SCMP_ACT_LOG
;
1467 action
= SCMP_ACT_ALLOW
;
1470 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_log
, action
, false);
1472 /* old libseccomp */
1473 log_exec_debug(c
, p
, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1478 static int apply_syscall_archs(const ExecContext
*c
, const ExecParameters
*p
) {
1482 if (set_isempty(c
->syscall_archs
))
1485 if (skip_seccomp_unavailable(c
, p
, "SystemCallArchitectures="))
1488 return seccomp_restrict_archs(c
->syscall_archs
);
1491 static int apply_address_families(const ExecContext
*c
, const ExecParameters
*p
) {
1495 if (!context_has_address_families(c
))
1498 if (skip_seccomp_unavailable(c
, p
, "RestrictAddressFamilies="))
1501 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_allow_list
);
1504 static int apply_memory_deny_write_execute(const ExecContext
*c
, const ExecParameters
*p
) {
1510 if (!c
->memory_deny_write_execute
)
1513 /* use prctl() if kernel supports it (6.3) */
1514 r
= prctl(PR_SET_MDWE
, PR_MDWE_REFUSE_EXEC_GAIN
, 0, 0, 0);
1516 log_exec_debug(c
, p
, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1519 if (r
< 0 && errno
!= EINVAL
)
1520 return log_exec_debug_errno(c
,
1523 "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1524 /* else use seccomp */
1525 log_exec_debug(c
, p
, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1527 if (skip_seccomp_unavailable(c
, p
, "MemoryDenyWriteExecute="))
1530 return seccomp_memory_deny_write_execute();
1533 static int apply_restrict_realtime(const ExecContext
*c
, const ExecParameters
*p
) {
1537 if (!c
->restrict_realtime
)
1540 if (skip_seccomp_unavailable(c
, p
, "RestrictRealtime="))
1543 return seccomp_restrict_realtime();
1546 static int apply_restrict_suid_sgid(const ExecContext
*c
, const ExecParameters
*p
) {
1550 if (!c
->restrict_suid_sgid
)
1553 if (skip_seccomp_unavailable(c
, p
, "RestrictSUIDSGID="))
1556 return seccomp_restrict_suid_sgid();
1559 static int apply_protect_sysctl(const ExecContext
*c
, const ExecParameters
*p
) {
1563 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1564 * let's protect even those systems where this is left on in the kernel. */
1566 if (!c
->protect_kernel_tunables
)
1569 if (skip_seccomp_unavailable(c
, p
, "ProtectKernelTunables="))
1572 return seccomp_protect_sysctl();
1575 static int apply_protect_kernel_modules(const ExecContext
*c
, const ExecParameters
*p
) {
1579 /* Turn off module syscalls on ProtectKernelModules=yes */
1581 if (!c
->protect_kernel_modules
)
1584 if (skip_seccomp_unavailable(c
, p
, "ProtectKernelModules="))
1587 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
), false);
1590 static int apply_protect_kernel_logs(const ExecContext
*c
, const ExecParameters
*p
) {
1594 if (!c
->protect_kernel_logs
)
1597 if (skip_seccomp_unavailable(c
, p
, "ProtectKernelLogs="))
1600 return seccomp_protect_syslog();
1603 static int apply_protect_clock(const ExecContext
*c
, const ExecParameters
*p
) {
1607 if (!c
->protect_clock
)
1610 if (skip_seccomp_unavailable(c
, p
, "ProtectClock="))
1613 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_CLOCK
, SCMP_ACT_ERRNO(EPERM
), false);
1616 static int apply_private_devices(const ExecContext
*c
, const ExecParameters
*p
) {
1620 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1622 if (!c
->private_devices
)
1625 if (skip_seccomp_unavailable(c
, p
, "PrivateDevices="))
1628 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
), false);
1631 static int apply_restrict_namespaces(const ExecContext
*c
, const ExecParameters
*p
) {
1635 if (!exec_context_restrict_namespaces_set(c
))
1638 if (skip_seccomp_unavailable(c
, p
, "RestrictNamespaces="))
1641 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
1644 static int apply_lock_personality(const ExecContext
*c
, const ExecParameters
*p
) {
1645 unsigned long personality
;
1651 if (!c
->lock_personality
)
1654 if (skip_seccomp_unavailable(c
, p
, "LockPersonality="))
1657 personality
= c
->personality
;
1659 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1660 if (personality
== PERSONALITY_INVALID
) {
1662 r
= opinionated_personality(&personality
);
1667 return seccomp_lock_personality(personality
);
/* Applies RestrictFileSystems= by updating the LSM BPF map referenced by
 * p->bpf_restrict_fs_map_fd with the configured file system set for cgroup p->cgroup_id.
 * When no map fd is available (fd < 0) the setting is skipped with a debug message.
 *
 * NOTE(review): the comment below announces a "dl-open again" step, but the call performing it
 * (and the early returns of the two guards) are not visible in this excerpt — confirm against the
 * complete function before relying on this description. */
1673 static int apply_restrict_filesystems(const ExecContext
*c
, const ExecParameters
*p
) {
1679 if (!exec_context_restrict_filesystems_set(c
))
1682 if (p
->bpf_restrict_fs_map_fd
< 0) {
1683 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1684 log_exec_debug(c
, p
, "LSM BPF not supported, skipping RestrictFileSystems=");
1688 /* We are in a new binary, so dl-open again */
1693 return bpf_restrict_fs_update(c
->restrict_filesystems
, p
->cgroup_id
, p
->bpf_restrict_fs_map_fd
, c
->restrict_filesystems_allow_list
);
/* Applies ProtectHostname=.
 *
 * Step 1: if UTS namespaces are supported, unshare(CLONE_NEWUTS). A failure that is neither "not
 * supported" nor a privilege error is fatal: *ret_exit_status is set to EXIT_NAMESPACE and the
 * error is logged and returned. The two remaining message strings below cover the "prohibited" and
 * "kernel lacks UTS namespaces" cases, which are ignored.
 * Step 2: install the seccomp hostname protection unless seccomp is unavailable; on failure
 * *ret_exit_status is set to EXIT_SECCOMP and the error is logged and returned.
 *
 * NOTE(review): the calls that emit the two "ignoring namespace setup" messages, and the guard
 * around the seccomp part, are not visible in this excerpt — verify log level and conditions
 * against the complete function. */
1697 static int apply_protect_hostname(const ExecContext
*c
, const ExecParameters
*p
, int *ret_exit_status
) {
1701 if (!c
->protect_hostname
)
1704 if (ns_type_supported(NAMESPACE_UTS
)) {
1705 if (unshare(CLONE_NEWUTS
) < 0) {
1706 if (!ERRNO_IS_NOT_SUPPORTED(errno
) && !ERRNO_IS_PRIVILEGE(errno
)) {
1707 *ret_exit_status
= EXIT_NAMESPACE
;
1708 return log_exec_error_errno(c
,
1711 "Failed to set up UTS namespacing: %m");
1716 "ProtectHostname=yes is configured, but UTS namespace setup is "
1717 "prohibited (container manager?), ignoring namespace setup.");
1722 "ProtectHostname=yes is configured, but the kernel does not "
1723 "support UTS namespaces, ignoring namespace setup.");
1728 if (skip_seccomp_unavailable(c
, p
, "ProtectHostname="))
1731 r
= seccomp_protect_hostname();
1733 *ret_exit_status
= EXIT_SECCOMP
;
1734 return log_exec_error_errno(c
, p
, r
, "Failed to apply hostname restrictions: %m");
1741 static void do_idle_pipe_dance(int idle_pipe
[static 4]) {
1744 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1745 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1747 if (idle_pipe
[0] >= 0) {
1750 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1752 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1755 /* Signal systemd that we are bored and want to continue. */
1756 n
= write(idle_pipe
[3], "x", 1);
1758 /* Wait for systemd to react to the signal above. */
1759 (void) fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1762 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1766 idle_pipe
[3] = safe_close(idle_pipe
[3]);
1769 static const char *exec_directory_env_name_to_string(ExecDirectoryType t
);
1771 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
1772 * the service payload in. */
1773 static const char* const exec_directory_env_name_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
1774 [EXEC_DIRECTORY_RUNTIME
] = "RUNTIME_DIRECTORY",
1775 [EXEC_DIRECTORY_STATE
] = "STATE_DIRECTORY",
1776 [EXEC_DIRECTORY_CACHE
] = "CACHE_DIRECTORY",
1777 [EXEC_DIRECTORY_LOGS
] = "LOGS_DIRECTORY",
1778 [EXEC_DIRECTORY_CONFIGURATION
] = "CONFIGURATION_DIRECTORY",
1781 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name
, ExecDirectoryType
);
/* Assembles the environment variables that are generated for the payload into a newly allocated
 * string array, which is handed to the caller through *ret (TAKE_PTR at the very end).
 *
 * The array is allocated zero-initialized with N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX slots; the
 * closing assert() checks that at least one slot stays unused, so the array remains NULL-terminated.
 *
 * Entries appended below, in order: LISTEN_PID, LISTEN_FDS, LISTEN_FDNAMES; WATCHDOG_PID and
 * WATCHDOG_USEC (EXEC_SET_WATCHDOG set and watchdog_usec > 0); SYSTEMD_NSS_DYNAMIC_BYPASS
 * (EXEC_NSS_DYNAMIC_BYPASS set); USER; LOGNAME, HOME and SHELL (only when
 * exec_context_get_set_login_environment() returns true); INVOCATION_ID (non-null invocation ID);
 * TERM (when exec_context_needs_term()); JOURNAL_STREAM (device and inode both non-zero);
 * LOG_NAMESPACE; one variable per ExecDirectoryType that has items, holding the ':'-joined prefixed
 * paths; CREDENTIALS_DIRECTORY; SYSTEMD_EXEC_PID; MEMORY_PRESSURE_WATCH and, unless the path is
 * /dev/null, a base64-encoded MEMORY_PRESSURE_WRITE.
 *
 * NOTE(review): some parameters, several guards (e.g. the condition for the LISTEN_* group) and
 * all error returns are not visible in this excerpt — confirm against the complete function. */
1783 static int build_environment(
1784 const ExecContext
*c
,
1785 const ExecParameters
*p
,
1786 const CGroupContext
*cgroup_context
,
1789 const char *username
,
1791 dev_t journal_stream_dev
,
1792 ino_t journal_stream_ino
,
1793 const char *memory_pressure_path
,
1796 _cleanup_strv_free_
char **our_env
= NULL
;
1805 #define N_ENV_VARS 19
1806 our_env
= new0(char*, N_ENV_VARS
+ _EXEC_DIRECTORY_TYPE_MAX
);
1811 _cleanup_free_
char *joined
= NULL
;
1813 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1815 our_env
[n_env
++] = x
;
1817 if (asprintf(&x
, "LISTEN_FDS=%zu", n_fds
) < 0)
1819 our_env
[n_env
++] = x
;
1821 joined
= strv_join(p
->fd_names
, ":");
1825 x
= strjoin("LISTEN_FDNAMES=", joined
);
1828 our_env
[n_env
++] = x
;
1831 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1832 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1834 our_env
[n_env
++] = x
;
1836 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1838 our_env
[n_env
++] = x
;
1841 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1842 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1843 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1844 if (p
->flags
& EXEC_NSS_DYNAMIC_BYPASS
) {
1845 x
= strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1848 our_env
[n_env
++] = x
;
1851 /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
1852 * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
1853 * really make much sense since we're not logged in. Hence we conditionalize the three based on
1854 * SetLoginEnvironment= switch. */
1855 if (!c
->user
&& !c
->dynamic_user
&& p
->runtime_scope
== RUNTIME_SCOPE_SYSTEM
) {
1856 r
= get_fixed_user("root", &username
, NULL
, NULL
, &home
, &shell
);
1858 return log_exec_debug_errno(c
,
1861 "Failed to determine user credentials for root: %m");
1864 bool set_user_login_env
= exec_context_get_set_login_environment(c
);
1867 x
= strjoin("USER=", username
);
1870 our_env
[n_env
++] = x
;
1872 if (set_user_login_env
) {
1873 x
= strjoin("LOGNAME=", username
);
1876 our_env
[n_env
++] = x
;
1880 if (home
&& set_user_login_env
) {
1881 x
= strjoin("HOME=", home
);
/* x + 5 skips the "HOME=" prefix, so only the path part gets simplified */
1885 path_simplify(x
+ 5);
1886 our_env
[n_env
++] = x
;
1889 if (shell
&& set_user_login_env
) {
1890 x
= strjoin("SHELL=", shell
);
/* x + 6 skips the "SHELL=" prefix, so only the path part gets simplified */
1894 path_simplify(x
+ 6);
1895 our_env
[n_env
++] = x
;
1898 if (!sd_id128_is_null(p
->invocation_id
)) {
1899 assert(p
->invocation_id_string
);
1901 x
= strjoin("INVOCATION_ID=", p
->invocation_id_string
);
1905 our_env
[n_env
++] = x
;
1908 if (exec_context_needs_term(c
)) {
1909 _cleanup_free_
char *cmdline
= NULL
;
1910 const char *tty_path
, *term
= NULL
;
1912 tty_path
= exec_context_tty_path(c
);
1914 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1915 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1916 * container manager passes to PID 1 ends up all the way in the console login shown. */
1918 if (path_equal(tty_path
, "/dev/console") && getppid() == 1)
1919 term
= getenv("TERM");
1920 else if (tty_path
&& in_charset(skip_dev_prefix(tty_path
), ALPHANUMERICAL
)) {
1921 _cleanup_free_
char *key
= NULL
;
1923 key
= strjoin("systemd.tty.term.", skip_dev_prefix(tty_path
));
1927 r
= proc_cmdline_get_key(key
, 0, &cmdline
);
1929 log_exec_debug_errno(c
,
1932 "Failed to read %s from kernel cmdline, ignoring: %m",
1939 term
= default_term_for_tty(tty_path
);
1941 x
= strjoin("TERM=", term
);
1944 our_env
[n_env
++] = x
;
1947 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
1948 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
1951 our_env
[n_env
++] = x
;
1954 if (c
->log_namespace
) {
1955 x
= strjoin("LOG_NAMESPACE=", c
->log_namespace
);
1959 our_env
[n_env
++] = x
;
1962 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
1963 _cleanup_free_
char *joined
= NULL
;
1969 if (c
->directories
[t
].n_items
== 0)
1972 n
= exec_directory_env_name_to_string(t
);
1976 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
1977 _cleanup_free_
char *prefixed
= NULL
;
1979 prefixed
= path_join(p
->prefix
[t
], c
->directories
[t
].items
[i
].path
);
1983 if (!strextend_with_separator(&joined
, ":", prefixed
))
1987 x
= strjoin(n
, "=", joined
);
1991 our_env
[n_env
++] = x
;
1994 _cleanup_free_
char *creds_dir
= NULL
;
1995 r
= exec_context_get_credential_directory(c
, p
, p
->unit_id
, &creds_dir
);
1999 x
= strjoin("CREDENTIALS_DIRECTORY=", creds_dir
);
2003 our_env
[n_env
++] = x
;
2006 if (asprintf(&x
, "SYSTEMD_EXEC_PID=" PID_FMT
, getpid_cached()) < 0)
2009 our_env
[n_env
++] = x
;
2011 if (memory_pressure_path
) {
2012 x
= strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path
);
2016 our_env
[n_env
++] = x
;
2018 if (cgroup_context
&& !path_equal(memory_pressure_path
, "/dev/null")) {
2019 _cleanup_free_
char *b
= NULL
, *e
= NULL
;
2021 if (asprintf(&b
, "%s " USEC_FMT
" " USEC_FMT
,
2022 MEMORY_PRESSURE_DEFAULT_TYPE
,
2023 cgroup_context
->memory_pressure_threshold_usec
== USEC_INFINITY
? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC
:
2024 CLAMP(cgroup_context
->memory_pressure_threshold_usec
, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC
),
2025 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC
) < 0)
/* note: strlen(b) + 1 includes the trailing NUL byte in the encoded data */
2028 if (base64mem(b
, strlen(b
) + 1, &e
) < 0)
2031 x
= strjoin("MEMORY_PRESSURE_WRITE=", e
);
2035 our_env
[n_env
++] = x
;
2039 assert(n_env
< N_ENV_VARS
+ _EXEC_DIRECTORY_TYPE_MAX
);
2042 *ret
= TAKE_PTR(our_env
);
/* Builds the string array for PassEnvironment=: for each name in c->pass_environment a "NAME=value"
 * string is appended to a growing, always NULL-terminated array (GREEDY_REALLOC reserves room for
 * the new entry plus the terminator). The array is handed to the caller through *ret.
 *
 * NOTE(review): where the value 'v' comes from, and what happens when a variable is unset or an
 * allocation fails, is not visible in this excerpt — confirm against the complete function. */
2047 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
2048 _cleanup_strv_free_
char **pass_env
= NULL
;
2051 STRV_FOREACH(i
, c
->pass_environment
) {
2052 _cleanup_free_
char *x
= NULL
;
2058 x
= strjoin(*i
, "=", v
);
2062 if (!GREEDY_REALLOC(pass_env
, n_env
+ 2))
2065 pass_env
[n_env
++] = TAKE_PTR(x
);
2066 pass_env
[n_env
] = NULL
;
2069 *ret
= TAKE_PTR(pass_env
);
/* Moves the calling process into a new user namespace (unshare(CLONE_NEWUSER)) and has a forked
 * helper process write the "setgroups", "gid_map" and "uid_map" files for it.
 *
 * ouid/ogid are always mapped to themselves. uid/gid are mapped to themselves in addition only when
 * they are valid, differ from ouid/ogid, and we hold CAP_SETUID resp. CAP_SETGID.
 *
 * Synchronization between the two processes: the parent writes to an eventfd once the namespace
 * exists; the helper blocks on that eventfd before touching the map files. The helper reports a
 * failure by writing an int to errno_pipe; the parent treats reading 0 bytes from the pipe as
 * success and finally checks the helper's exit status. */
2074 static int setup_private_users(uid_t ouid
, gid_t ogid
, uid_t uid
, gid_t gid
) {
2075 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
2076 _cleanup_close_pair_
int errno_pipe
[2] = EBADF_PAIR
;
2077 _cleanup_close_
int unshare_ready_fd
= -EBADF
;
2078 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
2083 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2084 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2085 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2086 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2087 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2088 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2089 * continues execution normally.
2090 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2091 * does not need CAP_SETUID to write the single line mapping to itself. */
2093 /* Can only set up multiple mappings with CAP_SETUID. */
2094 if (have_effective_cap(CAP_SETUID
) > 0 && uid
!= ouid
&& uid_is_valid(uid
))
2095 r
= asprintf(&uid_map
,
2096 UID_FMT
" " UID_FMT
" 1\n" /* Map $OUID → $OUID */
2097 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
2098 ouid
, ouid
, uid
, uid
);
2100 r
= asprintf(&uid_map
,
2101 UID_FMT
" " UID_FMT
" 1\n", /* Map $OUID → $OUID */
2107 /* Can only set up multiple mappings with CAP_SETGID. */
2108 if (have_effective_cap(CAP_SETGID
) > 0 && gid
!= ogid
&& gid_is_valid(gid
))
2109 r
= asprintf(&gid_map
,
2110 GID_FMT
" " GID_FMT
" 1\n" /* Map $OGID → $OGID */
2111 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
2112 ogid
, ogid
, gid
, gid
);
2114 r
= asprintf(&gid_map
,
2115 GID_FMT
" " GID_FMT
" 1\n", /* Map $OGID -> $OGID */
2121 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2123 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
2124 if (unshare_ready_fd
< 0)
2127 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2129 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
2132 r
= safe_fork("(sd-userns)", FORK_RESET_SIGNALS
|FORK_DEATHSIG_SIGKILL
, &pid
);
2136 _cleanup_close_
int fd
= -EBADF
;
2140 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2141 * here, after the parent opened its own user namespace. */
2144 errno_pipe
[0] = safe_close(errno_pipe
[0]);
2146 /* Wait until the parent unshared the user namespace */
2147 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
2152 /* Disable the setgroups() system call in the child user namespace, for good. */
2153 a
= procfs_file_alloca(ppid
, "setgroups");
2154 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2156 if (errno
!= ENOENT
) {
2161 /* If the file is missing the kernel is too old, let's continue anyway. */
2163 if (write(fd
, "deny\n", 5) < 0) {
2168 fd
= safe_close(fd
);
2171 /* First write the GID map */
2172 a
= procfs_file_alloca(ppid
, "gid_map");
2173 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2178 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
2182 fd
= safe_close(fd
);
2184 /* Then write the UID map */
2185 a
= procfs_file_alloca(ppid
, "uid_map");
2186 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2191 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
2196 _exit(EXIT_SUCCESS
);
/* failure path of the helper: best-effort report of the error code, then exit non-zero */
2199 (void) write(errno_pipe
[1], &r
, sizeof(r
));
2200 _exit(EXIT_FAILURE
);
2203 errno_pipe
[1] = safe_close(errno_pipe
[1]);
2205 if (unshare(CLONE_NEWUSER
) < 0)
2208 /* Let the child know that the namespace is ready now */
2209 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
2212 /* Try to read an error code from the child */
2213 n
= read(errno_pipe
[0], &r
, sizeof(r
));
2216 if (n
== sizeof(r
)) { /* an error code was sent to us */
2221 if (n
!= 0) /* on success we should have read 0 bytes */
2224 r
= wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid
), 0);
2227 if (r
!= EXIT_SUCCESS
) /* If something strange happened with the child, let's consider this fatal, too */
2233 static int create_many_symlinks(const char *root
, const char *source
, char **symlinks
) {
2234 _cleanup_free_
char *src_abs
= NULL
;
2239 src_abs
= path_join(root
, source
);
2243 STRV_FOREACH(dst
, symlinks
) {
2244 _cleanup_free_
char *dst_abs
= NULL
;
2246 dst_abs
= path_join(root
, *dst
);
2250 r
= mkdir_parents_label(dst_abs
, 0755);
2254 r
= symlink_idempotent(src_abs
, dst_abs
, true);
/* Creates and prepares the directories configured for one ExecDirectoryType below
 * params->prefix[type].
 *
 * Per configured item, as far as visible here:
 *  - the path is joined onto the prefix and its parents are created (mode 0755);
 *  - in user scope, for the state and logs types, a compatibility symlink to a matching directory
 *    under the configuration prefix is created if the directory itself is missing;
 *  - if exec_directory_is_private(), the real directory lives below "<prefix>/private" (0700,
 *    root:root), a pre-existing public directory is renamed into it, and a symlink from the public
 *    location is set up unless the item is only_create;
 *  - otherwise, a leftover symlink into "<prefix>/private" is detected and the directory is moved
 *    back to the public location, or the directory is simply created;
 *  - for the configuration type only a mode mismatch warning is issued; for the others the mode is
 *    enforced with chmod_and_chown() and, in system scope, ownership is adjusted recursively.
 * Finally, when no mount namespace is going to be used, the per-item symlinks are created.
 *
 * NOTE(review): the error paths are not visible in this excerpt; *exit_status is assigned from
 * exit_status_table[type] at the end, presumably on the failure path only — confirm. */
2262 static int setup_exec_directory(
2263 const ExecContext
*context
,
2264 const ExecParameters
*params
,
2267 ExecDirectoryType type
,
2268 bool needs_mount_namespace
,
2271 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
2272 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
2273 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
2274 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
2275 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
2276 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
2282 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
2283 assert(exit_status
);
2285 if (!params
->prefix
[type
])
2288 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
2289 if (!uid_is_valid(uid
))
2291 if (!gid_is_valid(gid
))
2295 FOREACH_ARRAY(i
, context
->directories
[type
].items
, context
->directories
[type
].n_items
) {
2296 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
2298 p
= path_join(params
->prefix
[type
], i
->path
);
2304 r
= mkdir_parents_label(p
, 0755);
2308 if (IN_SET(type
, EXEC_DIRECTORY_STATE
, EXEC_DIRECTORY_LOGS
) && params
->runtime_scope
== RUNTIME_SCOPE_USER
) {
2310 /* If we are in user mode, and a configuration directory exists but a state directory
2311 * doesn't exist, then we likely are upgrading from an older systemd version that
2312 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2313 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2314 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
2315 * separated. If a service has both dirs configured but only the configuration dir
2316 * exists and the state dir does not, we assume we are looking at an update
2317 * situation. Hence, create a compatibility symlink, so that all expectations are
2320 * (We also do something similar with the log directory, which still doesn't exist in
2321 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2323 /* this assumes the state dir is always created before the configuration dir */
2324 assert_cc(EXEC_DIRECTORY_STATE
< EXEC_DIRECTORY_LOGS
);
2325 assert_cc(EXEC_DIRECTORY_LOGS
< EXEC_DIRECTORY_CONFIGURATION
);
2327 r
= laccess(p
, F_OK
);
2329 _cleanup_free_
char *q
= NULL
;
2331 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2332 * under the configuration hierarchy. */
2334 if (type
== EXEC_DIRECTORY_STATE
)
2335 q
= path_join(params
->prefix
[EXEC_DIRECTORY_CONFIGURATION
], i
->path
);
2336 else if (type
== EXEC_DIRECTORY_LOGS
)
2337 q
= path_join(params
->prefix
[EXEC_DIRECTORY_CONFIGURATION
], "log", i
->path
);
2339 assert_not_reached();
2345 r
= laccess(q
, F_OK
);
2347 /* It does exist! This hence looks like an update. Symlink the
2348 * configuration directory into the state directory. */
2350 r
= symlink_idempotent(q
, p
, /* make_relative= */ true);
2354 log_exec_notice(context
, params
, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p
, q
);
2356 } else if (r
!= -ENOENT
)
2357 log_exec_warning_errno(context
, params
, r
, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q
);
2360 log_exec_warning_errno(context
, params
, r
, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p
);
2363 if (exec_directory_is_private(context
, type
)) {
2364 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2365 * case we want to avoid leaving a directory around fully accessible that is owned by
2366 * a dynamic user whose UID is later on reused. To lock this down we use the same
2367 * trick used by container managers to prohibit host users to get access to files of
2368 * the same UID in containers: we place everything inside a directory that has an
2369 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2370 * for unprivileged host code. We then use fs namespacing to make this directory
2371 * permeable for the service itself.
2373 * Specifically: for a service which wants a special directory "foo/" we first create
2374 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2375 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2376 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2377 * unprivileged host users can't look into it. Inside of the namespace of the unit
2378 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2379 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2380 * for the service and making sure it only gets access to the dirs it needs but no
2381 * others. Tricky? Yes, absolutely, but it works!
2383 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2384 * to be owned by the service itself.
2386 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2387 * for sharing files or sockets with other services. */
2389 pp
= path_join(params
->prefix
[type
], "private");
2395 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2396 r
= mkdir_safe_label(pp
, 0700, 0, 0, MKDIR_WARN_MODE
);
2400 if (!path_extend(&pp
, i
->path
)) {
2405 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2406 r
= mkdir_parents_label(pp
, 0755);
2410 if (is_dir(p
, false) > 0 &&
2411 (laccess(pp
, F_OK
) == -ENOENT
)) {
2413 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2414 * it over. Most likely the service has been upgraded from one that didn't use
2415 * DynamicUser=1, to one that does. */
2417 log_exec_info(context
,
2419 "Found pre-existing public %s= directory %s, migrating to %s.\n"
2420 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2421 exec_directory_type_to_string(type
), p
, pp
);
2423 r
= RET_NERRNO(rename(p
, pp
));
2427 /* Otherwise, create the actual directory for the service */
2429 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
2430 if (r
< 0 && r
!= -EEXIST
)
2434 if (!i
->only_create
) {
2435 /* And link it up from the original place.
2437 * 1) If a mount namespace is going to be used, then this symlink remains on
2438 * the host, and a new one for the child namespace will be created later.
2439 * 2) It is not necessary to create this symlink when one of its parent
2440 * directories is specified and already created. E.g.
2441 * StateDirectory=foo foo/bar
2442 * In that case, the inode points to pp and p for "foo/bar" are the same:
2443 * pp = "/var/lib/private/foo/bar"
2444 * p = "/var/lib/foo/bar"
2445 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2446 * we do not need to create the symlink, but we cannot create the symlink.
2447 * See issue #24783. */
2448 r
= symlink_idempotent(pp
, p
, true);
2454 _cleanup_free_
char *target
= NULL
;
2456 if (type
!= EXEC_DIRECTORY_CONFIGURATION
&&
2457 readlink_and_make_absolute(p
, &target
) >= 0) {
2458 _cleanup_free_
char *q
= NULL
, *q_resolved
= NULL
, *target_resolved
= NULL
;
2460 /* This already exists and is a symlink? Interesting. Maybe it's one created
2461 * by DynamicUser=1 (see above)?
2463 * We do this for all directory types except for ConfigurationDirectory=,
2464 * since they all support the private/ symlink logic at least in some
2465 * configurations, see above. */
2467 r
= chase(target
, NULL
, 0, &target_resolved
, NULL
);
2471 q
= path_join(params
->prefix
[type
], "private", i
->path
);
2477 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2478 r
= chase(q
, NULL
, CHASE_NONEXISTENT
, &q_resolved
, NULL
);
2482 if (path_equal(q_resolved
, target_resolved
)) {
2484 /* Hmm, apparently DynamicUser= was once turned on for this service,
2485 * but is no longer. Let's move the directory back up. */
2487 log_exec_info(context
,
2489 "Found pre-existing private %s= directory %s, migrating to %s.\n"
2490 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2491 exec_directory_type_to_string(type
), q
, p
);
2493 r
= RET_NERRNO(unlink(p
));
2497 r
= RET_NERRNO(rename(q
, p
));
2503 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2508 if (type
== EXEC_DIRECTORY_CONFIGURATION
) {
2511 /* Don't change the owner/access mode of the configuration directory,
2512 * as in the common case it is not written to by a service, and shall
2513 * not be writable. */
2515 r
= RET_NERRNO(stat(p
, &st
));
2519 /* Still complain if the access mode doesn't match */
2520 if (((st
.st_mode
^ context
->directories
[type
].mode
) & 07777) != 0)
2521 log_exec_warning(context
,
2523 "%s \'%s\' already exists but the mode is different. "
2524 "(File system: %o %sMode: %o)",
2525 exec_directory_type_to_string(type
), i
->path
,
2526 st
.st_mode
& 07777, exec_directory_type_to_string(type
), context
->directories
[type
].mode
& 07777);
2533 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2534 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2535 * current UID/GID ownership.) */
2536 r
= chmod_and_chown(pp
?: p
, context
->directories
[type
].mode
, UID_INVALID
, GID_INVALID
);
2540 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2541 * available to user code anyway */
2542 if (params
->runtime_scope
!= RUNTIME_SCOPE_SYSTEM
)
2545 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2546 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2547 * assignments to exist. */
2548 r
= path_chown_recursive(pp
?: p
, uid
, gid
, context
->dynamic_user
? 01777 : 07777, AT_SYMLINK_FOLLOW
);
2553 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2554 * they are set up later, to allow configuring empty var/run/etc. */
2555 if (!needs_mount_namespace
)
2556 FOREACH_ARRAY(i
, context
->directories
[type
].items
, context
->directories
[type
].n_items
) {
2557 r
= create_many_symlinks(params
->prefix
[type
], i
->path
, i
->symlinks
);
2565 *exit_status
= exit_status_table
[type
];
2570 static int setup_smack(
2571 const ExecParameters
*params
,
2572 const ExecContext
*context
,
2573 int executable_fd
) {
2577 assert(executable_fd
>= 0);
2579 if (context
->smack_process_label
) {
2580 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
2583 } else if (params
->fallback_smack_process_label
) {
2584 _cleanup_free_
char *exec_label
= NULL
;
2586 r
= mac_smack_read_fd(executable_fd
, SMACK_ATTR_EXEC
, &exec_label
);
2587 if (r
< 0 && !ERRNO_IS_XATTR_ABSENT(r
))
2590 r
= mac_smack_apply_pid(0, exec_label
?: params
->fallback_smack_process_label
);
2599 static int compile_bind_mounts(
2600 const ExecContext
*context
,
2601 const ExecParameters
*params
,
2602 BindMount
**ret_bind_mounts
,
2603 size_t *ret_n_bind_mounts
,
2604 char ***ret_empty_directories
) {
2606 _cleanup_strv_free_
char **empty_directories
= NULL
;
2607 BindMount
*bind_mounts
= NULL
;
2613 assert(ret_bind_mounts
);
2614 assert(ret_n_bind_mounts
);
2615 assert(ret_empty_directories
);
2617 CLEANUP_ARRAY(bind_mounts
, h
, bind_mount_free_many
);
2619 n
= context
->n_bind_mounts
;
2620 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2621 if (!params
->prefix
[t
])
2624 FOREACH_ARRAY(i
, context
->directories
[t
].items
, context
->directories
[t
].n_items
)
2625 n
+= !i
->only_create
;
2629 *ret_bind_mounts
= NULL
;
2630 *ret_n_bind_mounts
= 0;
2631 *ret_empty_directories
= NULL
;
2635 bind_mounts
= new(BindMount
, n
);
2639 FOREACH_ARRAY(item
, context
->bind_mounts
, context
->n_bind_mounts
) {
2640 _cleanup_free_
char *s
= NULL
, *d
= NULL
;
2642 s
= strdup(item
->source
);
2646 d
= strdup(item
->destination
);
2650 bind_mounts
[h
++] = (BindMount
) {
2651 .source
= TAKE_PTR(s
),
2652 .destination
= TAKE_PTR(d
),
2653 .read_only
= item
->read_only
,
2654 .recursive
= item
->recursive
,
2655 .ignore_enoent
= item
->ignore_enoent
,
2659 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2660 if (!params
->prefix
[t
])
2663 if (context
->directories
[t
].n_items
== 0)
2666 if (exec_directory_is_private(context
, t
) &&
2667 !exec_context_with_rootfs(context
)) {
2670 /* So this is for a dynamic user, and we need to make sure the process can access its own
2671 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2672 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2674 private_root
= path_join(params
->prefix
[t
], "private");
2678 r
= strv_consume(&empty_directories
, private_root
);
2683 FOREACH_ARRAY(i
, context
->directories
[t
].items
, context
->directories
[t
].n_items
) {
2684 _cleanup_free_
char *s
= NULL
, *d
= NULL
;
2686 /* When one of the parent directories is in the list, we cannot create the symlink
2687 * for the child directory. See also the comments in setup_exec_directory(). */
2691 if (exec_directory_is_private(context
, t
))
2692 s
= path_join(params
->prefix
[t
], "private", i
->path
);
2694 s
= path_join(params
->prefix
[t
], i
->path
);
2698 if (exec_directory_is_private(context
, t
) &&
2699 exec_context_with_rootfs(context
))
2700 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2701 * directory is not created on the root directory. So, let's bind-mount the directory
2702 * on the 'non-private' place. */
2703 d
= path_join(params
->prefix
[t
], i
->path
);
2709 bind_mounts
[h
++] = (BindMount
) {
2710 .source
= TAKE_PTR(s
),
2711 .destination
= TAKE_PTR(d
),
2712 .nosuid
= context
->dynamic_user
, /* don't allow suid/sgid when DynamicUser= is on */
2720 *ret_bind_mounts
= TAKE_PTR(bind_mounts
);
2721 *ret_n_bind_mounts
= n
;
2722 *ret_empty_directories
= TAKE_PTR(empty_directories
);
2727 /* ret_symlinks will contain a list of pairs src:dest that describes
2728 * the symlinks to create later on. For example, the symlinks needed
2729 * to safely give private directories to DynamicUser=1 users. */
2730 static int compile_symlinks(
2731 const ExecContext
*context
,
2732 const ExecParameters
*params
,
2733 bool setup_os_release_symlink
,
2734 char ***ret_symlinks
) {
2736 _cleanup_strv_free_
char **symlinks
= NULL
;
2741 assert(ret_symlinks
);
2743 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++)
2744 FOREACH_ARRAY(i
, context
->directories
[dt
].items
, context
->directories
[dt
].n_items
) {
2745 _cleanup_free_
char *private_path
= NULL
, *path
= NULL
;
2747 STRV_FOREACH(symlink
, i
->symlinks
) {
2748 _cleanup_free_
char *src_abs
= NULL
, *dst_abs
= NULL
;
2750 src_abs
= path_join(params
->prefix
[dt
], i
->path
);
2751 dst_abs
= path_join(params
->prefix
[dt
], *symlink
);
2752 if (!src_abs
|| !dst_abs
)
2755 r
= strv_consume_pair(&symlinks
, TAKE_PTR(src_abs
), TAKE_PTR(dst_abs
));
2760 if (!exec_directory_is_private(context
, dt
) ||
2761 exec_context_with_rootfs(context
) ||
2765 private_path
= path_join(params
->prefix
[dt
], "private", i
->path
);
2769 path
= path_join(params
->prefix
[dt
], i
->path
);
2773 r
= strv_consume_pair(&symlinks
, TAKE_PTR(private_path
), TAKE_PTR(path
));
2778 /* We make the host's os-release available via a symlink, so that we can copy it atomically
2779 * and readers will never get a half-written version. Note that, while the paths specified here are
2780 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2781 * 'os-release -> .os-release-stage/os-release' is what will be created. */
2782 if (setup_os_release_symlink
) {
2783 r
= strv_extend_many(
2785 "/run/host/.os-release-stage/os-release",
2786 "/run/host/os-release");
2791 *ret_symlinks
= TAKE_PTR(symlinks
);
2796 static bool insist_on_sandboxing(
2797 const ExecContext
*context
,
2798 const char *root_dir
,
2799 const char *root_image
,
2800 const BindMount
*bind_mounts
,
2801 size_t n_bind_mounts
) {
2804 assert(n_bind_mounts
== 0 || bind_mounts
);
2806 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2807 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2808 * rearrange stuff in a way we cannot ignore gracefully. */
2810 if (context
->n_temporary_filesystems
> 0)
2813 if (root_dir
|| root_image
)
2816 if (context
->n_mount_images
> 0)
2819 if (context
->dynamic_user
)
2822 if (context
->n_extension_images
> 0 || !strv_isempty(context
->extension_directories
))
2825 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2827 FOREACH_ARRAY(i
, bind_mounts
, n_bind_mounts
)
2828 if (!path_equal(i
->source
, i
->destination
))
2831 if (context
->log_namespace
)
2837 static int setup_ephemeral(
2838 const ExecContext
*context
,
2839 ExecRuntime
*runtime
,
2840 char **root_image
, /* both input and output! modified if ephemeral logic enabled */
2841 char **root_directory
) { /* ditto */
2843 _cleanup_close_
int fd
= -EBADF
;
2844 _cleanup_free_
char *new_root
= NULL
;
2849 assert(root_directory
);
2851 if (!*root_image
&& !*root_directory
)
2854 if (!runtime
|| !runtime
->ephemeral_copy
)
2857 assert(runtime
->ephemeral_storage_socket
[0] >= 0);
2858 assert(runtime
->ephemeral_storage_socket
[1] >= 0);
2860 new_root
= strdup(runtime
->ephemeral_copy
);
2862 return log_oom_debug();
2864 r
= posix_lock(runtime
->ephemeral_storage_socket
[0], LOCK_EX
);
2866 return log_debug_errno(r
, "Failed to lock ephemeral storage socket: %m");
2868 CLEANUP_POSIX_UNLOCK(runtime
->ephemeral_storage_socket
[0]);
2870 fd
= receive_one_fd(runtime
->ephemeral_storage_socket
[0], MSG_PEEK
|MSG_DONTWAIT
);
2872 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
2875 return log_debug_errno(fd
, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
2878 log_debug("Making ephemeral copy of %s to %s", *root_image
, new_root
);
2880 fd
= copy_file(*root_image
,
2888 return log_debug_errno(fd
, "Failed to copy image %s to %s: %m",
2889 *root_image
, new_root
);
2891 /* A root image might be subject to lots of random writes so let's try to disable COW on it
2892 * which tends to not perform well in combination with lots of random writes.
2894 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
2895 * copy, but we at least want to make the intention clear.
2897 r
= chattr_fd(fd
, FS_NOCOW_FL
, FS_NOCOW_FL
, NULL
);
2899 log_debug_errno(fd
, "Failed to disable copy-on-write for %s, ignoring: %m", new_root
);
2901 assert(*root_directory
);
2903 log_debug("Making ephemeral snapshot of %s to %s", *root_directory
, new_root
);
2905 fd
= btrfs_subvol_snapshot_at(
2906 AT_FDCWD
, *root_directory
,
2908 BTRFS_SNAPSHOT_FALLBACK_COPY
|
2909 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
2910 BTRFS_SNAPSHOT_RECURSIVE
|
2911 BTRFS_SNAPSHOT_LOCK_BSD
);
2913 return log_debug_errno(fd
, "Failed to snapshot directory %s to %s: %m",
2914 *root_directory
, new_root
);
2917 r
= send_one_fd(runtime
->ephemeral_storage_socket
[1], fd
, MSG_DONTWAIT
);
2919 return log_debug_errno(r
, "Failed to queue file descriptor on ephemeral storage socket: %m");
2922 free_and_replace(*root_image
, new_root
);
2924 assert(*root_directory
);
2925 free_and_replace(*root_directory
, new_root
);
2931 static int verity_settings_prepare(
2932 VeritySettings
*verity
,
2933 const char *root_image
,
2934 const void *root_hash
,
2935 size_t root_hash_size
,
2936 const char *root_hash_path
,
2937 const void *root_hash_sig
,
2938 size_t root_hash_sig_size
,
2939 const char *root_hash_sig_path
,
2940 const char *verity_data_path
) {
2949 d
= memdup(root_hash
, root_hash_size
);
2953 free_and_replace(verity
->root_hash
, d
);
2954 verity
->root_hash_size
= root_hash_size
;
2955 verity
->designator
= PARTITION_ROOT
;
2958 if (root_hash_sig
) {
2961 d
= memdup(root_hash_sig
, root_hash_sig_size
);
2965 free_and_replace(verity
->root_hash_sig
, d
);
2966 verity
->root_hash_sig_size
= root_hash_sig_size
;
2967 verity
->designator
= PARTITION_ROOT
;
2970 if (verity_data_path
) {
2971 r
= free_and_strdup(&verity
->data_path
, verity_data_path
);
2976 r
= verity_settings_load(
2980 root_hash_sig_path
);
2982 return log_debug_errno(r
, "Failed to load root hash: %m");
2987 static int pick_versions(
2988 const ExecContext
*context
,
2989 const ExecParameters
*params
,
2990 char **ret_root_image
,
2991 char **ret_root_directory
) {
2997 assert(ret_root_image
);
2998 assert(ret_root_directory
);
3000 if (context
->root_image
) {
3001 _cleanup_(pick_result_done
) PickResult result
= PICK_RESULT_NULL
;
3003 r
= path_pick(/* toplevel_path= */ NULL
,
3004 /* toplevel_fd= */ AT_FDCWD
,
3005 context
->root_image
,
3006 &pick_filter_image_raw
,
3007 PICK_ARCHITECTURE
|PICK_TRIES
|PICK_RESOLVE
,
3013 return log_exec_debug_errno(context
, params
, SYNTHETIC_ERRNO(ENOENT
), "No matching entry in .v/ directory %s found.", context
->root_image
);
3015 *ret_root_image
= TAKE_PTR(result
.path
);
3016 *ret_root_directory
= NULL
;
3020 if (context
->root_directory
) {
3021 _cleanup_(pick_result_done
) PickResult result
= PICK_RESULT_NULL
;
3023 r
= path_pick(/* toplevel_path= */ NULL
,
3024 /* toplevel_fd= */ AT_FDCWD
,
3025 context
->root_directory
,
3026 &pick_filter_image_dir
,
3027 PICK_ARCHITECTURE
|PICK_TRIES
|PICK_RESOLVE
,
3033 return log_exec_debug_errno(context
, params
, SYNTHETIC_ERRNO(ENOENT
), "No matching entry in .v/ directory %s found.", context
->root_directory
);
3035 *ret_root_image
= NULL
;
3036 *ret_root_directory
= TAKE_PTR(result
.path
);
3040 *ret_root_image
= *ret_root_directory
= NULL
;
3044 static int apply_mount_namespace(
3045 ExecCommandFlags command_flags
,
3046 const ExecContext
*context
,
3047 const ExecParameters
*params
,
3048 ExecRuntime
*runtime
,
3049 const char *memory_pressure_path
,
3050 bool needs_sandboxing
,
3051 char **error_path
) {
3053 _cleanup_(verity_settings_done
) VeritySettings verity
= VERITY_SETTINGS_DEFAULT
;
3054 _cleanup_strv_free_
char **empty_directories
= NULL
, **symlinks
= NULL
,
3055 **read_write_paths_cleanup
= NULL
;
3056 _cleanup_free_
char *creds_path
= NULL
, *incoming_dir
= NULL
, *propagate_dir
= NULL
,
3057 *extension_dir
= NULL
, *host_os_release_stage
= NULL
, *root_image
= NULL
, *root_dir
= NULL
;
3058 const char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
3059 char **read_write_paths
;
3060 bool setup_os_release_symlink
;
3061 BindMount
*bind_mounts
= NULL
;
3062 size_t n_bind_mounts
= 0;
3067 CLEANUP_ARRAY(bind_mounts
, n_bind_mounts
, bind_mount_free_many
);
3069 if (params
->flags
& EXEC_APPLY_CHROOT
) {
3078 r
= setup_ephemeral(
3087 r
= compile_bind_mounts(context
, params
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
3091 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3092 * service will need to write to it in order to start the notifications. */
3093 if (context
->protect_control_groups
&& memory_pressure_path
&& !streq(memory_pressure_path
, "/dev/null")) {
3094 read_write_paths_cleanup
= strv_copy(context
->read_write_paths
);
3095 if (!read_write_paths_cleanup
)
3098 r
= strv_extend(&read_write_paths_cleanup
, memory_pressure_path
);
3102 read_write_paths
= read_write_paths_cleanup
;
3104 read_write_paths
= context
->read_write_paths
;
3106 if (needs_sandboxing
) {
3107 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3108 * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3109 * use here. This does not apply when we are using /run/systemd/empty as fallback. */
3111 if (context
->private_tmp
&& runtime
&& runtime
->shared
) {
3112 if (streq_ptr(runtime
->shared
->tmp_dir
, RUN_SYSTEMD_EMPTY
))
3113 tmp_dir
= runtime
->shared
->tmp_dir
;
3114 else if (runtime
->shared
->tmp_dir
)
3115 tmp_dir
= strjoina(runtime
->shared
->tmp_dir
, "/tmp");
3117 if (streq_ptr(runtime
->shared
->var_tmp_dir
, RUN_SYSTEMD_EMPTY
))
3118 var_tmp_dir
= runtime
->shared
->var_tmp_dir
;
3119 else if (runtime
->shared
->var_tmp_dir
)
3120 var_tmp_dir
= strjoina(runtime
->shared
->var_tmp_dir
, "/tmp");
3124 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3125 setup_os_release_symlink
= needs_sandboxing
&& exec_context_get_effective_mount_apivfs(context
) && (root_dir
|| root_image
);
3126 r
= compile_symlinks(context
, params
, setup_os_release_symlink
, &symlinks
);
3130 if (context
->mount_propagation_flag
== MS_SHARED
)
3131 log_exec_debug(context
,
3133 "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3135 r
= exec_context_get_credential_directory(context
, params
, params
->unit_id
, &creds_path
);
3139 if (params
->runtime_scope
== RUNTIME_SCOPE_SYSTEM
) {
3140 propagate_dir
= path_join("/run/systemd/propagate/", params
->unit_id
);
3144 incoming_dir
= strdup("/run/systemd/incoming");
3148 extension_dir
= strdup("/run/systemd/unit-extensions");
3152 /* If running under a different root filesystem, propagate the host's os-release. We make a
3153 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3154 if (setup_os_release_symlink
) {
3155 host_os_release_stage
= strdup("/run/systemd/propagate/.os-release-stage");
3156 if (!host_os_release_stage
)
3160 assert(params
->runtime_scope
== RUNTIME_SCOPE_USER
);
3162 if (asprintf(&extension_dir
, "/run/user/" UID_FMT
"/systemd/unit-extensions", geteuid()) < 0)
3165 if (setup_os_release_symlink
) {
3166 if (asprintf(&host_os_release_stage
,
3167 "/run/user/" UID_FMT
"/systemd/propagate/.os-release-stage",
3174 r
= verity_settings_prepare(
3177 context
->root_hash
, context
->root_hash_size
, context
->root_hash_path
,
3178 context
->root_hash_sig
, context
->root_hash_sig_size
, context
->root_hash_sig_path
,
3179 context
->root_verity
);
3184 NamespaceParameters parameters
= {
3185 .runtime_scope
= params
->runtime_scope
,
3187 .root_directory
= root_dir
,
3188 .root_image
= root_image
,
3189 .root_image_options
= context
->root_image_options
,
3190 .root_image_policy
= context
->root_image_policy
?: &image_policy_service
,
3192 .read_write_paths
= read_write_paths
,
3193 .read_only_paths
= needs_sandboxing
? context
->read_only_paths
: NULL
,
3194 .inaccessible_paths
= needs_sandboxing
? context
->inaccessible_paths
: NULL
,
3196 .exec_paths
= needs_sandboxing
? context
->exec_paths
: NULL
,
3197 .no_exec_paths
= needs_sandboxing
? context
->no_exec_paths
: NULL
,
3199 .empty_directories
= empty_directories
,
3200 .symlinks
= symlinks
,
3202 .bind_mounts
= bind_mounts
,
3203 .n_bind_mounts
= n_bind_mounts
,
3205 .temporary_filesystems
= context
->temporary_filesystems
,
3206 .n_temporary_filesystems
= context
->n_temporary_filesystems
,
3208 .mount_images
= context
->mount_images
,
3209 .n_mount_images
= context
->n_mount_images
,
3210 .mount_image_policy
= context
->mount_image_policy
?: &image_policy_service
,
3213 .var_tmp_dir
= var_tmp_dir
,
3215 .creds_path
= creds_path
,
3216 .log_namespace
= context
->log_namespace
,
3217 .mount_propagation_flag
= context
->mount_propagation_flag
,
3221 .extension_images
= context
->extension_images
,
3222 .n_extension_images
= context
->n_extension_images
,
3223 .extension_image_policy
= context
->extension_image_policy
?: &image_policy_sysext
,
3224 .extension_directories
= context
->extension_directories
,
3226 .propagate_dir
= propagate_dir
,
3227 .incoming_dir
= incoming_dir
,
3228 .extension_dir
= extension_dir
,
3229 .notify_socket
= root_dir
|| root_image
? params
->notify_socket
: NULL
,
3230 .host_os_release_stage
= host_os_release_stage
,
                /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
                 * otherwise enforce it, don't ignore protected paths and fail if we are unable to apply the
                 * sandbox inside the mount namespace. */
3235 .ignore_protect_paths
= !needs_sandboxing
&& !context
->dynamic_user
&& root_dir
,
3237 .protect_control_groups
= needs_sandboxing
&& context
->protect_control_groups
,
3238 .protect_kernel_tunables
= needs_sandboxing
&& context
->protect_kernel_tunables
,
3239 .protect_kernel_modules
= needs_sandboxing
&& context
->protect_kernel_modules
,
3240 .protect_kernel_logs
= needs_sandboxing
&& context
->protect_kernel_logs
,
3241 .protect_hostname
= needs_sandboxing
&& context
->protect_hostname
,
3243 .private_dev
= needs_sandboxing
&& context
->private_devices
,
3244 .private_network
= needs_sandboxing
&& exec_needs_network_namespace(context
),
3245 .private_ipc
= needs_sandboxing
&& exec_needs_ipc_namespace(context
),
3247 .mount_apivfs
= needs_sandboxing
&& exec_context_get_effective_mount_apivfs(context
),
3249 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3250 .mount_nosuid
= needs_sandboxing
&& context
->no_new_privileges
&& !mac_selinux_use(),
3252 .protect_home
= needs_sandboxing
? context
->protect_home
: false,
3253 .protect_system
= needs_sandboxing
? context
->protect_system
: false,
3254 .protect_proc
= needs_sandboxing
? context
->protect_proc
: false,
3255 .proc_subset
= needs_sandboxing
? context
->proc_subset
: false,
3258 r
= setup_namespace(¶meters
, error_path
);
3259 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3260 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3261 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3262 * completely different execution environment. */
3264 if (insist_on_sandboxing(
3266 root_dir
, root_image
,
3269 return log_exec_debug_errno(context
,
3271 SYNTHETIC_ERRNO(EOPNOTSUPP
),
3272 "Failed to set up namespace, and refusing to continue since "
3273 "the selected namespacing options alter mount environment non-trivially.\n"
3274 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3276 context
->n_temporary_filesystems
,
3279 yes_no(context
->dynamic_user
));
3281 log_exec_debug(context
, params
, "Failed to set up namespace, assuming containerized execution and ignoring.");
3288 static int apply_working_directory(
3289 const ExecContext
*context
,
3290 const ExecParameters
*params
,
3291 ExecRuntime
*runtime
,
3299 assert(exit_status
);
3301 if (context
->working_directory_home
) {
3303 *exit_status
= EXIT_CHDIR
;
3309 wd
= empty_to_root(context
->working_directory
);
3311 if (params
->flags
& EXEC_APPLY_CHROOT
)
3312 r
= RET_NERRNO(chdir(wd
));
3314 _cleanup_close_
int dfd
= -EBADF
;
3317 (runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_directory
,
3318 CHASE_PREFIX_ROOT
|CHASE_AT_RESOLVE_IN_ROOT
,
3319 /* ret_path= */ NULL
,
3322 r
= RET_NERRNO(fchdir(dfd
));
3325 if (r
< 0 && !context
->working_directory_missing_ok
) {
3326 *exit_status
= EXIT_CHDIR
;
3333 static int apply_root_directory(
3334 const ExecContext
*context
,
3335 const ExecParameters
*params
,
3336 ExecRuntime
*runtime
,
3337 const bool needs_mount_ns
,
3341 assert(exit_status
);
3343 if (params
->flags
& EXEC_APPLY_CHROOT
)
3344 if (!needs_mount_ns
&& context
->root_directory
)
3345 if (chroot((runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_directory
) < 0) {
3346 *exit_status
= EXIT_CHROOT
;
3353 static int setup_keyring(
3354 const ExecContext
*context
,
3355 const ExecParameters
*p
,
3356 uid_t uid
, gid_t gid
) {
3358 key_serial_t keyring
;
3366 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3367 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3368 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3369 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3370 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3371 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3373 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
3376 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3377 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3378 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3379 * & group is just as nasty as acquiring a reference to the user keyring. */
3381 saved_uid
= getuid();
3382 saved_gid
= getgid();
3384 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
3385 if (setregid(gid
, -1) < 0)
3386 return log_exec_error_errno(context
,
3389 "Failed to change GID for user keyring: %m");
3392 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
3393 if (setreuid(uid
, -1) < 0) {
3394 r
= log_exec_error_errno(context
,
3397 "Failed to change UID for user keyring: %m");
3402 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
3403 if (keyring
== -1) {
3404 if (errno
== ENOSYS
)
3405 log_exec_debug_errno(context
,
3408 "Kernel keyring not supported, ignoring.");
3409 else if (ERRNO_IS_PRIVILEGE(errno
))
3410 log_exec_debug_errno(context
,
3413 "Kernel keyring access prohibited, ignoring.");
3414 else if (errno
== EDQUOT
)
3415 log_exec_debug_errno(context
,
3418 "Out of kernel keyrings to allocate, ignoring.");
3420 r
= log_exec_error_errno(context
,
3423 "Setting up kernel keyring failed: %m");
3428 /* When requested link the user keyring into the session keyring. */
3429 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
3431 if (keyctl(KEYCTL_LINK
,
3432 KEY_SPEC_USER_KEYRING
,
3433 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
3434 r
= log_exec_error_errno(context
,
3437 "Failed to link user keyring into session keyring: %m");
3442 /* Restore uid/gid back */
3443 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
3444 if (setreuid(saved_uid
, -1) < 0) {
3445 r
= log_exec_error_errno(context
,
3448 "Failed to change UID back for user keyring: %m");
3453 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
3454 if (setregid(saved_gid
, -1) < 0)
3455 return log_exec_error_errno(context
,
3458 "Failed to change GID back for user keyring: %m");
        /* Populate the keyring with the invocation ID by default, as original saved_uid. */
3462 if (!sd_id128_is_null(p
->invocation_id
)) {
3465 key
= add_key("user",
3468 sizeof(p
->invocation_id
),
3469 KEY_SPEC_SESSION_KEYRING
);
3471 log_exec_debug_errno(context
,
3474 "Failed to add invocation ID to keyring, ignoring: %m");
3476 if (keyctl(KEYCTL_SETPERM
, key
,
3477 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
3478 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
3479 r
= log_exec_error_errno(context
,
3482 "Failed to restrict invocation ID permission: %m");
3487 /* Revert back uid & gid for the last time, and exit */
3488 /* no extra logging, as only the first already reported error matters */
3489 if (getuid() != saved_uid
)
3490 (void) setreuid(saved_uid
, -1);
3492 if (getgid() != saved_gid
)
3493 (void) setregid(saved_gid
, -1);
/* Appends the valid (non-negative) descriptors of a socket pair to array[], advancing *n by the number
 * of entries written. The caller must make sure array[] has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++) {
                /* Unset slots hold a negative value and are skipped. */
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
3509 static int close_remaining_fds(
3510 const ExecParameters
*params
,
3511 const ExecRuntime
*runtime
,
3513 const int *fds
, size_t n_fds
) {
3515 size_t n_dont_close
= 0;
3516 int dont_close
[n_fds
+ 16];
3520 if (params
->stdin_fd
>= 0)
3521 dont_close
[n_dont_close
++] = params
->stdin_fd
;
3522 if (params
->stdout_fd
>= 0)
3523 dont_close
[n_dont_close
++] = params
->stdout_fd
;
3524 if (params
->stderr_fd
>= 0)
3525 dont_close
[n_dont_close
++] = params
->stderr_fd
;
3528 dont_close
[n_dont_close
++] = socket_fd
;
3530 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
3531 n_dont_close
+= n_fds
;
3535 append_socket_pair(dont_close
, &n_dont_close
, runtime
->ephemeral_storage_socket
);
3537 if (runtime
&& runtime
->shared
) {
3538 append_socket_pair(dont_close
, &n_dont_close
, runtime
->shared
->netns_storage_socket
);
3539 append_socket_pair(dont_close
, &n_dont_close
, runtime
->shared
->ipcns_storage_socket
);
3542 if (runtime
&& runtime
->dynamic_creds
) {
3543 if (runtime
->dynamic_creds
->user
)
3544 append_socket_pair(dont_close
, &n_dont_close
, runtime
->dynamic_creds
->user
->storage_socket
);
3545 if (runtime
->dynamic_creds
->group
)
3546 append_socket_pair(dont_close
, &n_dont_close
, runtime
->dynamic_creds
->group
->storage_socket
);
3549 if (params
->user_lookup_fd
>= 0)
3550 dont_close
[n_dont_close
++] = params
->user_lookup_fd
;
3552 if (params
->handoff_timestamp_fd
>= 0)
3553 dont_close
[n_dont_close
++] = params
->handoff_timestamp_fd
;
3555 assert(n_dont_close
<= ELEMENTSOF(dont_close
));
3557 return close_all_fds(dont_close
, n_dont_close
);
3560 static int send_user_lookup(
3561 const char *unit_id
,
3568 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3569 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3572 if (user_lookup_fd
< 0)
3575 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
3578 if (writev(user_lookup_fd
,
3580 IOVEC_MAKE(&uid
, sizeof(uid
)),
3581 IOVEC_MAKE(&gid
, sizeof(gid
)),
3582 IOVEC_MAKE_STRING(unit_id
) }, 3) < 0)
3588 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
3595 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3600 if (!c
->working_directory_home
)
3603 r
= get_home_dir(buf
);
3611 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
3612 _cleanup_strv_free_
char ** list
= NULL
;
3619 assert(c
->dynamic_user
);
3621 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3622 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3625 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
3626 if (t
== EXEC_DIRECTORY_CONFIGURATION
)
3632 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
3635 if (exec_directory_is_private(c
, t
))
3636 e
= path_join(p
->prefix
[t
], "private", c
->directories
[t
].items
[i
].path
);
3638 e
= path_join(p
->prefix
[t
], c
->directories
[t
].items
[i
].path
);
3642 r
= strv_consume(&list
, e
);
3648 *ret
= TAKE_PTR(list
);
3653 static int exec_context_cpu_affinity_from_numa(const ExecContext
*c
, CPUSet
*ret
) {
3654 _cleanup_(cpu_set_reset
) CPUSet s
= {};
3660 if (!c
->numa_policy
.nodes
.set
) {
3661 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3665 r
= numa_to_cpu_set(&c
->numa_policy
, &s
);
3671 return cpu_set_add_all(ret
, &s
);
3674 static int add_shifted_fd(int *fds
, size_t fds_size
, size_t *n_fds
, int *fd
) {
3679 assert(*n_fds
< fds_size
);
3685 if (*fd
< 3 + (int) *n_fds
) {
3686 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3687 * the fds we pass to the process (or which are closed only during execve). */
3689 r
= fcntl(*fd
, F_DUPFD_CLOEXEC
, 3 + (int) *n_fds
);
3693 close_and_replace(*fd
, r
);
3696 fds
[(*n_fds
)++] = *fd
;
3700 static int connect_unix_harder(const ExecContext
*c
, const ExecParameters
*p
, const OpenFile
*of
, int ofd
) {
3701 static const int socket_types
[] = { SOCK_DGRAM
, SOCK_STREAM
, SOCK_SEQPACKET
};
3703 union sockaddr_union addr
= {
3704 .un
.sun_family
= AF_UNIX
,
3714 r
= sockaddr_un_set_path(&addr
.un
, FORMAT_PROC_FD_PATH(ofd
));
3716 return log_exec_error_errno(c
, p
, r
, "Failed to set sockaddr for '%s': %m", of
->path
);
3719 FOREACH_ELEMENT(i
, socket_types
) {
3720 _cleanup_close_
int fd
= -EBADF
;
3722 fd
= socket(AF_UNIX
, *i
|SOCK_CLOEXEC
, 0);
3724 return log_exec_error_errno(c
, p
,
3725 errno
, "Failed to create socket for '%s': %m",
3728 r
= RET_NERRNO(connect(fd
, &addr
.sa
, sa_len
));
3731 if (r
!= -EPROTOTYPE
)
3732 return log_exec_error_errno(c
, p
,
3733 r
, "Failed to connect to socket for '%s': %m",
3737 return log_exec_error_errno(c
, p
,
3738 SYNTHETIC_ERRNO(EPROTOTYPE
), "No suitable socket type to connect to socket '%s'.",
3742 static int get_open_file_fd(const ExecContext
*c
, const ExecParameters
*p
, const OpenFile
*of
) {
3743 _cleanup_close_
int fd
= -EBADF
, ofd
= -EBADF
;
3750 ofd
= open(of
->path
, O_PATH
| O_CLOEXEC
);
3752 return log_exec_error_errno(c
, p
, errno
, "Failed to open '%s' as O_PATH: %m", of
->path
);
3754 if (fstat(ofd
, &st
) < 0)
3755 return log_exec_error_errno(c
, p
, errno
, "Failed to stat '%s': %m", of
->path
);
3757 if (S_ISSOCK(st
.st_mode
)) {
3758 fd
= connect_unix_harder(c
, p
, of
, ofd
);
3762 if (FLAGS_SET(of
->flags
, OPENFILE_READ_ONLY
) && shutdown(fd
, SHUT_WR
) < 0)
3763 return log_exec_error_errno(c
, p
,
3764 errno
, "Failed to shutdown send for socket '%s': %m",
3767 log_exec_debug(c
, p
, "Opened socket '%s' as fd %d.", of
->path
, fd
);
3769 int flags
= FLAGS_SET(of
->flags
, OPENFILE_READ_ONLY
) ? O_RDONLY
: O_RDWR
;
3770 if (FLAGS_SET(of
->flags
, OPENFILE_APPEND
))
3772 else if (FLAGS_SET(of
->flags
, OPENFILE_TRUNCATE
))
3775 fd
= fd_reopen(ofd
, flags
| O_CLOEXEC
);
3777 return log_exec_error_errno(c
, p
, fd
, "Failed to reopen file '%s': %m", of
->path
);
3779 log_exec_debug(c
, p
, "Opened file '%s' as fd %d.", of
->path
, fd
);
3785 static int collect_open_file_fds(const ExecContext
*c
, ExecParameters
*p
, size_t *n_fds
) {
3792 LIST_FOREACH(open_files
, of
, p
->open_files
) {
3793 _cleanup_close_
int fd
= -EBADF
;
3795 fd
= get_open_file_fd(c
, p
, of
);
3797 if (FLAGS_SET(of
->flags
, OPENFILE_GRACEFUL
)) {
3798 log_exec_warning_errno(c
, p
, fd
,
3799 "Failed to get OpenFile= file descriptor for '%s', ignoring: %m",
3807 if (!GREEDY_REALLOC(p
->fds
, *n_fds
+ 1))
3810 r
= strv_extend(&p
->fd_names
, of
->fdname
);
3814 p
->fds
[(*n_fds
)++] = TAKE_FD(fd
);
/* Emits a structured LOG_DEBUG entry describing a command line: an EXECUTABLE= field carrying
 * 'executable', a message of the form "<msg>: <quoted argv>", and the invocation ID field derived
 * from params. */
3820 static void log_command_line(
3821 const ExecContext
*context
,
3822 const ExecParameters
*params
,
3824 const char *executable
,
/* The quoted command line is passed through strnull() below, presumably so that a NULL result from
 * quote_command_line() does not break the format string -- TODO confirm. */
3835 _cleanup_free_
char *cmdline
= quote_command_line(argv
, SHELL_ESCAPE_EMPTY
);
3837 log_exec_struct(context
, params
, LOG_DEBUG
,
3838 "EXECUTABLE=%s", executable
,
3839 LOG_EXEC_MESSAGE(params
, "%s: %s", msg
, strnull(cmdline
)),
3840 LOG_EXEC_INVOCATION_ID(params
));
/* Tells whether the execution context uses any setting that needs a user namespace when the manager is
 * not privileged. The runtime scope is checked first (see the comment below); after that the result is
 * true as soon as any one of the listed sandboxing options is enabled in 'context'. */
3843 static bool exec_context_need_unprivileged_private_users(
3844 const ExecContext
*context
,
3845 const ExecParameters
*params
) {
3850 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3851 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3852 * (system manager) then we have privileges and don't need this. */
3853 if (params
->runtime_scope
!= RUNTIME_SCOPE_USER
)
/* Any single option below is sufficient: booleans and pointers are tested for being set, counters and
 * tristate-like fields for being > 0, enum fields for differing from their *_NO value, and string
 * lists for being non-empty. */
3856 return context
->private_users
||
3857 context
->private_tmp
||
3858 context
->private_devices
||
3859 context
->private_network
||
3860 context
->network_namespace_path
||
3861 context
->private_ipc
||
3862 context
->ipc_namespace_path
||
3863 context
->private_mounts
> 0 ||
3864 context
->mount_apivfs
> 0 ||
3865 context
->n_bind_mounts
> 0 ||
3866 context
->n_temporary_filesystems
> 0 ||
3867 context
->root_directory
||
3868 !strv_isempty(context
->extension_directories
) ||
3869 context
->protect_system
!= PROTECT_SYSTEM_NO
||
3870 context
->protect_home
!= PROTECT_HOME_NO
||
3871 context
->protect_kernel_tunables
||
3872 context
->protect_kernel_modules
||
3873 context
->protect_kernel_logs
||
3874 context
->protect_control_groups
||
3875 context
->protect_clock
||
3876 context
->protect_hostname
||
3877 !strv_isempty(context
->read_write_paths
) ||
3878 !strv_isempty(context
->read_only_paths
) ||
3879 !strv_isempty(context
->inaccessible_paths
) ||
3880 !strv_isempty(context
->exec_paths
) ||
3881 !strv_isempty(context
->no_exec_paths
);
/* Decides whether the interactive spawn confirmation applies to this execution context.
 * confirm_spawn_disabled() is consulted first; otherwise the answer is the negation of
 * context->same_pgrp. */
3884 static bool exec_context_shall_confirm_spawn(const ExecContext
*context
) {
3887 if (confirm_spawn_disabled())
3890 /* For some reason units remaining in the same process group
3891 * as PID 1 fail to acquire the console even if it's not used
3892 * by any process. So skip the confirmation question for them. */
3893 return !context
->same_pgrp
;
/* Resolves the named file descriptors configured for stdin, stdout and stderr.
 *
 * For each of the three stdio slots that is configured as EXEC_INPUT_NAMED_FD / EXEC_OUTPUT_NAMED_FD,
 * the entries of p->fd_names are compared against the name returned by exec_context_fdname() for that
 * slot, and on a match the corresponding element of p->fds is stored in named_iofds[].
 *
 * Returns 0 if 'targets' is 0 at the end, -ENOENT otherwise. */
3896 static int exec_context_named_iofds(
3897 const ExecContext
*c
,
3898 const ExecParameters
*p
,
3899 int named_iofds
[static 3]) {
3902 const char* stdio_fdname
[3];
3907 assert(named_iofds
);
/* Number of stdio slots configured as named fds (each comparison contributes 0 or 1). */
3909 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
3910 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
3911 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
/* Fetch the configured name for each stdio index 0..2. */
3913 for (size_t i
= 0; i
< 3; i
++)
3914 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
/* Only the storage fds plus the socket fds are searched. */
3916 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
/* The scan runs only while targets > 0. A slot is filled only while it is still negative, so the
 * first entry with a matching name is used for it. Because the branches form an else-if chain, one
 * entry of p->fds satisfies at most one stdio slot per iteration. */
3918 for (size_t i
= 0; i
< n_fds
&& targets
> 0; i
++)
3919 if (named_iofds
[STDIN_FILENO
] < 0 &&
3920 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
3921 stdio_fdname
[STDIN_FILENO
] &&
3922 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
3924 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
3927 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
3928 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
3929 stdio_fdname
[STDOUT_FILENO
] &&
3930 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
3932 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
3935 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
3936 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
3937 stdio_fdname
[STDERR_FILENO
] &&
3938 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
3940 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
3944 return targets
== 0 ? 0 : -ENOENT
;
/* Closes both socket pairs held by the shared runtime object: the network namespace storage socket
 * and the IPC namespace storage socket. */
3947 static void exec_shared_runtime_close(ExecSharedRuntime
*shared
) {
3951 safe_close_pair(shared
->netns_storage_socket
);
3952 safe_close_pair(shared
->ipcns_storage_socket
);
/* Closes the file descriptors reachable from an ExecRuntime: its ephemeral storage socket pair, the
 * sockets of the shared runtime (via exec_shared_runtime_close()), and whatever
 * dynamic_creds_close() releases for rt->dynamic_creds. */
3955 static void exec_runtime_close(ExecRuntime
*rt
) {
3959 safe_close_pair(rt
->ephemeral_storage_socket
);
3961 exec_shared_runtime_close(rt
->shared
);
3962 dynamic_creds_close(rt
->dynamic_creds
);
/* Closes the stdin/stdout/stderr file descriptors stored in the parameters. Each field is overwritten
 * with the return value of safe_close(), so the closed descriptor number is not kept around. */
3965 static void exec_params_close(ExecParameters
*p
) {
3969 p
->stdin_fd
= safe_close(p
->stdin_fd
);
3970 p
->stdout_fd
= safe_close(p
->stdout_fd
);
3971 p
->stderr_fd
= safe_close(p
->stderr_fd
);
/* Marks p->exec_fd as "hot" or "cold" by writing the value x to it.
 *
 * If write() fails, *reterr_exit_status is set to EXIT_EXEC (when the pointer is non-NULL) and the
 * errno-based error logged via log_exec_error_errno() is returned. */
3974 static int exec_fd_mark_hot(
3975 const ExecContext
*c
,
3978 int *reterr_exit_status
) {
3988 if (write(p
->exec_fd
, &x
, sizeof(x
)) < 0) {
/* The exit status out-parameter is optional. */
3989 if (reterr_exit_status
)
3990 *reterr_exit_status
= EXIT_EXEC
;
3991 return log_exec_error_errno(c
, p
, errno
, "Failed to mark exec_fd as %s: %m", hot
? "hot" : "cold");
/* Sends the current timestamp over p->handoff_timestamp_fd.
 *
 * A negative p->handoff_timestamp_fd is checked for before anything is sent. If send() fails,
 * *reterr_exit_status is set to EXIT_EXEC (when the pointer is non-NULL) and the errno-based error
 * logged via log_exec_error_errno() is returned. */
3997 static int send_handoff_timestamp(
3998 const ExecContext
*c
,
4000 int *reterr_exit_status
) {
4005 if (p
->handoff_timestamp_fd
< 0)
4009 dual_timestamp_now(&dt
);
/* The message is an array of two usec_t values: realtime first, monotonic second. */
4011 if (send(p
->handoff_timestamp_fd
, (const usec_t
[2]) { dt
.realtime
, dt
.monotonic
}, sizeof(usec_t
) * 2, 0) < 0) {
/* The exit status out-parameter is optional. */
4012 if (reterr_exit_status
)
4013 *reterr_exit_status
= EXIT_EXEC
;
4014 return log_exec_error_errno(c
, p
, errno
, "Failed to send handoff timestamp: %m");
4021 const ExecCommand
*command
,
4022 const ExecContext
*context
,
4023 ExecParameters
*params
,
4024 ExecRuntime
*runtime
,
4025 const CGroupContext
*cgroup_context
,
4028 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **joined_exec_search_path
= NULL
, **accum_env
= NULL
, **replaced_argv
= NULL
;
4030 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
4031 const char *username
= NULL
, *groupname
= NULL
;
4032 _cleanup_free_
char *home_buffer
= NULL
, *memory_pressure_path
= NULL
;
4033 const char *home
= NULL
, *shell
= NULL
;
4034 char **final_argv
= NULL
;
4035 dev_t journal_stream_dev
= 0;
4036 ino_t journal_stream_ino
= 0;
4037 bool userns_set_up
= false;
4038 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda) */
4039 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
4040 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
4041 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
4042 bool keep_seccomp_privileges
= false;
4044 _cleanup_free_
char *mac_selinux_context_net
= NULL
;
4045 bool use_selinux
= false;
4048 bool use_smack
= false;
4051 bool use_apparmor
= false;
4054 uint64_t saved_bset
= 0;
4056 uid_t saved_uid
= getuid();
4057 gid_t saved_gid
= getgid();
4058 uid_t uid
= UID_INVALID
;
4059 gid_t gid
= GID_INVALID
;
4060 size_t n_fds
, /* fds to pass to the child */
4061 n_keep_fds
; /* total number of fds not to close */
4063 _cleanup_free_ gid_t
*gids_after_pam
= NULL
;
4064 int ngids_after_pam
= 0;
4066 int socket_fd
= -EBADF
, named_iofds
[3] = EBADF_TRIPLET
;
4067 size_t n_storage_fds
, n_socket_fds
;
4072 assert(exit_status
);
4074 /* This should be mostly redundant, as the log level is also passed as an argument of the executor,
4075 * and is already applied earlier. Just for safety. */
4076 if (context
->log_level_max
>= 0)
4077 log_set_max_level(context
->log_level_max
);
4079 /* Explicitly test for CVE-2021-4034 inspired invocations */
4080 if (!command
->path
|| strv_isempty(command
->argv
)) {
4081 *exit_status
= EXIT_EXEC
;
4082 return log_exec_error_errno(
4085 SYNTHETIC_ERRNO(EINVAL
),
4086 "Invalid command line arguments.");
4089 LOG_CONTEXT_PUSH_EXEC(context
, params
);
4091 if (context
->std_input
== EXEC_INPUT_SOCKET
||
4092 context
->std_output
== EXEC_OUTPUT_SOCKET
||
4093 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
4095 if (params
->n_socket_fds
> 1)
4096 return log_exec_error_errno(context
, params
, SYNTHETIC_ERRNO(EINVAL
), "Got more than one socket.");
4098 if (params
->n_socket_fds
== 0)
4099 return log_exec_error_errno(context
, params
, SYNTHETIC_ERRNO(EINVAL
), "Got no socket.");
4101 socket_fd
= params
->fds
[0];
4102 n_storage_fds
= n_socket_fds
= 0;
4104 n_socket_fds
= params
->n_socket_fds
;
4105 n_storage_fds
= params
->n_storage_fds
;
4107 n_fds
= n_socket_fds
+ n_storage_fds
;
4109 r
= exec_context_named_iofds(context
, params
, named_iofds
);
4111 return log_exec_error_errno(context
, params
, r
, "Failed to load a named file descriptor: %m");
4113 rename_process_from_path(command
->path
);
4115 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4116 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4117 * both of which will be demoted to SIG_DFL. */
4118 (void) default_signals(SIGNALS_CRASH_HANDLER
,
4121 if (context
->ignore_sigpipe
)
4122 (void) ignore_signals(SIGPIPE
);
4124 r
= reset_signal_mask();
4126 *exit_status
= EXIT_SIGNAL_MASK
;
4127 return log_exec_error_errno(context
, params
, r
, "Failed to set process signal mask: %m");
4130 if (params
->idle_pipe
)
4131 do_idle_pipe_dance(params
->idle_pipe
);
4133 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4134 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4135 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4136 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4139 log_set_open_when_needed(true);
4140 log_settle_target();
4142 /* In case anything used libc syslog(), close this here, too */
4145 r
= collect_open_file_fds(context
, params
, &n_fds
);
4147 *exit_status
= EXIT_FDS
;
4148 return log_exec_error_errno(context
, params
, r
, "Failed to get OpenFile= file descriptors: %m");
4151 int keep_fds
[n_fds
+ 4];
4152 memcpy_safe(keep_fds
, params
->fds
, n_fds
* sizeof(int));
4155 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, ¶ms
->exec_fd
);
4157 *exit_status
= EXIT_FDS
;
4158 return log_exec_error_errno(context
, params
, r
, "Failed to collect shifted fd: %m");
4161 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, ¶ms
->handoff_timestamp_fd
);
4163 *exit_status
= EXIT_FDS
;
4164 return log_exec_error_errno(context
, params
, r
, "Failed to collect shifted fd: %m");
4168 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, ¶ms
->bpf_restrict_fs_map_fd
);
4170 *exit_status
= EXIT_FDS
;
4171 return log_exec_error_errno(context
, params
, r
, "Failed to collect shifted fd: %m");
4175 r
= close_remaining_fds(params
, runtime
, socket_fd
, keep_fds
, n_keep_fds
);
4177 *exit_status
= EXIT_FDS
;
4178 return log_exec_error_errno(context
, params
, r
, "Failed to close unwanted file descriptors: %m");
4181 if (!context
->same_pgrp
&&
4183 *exit_status
= EXIT_SETSID
;
4184 return log_exec_error_errno(context
, params
, errno
, "Failed to create new process session: %m");
4187 exec_context_tty_reset(context
, params
);
4189 if (params
->shall_confirm_spawn
&& exec_context_shall_confirm_spawn(context
)) {
4190 _cleanup_free_
char *cmdline
= NULL
;
4192 cmdline
= quote_command_line(command
->argv
, SHELL_ESCAPE_EMPTY
);
4194 *exit_status
= EXIT_MEMORY
;
4198 r
= ask_for_confirmation(context
, params
, cmdline
);
4199 if (r
!= CONFIRM_EXECUTE
) {
4200 if (r
== CONFIRM_PRETEND_SUCCESS
) {
4201 *exit_status
= EXIT_SUCCESS
;
4205 *exit_status
= EXIT_CONFIRM
;
4206 return log_exec_error_errno(context
, params
, SYNTHETIC_ERRNO(ECANCELED
),
4207 "Execution cancelled by the user.");
4211 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4212 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4213 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4214 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4215 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4216 if (setenv("SYSTEMD_ACTIVATION_UNIT", params
->unit_id
, true) != 0 ||
4217 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params
->runtime_scope
), true) != 0) {
4218 *exit_status
= EXIT_MEMORY
;
4219 return log_exec_error_errno(context
, params
, errno
, "Failed to update environment: %m");
4222 if (context
->dynamic_user
&& runtime
&& runtime
->dynamic_creds
) {
4223 _cleanup_strv_free_
char **suggested_paths
= NULL
;
4225 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4226 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4227 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4228 *exit_status
= EXIT_USER
;
4229 return log_exec_error_errno(context
, params
, errno
, "Failed to update environment: %m");
4232 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
4234 *exit_status
= EXIT_MEMORY
;
4238 r
= dynamic_creds_realize(runtime
->dynamic_creds
, suggested_paths
, &uid
, &gid
);
4240 *exit_status
= EXIT_USER
;
4242 return log_exec_error_errno(context
, params
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
4243 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4244 return log_exec_error_errno(context
, params
, r
, "Failed to update dynamic user credentials: %m");
4247 if (!uid_is_valid(uid
)) {
4248 *exit_status
= EXIT_USER
;
4249 return log_exec_error_errno(context
, params
, SYNTHETIC_ERRNO(ESRCH
), "UID validation failed for \""UID_FMT
"\".", uid
);
4252 if (!gid_is_valid(gid
)) {
4253 *exit_status
= EXIT_USER
;
4254 return log_exec_error_errno(context
, params
, SYNTHETIC_ERRNO(ESRCH
), "GID validation failed for \""GID_FMT
"\".", gid
);
4257 if (runtime
->dynamic_creds
->user
)
4258 username
= runtime
->dynamic_creds
->user
->name
;
4261 if (context
->user
) {
4262 r
= get_fixed_user(context
->user
, &username
, &uid
, &gid
, &home
, &shell
);
4264 *exit_status
= EXIT_USER
;
4265 return log_exec_error_errno(context
, params
, r
, "Failed to determine user credentials: %m");
4269 if (context
->group
) {
4270 r
= get_fixed_group(context
->group
, &groupname
, &gid
);
4272 *exit_status
= EXIT_GROUP
;
4273 return log_exec_error_errno(context
, params
, r
, "Failed to determine group credentials: %m");
4278 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4279 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
4280 &supplementary_gids
, &ngids
);
4282 *exit_status
= EXIT_GROUP
;
4283 return log_exec_error_errno(context
, params
, r
, "Failed to determine supplementary groups: %m");
4286 r
= send_user_lookup(params
->unit_id
, params
->user_lookup_fd
, uid
, gid
);
4288 *exit_status
= EXIT_USER
;
4289 return log_exec_error_errno(context
, params
, r
, "Failed to send user credentials to PID1: %m");
4292 params
->user_lookup_fd
= safe_close(params
->user_lookup_fd
);
4294 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
4296 *exit_status
= EXIT_CHDIR
;
4297 return log_exec_error_errno(context
, params
, r
, "Failed to determine $HOME for user: %m");
4300 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4302 (void) fd_nonblock(socket_fd
, false);
4304 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4305 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4306 if (params
->cgroup_path
) {
4307 _cleanup_free_
char *p
= NULL
;
4309 r
= exec_params_get_cgroup_path(params
, cgroup_context
, &p
);
4311 *exit_status
= EXIT_CGROUP
;
4312 return log_exec_error_errno(context
, params
, r
, "Failed to acquire cgroup path: %m");
4315 r
= cg_attach_everywhere(params
->cgroup_supported
, p
, 0, NULL
, NULL
);
4316 if (r
== -EUCLEAN
) {
4317 *exit_status
= EXIT_CGROUP
;
4318 return log_exec_error_errno(context
, params
, r
,
4319 "Failed to attach process to cgroup '%s', "
4320 "because the cgroup or one of its parents or "
4321 "siblings is in the threaded mode.", p
);
4324 *exit_status
= EXIT_CGROUP
;
4325 return log_exec_error_errno(context
, params
, r
, "Failed to attach to cgroup %s: %m", p
);
4329 if (context
->network_namespace_path
&& runtime
&& runtime
->shared
&& runtime
->shared
->netns_storage_socket
[0] >= 0) {
4330 r
= open_shareable_ns_path(runtime
->shared
->netns_storage_socket
, context
->network_namespace_path
, CLONE_NEWNET
);
4332 *exit_status
= EXIT_NETWORK
;
4333 return log_exec_error_errno(context
, params
, r
, "Failed to open network namespace path %s: %m", context
->network_namespace_path
);
4337 if (context
->ipc_namespace_path
&& runtime
&& runtime
->shared
&& runtime
->shared
->ipcns_storage_socket
[0] >= 0) {
4338 r
= open_shareable_ns_path(runtime
->shared
->ipcns_storage_socket
, context
->ipc_namespace_path
, CLONE_NEWIPC
);
4340 *exit_status
= EXIT_NAMESPACE
;
4341 return log_exec_error_errno(context
, params
, r
, "Failed to open IPC namespace path %s: %m", context
->ipc_namespace_path
);
4345 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
4347 *exit_status
= EXIT_STDIN
;
4348 return log_exec_error_errno(context
, params
, r
, "Failed to set up standard input: %m");
4351 _cleanup_free_
char *fname
= NULL
;
4352 r
= path_extract_filename(command
->path
, &fname
);
4354 *exit_status
= EXIT_STDOUT
;
4355 return log_exec_error_errno(context
, params
, r
, "Failed to extract filename from path %s: %m", command
->path
);
4358 r
= setup_output(context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, fname
, uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
4360 *exit_status
= EXIT_STDOUT
;
4361 return log_exec_error_errno(context
, params
, r
, "Failed to set up standard output: %m");
4364 r
= setup_output(context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, fname
, uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
4366 *exit_status
= EXIT_STDERR
;
4367 return log_exec_error_errno(context
, params
, r
, "Failed to set up standard error output: %m");
4370 if (context
->oom_score_adjust_set
) {
4371 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4372 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4373 r
= set_oom_score_adjust(context
->oom_score_adjust
);
4374 if (ERRNO_IS_NEG_PRIVILEGE(r
))
4375 log_exec_debug_errno(context
, params
, r
,
4376 "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4378 *exit_status
= EXIT_OOM_ADJUST
;
4379 return log_exec_error_errno(context
, params
, r
, "Failed to adjust OOM setting: %m");
4383 if (context
->coredump_filter_set
) {
4384 r
= set_coredump_filter(context
->coredump_filter
);
4385 if (ERRNO_IS_NEG_PRIVILEGE(r
))
4386 log_exec_debug_errno(context
, params
, r
, "Failed to adjust coredump_filter, ignoring: %m");
4388 *exit_status
= EXIT_LIMITS
;
4389 return log_exec_error_errno(context
, params
, r
, "Failed to adjust coredump_filter: %m");
4393 if (context
->nice_set
) {
4394 r
= setpriority_closest(context
->nice
);
4396 *exit_status
= EXIT_NICE
;
4397 return log_exec_error_errno(context
, params
, r
, "Failed to set up process scheduling priority (nice level): %m");
4401 if (context
->cpu_sched_set
) {
4402 struct sched_param param
= {
4403 .sched_priority
= context
->cpu_sched_priority
,
4406 r
= sched_setscheduler(0,
4407 context
->cpu_sched_policy
|
4408 (context
->cpu_sched_reset_on_fork
?
4409 SCHED_RESET_ON_FORK
: 0),
4412 *exit_status
= EXIT_SETSCHEDULER
;
4413 return log_exec_error_errno(context
, params
, errno
, "Failed to set up CPU scheduling: %m");
4417 if (context
->cpu_affinity_from_numa
|| context
->cpu_set
.set
) {
4418 _cleanup_(cpu_set_reset
) CPUSet converted_cpu_set
= {};
4419 const CPUSet
*cpu_set
;
4421 if (context
->cpu_affinity_from_numa
) {
4422 r
= exec_context_cpu_affinity_from_numa(context
, &converted_cpu_set
);
4424 *exit_status
= EXIT_CPUAFFINITY
;
4425 return log_exec_error_errno(context
, params
, r
, "Failed to derive CPU affinity mask from NUMA mask: %m");
4428 cpu_set
= &converted_cpu_set
;
4430 cpu_set
= &context
->cpu_set
;
4432 if (sched_setaffinity(0, cpu_set
->allocated
, cpu_set
->set
) < 0) {
4433 *exit_status
= EXIT_CPUAFFINITY
;
4434 return log_exec_error_errno(context
, params
, errno
, "Failed to set up CPU affinity: %m");
4438 if (mpol_is_valid(numa_policy_get_type(&context
->numa_policy
))) {
4439 r
= apply_numa_policy(&context
->numa_policy
);
4440 if (ERRNO_IS_NEG_NOT_SUPPORTED(r
))
4441 log_exec_debug_errno(context
, params
, r
, "NUMA support not available, ignoring.");
4443 *exit_status
= EXIT_NUMA_POLICY
;
4444 return log_exec_error_errno(context
, params
, r
, "Failed to set NUMA memory policy: %m");
4448 if (context
->ioprio_set
)
4449 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
4450 *exit_status
= EXIT_IOPRIO
;
4451 return log_exec_error_errno(context
, params
, errno
, "Failed to set up IO scheduling priority: %m");
4454 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
4455 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
4456 *exit_status
= EXIT_TIMERSLACK
;
4457 return log_exec_error_errno(context
, params
, errno
, "Failed to set up timer slack: %m");
4460 if (context
->personality
!= PERSONALITY_INVALID
) {
4461 r
= safe_personality(context
->personality
);
4463 *exit_status
= EXIT_PERSONALITY
;
4464 return log_exec_error_errno(context
, params
, r
, "Failed to set up execution domain (personality): %m");
4469 if (context
->utmp_id
) {
4470 _cleanup_free_
char *username_alloc
= NULL
;
4472 if (!username
&& context
->utmp_mode
== EXEC_UTMP_USER
) {
4473 username_alloc
= uid_to_name(uid_is_valid(uid
) ? uid
: saved_uid
);
4474 if (!username_alloc
) {
4475 *exit_status
= EXIT_USER
;
4480 const char *line
= context
->tty_path
?
4481 (path_startswith(context
->tty_path
, "/dev/") ?: context
->tty_path
) :
4483 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
4485 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
4486 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
4488 username
?: username_alloc
);
4492 if (uid_is_valid(uid
)) {
4493 r
= chown_terminal(STDIN_FILENO
, uid
);
4495 *exit_status
= EXIT_STDIN
;
4496 return log_exec_error_errno(context
, params
, r
, "Failed to change ownership of terminal: %m");
4500 if (params
->cgroup_path
) {
4501 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4502 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4503 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4504 * touch a single hierarchy too. */
4506 if (params
->flags
& EXEC_CGROUP_DELEGATE
) {
4507 _cleanup_free_
char *p
= NULL
;
4509 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, uid
, gid
);
4511 *exit_status
= EXIT_CGROUP
;
4512 return log_exec_error_errno(context
, params
, r
, "Failed to adjust control group access: %m");
4515 r
= exec_params_get_cgroup_path(params
, cgroup_context
, &p
);
4517 *exit_status
= EXIT_CGROUP
;
4518 return log_exec_error_errno(context
, params
, r
, "Failed to acquire cgroup path: %m");
4521 r
= cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER
, p
, uid
, gid
);
4523 *exit_status
= EXIT_CGROUP
;
4524 return log_exec_error_errno(context
, params
, r
, "Failed to adjust control subgroup access: %m");
4529 if (cgroup_context
&& cg_unified() > 0 && is_pressure_supported() > 0) {
4530 if (cgroup_context_want_memory_pressure(cgroup_context
)) {
4531 r
= cg_get_path("memory", params
->cgroup_path
, "memory.pressure", &memory_pressure_path
);
4533 *exit_status
= EXIT_MEMORY
;
4537 r
= chmod_and_chown(memory_pressure_path
, 0644, uid
, gid
);
4539 log_exec_full_errno(context
, params
, r
== -ENOENT
|| ERRNO_IS_PRIVILEGE(r
) ? LOG_DEBUG
: LOG_WARNING
, r
,
4540 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path
);
4541 memory_pressure_path
= mfree(memory_pressure_path
);
4543 } else if (cgroup_context
->memory_pressure_watch
== CGROUP_PRESSURE_WATCH_OFF
) {
4544 memory_pressure_path
= strdup("/dev/null"); /* /dev/null is explicit indicator for turning off memory pressure watch */
4545 if (!memory_pressure_path
) {
4546 *exit_status
= EXIT_MEMORY
;
4553 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
4555 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
4556 r
= setup_exec_directory(context
, params
, uid
, gid
, dt
, needs_mount_namespace
, exit_status
);
4558 return log_exec_error_errno(context
, params
, r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
4561 r
= exec_setup_credentials(context
, params
, params
->unit_id
, uid
, gid
);
4563 *exit_status
= EXIT_CREDENTIALS
;
4564 return log_exec_error_errno(context
, params
, r
, "Failed to set up credentials: %m");
4567 r
= build_environment(
4577 memory_pressure_path
,
4580 *exit_status
= EXIT_MEMORY
;
4584 r
= build_pass_environment(context
, &pass_env
);
4586 *exit_status
= EXIT_MEMORY
;
4590 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4591 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4592 * not specify PATH but the unit has ExecSearchPath. */
4593 if (!strv_isempty(context
->exec_search_path
)) {
4594 _cleanup_free_
char *joined
= NULL
;
4596 joined
= strv_join(context
->exec_search_path
, ":");
4598 *exit_status
= EXIT_MEMORY
;
4602 r
= strv_env_assign(&joined_exec_search_path
, "PATH", joined
);
4604 *exit_status
= EXIT_MEMORY
;
4609 accum_env
= strv_env_merge(params
->environment
,
4611 joined_exec_search_path
,
4613 context
->environment
,
4616 *exit_status
= EXIT_MEMORY
;
4619 accum_env
= strv_env_clean(accum_env
);
4621 (void) umask(context
->umask
);
4623 r
= setup_keyring(context
, params
, uid
, gid
);
4625 *exit_status
= EXIT_KEYRING
;
4626 return log_exec_error_errno(context
, params
, r
, "Failed to set up kernel keyring: %m");
4629 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4631 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
4633 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4634 * for it, and the kernel doesn't actually support ambient caps. */
4635 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
4637 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4638 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4640 if (needs_ambient_hack
)
4641 needs_setuid
= false;
4643 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
4645 uint64_t capability_ambient_set
= context
->capability_ambient_set
;
4647 if (needs_sandboxing
) {
4648 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4649 * /sys being present. The actual MAC context application will happen later, as late as
4650 * possible, to avoid impacting our own code paths. */
4653 use_selinux
= mac_selinux_use();
4656 use_smack
= mac_smack_use();
4659 use_apparmor
= mac_apparmor_use();
4663 if (needs_sandboxing
) {
4666 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4667 * is set here. (See below.) */
4669 r
= setrlimit_closest_all((const struct rlimit
* const *) context
->rlimit
, &which_failed
);
4671 *exit_status
= EXIT_LIMITS
;
4672 return log_exec_error_errno(context
, params
, r
, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed
));
4676 if (needs_setuid
&& context
->pam_name
&& username
) {
4677 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4678 * wins here. (See above.) */
4680 /* All fds passed in the fds array will be closed in the pam child process. */
4681 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, params
->fds
, n_fds
, params
->exec_fd
);
4683 *exit_status
= EXIT_PAM
;
4684 return log_exec_error_errno(context
, params
, r
, "Failed to set up PAM session: %m");
4687 if (ambient_capabilities_supported()) {
4688 uint64_t ambient_after_pam
;
4690 /* PAM modules might have set some ambient caps. Query them here and merge them into
4691 * the caps we want to set in the end, so that we don't end up unsetting them. */
4692 r
= capability_get_ambient(&ambient_after_pam
);
4694 *exit_status
= EXIT_CAPABILITIES
;
4695 return log_exec_error_errno(context
, params
, r
, "Failed to query ambient caps: %m");
4698 capability_ambient_set
|= ambient_after_pam
;
4701 ngids_after_pam
= getgroups_alloc(&gids_after_pam
);
4702 if (ngids_after_pam
< 0) {
4703 *exit_status
= EXIT_GROUP
;
4704 return log_exec_error_errno(context
, params
, ngids_after_pam
, "Failed to obtain groups after setting up PAM: %m");
4708 if (needs_sandboxing
&& exec_context_need_unprivileged_private_users(context
, params
)) {
4709 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4710 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4711 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4713 r
= setup_private_users(saved_uid
, saved_gid
, uid
, gid
);
4714 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4715 * the actual requested operations fail (or silently continue). */
4716 if (r
< 0 && context
->private_users
) {
4717 *exit_status
= EXIT_USER
;
4718 return log_exec_error_errno(context
, params
, r
, "Failed to set up user namespacing for unprivileged user: %m");
4721 log_exec_info_errno(context
, params
, r
, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4723 userns_set_up
= true;
4726 if (exec_needs_network_namespace(context
) && runtime
&& runtime
->shared
&& runtime
->shared
->netns_storage_socket
[0] >= 0) {
4728 /* Try to enable network namespacing if network namespacing is available and we have
4729 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4730 * new network namespace. And if we don't have that, then we could only create a network
4731 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4732 if (ns_type_supported(NAMESPACE_NET
) && have_effective_cap(CAP_NET_ADMIN
) > 0) {
4733 r
= setup_shareable_ns(runtime
->shared
->netns_storage_socket
, CLONE_NEWNET
);
4734 if (ERRNO_IS_NEG_PRIVILEGE(r
))
4735 log_exec_notice_errno(context
, params
, r
,
4736 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4738 *exit_status
= EXIT_NETWORK
;
4739 return log_exec_error_errno(context
, params
, r
, "Failed to set up network namespacing: %m");
4741 } else if (context
->network_namespace_path
) {
4742 *exit_status
= EXIT_NETWORK
;
4743 return log_exec_error_errno(context
, params
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
4744 "NetworkNamespacePath= is not supported, refusing.");
4746 log_exec_notice(context
, params
, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4749 if (exec_needs_ipc_namespace(context
) && runtime
&& runtime
->shared
&& runtime
->shared
->ipcns_storage_socket
[0] >= 0) {
4751 if (ns_type_supported(NAMESPACE_IPC
)) {
4752 r
= setup_shareable_ns(runtime
->shared
->ipcns_storage_socket
, CLONE_NEWIPC
);
4754 log_exec_warning_errno(context
, params
, r
,
4755 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4757 *exit_status
= EXIT_NAMESPACE
;
4758 return log_exec_error_errno(context
, params
, r
, "Failed to set up IPC namespacing: %m");
4760 } else if (context
->ipc_namespace_path
) {
4761 *exit_status
= EXIT_NAMESPACE
;
4762 return log_exec_error_errno(context
, params
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
4763 "IPCNamespacePath= is not supported, refusing.");
4765 log_exec_warning(context
, params
, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4768 if (needs_mount_namespace
) {
4769 _cleanup_free_
char *error_path
= NULL
;
4771 r
= apply_mount_namespace(command
->flags
,
4775 memory_pressure_path
,
4779 *exit_status
= EXIT_NAMESPACE
;
4780 return log_exec_error_errno(context
, params
, r
, "Failed to set up mount namespacing%s%s: %m",
4781 error_path
? ": " : "", strempty(error_path
));
4785 if (needs_sandboxing
) {
4786 r
= apply_protect_hostname(context
, params
, exit_status
);
4791 if (context
->memory_ksm
>= 0)
4792 if (prctl(PR_SET_MEMORY_MERGE
, context
->memory_ksm
, 0, 0, 0) < 0) {
4793 if (ERRNO_IS_NOT_SUPPORTED(errno
))
4794 log_exec_debug_errno(context
,
4797 "KSM support not available, ignoring.");
4799 *exit_status
= EXIT_KSM
;
4800 return log_exec_error_errno(context
, params
, errno
, "Failed to set KSM: %m");
4804 /* Drop groups as early as possible.
4805 * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
4806 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4808 _cleanup_free_ gid_t
*gids_to_enforce
= NULL
;
4809 int ngids_to_enforce
= 0;
4811 ngids_to_enforce
= merge_gid_lists(supplementary_gids
,
4816 if (ngids_to_enforce
< 0) {
4817 *exit_status
= EXIT_GROUP
;
4818 return log_exec_error_errno(context
, params
,
4820 "Failed to merge group lists. Group membership might be incorrect: %m");
4823 r
= enforce_groups(gid
, gids_to_enforce
, ngids_to_enforce
);
4825 *exit_status
= EXIT_GROUP
;
4826 return log_exec_error_errno(context
, params
, r
, "Changing group credentials failed: %m");
4830 /* If the user namespace was not set up above, try to do it now.
4831 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4832 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4833 * case of mount namespaces being less privileged when the mount point list is copied from a
4834 * different user namespace). */
4836 if (needs_sandboxing
&& context
->private_users
&& !userns_set_up
) {
4837 r
= setup_private_users(saved_uid
, saved_gid
, uid
, gid
);
4839 *exit_status
= EXIT_USER
;
4840 return log_exec_error_errno(context
, params
, r
, "Failed to set up user namespacing: %m");
4844 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4847 _cleanup_free_
char *executable
= NULL
;
4848 _cleanup_close_
int executable_fd
= -EBADF
;
4849 r
= find_executable_full(command
->path
, /* root= */ NULL
, context
->exec_search_path
, false, &executable
, &executable_fd
);
4851 *exit_status
= EXIT_EXEC
;
4852 log_exec_struct_errno(context
, params
, LOG_NOTICE
, r
,
4853 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
4854 LOG_EXEC_MESSAGE(params
,
4855 "Unable to locate executable '%s': %m",
4857 "EXECUTABLE=%s", command
->path
);
4858 /* If the error will be ignored by manager, tune down the log level here. Missing executable
4859 * is very much expected in this case. */
4860 return r
!= -ENOMEM
&& FLAGS_SET(command
->flags
, EXEC_COMMAND_IGNORE_FAILURE
) ? 1 : r
;
4863 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, &executable_fd
);
4865 *exit_status
= EXIT_FDS
;
4866 return log_exec_error_errno(context
, params
, r
, "Failed to collect shifted fd: %m");
4870 if (needs_sandboxing
&& use_selinux
&& params
->selinux_context_net
) {
4875 else if (params
->n_socket_fds
== 1)
4876 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4877 * use context from that fd to compute the label. */
4878 fd
= params
->fds
[0];
4881 r
= mac_selinux_get_child_mls_label(fd
, executable
, context
->selinux_context
, &mac_selinux_context_net
);
4883 if (!context
->selinux_context_ignore
) {
4884 *exit_status
= EXIT_SELINUX_CONTEXT
;
4885 return log_exec_error_errno(context
,
4888 "Failed to determine SELinux context: %m");
4890 log_exec_debug_errno(context
,
4893 "Failed to determine SELinux context, ignoring: %m");
4899 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4900 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4901 * more. We do keep exec_fd and handoff_timestamp_fd however, if we have it, since we need to keep
4902 * them open until the final execve(). But first, close the remaining sockets in the context
4905 exec_runtime_close(runtime
);
4906 exec_params_close(params
);
4908 r
= close_all_fds(keep_fds
, n_keep_fds
);
4910 r
= pack_fds(params
->fds
, n_fds
);
4912 r
= flag_fds(params
->fds
, n_socket_fds
, n_fds
, context
->non_blocking
);
4914 *exit_status
= EXIT_FDS
;
4915 return log_exec_error_errno(context
, params
, r
, "Failed to adjust passed file descriptors: %m");
4918 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4919 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4920 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4923 secure_bits
= context
->secure_bits
;
4925 if (needs_sandboxing
) {
4928 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4929 * (Note this is placed after the general resource limit initialization, see above, in order
4930 * to take precedence.) */
4931 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
4932 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
4933 *exit_status
= EXIT_LIMITS
;
4934 return log_exec_error_errno(context
, params
, errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4939 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4940 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4941 if (use_smack
&& context
->smack_process_label
) {
4942 r
= setup_smack(params
, context
, executable_fd
);
4943 if (r
< 0 && !context
->smack_process_label_ignore
) {
4944 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
4945 return log_exec_error_errno(context
, params
, r
, "Failed to set SMACK process label: %m");
4950 bset
= context
->capability_bounding_set
;
4951 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4952 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4953 * instead of us doing that */
4954 if (needs_ambient_hack
)
4955 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
4956 (UINT64_C(1) << CAP_SETUID
) |
4957 (UINT64_C(1) << CAP_SETGID
);
4960 /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
4961 * keep the needed privileges to apply it even if we're not root. */
4963 uid_is_valid(uid
) &&
4964 context_has_seccomp(context
) &&
4965 seccomp_allows_drop_privileges(context
)) {
4966 keep_seccomp_privileges
= true;
4968 if (prctl(PR_SET_KEEPCAPS
, 1) < 0) {
4969 *exit_status
= EXIT_USER
;
4970 return log_exec_error_errno(context
, params
, errno
, "Failed to enable keep capabilities flag: %m");
4973 /* Save the current bounding set so we can restore it after applying the seccomp
4976 bset
|= (UINT64_C(1) << CAP_SYS_ADMIN
) |
4977 (UINT64_C(1) << CAP_SETPCAP
);
4981 if (!cap_test_all(bset
)) {
4982 r
= capability_bounding_set_drop(bset
, /* right_now= */ false);
4984 *exit_status
= EXIT_CAPABILITIES
;
4985 return log_exec_error_errno(context
, params
, r
, "Failed to drop capabilities: %m");
4989 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4992 * To be able to raise the ambient capabilities after setresuid() they have to be added to
4993 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
4994 * the ambient capabilities can be raised as they are present in the permitted and
4995 * inheritable set. However it is possible that someone wants to set ambient capabilities
4996 * without changing the user, so we also set the ambient capabilities here.
4998 * The requested ambient capabilities are raised in the inheritable set if the second
4999 * argument is true. */
5000 if (!needs_ambient_hack
) {
5001 r
= capability_ambient_set_apply(capability_ambient_set
, /* also_inherit= */ true);
5003 *exit_status
= EXIT_CAPABILITIES
;
5004 return log_exec_error_errno(context
, params
, r
, "Failed to apply ambient capabilities (before UID change): %m");
5009 /* chroot to root directory first, before we lose the ability to chroot */
5010 r
= apply_root_directory(context
, params
, runtime
, needs_mount_namespace
, exit_status
);
5012 return log_exec_error_errno(context
, params
, r
, "Chrooting to the requested root directory failed: %m");
5015 if (uid_is_valid(uid
)) {
5016 r
= enforce_user(context
, uid
, capability_ambient_set
);
5018 *exit_status
= EXIT_USER
;
5019 return log_exec_error_errno(context
, params
, r
, "Failed to change UID to " UID_FMT
": %m", uid
);
5022 if (keep_seccomp_privileges
) {
5023 if (!FLAGS_SET(capability_ambient_set
, (UINT64_C(1) << CAP_SETUID
))) {
5024 r
= drop_capability(CAP_SETUID
);
5026 *exit_status
= EXIT_USER
;
5027 return log_exec_error_errno(context
, params
, r
, "Failed to drop CAP_SETUID: %m");
5031 r
= keep_capability(CAP_SYS_ADMIN
);
5033 *exit_status
= EXIT_USER
;
5034 return log_exec_error_errno(context
, params
, r
, "Failed to keep CAP_SYS_ADMIN: %m");
5037 r
= keep_capability(CAP_SETPCAP
);
5039 *exit_status
= EXIT_USER
;
5040 return log_exec_error_errno(context
, params
, r
, "Failed to keep CAP_SETPCAP: %m");
5044 if (!needs_ambient_hack
&& capability_ambient_set
!= 0) {
5046 /* Raise the ambient capabilities after user change. */
5047 r
= capability_ambient_set_apply(capability_ambient_set
, /* also_inherit= */ false);
5049 *exit_status
= EXIT_CAPABILITIES
;
5050 return log_exec_error_errno(context
, params
, r
, "Failed to apply ambient capabilities (after UID change): %m");
5056 /* Apply working directory here, because the working directory might be on NFS and only the user
5057 * running this service might have the correct privilege to change to the working directory. Also, it
5058 * is absolutely 💣 crucial 💣 we applied all mount namespacing rearrangements before this, so that
5059 * the cwd cannot be used to pin directories outside of the sandbox. */
5060 r
= apply_working_directory(context
, params
, runtime
, home
, exit_status
);
5062 return log_exec_error_errno(context
, params
, r
, "Changing to the requested working directory failed: %m");
5064 if (needs_sandboxing
) {
5065 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5066 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5067 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5068 * are restricted. */
5072 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
5075 r
= setexeccon(exec_context
);
5077 if (!context
->selinux_context_ignore
) {
5078 *exit_status
= EXIT_SELINUX_CONTEXT
;
5079 return log_exec_error_errno(context
, params
, r
, "Failed to change SELinux context to %s: %m", exec_context
);
5081 log_exec_debug_errno(context
,
5084 "Failed to change SELinux context to %s, ignoring: %m",
5092 if (use_apparmor
&& context
->apparmor_profile
) {
5093 r
= aa_change_onexec(context
->apparmor_profile
);
5094 if (r
< 0 && !context
->apparmor_profile_ignore
) {
5095 *exit_status
= EXIT_APPARMOR_PROFILE
;
5096 return log_exec_error_errno(context
,
5099 "Failed to prepare AppArmor profile change to %s: %m",
5100 context
->apparmor_profile
);
5105 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5106 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5107 * requires CAP_SETPCAP. */
5108 if (prctl(PR_GET_SECUREBITS
) != secure_bits
) {
5109 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5110 * effective set here.
5112 * The effective set is overwritten during execve() with the following values:
5114 * - ambient set (for non-root processes)
5116 * - (inheritable | bounding) set (for root processes)
5118 * Hence there is no security impact to raise it in the effective set before execve
5120 r
= capability_gain_cap_setpcap(/* return_caps= */ NULL
);
5122 *exit_status
= EXIT_CAPABILITIES
;
5123 return log_exec_error_errno(context
, params
, r
, "Failed to gain CAP_SETPCAP for setting secure bits");
5125 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
5126 *exit_status
= EXIT_SECUREBITS
;
5127 return log_exec_error_errno(context
, params
, errno
, "Failed to set process secure bits: %m");
5131 if (context_has_no_new_privileges(context
))
5132 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
5133 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
5134 return log_exec_error_errno(context
, params
, errno
, "Failed to disable new privileges: %m");
5138 r
= apply_address_families(context
, params
);
5140 *exit_status
= EXIT_ADDRESS_FAMILIES
;
5141 return log_exec_error_errno(context
, params
, r
, "Failed to restrict address families: %m");
5144 r
= apply_memory_deny_write_execute(context
, params
);
5146 *exit_status
= EXIT_SECCOMP
;
5147 return log_exec_error_errno(context
, params
, r
, "Failed to disable writing to executable memory: %m");
5150 r
= apply_restrict_realtime(context
, params
);
5152 *exit_status
= EXIT_SECCOMP
;
5153 return log_exec_error_errno(context
, params
, r
, "Failed to apply realtime restrictions: %m");
5156 r
= apply_restrict_suid_sgid(context
, params
);
5158 *exit_status
= EXIT_SECCOMP
;
5159 return log_exec_error_errno(context
, params
, r
, "Failed to apply SUID/SGID restrictions: %m");
5162 r
= apply_restrict_namespaces(context
, params
);
5164 *exit_status
= EXIT_SECCOMP
;
5165 return log_exec_error_errno(context
, params
, r
, "Failed to apply namespace restrictions: %m");
5168 r
= apply_protect_sysctl(context
, params
);
5170 *exit_status
= EXIT_SECCOMP
;
5171 return log_exec_error_errno(context
, params
, r
, "Failed to apply sysctl restrictions: %m");
5174 r
= apply_protect_kernel_modules(context
, params
);
5176 *exit_status
= EXIT_SECCOMP
;
5177 return log_exec_error_errno(context
, params
, r
, "Failed to apply module loading restrictions: %m");
5180 r
= apply_protect_kernel_logs(context
, params
);
5182 *exit_status
= EXIT_SECCOMP
;
5183 return log_exec_error_errno(context
, params
, r
, "Failed to apply kernel log restrictions: %m");
5186 r
= apply_protect_clock(context
, params
);
5188 *exit_status
= EXIT_SECCOMP
;
5189 return log_exec_error_errno(context
, params
, r
, "Failed to apply clock restrictions: %m");
5192 r
= apply_private_devices(context
, params
);
5194 *exit_status
= EXIT_SECCOMP
;
5195 return log_exec_error_errno(context
, params
, r
, "Failed to set up private devices: %m");
5198 r
= apply_syscall_archs(context
, params
);
5200 *exit_status
= EXIT_SECCOMP
;
5201 return log_exec_error_errno(context
, params
, r
, "Failed to apply syscall architecture restrictions: %m");
5204 r
= apply_lock_personality(context
, params
);
5206 *exit_status
= EXIT_SECCOMP
;
5207 return log_exec_error_errno(context
, params
, r
, "Failed to lock personalities: %m");
5210 r
= apply_syscall_log(context
, params
);
5212 *exit_status
= EXIT_SECCOMP
;
5213 return log_exec_error_errno(context
, params
, r
, "Failed to apply system call log filters: %m");
5218 r
= apply_restrict_filesystems(context
, params
);
5220 *exit_status
= EXIT_BPF
;
5221 return log_exec_error_errno(context
, params
, r
, "Failed to restrict filesystems: %m");
5226 /* This really should remain as close to the execve() as possible, to make sure our own code is affected
5227 * by the filter as little as possible. */
5228 r
= apply_syscall_filter(context
, params
, needs_ambient_hack
);
5230 *exit_status
= EXIT_SECCOMP
;
5231 return log_exec_error_errno(context
, params
, r
, "Failed to apply system call filters: %m");
5234 if (keep_seccomp_privileges
) {
5235 /* Restore the capability bounding set with what's expected from the service + the
5236 * ambient capabilities hack */
5237 if (!cap_test_all(saved_bset
)) {
5238 r
= capability_bounding_set_drop(saved_bset
, /* right_now= */ false);
5240 *exit_status
= EXIT_CAPABILITIES
;
5241 return log_exec_error_errno(context
, params
, r
, "Failed to drop bset capabilities: %m");
5245 /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
5246 * applications that use it. */
5247 if (!FLAGS_SET(saved_bset
, (UINT64_C(1) << CAP_SYS_ADMIN
))) {
5248 r
= drop_capability(CAP_SYS_ADMIN
);
5250 *exit_status
= EXIT_USER
;
5251 return log_exec_error_errno(context
, params
, r
, "Failed to drop CAP_SYS_ADMIN: %m");
5255 /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
5256 * applications that use it. */
5257 if (!FLAGS_SET(saved_bset
, (UINT64_C(1) << CAP_SETPCAP
))) {
5258 r
= drop_capability(CAP_SETPCAP
);
5260 *exit_status
= EXIT_USER
;
5261 return log_exec_error_errno(context
, params
, r
, "Failed to drop CAP_SETPCAP: %m");
5265 if (prctl(PR_SET_KEEPCAPS
, 0) < 0) {
5266 *exit_status
= EXIT_USER
;
5267 return log_exec_error_errno(context
, params
, errno
, "Failed to drop keep capabilities flag: %m");
5274 if (!strv_isempty(context
->unset_environment
)) {
5277 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
5279 *exit_status
= EXIT_MEMORY
;
5283 strv_free_and_replace(accum_env
, ee
);
5286 if (!FLAGS_SET(command
->flags
, EXEC_COMMAND_NO_ENV_EXPAND
)) {
5287 _cleanup_strv_free_
char **unset_variables
= NULL
, **bad_variables
= NULL
;
5289 r
= replace_env_argv(command
->argv
, accum_env
, &replaced_argv
, &unset_variables
, &bad_variables
);
5291 *exit_status
= EXIT_MEMORY
;
5292 return log_exec_error_errno(context
,
5295 "Failed to replace environment variables: %m");
5297 final_argv
= replaced_argv
;
5299 if (!strv_isempty(unset_variables
)) {
5300 _cleanup_free_
char *ju
= strv_join(unset_variables
, ", ");
5301 log_exec_warning(context
,
5303 "Referenced but unset environment variable evaluates to an empty string: %s",
5307 if (!strv_isempty(bad_variables
)) {
5308 _cleanup_free_
char *jb
= strv_join(bad_variables
, ", ");
5309 log_exec_warning(context
,
5311 "Invalid environment variable name evaluates to an empty string: %s",
5315 final_argv
= command
->argv
;
5317 log_command_line(context
, params
, "Executing", executable
, final_argv
);
5319 /* We have finished with all our initializations. Let's now let the manager know that. From this
5320 * point on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5322 r
= exec_fd_mark_hot(context
, params
, /* hot= */ true, exit_status
);
5326 /* As last thing before the execve(), let's send the handoff timestamp */
5327 r
= send_handoff_timestamp(context
, params
, exit_status
);
5329 /* If this handoff timestamp failed, let's undo the marking as hot */
5330 (void) exec_fd_mark_hot(context
, params
, /* hot= */ false, /* reterr_exit_status= */ NULL
);
5334 /* NB: we leave executable_fd, exec_fd, handoff_timestamp_fd open here. This is safe, because they
5335 * have O_CLOEXEC set, and the execve() below will thus automatically close them. In fact, for
5336 * exec_fd this is pretty much the whole raison d'etre. */
5338 r
= fexecve_or_execve(executable_fd
, executable
, final_argv
, accum_env
);
5340 /* The execve() failed, let's undo the marking as hot */
5341 (void) exec_fd_mark_hot(context
, params
, /* hot= */ false, /* reterr_exit_status= */ NULL
);
5343 *exit_status
= EXIT_EXEC
;
5344 return log_exec_error_errno(context
, params
, r
, "Failed to execute %s: %m", executable
);