1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
6 #include <sys/eventfd.h>
10 #include <sys/personality.h>
11 #include <sys/prctl.h>
13 #include <sys/types.h>
19 #include <security/pam_appl.h>
23 #include <selinux/selinux.h>
31 #include <sys/apparmor.h>
34 #include "sd-messages.h"
38 #include "alloc-util.h"
40 #include "apparmor-util.h"
46 #include "capability-util.h"
47 #include "cgroup-setup.h"
48 #include "chase-symlinks.h"
49 #include "chown-recursive.h"
50 #include "cpu-set-util.h"
51 #include "creds-util.h"
52 #include "data-fd-util.h"
56 #include "errno-list.h"
59 #include "exit-status.h"
62 #include "format-util.h"
63 #include "glob-util.h"
64 #include "hexdecoct.h"
66 #include "ioprio-util.h"
71 #include "manager-dump.h"
72 #include "memory-util.h"
73 #include "missing_fs.h"
74 #include "missing_ioprio.h"
75 #include "mkdir-label.h"
76 #include "mount-util.h"
77 #include "mountpoint-util.h"
78 #include "namespace.h"
79 #include "parse-util.h"
80 #include "path-util.h"
81 #include "process-util.h"
82 #include "random-util.h"
83 #include "recurse-dir.h"
84 #include "rlimit-util.h"
87 #include "seccomp-util.h"
89 #include "securebits-util.h"
90 #include "selinux-util.h"
91 #include "signal-util.h"
92 #include "smack-util.h"
93 #include "socket-util.h"
95 #include "stat-util.h"
96 #include "string-table.h"
97 #include "string-util.h"
99 #include "syslog-util.h"
100 #include "terminal-util.h"
101 #include "tmpfile-util.h"
102 #include "umask-util.h"
103 #include "unit-serialize.h"
104 #include "user-util.h"
105 #include "utmp-wtmp.h"
/* Two timeouts of 5s and 1s, expressed in usec.
 * NOTE(review): presumably used by the idle-pipe wait logic — confirm against the users. */
107 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
108 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
/* Send buffer size (8 MiB) requested via fd_inc_sndbuf() for the journal stream socket. */
110 #define SNDBUF_SIZE (8*1024*1024)
/* Moves the n_fds descriptors listed in fds[] so that entry i ends up at fd number i+3, using
 * fcntl(F_DUPFD). If the wanted target number is not free yet, the first such index is remembered
 * in restart_from and another pass is started from there; the loop ends once a pass finishes
 * without anything left to retry. The fds[] array is modified in place. */
112 static int shift_fds(int fds
[], size_t n_fds
) {
116 /* Modifies the fds array! (sorts it) */
120 for (int start
= 0;;) {
121 int restart_from
= -1;
123 for (int i
= start
; i
< (int) n_fds
; i
++) {
126 /* Already at right index? */
130 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
137 /* Hmm, the fd we wanted isn't free? Then
138 * let's remember that and try again from here */
139 if (nfd
!= i
+3 && restart_from
< 0)
143 if (restart_from
< 0)
146 start
= restart_from
;
/* Adjusts the file flags of the n_socket_fds + n_storage_fds descriptors in fds[]: the first
 * n_socket_fds entries get O_NONBLOCK set or cleared according to 'nonblock' (fd_nonblock()),
 * and FD_CLOEXEC is turned off on every entry (fd_cloexec(..., false)). */
152 static int flags_fds(const int fds
[], size_t n_socket_fds
, size_t n_storage_fds
, bool nonblock
) {
156 n_fds
= n_socket_fds
+ n_storage_fds
;
162 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
163 * O_NONBLOCK only applies to socket activation though. */
165 for (size_t i
= 0; i
< n_fds
; i
++) {
167 if (i
< n_socket_fds
) {
168 r
= fd_nonblock(fds
[i
], nonblock
);
173 /* We unconditionally drop FD_CLOEXEC from the fds,
174 * since after all we want to pass these fds to our
177 r
= fd_cloexec(fds
[i
], false);
/* Returns the TTY device path to use for this execution context: the configured tty_path if
 * there is one, "/dev/console" otherwise. The stdio_as_fds setting is checked first.
 * NOTE(review): presumably no path (NULL) is returned in the stdio_as_fds case — confirm. */
185 static const char *exec_context_tty_path(const ExecContext
*context
) {
188 if (context
->stdio_as_fds
)
191 if (context
->tty_path
)
192 return context
->tty_path
;
194 return "/dev/console";
/* Applies the TTY related settings of the context: hang-up (tty_vhangup), reset (tty_reset),
 * resizing to tty_rows/tty_cols and VT disallocation (tty_vt_disallocate). The fd based helpers
 * are used when 'p' is non-NULL and carries a valid stdin_fd; the path based helpers operate on
 * the path returned by exec_context_tty_path(). All results are ignored, i.e. this is
 * best-effort. */
197 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
202 path
= exec_context_tty_path(context
);
204 if (context
->tty_vhangup
) {
205 if (p
&& p
->stdin_fd
>= 0)
206 (void) terminal_vhangup_fd(p
->stdin_fd
);
208 (void) terminal_vhangup(path
);
211 if (context
->tty_reset
) {
212 if (p
&& p
->stdin_fd
>= 0)
213 (void) reset_terminal_fd(p
->stdin_fd
, true);
215 (void) reset_terminal(path
);
218 if (p
&& p
->stdin_fd
>= 0)
219 (void) terminal_set_size_fd(p
->stdin_fd
, path
, context
->tty_rows
, context
->tty_cols
);
221 if (context
->tty_vt_disallocate
&& path
)
222 (void) vt_disallocate(path
);
/* Tells whether the input mode 'i' is one of the terminal input modes; the set checked includes
 * EXEC_INPUT_TTY_FORCE and EXEC_INPUT_TTY_FAIL. */
225 static bool is_terminal_input(ExecInput i
) {
228 EXEC_INPUT_TTY_FORCE
,
229 EXEC_INPUT_TTY_FAIL
);
/* Tells whether the output mode 'o' writes to a terminal/console; the set checked includes
 * EXEC_OUTPUT_KMSG_AND_CONSOLE and EXEC_OUTPUT_JOURNAL_AND_CONSOLE. */
232 static bool is_terminal_output(ExecOutput o
) {
235 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
236 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
/* Tells whether the output mode 'o' goes to the kernel log buffer; the set checked includes
 * EXEC_OUTPUT_KMSG_AND_CONSOLE. */
239 static bool is_kmsg_output(ExecOutput o
) {
242 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
/* Checks, in this order, whether stdin, stdout or stderr is configured for a terminal mode, and
 * finally falls back to whether a tty_path is configured at all. */
245 static bool exec_context_needs_term(const ExecContext
*c
) {
248 /* Return true if the execution context suggests we should set $TERM to something useful. */
250 if (is_terminal_input(c
->std_input
))
253 if (is_terminal_output(c
->std_output
))
256 if (is_terminal_output(c
->std_error
))
259 return !!c
->tty_path
;
/* Opens /dev/null with the given open flags (O_NOCTTY is always added) and moves the resulting
 * fd to the number 'nfd' via move_fd(), whose result is returned. */
262 static int open_null_as(int flags
, int nfd
) {
267 fd
= open("/dev/null", flags
|O_NOCTTY
);
271 return move_fd(fd
, nfd
, false);
/* Connects 'fd' to the journal's stdout stream socket, which is either the per-namespace path
 * /run/systemd/journal.<log_namespace>/stdout or the default /run/systemd/journal/stdout.
 * If a valid gid and/or uid is passed, the effective gid/uid is switched to it around the
 * connect call and switched back to the old values afterwards; failures to switch back are
 * ignored. */
274 static int connect_journal_socket(
276 const char *log_namespace
,
280 uid_t olduid
= UID_INVALID
;
281 gid_t oldgid
= GID_INVALID
;
286 strjoina("/run/systemd/journal.", log_namespace
, "/stdout") :
287 "/run/systemd/journal/stdout";
289 if (gid_is_valid(gid
)) {
292 if (setegid(gid
) < 0)
296 if (uid_is_valid(uid
)) {
299 if (seteuid(uid
) < 0) {
305 r
= connect_unix_path(fd
, AT_FDCWD
, j
);
307 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
308 an LSM interferes. */
310 if (uid_is_valid(uid
))
311 (void) seteuid(olduid
);
314 if (gid_is_valid(gid
))
315 (void) setegid(oldgid
);
/* Creates an AF_UNIX stream socket, connects it to the journal stream socket of the context's
 * log namespace (connect_journal_socket()), shuts down the reading side and tries to enlarge
 * the send buffer to SNDBUF_SIZE (best-effort). The stream header is then written; it carries
 * the syslog identifier (falling back to 'ident'), the unit id (only if EXEC_PASS_LOG_UNIT is
 * set, empty otherwise), the syslog priority, the level-prefix flag and whether the output
 * mode is a kmsg and/or terminal one. Finally the fd is moved to 'nfd'. */
320 static int connect_logger_as(
322 const ExecContext
*context
,
323 const ExecParameters
*params
,
330 _cleanup_close_
int fd
= -1;
335 assert(output
< _EXEC_OUTPUT_MAX
);
339 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
343 r
= connect_journal_socket(fd
, context
->log_namespace
, uid
, gid
);
347 if (shutdown(fd
, SHUT_RD
) < 0)
350 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
360 context
->syslog_identifier
?: ident
,
361 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
362 context
->syslog_priority
,
363 !!context
->syslog_level_prefix
,
365 is_kmsg_output(output
),
366 is_terminal_output(output
)) < 0)
369 return move_fd(TAKE_FD(fd
), nfd
, false);
/* Opens the terminal at 'path' with the given flags (O_NOCTTY is always added) and moves the
 * resulting fd to the number 'nfd' via move_fd(), whose result is returned. */
372 static int open_terminal_as(const char *path
, int flags
, int nfd
) {
378 fd
= open_terminal(path
, flags
| O_NOCTTY
);
382 return move_fd(fd
, nfd
, false);
/* Acquires an fd for 'path'. The path is first opened with open() using the given flags (plus
 * O_NOCTTY) and mode. If that fails with ENXIO the path may be an AF_UNIX socket in the file
 * system, hence a stream socket is created and connected to it instead; -ENOTSOCK/-EINVAL from
 * the connect attempt indicate it was not a socket after all. For a connected socket the unused
 * direction is shut down according to the access mode: O_RDONLY shuts down writing, O_WRONLY
 * shuts down reading. */
385 static int acquire_path(const char *path
, int flags
, mode_t mode
) {
386 _cleanup_close_
int fd
= -1;
391 if (IN_SET(flags
& O_ACCMODE
, O_WRONLY
, O_RDWR
))
394 fd
= open(path
, flags
|O_NOCTTY
, mode
);
398 if (errno
!= ENXIO
) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
401 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
403 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
407 r
= connect_unix_path(fd
, AT_FDCWD
, path
);
408 if (IN_SET(r
, -ENOTSOCK
, -EINVAL
))
409 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
410 * wasn't an AF_UNIX socket after all */
415 if ((flags
& O_ACCMODE
) == O_RDONLY
)
416 r
= shutdown(fd
, SHUT_WR
);
417 else if ((flags
& O_ACCMODE
) == O_WRONLY
)
418 r
= shutdown(fd
, SHUT_RD
);
/* Determines the effective input mode from context->std_input. The mode is downgraded to
 * EXEC_INPUT_NULL if a terminal input is configured but apply_tty_stdin is false, if socket
 * input is configured but no socket fd is available, or if data input is configured but the
 * stdin data is empty. */
427 static int fixup_input(
428 const ExecContext
*context
,
430 bool apply_tty_stdin
) {
436 std_input
= context
->std_input
;
438 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
439 return EXEC_INPUT_NULL
;
441 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
442 return EXEC_INPUT_NULL
;
444 if (std_input
== EXEC_INPUT_DATA
&& context
->stdin_data_size
== 0)
445 return EXEC_INPUT_NULL
;
/* Determines the effective output mode: EXEC_OUTPUT_SOCKET is downgraded to
 * EXEC_OUTPUT_INHERIT when no socket fd is available (socket_fd < 0). */
450 static int fixup_output(ExecOutput output
, int socket_fd
) {
452 if (output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
453 return EXEC_OUTPUT_INHERIT
;
/* Sets up STDIN_FILENO for the process to execute.
 *
 * If params->stdin_fd is valid it is dup2()'ed onto stdin; if it turns out to be a TTY we try to
 * make it the controlling terminal (forcibly only for EXEC_INPUT_TTY_FORCE), and reset and
 * resize it, ignoring failures.
 *
 * Otherwise the effective mode from fixup_input() selects the source: /dev/null, a terminal
 * acquired from exec_context_tty_path(), the socket fd, the named fd for stdin (switched to
 * blocking mode), a data fd created from stdin_data, or a file path. For the file case the
 * file is opened O_RDWR if stdout or stderr is configured to use the very same path,
 * O_RDONLY otherwise. */
458 static int setup_input(
459 const ExecContext
*context
,
460 const ExecParameters
*params
,
462 const int named_iofds
[static 3]) {
471 if (params
->stdin_fd
>= 0) {
472 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
475 /* Try to make this the controlling tty, if it is a tty, and reset it */
476 if (isatty(STDIN_FILENO
)) {
477 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
478 (void) reset_terminal_fd(STDIN_FILENO
, true);
479 (void) terminal_set_size_fd(STDIN_FILENO
, NULL
, context
->tty_rows
, context
->tty_cols
);
485 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
489 case EXEC_INPUT_NULL
:
490 return open_null_as(O_RDONLY
, STDIN_FILENO
);
493 case EXEC_INPUT_TTY_FORCE
:
494 case EXEC_INPUT_TTY_FAIL
: {
497 fd
= acquire_terminal(exec_context_tty_path(context
),
498 i
== EXEC_INPUT_TTY_FAIL
? ACQUIRE_TERMINAL_TRY
:
499 i
== EXEC_INPUT_TTY_FORCE
? ACQUIRE_TERMINAL_FORCE
:
500 ACQUIRE_TERMINAL_WAIT
,
505 r
= terminal_set_size_fd(fd
, exec_context_tty_path(context
), context
->tty_rows
, context
->tty_cols
);
509 return move_fd(fd
, STDIN_FILENO
, false);
512 case EXEC_INPUT_SOCKET
:
513 assert(socket_fd
>= 0);
515 return RET_NERRNO(dup2(socket_fd
, STDIN_FILENO
));
517 case EXEC_INPUT_NAMED_FD
:
518 assert(named_iofds
[STDIN_FILENO
] >= 0);
520 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
521 return RET_NERRNO(dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
));
523 case EXEC_INPUT_DATA
: {
526 fd
= acquire_data_fd(context
->stdin_data
, context
->stdin_data_size
, 0);
530 return move_fd(fd
, STDIN_FILENO
, false);
533 case EXEC_INPUT_FILE
: {
537 assert(context
->stdio_file
[STDIN_FILENO
]);
539 rw
= (context
->std_output
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDOUT_FILENO
])) ||
540 (context
->std_error
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDERR_FILENO
]));
542 fd
= acquire_path(context
->stdio_file
[STDIN_FILENO
], rw
? O_RDWR
: O_RDONLY
, 0666 & ~context
->umask
);
546 return move_fd(fd
, STDIN_FILENO
, false);
550 assert_not_reached();
/* Decides whether stderr can be set up by simply duplicating the stdout fd. For named fd
 * output this requires both streams to use the same fd name, for the file based output modes
 * it requires both streams to use the same file path. */
554 static bool can_inherit_stderr_from_stdout(
555 const ExecContext
*context
,
561 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
564 if (e
== EXEC_OUTPUT_INHERIT
)
569 if (e
== EXEC_OUTPUT_NAMED_FD
)
570 return streq_ptr(context
->stdio_fdname
[STDOUT_FILENO
], context
->stdio_fdname
[STDERR_FILENO
]);
572 if (IN_SET(e
, EXEC_OUTPUT_FILE
, EXEC_OUTPUT_FILE_APPEND
, EXEC_OUTPUT_FILE_TRUNCATE
))
573 return streq_ptr(context
->stdio_file
[STDOUT_FILENO
], context
->stdio_file
[STDERR_FILENO
]);
/* Sets up stdout or stderr (selected by 'fileno') for the process to execute.
 *
 * If params->stdout_fd or params->stderr_fd is valid for the selected stream it is dup2()'ed
 * into place and the stream number is returned right away.
 *
 * Otherwise the effective input and output modes (fixup_input()/fixup_output()) decide:
 * stderr may be duplicated from stdout when can_inherit_stderr_from_stdout() permits; inherited
 * output follows stdin or falls back to the TTY or /dev/null; the remaining modes connect the
 * stream to /dev/null, a terminal, the journal/kmsg logger, the socket fd, a named fd, or a
 * file.
 *
 * If connecting to the journal fails a warning is logged and /dev/null is used instead. After
 * the logger case, if fstat() on the stream succeeds, its device and inode numbers are stored
 * in *journal_stream_dev and *journal_stream_ino, unless an inode was already stored and this
 * is not stderr. */
578 static int setup_output(
580 const ExecContext
*context
,
581 const ExecParameters
*params
,
584 const int named_iofds
[static 3],
588 dev_t
*journal_stream_dev
,
589 ino_t
*journal_stream_ino
) {
599 assert(journal_stream_dev
);
600 assert(journal_stream_ino
);
602 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
604 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
607 return STDOUT_FILENO
;
610 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
611 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
614 return STDERR_FILENO
;
617 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
618 o
= fixup_output(context
->std_output
, socket_fd
);
620 if (fileno
== STDERR_FILENO
) {
622 e
= fixup_output(context
->std_error
, socket_fd
);
624 /* This expects the input and output are already set up */
626 /* Don't change the stderr file descriptor if we inherit all
627 * the way and are not on a tty */
628 if (e
== EXEC_OUTPUT_INHERIT
&&
629 o
== EXEC_OUTPUT_INHERIT
&&
630 i
== EXEC_INPUT_NULL
&&
631 !is_terminal_input(context
->std_input
) &&
635 /* Duplicate from stdout if possible */
636 if (can_inherit_stderr_from_stdout(context
, o
, e
))
637 return RET_NERRNO(dup2(STDOUT_FILENO
, fileno
));
641 } else if (o
== EXEC_OUTPUT_INHERIT
) {
642 /* If input got downgraded, inherit the original value */
643 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
644 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
646 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
647 if (!IN_SET(i
, EXEC_INPUT_NULL
, EXEC_INPUT_DATA
))
648 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
650 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
654 /* We need to open /dev/null here anew, to get the right access mode. */
655 return open_null_as(O_WRONLY
, fileno
);
660 case EXEC_OUTPUT_NULL
:
661 return open_null_as(O_WRONLY
, fileno
);
663 case EXEC_OUTPUT_TTY
:
664 if (is_terminal_input(i
))
665 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
667 /* We don't reset the terminal if this is just about output */
668 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
670 case EXEC_OUTPUT_KMSG
:
671 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
672 case EXEC_OUTPUT_JOURNAL
:
673 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
674 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
676 log_unit_warning_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m",
677 fileno
== STDOUT_FILENO
? "stdout" : "stderr");
678 r
= open_null_as(O_WRONLY
, fileno
);
682 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
683 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
684 * services to detect whether they are connected to the journal or not.
686 * If both stdout and stderr are connected to a stream then let's make sure to store the data
687 * about STDERR as that's usually the best way to do logging. */
689 if (fstat(fileno
, &st
) >= 0 &&
690 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
691 *journal_stream_dev
= st
.st_dev
;
692 *journal_stream_ino
= st
.st_ino
;
697 case EXEC_OUTPUT_SOCKET
:
698 assert(socket_fd
>= 0);
700 return RET_NERRNO(dup2(socket_fd
, fileno
));
702 case EXEC_OUTPUT_NAMED_FD
:
703 assert(named_iofds
[fileno
] >= 0);
705 (void) fd_nonblock(named_iofds
[fileno
], false);
706 return RET_NERRNO(dup2(named_iofds
[fileno
], fileno
));
708 case EXEC_OUTPUT_FILE
:
709 case EXEC_OUTPUT_FILE_APPEND
:
710 case EXEC_OUTPUT_FILE_TRUNCATE
: {
714 assert(context
->stdio_file
[fileno
]);
716 rw
= context
->std_input
== EXEC_INPUT_FILE
&&
717 streq_ptr(context
->stdio_file
[fileno
], context
->stdio_file
[STDIN_FILENO
]);
720 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
723 if (o
== EXEC_OUTPUT_FILE_APPEND
)
725 else if (o
== EXEC_OUTPUT_FILE_TRUNCATE
)
728 fd
= acquire_path(context
->stdio_file
[fileno
], flags
, 0666 & ~context
->umask
);
732 return move_fd(fd
, fileno
, 0);
736 assert_not_reached();
/* Hands the terminal referenced by 'fd' over to 'uid': the access mode is set to TTY_MODE and
 * the owner to uid, while the group is left untouched (GID_INVALID). If isatty() reports
 * EINVAL or ENOTTY the fd is not a TTY and 0 is returned without changing anything. */
740 static int chown_terminal(int fd
, uid_t uid
) {
745 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
746 if (isatty(fd
) < 1) {
747 if (IN_SET(errno
, EINVAL
, ENOTTY
))
748 return 0; /* not a tty */
753 /* This might fail. What matters are the results. */
754 r
= fchmod_and_chown(fd
, TTY_MODE
, uid
, GID_INVALID
);
/* Redirects stdin/stdout to the confirmation terminal 'vc'. The current stdin and stdout are
 * first saved as duplicates with fd numbers >= 3. Then the terminal is acquired (waiting up to
 * DEFAULT_CONFIRM_USEC), chown'ed to our own uid, reset, resized to tty_rows/tty_cols, and
 * installed as stdin and stdout via rearrange_stdio() while stderr is kept. On success
 * ownership of the saved fds is passed to the caller through ret_saved_stdin and
 * ret_saved_stdout. */
761 static int setup_confirm_stdio(
762 const ExecContext
*context
,
764 int *ret_saved_stdin
,
765 int *ret_saved_stdout
) {
767 _cleanup_close_
int fd
= -1, saved_stdin
= -1, saved_stdout
= -1;
770 assert(ret_saved_stdin
);
771 assert(ret_saved_stdout
);
773 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
777 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
778 if (saved_stdout
< 0)
781 fd
= acquire_terminal(vc
, ACQUIRE_TERMINAL_WAIT
, DEFAULT_CONFIRM_USEC
);
785 r
= chown_terminal(fd
, getuid());
789 r
= reset_terminal_fd(fd
, true);
793 r
= terminal_set_size_fd(fd
, vc
, context
->tty_rows
, context
->tty_cols
);
797 r
= rearrange_stdio(fd
, fd
, STDERR_FILENO
); /* Invalidates 'fd' also on failure */
802 *ret_saved_stdin
= TAKE_FD(saved_stdin
);
803 *ret_saved_stdout
= TAKE_FD(saved_stdout
);
/* Writes a note to 'fd' that asking for confirmation for unit 'u' failed and that a positive
 * response is assumed. -ETIMEDOUT gets a dedicated "timed out" message; the generic message
 * uses %m for the error text. */
807 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
810 if (err
== -ETIMEDOUT
)
811 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
814 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
/* Like write_confirm_error_fd(), but opens the terminal 'vc' itself (write-only, O_NOCTTY,
 * O_CLOEXEC) to write the message; the fd is closed automatically when leaving the function. */
818 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
819 _cleanup_close_
int fd
= -1;
823 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
827 write_confirm_error_fd(err
, fd
, u
);
/* Undoes setup_confirm_stdio(): the saved fds, where valid, are dup2()'ed back onto stdin and
 * stdout, then both saved fds are closed and the variables reset via safe_close(). */
830 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
834 assert(saved_stdout
);
838 if (*saved_stdin
>= 0)
839 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
842 if (*saved_stdout
>= 0)
843 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
846 *saved_stdin
= safe_close(*saved_stdin
);
847 *saved_stdout
= safe_close(*saved_stdout
);
/* Results of ask_for_confirmation(): do not execute the command and pretend it failed (-1) or
 * pretend it succeeded (0). */
853 CONFIRM_PRETEND_FAILURE
= -1,
854 CONFIRM_PRETEND_SUCCESS
= 0,
/* Asks interactively on the terminal 'vc' whether 'cmdline' of unit 'u' shall be executed and
 * returns one of the CONFIRM_* values. stdin/stdout are redirected to the terminal for the
 * duration of the dialog (setup_confirm_stdio()) and restored before returning
 * (restore_confirm_stdio()). If the terminal cannot be set up, the error is reported via
 * write_confirm_error() and CONFIRM_EXECUTE is returned. The command line is shown ellipsized
 * in the prompt; several answers only print information (help, unit dump, unit summary, job
 * list) and then ask again. */
858 static int ask_for_confirmation(const ExecContext
*context
, const char *vc
, Unit
*u
, const char *cmdline
) {
859 int saved_stdout
= -1, saved_stdin
= -1, r
;
860 _cleanup_free_
char *e
= NULL
;
863 /* For any internal errors, assume a positive response. */
864 r
= setup_confirm_stdio(context
, vc
, &saved_stdin
, &saved_stdout
);
866 write_confirm_error(r
, vc
, u
);
867 return CONFIRM_EXECUTE
;
870 /* confirm_spawn might have been disabled while we were sleeping. */
871 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
876 e
= ellipsize(cmdline
, 60, 100);
884 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
886 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
893 printf("Resuming normal execution.\n");
894 manager_disable_confirm_spawn();
898 unit_dump(u
, stdout
, " ");
899 continue; /* ask again */
901 printf("Failing execution.\n");
902 r
= CONFIRM_PRETEND_FAILURE
;
905 printf(" c - continue, proceed without asking anymore\n"
906 " D - dump, show the state of the unit\n"
907 " f - fail, don't execute the command and pretend it failed\n"
909 " i - info, show a short summary of the unit\n"
910 " j - jobs, show jobs that are in progress\n"
911 " s - skip, don't execute the command and pretend it succeeded\n"
912 " y - yes, execute the command\n");
913 continue; /* ask again */
915 printf(" Description: %s\n"
918 u
->id
, u
->description
, cmdline
);
919 continue; /* ask again */
921 manager_dump_jobs(u
->manager
, stdout
, " ");
922 continue; /* ask again */
924 /* 'n' was removed in favor of 'f'. */
925 printf("Didn't understand 'n', did you mean 'f'?\n");
926 continue; /* ask again */
928 printf("Skipping execution.\n");
929 r
= CONFIRM_PRETEND_SUCCESS
;
935 assert_not_reached();
941 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
/* Resolves the user to run as via get_user_creds() with USER_CREDS_CLEAN, filling in *uid,
 * *gid, *home and *shell.
 * NOTE(review): presumably *user receives the resolved name and the lookup is skipped when no
 * user is configured in the context — confirm. */
945 static int get_fixed_user(const ExecContext
*c
, const char **user
,
946 uid_t
*uid
, gid_t
*gid
,
947 const char **home
, const char **shell
) {
956 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
957 * (i.e. are "/" or "/bin/nologin"). */
960 r
= get_user_creds(&name
, uid
, gid
, home
, shell
, USER_CREDS_CLEAN
);
/* Resolves the group to run as via get_group_creds(), filling in *gid.
 * NOTE(review): presumably *group receives the resolved name and the lookup is skipped when no
 * group is configured in the context — confirm. */
968 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
978 r
= get_group_creds(&name
, gid
, 0);
/* Computes the supplementary group list to apply. If a user and a valid, non-root gid are
 * given, the process' groups are first initialized with initgroups(). If the context lists
 * supplementary groups, a buffer of NGROUPS_MAX entries is filled with the user's groups
 * (getgrouplist()) followed by the resolved entries of c->supplementary_groups, never exceeding
 * NGROUPS_MAX; a memdup()'ed copy of the used part is returned in *supplementary_gids.
 * NOTE(review): the caller presumably owns the returned array and *ngids receives the entry
 * count — confirm. */
986 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
987 const char *group
, gid_t gid
,
988 gid_t
**supplementary_gids
, int *ngids
) {
991 bool keep_groups
= false;
992 gid_t
*groups
= NULL
;
993 _cleanup_free_ gid_t
*l_gids
= NULL
;
998 * If user is given, then lookup GID and supplementary groups list.
999 * We avoid NSS lookups for gid=0. Also we have to initialize groups
1000 * here and as early as possible so we keep the list of supplementary
1001 * groups of the caller.
1003 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
1004 /* First step, initialize groups from /etc/group */
1005 if (initgroups(user
, gid
) < 0)
1011 if (strv_isempty(c
->supplementary_groups
))
1015 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1016 * be positive, otherwise fail.
1019 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
1020 if (ngroups_max
<= 0)
1021 return errno_or_else(EOPNOTSUPP
);
1023 l_gids
= new(gid_t
, ngroups_max
);
1029 * Lookup the list of groups that the user belongs to, we
1030 * avoid NSS lookups here too for gid=0.
1033 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
1038 STRV_FOREACH(i
, c
->supplementary_groups
) {
1041 if (k
>= ngroups_max
)
1045 r
= get_group_creds(&g
, l_gids
+k
, 0);
1053 * Sets ngids to zero to drop all supplementary groups, happens
1054 * when we are under root and SupplementaryGroups= is empty.
1061 /* Otherwise get the final list of supplementary groups */
1062 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
1066 *supplementary_gids
= groups
;
/* Applies group credentials to the calling process: the supplementary groups are installed
 * via maybe_setgroups(), and if 'gid' is valid the real, effective and saved gid are all set
 * to it with setresgid(). */
1074 static int enforce_groups(gid_t gid
, const gid_t
*supplementary_gids
, int ngids
) {
1077 /* Handle SupplementaryGroups= if it is not empty */
1079 r
= maybe_setgroups(ngids
, supplementary_gids
);
1084 if (gid_is_valid(gid
)) {
1085 /* Then set our gids */
1086 if (setresgid(gid
, gid
, gid
) < 0)
/* Updates the process' securebits: reads the current value with prctl(PR_GET_SECUREBITS),
 * clears the bits in 'mask', sets the bits in 'bits', and writes the result back with
 * prctl(PR_SET_SECUREBITS) unless it equals the current value. */
1093 static int set_securebits(int bits
, int mask
) {
1094 int current
, applied
;
1095 current
= prctl(PR_GET_SECUREBITS
);
1098 /* Clear all securebits defined in mask and set bits */
1099 applied
= (current
& ~mask
) | bits
;
1100 if (current
== applied
)
1102 if (prctl(PR_SET_SECUREBITS
, applied
) < 0)
/* Switches the real, effective and saved uid to 'uid' via setresuid(), if 'uid' is valid. When
 * ambient capabilities or secure bits are configured in the context, SECURE_KEEP_CAPS is added
 * to the securebits beforehand so that capabilities are retained across the uid change. */
1107 static int enforce_user(const ExecContext
*context
, uid_t uid
) {
1111 if (!uid_is_valid(uid
))
1114 /* Sets (but doesn't look up) the uid and makes sure we keep the
1115 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1116 * required, so we also need keep-caps in this case.
1119 if (context
->capability_ambient_set
!= 0 || context
->secure_bits
!= 0) {
1121 /* First step: If we need to keep capabilities but
1122 * drop privileges we need to make sure we keep our
1123 * caps, while we drop privileges. */
1125 /* Add KEEP_CAPS to the securebits */
1126 r
= set_securebits(1<<SECURE_KEEP_CAPS
, 0);
1132 /* Second step: actually set the uids */
1133 if (setresuid(uid
, uid
, uid
) < 0)
1136 /* At this point we should have all necessary capabilities but
1137 are otherwise a normal user. However, the caps might have got
1138 corrupted due to the setresuid() so we need to clean them up
1139 later. This is done outside of this call. */
/* PAM conversation callback that refuses any conversation: it always returns PAM_CONV_ERR. */
1146 static int null_conv(
1148 const struct pam_message
**msg
,
1149 struct pam_response
**resp
,
1150 void *appdata_ptr
) {
1152 /* We don't support conversations */
1154 return PAM_CONV_ERR
;
/* Opens a PAM session for the service to execute.
 *
 * In the calling process: starts a PAM transaction (pam_start()), sets PAM_TTY, exports the
 * entries of *env into the PAM environment, runs account management, establishes credentials
 * (a failure there is only logged at debug level) and opens the session. The PAM environment
 * list is then fetched; on success it replaces *env.
 *
 * A helper process "(sd-pam)" is forked off to clean up later: it closes the passed fds, drops
 * privileges to uid/gid (failures are only logged), requests SIGTERM on parent death, signals
 * readiness through the barrier and then waits for SIGTERM. Afterwards it deletes the PAM
 * credentials, closes the session if the parent is gone, and ends the PAM handle with
 * PAM_DATA_SILENT.
 *
 * On PAM failure the error is logged and -EPERM is used as the error code, since PAM codes do
 * not map to errno values. */
1159 static int setup_pam(
1165 char ***env
, /* updated on success */
1166 const int fds
[], size_t n_fds
) {
1170 static const struct pam_conv conv
= {
1175 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1176 _cleanup_strv_free_
char **e
= NULL
;
1177 pam_handle_t
*handle
= NULL
;
1179 int pam_code
= PAM_SUCCESS
, r
;
1180 bool close_session
= false;
1181 pid_t pam_pid
= 0, parent_pid
;
1188 /* We set up PAM in the parent process, then fork. The child
1189 * will then stay around until killed via PR_SET_PDEATHSIG or
1190 * systemd via the cgroup logic. It will then remove the PAM
1191 * session again. The parent process will exec() the actual
1192 * daemon. We do things this way to ensure that the main PID
1193 * of the daemon is the one we initially fork()ed. */
1195 r
= barrier_create(&barrier
);
1199 if (log_get_max_level() < LOG_DEBUG
)
1200 flags
|= PAM_SILENT
;
1202 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1203 if (pam_code
!= PAM_SUCCESS
) {
1209 _cleanup_free_
char *q
= NULL
;
1211 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1212 * out if that's the case, and read the TTY off it. */
1214 if (getttyname_malloc(STDIN_FILENO
, &q
) >= 0)
1215 tty
= strjoina("/dev/", q
);
1219 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1220 if (pam_code
!= PAM_SUCCESS
)
1224 STRV_FOREACH(nv
, *env
) {
1225 pam_code
= pam_putenv(handle
, *nv
);
1226 if (pam_code
!= PAM_SUCCESS
)
1230 pam_code
= pam_acct_mgmt(handle
, flags
);
1231 if (pam_code
!= PAM_SUCCESS
)
1234 pam_code
= pam_setcred(handle
, PAM_ESTABLISH_CRED
| flags
);
1235 if (pam_code
!= PAM_SUCCESS
)
1236 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle
, pam_code
));
1238 pam_code
= pam_open_session(handle
, flags
);
1239 if (pam_code
!= PAM_SUCCESS
)
1242 close_session
= true;
1244 e
= pam_getenvlist(handle
);
1246 pam_code
= PAM_BUF_ERR
;
1250 /* Block SIGTERM, so that we know that it won't get lost in the child */
1252 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1254 parent_pid
= getpid_cached();
1256 r
= safe_fork("(sd-pam)", 0, &pam_pid
);
1260 int sig
, ret
= EXIT_PAM
;
1262 /* The child's job is to reset the PAM session on termination */
1263 barrier_set_role(&barrier
, BARRIER_CHILD
);
1265 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1266 * those fds are open here that have been opened by PAM. */
1267 (void) close_many(fds
, n_fds
);
1269 /* Drop privileges - we don't need any to pam_close_session and this will make
1270 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1271 * threads to fail to exit normally */
1273 r
= maybe_setgroups(0, NULL
);
1275 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1276 if (setresgid(gid
, gid
, gid
) < 0)
1277 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1278 if (setresuid(uid
, uid
, uid
) < 0)
1279 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1281 (void) ignore_signals(SIGPIPE
);
1283 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1284 * otherwise the kernel will not allow unprivileged parents to kill their privileged children
1285 * this way. We rely on the control groups kill logic to do the rest for us. */
1286 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1289 /* Tell the parent that our setup is done. This is especially important regarding dropping
1290 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1292 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1293 (void) barrier_place(&barrier
);
1295 /* Check if our parent process might already have died? */
1296 if (getppid() == parent_pid
) {
1299 assert_se(sigemptyset(&ss
) >= 0);
1300 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1303 if (sigwait(&ss
, &sig
) < 0) {
1310 assert(sig
== SIGTERM
);
1315 pam_code
= pam_setcred(handle
, PAM_DELETE_CRED
| flags
);
1316 if (pam_code
!= PAM_SUCCESS
)
1319 /* If our parent died we'll end the session */
1320 if (getppid() != parent_pid
) {
1321 pam_code
= pam_close_session(handle
, flags
);
1322 if (pam_code
!= PAM_SUCCESS
)
1329 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1330 * know about this. See pam_end(3) */
1331 (void) pam_end(handle
, pam_code
| flags
| PAM_DATA_SILENT
);
1335 barrier_set_role(&barrier
, BARRIER_PARENT
);
1337 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1341 /* Unblock SIGTERM again in the parent */
1342 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1344 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1345 * this fd around. */
1348 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1349 * recover. However, warn loudly if it happens. */
1350 if (!barrier_place_and_sync(&barrier
))
1351 log_error("PAM initialization failed");
1353 return strv_free_and_replace(*env
, e
);
1356 if (pam_code
!= PAM_SUCCESS
) {
1357 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1358 r
= -EPERM
; /* PAM errors do not map to errno */
1360 log_error_errno(r
, "PAM failed: %m");
1364 pam_code
= pam_close_session(handle
, flags
);
1366 (void) pam_end(handle
, pam_code
| flags
);
/* Renames the calling process to a short name derived from 'path', wrapped in parentheses.
 * The name is assembled in an 11 byte buffer as '(' + l bytes taken from p + ')' + NUL; in one
 * case the fixed name "(...)" is used instead. */
1376 static void rename_process_from_path(const char *path
) {
1377 char process_name
[11];
1381 /* This resulting string must fit in 10 chars (i.e. the length
1382 * of "/sbin/init") to look pretty in /bin/ps */
1386 rename_process("(...)");
1392 /* The end of the process name is usually more
1393 * interesting, since the first bit might just be
1399 process_name
[0] = '(';
1400 memcpy(process_name
+1, p
, l
);
1401 process_name
[1+l
] = ')';
1402 process_name
[1+l
+1] = 0;
1404 rename_process(process_name
);
/* Returns true if an address family restriction is configured, i.e. allow-list mode is on or
 * the set of address families is non-empty. */
1407 static bool context_has_address_families(const ExecContext
*c
) {
1410 return c
->address_families_allow_list
||
1411 !set_isempty(c
->address_families
);
/* Returns true if a system call filter is configured, i.e. allow-list mode is on or the
 * syscall filter table is non-empty. */
1414 static bool context_has_syscall_filters(const ExecContext
*c
) {
1417 return c
->syscall_allow_list
||
1418 !hashmap_isempty(c
->syscall_filter
);
/* Returns true if system call logging is configured, i.e. allow-list mode is on or the syscall
 * log table is non-empty. */
1421 static bool context_has_syscall_logs(const ExecContext
*c
) {
1424 return c
->syscall_log_allow_list
||
1425 !hashmap_isempty(c
->syscall_log
);
/* Tells whether the no-new-privileges flag is needed for this context. The explicit
 * no_new_privileges setting is checked first, then whether we hold CAP_SYS_ADMIN (privileged
 * processes do not need it). Otherwise the result is true if any of the settings listed below
 * is enabled. */
1428 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1431 if (c
->no_new_privileges
)
1434 if (have_effective_cap(CAP_SYS_ADMIN
)) /* if we are privileged, we don't need NNP */
1437 /* We need NNP if we have any form of seccomp and are unprivileged */
1438 return c
->lock_personality
||
1439 c
->memory_deny_write_execute
||
1440 c
->private_devices
||
1442 c
->protect_hostname
||
1443 c
->protect_kernel_tunables
||
1444 c
->protect_kernel_modules
||
1445 c
->protect_kernel_logs
||
1446 context_has_address_families(c
) ||
1447 exec_context_restrict_namespaces_set(c
) ||
1448 c
->restrict_realtime
||
1449 c
->restrict_suid_sgid
||
1450 !set_isempty(c
->syscall_archs
) ||
1451 context_has_syscall_filters(c
) ||
1452 context_has_syscall_logs(c
);
/* Returns true if the context defines any credentials, i.e. the set_credentials or the
 * load_credentials table is non-empty. */
1455 static bool exec_context_has_credentials(const ExecContext
*context
) {
1459 return !hashmap_isempty(context
->set_credentials
) ||
1460 !hashmap_isempty(context
->load_credentials
);
/* Checks is_seccomp_available(); if seccomp is not available a debug message naming the
 * skipped setting ('msg') is logged for unit 'u'.
 * NOTE(review): presumably returns true when the caller should skip the setting — confirm. */
1465 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1467 if (is_seccomp_available())
1470 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
/* Installs the seccomp filter for SystemCallFilter=. Nothing is installed if no filter is
 * configured or seccomp is unavailable. The "negative" action is killing the process if
 * syscall_errno is SECCOMP_ERROR_NUMBER_KILL, otherwise failing the call with syscall_errno.
 * In allow-list mode the negative action is the default and listed calls are allowed; otherwise
 * everything is allowed by default and listed calls get the negative action. If
 * needs_ambient_hack is set, the SYSCALL_FILTER_SET_SETUID set is added to the configured
 * filter before it is loaded. */
1474 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1475 uint32_t negative_action
, default_action
, action
;
1481 if (!context_has_syscall_filters(c
))
1484 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1487 negative_action
= c
->syscall_errno
== SECCOMP_ERROR_NUMBER_KILL
? scmp_act_kill_process() : SCMP_ACT_ERRNO(c
->syscall_errno
);
1489 if (c
->syscall_allow_list
) {
1490 default_action
= negative_action
;
1491 action
= SCMP_ACT_ALLOW
;
1493 default_action
= SCMP_ACT_ALLOW
;
1494 action
= negative_action
;
1497 if (needs_ambient_hack
) {
1498 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_allow_list
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1503 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
, false);
/* Installs the seccomp filter for SystemCallLog=. Nothing is installed if no logging is
 * configured or seccomp is unavailable. In allow-list mode only the listed calls are logged
 * (default allow, listed calls SCMP_ACT_LOG); otherwise everything but the listed calls is
 * logged. With an old libseccomp lacking SCMP_ACT_LOG the setting is skipped with a debug
 * message. */
1506 static int apply_syscall_log(const Unit
* u
, const ExecContext
*c
) {
1508 uint32_t default_action
, action
;
1514 if (!context_has_syscall_logs(c
))
1518 if (skip_seccomp_unavailable(u
, "SystemCallLog="))
1521 if (c
->syscall_log_allow_list
) {
1522 /* Log nothing but the ones listed */
1523 default_action
= SCMP_ACT_ALLOW
;
1524 action
= SCMP_ACT_LOG
;
1526 /* Log everything but the ones listed */
1527 default_action
= SCMP_ACT_LOG
;
1528 action
= SCMP_ACT_ALLOW
;
1531 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_log
, action
, false);
1533 /* old libseccomp */
1534 log_unit_debug(u
, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
/* Applies SystemCallArchitectures=: restricts the permitted system call architectures to
 * c->syscall_archs via seccomp_restrict_archs(), unless the set is empty or seccomp is
 * unavailable. */
1539 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1543 if (set_isempty(c
->syscall_archs
))
1546 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1549 return seccomp_restrict_archs(c
->syscall_archs
);
/* Applies RestrictAddressFamilies=: passes the configured address family set and its
 * allow-list flag to seccomp_restrict_address_families(), unless nothing is configured or
 * seccomp is unavailable. */
1552 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1556 if (!context_has_address_families(c
))
1559 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1562 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_allow_list
);
/* Applies MemoryDenyWriteExecute= via seccomp_memory_deny_write_execute(), unless the setting
 * is off or seccomp is unavailable. */
1565 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1569 if (!c
->memory_deny_write_execute
)
1572 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1575 return seccomp_memory_deny_write_execute();
/* Applies RestrictRealtime= via seccomp_restrict_realtime(), unless the setting is off or
 * seccomp is unavailable. */
1578 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1582 if (!c
->restrict_realtime
)
1585 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1588 return seccomp_restrict_realtime();
1591 static int apply_restrict_suid_sgid(const Unit
* u
, const ExecContext
*c
) {
1595 if (!c
->restrict_suid_sgid
)
1598 if (skip_seccomp_unavailable(u
, "RestrictSUIDSGID="))
1601 return seccomp_restrict_suid_sgid();
1604 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1608 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1609 * let's protect even those systems where this is left on in the kernel. */
1611 if (!c
->protect_kernel_tunables
)
1614 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1617 return seccomp_protect_sysctl();
1620 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1624 /* Turn off module syscalls on ProtectKernelModules=yes */
1626 if (!c
->protect_kernel_modules
)
1629 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1632 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
), false);
1635 static int apply_protect_kernel_logs(const Unit
*u
, const ExecContext
*c
) {
1639 if (!c
->protect_kernel_logs
)
1642 if (skip_seccomp_unavailable(u
, "ProtectKernelLogs="))
1645 return seccomp_protect_syslog();
1648 static int apply_protect_clock(const Unit
*u
, const ExecContext
*c
) {
1652 if (!c
->protect_clock
)
1655 if (skip_seccomp_unavailable(u
, "ProtectClock="))
1658 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_CLOCK
, SCMP_ACT_ERRNO(EPERM
), false);
1661 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1665 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1667 if (!c
->private_devices
)
1670 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1673 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
), false);
1676 static int apply_restrict_namespaces(const Unit
*u
, const ExecContext
*c
) {
1680 if (!exec_context_restrict_namespaces_set(c
))
1683 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1686 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
1689 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1690 unsigned long personality
;
1696 if (!c
->lock_personality
)
1699 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1702 personality
= c
->personality
;
1704 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1705 if (personality
== PERSONALITY_INVALID
) {
1707 r
= opinionated_personality(&personality
);
1712 return seccomp_lock_personality(personality
);
1718 static int apply_restrict_filesystems(Unit
*u
, const ExecContext
*c
) {
1722 if (!exec_context_restrict_filesystems_set(c
))
1725 if (!u
->manager
->restrict_fs
) {
1726 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1727 log_unit_debug(u
, "LSM BPF not supported, skipping RestrictFileSystems=");
1731 return lsm_bpf_unit_restrict_filesystems(u
, c
->restrict_filesystems
, c
->restrict_filesystems_allow_list
);
1735 static int apply_protect_hostname(const Unit
*u
, const ExecContext
*c
, int *ret_exit_status
) {
1739 if (!c
->protect_hostname
)
1742 if (ns_type_supported(NAMESPACE_UTS
)) {
1743 if (unshare(CLONE_NEWUTS
) < 0) {
1744 if (!ERRNO_IS_NOT_SUPPORTED(errno
) && !ERRNO_IS_PRIVILEGE(errno
)) {
1745 *ret_exit_status
= EXIT_NAMESPACE
;
1746 return log_unit_error_errno(u
, errno
, "Failed to set up UTS namespacing: %m");
1749 log_unit_warning(u
, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1752 log_unit_warning(u
, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1757 if (skip_seccomp_unavailable(u
, "ProtectHostname="))
1760 r
= seccomp_protect_hostname();
1762 *ret_exit_status
= EXIT_SECCOMP
;
1763 return log_unit_error_errno(u
, r
, "Failed to apply hostname restrictions: %m");
1770 static void do_idle_pipe_dance(int idle_pipe
[static 4]) {
1773 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1774 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1776 if (idle_pipe
[0] >= 0) {
1779 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1781 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1784 /* Signal systemd that we are bored and want to continue. */
1785 n
= write(idle_pipe
[3], "x", 1);
1787 /* Wait for systemd to react to the signal above. */
1788 (void) fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1791 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1795 idle_pipe
[3] = safe_close(idle_pipe
[3]);
1798 static const char *exec_directory_env_name_to_string(ExecDirectoryType t
);
1800 static int build_environment(
1802 const ExecContext
*c
,
1803 const ExecParameters
*p
,
1806 const char *username
,
1808 dev_t journal_stream_dev
,
1809 ino_t journal_stream_ino
,
1812 _cleanup_strv_free_
char **our_env
= NULL
;
1821 #define N_ENV_VARS 17
1822 our_env
= new0(char*, N_ENV_VARS
+ _EXEC_DIRECTORY_TYPE_MAX
);
1827 _cleanup_free_
char *joined
= NULL
;
1829 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1831 our_env
[n_env
++] = x
;
1833 if (asprintf(&x
, "LISTEN_FDS=%zu", n_fds
) < 0)
1835 our_env
[n_env
++] = x
;
1837 joined
= strv_join(p
->fd_names
, ":");
1841 x
= strjoin("LISTEN_FDNAMES=", joined
);
1844 our_env
[n_env
++] = x
;
1847 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1848 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1850 our_env
[n_env
++] = x
;
1852 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1854 our_env
[n_env
++] = x
;
1857 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1858 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1859 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1860 if (p
->flags
& EXEC_NSS_DYNAMIC_BYPASS
) {
1861 x
= strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1864 our_env
[n_env
++] = x
;
1868 x
= strjoin("HOME=", home
);
1872 path_simplify(x
+ 5);
1873 our_env
[n_env
++] = x
;
1877 x
= strjoin("LOGNAME=", username
);
1880 our_env
[n_env
++] = x
;
1882 x
= strjoin("USER=", username
);
1885 our_env
[n_env
++] = x
;
1889 x
= strjoin("SHELL=", shell
);
1893 path_simplify(x
+ 6);
1894 our_env
[n_env
++] = x
;
1897 if (!sd_id128_is_null(u
->invocation_id
)) {
1898 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1901 our_env
[n_env
++] = x
;
1904 if (exec_context_needs_term(c
)) {
1905 const char *tty_path
, *term
= NULL
;
1907 tty_path
= exec_context_tty_path(c
);
1909 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1910 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1911 * container manager passes to PID 1 ends up all the way in the console login shown. */
1913 if (path_equal_ptr(tty_path
, "/dev/console") && getppid() == 1)
1914 term
= getenv("TERM");
1917 term
= default_term_for_tty(tty_path
);
1919 x
= strjoin("TERM=", term
);
1922 our_env
[n_env
++] = x
;
1925 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
1926 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
1929 our_env
[n_env
++] = x
;
1932 if (c
->log_namespace
) {
1933 x
= strjoin("LOG_NAMESPACE=", c
->log_namespace
);
1937 our_env
[n_env
++] = x
;
1940 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
1941 _cleanup_free_
char *joined
= NULL
;
1947 if (c
->directories
[t
].n_items
== 0)
1950 n
= exec_directory_env_name_to_string(t
);
1954 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
1955 _cleanup_free_
char *prefixed
= NULL
;
1957 prefixed
= path_join(p
->prefix
[t
], c
->directories
[t
].items
[i
].path
);
1961 if (!strextend_with_separator(&joined
, ":", prefixed
))
1965 x
= strjoin(n
, "=", joined
);
1969 our_env
[n_env
++] = x
;
1972 if (exec_context_has_credentials(c
) && p
->prefix
[EXEC_DIRECTORY_RUNTIME
]) {
1973 x
= strjoin("CREDENTIALS_DIRECTORY=", p
->prefix
[EXEC_DIRECTORY_RUNTIME
], "/credentials/", u
->id
);
1977 our_env
[n_env
++] = x
;
1980 if (asprintf(&x
, "SYSTEMD_EXEC_PID=" PID_FMT
, getpid_cached()) < 0)
1983 our_env
[n_env
++] = x
;
1985 our_env
[n_env
++] = NULL
;
1986 assert(n_env
<= N_ENV_VARS
+ _EXEC_DIRECTORY_TYPE_MAX
);
1989 *ret
= TAKE_PTR(our_env
);
1994 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
1995 _cleanup_strv_free_
char **pass_env
= NULL
;
1998 STRV_FOREACH(i
, c
->pass_environment
) {
1999 _cleanup_free_
char *x
= NULL
;
2005 x
= strjoin(*i
, "=", v
);
2009 if (!GREEDY_REALLOC(pass_env
, n_env
+ 2))
2012 pass_env
[n_env
++] = TAKE_PTR(x
);
2013 pass_env
[n_env
] = NULL
;
2016 *ret
= TAKE_PTR(pass_env
);
2021 bool exec_needs_mount_namespace(
2022 const ExecContext
*context
,
2023 const ExecParameters
*params
,
2024 const ExecRuntime
*runtime
) {
2028 if (context
->root_image
)
2031 if (!strv_isempty(context
->read_write_paths
) ||
2032 !strv_isempty(context
->read_only_paths
) ||
2033 !strv_isempty(context
->inaccessible_paths
) ||
2034 !strv_isempty(context
->exec_paths
) ||
2035 !strv_isempty(context
->no_exec_paths
))
2038 if (context
->n_bind_mounts
> 0)
2041 if (context
->n_temporary_filesystems
> 0)
2044 if (context
->n_mount_images
> 0)
2047 if (context
->n_extension_images
> 0)
2050 if (!strv_isempty(context
->extension_directories
))
2053 if (!IN_SET(context
->mount_flags
, 0, MS_SHARED
))
2056 if (context
->private_tmp
&& runtime
&& (runtime
->tmp_dir
|| runtime
->var_tmp_dir
))
2059 if (context
->private_devices
||
2060 context
->private_mounts
||
2061 context
->protect_system
!= PROTECT_SYSTEM_NO
||
2062 context
->protect_home
!= PROTECT_HOME_NO
||
2063 context
->protect_kernel_tunables
||
2064 context
->protect_kernel_modules
||
2065 context
->protect_kernel_logs
||
2066 context
->protect_control_groups
||
2067 context
->protect_proc
!= PROTECT_PROC_DEFAULT
||
2068 context
->proc_subset
!= PROC_SUBSET_ALL
||
2069 context
->private_ipc
||
2070 context
->ipc_namespace_path
)
2073 if (context
->root_directory
) {
2074 if (exec_context_get_effective_mount_apivfs(context
))
2077 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2078 if (params
&& !params
->prefix
[t
])
2081 if (context
->directories
[t
].n_items
> 0)
2086 if (context
->dynamic_user
&&
2087 (context
->directories
[EXEC_DIRECTORY_STATE
].n_items
> 0 ||
2088 context
->directories
[EXEC_DIRECTORY_CACHE
].n_items
> 0 ||
2089 context
->directories
[EXEC_DIRECTORY_LOGS
].n_items
> 0))
2092 if (context
->log_namespace
)
2098 static int setup_private_users(uid_t ouid
, gid_t ogid
, uid_t uid
, gid_t gid
) {
2099 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
2100 _cleanup_close_pair_
int errno_pipe
[2] = { -1, -1 };
2101 _cleanup_close_
int unshare_ready_fd
= -1;
2102 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
2107 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2108 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2109 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2110 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2111 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2112 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2113 * continues execution normally.
2114 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2115 * does not need CAP_SETUID to write the single line mapping to itself. */
2117 /* Can only set up multiple mappings with CAP_SETUID. */
2118 if (have_effective_cap(CAP_SETUID
) && uid
!= ouid
&& uid_is_valid(uid
))
2119 r
= asprintf(&uid_map
,
2120 UID_FMT
" " UID_FMT
" 1\n" /* Map $OUID → $OUID */
2121 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
2122 ouid
, ouid
, uid
, uid
);
2124 r
= asprintf(&uid_map
,
2125 UID_FMT
" " UID_FMT
" 1\n", /* Map $OUID → $OUID */
2131 /* Can only set up multiple mappings with CAP_SETGID. */
2132 if (have_effective_cap(CAP_SETGID
) && gid
!= ogid
&& gid_is_valid(gid
))
2133 r
= asprintf(&gid_map
,
2134 GID_FMT
" " GID_FMT
" 1\n" /* Map $OGID → $OGID */
2135 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
2136 ogid
, ogid
, gid
, gid
);
2138 r
= asprintf(&gid_map
,
2139 GID_FMT
" " GID_FMT
" 1\n", /* Map $OGID -> $OGID */
2145 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2147 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
2148 if (unshare_ready_fd
< 0)
2151 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2153 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
2156 r
= safe_fork("(sd-userns)", FORK_RESET_SIGNALS
|FORK_DEATHSIG
, &pid
);
2160 _cleanup_close_
int fd
= -1;
2164 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2165 * here, after the parent opened its own user namespace. */
2168 errno_pipe
[0] = safe_close(errno_pipe
[0]);
2170 /* Wait until the parent unshared the user namespace */
2171 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
2176 /* Disable the setgroups() system call in the child user namespace, for good. */
2177 a
= procfs_file_alloca(ppid
, "setgroups");
2178 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2180 if (errno
!= ENOENT
) {
2185 /* If the file is missing the kernel is too old, let's continue anyway. */
2187 if (write(fd
, "deny\n", 5) < 0) {
2192 fd
= safe_close(fd
);
2195 /* First write the GID map */
2196 a
= procfs_file_alloca(ppid
, "gid_map");
2197 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2202 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
2206 fd
= safe_close(fd
);
2208 /* The write the UID map */
2209 a
= procfs_file_alloca(ppid
, "uid_map");
2210 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2215 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
2220 _exit(EXIT_SUCCESS
);
2223 (void) write(errno_pipe
[1], &r
, sizeof(r
));
2224 _exit(EXIT_FAILURE
);
2227 errno_pipe
[1] = safe_close(errno_pipe
[1]);
2229 if (unshare(CLONE_NEWUSER
) < 0)
2232 /* Let the child know that the namespace is ready now */
2233 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
2236 /* Try to read an error code from the child */
2237 n
= read(errno_pipe
[0], &r
, sizeof(r
));
2240 if (n
== sizeof(r
)) { /* an error code was sent to us */
2245 if (n
!= 0) /* on success we should have read 0 bytes */
2248 r
= wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid
), 0);
2251 if (r
!= EXIT_SUCCESS
) /* If something strange happened with the child, let's consider this fatal, too */
2257 static bool exec_directory_is_private(const ExecContext
*context
, ExecDirectoryType type
) {
2258 if (!context
->dynamic_user
)
2261 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
2264 if (type
== EXEC_DIRECTORY_RUNTIME
&& context
->runtime_directory_preserve_mode
== EXEC_PRESERVE_NO
)
2270 static int create_many_symlinks(const char *root
, const char *source
, char **symlinks
) {
2271 _cleanup_free_
char *src_abs
= NULL
;
2276 src_abs
= path_join(root
, source
);
2280 STRV_FOREACH(dst
, symlinks
) {
2281 _cleanup_free_
char *dst_abs
= NULL
;
2283 dst_abs
= path_join(root
, *dst
);
2287 r
= mkdir_parents_label(dst_abs
, 0755);
2291 r
= symlink_idempotent(src_abs
, dst_abs
, true);
2299 static int setup_exec_directory(
2300 const ExecContext
*context
,
2301 const ExecParameters
*params
,
2304 ExecDirectoryType type
,
2305 bool needs_mount_namespace
,
2308 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
2309 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
2310 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
2311 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
2312 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
2313 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
2319 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
2320 assert(exit_status
);
2322 if (!params
->prefix
[type
])
2325 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
2326 if (!uid_is_valid(uid
))
2328 if (!gid_is_valid(gid
))
2332 for (size_t i
= 0; i
< context
->directories
[type
].n_items
; i
++) {
2333 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
2335 p
= path_join(params
->prefix
[type
], context
->directories
[type
].items
[i
].path
);
2341 r
= mkdir_parents_label(p
, 0755);
2345 if (exec_directory_is_private(context
, type
)) {
2346 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2347 * case we want to avoid leaving a directory around fully accessible that is owned by
2348 * a dynamic user whose UID is later on reused. To lock this down we use the same
2349 * trick used by container managers to prohibit host users to get access to files of
2350 * the same UID in containers: we place everything inside a directory that has an
2351 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2352 * for unprivileged host code. We then use fs namespacing to make this directory
2353 * permeable for the service itself.
2355 * Specifically: for a service which wants a special directory "foo/" we first create
2356 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2357 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2358 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2359 * unprivileged host users can't look into it. Inside of the namespace of the unit
2360 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2361 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2362 * for the service and making sure it only gets access to the dirs it needs but no
2363 * others. Tricky? Yes, absolutely, but it works!
2365 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2366 * to be owned by the service itself.
2368 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2369 * for sharing files or sockets with other services. */
2371 pp
= path_join(params
->prefix
[type
], "private");
2377 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2378 r
= mkdir_safe_label(pp
, 0700, 0, 0, MKDIR_WARN_MODE
);
2382 if (!path_extend(&pp
, context
->directories
[type
].items
[i
].path
)) {
2387 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2388 r
= mkdir_parents_label(pp
, 0755);
2392 if (is_dir(p
, false) > 0 &&
2393 (laccess(pp
, F_OK
) < 0 && errno
== ENOENT
)) {
2395 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2396 * it over. Most likely the service has been upgraded from one that didn't use
2397 * DynamicUser=1, to one that does. */
2399 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2400 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2401 exec_directory_type_to_string(type
), p
, pp
);
2403 if (rename(p
, pp
) < 0) {
2408 /* Otherwise, create the actual directory for the service */
2410 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
2411 if (r
< 0 && r
!= -EEXIST
)
2415 /* And link it up from the original place. Note that if a mount namespace is going to be
2416 * used, then this symlink remains on the host, and a new one for the child namespace will
2417 * be created later. */
2418 r
= symlink_idempotent(pp
, p
, true);
2423 _cleanup_free_
char *target
= NULL
;
2425 if (type
!= EXEC_DIRECTORY_CONFIGURATION
&&
2426 readlink_and_make_absolute(p
, &target
) >= 0) {
2427 _cleanup_free_
char *q
= NULL
, *q_resolved
= NULL
, *target_resolved
= NULL
;
2429 /* This already exists and is a symlink? Interesting. Maybe it's one created
2430 * by DynamicUser=1 (see above)?
2432 * We do this for all directory types except for ConfigurationDirectory=,
2433 * since they all support the private/ symlink logic at least in some
2434 * configurations, see above. */
2436 r
= chase_symlinks(target
, NULL
, 0, &target_resolved
, NULL
);
2440 q
= path_join(params
->prefix
[type
], "private", context
->directories
[type
].items
[i
].path
);
2446 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2447 r
= chase_symlinks(q
, NULL
, CHASE_NONEXISTENT
, &q_resolved
, NULL
);
2451 if (path_equal(q_resolved
, target_resolved
)) {
2453 /* Hmm, apparently DynamicUser= was once turned on for this service,
2454 * but is no longer. Let's move the directory back up. */
2456 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2457 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2458 exec_directory_type_to_string(type
), q
, p
);
2460 if (unlink(p
) < 0) {
2465 if (rename(q
, p
) < 0) {
2472 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2477 if (type
== EXEC_DIRECTORY_CONFIGURATION
) {
2480 /* Don't change the owner/access mode of the configuration directory,
2481 * as in the common case it is not written to by a service, and shall
2482 * not be writable. */
2484 if (stat(p
, &st
) < 0) {
2489 /* Still complain if the access mode doesn't match */
2490 if (((st
.st_mode
^ context
->directories
[type
].mode
) & 07777) != 0)
2491 log_warning("%s \'%s\' already exists but the mode is different. "
2492 "(File system: %o %sMode: %o)",
2493 exec_directory_type_to_string(type
), context
->directories
[type
].items
[i
].path
,
2494 st
.st_mode
& 07777, exec_directory_type_to_string(type
), context
->directories
[type
].mode
& 07777);
2501 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2502 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2503 * current UID/GID ownership.) */
2504 r
= chmod_and_chown(pp
?: p
, context
->directories
[type
].mode
, UID_INVALID
, GID_INVALID
);
2508 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2509 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2510 * assignments to exist. */
2511 r
= path_chown_recursive(pp
?: p
, uid
, gid
, context
->dynamic_user
? 01777 : 07777);
2516 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2517 * they are set up later, to allow configuring empty var/run/etc. */
2518 if (!needs_mount_namespace
)
2519 for (size_t i
= 0; i
< context
->directories
[type
].n_items
; i
++) {
2520 r
= create_many_symlinks(params
->prefix
[type
],
2521 context
->directories
[type
].items
[i
].path
,
2522 context
->directories
[type
].items
[i
].symlinks
);
2530 *exit_status
= exit_status_table
[type
];
2534 static int write_credential(
2540 bool ownership_ok
) {
2542 _cleanup_(unlink_and_freep
) char *tmp
= NULL
;
2543 _cleanup_close_
int fd
= -1;
2546 r
= tempfn_random_child("", "cred", &tmp
);
2550 fd
= openat(dfd
, tmp
, O_CREAT
|O_RDWR
|O_CLOEXEC
|O_EXCL
|O_NOFOLLOW
|O_NOCTTY
, 0600);
2556 r
= loop_write(fd
, data
, size
, /* do_poll = */ false);
2560 if (fchmod(fd
, 0400) < 0) /* Take away "w" bit */
2563 if (uid_is_valid(uid
) && uid
!= getuid()) {
2564 r
= fd_add_uid_acl_permission(fd
, uid
, ACL_READ
);
2566 if (!ERRNO_IS_NOT_SUPPORTED(r
) && !ERRNO_IS_PRIVILEGE(r
))
2569 if (!ownership_ok
) /* Ideally we use ACLs, since we can neatly express what we want
2570 * to express: that the user gets read access and nothing
2571 * else. But if the backing fs can't support that (e.g. ramfs)
2572 * then we can use file ownership instead. But that's only safe if
2573 * we can then re-mount the whole thing read-only, so that the
2574 * user can no longer chmod() the file to gain write access. */
2577 if (fchown(fd
, uid
, GID_INVALID
) < 0)
2582 if (renameat(dfd
, tmp
, dfd
, id
) < 0)
2589 static char **credential_search_path(
2590 const ExecParameters
*params
,
2593 _cleanup_strv_free_
char **l
= NULL
;
2597 /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
2598 * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
2599 * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
2602 if (strv_extend(&l
, params
->received_encrypted_credentials_directory
) < 0)
2605 if (strv_extend_strv(&l
, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2609 if (params
->received_credentials_directory
)
2610 if (strv_extend(&l
, params
->received_credentials_directory
) < 0)
2613 if (strv_extend_strv(&l
, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2616 if (DEBUG_LOGGING
) {
2617 _cleanup_free_
char *t
= strv_join(l
, ":");
2619 log_debug("Credential search path is: %s", t
);
2625 static int load_credential(
2626 const ExecContext
*context
,
2627 const ExecParameters
*params
,
2638 ReadFullFileFlags flags
= READ_FULL_FILE_SECURE
|READ_FULL_FILE_FAIL_WHEN_LARGER
;
2639 _cleanup_strv_free_
char **search_path
= NULL
;
2640 _cleanup_(erase_and_freep
) char *data
= NULL
;
2641 _cleanup_free_
char *bindname
= NULL
;
2642 const char *source
= NULL
;
2643 bool missing_ok
= true;
2644 size_t size
, add
, maxsz
;
2652 assert(write_dfd
>= 0);
2655 if (read_dfd
>= 0) {
2656 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2657 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2658 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2661 if (!filename_is_valid(path
)) /* safety check */
2667 } else if (path_is_absolute(path
)) {
2668 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2671 if (!path_is_valid(path
)) /* safety check */
2674 flags
|= READ_FULL_FILE_CONNECT_SOCKET
;
2676 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2677 * via the source socket address in case we read off an AF_UNIX socket. */
2678 if (asprintf(&bindname
, "@%" PRIx64
"/unit/%s/%s", random_u64(), unit
, id
) < 0)
2684 } else if (credential_name_valid(path
)) {
2685 /* If this is a relative path, take it as credential name relative to the credentials
2686 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
2687 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
2689 search_path
= credential_search_path(params
, encrypted
);
2698 flags
|= READ_FULL_FILE_UNBASE64
;
2700 maxsz
= encrypted
? CREDENTIAL_ENCRYPTED_SIZE_MAX
: CREDENTIAL_SIZE_MAX
;
2703 STRV_FOREACH(d
, search_path
) {
2704 _cleanup_free_
char *j
= NULL
;
2706 j
= path_join(*d
, path
);
2710 r
= read_full_file_full(
2711 AT_FDCWD
, j
, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2721 r
= read_full_file_full(
2731 if (r
== -ENOENT
&& (missing_ok
|| hashmap_contains(context
->set_credentials
, id
))) {
2732 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2733 * will get clear errors if we don't pass such a missing credential on as they
2734 * themselves will get ENOENT when trying to read them, which should not be much
2735 * worse than when we handle the error here and make it fatal.
2737 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2738 * we are fine, too. */
2739 log_debug_errno(r
, "Couldn't read inherited credential '%s', skipping: %m", path
);
2743 return log_debug_errno(r
, "Failed to read credential '%s': %m", path
);
2746 _cleanup_free_
void *plaintext
= NULL
;
2747 size_t plaintext_size
= 0;
2749 r
= decrypt_credential_and_warn(id
, now(CLOCK_REALTIME
), NULL
, NULL
, data
, size
, &plaintext
, &plaintext_size
);
2753 free_and_replace(data
, plaintext
);
2754 size
= plaintext_size
;
2757 add
= strlen(id
) + size
;
2761 r
= write_credential(write_dfd
, id
, data
, size
, uid
, ownership_ok
);
2763 return log_debug_errno(r
, "Failed to write credential '%s': %m", id
);
2769 struct load_cred_args
{
2770 const ExecContext
*context
;
2771 const ExecParameters
*params
;
2780 static int load_cred_recurse_dir_cb(
2781 RecurseDirEvent event
,
2785 const struct dirent
*de
,
2786 const struct statx
*sx
,
2789 struct load_cred_args
*args
= ASSERT_PTR(userdata
);
2790 _cleanup_free_
char *sub_id
= NULL
;
2793 if (event
!= RECURSE_DIR_ENTRY
)
2794 return RECURSE_DIR_CONTINUE
;
2796 if (!IN_SET(de
->d_type
, DT_REG
, DT_SOCK
))
2797 return RECURSE_DIR_CONTINUE
;
2799 sub_id
= strreplace(path
, "/", "_");
2803 if (!credential_name_valid(sub_id
))
2804 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL
), "Credential would get ID %s, which is not valid, refusing", sub_id
);
2806 if (faccessat(args
->dfd
, sub_id
, F_OK
, AT_SYMLINK_NOFOLLOW
) >= 0) {
2807 log_debug("Skipping credential with duplicated ID %s at %s", sub_id
, path
);
2808 return RECURSE_DIR_CONTINUE
;
2810 if (errno
!= ENOENT
)
2811 return log_debug_errno(errno
, "Failed to test if credential %s exists: %m", sub_id
);
2813 r
= load_credential(
2828 return RECURSE_DIR_CONTINUE
;
/* Fills the credential directory of a unit.
 *
 * The directory is opened as dfd. In a first pass every entry of context->load_credentials is loaded:
 * absolute paths are first tried as directories (and walked with load_cred_recurse_dir_cb()), anything
 * else goes through load_credential(). In a second pass the literal entries of context->set_credentials
 * are written with write_credential(), decrypting them first when marked encrypted, and skipping IDs
 * that already exist. Finally the directory mode is set to 0500 and, if uid is valid and differs from
 * our own, fd_add_uid_acl_permission() and fchown() are used on the directory for that uid.
 *
 * ownership_ok is forwarded to the credential writers. 'left' starts out at CREDENTIALS_TOTAL_SIZE_MAX;
 * presumably it tracks the remaining size budget - confirm against load_credential(). */
2831 static int acquire_credentials(
2832 const ExecContext
*context
,
2833 const ExecParameters
*params
,
2837 bool ownership_ok
) {
2839 uint64_t left
= CREDENTIALS_TOTAL_SIZE_MAX
;
2840 _cleanup_close_
int dfd
= -1;
2841 ExecLoadCredential
*lc
;
2842 ExecSetCredential
*sc
;
2848 dfd
= open(p
, O_DIRECTORY
|O_CLOEXEC
);
2852 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2853 HASHMAP_FOREACH(lc
, context
->load_credentials
) {
2854 _cleanup_close_
int sub_fd
= -1;
2856 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
2857 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
2858 * a regular file. Finally, if it's a relative path we will use it as a credential name to
2859 * propagate a credential passed to us from further up. */
2861 if (path_is_absolute(lc
->path
)) {
2862 sub_fd
= open(lc
->path
, O_DIRECTORY
|O_CLOEXEC
|O_RDONLY
);
2863 if (sub_fd
< 0 && !IN_SET(errno
,
2864 ENOTDIR
, /* Not a directory */
2865 ENOENT
)) /* Doesn't exist? */
2866 return log_debug_errno(errno
, "Failed to open '%s': %m", lc
->path
);
2870 /* Regular file (incl. a credential passed in from higher up) */
2871 r
= load_credential(
2887 /* path= */ lc
->id
, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
2888 /* statx_mask= */ 0,
2889 /* n_depth_max= */ UINT_MAX
,
2890 RECURSE_DIR_SORT
|RECURSE_DIR_IGNORE_DOT
|RECURSE_DIR_ENSURE_TYPE
,
2891 load_cred_recurse_dir_cb
,
2892 &(struct load_cred_args
) {
2895 .encrypted
= lc
->encrypted
,
2899 .ownership_ok
= ownership_ok
,
2906 /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
2907 * them, so that they can act as a "default" if the same credential is specified multiple times. */
2908 HASHMAP_FOREACH(sc
, context
->set_credentials
) {
2909 _cleanup_(erase_and_freep
) void *plaintext
= NULL
;
2913 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
2914 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
2915 * slow and involved, hence it's nice to be able to skip that if the credential already
2917 if (faccessat(dfd
, sc
->id
, F_OK
, AT_SYMLINK_NOFOLLOW
) >= 0)
2919 if (errno
!= ENOENT
)
2920 return log_debug_errno(errno
, "Failed to test if credential %s exists: %m", sc
->id
);
2922 if (sc
->encrypted
) {
2923 r
= decrypt_credential_and_warn(sc
->id
, now(CLOCK_REALTIME
), NULL
, NULL
, sc
->data
, sc
->size
, &plaintext
, &size
);
2933 add
= strlen(sc
->id
) + size
;
2937 r
= write_credential(dfd
, sc
->id
, data
, size
, uid
, ownership_ok
);
2944 if (fchmod(dfd
, 0500) < 0) /* Now take away the "w" bit */
2947 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2950 if (uid_is_valid(uid
) && uid
!= getuid()) {
2951 r
= fd_add_uid_acl_permission(dfd
, uid
, ACL_READ
| ACL_EXECUTE
);
2953 if (!ERRNO_IS_NOT_SUPPORTED(r
) && !ERRNO_IS_PRIVILEGE(r
))
2959 if (fchown(dfd
, uid
, GID_INVALID
) < 0)
/* Assembles the credential store for one unit in 'workspace' and publishes it at 'final'.
 *
 * workspace_mounted is a tri-state: negative while it is unknown whether something is or can be mounted
 * on the workspace, true once a mount is in place, false once it is clear nothing can be mounted.
 *
 * Visible strategy: an already mounted workspace may be kept (reuse_workspace); if 'final' already is a
 * mount point it is bind mounted onto the workspace and made writable; otherwise a "ramfs" mount is
 * tried first, then a size-limited "tmpfs", then a bind mount of 'final'. If bind mounting fails for
 * lack of privileges and must_mount is false, the plain 'final' directory is used instead.
 *
 * acquire_credentials() then populates whichever location was chosen. A mounted workspace is remounted
 * read-only afterwards and moved to 'final' with MS_MOVE; in the plain directory case the parent of
 * 'final' is chmod()ed to 0755. */
2967 static int setup_credentials_internal(
2968 const ExecContext
*context
,
2969 const ExecParameters
*params
,
2971 const char *final
, /* This is where the credential store shall eventually end up at */
2972 const char *workspace
, /* This is where we can prepare it before moving it to the final place */
2973 bool reuse_workspace
, /* Whether to reuse any existing workspace mount if it already is a mount */
2974 bool must_mount
, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2977 int r
, workspace_mounted
; /* negative if we don't know yet whether we have/can mount something; true
2978 * if we mounted something; false if we definitely can't mount anything */
2986 if (reuse_workspace
) {
2987 r
= path_is_mount_point(workspace
, NULL
, 0);
2991 workspace_mounted
= true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2993 workspace_mounted
= -1; /* We need to figure out if we can mount something to the workspace */
2995 workspace_mounted
= -1; /* ditto */
2997 r
= path_is_mount_point(final
, NULL
, 0);
3001 /* If the final place already has something mounted, we use that. If the workspace also has
3002 * something mounted we assume it's actually the same mount (but with MS_RDONLY
3004 final_mounted
= true;
3006 if (workspace_mounted
< 0) {
3007 /* If the final place is mounted, but the workspace isn't, then let's bind mount
3008 * the final version to the workspace, and make it writable, so that we can make
3011 r
= mount_nofollow_verbose(LOG_DEBUG
, final
, workspace
, NULL
, MS_BIND
|MS_REC
, NULL
);
3015 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, workspace
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, NULL
);
3019 workspace_mounted
= true;
3022 final_mounted
= false;
3024 if (workspace_mounted
< 0) {
3025 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3026 for (int try = 0;; try++) {
3029 /* Try "ramfs" first, since it's not swap backed */
3030 r
= mount_nofollow_verbose(LOG_DEBUG
, "ramfs", workspace
, "ramfs", MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, "mode=0700");
3032 workspace_mounted
= true;
3036 } else if (try == 1) {
3037 _cleanup_free_
char *opts
= NULL
;
3039 if (asprintf(&opts
, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX
) < 0)
3042 /* Fall back to "tmpfs" otherwise */
3043 r
= mount_nofollow_verbose(LOG_DEBUG
, "tmpfs", workspace
, "tmpfs", MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, opts
);
3045 workspace_mounted
= true;
3050 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3051 r
= mount_nofollow_verbose(LOG_DEBUG
, final
, workspace
, NULL
, MS_BIND
|MS_REC
, NULL
);
3053 if (!ERRNO_IS_PRIVILEGE(r
)) /* Propagate anything that isn't a permission problem */
3056 if (must_mount
) /* If it's not OK to use the plain directory
3057 * fallback, propagate all errors too */
3060 /* If we lack privileges to bind mount stuff, then let's gracefully
3061 * proceed for compat with container envs, and just use the final dir
3064 workspace_mounted
= false;
3068 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3069 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, workspace
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, NULL
);
3073 workspace_mounted
= true;
3079 assert(!must_mount
|| workspace_mounted
> 0);
3080 where
= workspace_mounted
? workspace
: final
;
3082 (void) label_fix_full(AT_FDCWD
, where
, final
, 0);
3084 r
= acquire_credentials(context
, params
, unit
, where
, uid
, workspace_mounted
);
3088 if (workspace_mounted
) {
3089 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3090 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, workspace
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, NULL
);
3094 /* And mount it to the final place, read-only */
3096 r
= umount_verbose(LOG_DEBUG
, workspace
, MNT_DETACH
|UMOUNT_NOFOLLOW
);
3098 r
= mount_nofollow_verbose(LOG_DEBUG
, workspace
, final
, NULL
, MS_MOVE
, NULL
);
3102 _cleanup_free_
char *parent
= NULL
;
3104 /* If we do not have our own mount but used the plain directory fallback, then we need to
3105 * open access to the top-level credential directory and the per-service directory now */
3107 r
= path_extract_directory(final
, &parent
);
3110 if (chmod(parent
, 0755) < 0)
/* Entry point for setting up a unit's credential store below the runtime directory prefix.
 *
 * Checks that the context has credentials and that a runtime prefix is configured, creates
 * "<prefix>/credentials" (mode 0755) and the per-unit directory below it (mode 0700), and then forks a
 * helper process "(sd-mkdcreds)" in a new mount namespace. In the child, propagation is turned off on
 * /dev and setup_credentials_internal() is run with /dev/shm as workspace, insisting on a mount. If the
 * fork fails because of missing privileges or support, the store is instead prepared in a per-unit
 * directory below "<prefix>/systemd/temporary-credentials", where falling back to a plain directory is
 * acceptable; that workspace directory is removed again afterwards when possible. */
3117 static int setup_credentials(
3118 const ExecContext
*context
,
3119 const ExecParameters
*params
,
3123 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
3129 if (!exec_context_has_credentials(context
))
3132 if (!params
->prefix
[EXEC_DIRECTORY_RUNTIME
])
3135 /* This is where we'll place stuff when we are done; this main credentials directory is world-readable,
3136 * and the subdir we mount over with a read-only file system readable by the service's user */
3137 q
= path_join(params
->prefix
[EXEC_DIRECTORY_RUNTIME
], "credentials");
3141 r
= mkdir_label(q
, 0755); /* top-level dir: world readable/searchable */
3142 if (r
< 0 && r
!= -EEXIST
)
3145 p
= path_join(q
, unit
);
3149 r
= mkdir_label(p
, 0700); /* per-unit dir: private to user */
3150 if (r
< 0 && r
!= -EEXIST
)
3153 r
= safe_fork("(sd-mkdcreds)", FORK_DEATHSIG
|FORK_WAIT
|FORK_NEW_MOUNTNS
, NULL
);
3155 _cleanup_free_
char *t
= NULL
, *u
= NULL
;
3157 /* If this is not a privilege or support issue then propagate the error */
3158 if (!ERRNO_IS_NOT_SUPPORTED(r
) && !ERRNO_IS_PRIVILEGE(r
))
3161 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3162 * it into place, so that users can't access half-initialized credential stores. */
3163 t
= path_join(params
->prefix
[EXEC_DIRECTORY_RUNTIME
], "systemd/temporary-credentials");
3167 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3168 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3169 * after it is fully set up */
3170 u
= path_join(t
, unit
);
3174 FOREACH_STRING(i
, t
, u
) {
3175 r
= mkdir_label(i
, 0700);
3176 if (r
< 0 && r
!= -EEXIST
)
3180 r
= setup_credentials_internal(
3184 p
, /* final mount point */
3185 u
, /* temporary workspace to overmount */
3186 true, /* reuse the workspace if it is already a mount */
3187 false, /* it's OK to fall back to a plain directory if we can't mount anything */
3190 (void) rmdir(u
); /* remove the workspace again if we can. */
3195 } else if (r
== 0) {
3197 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3198 * we can use the same directory for all cases, after turning off propagation. Question
3199 * though is: where do we turn off propagation exactly, and where do we place the workspace
3200 * directory? We need some place that is guaranteed to be a mount point in the host, and
3201 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3202 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3203 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3204 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3205 * we do here instead is use /dev/ and /dev/shm/ for our purposes. We know for sure that
3206 * /dev/ is a mount point and we know for sure that /dev/shm/ exists. Hence we can turn off
3207 * propagation on the former, and then overmount the latter.
3209 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3210 * for this purpose, but there are few other candidates that work equally well for us, and
3211 * given that we do this in a privately namespaced short-lived single-threaded process
3212 * that no one else sees this should be OK to do. */
3214 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, "/dev", NULL
, MS_SLAVE
|MS_REC
, NULL
); /* Turn off propagation from our namespace to host */
3218 r
= setup_credentials_internal(
3222 p
, /* final mount point */
3223 "/dev/shm", /* temporary workspace to overmount */
3224 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3225 true, /* insist that something is mounted, do not allow fallback to plain directory */
3230 _exit(EXIT_SUCCESS
);
3233 _exit(EXIT_FAILURE
);
/* Applies the SMACK process label for the calling process (pid argument 0).
 *
 * If context->smack_process_label is set, that label is applied. Otherwise, if the manager carries a
 * default_smack_process_label, the SMACK_ATTR_EXEC label is read from executable_fd and applied, with
 * the manager default used when no such label was read. -ENODATA and -EOPNOTSUPP from reading the label
 * are excluded from the error check. executable_fd must be a valid (non-negative) fd. */
3240 static int setup_smack(
3241 const Manager
*manager
,
3242 const ExecContext
*context
,
3243 int executable_fd
) {
3247 assert(executable_fd
>= 0);
3249 if (context
->smack_process_label
) {
3250 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
3253 } else if (manager
->default_smack_process_label
) {
3254 _cleanup_free_
char *exec_label
= NULL
;
3256 r
= mac_smack_read_fd(executable_fd
, SMACK_ATTR_EXEC
, &exec_label
);
3257 if (r
< 0 && !IN_SET(r
, -ENODATA
, -EOPNOTSUPP
))
3260 r
= mac_smack_apply_pid(0, exec_label
? : manager
->default_smack_process_label
);
/* Compiles the list of bind mounts to establish for a unit.
 *
 * The result holds duplicates of the entries in context->bind_mounts (source, destination and the
 * read_only/recursive/ignore_enoent flags), followed by one entry for every configured exec directory
 * of each directory type that has a prefix in params. For directory types that are private and used
 * without a root fs, "<prefix>/private" is additionally appended to the empty directories list.
 *
 * Outputs: *ret_bind_mounts / *ret_n_bind_mounts receive the array and its length, and
 * *ret_empty_directories the string list. All three are reset to NULL/0 on one visible path (the
 * guarding condition is not shown here - presumably the "nothing to do" case, TODO confirm).
 * bind_mount_free_many() releases the h entries filled in so far on the cleanup path. */
3269 static int compile_bind_mounts(
3270 const ExecContext
*context
,
3271 const ExecParameters
*params
,
3272 BindMount
**ret_bind_mounts
,
3273 size_t *ret_n_bind_mounts
,
3274 char ***ret_empty_directories
) {
3276 _cleanup_strv_free_
char **empty_directories
= NULL
;
3277 BindMount
*bind_mounts
;
3283 assert(ret_bind_mounts
);
3284 assert(ret_n_bind_mounts
);
3285 assert(ret_empty_directories
);
/* Count the entries up front so that the array can be allocated in one go */
3287 n
= context
->n_bind_mounts
;
3288 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
3289 if (!params
->prefix
[t
])
3292 n
+= context
->directories
[t
].n_items
;
3296 *ret_bind_mounts
= NULL
;
3297 *ret_n_bind_mounts
= 0;
3298 *ret_empty_directories
= NULL
;
3302 bind_mounts
= new(BindMount
, n
);
3306 for (size_t i
= 0; i
< context
->n_bind_mounts
; i
++) {
3307 BindMount
*item
= context
->bind_mounts
+ i
;
3310 s
= strdup(item
->source
);
3316 d
= strdup(item
->destination
);
3323 bind_mounts
[h
++] = (BindMount
) {
3326 .read_only
= item
->read_only
,
3327 .recursive
= item
->recursive
,
3328 .ignore_enoent
= item
->ignore_enoent
,
3332 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
3333 if (!params
->prefix
[t
])
3336 if (context
->directories
[t
].n_items
== 0)
3339 if (exec_directory_is_private(context
, t
) &&
3340 !exec_context_with_rootfs(context
)) {
3343 /* So this is for a dynamic user, and we need to make sure the process can access its own
3344 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3345 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3347 private_root
= path_join(params
->prefix
[t
], "private");
3348 if (!private_root
) {
3353 r
= strv_consume(&empty_directories
, private_root
);
3358 for (size_t i
= 0; i
< context
->directories
[t
].n_items
; i
++) {
3361 if (exec_directory_is_private(context
, t
))
3362 s
= path_join(params
->prefix
[t
], "private", context
->directories
[t
].items
[i
].path
);
3364 s
= path_join(params
->prefix
[t
], context
->directories
[t
].items
[i
].path
);
3370 if (exec_directory_is_private(context
, t
) &&
3371 exec_context_with_rootfs(context
))
3372 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3373 * directory is not created on the root directory. So, let's bind-mount the directory
3374 * on the 'non-private' place. */
3375 d
= path_join(params
->prefix
[t
], context
->directories
[t
].items
[i
].path
);
3384 bind_mounts
[h
++] = (BindMount
) {
3388 .nosuid
= context
->dynamic_user
, /* don't allow suid/sgid when DynamicUser= is on */
3390 .ignore_enoent
= false,
3397 *ret_bind_mounts
= bind_mounts
;
3398 *ret_n_bind_mounts
= n
;
3399 *ret_empty_directories
= TAKE_PTR(empty_directories
);
3404 bind_mount_free_many(bind_mounts
, h
);
/* Collects the symlinks to create for the unit's exec directories.
 *
 * For every configured directory item, each entry of its 'symlinks' list yields a pair
 * (<prefix>/<item path>, <prefix>/<symlink>). In addition, for directory types that are private and
 * used without a root fs, the pair (<prefix>/private/<item path>, <prefix>/<item path>) is added.
 * The resulting string list is handed to the caller through *ret_symlinks. */
3408 /* ret_symlinks will contain a list of pairs src:dest that describes
3409 * the symlinks to create later on. For example, the symlinks needed
3410 * to safely give private directories to DynamicUser=1 users. */
3411 static int compile_symlinks(
3412 const ExecContext
*context
,
3413 const ExecParameters
*params
,
3414 char ***ret_symlinks
) {
3416 _cleanup_strv_free_
char **symlinks
= NULL
;
3421 assert(ret_symlinks
);
3423 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
3424 for (size_t i
= 0; i
< context
->directories
[dt
].n_items
; i
++) {
3425 _cleanup_free_
char *private_path
= NULL
, *path
= NULL
;
3427 STRV_FOREACH(symlink
, context
->directories
[dt
].items
[i
].symlinks
) {
3428 _cleanup_free_
char *src_abs
= NULL
, *dst_abs
= NULL
;
3430 src_abs
= path_join(params
->prefix
[dt
], context
->directories
[dt
].items
[i
].path
);
3431 dst_abs
= path_join(params
->prefix
[dt
], *symlink
);
3432 if (!src_abs
|| !dst_abs
)
/* Ownership of both strings moves into the list */
3435 r
= strv_consume_pair(&symlinks
, TAKE_PTR(src_abs
), TAKE_PTR(dst_abs
));
3440 if (!exec_directory_is_private(context
, dt
) || exec_context_with_rootfs(context
))
3443 private_path
= path_join(params
->prefix
[dt
], "private", context
->directories
[dt
].items
[i
].path
);
3447 path
= path_join(params
->prefix
[dt
], context
->directories
[dt
].items
[i
].path
);
3451 r
= strv_consume_pair(&symlinks
, TAKE_PTR(private_path
), TAKE_PTR(path
));
3457 *ret_symlinks
= TAKE_PTR(symlinks
);
/* Predicate over the unit's file system related settings. The visible tests cover: temporary file
 * systems, a root directory or root image, mount images, dynamic user, extension images or extension
 * directories, bind mounts whose source and destination differ, and a log namespace.
 * bind_mounts may only be NULL when n_bind_mounts is 0. */
3462 static bool insist_on_sandboxing(
3463 const ExecContext
*context
,
3464 const char *root_dir
,
3465 const char *root_image
,
3466 const BindMount
*bind_mounts
,
3467 size_t n_bind_mounts
) {
3470 assert(n_bind_mounts
== 0 || bind_mounts
);
3472 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3473 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3474 * rearrange stuff in a way we cannot ignore gracefully. */
3476 if (context
->n_temporary_filesystems
> 0)
3479 if (root_dir
|| root_image
)
3482 if (context
->n_mount_images
> 0)
3485 if (context
->dynamic_user
)
3488 if (context
->n_extension_images
> 0 || !strv_isempty(context
->extension_directories
))
3491 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3493 for (size_t i
= 0; i
< n_bind_mounts
; i
++)
3494 if (!path_equal(bind_mounts
[i
].source
, bind_mounts
[i
].destination
))
3497 if (context
->log_namespace
)
/* Collects all inputs for setup_namespace() for this unit and invokes it.
 *
 * Visible steps: pick root image / root directory from the context when EXEC_APPLY_CHROOT is set;
 * compile bind mounts, empty directories and symlinks; when sandboxing applies (EXEC_APPLY_SANDBOXING
 * set and the command is not EXEC_COMMAND_FULLY_PRIVILEGED) derive the private tmp directories from
 * 'runtime' and fill a NamespaceInfo from the context's protection settings, otherwise use a relaxed or
 * empty NamespaceInfo; compute the credentials, propagate, incoming and extension directories; call
 * setup_namespace(). The read-only, inaccessible, exec and no-exec path lists are only passed when
 * sandboxing applies.
 *
 * If namespace setup fails with the special error described below, insist_on_sandboxing() decides
 * between refusing to continue and ignoring the failure. The compiled bind mounts are released with
 * bind_mount_free_many() before returning. */
3503 static int apply_mount_namespace(
3505 ExecCommandFlags command_flags
,
3506 const ExecContext
*context
,
3507 const ExecParameters
*params
,
3508 const ExecRuntime
*runtime
,
3509 char **error_path
) {
3511 _cleanup_strv_free_
char **empty_directories
= NULL
, **symlinks
= NULL
;
3512 const char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
3513 const char *root_dir
= NULL
, *root_image
= NULL
;
3514 _cleanup_free_
char *creds_path
= NULL
, *incoming_dir
= NULL
, *propagate_dir
= NULL
,
3515 *extension_dir
= NULL
;
3516 NamespaceInfo ns_info
;
3517 bool needs_sandboxing
;
3518 BindMount
*bind_mounts
= NULL
;
3519 size_t n_bind_mounts
= 0;
3524 if (params
->flags
& EXEC_APPLY_CHROOT
) {
3525 root_image
= context
->root_image
;
3528 root_dir
= context
->root_directory
;
3531 r
= compile_bind_mounts(context
, params
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
3535 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
3536 r
= compile_symlinks(context
, params
, &symlinks
);
3540 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command_flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
3541 if (needs_sandboxing
) {
3542 /* The runtime struct only contains the parent of the private /tmp,
3543 * which is non-accessible to world users. Inside of it there's a /tmp
3544 * that is sticky, and that's the one we want to use here.
3545 * This does not apply when we are using /run/systemd/empty as fallback. */
3547 if (context
->private_tmp
&& runtime
) {
3548 if (streq_ptr(runtime
->tmp_dir
, RUN_SYSTEMD_EMPTY
))
3549 tmp_dir
= runtime
->tmp_dir
;
3550 else if (runtime
->tmp_dir
)
3551 tmp_dir
= strjoina(runtime
->tmp_dir
, "/tmp");
3553 if (streq_ptr(runtime
->var_tmp_dir
, RUN_SYSTEMD_EMPTY
))
3554 var_tmp_dir
= runtime
->var_tmp_dir
;
3555 else if (runtime
->var_tmp_dir
)
3556 var_tmp_dir
= strjoina(runtime
->var_tmp_dir
, "/tmp");
3559 ns_info
= (NamespaceInfo
) {
3560 .ignore_protect_paths
= false,
3561 .private_dev
= context
->private_devices
,
3562 .protect_control_groups
= context
->protect_control_groups
,
3563 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
3564 .protect_kernel_modules
= context
->protect_kernel_modules
,
3565 .protect_kernel_logs
= context
->protect_kernel_logs
,
3566 .protect_hostname
= context
->protect_hostname
,
3567 .mount_apivfs
= exec_context_get_effective_mount_apivfs(context
),
3568 .private_mounts
= context
->private_mounts
,
3569 .protect_home
= context
->protect_home
,
3570 .protect_system
= context
->protect_system
,
3571 .protect_proc
= context
->protect_proc
,
3572 .proc_subset
= context
->proc_subset
,
3573 .private_ipc
= context
->private_ipc
|| context
->ipc_namespace_path
,
3574 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3575 .mount_nosuid
= context
->no_new_privileges
&& !mac_selinux_use(),
3577 } else if (!context
->dynamic_user
&& root_dir
)
3579 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3580 * sandbox info, otherwise enforce it, don't ignore protected paths and
3581 * fail if we are enable to apply the sandbox inside the mount namespace.
3583 ns_info
= (NamespaceInfo
) {
3584 .ignore_protect_paths
= true,
3587 ns_info
= (NamespaceInfo
) {};
3589 if (context
->mount_flags
== MS_SHARED
)
3590 log_unit_debug(u
, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3592 if (exec_context_has_credentials(context
) &&
3593 params
->prefix
[EXEC_DIRECTORY_RUNTIME
] &&
3594 FLAGS_SET(params
->flags
, EXEC_WRITE_CREDENTIALS
)) {
3595 creds_path
= path_join(params
->prefix
[EXEC_DIRECTORY_RUNTIME
], "credentials", u
->id
);
3602 if (MANAGER_IS_SYSTEM(u
->manager
)) {
3603 propagate_dir
= path_join("/run/systemd/propagate/", u
->id
);
3604 if (!propagate_dir
) {
3609 incoming_dir
= strdup("/run/systemd/incoming");
3610 if (!incoming_dir
) {
3615 extension_dir
= strdup("/run/systemd/unit-extensions");
3616 if (!extension_dir
) {
3621 if (asprintf(&extension_dir
, "/run/user/" UID_FMT
"/systemd/unit-extensions", geteuid()) < 0) {
3626 r
= setup_namespace(root_dir
, root_image
, context
->root_image_options
,
3627 &ns_info
, context
->read_write_paths
,
3628 needs_sandboxing
? context
->read_only_paths
: NULL
,
3629 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
3630 needs_sandboxing
? context
->exec_paths
: NULL
,
3631 needs_sandboxing
? context
->no_exec_paths
: NULL
,
3636 context
->temporary_filesystems
,
3637 context
->n_temporary_filesystems
,
3638 context
->mount_images
,
3639 context
->n_mount_images
,
3643 context
->log_namespace
,
3644 context
->mount_flags
,
3645 context
->root_hash
, context
->root_hash_size
, context
->root_hash_path
,
3646 context
->root_hash_sig
, context
->root_hash_sig_size
, context
->root_hash_sig_path
,
3647 context
->root_verity
,
3648 context
->extension_images
,
3649 context
->n_extension_images
,
3650 context
->extension_directories
,
3654 root_dir
|| root_image
? params
->notify_socket
: NULL
,
3657 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3658 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3659 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3660 * completely different execution environment. */
3662 if (insist_on_sandboxing(
3664 root_dir
, root_image
,
3667 log_unit_debug(u
, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3668 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3669 n_bind_mounts
, context
->n_temporary_filesystems
, yes_no(root_dir
), yes_no(root_image
), yes_no(context
->dynamic_user
));
3673 log_unit_debug(u
, "Failed to set up namespace, assuming containerized execution and ignoring.");
3679 bind_mount_free_many(bind_mounts
, n_bind_mounts
);
/* Changes into the working directory configured for the unit.
 *
 * The target is context->working_directory (with an empty value mapped to the root by
 * empty_to_root()); context->working_directory_home selects a separate branch for the home directory.
 * Depending on EXEC_APPLY_CHROOT the path is used as is or prefixed with context->root_directory.
 * A failing chdir() is tolerated when context->working_directory_missing_ok is set; otherwise
 * *exit_status is set to EXIT_CHDIR. exit_status must not be NULL. */
3683 static int apply_working_directory(
3684 const ExecContext
*context
,
3685 const ExecParameters
*params
,
3692 assert(exit_status
);
3694 if (context
->working_directory_home
) {
3697 *exit_status
= EXIT_CHDIR
;
3704 wd
= empty_to_root(context
->working_directory
);
3706 if (params
->flags
& EXEC_APPLY_CHROOT
)
3709 d
= prefix_roota(context
->root_directory
, wd
);
3711 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
3712 *exit_status
= EXIT_CHDIR
;
/* Performs a plain chroot() into context->root_directory. This only happens when EXEC_APPLY_CHROOT is
 * set in params->flags, no mount namespace is in use (needs_mount_ns is false) and a root directory is
 * configured. If chroot() fails, *exit_status is set to EXIT_CHROOT. exit_status must not be NULL. */
3719 static int apply_root_directory(
3720 const ExecContext
*context
,
3721 const ExecParameters
*params
,
3722 const bool needs_mount_ns
,
3726 assert(exit_status
);
3728 if (params
->flags
& EXEC_APPLY_CHROOT
)
3729 if (!needs_mount_ns
&& context
->root_directory
)
3730 if (chroot(context
->root_directory
) < 0) {
3731 *exit_status
= EXIT_CHROOT
;
/* Sets up the kernel session keyring for the service process.
 *
 * context->keyring_mode is checked against EXEC_KEYRING_INHERIT first (see the comment below for the
 * rationale of creating a keyring at all). Otherwise the real GID/UID are temporarily switched to
 * gid/uid (when valid and different from the current ones), a new session keyring is joined, and for
 * EXEC_KEYRING_SHARED the user keyring is linked into it. The original IDs are then restored and, if
 * the unit has a non-null invocation ID, it is stored as a "user" key named "invocation_id" whose
 * permissions are restricted to view/read/search for possessor and user.
 *
 * ENOSYS, privilege errors and EDQUOT from joining the keyring are only logged at debug level. The
 * final block reverts UID/GID once more without logging, so that IDs are restored on error paths too. */
3738 static int setup_keyring(
3740 const ExecContext
*context
,
3741 const ExecParameters
*p
,
3742 uid_t uid
, gid_t gid
) {
3744 key_serial_t keyring
;
3753 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3754 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3755 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3756 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3757 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3758 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3760 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
3763 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3764 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3765 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3766 * & group is just as nasty as acquiring a reference to the user keyring. */
3768 saved_uid
= getuid();
3769 saved_gid
= getgid();
3771 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
3772 if (setregid(gid
, -1) < 0)
3773 return log_unit_error_errno(u
, errno
, "Failed to change GID for user keyring: %m");
3776 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
3777 if (setreuid(uid
, -1) < 0) {
3778 r
= log_unit_error_errno(u
, errno
, "Failed to change UID for user keyring: %m");
3783 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
3784 if (keyring
== -1) {
3785 if (errno
== ENOSYS
)
3786 log_unit_debug_errno(u
, errno
, "Kernel keyring not supported, ignoring.");
3787 else if (ERRNO_IS_PRIVILEGE(errno
))
3788 log_unit_debug_errno(u
, errno
, "Kernel keyring access prohibited, ignoring.");
3789 else if (errno
== EDQUOT
)
3790 log_unit_debug_errno(u
, errno
, "Out of kernel keyrings to allocate, ignoring.");
3792 r
= log_unit_error_errno(u
, errno
, "Setting up kernel keyring failed: %m");
3797 /* When requested link the user keyring into the session keyring. */
3798 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
3800 if (keyctl(KEYCTL_LINK
,
3801 KEY_SPEC_USER_KEYRING
,
3802 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
3803 r
= log_unit_error_errno(u
, errno
, "Failed to link user keyring into session keyring: %m");
3808 /* Restore uid/gid back */
3809 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
3810 if (setreuid(saved_uid
, -1) < 0) {
3811 r
= log_unit_error_errno(u
, errno
, "Failed to change UID back for user keyring: %m");
3816 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
3817 if (setregid(saved_gid
, -1) < 0)
3818 return log_unit_error_errno(u
, errno
, "Failed to change GID back for user keyring: %m");
3821 /* Populate the keyring with the invocation ID by default, as original saved_uid. */
3822 if (!sd_id128_is_null(u
->invocation_id
)) {
3825 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
3827 log_unit_debug_errno(u
, errno
, "Failed to add invocation ID to keyring, ignoring: %m");
3829 if (keyctl(KEYCTL_SETPERM
, key
,
3830 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
3831 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
3832 r
= log_unit_error_errno(u
, errno
, "Failed to restrict invocation ID permission: %m");
3837 /* Revert back uid & gid for the last time, and exit */
3838 /* no extra logging, as only the first already reported error matters */
3839 if (getuid() != saved_uid
)
3840 (void) setreuid(saved_uid
, -1);
3842 if (getgid() != saved_gid
)
3843 (void) setregid(saved_gid
, -1);
/* Appends the two fds of 'pair' to 'array'. *n is the current fill level of 'array' and is advanced by
 * one for each fd stored, i.e. by at most two per call. The caller must provide enough room. */
3848 static void append_socket_pair(int *array
, size_t *n
, const int pair
[static 2]) {
3854 array
[(*n
)++] = pair
[0];
3856 array
[(*n
)++] = pair
[1];
/* Closes every open fd except those on a keep list, via close_all_fds().
 *
 * The keep list is assembled on the stack from: params->stdin_fd/stdout_fd/stderr_fd (when >= 0),
 * socket_fd, the n_fds entries of fds[], the netns and ipcns storage socket pairs of 'runtime', the
 * user and group storage socket pairs of 'dcreds', and user_lookup_fd (when >= 0).
 *
 * NOTE(review): counting the stores visible here, the worst case is 3 (stdio) + 1 (socket_fd) + n_fds
 * + 4 pairs * 2 + 1 (user_lookup_fd) = n_fds + 13 entries, while dont_close[] only reserves n_fds + 12
 * slots. Unless one of these can never be present together with the others, the array looks one
 * element short - confirm and enlarge the reservation if so. */
3859 static int close_remaining_fds(
3860 const ExecParameters
*params
,
3861 const ExecRuntime
*runtime
,
3862 const DynamicCreds
*dcreds
,
3865 const int *fds
, size_t n_fds
) {
3867 size_t n_dont_close
= 0;
3868 int dont_close
[n_fds
+ 12];
3872 if (params
->stdin_fd
>= 0)
3873 dont_close
[n_dont_close
++] = params
->stdin_fd
;
3874 if (params
->stdout_fd
>= 0)
3875 dont_close
[n_dont_close
++] = params
->stdout_fd
;
3876 if (params
->stderr_fd
>= 0)
3877 dont_close
[n_dont_close
++] = params
->stderr_fd
;
3880 dont_close
[n_dont_close
++] = socket_fd
;
3882 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
3883 n_dont_close
+= n_fds
;
3887 append_socket_pair(dont_close
, &n_dont_close
, runtime
->netns_storage_socket
);
3888 append_socket_pair(dont_close
, &n_dont_close
, runtime
->ipcns_storage_socket
);
3893 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->user
->storage_socket
);
3895 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->group
->storage_socket
);
3898 if (user_lookup_fd
>= 0)
3899 dont_close
[n_dont_close
++] = user_lookup_fd
;
3901 return close_all_fds(dont_close
, n_dont_close
);
/* Reports the resolved uid/gid of a unit over user_lookup_fd. The message is written with a single
 * writev() of three iovecs: the uid, the gid and the unit's id string. Separate checks exist for a
 * negative user_lookup_fd and for the case where neither uid nor gid is valid. */
3904 static int send_user_lookup(
3912 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3913 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3916 if (user_lookup_fd
< 0)
3919 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
3922 if (writev(user_lookup_fd
,
3924 IOVEC_INIT(&uid
, sizeof(uid
)),
3925 IOVEC_INIT(&gid
, sizeof(gid
)),
3926 IOVEC_INIT_STRING(unit
->id
) }, 3) < 0)
/* Helper for the "working directory is the home directory" setting: checks c->working_directory_home
 * and uses get_home_dir() to obtain a home directory into *buf. 'home' is the in/out pointer for the
 * directory to use (presumably pointed at *buf on success - TODO confirm). */
3932 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
3939 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3944 if (!c
->working_directory_home
)
3947 r
= get_home_dir(buf
);
/* Builds a string list with the full paths of the unit's configured exec directories and returns it in
 * *ret. Paths of private directory types are placed below "<prefix>/private", all others directly
 * below the prefix; EXEC_DIRECTORY_CONFIGURATION gets special treatment in the loop. Only valid for
 * contexts with dynamic_user set (asserted). */
3955 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
3956 _cleanup_strv_free_
char ** list
= NULL
;
3963 assert(c
->dynamic_user
);
3965 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3966 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3969 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
3970 if (t
== EXEC_DIRECTORY_CONFIGURATION
)
3976 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
3979 if (exec_directory_is_private(c
, t
))
3980 e
= path_join(p
->prefix
[t
], "private", c
->directories
[t
].items
[i
].path
);
3982 e
= path_join(p
->prefix
[t
], c
->directories
[t
].items
[i
].path
);
3986 r
= strv_consume(&list
, e
);
3992 *ret
= TAKE_PTR(list
);
/* Determines the cgroup path a process described by 'params' shall be placed in.
 *
 * When EXEC_CONTROL_CGROUP, EXEC_CGROUP_DELEGATE and EXEC_IS_CONTROL are all set in params->flags the
 * path is params->cgroup_path with ".control" appended, otherwise a copy of params->cgroup_path.
 * The return value on the visible success path tells whether the ".control" subcgroup is used. */
3997 static int exec_parameters_get_cgroup_path(const ExecParameters
*params
, char **ret
) {
3998 bool using_subcgroup
;
4004 if (!params
->cgroup_path
)
4007 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4008 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4009 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4010 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4011 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4012 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4013 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4014 * flag, which is only passed for the former statements, not for the latter. */
4016 using_subcgroup
= FLAGS_SET(params
->flags
, EXEC_CONTROL_CGROUP
|EXEC_CGROUP_DELEGATE
|EXEC_IS_CONTROL
);
4017 if (using_subcgroup
)
4018 p
= path_join(params
->cgroup_path
, ".control");
4020 p
= strdup(params
->cgroup_path
);
4025 return using_subcgroup
;
/* Derives a CPU set from the context's NUMA policy: numa_to_cpu_set() converts c->numa_policy into a
 * temporary set, which is then added to *ret with cpu_set_add_all(). If no NUMA node mask is set this
 * is logged at debug level. The temporary set is released automatically on return. */
4028 static int exec_context_cpu_affinity_from_numa(const ExecContext
*c
, CPUSet
*ret
) {
4029 _cleanup_(cpu_set_reset
) CPUSet s
= {};
4035 if (!c
->numa_policy
.nodes
.set
) {
4036 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4040 r
= numa_to_cpu_set(&c
->numa_policy
, &s
);
4046 return cpu_set_add_all(ret
, &s
);
/* Accessor: returns the context's cpu_affinity_from_numa flag. */
4049 bool exec_context_get_cpu_affinity_from_numa(const ExecContext
*c
) {
4052 return c
->cpu_affinity_from_numa
;
/* Records 'fd' in slot *n_fds of the fds[] array (which has room for fds_size entries) and also returns
 * it through *ret_fd. If the fd number is below 3 + *n_fds it is first duplicated with F_DUPFD_CLOEXEC
 * to a number at or above that bound, and the duplicate replaces the original. Requires
 * *n_fds < fds_size. */
4055 static int add_shifted_fd(int *fds
, size_t fds_size
, size_t *n_fds
, int fd
, int *ret_fd
) {
4060 assert(*n_fds
< fds_size
);
4068 if (fd
< 3 + (int) *n_fds
) {
4069 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4070 * the fds we pass to the process (or which are closed only during execve). */
4072 r
= fcntl(fd
, F_DUPFD_CLOEXEC
, 3 + (int) *n_fds
);
4076 close_and_replace(fd
, r
);
4079 *ret_fd
= fds
[*n_fds
] = fd
;
4084 static int exec_child(
4086 const ExecCommand
*command
,
4087 const ExecContext
*context
,
4088 const ExecParameters
*params
,
4089 ExecRuntime
*runtime
,
4090 DynamicCreds
*dcreds
,
4092 const int named_iofds
[static 3],
4094 size_t n_socket_fds
,
4095 size_t n_storage_fds
,
4100 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **joined_exec_search_path
= NULL
, **accum_env
= NULL
, **replaced_argv
= NULL
;
4101 int r
, ngids
= 0, exec_fd
;
4102 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
4103 const char *username
= NULL
, *groupname
= NULL
;
4104 _cleanup_free_
char *home_buffer
= NULL
;
4105 const char *home
= NULL
, *shell
= NULL
;
4106 char **final_argv
= NULL
;
4107 dev_t journal_stream_dev
= 0;
4108 ino_t journal_stream_ino
= 0;
4109 bool userns_set_up
= false;
4110 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4111 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
4112 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
4113 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
4115 _cleanup_free_
char *mac_selinux_context_net
= NULL
;
4116 bool use_selinux
= false;
4119 bool use_smack
= false;
4122 bool use_apparmor
= false;
4124 uid_t saved_uid
= getuid();
4125 gid_t saved_gid
= getgid();
4126 uid_t uid
= UID_INVALID
;
4127 gid_t gid
= GID_INVALID
;
4128 size_t n_fds
= n_socket_fds
+ n_storage_fds
, /* fds to pass to the child */
4129 n_keep_fds
; /* total number of fds not to close */
4131 _cleanup_free_ gid_t
*gids_after_pam
= NULL
;
4132 int ngids_after_pam
= 0;
4138 assert(exit_status
);
4140 /* Explicitly test for CVE-2021-4034 inspired invocations */
4141 assert(command
->path
);
4142 assert(!strv_isempty(command
->argv
));
4144 rename_process_from_path(command
->path
);
4146 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4147 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4148 * both of which will be demoted to SIG_DFL. */
4149 (void) default_signals(SIGNALS_CRASH_HANDLER
,
4152 if (context
->ignore_sigpipe
)
4153 (void) ignore_signals(SIGPIPE
);
4155 r
= reset_signal_mask();
4157 *exit_status
= EXIT_SIGNAL_MASK
;
4158 return log_unit_error_errno(unit
, r
, "Failed to set process signal mask: %m");
4161 if (params
->idle_pipe
)
4162 do_idle_pipe_dance(params
->idle_pipe
);
4164 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4165 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4166 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4167 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4170 log_set_open_when_needed(true);
4172 /* In case anything used libc syslog(), close this here, too */
4175 int keep_fds
[n_fds
+ 3];
4176 memcpy_safe(keep_fds
, fds
, n_fds
* sizeof(int));
4179 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, params
->exec_fd
, &exec_fd
);
4181 *exit_status
= EXIT_FDS
;
4182 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
4186 if (unit
->manager
->restrict_fs
) {
4187 int bpf_map_fd
= lsm_bpf_map_restrict_fs_fd(unit
);
4188 if (bpf_map_fd
< 0) {
4189 *exit_status
= EXIT_FDS
;
4190 return log_unit_error_errno(unit
, bpf_map_fd
, "Failed to get restrict filesystems BPF map fd: %m");
4193 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, bpf_map_fd
, &bpf_map_fd
);
4195 *exit_status
= EXIT_FDS
;
4196 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
4201 r
= close_remaining_fds(params
, runtime
, dcreds
, user_lookup_fd
, socket_fd
, keep_fds
, n_keep_fds
);
4203 *exit_status
= EXIT_FDS
;
4204 return log_unit_error_errno(unit
, r
, "Failed to close unwanted file descriptors: %m");
4207 if (!context
->same_pgrp
&&
4209 *exit_status
= EXIT_SETSID
;
4210 return log_unit_error_errno(unit
, errno
, "Failed to create new process session: %m");
4213 exec_context_tty_reset(context
, params
);
4215 if (unit_shall_confirm_spawn(unit
)) {
4216 _cleanup_free_
char *cmdline
= NULL
;
4218 cmdline
= quote_command_line(command
->argv
, SHELL_ESCAPE_EMPTY
);
4220 *exit_status
= EXIT_MEMORY
;
4224 r
= ask_for_confirmation(context
, params
->confirm_spawn
, unit
, cmdline
);
4225 if (r
!= CONFIRM_EXECUTE
) {
4226 if (r
== CONFIRM_PRETEND_SUCCESS
) {
4227 *exit_status
= EXIT_SUCCESS
;
4230 *exit_status
= EXIT_CONFIRM
;
4231 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ECANCELED
),
4232 "Execution cancelled by the user");
4236 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4237 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4238 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4239 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4240 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4241 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit
->id
, true) != 0 ||
4242 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit
->manager
) ? "system" : "user", true) != 0) {
4243 *exit_status
= EXIT_MEMORY
;
4244 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
4247 if (context
->dynamic_user
&& dcreds
) {
4248 _cleanup_strv_free_
char **suggested_paths
= NULL
;
4250 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4251 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4252 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4253 *exit_status
= EXIT_USER
;
4254 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
4257 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
4259 *exit_status
= EXIT_MEMORY
;
4263 r
= dynamic_creds_realize(dcreds
, suggested_paths
, &uid
, &gid
);
4265 *exit_status
= EXIT_USER
;
4267 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
4268 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4269 return log_unit_error_errno(unit
, r
, "Failed to update dynamic user credentials: %m");
4272 if (!uid_is_valid(uid
)) {
4273 *exit_status
= EXIT_USER
;
4274 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ESRCH
), "UID validation failed for \""UID_FMT
"\"", uid
);
4277 if (!gid_is_valid(gid
)) {
4278 *exit_status
= EXIT_USER
;
4279 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ESRCH
), "GID validation failed for \""GID_FMT
"\"", gid
);
4283 username
= dcreds
->user
->name
;
4286 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
4288 *exit_status
= EXIT_USER
;
4289 return log_unit_error_errno(unit
, r
, "Failed to determine user credentials: %m");
4292 r
= get_fixed_group(context
, &groupname
, &gid
);
4294 *exit_status
= EXIT_GROUP
;
4295 return log_unit_error_errno(unit
, r
, "Failed to determine group credentials: %m");
4299 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4300 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
4301 &supplementary_gids
, &ngids
);
4303 *exit_status
= EXIT_GROUP
;
4304 return log_unit_error_errno(unit
, r
, "Failed to determine supplementary groups: %m");
4307 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
4309 *exit_status
= EXIT_USER
;
4310 return log_unit_error_errno(unit
, r
, "Failed to send user credentials to PID1: %m");
4313 user_lookup_fd
= safe_close(user_lookup_fd
);
4315 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
4317 *exit_status
= EXIT_CHDIR
;
4318 return log_unit_error_errno(unit
, r
, "Failed to determine $HOME for user: %m");
4321 /* If a socket is connected to STDIN/STDOUT/STDERR, we
4322 * must make sure to drop O_NONBLOCK */
4324 (void) fd_nonblock(socket_fd
, false);
4326 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4327 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4328 if (params
->cgroup_path
) {
4329 _cleanup_free_
char *p
= NULL
;
4331 r
= exec_parameters_get_cgroup_path(params
, &p
);
4333 *exit_status
= EXIT_CGROUP
;
4334 return log_unit_error_errno(unit
, r
, "Failed to acquire cgroup path: %m");
4337 r
= cg_attach_everywhere(params
->cgroup_supported
, p
, 0, NULL
, NULL
);
4338 if (r
== -EUCLEAN
) {
4339 *exit_status
= EXIT_CGROUP
;
4340 return log_unit_error_errno(unit
, r
, "Failed to attach process to cgroup %s "
4341 "because the cgroup or one of its parents or "
4342 "siblings is in the threaded mode: %m", p
);
4345 *exit_status
= EXIT_CGROUP
;
4346 return log_unit_error_errno(unit
, r
, "Failed to attach to cgroup %s: %m", p
);
4350 if (context
->network_namespace_path
&& runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
4351 r
= open_shareable_ns_path(runtime
->netns_storage_socket
, context
->network_namespace_path
, CLONE_NEWNET
);
4353 *exit_status
= EXIT_NETWORK
;
4354 return log_unit_error_errno(unit
, r
, "Failed to open network namespace path %s: %m", context
->network_namespace_path
);
4358 if (context
->ipc_namespace_path
&& runtime
&& runtime
->ipcns_storage_socket
[0] >= 0) {
4359 r
= open_shareable_ns_path(runtime
->ipcns_storage_socket
, context
->ipc_namespace_path
, CLONE_NEWIPC
);
4361 *exit_status
= EXIT_NAMESPACE
;
4362 return log_unit_error_errno(unit
, r
, "Failed to open IPC namespace path %s: %m", context
->ipc_namespace_path
);
4366 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
4368 *exit_status
= EXIT_STDIN
;
4369 return log_unit_error_errno(unit
, r
, "Failed to set up standard input: %m");
4372 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
4374 *exit_status
= EXIT_STDOUT
;
4375 return log_unit_error_errno(unit
, r
, "Failed to set up standard output: %m");
4378 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
4380 *exit_status
= EXIT_STDERR
;
4381 return log_unit_error_errno(unit
, r
, "Failed to set up standard error output: %m");
4384 if (context
->oom_score_adjust_set
) {
4385 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4386 * prohibit write access to this file, and we shouldn't trip up over that. */
4387 r
= set_oom_score_adjust(context
->oom_score_adjust
);
4388 if (ERRNO_IS_PRIVILEGE(r
))
4389 log_unit_debug_errno(unit
, r
, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4391 *exit_status
= EXIT_OOM_ADJUST
;
4392 return log_unit_error_errno(unit
, r
, "Failed to adjust OOM setting: %m");
4396 if (context
->coredump_filter_set
) {
4397 r
= set_coredump_filter(context
->coredump_filter
);
4398 if (ERRNO_IS_PRIVILEGE(r
))
4399 log_unit_debug_errno(unit
, r
, "Failed to adjust coredump_filter, ignoring: %m");
4401 return log_unit_error_errno(unit
, r
, "Failed to adjust coredump_filter: %m");
4404 if (context
->nice_set
) {
4405 r
= setpriority_closest(context
->nice
);
4407 return log_unit_error_errno(unit
, r
, "Failed to set up process scheduling priority (nice level): %m");
4410 if (context
->cpu_sched_set
) {
4411 struct sched_param param
= {
4412 .sched_priority
= context
->cpu_sched_priority
,
4415 r
= sched_setscheduler(0,
4416 context
->cpu_sched_policy
|
4417 (context
->cpu_sched_reset_on_fork
?
4418 SCHED_RESET_ON_FORK
: 0),
4421 *exit_status
= EXIT_SETSCHEDULER
;
4422 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU scheduling: %m");
4426 if (context
->cpu_affinity_from_numa
|| context
->cpu_set
.set
) {
4427 _cleanup_(cpu_set_reset
) CPUSet converted_cpu_set
= {};
4428 const CPUSet
*cpu_set
;
4430 if (context
->cpu_affinity_from_numa
) {
4431 r
= exec_context_cpu_affinity_from_numa(context
, &converted_cpu_set
);
4433 *exit_status
= EXIT_CPUAFFINITY
;
4434 return log_unit_error_errno(unit
, r
, "Failed to derive CPU affinity mask from NUMA mask: %m");
4437 cpu_set
= &converted_cpu_set
;
4439 cpu_set
= &context
->cpu_set
;
4441 if (sched_setaffinity(0, cpu_set
->allocated
, cpu_set
->set
) < 0) {
4442 *exit_status
= EXIT_CPUAFFINITY
;
4443 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU affinity: %m");
4447 if (mpol_is_valid(numa_policy_get_type(&context
->numa_policy
))) {
4448 r
= apply_numa_policy(&context
->numa_policy
);
4449 if (r
== -EOPNOTSUPP
)
4450 log_unit_debug_errno(unit
, r
, "NUMA support not available, ignoring.");
4452 *exit_status
= EXIT_NUMA_POLICY
;
4453 return log_unit_error_errno(unit
, r
, "Failed to set NUMA memory policy: %m");
4457 if (context
->ioprio_set
)
4458 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
4459 *exit_status
= EXIT_IOPRIO
;
4460 return log_unit_error_errno(unit
, errno
, "Failed to set up IO scheduling priority: %m");
4463 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
4464 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
4465 *exit_status
= EXIT_TIMERSLACK
;
4466 return log_unit_error_errno(unit
, errno
, "Failed to set up timer slack: %m");
4469 if (context
->personality
!= PERSONALITY_INVALID
) {
4470 r
= safe_personality(context
->personality
);
4472 *exit_status
= EXIT_PERSONALITY
;
4473 return log_unit_error_errno(unit
, r
, "Failed to set up execution domain (personality): %m");
4477 if (context
->utmp_id
) {
4478 const char *line
= context
->tty_path
?
4479 (path_startswith(context
->tty_path
, "/dev/") ?: context
->tty_path
) :
4481 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
4483 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
4484 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
4489 if (uid_is_valid(uid
)) {
4490 r
= chown_terminal(STDIN_FILENO
, uid
);
4492 *exit_status
= EXIT_STDIN
;
4493 return log_unit_error_errno(unit
, r
, "Failed to change ownership of terminal: %m");
4497 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4498 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4499 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4500 * touch a single hierarchy too. */
4501 if (params
->cgroup_path
&& context
->user
&& (params
->flags
& EXEC_CGROUP_DELEGATE
)) {
4502 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, uid
, gid
);
4504 *exit_status
= EXIT_CGROUP
;
4505 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
4509 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
4511 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
4512 r
= setup_exec_directory(context
, params
, uid
, gid
, dt
, needs_mount_namespace
, exit_status
);
4514 return log_unit_error_errno(unit
, r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
4517 if (FLAGS_SET(params
->flags
, EXEC_WRITE_CREDENTIALS
)) {
4518 r
= setup_credentials(context
, params
, unit
->id
, uid
);
4520 *exit_status
= EXIT_CREDENTIALS
;
4521 return log_unit_error_errno(unit
, r
, "Failed to set up credentials: %m");
4525 r
= build_environment(
4537 *exit_status
= EXIT_MEMORY
;
4541 r
= build_pass_environment(context
, &pass_env
);
4543 *exit_status
= EXIT_MEMORY
;
4547 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4548 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4549 * not specify PATH but the unit has ExecSearchPath. */
4550 if (!strv_isempty(context
->exec_search_path
)) {
4551 _cleanup_free_
char *joined
= NULL
;
4553 joined
= strv_join(context
->exec_search_path
, ":");
4555 *exit_status
= EXIT_MEMORY
;
4559 r
= strv_env_assign(&joined_exec_search_path
, "PATH", joined
);
4561 *exit_status
= EXIT_MEMORY
;
4566 accum_env
= strv_env_merge(params
->environment
,
4568 joined_exec_search_path
,
4570 context
->environment
,
4573 *exit_status
= EXIT_MEMORY
;
4576 accum_env
= strv_env_clean(accum_env
);
4578 (void) umask(context
->umask
);
4580 r
= setup_keyring(unit
, context
, params
, uid
, gid
);
4582 *exit_status
= EXIT_KEYRING
;
4583 return log_unit_error_errno(unit
, r
, "Failed to set up kernel keyring: %m");
4586 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4588 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
4590 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4591 * for it, and the kernel doesn't actually support ambient caps. */
4592 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
4594 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4595 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4597 if (needs_ambient_hack
)
4598 needs_setuid
= false;
4600 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
4602 if (needs_sandboxing
) {
4603 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4604 * /sys being present. The actual MAC context application will happen later, as late as
4605 * possible, to avoid impacting our own code paths. */
4608 use_selinux
= mac_selinux_use();
4611 use_smack
= mac_smack_use();
4614 use_apparmor
= mac_apparmor_use();
4618 if (needs_sandboxing
) {
4621 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4622 * is set here. (See below.) */
4624 r
= setrlimit_closest_all((const struct rlimit
* const *) context
->rlimit
, &which_failed
);
4626 *exit_status
= EXIT_LIMITS
;
4627 return log_unit_error_errno(unit
, r
, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed
));
4631 if (needs_setuid
&& context
->pam_name
&& username
) {
4632 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4633 * wins here. (See above.) */
4635 /* All fds passed in the fds array will be closed in the pam child process. */
4636 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
4638 *exit_status
= EXIT_PAM
;
4639 return log_unit_error_errno(unit
, r
, "Failed to set up PAM session: %m");
4642 ngids_after_pam
= getgroups_alloc(&gids_after_pam
);
4643 if (ngids_after_pam
< 0) {
4644 *exit_status
= EXIT_MEMORY
;
4645 return log_unit_error_errno(unit
, ngids_after_pam
, "Failed to obtain groups after setting up PAM: %m");
4649 if (needs_sandboxing
&& context
->private_users
&& !have_effective_cap(CAP_SYS_ADMIN
)) {
4650 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4651 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4652 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4654 userns_set_up
= true;
4655 r
= setup_private_users(saved_uid
, saved_gid
, uid
, gid
);
4657 *exit_status
= EXIT_USER
;
4658 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing for unprivileged user: %m");
4662 if ((context
->private_network
|| context
->network_namespace_path
) && runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
4664 if (ns_type_supported(NAMESPACE_NET
)) {
4665 r
= setup_shareable_ns(runtime
->netns_storage_socket
, CLONE_NEWNET
);
4667 log_unit_warning_errno(unit
, r
,
4668 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4670 *exit_status
= EXIT_NETWORK
;
4671 return log_unit_error_errno(unit
, r
, "Failed to set up network namespacing: %m");
4673 } else if (context
->network_namespace_path
) {
4674 *exit_status
= EXIT_NETWORK
;
4675 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
4676 "NetworkNamespacePath= is not supported, refusing.");
4678 log_unit_warning(unit
, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4681 if ((context
->private_ipc
|| context
->ipc_namespace_path
) && runtime
&& runtime
->ipcns_storage_socket
[0] >= 0) {
4683 if (ns_type_supported(NAMESPACE_IPC
)) {
4684 r
= setup_shareable_ns(runtime
->ipcns_storage_socket
, CLONE_NEWIPC
);
4686 log_unit_warning_errno(unit
, r
,
4687 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4689 *exit_status
= EXIT_NAMESPACE
;
4690 return log_unit_error_errno(unit
, r
, "Failed to set up IPC namespacing: %m");
4692 } else if (context
->ipc_namespace_path
) {
4693 *exit_status
= EXIT_NAMESPACE
;
4694 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
4695 "IPCNamespacePath= is not supported, refusing.");
4697 log_unit_warning(unit
, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4700 if (needs_mount_namespace
) {
4701 _cleanup_free_
char *error_path
= NULL
;
4703 r
= apply_mount_namespace(unit
, command
->flags
, context
, params
, runtime
, &error_path
);
4705 *exit_status
= EXIT_NAMESPACE
;
4706 return log_unit_error_errno(unit
, r
, "Failed to set up mount namespacing%s%s: %m",
4707 error_path
? ": " : "", strempty(error_path
));
4711 if (needs_sandboxing
) {
4712 r
= apply_protect_hostname(unit
, context
, exit_status
);
4717 /* Drop groups as early as possible.
4718 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4719 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4721 _cleanup_free_ gid_t
*gids_to_enforce
= NULL
;
4722 int ngids_to_enforce
= 0;
4724 ngids_to_enforce
= merge_gid_lists(supplementary_gids
,
4729 if (ngids_to_enforce
< 0) {
4730 *exit_status
= EXIT_MEMORY
;
4731 return log_unit_error_errno(unit
,
4733 "Failed to merge group lists. Group membership might be incorrect: %m");
4736 r
= enforce_groups(gid
, gids_to_enforce
, ngids_to_enforce
);
4738 *exit_status
= EXIT_GROUP
;
4739 return log_unit_error_errno(unit
, r
, "Changing group credentials failed: %m");
4743 /* If the user namespace was not set up above, try to do it now.
4744 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4745 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4746 * case of mount namespaces being less privileged when the mount point list is copied from a
4747 * different user namespace). */
4749 if (needs_sandboxing
&& context
->private_users
&& !userns_set_up
) {
4750 r
= setup_private_users(saved_uid
, saved_gid
, uid
, gid
);
4752 *exit_status
= EXIT_USER
;
4753 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing: %m");
4757 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4760 _cleanup_free_
char *executable
= NULL
;
4761 _cleanup_close_
int executable_fd
= -1;
4762 r
= find_executable_full(command
->path
, /* root= */ NULL
, context
->exec_search_path
, false, &executable
, &executable_fd
);
4764 if (r
!= -ENOMEM
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
)) {
4765 log_unit_struct_errno(unit
, LOG_INFO
, r
,
4766 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
4767 LOG_UNIT_INVOCATION_ID(unit
),
4768 LOG_UNIT_MESSAGE(unit
, "Executable %s missing, skipping: %m",
4770 "EXECUTABLE=%s", command
->path
);
4774 *exit_status
= EXIT_EXEC
;
4776 return log_unit_struct_errno(unit
, LOG_INFO
, r
,
4777 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
4778 LOG_UNIT_INVOCATION_ID(unit
),
4779 LOG_UNIT_MESSAGE(unit
, "Failed to locate executable %s: %m",
4781 "EXECUTABLE=%s", command
->path
);
4784 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, executable_fd
, &executable_fd
);
4786 *exit_status
= EXIT_FDS
;
4787 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
4791 if (needs_sandboxing
&& use_selinux
&& params
->selinux_context_net
) {
4796 else if (params
->n_socket_fds
== 1)
4797 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4798 * use context from that fd to compute the label. */
4799 fd
= params
->fds
[0];
4802 r
= mac_selinux_get_child_mls_label(fd
, executable
, context
->selinux_context
, &mac_selinux_context_net
);
4804 if (!context
->selinux_context_ignore
) {
4805 *exit_status
= EXIT_SELINUX_CONTEXT
;
4806 return log_unit_error_errno(unit
, r
, "Failed to determine SELinux context: %m");
4808 log_unit_debug_errno(unit
, r
, "Failed to determine SELinux context, ignoring: %m");
4814 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4815 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
4816 * however if we have it as we want to keep it open until the final execve(). */
4818 r
= close_all_fds(keep_fds
, n_keep_fds
);
4820 r
= shift_fds(fds
, n_fds
);
4822 r
= flags_fds(fds
, n_socket_fds
, n_storage_fds
, context
->non_blocking
);
4824 *exit_status
= EXIT_FDS
;
4825 return log_unit_error_errno(unit
, r
, "Failed to adjust passed file descriptors: %m");
4828 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4829 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4830 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4833 secure_bits
= context
->secure_bits
;
4835 if (needs_sandboxing
) {
4838 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4839 * requested. (Note this is placed after the general resource limit initialization, see
4840 * above, in order to take precedence.) */
4841 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
4842 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
4843 *exit_status
= EXIT_LIMITS
;
4844 return log_unit_error_errno(unit
, errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4849 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4850 * process. This is the latest place before dropping capabilities. Other MAC contexts are set later. */
4852 r
= setup_smack(unit
->manager
, context
, executable_fd
);
4853 if (r
< 0 && !context
->smack_process_label_ignore
) {
4854 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
4855 return log_unit_error_errno(unit
, r
, "Failed to set SMACK process label: %m");
4860 bset
= context
->capability_bounding_set
;
4861 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4862 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4863 * instead of us doing that */
4864 if (needs_ambient_hack
)
4865 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
4866 (UINT64_C(1) << CAP_SETUID
) |
4867 (UINT64_C(1) << CAP_SETGID
);
4869 if (!cap_test_all(bset
)) {
4870 r
= capability_bounding_set_drop(bset
, false);
4872 *exit_status
= EXIT_CAPABILITIES
;
4873 return log_unit_error_errno(unit
, r
, "Failed to drop capabilities: %m");
4877 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4879 * To be able to raise the ambient capabilities after setresuid() they have to be
4880 * added to the inherited set and keep caps has to be set (done in enforce_user()).
4881 * After setresuid() the ambient capabilities can be raised as they are present in
4882 * the permitted and inheritable set. However it is possible that someone wants to
4883 * set ambient capabilities without changing the user, so we also set the ambient
4884 * capabilities here.
4885 * The requested ambient capabilities are raised in the inheritable set if the
4886 * second argument is true. */
4887 if (!needs_ambient_hack
) {
4888 r
= capability_ambient_set_apply(context
->capability_ambient_set
, true);
4890 *exit_status
= EXIT_CAPABILITIES
;
4891 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (before UID change): %m");
4896 /* chroot to root directory first, before we lose the ability to chroot */
4897 r
= apply_root_directory(context
, params
, needs_mount_namespace
, exit_status
);
4899 return log_unit_error_errno(unit
, r
, "Chrooting to the requested root directory failed: %m");
4902 if (uid_is_valid(uid
)) {
4903 r
= enforce_user(context
, uid
);
4905 *exit_status
= EXIT_USER
;
4906 return log_unit_error_errno(unit
, r
, "Failed to change UID to " UID_FMT
": %m", uid
);
4909 if (!needs_ambient_hack
&&
4910 context
->capability_ambient_set
!= 0) {
4912 /* Raise the ambient capabilities after user change. */
4913 r
= capability_ambient_set_apply(context
->capability_ambient_set
, false);
4915 *exit_status
= EXIT_CAPABILITIES
;
4916 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (after UID change): %m");
4922 /* Apply working directory here, because the working directory might be on NFS and only the user running
4923 * this service might have the correct privilege to change to the working directory */
4924 r
= apply_working_directory(context
, params
, home
, exit_status
);
4926 return log_unit_error_errno(unit
, r
, "Changing to the requested working directory failed: %m");
4928 if (needs_sandboxing
) {
4929 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4930 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4931 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4932 * are restricted. */
4936 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
4939 r
= setexeccon(exec_context
);
4941 if (!context
->selinux_context_ignore
) {
4942 *exit_status
= EXIT_SELINUX_CONTEXT
;
4943 return log_unit_error_errno(unit
, r
, "Failed to change SELinux context to %s: %m", exec_context
);
4945 log_unit_debug_errno(unit
, r
, "Failed to change SELinux context to %s, ignoring: %m", exec_context
);
4952 if (use_apparmor
&& context
->apparmor_profile
) {
4953 r
= aa_change_onexec(context
->apparmor_profile
);
4954 if (r
< 0 && !context
->apparmor_profile_ignore
) {
4955 *exit_status
= EXIT_APPARMOR_PROFILE
;
4956 return log_unit_error_errno(unit
, errno
, "Failed to prepare AppArmor profile change to %s: %m", context
->apparmor_profile
);
4961 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
4962 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4964 if (prctl(PR_GET_SECUREBITS
) != secure_bits
) {
4965 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4966 * effective set here.
4967 * The effective set is overwritten during execve with the following values:
4968 * - ambient set (for non-root processes)
4969 * - (inheritable | bounding) set (for root processes)
4971 * Hence there is no security impact to raise it in the effective set before execve
4973 r
= capability_gain_cap_setpcap(NULL
);
4975 *exit_status
= EXIT_CAPABILITIES
;
4976 return log_unit_error_errno(unit
, r
, "Failed to gain CAP_SETPCAP for setting secure bits");
4978 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
4979 *exit_status
= EXIT_SECUREBITS
;
4980 return log_unit_error_errno(unit
, errno
, "Failed to set process secure bits: %m");
4984 if (context_has_no_new_privileges(context
))
4985 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
4986 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
4987 return log_unit_error_errno(unit
, errno
, "Failed to disable new privileges: %m");
4991 r
= apply_address_families(unit
, context
);
4993 *exit_status
= EXIT_ADDRESS_FAMILIES
;
4994 return log_unit_error_errno(unit
, r
, "Failed to restrict address families: %m");
4997 r
= apply_memory_deny_write_execute(unit
, context
);
4999 *exit_status
= EXIT_SECCOMP
;
5000 return log_unit_error_errno(unit
, r
, "Failed to disable writing to executable memory: %m");
5003 r
= apply_restrict_realtime(unit
, context
);
5005 *exit_status
= EXIT_SECCOMP
;
5006 return log_unit_error_errno(unit
, r
, "Failed to apply realtime restrictions: %m");
5009 r
= apply_restrict_suid_sgid(unit
, context
);
5011 *exit_status
= EXIT_SECCOMP
;
5012 return log_unit_error_errno(unit
, r
, "Failed to apply SUID/SGID restrictions: %m");
5015 r
= apply_restrict_namespaces(unit
, context
);
5017 *exit_status
= EXIT_SECCOMP
;
5018 return log_unit_error_errno(unit
, r
, "Failed to apply namespace restrictions: %m");
5021 r
= apply_protect_sysctl(unit
, context
);
5023 *exit_status
= EXIT_SECCOMP
;
5024 return log_unit_error_errno(unit
, r
, "Failed to apply sysctl restrictions: %m");
5027 r
= apply_protect_kernel_modules(unit
, context
);
5029 *exit_status
= EXIT_SECCOMP
;
5030 return log_unit_error_errno(unit
, r
, "Failed to apply module loading restrictions: %m");
5033 r
= apply_protect_kernel_logs(unit
, context
);
5035 *exit_status
= EXIT_SECCOMP
;
5036 return log_unit_error_errno(unit
, r
, "Failed to apply kernel log restrictions: %m");
5039 r
= apply_protect_clock(unit
, context
);
5041 *exit_status
= EXIT_SECCOMP
;
5042 return log_unit_error_errno(unit
, r
, "Failed to apply clock restrictions: %m");
5045 r
= apply_private_devices(unit
, context
);
5047 *exit_status
= EXIT_SECCOMP
;
5048 return log_unit_error_errno(unit
, r
, "Failed to set up private devices: %m");
5051 r
= apply_syscall_archs(unit
, context
);
5053 *exit_status
= EXIT_SECCOMP
;
5054 return log_unit_error_errno(unit
, r
, "Failed to apply syscall architecture restrictions: %m");
5057 r
= apply_lock_personality(unit
, context
);
5059 *exit_status
= EXIT_SECCOMP
;
5060 return log_unit_error_errno(unit
, r
, "Failed to lock personalities: %m");
5063 r
= apply_syscall_log(unit
, context
);
5065 *exit_status
= EXIT_SECCOMP
;
5066 return log_unit_error_errno(unit
, r
, "Failed to apply system call log filters: %m");
5069 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5070 * by the filter as little as possible. */
5071 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
5073 *exit_status
= EXIT_SECCOMP
;
5074 return log_unit_error_errno(unit
, r
, "Failed to apply system call filters: %m");
5079 r
= apply_restrict_filesystems(unit
, context
);
5081 *exit_status
= EXIT_BPF
;
5082 return log_unit_error_errno(unit
, r
, "Failed to restrict filesystems: %m");
5088 if (!strv_isempty(context
->unset_environment
)) {
5091 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
5093 *exit_status
= EXIT_MEMORY
;
5097 strv_free_and_replace(accum_env
, ee
);
5100 if (!FLAGS_SET(command
->flags
, EXEC_COMMAND_NO_ENV_EXPAND
)) {
5101 replaced_argv
= replace_env_argv(command
->argv
, accum_env
);
5102 if (!replaced_argv
) {
5103 *exit_status
= EXIT_MEMORY
;
5106 final_argv
= replaced_argv
;
5108 final_argv
= command
->argv
;
5110 if (DEBUG_LOGGING
) {
5111 _cleanup_free_
char *line
= NULL
;
5113 line
= quote_command_line(final_argv
, SHELL_ESCAPE_EMPTY
);
5115 *exit_status
= EXIT_MEMORY
;
5119 log_unit_struct(unit
, LOG_DEBUG
,
5120 "EXECUTABLE=%s", executable
,
5121 LOG_UNIT_MESSAGE(unit
, "Executing: %s", line
));
5127 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5128 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5130 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
5131 *exit_status
= EXIT_EXEC
;
5132 return log_unit_error_errno(unit
, errno
, "Failed to enable exec_fd: %m");
5136 r
= fexecve_or_execve(executable_fd
, executable
, final_argv
, accum_env
);
5141 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5142 * that POLLHUP on it no longer means execve() succeeded. */
5144 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
5145 *exit_status
= EXIT_EXEC
;
5146 return log_unit_error_errno(unit
, errno
, "Failed to disable exec_fd: %m");
5150 *exit_status
= EXIT_EXEC
;
5151 return log_unit_error_errno(unit
, r
, "Failed to execute %s: %m", executable
);
/* Forward declarations of two helpers that exec_spawn() calls before their definitions, which
 * appear further down in this file. */
5154 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***l
);
5155 static int exec_context_named_iofds(const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[static 3]);
/* Forks off a process that runs one command of a unit.
 *
 * Steps, in order:
 *   - if stdin, stdout or stderr is configured as a socket, exactly one socket fd must be passed
 *     in params (zero or more than one is rejected with a synthetic EINVAL) and params->fds[0]
 *     is remembered as the socket fd;
 *   - the named stdio fds are resolved with exec_context_named_iofds() and the environment files
 *     are loaded with exec_context_load_environment();
 *   - a quoted command line is built for logging, and the SELinux label database is reloaded so
 *     that the forked child inherits an up-to-date copy;
 *   - when params->cgroup_path is set, the sub-cgroup path is looked up and, if a child cgroup
 *     is in use, it is created and the oomd xattrs are applied to it;
 *   - the process forks; exec_child() is run and, on failure, a structured
 *     SD_MESSAGE_SPAWN_FAILED_STR record naming the failed step is logged;
 *   - afterwards the new PID is logged, attached to the cgroup and recorded as started in
 *     command->exec_status.
 *
 * Failures of the preparation steps are reported through log_unit_error_errno() and its result
 * is returned to the caller. */
5157 int exec_spawn(Unit
*unit
,
5158 ExecCommand
*command
,
5159 const ExecContext
*context
,
5160 const ExecParameters
*params
,
5161 ExecRuntime
*runtime
,
5162 DynamicCreds
*dcreds
,
5165 int socket_fd
, r
, named_iofds
[3] = { -1, -1, -1 }, *fds
= NULL
;
5166 _cleanup_free_
char *subcgroup_path
= NULL
;
5167 _cleanup_strv_free_
char **files_env
= NULL
;
5168 size_t n_storage_fds
= 0, n_socket_fds
= 0;
5169 _cleanup_free_
char *line
= NULL
;
5177 assert(params
->fds
|| (params
->n_socket_fds
+ params
->n_storage_fds
<= 0));
5179 if (context
->std_input
== EXEC_INPUT_SOCKET
||
5180 context
->std_output
== EXEC_OUTPUT_SOCKET
||
5181 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
5183 if (params
->n_socket_fds
> 1)
5184 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EINVAL
), "Got more than one socket.");
5186 if (params
->n_socket_fds
== 0)
5187 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EINVAL
), "Got no socket.");
5189 socket_fd
= params
->fds
[0];
5193 n_socket_fds
= params
->n_socket_fds
;
5194 n_storage_fds
= params
->n_storage_fds
;
5197 r
= exec_context_named_iofds(context
, params
, named_iofds
);
5199 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
5201 r
= exec_context_load_environment(unit
, context
, &files_env
);
5203 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
5205 line
= quote_command_line(command
->argv
, SHELL_ESCAPE_EMPTY
);
5209 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5210 and, until the next SELinux policy changes, we save further reloads in future children. */
5211 mac_selinux_maybe_reload();
5213 log_unit_struct(unit
, LOG_DEBUG
,
5214 LOG_UNIT_MESSAGE(unit
, "About to execute %s", line
),
5215 "EXECUTABLE=%s", command
->path
, /* We won't know the real executable path until we create
5216 the mount namespace in the child, but we want to log
5217 from the parent, so we need to use the (possibly
5218 inaccurate) path here. */
5219 LOG_UNIT_INVOCATION_ID(unit
));
5221 if (params
->cgroup_path
) {
5222 r
= exec_parameters_get_cgroup_path(params
, &subcgroup_path
);
5224 return log_unit_error_errno(unit
, r
, "Failed to acquire subcgroup path: %m");
5225 if (r
> 0) { /* We are using a child cgroup */
5226 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
);
5228 return log_unit_error_errno(unit
, r
, "Failed to create control group '%s': %m", subcgroup_path
);
5230 /* Normally we would not propagate the oomd xattrs to children but since we created this
5231 * sub-cgroup internally we should do it. */
5232 cgroup_oomd_xattr_apply(unit
, subcgroup_path
);
5238 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
5241 int exit_status
= EXIT_SUCCESS
;
5243 r
= exec_child(unit
,
5255 unit
->manager
->user_lookup_fds
[1],
5259 const char *status
=
5260 exit_status_to_string(exit_status
,
5261 EXIT_STATUS_LIBC
| EXIT_STATUS_SYSTEMD
);
5263 log_unit_struct_errno(unit
, LOG_ERR
, r
,
5264 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
5265 LOG_UNIT_INVOCATION_ID(unit
),
5266 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
5267 status
, command
->path
),
5268 "EXECUTABLE=%s", command
->path
);
5274 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
5276 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5277 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5278 * process will be killed too). */
5280 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
, pid
);
5282 exec_status_start(&command
->exec_status
, pid
);
/* Fills an ExecContext with its default settings. Among others: the default I/O priority,
 * SCHED_OTHER as CPU scheduling policy, LOG_DAEMON|LOG_INFO as syslog priority, SIGPIPE ignored,
 * mode 0755 for every directory type, CAP_ALL as capability bounding set,
 * NAMESPACE_FLAGS_INITIAL for namespace restrictions, SECCOMP_ERROR_NUMBER_KILL as syscall
 * error action, and UINT_MAX for both TTY dimensions. Timer slack, clean timeout, personality
 * and maximum log level are set to their NSEC_INFINITY / USEC_INFINITY / PERSONALITY_INVALID /
 * -1 markers, and the NUMA policy is reset. */
5288 void exec_context_init(ExecContext
*c
) {
5292 c
->ioprio
= IOPRIO_DEFAULT_CLASS_AND_PRIO
;
5293 c
->cpu_sched_policy
= SCHED_OTHER
;
5294 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
5295 c
->syslog_level_prefix
= true;
5296 c
->ignore_sigpipe
= true;
5297 c
->timer_slack_nsec
= NSEC_INFINITY
;
5298 c
->personality
= PERSONALITY_INVALID
;
5299 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
5300 c
->directories
[t
].mode
= 0755;
5301 c
->timeout_clean_usec
= USEC_INFINITY
;
5302 c
->capability_bounding_set
= CAP_ALL
;
5303 assert_cc(NAMESPACE_FLAGS_INITIAL
!= NAMESPACE_FLAGS_ALL
);
5304 c
->restrict_namespaces
= NAMESPACE_FLAGS_INITIAL
;
5305 c
->log_level_max
= -1;
5307 c
->syscall_errno
= SECCOMP_ERROR_NUMBER_KILL
;
5309 c
->tty_rows
= UINT_MAX
;
5310 c
->tty_cols
= UINT_MAX
;
5311 numa_policy_reset(&c
->numa_policy
);
/* Releases the resources held by the members of an ExecContext. Strings, string vectors, sets,
 * hashmaps, mount option/image lists, resource limits, the CPU set and the per-type directory
 * entries are each handed to their matching free/reset helper; where the helper returns a
 * value, it is stored back into the member. Bind mounts and temporary file systems are freed
 * and their pointer/count pairs cleared by hand. Size fields that accompany freed buffers
 * (root hash, root hash signature, stdin data) and the log rate limit settings are zeroed, and
 * log_level_max is put back to -1. */
5314 void exec_context_done(ExecContext
*c
) {
5317 c
->environment
= strv_free(c
->environment
);
5318 c
->environment_files
= strv_free(c
->environment_files
);
5319 c
->pass_environment
= strv_free(c
->pass_environment
);
5320 c
->unset_environment
= strv_free(c
->unset_environment
);
5322 rlimit_free_all(c
->rlimit
);
5324 for (size_t l
= 0; l
< 3; l
++) {
5325 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
5326 c
->stdio_file
[l
] = mfree(c
->stdio_file
[l
]);
5329 c
->working_directory
= mfree(c
->working_directory
);
5330 c
->root_directory
= mfree(c
->root_directory
);
5331 c
->root_image
= mfree(c
->root_image
);
5332 c
->root_image_options
= mount_options_free_all(c
->root_image_options
);
5333 c
->root_hash
= mfree(c
->root_hash
);
5334 c
->root_hash_size
= 0;
5335 c
->root_hash_path
= mfree(c
->root_hash_path
);
5336 c
->root_hash_sig
= mfree(c
->root_hash_sig
);
5337 c
->root_hash_sig_size
= 0;
5338 c
->root_hash_sig_path
= mfree(c
->root_hash_sig_path
);
5339 c
->root_verity
= mfree(c
->root_verity
);
5340 c
->extension_images
= mount_image_free_many(c
->extension_images
, &c
->n_extension_images
);
5341 c
->extension_directories
= strv_free(c
->extension_directories
);
5342 c
->tty_path
= mfree(c
->tty_path
);
5343 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
5344 c
->user
= mfree(c
->user
);
5345 c
->group
= mfree(c
->group
);
5347 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
5349 c
->pam_name
= mfree(c
->pam_name
);
5351 c
->read_only_paths
= strv_free(c
->read_only_paths
);
5352 c
->read_write_paths
= strv_free(c
->read_write_paths
);
5353 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
5354 c
->exec_paths
= strv_free(c
->exec_paths
);
5355 c
->no_exec_paths
= strv_free(c
->no_exec_paths
);
5356 c
->exec_search_path
= strv_free(c
->exec_search_path
);
5358 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
5359 c
->bind_mounts
= NULL
;
5360 c
->n_bind_mounts
= 0;
5361 temporary_filesystem_free_many(c
->temporary_filesystems
, c
->n_temporary_filesystems
);
5362 c
->temporary_filesystems
= NULL
;
5363 c
->n_temporary_filesystems
= 0;
5364 c
->mount_images
= mount_image_free_many(c
->mount_images
, &c
->n_mount_images
);
5366 cpu_set_reset(&c
->cpu_set
);
5367 numa_policy_reset(&c
->numa_policy
);
5369 c
->utmp_id
= mfree(c
->utmp_id
);
5370 c
->selinux_context
= mfree(c
->selinux_context
);
5371 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
5372 c
->smack_process_label
= mfree(c
->smack_process_label
);
5374 c
->restrict_filesystems
= set_free(c
->restrict_filesystems
);
5376 c
->syscall_filter
= hashmap_free(c
->syscall_filter
);
5377 c
->syscall_archs
= set_free(c
->syscall_archs
);
5378 c
->address_families
= set_free(c
->address_families
);
5380 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
5381 exec_directory_done(&c
->directories
[t
]);
5383 c
->log_level_max
= -1;
5385 exec_context_free_log_extra_fields(c
);
5387 c
->log_ratelimit_interval_usec
= 0;
5388 c
->log_ratelimit_burst
= 0;
5390 c
->stdin_data
= mfree(c
->stdin_data
);
5391 c
->stdin_data_size
= 0;
5393 c
->network_namespace_path
= mfree(c
->network_namespace_path
);
5394 c
->ipc_namespace_path
= mfree(c
->ipc_namespace_path
);
5396 c
->log_namespace
= mfree(c
->log_namespace
);
5398 c
->load_credentials
= hashmap_free(c
->load_credentials
);
5399 c
->set_credentials
= hashmap_free(c
->set_credentials
);
/* Removes the runtime directories listed in c->directories[EXEC_DIRECTORY_RUNTIME] from below
 * runtime_prefix, along with the symlinks registered for each of them. When
 * exec_directory_is_private() reports the runtime directory type as private, the paths are
 * built under "<runtime_prefix>/private" instead of directly under runtime_prefix. The results
 * of rm_rf() and unlink() are deliberately discarded, so removal is best-effort. */
5402 int exec_context_destroy_runtime_directory(const ExecContext
*c
, const char *runtime_prefix
) {
5405 if (!runtime_prefix
)
5408 for (size_t i
= 0; i
< c
->directories
[EXEC_DIRECTORY_RUNTIME
].n_items
; i
++) {
5409 _cleanup_free_
char *p
= NULL
;
5411 if (exec_directory_is_private(c
, EXEC_DIRECTORY_RUNTIME
))
5412 p
= path_join(runtime_prefix
, "private", c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].path
);
5414 p
= path_join(runtime_prefix
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].path
);
5418 /* We execute this synchronously, since we need to be sure this is gone when we start the
5420 (void) rm_rf(p
, REMOVE_ROOT
);
5422 STRV_FOREACH(symlink
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].symlinks
) {
5423 _cleanup_free_
char *symlink_abs
= NULL
;
5425 if (exec_directory_is_private(c
, EXEC_DIRECTORY_RUNTIME
))
5426 symlink_abs
= path_join(runtime_prefix
, "private", *symlink
);
5428 symlink_abs
= path_join(runtime_prefix
, *symlink
);
5432 (void) unlink(symlink_abs
);
/* Tears down the credentials directory of a unit, located at
 * "<runtime_prefix>/credentials/<unit>". The path is first unmounted with umount2() using
 * MNT_DETACH|UMOUNT_NOFOLLOW and then removed with rm_rf() using REMOVE_ROOT|REMOVE_CHMOD. The
 * results of both calls are discarded, i.e. the cleanup is best-effort. A NULL runtime_prefix or
 * unit is checked for before the path is built. */
5440 int exec_context_destroy_credentials(const ExecContext
*c
, const char *runtime_prefix
, const char *unit
) {
5441 _cleanup_free_
char *p
= NULL
;
5445 if (!runtime_prefix
|| !unit
)
5448 p
= path_join(runtime_prefix
, "credentials", unit
);
5452 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5453 * unmount it, and afterwards remove the mount point */
5454 (void) umount2(p
, MNT_DETACH
|UMOUNT_NOFOLLOW
);
5455 (void) rm_rf(p
, REMOVE_ROOT
|REMOVE_CHMOD
);
/* Frees the path string and the argv vector of a command, storing the values returned by
 * mfree() and strv_free() back into the two members. The ExecCommand object itself is left
 * alone. */
5460 static void exec_command_done(ExecCommand
*c
) {
5463 c
->path
= mfree(c
->path
);
5464 c
->argv
= strv_free(c
->argv
);
5467 void exec_command_done_array(ExecCommand
*c
, size_t n
) {
5468 for (size_t i
= 0; i
< n
; i
++)
5469 exec_command_done(c
+i
);
/* Disposes of a list of commands headed by c: an entry is unlinked from the list with
 * LIST_REMOVE() and its contents are then released with exec_command_done(). The returned
 * pointer is what exec_command_free_array() stores back into the slot it has just freed. */
5472 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
5476 LIST_REMOVE(command
, c
, i
);
5477 exec_command_done(i
);
5484 void exec_command_free_array(ExecCommand
**c
, size_t n
) {
5485 for (size_t i
= 0; i
< n
; i
++)
5486 c
[i
] = exec_command_free_list(c
[i
]);
5489 void exec_command_reset_status_array(ExecCommand
*c
, size_t n
) {
5490 for (size_t i
= 0; i
< n
; i
++)
5491 exec_status_reset(&c
[i
].exec_status
);
5494 void exec_command_reset_status_list_array(ExecCommand
**c
, size_t n
) {
5495 for (size_t i
= 0; i
< n
; i
++)
5496 LIST_FOREACH(command
, z
, c
[i
])
5497 exec_status_reset(&z
->exec_status
);
/* Context passed to invalid_env() through its userdata pointer. The callback reads the `unit`
 * member (the unit to log against) and the `path` member (which exec_context_load_environment()
 * sets to the environment file currently being processed). */
5500 typedef struct InvalidEnvInfo
{
/* Callback handed to strv_env_clean_with_callback(): logs an error against the unit stored in
 * the InvalidEnvInfo behind userdata, stating that the environment assignment p is invalid and
 * ignored, followed by the path stored in the same structure. */
5505 static void invalid_env(const char *p
, void *userdata
) {
5506 InvalidEnvInfo
*info
= userdata
;
5508 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
/* Returns the file descriptor name to use for one of the three standard streams. For each
 * stream the code first checks whether it is configured as a named fd (EXEC_INPUT_NAMED_FD for
 * stdin, EXEC_OUTPUT_NAMED_FD for stdout/stderr); the name returned is the one stored in
 * c->stdio_fdname[], falling back to "stdin", "stdout" or "stderr" when none is stored.
 * NOTE(review): callers such as exec_context_named_iofds() test the result for NULL, so a NULL
 * return is apparently possible when the stream is not a named fd -- confirm. */
5511 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
5517 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
5520 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
5523 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
5526 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
5529 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
5532 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
/* Resolves the named stdio file descriptors of a context against the fds passed in p.
 *
 * `targets` starts as the number of standard streams (stdin, stdout, stderr) configured as
 * named fds. The fd name of each stream is fetched with exec_context_fdname(). The function
 * then walks the p->n_storage_fds + p->n_socket_fds entries of p->fd_names while targets
 * remain; for an entry whose name equals the name of a stream that is a named fd and whose
 * slot in named_iofds[] is still negative, the corresponding p->fds[] value is stored in that
 * slot.
 *
 * Returns 0 if `targets` is 0 at the end, -ENOENT otherwise. */
5539 static int exec_context_named_iofds(
5540 const ExecContext
*c
,
5541 const ExecParameters
*p
,
5542 int named_iofds
[static 3]) {
5545 const char* stdio_fdname
[3];
5550 assert(named_iofds
);
5552 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
5553 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
5554 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
5556 for (size_t i
= 0; i
< 3; i
++)
5557 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
5559 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
5561 for (size_t i
= 0; i
< n_fds
&& targets
> 0; i
++)
5562 if (named_iofds
[STDIN_FILENO
] < 0 &&
5563 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
5564 stdio_fdname
[STDIN_FILENO
] &&
5565 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
5567 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
5570 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
5571 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
5572 stdio_fdname
[STDOUT_FILENO
] &&
5573 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
5575 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
5578 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
5579 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
5580 stdio_fdname
[STDERR_FILENO
] &&
5581 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
5583 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
5587 return targets
== 0 ? 0 : -ENOENT
;
/* Builds an environment block from the files listed in c->environment_files.
 *
 * Each entry is checked for being an absolute path and expanded with safe_glob(), so one entry
 * may name several files. Every matching file is parsed with load_env_file(); the parsed
 * assignments are passed through strv_env_clean_with_callback(), which reports invalid ones via
 * invalid_env() together with the file name, and the cleaned list is merged into the
 * accumulated environment with strv_env_merge(). */
5590 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***ret
) {
5591 _cleanup_strv_free_
char **v
= NULL
;
5597 STRV_FOREACH(i
, c
->environment_files
) {
5598 _cleanup_globfree_ glob_t pglob
= {};
5599 bool ignore
= false;
5607 if (!path_is_absolute(fn
)) {
5613 /* Filename supports globbing, take all matching files */
5614 r
= safe_glob(fn
, 0, &pglob
);
5621 /* When we don't match anything, -ENOENT should be returned */
5622 assert(pglob
.gl_pathc
> 0);
5624 for (unsigned n
= 0; n
< pglob
.gl_pathc
; n
++) {
5625 _cleanup_strv_free_
char **p
= NULL
;
5627 r
= load_env_file(NULL
, pglob
.gl_pathv
[n
], &p
);
5634 /* Log invalid environment variables with filename */
5636 InvalidEnvInfo info
= {
5638 .path
= pglob
.gl_pathv
[n
]
5641 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
5647 char **m
= strv_env_merge(v
, p
);
5651 strv_free_and_replace(v
, m
);
/* Tells whether the given TTY may be the device behind /dev/console. The name is first passed
 * through skip_dev_prefix(); "console" itself is the trivial case. Otherwise the console is
 * resolved with resolve_dev_console(): if that fails the answer is "may match" (true); if it
 * succeeds, the TTY matches when it equals the resolved name, or when the resolved name is
 * "tty0" (the active VC) and the TTY is a virtual console. */
5661 static bool tty_may_match_dev_console(const char *tty
) {
5662 _cleanup_free_
char *resolved
= NULL
;
5667 tty
= skip_dev_prefix(tty
);
5669 /* trivial identity? */
5670 if (streq(tty
, "console"))
5673 if (resolve_dev_console(&resolved
) < 0)
5674 return true; /* if we could not resolve, assume it may */
5676 /* "tty0" means the active VC, so it may be the same sometimes */
5677 return path_equal(resolved
, tty
) || (streq(resolved
, "tty0") && tty_is_vc(tty
));
/* Returns true if the context has any setting that makes the service interact with a TTY: one
 * of the TTY flags tested below (e.g. tty_reset, tty_vt_disallocate) is set, or stdin is a
 * terminal input, or stdout/stderr is a terminal output. */
5680 static bool exec_context_may_touch_tty(const ExecContext
*ec
) {
5683 return ec
->tty_reset
||
5685 ec
->tty_vt_disallocate
||
5686 is_terminal_input(ec
->std_input
) ||
5687 is_terminal_output(ec
->std_output
) ||
5688 is_terminal_output(ec
->std_error
);
/* Returns true only if the context may touch a TTY at all (exec_context_may_touch_tty()) and
 * the TTY path of the context may refer to /dev/console (tty_may_match_dev_console()). */
5691 bool exec_context_may_touch_console(const ExecContext
*ec
) {
5693 return exec_context_may_touch_tty(ec
) &&
5694 tty_may_match_dev_console(exec_context_tty_path(ec
));
/* Helper of strv_dump(): writes entries of the string list l to f, each one preceded by a
 * single space and without a trailing newline. */
5697 static void strv_fprintf(FILE *f
, char **l
) {
5701 fprintf(f
, " %s", *g
);
/* Prints one string-list setting for exec_context_dump(): if strv is not empty, writes
 * "<prefix><name>:" to f and then the list entries via strv_fprintf(). Empty lists produce no
 * heading. */
5704 static void strv_dump(FILE* f
, const char *prefix
, const char *name
, char **strv
) {
5709 if (!strv_isempty(strv
)) {
5710 fprintf(f
, "%s%s:", prefix
, name
);
5711 strv_fprintf(f
, strv
);
/* Writes a human-readable description of an ExecContext to f, one "<prefix><Setting>: <value>"
 * line per setting. prefix is normalized with strempty() before use.
 *
 * The output starts with a fixed block of boolean/enum settings (WorkingDirectory through
 * ProcSubset), followed by the settings that are only printed when configured: root image and
 * verity data, environment lists, per-type directories and their symlinks, scheduling and
 * resource limits, standard I/O and TTY settings, logging, capabilities, credentials of the
 * user/group, path lists, bind mounts and temporary file systems, security labels, system call
 * filters and architectures, namespace and file system restrictions, and mount/extension
 * images.
 *
 * Note that the root hash and the root hash path are both printed under the "RootHash" label,
 * and that the secure bits are printed under the label "Secure Bits" (with a space). */
5716 void exec_context_dump(const ExecContext
*c
, FILE* f
, const char *prefix
) {
5722 prefix
= strempty(prefix
);
5726 "%sWorkingDirectory: %s\n"
5727 "%sRootDirectory: %s\n"
5728 "%sNonBlocking: %s\n"
5729 "%sPrivateTmp: %s\n"
5730 "%sPrivateDevices: %s\n"
5731 "%sProtectKernelTunables: %s\n"
5732 "%sProtectKernelModules: %s\n"
5733 "%sProtectKernelLogs: %s\n"
5734 "%sProtectClock: %s\n"
5735 "%sProtectControlGroups: %s\n"
5736 "%sPrivateNetwork: %s\n"
5737 "%sPrivateUsers: %s\n"
5738 "%sProtectHome: %s\n"
5739 "%sProtectSystem: %s\n"
5740 "%sMountAPIVFS: %s\n"
5741 "%sIgnoreSIGPIPE: %s\n"
5742 "%sMemoryDenyWriteExecute: %s\n"
5743 "%sRestrictRealtime: %s\n"
5744 "%sRestrictSUIDSGID: %s\n"
5745 "%sKeyringMode: %s\n"
5746 "%sProtectHostname: %s\n"
5747 "%sProtectProc: %s\n"
5748 "%sProcSubset: %s\n",
5750 prefix
, empty_to_root(c
->working_directory
),
5751 prefix
, empty_to_root(c
->root_directory
),
5752 prefix
, yes_no(c
->non_blocking
),
5753 prefix
, yes_no(c
->private_tmp
),
5754 prefix
, yes_no(c
->private_devices
),
5755 prefix
, yes_no(c
->protect_kernel_tunables
),
5756 prefix
, yes_no(c
->protect_kernel_modules
),
5757 prefix
, yes_no(c
->protect_kernel_logs
),
5758 prefix
, yes_no(c
->protect_clock
),
5759 prefix
, yes_no(c
->protect_control_groups
),
5760 prefix
, yes_no(c
->private_network
),
5761 prefix
, yes_no(c
->private_users
),
5762 prefix
, protect_home_to_string(c
->protect_home
),
5763 prefix
, protect_system_to_string(c
->protect_system
),
5764 prefix
, yes_no(exec_context_get_effective_mount_apivfs(c
)),
5765 prefix
, yes_no(c
->ignore_sigpipe
),
5766 prefix
, yes_no(c
->memory_deny_write_execute
),
5767 prefix
, yes_no(c
->restrict_realtime
),
5768 prefix
, yes_no(c
->restrict_suid_sgid
),
5769 prefix
, exec_keyring_mode_to_string(c
->keyring_mode
),
5770 prefix
, yes_no(c
->protect_hostname
),
5771 prefix
, protect_proc_to_string(c
->protect_proc
),
5772 prefix
, proc_subset_to_string(c
->proc_subset
));
5775 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
5777 if (c
->root_image_options
) {
5778 fprintf(f
, "%sRootImageOptions:", prefix
);
5779 LIST_FOREACH(mount_options
, o
, c
->root_image_options
)
5780 if (!isempty(o
->options
))
5781 fprintf(f
, " %s:%s",
5782 partition_designator_to_string(o
->partition_designator
),
5788 _cleanup_free_
char *encoded
= NULL
;
5789 encoded
= hexmem(c
->root_hash
, c
->root_hash_size
);
5791 fprintf(f
, "%sRootHash: %s\n", prefix
, encoded
);
5794 if (c
->root_hash_path
)
5795 fprintf(f
, "%sRootHash: %s\n", prefix
, c
->root_hash_path
);
5797 if (c
->root_hash_sig
) {
5798 _cleanup_free_
char *encoded
= NULL
;
5800 len
= base64mem(c
->root_hash_sig
, c
->root_hash_sig_size
, &encoded
);
5802 fprintf(f
, "%sRootHashSignature: base64:%s\n", prefix
, encoded
);
5805 if (c
->root_hash_sig_path
)
5806 fprintf(f
, "%sRootHashSignature: %s\n", prefix
, c
->root_hash_sig_path
);
5809 fprintf(f
, "%sRootVerity: %s\n", prefix
, c
->root_verity
);
5811 STRV_FOREACH(e
, c
->environment
)
5812 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
5814 STRV_FOREACH(e
, c
->environment_files
)
5815 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
5817 STRV_FOREACH(e
, c
->pass_environment
)
5818 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
5820 STRV_FOREACH(e
, c
->unset_environment
)
5821 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
5823 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
5825 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
5826 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
5828 for (size_t i
= 0; i
< c
->directories
[dt
].n_items
; i
++) {
5829 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].items
[i
].path
);
5831 STRV_FOREACH(d
, c
->directories
[dt
].items
[i
].symlinks
)
5832 fprintf(f
, "%s%s: %s:%s\n", prefix
, exec_directory_type_symlink_to_string(dt
), c
->directories
[dt
].items
[i
].path
, *d
);
5836 fprintf(f
, "%sTimeoutCleanSec: %s\n", prefix
, FORMAT_TIMESPAN(c
->timeout_clean_usec
, USEC_PER_SEC
));
5839 fprintf(f
, "%sNice: %i\n", prefix
, c
->nice
);
5841 if (c
->oom_score_adjust_set
)
5842 fprintf(f
, "%sOOMScoreAdjust: %i\n", prefix
, c
->oom_score_adjust
);
5844 if (c
->coredump_filter_set
)
5845 fprintf(f
, "%sCoredumpFilter: 0x%"PRIx64
"\n", prefix
, c
->coredump_filter
);
5847 for (unsigned i
= 0; i
< RLIM_NLIMITS
; i
++)
5849 fprintf(f
, "%sLimit%s: " RLIM_FMT
"\n",
5850 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
5851 fprintf(f
, "%sLimit%sSoft: " RLIM_FMT
"\n",
5852 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
5855 if (c
->ioprio_set
) {
5856 _cleanup_free_
char *class_str
= NULL
;
5858 r
= ioprio_class_to_string_alloc(ioprio_prio_class(c
->ioprio
), &class_str
);
5860 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
5862 fprintf(f
, "%sIOPriority: %d\n", prefix
, ioprio_prio_data(c
->ioprio
));
5865 if (c
->cpu_sched_set
) {
5866 _cleanup_free_
char *policy_str
= NULL
;
5868 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
5870 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
5873 "%sCPUSchedulingPriority: %i\n"
5874 "%sCPUSchedulingResetOnFork: %s\n",
5875 prefix
, c
->cpu_sched_priority
,
5876 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
5879 if (c
->cpu_set
.set
) {
5880 _cleanup_free_
char *affinity
= NULL
;
5882 affinity
= cpu_set_to_range_string(&c
->cpu_set
);
5883 fprintf(f
, "%sCPUAffinity: %s\n", prefix
, affinity
);
5886 if (mpol_is_valid(numa_policy_get_type(&c
->numa_policy
))) {
5887 _cleanup_free_
char *nodes
= NULL
;
5889 nodes
= cpu_set_to_range_string(&c
->numa_policy
.nodes
);
5890 fprintf(f
, "%sNUMAPolicy: %s\n", prefix
, mpol_to_string(numa_policy_get_type(&c
->numa_policy
)));
5891 fprintf(f
, "%sNUMAMask: %s\n", prefix
, strnull(nodes
));
5894 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
5895 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
5898 "%sStandardInput: %s\n"
5899 "%sStandardOutput: %s\n"
5900 "%sStandardError: %s\n",
5901 prefix
, exec_input_to_string(c
->std_input
),
5902 prefix
, exec_output_to_string(c
->std_output
),
5903 prefix
, exec_output_to_string(c
->std_error
));
5905 if (c
->std_input
== EXEC_INPUT_NAMED_FD
)
5906 fprintf(f
, "%sStandardInputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDIN_FILENO
]);
5907 if (c
->std_output
== EXEC_OUTPUT_NAMED_FD
)
5908 fprintf(f
, "%sStandardOutputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDOUT_FILENO
]);
5909 if (c
->std_error
== EXEC_OUTPUT_NAMED_FD
)
5910 fprintf(f
, "%sStandardErrorFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDERR_FILENO
]);
5912 if (c
->std_input
== EXEC_INPUT_FILE
)
5913 fprintf(f
, "%sStandardInputFile: %s\n", prefix
, c
->stdio_file
[STDIN_FILENO
]);
5914 if (c
->std_output
== EXEC_OUTPUT_FILE
)
5915 fprintf(f
, "%sStandardOutputFile: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
5916 if (c
->std_output
== EXEC_OUTPUT_FILE_APPEND
)
5917 fprintf(f
, "%sStandardOutputFileToAppend: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
5918 if (c
->std_output
== EXEC_OUTPUT_FILE_TRUNCATE
)
5919 fprintf(f
, "%sStandardOutputFileToTruncate: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
5920 if (c
->std_error
== EXEC_OUTPUT_FILE
)
5921 fprintf(f
, "%sStandardErrorFile: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
5922 if (c
->std_error
== EXEC_OUTPUT_FILE_APPEND
)
5923 fprintf(f
, "%sStandardErrorFileToAppend: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
5924 if (c
->std_error
== EXEC_OUTPUT_FILE_TRUNCATE
)
5925 fprintf(f
, "%sStandardErrorFileToTruncate: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
5931 "%sTTYVHangup: %s\n"
5932 "%sTTYVTDisallocate: %s\n"
5934 "%sTTYColumns: %u\n",
5935 prefix
, c
->tty_path
,
5936 prefix
, yes_no(c
->tty_reset
),
5937 prefix
, yes_no(c
->tty_vhangup
),
5938 prefix
, yes_no(c
->tty_vt_disallocate
),
5939 prefix
, c
->tty_rows
,
5940 prefix
, c
->tty_cols
);
5942 if (IN_SET(c
->std_output
,
5944 EXEC_OUTPUT_JOURNAL
,
5945 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
5946 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
5947 IN_SET(c
->std_error
,
5949 EXEC_OUTPUT_JOURNAL
,
5950 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
5951 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
5953 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
5955 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
5957 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
5959 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
5961 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
5964 if (c
->log_level_max
>= 0) {
5965 _cleanup_free_
char *t
= NULL
;
5967 (void) log_level_to_string_alloc(c
->log_level_max
, &t
);
5969 fprintf(f
, "%sLogLevelMax: %s\n", prefix
, strna(t
));
5972 if (c
->log_ratelimit_interval_usec
> 0)
5974 "%sLogRateLimitIntervalSec: %s\n",
5975 prefix
, FORMAT_TIMESPAN(c
->log_ratelimit_interval_usec
, USEC_PER_SEC
));
5977 if (c
->log_ratelimit_burst
> 0)
5978 fprintf(f
, "%sLogRateLimitBurst: %u\n", prefix
, c
->log_ratelimit_burst
);
5980 for (size_t j
= 0; j
< c
->n_log_extra_fields
; j
++) {
5981 fprintf(f
, "%sLogExtraFields: ", prefix
);
5982 fwrite(c
->log_extra_fields
[j
].iov_base
,
5983 1, c
->log_extra_fields
[j
].iov_len
,
5988 if (c
->log_namespace
)
5989 fprintf(f
, "%sLogNamespace: %s\n", prefix
, c
->log_namespace
);
5991 if (c
->secure_bits
) {
5992 _cleanup_free_
char *str
= NULL
;
5994 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
5996 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
5999 if (c
->capability_bounding_set
!= CAP_ALL
) {
6000 _cleanup_free_
char *str
= NULL
;
6002 r
= capability_set_to_string_alloc(c
->capability_bounding_set
, &str
);
6004 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
6007 if (c
->capability_ambient_set
!= 0) {
6008 _cleanup_free_
char *str
= NULL
;
6010 r
= capability_set_to_string_alloc(c
->capability_ambient_set
, &str
);
6012 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
6016 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
6018 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
6020 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
6022 strv_dump(f
, prefix
, "SupplementaryGroups", c
->supplementary_groups
);
6025 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
6027 strv_dump(f
, prefix
, "ReadWritePaths", c
->read_write_paths
);
6028 strv_dump(f
, prefix
, "ReadOnlyPaths", c
->read_only_paths
);
6029 strv_dump(f
, prefix
, "InaccessiblePaths", c
->inaccessible_paths
);
6030 strv_dump(f
, prefix
, "ExecPaths", c
->exec_paths
);
6031 strv_dump(f
, prefix
, "NoExecPaths", c
->no_exec_paths
);
6032 strv_dump(f
, prefix
, "ExecSearchPath", c
->exec_search_path
);
6034 for (size_t i
= 0; i
< c
->n_bind_mounts
; i
++)
6035 fprintf(f
, "%s%s: %s%s:%s:%s\n", prefix
,
6036 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
6037 c
->bind_mounts
[i
].ignore_enoent
? "-": "",
6038 c
->bind_mounts
[i
].source
,
6039 c
->bind_mounts
[i
].destination
,
6040 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
6042 for (size_t i
= 0; i
< c
->n_temporary_filesystems
; i
++) {
6043 const TemporaryFileSystem
*t
= c
->temporary_filesystems
+ i
;
6045 fprintf(f
, "%sTemporaryFileSystem: %s%s%s\n", prefix
,
6047 isempty(t
->options
) ? "" : ":",
6048 strempty(t
->options
));
6053 "%sUtmpIdentifier: %s\n",
6054 prefix
, c
->utmp_id
);
6056 if (c
->selinux_context
)
6058 "%sSELinuxContext: %s%s\n",
6059 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
6061 if (c
->apparmor_profile
)
6063 "%sAppArmorProfile: %s%s\n",
6064 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
6066 if (c
->smack_process_label
)
6068 "%sSmackProcessLabel: %s%s\n",
6069 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
6071 if (c
->personality
!= PERSONALITY_INVALID
)
6073 "%sPersonality: %s\n",
6074 prefix
, strna(personality_to_string(c
->personality
)));
6077 "%sLockPersonality: %s\n",
6078 prefix
, yes_no(c
->lock_personality
));
6080 if (c
->syscall_filter
) {
6082 "%sSystemCallFilter: ",
6085 if (!c
->syscall_allow_list
)
6091 HASHMAP_FOREACH_KEY(val
, id
, c
->syscall_filter
) {
6092 _cleanup_free_
char *name
= NULL
;
6093 const char *errno_name
= NULL
;
6094 int num
= PTR_TO_INT(val
);
6101 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
6102 fputs(strna(name
), f
);
6105 errno_name
= seccomp_errno_or_action_to_string(num
);
6107 fprintf(f
, ":%s", errno_name
);
6109 fprintf(f
, ":%d", num
);
6117 if (c
->syscall_archs
) {
6119 "%sSystemCallArchitectures:",
6124 SET_FOREACH(id
, c
->syscall_archs
)
6125 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
6130 if (exec_context_restrict_namespaces_set(c
)) {
6131 _cleanup_free_
char *s
= NULL
;
6133 r
= namespace_flags_to_string(c
->restrict_namespaces
, &s
);
6135 fprintf(f
, "%sRestrictNamespaces: %s\n",
6140 if (exec_context_restrict_filesystems_set(c
)) {
6142 SET_FOREACH(fs
, c
->restrict_filesystems
)
6143 fprintf(f
, "%sRestrictFileSystems: %s\n", prefix
, fs
);
6147 if (c
->network_namespace_path
)
6149 "%sNetworkNamespacePath: %s\n",
6150 prefix
, c
->network_namespace_path
);
6152 if (c
->syscall_errno
> 0) {
6153 fprintf(f
, "%sSystemCallErrorNumber: ", prefix
);
6156 const char *errno_name
= seccomp_errno_or_action_to_string(c
->syscall_errno
);
6158 fputs(errno_name
, f
);
6160 fprintf(f
, "%d", c
->syscall_errno
);
6165 for (size_t i
= 0; i
< c
->n_mount_images
; i
++) {
6166 fprintf(f
, "%sMountImages: %s%s:%s", prefix
,
6167 c
->mount_images
[i
].ignore_enoent
? "-": "",
6168 c
->mount_images
[i
].source
,
6169 c
->mount_images
[i
].destination
);
6170 LIST_FOREACH(mount_options
, o
, c
->mount_images
[i
].mount_options
)
6171 fprintf(f
, ":%s:%s",
6172 partition_designator_to_string(o
->partition_designator
),
6173 strempty(o
->options
));
6177 for (size_t i
= 0; i
< c
->n_extension_images
; i
++) {
6178 fprintf(f
, "%sExtensionImages: %s%s", prefix
,
6179 c
->extension_images
[i
].ignore_enoent
? "-": "",
6180 c
->extension_images
[i
].source
);
6181 LIST_FOREACH(mount_options
, o
, c
->extension_images
[i
].mount_options
)
6182 fprintf(f
, ":%s:%s",
6183 partition_designator_to_string(o
->partition_designator
),
6184 strempty(o
->options
));
6188 strv_dump(f
, prefix
, "ExtensionDirectories", c
->extension_directories
);
/* Privilege check on the configured user of a context. The user names "root" and "0" are both
 * recognized as the root user. */
6191 bool exec_context_maintains_privileges(const ExecContext
*c
) {
6194 /* Returns true if the process forked off would run under
6195 * an unchanged UID or as root. */
6200 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
/* Returns the I/O priority that applies to the context. The code queries the I/O priority of
 * the calling process with ioprio_get(IOPRIO_WHO_PROCESS, 0); IOPRIO_DEFAULT_CLASS_AND_PRIO is
 * the fallback value, and a successfully queried priority is passed through ioprio_normalize()
 * before being returned.
 * NOTE(review): presumably an explicitly configured c->ioprio is returned before the process
 * is queried -- confirm against the full function. */
6206 int exec_context_get_effective_ioprio(const ExecContext
*c
) {
6214 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
6216 return IOPRIO_DEFAULT_CLASS_AND_PRIO
;
6218 return ioprio_normalize(p
);
/* Returns the effective value of the MountAPIVFS setting (it is printed under that label by
 * exec_context_dump()). An explicit configuration (c->mount_apivfs_set) takes precedence and
 * c->mount_apivfs is returned as is; otherwise the default depends on whether the context has
 * a root directory or root image, as reported by exec_context_with_rootfs(). */
6221 bool exec_context_get_effective_mount_apivfs(const ExecContext
*c
) {
6224 /* Explicit setting wins */
6225 if (c
->mount_apivfs_set
)
6226 return c
->mount_apivfs
;
6228 /* Default to "yes" if root directory or image are specified */
6229 if (exec_context_with_rootfs(c
))
/* Drops all extra log fields of a context: frees the iov_base buffer of each of the
 * c->n_log_extra_fields entries, then frees the array itself (storing the value returned by
 * mfree() back into c->log_extra_fields) and resets the entry count to 0. */
6235 void exec_context_free_log_extra_fields(ExecContext
*c
) {
6238 for (size_t l
= 0; l
< c
->n_log_extra_fields
; l
++)
6239 free(c
->log_extra_fields
[l
].iov_base
);
6240 c
->log_extra_fields
= mfree(c
->log_extra_fields
);
6241 c
->n_log_extra_fields
= 0;
/* Puts the TTY of a context back into its default state once it is no longer needed.
 *
 * The TTY is reset first via exec_context_tty_reset(). If the context may touch a TTY, the
 * configured TTY path is opened with O_PATH|O_CLOEXEC, checked with fstat() to be a character
 * device, and then given mode TTY_MODE, owner 0 and group TTY_GID with fchmod_and_chown().
 *
 * Every failure is only logged and otherwise ignored; a missing TTY node (ENOENT on open) is
 * logged at debug level, other open failures at warning level. */
6244 void exec_context_revert_tty(ExecContext
*c
) {
6245 _cleanup_close_
int fd
= -1;
6252 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6253 exec_context_tty_reset(c
, NULL
);
6255 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6256 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6257 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6258 if (!exec_context_may_touch_tty(c
))
6261 path
= exec_context_tty_path(c
);
6265 fd
= open(path
, O_PATH
|O_CLOEXEC
);
6267 return (void) log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
,
6268 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6271 if (fstat(fd
, &st
) < 0)
6272 return (void) log_warning_errno(errno
, "Failed to stat TTY '%s', ignoring: %m", path
);
6274 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6275 * if things are a character device, since a proper check either means we'd have to open the TTY and
6276 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6277 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6278 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6279 if (!S_ISCHR(st
.st_mode
))
6280 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path
);
6282 r
= fchmod_and_chown(fd
, TTY_MODE
, 0, TTY_GID
);
6284 log_warning_errno(r
, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path
);
/* Collects, in a newly built string list, the paths belonging to the directory types selected in 'mask'.
 * For every configured item of a selected type this adds prefix[t]/<path>, for all types except
 * EXEC_DIRECTORY_CONFIGURATION additionally prefix[t]/private/<path>, and prefix[t]/<symlink> for each of
 * the item's symlinks. */
6287 int exec_context_get_clean_directories(
6293 _cleanup_strv_free_
char **l
= NULL
;
6300 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
/* Bit t of the mask selects directory type t. */
6301 if (!FLAGS_SET(mask
, 1U << t
))
6307 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
6310 j
= path_join(prefix
[t
], c
->directories
[t
].items
[i
].path
);
/* strv_consume() is handed the freshly joined string itself rather than a copy. */
6314 r
= strv_consume(&l
, j
);
6318 /* Also remove private directories unconditionally. */
6319 if (t
!= EXEC_DIRECTORY_CONFIGURATION
) {
6320 j
= path_join(prefix
[t
], "private", c
->directories
[t
].items
[i
].path
);
6324 r
= strv_consume(&l
, j
);
/* The symlinks configured for the item live below the same prefix. */
6329 STRV_FOREACH(symlink
, c
->directories
[t
].items
[i
].symlinks
) {
6330 j
= path_join(prefix
[t
], *symlink
);
6334 r
= strv_consume(&l
, j
);
/* Computes an ExecCleanMask for the context by checking, per directory type, whether any items are
 * configured. NOTE(review): the statement under the visible condition is not part of this excerpt —
 * presumably it sets bit t in 'mask', which is then stored in *ret; confirm. */
6345 int exec_context_get_clean_mask(ExecContext
*c
, ExecCleanMask
*ret
) {
6346 ExecCleanMask mask
= 0;
6351 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
6352 if (c
->directories
[t
].n_items
> 0)
/* Marks the start of a process in 's': the current time is stored as start timestamp.
 * NOTE(review): how 'pid' is recorded is not visible in this excerpt. */
6359 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
6366 dual_timestamp_get(&s
->start_timestamp
);
/* Marks the exit of a process in 's': the current time is stored as exit timestamp and, if a context with
 * a utmp ID is supplied, a "dead process" utmp record is written for pid/code/status. 'context' may be
 * NULL. */
6369 void exec_status_exit(ExecStatus
*s
, const ExecContext
*context
, pid_t pid
, int code
, int status
) {
6377 dual_timestamp_get(&s
->exit_timestamp
);
6382 if (context
&& context
->utmp_id
)
/* Best effort only: the result of the utmp update is deliberately discarded. */
6383 (void) utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
/* Resets *s to a zero-initialized ExecStatus, dropping everything recorded so far. */
6386 void exec_status_reset(ExecStatus
*s
) {
6389 *s
= (ExecStatus
) {};
/* Writes a human-readable description of 's' to 'f', each line starting with 'prefix' (NULL is treated as
 * the empty string): the PID, the start timestamp if set, and the exit timestamp, exit code and exit
 * status if an exit timestamp is set. */
6392 void exec_status_dump(const ExecStatus
*s
, FILE *f
, const char *prefix
) {
6399 prefix
= strempty(prefix
);
6402 "%sPID: "PID_FMT
"\n",
6405 if (dual_timestamp_is_set(&s
->start_timestamp
))
6407 "%sStart Timestamp: %s\n",
6408 prefix
, FORMAT_TIMESTAMP(s
->start_timestamp
.realtime
));
/* The exit related fields are only printed once an exit was actually recorded. */
6410 if (dual_timestamp_is_set(&s
->exit_timestamp
))
6412 "%sExit Timestamp: %s\n"
6414 "%sExit Status: %i\n",
6415 prefix
, FORMAT_TIMESTAMP(s
->exit_timestamp
.realtime
),
6416 prefix
, sigchld_code_to_string(s
->code
),
/* Dumps a single command to 'f': its quoted command line, followed by its exec status, which is indented
 * by one additional tab relative to 'prefix'. */
6420 static void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
6421 _cleanup_free_
char *cmd
= NULL
;
6422 const char *prefix2
;
6427 prefix
= strempty(prefix
);
6428 prefix2
= strjoina(prefix
, "\t");
6430 cmd
= quote_command_line(c
->argv
, SHELL_ESCAPE_EMPTY
);
6432 "%sCommand Line: %s\n",
/* If quoting failed (cmd == NULL), print the error text for ENOMEM in place of the command line. */
6433 prefix
, cmd
?: strerror_safe(ENOMEM
));
6435 exec_status_dump(&c
->exec_status
, f
, prefix2
);
/* Dumps every command of the list starting at 'c' to 'f', in list order, via exec_command_dump().
 * A NULL 'prefix' is treated as the empty string. */
6438 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
6441 prefix
= strempty(prefix
);
6443 LIST_FOREACH(command
, i
, c
)
6444 exec_command_dump(i
, f
, prefix
);
/* Appends command 'e' to the list '*l': the current tail is looked up and 'e' is inserted right after it,
 * so that existing entries keep their relative order. */
6447 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
6454 /* It's kind of important, that we keep the order here */
6455 LIST_FIND_TAIL(command
, *l
, end
);
6456 LIST_INSERT_AFTER(command
, *l
, end
, e
);
/* Replaces the path and argument vector of command 'c'. The new argv is built from 'path' plus the
 * variadic arguments via strv_new_ap(); the previous c->path and c->argv are released by the
 * *_free_and_replace() helpers.
 * NOTE(review): where 'p' comes from is not visible in this excerpt — presumably a copy of 'path'. */
6461 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
6469 l
= strv_new_ap(path
, ap
);
6481 free_and_replace(c
->path
, p
);
6483 return strv_free_and_replace(c
->argv
, l
);
/* Appends 'path' and the variadic arguments to the existing argument vector of command 'c'. The temporary
 * list 'l' is released automatically on return. */
6486 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
6487 _cleanup_strv_free_
char **l
= NULL
;
6495 l
= strv_new_ap(path
, ap
);
/* 'false' here presumably means: do not filter duplicates — TODO confirm against strv_extend_strv(). */
6501 r
= strv_extend_strv(&c
->argv
, l
, false);
/* Thread entry point used to remove a temporary directory in the background. Takes ownership of the path
 * string passed in 'p' (it is freed automatically when the function returns) and removes the tree rooted
 * there; the result of the removal is ignored. */
6508 static void *remove_tmpdir_thread(void *p
) {
6509 _cleanup_free_
char *path
= p
;
6511 (void) rm_rf(path
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
/* Releases an ExecRuntime object: unregisters it from the manager's exec_runtime_by_id hashmap, frees its
 * strings and closes both namespace storage socket pairs. If 'destroy' is true, the private tmp_dir and
 * var_tmp_dir are additionally removed from disk by background jobs, unless they refer to
 * RUN_SYSTEMD_EMPTY. */
6515 static ExecRuntime
* exec_runtime_free(ExecRuntime
*rt
, bool destroy
) {
6522 (void) hashmap_remove(rt
->manager
->exec_runtime_by_id
, rt
->id
);
6524 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
6526 if (destroy
&& rt
->tmp_dir
&& !streq(rt
->tmp_dir
, RUN_SYSTEMD_EMPTY
)) {
6527 log_debug("Spawning thread to nuke %s", rt
->tmp_dir
);
/* remove_tmpdir_thread() frees the string it is handed. */
6529 r
= asynchronous_job(remove_tmpdir_thread
, rt
->tmp_dir
);
6531 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->tmp_dir
);
6536 if (destroy
&& rt
->var_tmp_dir
&& !streq(rt
->var_tmp_dir
, RUN_SYSTEMD_EMPTY
)) {
6537 log_debug("Spawning thread to nuke %s", rt
->var_tmp_dir
);
6539 r
= asynchronous_job(remove_tmpdir_thread
, rt
->var_tmp_dir
);
6541 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->var_tmp_dir
);
/* Forget the pointer here so that the mfree() below does not free it a second time; presumably this
 * happens only when the job was spawned successfully — the surrounding branch is not visible. */
6543 rt
->var_tmp_dir
= NULL
;
6546 rt
->id
= mfree(rt
->id
);
6547 rt
->tmp_dir
= mfree(rt
->tmp_dir
);
6548 rt
->var_tmp_dir
= mfree(rt
->var_tmp_dir
);
6549 safe_close_pair(rt
->netns_storage_socket
);
6550 safe_close_pair(rt
->ipcns_storage_socket
);
/* Cleanup helper for use with _cleanup_(): frees the referenced ExecRuntime without destroying its
 * temporary directories (destroy == false). */
6554 static void exec_runtime_freep(ExecRuntime
**rt
) {
6555 (void) exec_runtime_free(*rt
, false);
/* Allocates a new ExecRuntime object carrying a private copy of 'id'. Both namespace storage socket pairs
 * start out as { -1, -1 }, i.e. unset; all fields not listed in the initializer are zero-initialized. */
6558 static int exec_runtime_allocate(ExecRuntime
**ret
, const char *id
) {
6559 _cleanup_free_
char *id_copy
= NULL
;
6564 id_copy
= strdup(id
);
6568 n
= new(ExecRuntime
, 1);
6572 *n
= (ExecRuntime
) {
/* Ownership of the copied string moves into the new object. */
6573 .id
= TAKE_PTR(id_copy
),
6574 .netns_storage_socket
= { -1, -1 },
6575 .ipcns_storage_socket
= { -1, -1 },
/* Creates an ExecRuntime object for 'id', registers it in the manager's exec_runtime_by_id hashmap and
 * moves the given resources into it: *tmp_dir and *var_tmp_dir are taken over (and reset via TAKE_PTR()),
 * as are the fds of the two socket pairs (reset via TAKE_FD()). Either socket pair argument may be
 * NULL. */
6582 static int exec_runtime_add(
6587 int netns_storage_socket
[2],
6588 int ipcns_storage_socket
[2],
6589 ExecRuntime
**ret
) {
6591 _cleanup_(exec_runtime_freep
) ExecRuntime
*rt
= NULL
;
6597 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6599 r
= exec_runtime_allocate(&rt
, id
);
/* The hashmap key is the id string owned by the object itself. */
6603 r
= hashmap_ensure_put(&m
->exec_runtime_by_id
, &string_hash_ops
, rt
->id
, rt
);
/* NOTE(review): this assertion inspects rt's fields before they are assigned below; on the freshly
 * allocated object both look to be NULL, so the check appears vacuous — possibly *tmp_dir and
 * *var_tmp_dir were meant. Confirm. */
6607 assert(!!rt
->tmp_dir
== !!rt
->var_tmp_dir
); /* We require both to be set together */
6608 rt
->tmp_dir
= TAKE_PTR(*tmp_dir
);
6609 rt
->var_tmp_dir
= TAKE_PTR(*var_tmp_dir
);
6611 if (netns_storage_socket
) {
6612 rt
->netns_storage_socket
[0] = TAKE_FD(netns_storage_socket
[0]);
6613 rt
->netns_storage_socket
[1] = TAKE_FD(netns_storage_socket
[1]);
6616 if (ipcns_storage_socket
) {
6617 rt
->ipcns_storage_socket
[0] = TAKE_FD(ipcns_storage_socket
[0]);
6618 rt
->ipcns_storage_socket
[1] = TAKE_FD(ipcns_storage_socket
[1]);
6625 /* do not remove created ExecRuntime object when the operation succeeds. */
6630 static int exec_runtime_make(
6632 const ExecContext
*c
,
6634 ExecRuntime
**ret
) {
6636 _cleanup_(namespace_cleanup_tmpdirp
) char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
6637 _cleanup_close_pair_
int netns_storage_socket
[2] = { -1, -1 }, ipcns_storage_socket
[2] = { -1, -1 };
6644 /* It is not necessary to create ExecRuntime object. */
6645 if (!c
->private_network
&& !c
->private_ipc
&& !c
->private_tmp
&& !c
->network_namespace_path
) {
6650 if (c
->private_tmp
&&
6651 !(prefixed_path_strv_contains(c
->inaccessible_paths
, "/tmp") &&
6652 (prefixed_path_strv_contains(c
->inaccessible_paths
, "/var/tmp") ||
6653 prefixed_path_strv_contains(c
->inaccessible_paths
, "/var")))) {
6654 r
= setup_tmp_dirs(id
, &tmp_dir
, &var_tmp_dir
);
6659 if (c
->private_network
|| c
->network_namespace_path
) {
6660 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, netns_storage_socket
) < 0)
6664 if (c
->private_ipc
|| c
->ipc_namespace_path
) {
6665 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, ipcns_storage_socket
) < 0)
6669 r
= exec_runtime_add(m
, id
, &tmp_dir
, &var_tmp_dir
, netns_storage_socket
, ipcns_storage_socket
, ret
);
/* Looks up the ExecRuntime object registered under 'id' and hands out a reference to it; if none exists
 * yet, one is set up via exec_runtime_make().
 * NOTE(review): the role of 'create' is not visible in this excerpt — presumably creation only happens
 * when it is true; confirm. */
6676 int exec_runtime_acquire(Manager
*m
, const ExecContext
*c
, const char *id
, bool create
, ExecRuntime
**ret
) {
6684 rt
= hashmap_get(m
->exec_runtime_by_id
, id
);
6686 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
6694 /* If not found, then create a new object. */
6695 r
= exec_runtime_make(m
, c
, id
, &rt
);
6699 /* When r == 0, it is not necessary to create ExecRuntime object. */
6705 /* increment reference counter. */
/* Drops a reference to 'rt'. The reference counter must be positive on entry. On the path visible here the
 * object is released through exec_runtime_free(), with 'destroy' deciding whether its temporary
 * directories are removed as well. NOTE(review): presumably this only happens once the counter drops to
 * zero — the decrement is not part of this excerpt. */
6711 ExecRuntime
*exec_runtime_unref(ExecRuntime
*rt
, bool destroy
) {
6715 assert(rt
->n_ref
> 0);
6721 return exec_runtime_free(rt
, destroy
);
/* Serializes all ExecRuntime objects of the manager to 'f'. Each object becomes one "exec-runtime=<id>"
 * record, followed by space-separated " tmp-dir=", " var-tmp-dir=", " netns-socket-{0,1}=" and
 * " ipcns-socket-{0,1}=" fields. Socket fds are duplicated into 'fds' and the number of the duplicate is
 * what gets written. */
6724 int exec_runtime_serialize(const Manager
*m
, FILE *f
, FDSet
*fds
) {
6731 HASHMAP_FOREACH(rt
, m
->exec_runtime_by_id
) {
6732 fprintf(f
, "exec-runtime=%s", rt
->id
);
6735 fprintf(f
, " tmp-dir=%s", rt
->tmp_dir
);
6737 if (rt
->var_tmp_dir
)
6738 fprintf(f
, " var-tmp-dir=%s", rt
->var_tmp_dir
);
/* Only sockets that are actually open (fd >= 0) are serialized. */
6740 if (rt
->netns_storage_socket
[0] >= 0) {
6743 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
6747 fprintf(f
, " netns-socket-0=%i", copy
);
6750 if (rt
->netns_storage_socket
[1] >= 0) {
6753 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
6757 fprintf(f
, " netns-socket-1=%i", copy
);
6760 if (rt
->ipcns_storage_socket
[0] >= 0) {
6763 copy
= fdset_put_dup(fds
, rt
->ipcns_storage_socket
[0]);
6767 fprintf(f
, " ipcns-socket-0=%i", copy
);
6770 if (rt
->ipcns_storage_socket
[1] >= 0) {
6773 copy
= fdset_put_dup(fds
, rt
->ipcns_storage_socket
[1]);
6777 fprintf(f
, " ipcns-socket-1=%i", copy
);
/* Handles one key/value pair of the old per-unit serialization format ("tmp-dir", "var-tmp-dir",
 * "netns-socket-0", "netns-socket-1"). The value is applied to the ExecRuntime object registered under the
 * unit's id; if there is none yet, a new object is allocated and registered in the manager's hashmap. */
6786 int exec_runtime_deserialize_compat(Unit
*u
, const char *key
, const char *value
, FDSet
*fds
) {
6787 _cleanup_(exec_runtime_freep
) ExecRuntime
*rt_create
= NULL
;
6791 /* This is for the migration from old (v237 or earlier) deserialization text.
6792 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6793 * Even if the ExecRuntime object was originally created by another unit, we cannot tell
6794 * that from the serialized text, hence we always create a new object owned by this unit. */
6800 /* Manager manages ExecRuntime objects by the unit id.
6801 * So, we omit the serialized text when the unit does not have id (yet?)... */
/* NOTE(review): the condition tests the unit id while the message speaks of an "Invocation ID". */
6802 if (isempty(u
->id
)) {
6803 log_unit_debug(u
, "Invocation ID not found. Dropping runtime parameter.");
6807 if (hashmap_ensure_allocated(&u
->manager
->exec_runtime_by_id
, &string_hash_ops
) < 0)
6810 rt
= hashmap_get(u
->manager
->exec_runtime_by_id
, u
->id
);
6812 if (exec_runtime_allocate(&rt_create
, u
->id
) < 0)
6818 if (streq(key
, "tmp-dir")) {
6819 if (free_and_strdup_warn(&rt
->tmp_dir
, value
) < 0)
6822 } else if (streq(key
, "var-tmp-dir")) {
6823 if (free_and_strdup_warn(&rt
->var_tmp_dir
, value
) < 0)
6826 } else if (streq(key
, "netns-socket-0")) {
/* The value has to be a number that names an fd contained in 'fds'. */
6829 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
)) {
6830 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
/* Close whatever was stored before, then take the fd out of the set. */
6834 safe_close(rt
->netns_storage_socket
[0]);
6835 rt
->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
6837 } else if (streq(key
, "netns-socket-1")) {
6840 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
)) {
6841 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
6845 safe_close(rt
->netns_storage_socket
[1]);
6846 rt
->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
6851 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6853 r
= hashmap_put(u
->manager
->exec_runtime_by_id
, rt_create
->id
, rt_create
);
6855 log_unit_debug_errno(u
, r
, "Failed to put runtime parameter to manager's storage: %m");
6859 rt_create
->manager
= u
->manager
;
/* Disarm the cleanup handler: the object must stay alive from here on. */
6862 TAKE_PTR(rt_create
);
/* Parses one serialized "exec-runtime=" value: the id, followed by optional space-separated "tmp-dir=",
 * "var-tmp-dir=", "netns-socket-{0,1}=" and "ipcns-socket-{0,1}=" fields, checked in exactly this order.
 * Every socket field must name an fd contained in 'fds'; that fd is then taken out of the set. The
 * collected data is finally passed to exec_runtime_add(). 'value' must not be NULL. */
6868 int exec_runtime_deserialize_one(Manager
*m
, const char *value
, FDSet
*fds
) {
6869 _cleanup_free_
char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
6871 int r
, netns_fdpair
[] = {-1, -1}, ipcns_fdpair
[] = {-1, -1};
6872 const char *p
, *v
= ASSERT_PTR(value
);
/* The id is everything up to the first space. */
6878 n
= strcspn(v
, " ");
6879 id
= strndupa_safe(v
, n
);
6884 v
= startswith(p
, "tmp-dir=");
6886 n
= strcspn(v
, " ");
6887 tmp_dir
= strndup(v
, n
);
6895 v
= startswith(p
, "var-tmp-dir=");
6897 n
= strcspn(v
, " ");
6898 var_tmp_dir
= strndup(v
, n
);
6906 v
= startswith(p
, "netns-socket-0=");
6910 n
= strcspn(v
, " ");
6911 buf
= strndupa_safe(v
, n
);
6913 r
= safe_atoi(buf
, &netns_fdpair
[0]);
6915 return log_debug_errno(r
, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf
);
/* Only fds that were actually passed along in 'fds' are accepted. */
6916 if (!fdset_contains(fds
, netns_fdpair
[0]))
6917 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
6918 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair
[0]);
6919 netns_fdpair
[0] = fdset_remove(fds
, netns_fdpair
[0]);
6925 v
= startswith(p
, "netns-socket-1=");
6929 n
= strcspn(v
, " ");
6930 buf
= strndupa_safe(v
, n
);
6932 r
= safe_atoi(buf
, &netns_fdpair
[1]);
6934 return log_debug_errno(r
, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf
);
6935 if (!fdset_contains(fds
, netns_fdpair
[1]))
6936 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
6937 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair
[1]);
6938 netns_fdpair
[1] = fdset_remove(fds
, netns_fdpair
[1]);
6944 v
= startswith(p
, "ipcns-socket-0=");
6948 n
= strcspn(v
, " ");
6949 buf
= strndupa_safe(v
, n
);
6951 r
= safe_atoi(buf
, &ipcns_fdpair
[0]);
6953 return log_debug_errno(r
, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf
);
6954 if (!fdset_contains(fds
, ipcns_fdpair
[0]))
6955 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
6956 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair
[0]);
6957 ipcns_fdpair
[0] = fdset_remove(fds
, ipcns_fdpair
[0]);
6963 v
= startswith(p
, "ipcns-socket-1=");
6967 n
= strcspn(v
, " ");
6968 buf
= strndupa_safe(v
, n
);
6970 r
= safe_atoi(buf
, &ipcns_fdpair
[1]);
6972 return log_debug_errno(r
, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf
);
6973 if (!fdset_contains(fds
, ipcns_fdpair
[1]))
6974 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
6975 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair
[1]);
6976 ipcns_fdpair
[1] = fdset_remove(fds
, ipcns_fdpair
[1]);
/* The caller has no use for the new object, hence no return pointer is passed. */
6980 r
= exec_runtime_add(m
, id
, &tmp_dir
, &var_tmp_dir
, netns_fdpair
, ipcns_fdpair
, NULL
);
6982 return log_debug_errno(r
, "Failed to add exec-runtime: %m");
/* Walks all ExecRuntime objects registered with the manager and frees objects without destroying their
 * temporary directories (destroy == false). NOTE(review): the test that skips objects that are still
 * referenced is not visible in this excerpt. */
6986 void exec_runtime_vacuum(Manager
*m
) {
6991 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6993 HASHMAP_FOREACH(rt
, m
->exec_runtime_by_id
) {
6997 (void) exec_runtime_free(rt
, false);
/* Releases the resources held by 'p' — the environment and fd name string lists, the fd array and the
 * exec fd — and stores the value returned by each release helper back into the respective field. */
7001 void exec_params_clear(ExecParameters
*p
) {
7005 p
->environment
= strv_free(p
->environment
);
7006 p
->fd_names
= strv_free(p
->fd_names
);
/* Only the array itself is freed here; the fds stored in it are not closed by this function. */
7007 p
->fds
= mfree(p
->fds
);
7008 p
->exec_fd
= safe_close(p
->exec_fd
);
7011 ExecSetCredential
*exec_set_credential_free(ExecSetCredential
*sc
) {
7020 ExecLoadCredential
*exec_load_credential_free(ExecLoadCredential
*lc
) {
/* Releases the contents of an ExecDirectory: the path and the symlink list of every item, then the item
 * array itself. */
7029 void exec_directory_done(ExecDirectory
*d
) {
7033 for (size_t i
= 0; i
< d
->n_items
; i
++) {
7034 free(d
->items
[i
].path
);
7035 strv_free(d
->items
[i
].symlinks
);
7038 d
->items
= mfree(d
->items
);
/* Appends a new item to the directory item array *d, which currently holds *n entries, and increments *n.
 * The item stores its own copy of 'symlinks'; both temporaries are released automatically if the function
 * bails out early. NOTE(review): 'p' is presumably a copy of 'path' — the assignment is not visible in
 * this excerpt. */
7043 int exec_directory_add(ExecDirectoryItem
**d
, size_t *n
, const char *path
, char **symlinks
) {
7044 _cleanup_strv_free_
char **s
= NULL
;
7045 _cleanup_free_
char *p
= NULL
;
7056 s
= strv_copy(symlinks
);
/* Make room for one more element before writing it. */
7061 if (!GREEDY_REALLOC(*d
, *n
+ 1))
7064 (*d
)[(*n
) ++] = (ExecDirectoryItem
) {
/* Ownership of both allocations moves into the array element. */
7065 .path
= TAKE_PTR(p
),
7066 .symlinks
= TAKE_PTR(s
),
/* Hash operations for hashmaps keyed by strings whose values are ExecSetCredential resp.
 * ExecLoadCredential objects. The respective *_free() function is registered as value destructor;
 * presumably it runs whenever the hashmap drops an entry — confirm against the macro definition. */
7072 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops
, char, string_hash_func
, string_compare_func
, ExecSetCredential
, exec_set_credential_free
);
7073 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops
, char, string_hash_func
, string_compare_func
, ExecLoadCredential
, exec_load_credential_free
);
/* String names of the ExecInput values, indexed by enum value. */
7075 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
7076 [EXEC_INPUT_NULL
] = "null",
7077 [EXEC_INPUT_TTY
] = "tty",
7078 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
7079 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
7080 [EXEC_INPUT_SOCKET
] = "socket",
7081 [EXEC_INPUT_NAMED_FD
] = "fd",
7082 [EXEC_INPUT_DATA
] = "data",
7083 [EXEC_INPUT_FILE
] = "file",
/* Generates the string lookup helpers for the table above. */
7086 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
/* String names of the ExecOutput values, indexed by enum value. */
7088 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
7089 [EXEC_OUTPUT_INHERIT
] = "inherit",
7090 [EXEC_OUTPUT_NULL
] = "null",
7091 [EXEC_OUTPUT_TTY
] = "tty",
7092 [EXEC_OUTPUT_KMSG
] = "kmsg",
7093 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
7094 [EXEC_OUTPUT_JOURNAL
] = "journal",
7095 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
7096 [EXEC_OUTPUT_SOCKET
] = "socket",
7097 [EXEC_OUTPUT_NAMED_FD
] = "fd",
7098 [EXEC_OUTPUT_FILE
] = "file",
7099 [EXEC_OUTPUT_FILE_APPEND
] = "append",
7100 [EXEC_OUTPUT_FILE_TRUNCATE
] = "truncate",
/* Generates the string lookup helpers for the table above. */
7103 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
/* String names of the ExecUtmpMode values, indexed by enum value. */
7105 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
7106 [EXEC_UTMP_INIT
] = "init",
7107 [EXEC_UTMP_LOGIN
] = "login",
7108 [EXEC_UTMP_USER
] = "user",
/* Generates the string lookup helpers for the table above. */
7111 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
/* String names of the ExecPreserveMode values, indexed by enum value. */
7113 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
7114 [EXEC_PRESERVE_NO
] = "no",
7115 [EXEC_PRESERVE_YES
] = "yes",
7116 [EXEC_PRESERVE_RESTART
] = "restart",
/* Boolean-aware lookup variant; EXEC_PRESERVE_YES is presumably what a boolean "true" string maps to —
 * confirm against the macro definition. */
7119 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
7121 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
7122 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
7123 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
7124 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
7125 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
7126 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
7127 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
/* Generates the string lookup helpers for the table above. */
7130 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);
7132 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7133 static const char* const exec_directory_type_symlink_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
7134 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectorySymlink",
7135 [EXEC_DIRECTORY_STATE
] = "StateDirectorySymlink",
7136 [EXEC_DIRECTORY_CACHE
] = "CacheDirectorySymlink",
7137 [EXEC_DIRECTORY_LOGS
] = "LogsDirectorySymlink",
7138 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectorySymlink",
/* Generates the string lookup helpers for the table above. */
7141 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink
, ExecDirectoryType
);
7143 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7144 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7145 * directories, specifically .timer units with their timestamp touch file. */
7146 static const char* const exec_resource_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
7147 [EXEC_DIRECTORY_RUNTIME
] = "runtime",
7148 [EXEC_DIRECTORY_STATE
] = "state",
7149 [EXEC_DIRECTORY_CACHE
] = "cache",
7150 [EXEC_DIRECTORY_LOGS
] = "logs",
7151 [EXEC_DIRECTORY_CONFIGURATION
] = "configuration",
/* Generates the string lookup helpers for the table above; they operate on ExecDirectoryType values. */
7154 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type
, ExecDirectoryType
);
7156 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7157 * the service payload in. */
7158 static const char* const exec_directory_env_name_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
7159 [EXEC_DIRECTORY_RUNTIME
] = "RUNTIME_DIRECTORY",
7160 [EXEC_DIRECTORY_STATE
] = "STATE_DIRECTORY",
7161 [EXEC_DIRECTORY_CACHE
] = "CACHE_DIRECTORY",
7162 [EXEC_DIRECTORY_LOGS
] = "LOGS_DIRECTORY",
7163 [EXEC_DIRECTORY_CONFIGURATION
] = "CONFIGURATION_DIRECTORY",
/* Unlike the other tables, judging by the macro name only a file-local to-string helper is generated
 * here. */
7166 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name
, ExecDirectoryType
);
/* String names of the ExecKeyringMode values, indexed by enum value. */
7168 static const char* const exec_keyring_mode_table
[_EXEC_KEYRING_MODE_MAX
] = {
7169 [EXEC_KEYRING_INHERIT
] = "inherit",
7170 [EXEC_KEYRING_PRIVATE
] = "private",
7171 [EXEC_KEYRING_SHARED
] = "shared",
/* Generates the string lookup helpers for the table above. */
7174 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode
, ExecKeyringMode
);