1 /* SPDX-License-Identifier: LGPL-2.1+ */
10 #include <sys/capability.h>
11 #include <sys/eventfd.h>
13 #include <sys/personality.h>
14 #include <sys/prctl.h>
16 #include <sys/socket.h>
18 #include <sys/types.h>
24 #include <security/pam_appl.h>
28 #include <selinux/selinux.h>
36 #include <sys/apparmor.h>
39 #include "sd-messages.h"
42 #include "alloc-util.h"
44 #include "apparmor-util.h"
49 #include "capability-util.h"
50 #include "chown-recursive.h"
51 #include "cgroup-setup.h"
52 #include "cpu-set-util.h"
56 #include "errno-list.h"
58 #include "exit-status.h"
60 #include "format-util.h"
62 #include "glob-util.h"
69 #include "memory-util.h"
72 #include "namespace.h"
73 #include "parse-util.h"
74 #include "path-util.h"
75 #include "process-util.h"
76 #include "rlimit-util.h"
79 #include "seccomp-util.h"
81 #include "securebits-util.h"
82 #include "selinux-util.h"
83 #include "signal-util.h"
84 #include "smack-util.h"
85 #include "socket-util.h"
87 #include "stat-util.h"
88 #include "string-table.h"
89 #include "string-util.h"
91 #include "syslog-util.h"
92 #include "terminal-util.h"
93 #include "umask-util.h"
95 #include "user-util.h"
96 #include "utmp-wtmp.h"
98 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
99 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
101 #define SNDBUF_SIZE (8*1024*1024)
103 static int shift_fds(int fds
[], size_t n_fds
) {
104 int start
, restart_from
;
109 /* Modifies the fds array! (sorts it) */
119 for (i
= start
; i
< (int) n_fds
; i
++) {
122 /* Already at right index? */
126 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
133 /* Hmm, the fd we wanted isn't free? Then
134 * let's remember that and try again from here */
135 if (nfd
!= i
+3 && restart_from
< 0)
139 if (restart_from
< 0)
142 start
= restart_from
;
148 static int flags_fds(const int fds
[], size_t n_socket_fds
, size_t n_storage_fds
, bool nonblock
) {
152 n_fds
= n_socket_fds
+ n_storage_fds
;
158 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
159 * O_NONBLOCK only applies to socket activation though. */
161 for (i
= 0; i
< n_fds
; i
++) {
163 if (i
< n_socket_fds
) {
164 r
= fd_nonblock(fds
[i
], nonblock
);
169 /* We unconditionally drop FD_CLOEXEC from the fds,
170 * since after all we want to pass these fds to our
173 r
= fd_cloexec(fds
[i
], false);
181 static const char *exec_context_tty_path(const ExecContext
*context
) {
184 if (context
->stdio_as_fds
)
187 if (context
->tty_path
)
188 return context
->tty_path
;
190 return "/dev/console";
193 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
198 path
= exec_context_tty_path(context
);
200 if (context
->tty_vhangup
) {
201 if (p
&& p
->stdin_fd
>= 0)
202 (void) terminal_vhangup_fd(p
->stdin_fd
);
204 (void) terminal_vhangup(path
);
207 if (context
->tty_reset
) {
208 if (p
&& p
->stdin_fd
>= 0)
209 (void) reset_terminal_fd(p
->stdin_fd
, true);
211 (void) reset_terminal(path
);
214 if (context
->tty_vt_disallocate
&& path
)
215 (void) vt_disallocate(path
);
218 static bool is_terminal_input(ExecInput i
) {
221 EXEC_INPUT_TTY_FORCE
,
222 EXEC_INPUT_TTY_FAIL
);
225 static bool is_terminal_output(ExecOutput o
) {
228 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
229 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
230 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
233 static bool is_syslog_output(ExecOutput o
) {
236 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
);
239 static bool is_kmsg_output(ExecOutput o
) {
242 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
245 static bool exec_context_needs_term(const ExecContext
*c
) {
248 /* Return true if the execution context suggests we should set $TERM to something useful. */
250 if (is_terminal_input(c
->std_input
))
253 if (is_terminal_output(c
->std_output
))
256 if (is_terminal_output(c
->std_error
))
259 return !!c
->tty_path
;
262 static int open_null_as(int flags
, int nfd
) {
267 fd
= open("/dev/null", flags
|O_NOCTTY
);
271 return move_fd(fd
, nfd
, false);
274 static int connect_journal_socket(int fd
, uid_t uid
, gid_t gid
) {
275 static const union sockaddr_union sa
= {
276 .un
.sun_family
= AF_UNIX
,
277 .un
.sun_path
= "/run/systemd/journal/stdout",
279 uid_t olduid
= UID_INVALID
;
280 gid_t oldgid
= GID_INVALID
;
283 if (gid_is_valid(gid
)) {
286 if (setegid(gid
) < 0)
290 if (uid_is_valid(uid
)) {
293 if (seteuid(uid
) < 0) {
299 r
= connect(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
)) < 0 ? -errno
: 0;
301 /* If we fail to restore the uid or gid, things will likely
302 fail later on. This should only happen if an LSM interferes. */
304 if (uid_is_valid(uid
))
305 (void) seteuid(olduid
);
308 if (gid_is_valid(gid
))
309 (void) setegid(oldgid
);
314 static int connect_logger_as(
316 const ExecContext
*context
,
317 const ExecParameters
*params
,
324 _cleanup_close_
int fd
= -1;
329 assert(output
< _EXEC_OUTPUT_MAX
);
333 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
337 r
= connect_journal_socket(fd
, uid
, gid
);
341 if (shutdown(fd
, SHUT_RD
) < 0)
344 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
354 context
->syslog_identifier
?: ident
,
355 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
356 context
->syslog_priority
,
357 !!context
->syslog_level_prefix
,
358 is_syslog_output(output
),
359 is_kmsg_output(output
),
360 is_terminal_output(output
)) < 0)
363 return move_fd(TAKE_FD(fd
), nfd
, false);
366 static int open_terminal_as(const char *path
, int flags
, int nfd
) {
372 fd
= open_terminal(path
, flags
| O_NOCTTY
);
376 return move_fd(fd
, nfd
, false);
379 static int acquire_path(const char *path
, int flags
, mode_t mode
) {
380 union sockaddr_union sa
= {};
381 _cleanup_close_
int fd
= -1;
386 if (IN_SET(flags
& O_ACCMODE
, O_WRONLY
, O_RDWR
))
389 fd
= open(path
, flags
|O_NOCTTY
, mode
);
393 if (errno
!= ENXIO
) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
395 if (strlen(path
) >= sizeof(sa
.un
.sun_path
)) /* Too long, can't be a UNIX socket */
398 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
400 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
404 salen
= sockaddr_un_set_path(&sa
.un
, path
);
408 if (connect(fd
, &sa
.sa
, salen
) < 0)
409 return errno
== EINVAL
? -ENXIO
: -errno
; /* Propagate initial error if we get EINVAL, i.e. we have
410 * indication that this wasn't an AF_UNIX socket after all */
412 if ((flags
& O_ACCMODE
) == O_RDONLY
)
413 r
= shutdown(fd
, SHUT_WR
);
414 else if ((flags
& O_ACCMODE
) == O_WRONLY
)
415 r
= shutdown(fd
, SHUT_RD
);
424 static int fixup_input(
425 const ExecContext
*context
,
427 bool apply_tty_stdin
) {
433 std_input
= context
->std_input
;
435 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
436 return EXEC_INPUT_NULL
;
438 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
439 return EXEC_INPUT_NULL
;
441 if (std_input
== EXEC_INPUT_DATA
&& context
->stdin_data_size
== 0)
442 return EXEC_INPUT_NULL
;
447 static int fixup_output(ExecOutput std_output
, int socket_fd
) {
449 if (std_output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
450 return EXEC_OUTPUT_INHERIT
;
455 static int setup_input(
456 const ExecContext
*context
,
457 const ExecParameters
*params
,
459 const int named_iofds
[static 3]) {
467 if (params
->stdin_fd
>= 0) {
468 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
471 /* Try to make this the controlling tty, if it is a tty, and reset it */
472 if (isatty(STDIN_FILENO
)) {
473 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
474 (void) reset_terminal_fd(STDIN_FILENO
, true);
480 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
484 case EXEC_INPUT_NULL
:
485 return open_null_as(O_RDONLY
, STDIN_FILENO
);
488 case EXEC_INPUT_TTY_FORCE
:
489 case EXEC_INPUT_TTY_FAIL
: {
492 fd
= acquire_terminal(exec_context_tty_path(context
),
493 i
== EXEC_INPUT_TTY_FAIL
? ACQUIRE_TERMINAL_TRY
:
494 i
== EXEC_INPUT_TTY_FORCE
? ACQUIRE_TERMINAL_FORCE
:
495 ACQUIRE_TERMINAL_WAIT
,
500 return move_fd(fd
, STDIN_FILENO
, false);
503 case EXEC_INPUT_SOCKET
:
504 assert(socket_fd
>= 0);
506 return dup2(socket_fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
508 case EXEC_INPUT_NAMED_FD
:
509 assert(named_iofds
[STDIN_FILENO
] >= 0);
511 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
512 return dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
514 case EXEC_INPUT_DATA
: {
517 fd
= acquire_data_fd(context
->stdin_data
, context
->stdin_data_size
, 0);
521 return move_fd(fd
, STDIN_FILENO
, false);
524 case EXEC_INPUT_FILE
: {
528 assert(context
->stdio_file
[STDIN_FILENO
]);
530 rw
= (context
->std_output
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDOUT_FILENO
])) ||
531 (context
->std_error
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDERR_FILENO
]));
533 fd
= acquire_path(context
->stdio_file
[STDIN_FILENO
], rw
? O_RDWR
: O_RDONLY
, 0666 & ~context
->umask
);
537 return move_fd(fd
, STDIN_FILENO
, false);
541 assert_not_reached("Unknown input type");
545 static bool can_inherit_stderr_from_stdout(
546 const ExecContext
*context
,
552 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
555 if (e
== EXEC_OUTPUT_INHERIT
)
560 if (e
== EXEC_OUTPUT_NAMED_FD
)
561 return streq_ptr(context
->stdio_fdname
[STDOUT_FILENO
], context
->stdio_fdname
[STDERR_FILENO
]);
563 if (IN_SET(e
, EXEC_OUTPUT_FILE
, EXEC_OUTPUT_FILE_APPEND
))
564 return streq_ptr(context
->stdio_file
[STDOUT_FILENO
], context
->stdio_file
[STDERR_FILENO
]);
569 static int setup_output(
571 const ExecContext
*context
,
572 const ExecParameters
*params
,
575 const int named_iofds
[static 3],
579 dev_t
*journal_stream_dev
,
580 ino_t
*journal_stream_ino
) {
590 assert(journal_stream_dev
);
591 assert(journal_stream_ino
);
593 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
595 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
598 return STDOUT_FILENO
;
601 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
602 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
605 return STDERR_FILENO
;
608 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
609 o
= fixup_output(context
->std_output
, socket_fd
);
611 if (fileno
== STDERR_FILENO
) {
613 e
= fixup_output(context
->std_error
, socket_fd
);
615 /* This expects the input and output are already set up */
617 /* Don't change the stderr file descriptor if we inherit all
618 * the way and are not on a tty */
619 if (e
== EXEC_OUTPUT_INHERIT
&&
620 o
== EXEC_OUTPUT_INHERIT
&&
621 i
== EXEC_INPUT_NULL
&&
622 !is_terminal_input(context
->std_input
) &&
626 /* Duplicate from stdout if possible */
627 if (can_inherit_stderr_from_stdout(context
, o
, e
))
628 return dup2(STDOUT_FILENO
, fileno
) < 0 ? -errno
: fileno
;
632 } else if (o
== EXEC_OUTPUT_INHERIT
) {
633 /* If input got downgraded, inherit the original value */
634 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
635 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
637 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
638 if (!IN_SET(i
, EXEC_INPUT_NULL
, EXEC_INPUT_DATA
))
639 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
641 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
645 /* We need to open /dev/null here anew, to get the right access mode. */
646 return open_null_as(O_WRONLY
, fileno
);
651 case EXEC_OUTPUT_NULL
:
652 return open_null_as(O_WRONLY
, fileno
);
654 case EXEC_OUTPUT_TTY
:
655 if (is_terminal_input(i
))
656 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
658 /* We don't reset the terminal if this is just about output */
659 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
661 case EXEC_OUTPUT_SYSLOG
:
662 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE
:
663 case EXEC_OUTPUT_KMSG
:
664 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
665 case EXEC_OUTPUT_JOURNAL
:
666 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
667 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
669 log_unit_warning_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m", fileno
== STDOUT_FILENO
? "stdout" : "stderr");
670 r
= open_null_as(O_WRONLY
, fileno
);
674 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
675 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
676 * services to detect whether they are connected to the journal or not.
678 * If both stdout and stderr are connected to a stream then let's make sure to store the data
679 * about STDERR as that's usually the best way to do logging. */
681 if (fstat(fileno
, &st
) >= 0 &&
682 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
683 *journal_stream_dev
= st
.st_dev
;
684 *journal_stream_ino
= st
.st_ino
;
689 case EXEC_OUTPUT_SOCKET
:
690 assert(socket_fd
>= 0);
692 return dup2(socket_fd
, fileno
) < 0 ? -errno
: fileno
;
694 case EXEC_OUTPUT_NAMED_FD
:
695 assert(named_iofds
[fileno
] >= 0);
697 (void) fd_nonblock(named_iofds
[fileno
], false);
698 return dup2(named_iofds
[fileno
], fileno
) < 0 ? -errno
: fileno
;
700 case EXEC_OUTPUT_FILE
:
701 case EXEC_OUTPUT_FILE_APPEND
: {
705 assert(context
->stdio_file
[fileno
]);
707 rw
= context
->std_input
== EXEC_INPUT_FILE
&&
708 streq_ptr(context
->stdio_file
[fileno
], context
->stdio_file
[STDIN_FILENO
]);
711 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
714 if (o
== EXEC_OUTPUT_FILE_APPEND
)
717 fd
= acquire_path(context
->stdio_file
[fileno
], flags
, 0666 & ~context
->umask
);
721 return move_fd(fd
, fileno
, 0);
725 assert_not_reached("Unknown error type");
729 static int chown_terminal(int fd
, uid_t uid
) {
734 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
735 if (isatty(fd
) < 1) {
736 if (IN_SET(errno
, EINVAL
, ENOTTY
))
737 return 0; /* not a tty */
742 /* This might fail. What matters are the results. */
743 r
= fchmod_and_chown(fd
, TTY_MODE
, uid
, -1);
750 static int setup_confirm_stdio(const char *vc
, int *_saved_stdin
, int *_saved_stdout
) {
751 _cleanup_close_
int fd
= -1, saved_stdin
= -1, saved_stdout
= -1;
754 assert(_saved_stdin
);
755 assert(_saved_stdout
);
757 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
761 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
762 if (saved_stdout
< 0)
765 fd
= acquire_terminal(vc
, ACQUIRE_TERMINAL_WAIT
, DEFAULT_CONFIRM_USEC
);
769 r
= chown_terminal(fd
, getuid());
773 r
= reset_terminal_fd(fd
, true);
777 r
= rearrange_stdio(fd
, fd
, STDERR_FILENO
);
782 *_saved_stdin
= saved_stdin
;
783 *_saved_stdout
= saved_stdout
;
785 saved_stdin
= saved_stdout
= -1;
790 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
793 if (err
== -ETIMEDOUT
)
794 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
797 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
801 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
802 _cleanup_close_
int fd
= -1;
806 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
810 write_confirm_error_fd(err
, fd
, u
);
813 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
817 assert(saved_stdout
);
821 if (*saved_stdin
>= 0)
822 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
825 if (*saved_stdout
>= 0)
826 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
829 *saved_stdin
= safe_close(*saved_stdin
);
830 *saved_stdout
= safe_close(*saved_stdout
);
836 CONFIRM_PRETEND_FAILURE
= -1,
837 CONFIRM_PRETEND_SUCCESS
= 0,
841 static int ask_for_confirmation(const char *vc
, Unit
*u
, const char *cmdline
) {
842 int saved_stdout
= -1, saved_stdin
= -1, r
;
843 _cleanup_free_
char *e
= NULL
;
846 /* For any internal errors, assume a positive response. */
847 r
= setup_confirm_stdio(vc
, &saved_stdin
, &saved_stdout
);
849 write_confirm_error(r
, vc
, u
);
850 return CONFIRM_EXECUTE
;
853 /* confirm_spawn might have been disabled while we were sleeping. */
854 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
859 e
= ellipsize(cmdline
, 60, 100);
867 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
869 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
876 printf("Resuming normal execution.\n");
877 manager_disable_confirm_spawn();
881 unit_dump(u
, stdout
, " ");
882 continue; /* ask again */
884 printf("Failing execution.\n");
885 r
= CONFIRM_PRETEND_FAILURE
;
888 printf(" c - continue, proceed without asking anymore\n"
889 " D - dump, show the state of the unit\n"
890 " f - fail, don't execute the command and pretend it failed\n"
892 " i - info, show a short summary of the unit\n"
893 " j - jobs, show jobs that are in progress\n"
894 " s - skip, don't execute the command and pretend it succeeded\n"
895 " y - yes, execute the command\n");
896 continue; /* ask again */
898 printf(" Description: %s\n"
901 u
->id
, u
->description
, cmdline
);
902 continue; /* ask again */
904 manager_dump_jobs(u
->manager
, stdout
, " ");
905 continue; /* ask again */
907 /* 'n' was removed in favor of 'f'. */
908 printf("Didn't understand 'n', did you mean 'f'?\n");
909 continue; /* ask again */
911 printf("Skipping execution.\n");
912 r
= CONFIRM_PRETEND_SUCCESS
;
918 assert_not_reached("Unhandled choice");
924 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
928 static int get_fixed_user(const ExecContext
*c
, const char **user
,
929 uid_t
*uid
, gid_t
*gid
,
930 const char **home
, const char **shell
) {
939 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
940 * (i.e. are "/" or "/bin/nologin"). */
943 r
= get_user_creds(&name
, uid
, gid
, home
, shell
, USER_CREDS_CLEAN
);
951 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
961 r
= get_group_creds(&name
, gid
, 0);
969 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
970 const char *group
, gid_t gid
,
971 gid_t
**supplementary_gids
, int *ngids
) {
975 bool keep_groups
= false;
976 gid_t
*groups
= NULL
;
977 _cleanup_free_ gid_t
*l_gids
= NULL
;
982 * If user is given, then lookup GID and supplementary groups list.
983 * We avoid NSS lookups for gid=0. Also we have to initialize groups
984 * here and as early as possible so we keep the list of supplementary
985 * groups of the caller.
987 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
988 /* First step, initialize groups from /etc/group */
989 if (initgroups(user
, gid
) < 0)
995 if (strv_isempty(c
->supplementary_groups
))
999 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1000 * be positive, otherwise fail.
1003 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
1004 if (ngroups_max
<= 0)
1005 return errno_or_else(EOPNOTSUPP
);
1007 l_gids
= new(gid_t
, ngroups_max
);
1013 * Lookup the list of groups that the user belongs to, we
1014 * avoid NSS lookups here too for gid=0.
1017 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
1022 STRV_FOREACH(i
, c
->supplementary_groups
) {
1025 if (k
>= ngroups_max
)
1029 r
= get_group_creds(&g
, l_gids
+k
, 0);
1037 * Sets ngids to zero to drop all supplementary groups, happens
1038 * when we are under root and SupplementaryGroups= is empty.
1045 /* Otherwise get the final list of supplementary groups */
1046 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
1050 *supplementary_gids
= groups
;
1058 static int enforce_groups(gid_t gid
, const gid_t
*supplementary_gids
, int ngids
) {
1061 /* Handle SupplementaryGroups= if it is not empty */
1063 r
= maybe_setgroups(ngids
, supplementary_gids
);
1068 if (gid_is_valid(gid
)) {
1069 /* Then set our gids */
1070 if (setresgid(gid
, gid
, gid
) < 0)
1077 static int enforce_user(const ExecContext
*context
, uid_t uid
) {
1080 if (!uid_is_valid(uid
))
1083 /* Sets (but doesn't look up) the uid and makes sure we keep the
1084 * capabilities while doing so. */
1086 if (context
->capability_ambient_set
!= 0) {
1088 /* First step: If we need to keep capabilities but
1089 * drop privileges we need to make sure we keep our
1090 * caps, while we drop privileges. */
1092 int sb
= context
->secure_bits
| 1<<SECURE_KEEP_CAPS
;
1094 if (prctl(PR_GET_SECUREBITS
) != sb
)
1095 if (prctl(PR_SET_SECUREBITS
, sb
) < 0)
1100 /* Second step: actually set the uids */
1101 if (setresuid(uid
, uid
, uid
) < 0)
1104 /* At this point we should have all necessary capabilities but
1105 are otherwise a normal user. However, the caps might have got
1106 corrupted due to the setresuid() so we need to clean them up
1107 later. This is done outside of this call. */
1114 static int null_conv(
1116 const struct pam_message
**msg
,
1117 struct pam_response
**resp
,
1118 void *appdata_ptr
) {
1120 /* We don't support conversations */
1122 return PAM_CONV_ERR
;
1127 static int setup_pam(
1134 int fds
[], size_t n_fds
) {
1138 static const struct pam_conv conv
= {
1143 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1144 pam_handle_t
*handle
= NULL
;
1146 int pam_code
= PAM_SUCCESS
, r
;
1147 char **nv
, **e
= NULL
;
1148 bool close_session
= false;
1149 pid_t pam_pid
= 0, parent_pid
;
1156 /* We set up PAM in the parent process, then fork. The child
1157 * will then stay around until killed via PR_SET_PDEATHSIG or
1158 * systemd via the cgroup logic. It will then remove the PAM
1159 * session again. The parent process will exec() the actual
1160 * daemon. We do things this way to ensure that the main PID
1161 * of the daemon is the one we initially fork()ed. */
1163 r
= barrier_create(&barrier
);
1167 if (log_get_max_level() < LOG_DEBUG
)
1168 flags
|= PAM_SILENT
;
1170 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1171 if (pam_code
!= PAM_SUCCESS
) {
1177 _cleanup_free_
char *q
= NULL
;
1179 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1180 * out if that's the case, and read the TTY off it. */
1182 if (getttyname_malloc(STDIN_FILENO
, &q
) >= 0)
1183 tty
= strjoina("/dev/", q
);
1187 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1188 if (pam_code
!= PAM_SUCCESS
)
1192 STRV_FOREACH(nv
, *env
) {
1193 pam_code
= pam_putenv(handle
, *nv
);
1194 if (pam_code
!= PAM_SUCCESS
)
1198 pam_code
= pam_acct_mgmt(handle
, flags
);
1199 if (pam_code
!= PAM_SUCCESS
)
1202 pam_code
= pam_open_session(handle
, flags
);
1203 if (pam_code
!= PAM_SUCCESS
)
1206 close_session
= true;
1208 e
= pam_getenvlist(handle
);
1210 pam_code
= PAM_BUF_ERR
;
1214 /* Block SIGTERM, so that we know that it won't get lost in
1217 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1219 parent_pid
= getpid_cached();
1221 r
= safe_fork("(sd-pam)", 0, &pam_pid
);
1225 int sig
, ret
= EXIT_PAM
;
1227 /* The child's job is to reset the PAM session on
1229 barrier_set_role(&barrier
, BARRIER_CHILD
);
1231 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
1232 * are open here that have been opened by PAM. */
1233 (void) close_many(fds
, n_fds
);
1235 /* Drop privileges - we don't need any to pam_close_session
1236 * and this will make PR_SET_PDEATHSIG work in most cases.
1237 * If this fails, ignore the error - but expect sd-pam threads
1238 * to fail to exit normally */
1240 r
= maybe_setgroups(0, NULL
);
1242 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1243 if (setresgid(gid
, gid
, gid
) < 0)
1244 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1245 if (setresuid(uid
, uid
, uid
) < 0)
1246 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1248 (void) ignore_signals(SIGPIPE
, -1);
1250 /* Wait until our parent died. This will only work if
1251 * the above setresuid() succeeds, otherwise the kernel
1252 * will not allow unprivileged parents kill their privileged
1253 * children this way. We rely on the control groups kill logic
1254 * to do the rest for us. */
1255 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1258 /* Tell the parent that our setup is done. This is especially
1259 * important regarding dropping privileges. Otherwise, unit
1260 * setup might race against our setresuid(2) call.
1262 * If the parent aborted, we'll detect this below, hence ignore
1263 * return failure here. */
1264 (void) barrier_place(&barrier
);
1266 /* Check if our parent process might already have died? */
1267 if (getppid() == parent_pid
) {
1270 assert_se(sigemptyset(&ss
) >= 0);
1271 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1274 if (sigwait(&ss
, &sig
) < 0) {
1281 assert(sig
== SIGTERM
);
1286 /* If our parent died we'll end the session */
1287 if (getppid() != parent_pid
) {
1288 pam_code
= pam_close_session(handle
, flags
);
1289 if (pam_code
!= PAM_SUCCESS
)
1296 pam_end(handle
, pam_code
| flags
);
1300 barrier_set_role(&barrier
, BARRIER_PARENT
);
1302 /* If the child was forked off successfully it will do all the
1303 * cleanups, so forget about the handle here. */
1306 /* Unblock SIGTERM again in the parent */
1307 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1309 /* We close the log explicitly here, since the PAM modules
1310 * might have opened it, but we don't want this fd around. */
1313 /* Synchronously wait for the child to initialize. We don't care for
1314 * errors as we cannot recover. However, warn loudly if it happens. */
1315 if (!barrier_place_and_sync(&barrier
))
1316 log_error("PAM initialization failed");
1318 return strv_free_and_replace(*env
, e
);
1321 if (pam_code
!= PAM_SUCCESS
) {
1322 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1323 r
= -EPERM
; /* PAM errors do not map to errno */
1325 log_error_errno(r
, "PAM failed: %m");
1329 pam_code
= pam_close_session(handle
, flags
);
1331 pam_end(handle
, pam_code
| flags
);
1343 static void rename_process_from_path(const char *path
) {
1344 char process_name
[11];
1348 /* This resulting string must fit in 10 chars (i.e. the length
1349 * of "/sbin/init") to look pretty in /bin/ps */
1353 rename_process("(...)");
1359 /* The end of the process name is usually more
1360 * interesting, since the first bit might just be
1366 process_name
[0] = '(';
1367 memcpy(process_name
+1, p
, l
);
1368 process_name
[1+l
] = ')';
1369 process_name
[1+l
+1] = 0;
1371 rename_process(process_name
);
1374 static bool context_has_address_families(const ExecContext
*c
) {
1377 return c
->address_families_whitelist
||
1378 !set_isempty(c
->address_families
);
1381 static bool context_has_syscall_filters(const ExecContext
*c
) {
1384 return c
->syscall_whitelist
||
1385 !hashmap_isempty(c
->syscall_filter
);
1388 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1391 if (c
->no_new_privileges
)
1394 if (have_effective_cap(CAP_SYS_ADMIN
)) /* if we are privileged, we don't need NNP */
1397 /* We need NNP if we have any form of seccomp and are unprivileged */
1398 return context_has_address_families(c
) ||
1399 c
->memory_deny_write_execute
||
1400 c
->restrict_realtime
||
1401 c
->restrict_suid_sgid
||
1402 exec_context_restrict_namespaces_set(c
) ||
1403 c
->protect_kernel_tunables
||
1404 c
->protect_kernel_modules
||
1405 c
->private_devices
||
1406 context_has_syscall_filters(c
) ||
1407 !set_isempty(c
->syscall_archs
) ||
1408 c
->lock_personality
||
1409 c
->protect_hostname
;
1414 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1416 if (is_seccomp_available())
1419 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1423 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1424 uint32_t negative_action
, default_action
, action
;
1430 if (!context_has_syscall_filters(c
))
1433 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1436 negative_action
= c
->syscall_errno
== 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c
->syscall_errno
);
1438 if (c
->syscall_whitelist
) {
1439 default_action
= negative_action
;
1440 action
= SCMP_ACT_ALLOW
;
1442 default_action
= SCMP_ACT_ALLOW
;
1443 action
= negative_action
;
1446 if (needs_ambient_hack
) {
1447 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_whitelist
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1452 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
, false);
1455 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1459 if (set_isempty(c
->syscall_archs
))
1462 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1465 return seccomp_restrict_archs(c
->syscall_archs
);
1468 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1472 if (!context_has_address_families(c
))
1475 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1478 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_whitelist
);
1481 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1485 if (!c
->memory_deny_write_execute
)
1488 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1491 return seccomp_memory_deny_write_execute();
1494 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1498 if (!c
->restrict_realtime
)
1501 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1504 return seccomp_restrict_realtime();
1507 static int apply_restrict_suid_sgid(const Unit
* u
, const ExecContext
*c
) {
1511 if (!c
->restrict_suid_sgid
)
1514 if (skip_seccomp_unavailable(u
, "RestrictSUIDSGID="))
1517 return seccomp_restrict_suid_sgid();
1520 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1524 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1525 * let's protect even those systems where this is left on in the kernel. */
1527 if (!c
->protect_kernel_tunables
)
1530 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1533 return seccomp_protect_sysctl();
1536 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1540 /* Turn off module syscalls on ProtectKernelModules=yes */
1542 if (!c
->protect_kernel_modules
)
1545 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1548 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
), false);
1551 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1555 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1557 if (!c
->private_devices
)
1560 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1563 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
), false);
1566 static int apply_restrict_namespaces(const Unit
*u
, const ExecContext
*c
) {
1570 if (!exec_context_restrict_namespaces_set(c
))
1573 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1576 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
1579 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1580 unsigned long personality
;
1586 if (!c
->lock_personality
)
1589 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1592 personality
= c
->personality
;
1594 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1595 if (personality
== PERSONALITY_INVALID
) {
1597 r
= opinionated_personality(&personality
);
1602 return seccomp_lock_personality(personality
);
1607 static void do_idle_pipe_dance(int idle_pipe
[static 4]) {
1610 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1611 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1613 if (idle_pipe
[0] >= 0) {
1616 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1618 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1621 /* Signal systemd that we are bored and want to continue. */
1622 n
= write(idle_pipe
[3], "x", 1);
1624 /* Wait for systemd to react to the signal above. */
1625 (void) fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1628 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1632 idle_pipe
[3] = safe_close(idle_pipe
[3]);
1635 static const char *exec_directory_env_name_to_string(ExecDirectoryType t
);
1637 static int build_environment(
1639 const ExecContext
*c
,
1640 const ExecParameters
*p
,
1643 const char *username
,
1645 dev_t journal_stream_dev
,
1646 ino_t journal_stream_ino
,
1649 _cleanup_strv_free_
char **our_env
= NULL
;
1650 ExecDirectoryType t
;
1659 our_env
= new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX
);
1664 _cleanup_free_
char *joined
= NULL
;
1666 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1668 our_env
[n_env
++] = x
;
1670 if (asprintf(&x
, "LISTEN_FDS=%zu", n_fds
) < 0)
1672 our_env
[n_env
++] = x
;
1674 joined
= strv_join(p
->fd_names
, ":");
1678 x
= strjoin("LISTEN_FDNAMES=", joined
);
1681 our_env
[n_env
++] = x
;
1684 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1685 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1687 our_env
[n_env
++] = x
;
1689 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1691 our_env
[n_env
++] = x
;
1694 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus to look up dynamic
1695 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1696 * check the database directly. */
1697 if (p
->flags
& EXEC_NSS_BYPASS_BUS
) {
1698 x
= strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1701 our_env
[n_env
++] = x
;
1705 x
= strjoin("HOME=", home
);
1709 path_simplify(x
+ 5, true);
1710 our_env
[n_env
++] = x
;
1714 x
= strjoin("LOGNAME=", username
);
1717 our_env
[n_env
++] = x
;
1719 x
= strjoin("USER=", username
);
1722 our_env
[n_env
++] = x
;
1726 x
= strjoin("SHELL=", shell
);
1730 path_simplify(x
+ 6, true);
1731 our_env
[n_env
++] = x
;
1734 if (!sd_id128_is_null(u
->invocation_id
)) {
1735 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1738 our_env
[n_env
++] = x
;
1741 if (exec_context_needs_term(c
)) {
1742 const char *tty_path
, *term
= NULL
;
1744 tty_path
= exec_context_tty_path(c
);
1746 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1747 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1748 * passes to PID 1 ends up all the way in the console login shown. */
1750 if (path_equal(tty_path
, "/dev/console") && getppid() == 1)
1751 term
= getenv("TERM");
1753 term
= default_term_for_tty(tty_path
);
1755 x
= strjoin("TERM=", term
);
1758 our_env
[n_env
++] = x
;
1761 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
1762 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
1765 our_env
[n_env
++] = x
;
1768 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
1769 _cleanup_free_
char *pre
= NULL
, *joined
= NULL
;
1775 if (strv_isempty(c
->directories
[t
].paths
))
1778 n
= exec_directory_env_name_to_string(t
);
1782 pre
= strjoin(p
->prefix
[t
], "/");
1786 joined
= strv_join_prefix(c
->directories
[t
].paths
, ":", pre
);
1790 x
= strjoin(n
, "=", joined
);
1794 our_env
[n_env
++] = x
;
1797 our_env
[n_env
++] = NULL
;
1798 assert(n_env
<= 14 + _EXEC_DIRECTORY_TYPE_MAX
);
1800 *ret
= TAKE_PTR(our_env
);
1805 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
1806 _cleanup_strv_free_
char **pass_env
= NULL
;
1807 size_t n_env
= 0, n_bufsize
= 0;
1810 STRV_FOREACH(i
, c
->pass_environment
) {
1811 _cleanup_free_
char *x
= NULL
;
1817 x
= strjoin(*i
, "=", v
);
1821 if (!GREEDY_REALLOC(pass_env
, n_bufsize
, n_env
+ 2))
1824 pass_env
[n_env
++] = TAKE_PTR(x
);
1825 pass_env
[n_env
] = NULL
;
1828 *ret
= TAKE_PTR(pass_env
);
1833 static bool exec_needs_mount_namespace(
1834 const ExecContext
*context
,
1835 const ExecParameters
*params
,
1836 const ExecRuntime
*runtime
) {
1841 if (context
->root_image
)
1844 if (!strv_isempty(context
->read_write_paths
) ||
1845 !strv_isempty(context
->read_only_paths
) ||
1846 !strv_isempty(context
->inaccessible_paths
))
1849 if (context
->n_bind_mounts
> 0)
1852 if (context
->n_temporary_filesystems
> 0)
1855 if (!IN_SET(context
->mount_flags
, 0, MS_SHARED
))
1858 if (context
->private_tmp
&& runtime
&& (runtime
->tmp_dir
|| runtime
->var_tmp_dir
))
1861 if (context
->private_devices
||
1862 context
->private_mounts
||
1863 context
->protect_system
!= PROTECT_SYSTEM_NO
||
1864 context
->protect_home
!= PROTECT_HOME_NO
||
1865 context
->protect_kernel_tunables
||
1866 context
->protect_kernel_modules
||
1867 context
->protect_control_groups
)
1870 if (context
->root_directory
) {
1871 ExecDirectoryType t
;
1873 if (context
->mount_apivfs
)
1876 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
1877 if (!params
->prefix
[t
])
1880 if (!strv_isempty(context
->directories
[t
].paths
))
1885 if (context
->dynamic_user
&&
1886 (!strv_isempty(context
->directories
[EXEC_DIRECTORY_STATE
].paths
) ||
1887 !strv_isempty(context
->directories
[EXEC_DIRECTORY_CACHE
].paths
) ||
1888 !strv_isempty(context
->directories
[EXEC_DIRECTORY_LOGS
].paths
)))
1894 static int setup_private_users(uid_t uid
, gid_t gid
) {
1895 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
1896 _cleanup_close_pair_
int errno_pipe
[2] = { -1, -1 };
1897 _cleanup_close_
int unshare_ready_fd
= -1;
1898 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
1903 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1904 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1905 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1906 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1907 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1908 * continues execution normally. */
1910 if (uid
!= 0 && uid_is_valid(uid
)) {
1911 r
= asprintf(&uid_map
,
1912 "0 0 1\n" /* Map root → root */
1913 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
1918 uid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1923 if (gid
!= 0 && gid_is_valid(gid
)) {
1924 r
= asprintf(&gid_map
,
1925 "0 0 1\n" /* Map root → root */
1926 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
1931 gid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1936 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1938 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
1939 if (unshare_ready_fd
< 0)
1942 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1944 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
1947 r
= safe_fork("(sd-userns)", FORK_RESET_SIGNALS
|FORK_DEATHSIG
, &pid
);
1951 _cleanup_close_
int fd
= -1;
1955 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1956 * here, after the parent opened its own user namespace. */
1959 errno_pipe
[0] = safe_close(errno_pipe
[0]);
1961 /* Wait until the parent unshared the user namespace */
1962 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
1967 /* Disable the setgroups() system call in the child user namespace, for good. */
1968 a
= procfs_file_alloca(ppid
, "setgroups");
1969 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1971 if (errno
!= ENOENT
) {
1976 /* If the file is missing the kernel is too old, let's continue anyway. */
1978 if (write(fd
, "deny\n", 5) < 0) {
1983 fd
= safe_close(fd
);
1986 /* First write the GID map */
1987 a
= procfs_file_alloca(ppid
, "gid_map");
1988 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1993 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
1997 fd
= safe_close(fd
);
1999 /* Then write the UID map */
2000 a
= procfs_file_alloca(ppid
, "uid_map");
2001 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2006 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
2011 _exit(EXIT_SUCCESS
);
2014 (void) write(errno_pipe
[1], &r
, sizeof(r
));
2015 _exit(EXIT_FAILURE
);
2018 errno_pipe
[1] = safe_close(errno_pipe
[1]);
2020 if (unshare(CLONE_NEWUSER
) < 0)
2023 /* Let the child know that the namespace is ready now */
2024 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
2027 /* Try to read an error code from the child */
2028 n
= read(errno_pipe
[0], &r
, sizeof(r
));
2031 if (n
== sizeof(r
)) { /* an error code was sent to us */
2036 if (n
!= 0) /* on success we should have read 0 bytes */
2039 r
= wait_for_terminate_and_check("(sd-userns)", pid
, 0);
2043 if (r
!= EXIT_SUCCESS
) /* If something strange happened with the child, let's consider this fatal, too */
2049 static bool exec_directory_is_private(const ExecContext
*context
, ExecDirectoryType type
) {
2050 if (!context
->dynamic_user
)
2053 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
2056 if (type
== EXEC_DIRECTORY_RUNTIME
&& context
->runtime_directory_preserve_mode
== EXEC_PRESERVE_NO
)
2062 static int setup_exec_directory(
2063 const ExecContext
*context
,
2064 const ExecParameters
*params
,
2067 ExecDirectoryType type
,
2070 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
2071 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
2072 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
2073 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
2074 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
2075 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
2082 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
2083 assert(exit_status
);
2085 if (!params
->prefix
[type
])
2088 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
2089 if (!uid_is_valid(uid
))
2091 if (!gid_is_valid(gid
))
2095 STRV_FOREACH(rt
, context
->directories
[type
].paths
) {
2096 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
2098 p
= path_join(params
->prefix
[type
], *rt
);
2104 r
= mkdir_parents_label(p
, 0755);
2108 if (exec_directory_is_private(context
, type
)) {
2109 _cleanup_free_
char *private_root
= NULL
;
2111 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2112 * case we want to avoid leaving a directory around fully accessible that is owned by
2113 * a dynamic user whose UID is later on reused. To lock this down we use the same
2114 * trick used by container managers to prevent host users from getting access to files of
2115 * the same UID in containers: we place everything inside a directory that has an
2116 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2117 * for unprivileged host code. We then use fs namespacing to make this directory
2118 * permeable for the service itself.
2120 * Specifically: for a service which wants a special directory "foo/" we first create
2121 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2122 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2123 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2124 * unprivileged host users can't look into it. Inside of the namespace of the unit
2125 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2126 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2127 * for the service and making sure it only gets access to the dirs it needs but no
2128 * others. Tricky? Yes, absolutely, but it works!
2130 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2131 * to be owned by the service itself.
2133 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2134 * for sharing files or sockets with other services. */
2136 private_root
= path_join(params
->prefix
[type
], "private");
2137 if (!private_root
) {
2142 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2143 r
= mkdir_safe_label(private_root
, 0700, 0, 0, MKDIR_WARN_MODE
);
2147 pp
= path_join(private_root
, *rt
);
2153 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2154 r
= mkdir_parents_label(pp
, 0755);
2158 if (is_dir(p
, false) > 0 &&
2159 (laccess(pp
, F_OK
) < 0 && errno
== ENOENT
)) {
2161 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2162 * it over. Most likely the service has been upgraded from one that didn't use
2163 * DynamicUser=1, to one that does. */
2165 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2166 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2167 exec_directory_type_to_string(type
), p
, pp
);
2169 if (rename(p
, pp
) < 0) {
2174 /* Otherwise, create the actual directory for the service */
2176 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
2177 if (r
< 0 && r
!= -EEXIST
)
2181 /* And link it up from the original place */
2182 r
= symlink_idempotent(pp
, p
, true);
2187 _cleanup_free_
char *target
= NULL
;
2189 if (type
!= EXEC_DIRECTORY_CONFIGURATION
&&
2190 readlink_and_make_absolute(p
, &target
) >= 0) {
2191 _cleanup_free_
char *q
= NULL
;
2193 /* This already exists and is a symlink? Interesting. Maybe it's one created
2194 * by DynamicUser=1 (see above)?
2196 * We do this for all directory types except for ConfigurationDirectory=,
2197 * since they all support the private/ symlink logic at least in some
2198 * configurations, see above. */
2200 q
= path_join(params
->prefix
[type
], "private", *rt
);
2206 if (path_equal(q
, target
)) {
2208 /* Hmm, apparently DynamicUser= was once turned on for this service,
2209 * but is no longer. Let's move the directory back up. */
2211 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2212 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2213 exec_directory_type_to_string(type
), q
, p
);
2215 if (unlink(p
) < 0) {
2220 if (rename(q
, p
) < 0) {
2227 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2232 if (type
== EXEC_DIRECTORY_CONFIGURATION
) {
2235 /* Don't change the owner/access mode of the configuration directory,
2236 * as in the common case it is not written to by a service, and shall
2237 * not be writable. */
2239 if (stat(p
, &st
) < 0) {
2244 /* Still complain if the access mode doesn't match */
2245 if (((st
.st_mode
^ context
->directories
[type
].mode
) & 07777) != 0)
2246 log_warning("%s \'%s\' already exists but the mode is different. "
2247 "(File system: %o %sMode: %o)",
2248 exec_directory_type_to_string(type
), *rt
,
2249 st
.st_mode
& 07777, exec_directory_type_to_string(type
), context
->directories
[type
].mode
& 07777);
2256 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2257 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2258 * current UID/GID ownership.) */
2259 r
= chmod_and_chown(pp
?: p
, context
->directories
[type
].mode
, UID_INVALID
, GID_INVALID
);
2263 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2264 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2265 * assignments to exist.*/
2266 r
= path_chown_recursive(pp
?: p
, uid
, gid
, context
->dynamic_user
? 01777 : 07777);
2274 *exit_status
= exit_status_table
[type
];
2279 static int setup_smack(
2280 const ExecContext
*context
,
2281 const ExecCommand
*command
) {
2288 if (context
->smack_process_label
) {
2289 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
2293 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2295 _cleanup_free_
char *exec_label
= NULL
;
2297 r
= mac_smack_read(command
->path
, SMACK_ATTR_EXEC
, &exec_label
);
2298 if (r
< 0 && !IN_SET(r
, -ENODATA
, -EOPNOTSUPP
))
2301 r
= mac_smack_apply_pid(0, exec_label
? : SMACK_DEFAULT_PROCESS_LABEL
);
2311 static int compile_bind_mounts(
2312 const ExecContext
*context
,
2313 const ExecParameters
*params
,
2314 BindMount
**ret_bind_mounts
,
2315 size_t *ret_n_bind_mounts
,
2316 char ***ret_empty_directories
) {
2318 _cleanup_strv_free_
char **empty_directories
= NULL
;
2319 BindMount
*bind_mounts
;
2321 ExecDirectoryType t
;
2326 assert(ret_bind_mounts
);
2327 assert(ret_n_bind_mounts
);
2328 assert(ret_empty_directories
);
2330 n
= context
->n_bind_mounts
;
2331 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2332 if (!params
->prefix
[t
])
2335 n
+= strv_length(context
->directories
[t
].paths
);
2339 *ret_bind_mounts
= NULL
;
2340 *ret_n_bind_mounts
= 0;
2341 *ret_empty_directories
= NULL
;
2345 bind_mounts
= new(BindMount
, n
);
2349 for (i
= 0; i
< context
->n_bind_mounts
; i
++) {
2350 BindMount
*item
= context
->bind_mounts
+ i
;
2353 s
= strdup(item
->source
);
2359 d
= strdup(item
->destination
);
2366 bind_mounts
[h
++] = (BindMount
) {
2369 .read_only
= item
->read_only
,
2370 .recursive
= item
->recursive
,
2371 .ignore_enoent
= item
->ignore_enoent
,
2375 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2378 if (!params
->prefix
[t
])
2381 if (strv_isempty(context
->directories
[t
].paths
))
2384 if (exec_directory_is_private(context
, t
) &&
2385 !(context
->root_directory
|| context
->root_image
)) {
2388 /* So this is for a dynamic user, and we need to make sure the process can access its own
2389 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2390 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2392 private_root
= path_join(params
->prefix
[t
], "private");
2393 if (!private_root
) {
2398 r
= strv_consume(&empty_directories
, private_root
);
2403 STRV_FOREACH(suffix
, context
->directories
[t
].paths
) {
2406 if (exec_directory_is_private(context
, t
))
2407 s
= path_join(params
->prefix
[t
], "private", *suffix
);
2409 s
= path_join(params
->prefix
[t
], *suffix
);
2415 if (exec_directory_is_private(context
, t
) &&
2416 (context
->root_directory
|| context
->root_image
))
2417 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2418 * directory is not created on the root directory. So, let's bind-mount the directory
2419 * on the 'non-private' place. */
2420 d
= path_join(params
->prefix
[t
], *suffix
);
2429 bind_mounts
[h
++] = (BindMount
) {
2433 .nosuid
= context
->dynamic_user
, /* don't allow suid/sgid when DynamicUser= is on */
2435 .ignore_enoent
= false,
2442 *ret_bind_mounts
= bind_mounts
;
2443 *ret_n_bind_mounts
= n
;
2444 *ret_empty_directories
= TAKE_PTR(empty_directories
);
2449 bind_mount_free_many(bind_mounts
, h
);
2453 static int apply_mount_namespace(
2455 const ExecCommand
*command
,
2456 const ExecContext
*context
,
2457 const ExecParameters
*params
,
2458 const ExecRuntime
*runtime
,
2459 char **error_path
) {
2461 _cleanup_strv_free_
char **empty_directories
= NULL
;
2462 char *tmp
= NULL
, *var
= NULL
;
2463 const char *root_dir
= NULL
, *root_image
= NULL
;
2464 NamespaceInfo ns_info
;
2465 bool needs_sandboxing
;
2466 BindMount
*bind_mounts
= NULL
;
2467 size_t n_bind_mounts
= 0;
2472 /* The runtime struct only contains the parent of the private /tmp,
2473 * which is non-accessible to world users. Inside of it there's a /tmp
2474 * that is sticky, and that's the one we want to use here. */
2476 if (context
->private_tmp
&& runtime
) {
2477 if (runtime
->tmp_dir
)
2478 tmp
= strjoina(runtime
->tmp_dir
, "/tmp");
2479 if (runtime
->var_tmp_dir
)
2480 var
= strjoina(runtime
->var_tmp_dir
, "/tmp");
2483 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2484 root_image
= context
->root_image
;
2487 root_dir
= context
->root_directory
;
2490 r
= compile_bind_mounts(context
, params
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
2494 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2495 if (needs_sandboxing
)
2496 ns_info
= (NamespaceInfo
) {
2497 .ignore_protect_paths
= false,
2498 .private_dev
= context
->private_devices
,
2499 .protect_control_groups
= context
->protect_control_groups
,
2500 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
2501 .protect_kernel_modules
= context
->protect_kernel_modules
,
2502 .protect_hostname
= context
->protect_hostname
,
2503 .mount_apivfs
= context
->mount_apivfs
,
2504 .private_mounts
= context
->private_mounts
,
2506 else if (!context
->dynamic_user
&& root_dir
)
2508 * If DynamicUser=no and RootDirectory= is set then let's pass a relaxed
2509 * sandbox info, otherwise enforce it, don't ignore protected paths and
2510 * fail if we are unable to apply the sandbox inside the mount namespace.
2512 ns_info
= (NamespaceInfo
) {
2513 .ignore_protect_paths
= true,
2516 ns_info
= (NamespaceInfo
) {};
2518 if (context
->mount_flags
== MS_SHARED
)
2519 log_unit_debug(u
, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2521 r
= setup_namespace(root_dir
, root_image
,
2522 &ns_info
, context
->read_write_paths
,
2523 needs_sandboxing
? context
->read_only_paths
: NULL
,
2524 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
2528 context
->temporary_filesystems
,
2529 context
->n_temporary_filesystems
,
2532 needs_sandboxing
? context
->protect_home
: PROTECT_HOME_NO
,
2533 needs_sandboxing
? context
->protect_system
: PROTECT_SYSTEM_NO
,
2534 context
->mount_flags
,
2535 DISSECT_IMAGE_DISCARD_ON_LOOP
,
2538 bind_mount_free_many(bind_mounts
, n_bind_mounts
);
2540 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2541 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2542 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2543 * completely different execution environment. */
2545 if (n_bind_mounts
== 0 &&
2546 context
->n_temporary_filesystems
== 0 &&
2547 !root_dir
&& !root_image
&&
2548 !context
->dynamic_user
) {
2549 log_unit_debug(u
, "Failed to set up namespace, assuming containerized execution and ignoring.");
2553 log_unit_debug(u
, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2554 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2555 n_bind_mounts
, context
->n_temporary_filesystems
, yes_no(root_dir
), yes_no(root_image
), yes_no(context
->dynamic_user
));
2563 static int apply_working_directory(
2564 const ExecContext
*context
,
2565 const ExecParameters
*params
,
2572 assert(exit_status
);
2574 if (context
->working_directory_home
) {
2577 *exit_status
= EXIT_CHDIR
;
2583 } else if (context
->working_directory
)
2584 wd
= context
->working_directory
;
2588 if (params
->flags
& EXEC_APPLY_CHROOT
)
2591 d
= prefix_roota(context
->root_directory
, wd
);
2593 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
2594 *exit_status
= EXIT_CHDIR
;
2601 static int apply_root_directory(
2602 const ExecContext
*context
,
2603 const ExecParameters
*params
,
2604 const bool needs_mount_ns
,
2608 assert(exit_status
);
2610 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2611 if (!needs_mount_ns
&& context
->root_directory
)
2612 if (chroot(context
->root_directory
) < 0) {
2613 *exit_status
= EXIT_CHROOT
;
2621 static int setup_keyring(
2623 const ExecContext
*context
,
2624 const ExecParameters
*p
,
2625 uid_t uid
, gid_t gid
) {
2627 key_serial_t keyring
;
2636 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2637 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2638 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2639 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2640 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2641 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2643 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
2646 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2647 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2648 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2649 * & group is just as nasty as acquiring a reference to the user keyring. */
2651 saved_uid
= getuid();
2652 saved_gid
= getgid();
2654 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2655 if (setregid(gid
, -1) < 0)
2656 return log_unit_error_errno(u
, errno
, "Failed to change GID for user keyring: %m");
2659 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2660 if (setreuid(uid
, -1) < 0) {
2661 r
= log_unit_error_errno(u
, errno
, "Failed to change UID for user keyring: %m");
2666 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
2667 if (keyring
== -1) {
2668 if (errno
== ENOSYS
)
2669 log_unit_debug_errno(u
, errno
, "Kernel keyring not supported, ignoring.");
2670 else if (IN_SET(errno
, EACCES
, EPERM
))
2671 log_unit_debug_errno(u
, errno
, "Kernel keyring access prohibited, ignoring.");
2672 else if (errno
== EDQUOT
)
2673 log_unit_debug_errno(u
, errno
, "Out of kernel keyrings to allocate, ignoring.");
2675 r
= log_unit_error_errno(u
, errno
, "Setting up kernel keyring failed: %m");
2680 /* When requested link the user keyring into the session keyring. */
2681 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
2683 if (keyctl(KEYCTL_LINK
,
2684 KEY_SPEC_USER_KEYRING
,
2685 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
2686 r
= log_unit_error_errno(u
, errno
, "Failed to link user keyring into session keyring: %m");
2691 /* Restore uid/gid back */
2692 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2693 if (setreuid(saved_uid
, -1) < 0) {
2694 r
= log_unit_error_errno(u
, errno
, "Failed to change UID back for user keyring: %m");
2699 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2700 if (setregid(saved_gid
, -1) < 0)
2701 return log_unit_error_errno(u
, errno
, "Failed to change GID back for user keyring: %m");
2704 /* Populate the keyring with the invocation ID by default, as original saved_uid. */
2705 if (!sd_id128_is_null(u
->invocation_id
)) {
2708 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
2710 log_unit_debug_errno(u
, errno
, "Failed to add invocation ID to keyring, ignoring: %m");
2712 if (keyctl(KEYCTL_SETPERM
, key
,
2713 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
2714 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
2715 r
= log_unit_error_errno(u
, errno
, "Failed to restrict invocation ID permission: %m");
2720 /* Revert back uid & gid for the last time, and exit */
2721 /* no extra logging, as only the first already reported error matters */
2722 if (getuid() != saved_uid
)
2723 (void) setreuid(saved_uid
, -1);
2725 if (getgid() != saved_gid
)
2726 (void) setregid(saved_gid
, -1);
2731 static void append_socket_pair(int *array
, size_t *n
, const int pair
[static 2]) {
2737 array
[(*n
)++] = pair
[0];
2739 array
[(*n
)++] = pair
[1];
2742 static int close_remaining_fds(
2743 const ExecParameters
*params
,
2744 const ExecRuntime
*runtime
,
2745 const DynamicCreds
*dcreds
,
2749 int *fds
, size_t n_fds
) {
2751 size_t n_dont_close
= 0;
2752 int dont_close
[n_fds
+ 12];
2756 if (params
->stdin_fd
>= 0)
2757 dont_close
[n_dont_close
++] = params
->stdin_fd
;
2758 if (params
->stdout_fd
>= 0)
2759 dont_close
[n_dont_close
++] = params
->stdout_fd
;
2760 if (params
->stderr_fd
>= 0)
2761 dont_close
[n_dont_close
++] = params
->stderr_fd
;
2764 dont_close
[n_dont_close
++] = socket_fd
;
2766 dont_close
[n_dont_close
++] = exec_fd
;
2768 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
2769 n_dont_close
+= n_fds
;
2773 append_socket_pair(dont_close
, &n_dont_close
, runtime
->netns_storage_socket
);
2777 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->user
->storage_socket
);
2779 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->group
->storage_socket
);
2782 if (user_lookup_fd
>= 0)
2783 dont_close
[n_dont_close
++] = user_lookup_fd
;
2785 return close_all_fds(dont_close
, n_dont_close
);
2788 static int send_user_lookup(
2796 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2797 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2800 if (user_lookup_fd
< 0)
2803 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
2806 if (writev(user_lookup_fd
,
2808 IOVEC_INIT(&uid
, sizeof(uid
)),
2809 IOVEC_INIT(&gid
, sizeof(gid
)),
2810 IOVEC_INIT_STRING(unit
->id
) }, 3) < 0)
2816 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
2823 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2828 if (!c
->working_directory_home
)
2831 r
= get_home_dir(buf
);
2839 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
2840 _cleanup_strv_free_
char ** list
= NULL
;
2841 ExecDirectoryType t
;
2848 assert(c
->dynamic_user
);
2850 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2851 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2854 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2857 if (t
== EXEC_DIRECTORY_CONFIGURATION
)
2863 STRV_FOREACH(i
, c
->directories
[t
].paths
) {
2866 if (exec_directory_is_private(c
, t
))
2867 e
= path_join(p
->prefix
[t
], "private", *i
);
2869 e
= path_join(p
->prefix
[t
], *i
);
2873 r
= strv_consume(&list
, e
);
2879 *ret
= TAKE_PTR(list
);
2884 static char *exec_command_line(char **argv
);
2886 static int exec_parameters_get_cgroup_path(const ExecParameters
*params
, char **ret
) {
2887 bool using_subcgroup
;
2893 if (!params
->cgroup_path
)
2896 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
2897 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
2898 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
2899 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
2900 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
2901 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
2902 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
2903 * flag, which is only passed for the former statements, not for the latter. */
2905 using_subcgroup
= FLAGS_SET(params
->flags
, EXEC_CONTROL_CGROUP
|EXEC_CGROUP_DELEGATE
|EXEC_IS_CONTROL
);
2906 if (using_subcgroup
)
2907 p
= path_join(params
->cgroup_path
, ".control");
2909 p
= strdup(params
->cgroup_path
);
2914 return using_subcgroup
;
2917 static int exec_child(
2919 const ExecCommand
*command
,
2920 const ExecContext
*context
,
2921 const ExecParameters
*params
,
2922 ExecRuntime
*runtime
,
2923 DynamicCreds
*dcreds
,
2925 const int named_iofds
[static 3],
2927 size_t n_socket_fds
,
2928 size_t n_storage_fds
,
2933 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **accum_env
= NULL
, **replaced_argv
= NULL
;
2934 int *fds_with_exec_fd
, n_fds_with_exec_fd
, r
, ngids
= 0, exec_fd
= -1;
2935 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
2936 const char *username
= NULL
, *groupname
= NULL
;
2937 _cleanup_free_
char *home_buffer
= NULL
;
2938 const char *home
= NULL
, *shell
= NULL
;
2939 char **final_argv
= NULL
;
2940 dev_t journal_stream_dev
= 0;
2941 ino_t journal_stream_ino
= 0;
2942 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda) */
2943 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
2944 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
2945 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
2947 _cleanup_free_
char *mac_selinux_context_net
= NULL
;
2948 bool use_selinux
= false;
2951 bool use_smack
= false;
2954 bool use_apparmor
= false;
2956 uid_t uid
= UID_INVALID
;
2957 gid_t gid
= GID_INVALID
;
2959 ExecDirectoryType dt
;
2966 assert(exit_status
);
2968 rename_process_from_path(command
->path
);
2970 /* We reset exactly these signals, since they are the
2971 * only ones we set to SIG_IGN in the main daemon. All
2972 * others we leave untouched because we set them to
2973 * SIG_DFL or a valid handler initially, both of which
2974 * will be demoted to SIG_DFL. */
2975 (void) default_signals(SIGNALS_CRASH_HANDLER
,
2976 SIGNALS_IGNORE
, -1);
2978 if (context
->ignore_sigpipe
)
2979 (void) ignore_signals(SIGPIPE
, -1);
2981 r
= reset_signal_mask();
2983 *exit_status
= EXIT_SIGNAL_MASK
;
2984 return log_unit_error_errno(unit
, r
, "Failed to set process signal mask: %m");
2987 if (params
->idle_pipe
)
2988 do_idle_pipe_dance(params
->idle_pipe
);
2990 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2991 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2992 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2993 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2996 log_set_open_when_needed(true);
2998 /* In case anything used libc syslog(), close this here, too */
3001 n_fds
= n_socket_fds
+ n_storage_fds
;
3002 r
= close_remaining_fds(params
, runtime
, dcreds
, user_lookup_fd
, socket_fd
, params
->exec_fd
, fds
, n_fds
);
3004 *exit_status
= EXIT_FDS
;
3005 return log_unit_error_errno(unit
, r
, "Failed to close unwanted file descriptors: %m");
3008 if (!context
->same_pgrp
)
3010 *exit_status
= EXIT_SETSID
;
3011 return log_unit_error_errno(unit
, errno
, "Failed to create new process session: %m");
3014 exec_context_tty_reset(context
, params
);
3016 if (unit_shall_confirm_spawn(unit
)) {
3017 const char *vc
= params
->confirm_spawn
;
3018 _cleanup_free_
char *cmdline
= NULL
;
3020 cmdline
= exec_command_line(command
->argv
);
3022 *exit_status
= EXIT_MEMORY
;
3026 r
= ask_for_confirmation(vc
, unit
, cmdline
);
3027 if (r
!= CONFIRM_EXECUTE
) {
3028 if (r
== CONFIRM_PRETEND_SUCCESS
) {
3029 *exit_status
= EXIT_SUCCESS
;
3032 *exit_status
= EXIT_CONFIRM
;
3033 log_unit_error(unit
, "Execution cancelled by the user");
3038 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3039 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3040 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3041 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3042 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3043 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit
->id
, true) != 0 ||
3044 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit
->manager
) ? "system" : "user", true) != 0) {
3045 *exit_status
= EXIT_MEMORY
;
3046 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
3049 if (context
->dynamic_user
&& dcreds
) {
3050 _cleanup_strv_free_
char **suggested_paths
= NULL
;
3052 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3053 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3054 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3055 *exit_status
= EXIT_USER
;
3056 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
3059 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
3061 *exit_status
= EXIT_MEMORY
;
3065 r
= dynamic_creds_realize(dcreds
, suggested_paths
, &uid
, &gid
);
3067 *exit_status
= EXIT_USER
;
3069 log_unit_error(unit
, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3072 return log_unit_error_errno(unit
, r
, "Failed to update dynamic user credentials: %m");
3075 if (!uid_is_valid(uid
)) {
3076 *exit_status
= EXIT_USER
;
3077 log_unit_error(unit
, "UID validation failed for \""UID_FMT
"\"", uid
);
3081 if (!gid_is_valid(gid
)) {
3082 *exit_status
= EXIT_USER
;
3083 log_unit_error(unit
, "GID validation failed for \""GID_FMT
"\"", gid
);
3088 username
= dcreds
->user
->name
;
3091 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
3093 *exit_status
= EXIT_USER
;
3094 return log_unit_error_errno(unit
, r
, "Failed to determine user credentials: %m");
3097 r
= get_fixed_group(context
, &groupname
, &gid
);
3099 *exit_status
= EXIT_GROUP
;
3100 return log_unit_error_errno(unit
, r
, "Failed to determine group credentials: %m");
3104 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3105 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
3106 &supplementary_gids
, &ngids
);
3108 *exit_status
= EXIT_GROUP
;
3109 return log_unit_error_errno(unit
, r
, "Failed to determine supplementary groups: %m");
3112 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
3114 *exit_status
= EXIT_USER
;
3115 return log_unit_error_errno(unit
, r
, "Failed to send user credentials to PID1: %m");
3118 user_lookup_fd
= safe_close(user_lookup_fd
);
3120 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
3122 *exit_status
= EXIT_CHDIR
;
3123 return log_unit_error_errno(unit
, r
, "Failed to determine $HOME for user: %m");
3126 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3127 * must be sure to drop O_NONBLOCK */
3129 (void) fd_nonblock(socket_fd
, false);
3131 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3132 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3133 if (params
->cgroup_path
) {
3134 _cleanup_free_
char *p
= NULL
;
3136 r
= exec_parameters_get_cgroup_path(params
, &p
);
3138 *exit_status
= EXIT_CGROUP
;
3139 return log_unit_error_errno(unit
, r
, "Failed to acquire cgroup path: %m");
3142 r
= cg_attach_everywhere(params
->cgroup_supported
, p
, 0, NULL
, NULL
);
3144 *exit_status
= EXIT_CGROUP
;
3145 return log_unit_error_errno(unit
, r
, "Failed to attach to cgroup %s: %m", p
);
3149 if (context
->network_namespace_path
&& runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
3150 r
= open_netns_path(runtime
->netns_storage_socket
, context
->network_namespace_path
);
3152 *exit_status
= EXIT_NETWORK
;
3153 return log_unit_error_errno(unit
, r
, "Failed to open network namespace path %s: %m", context
->network_namespace_path
);
3157 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
3159 *exit_status
= EXIT_STDIN
;
3160 return log_unit_error_errno(unit
, r
, "Failed to set up standard input: %m");
3163 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
3165 *exit_status
= EXIT_STDOUT
;
3166 return log_unit_error_errno(unit
, r
, "Failed to set up standard output: %m");
3169 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
3171 *exit_status
= EXIT_STDERR
;
3172 return log_unit_error_errno(unit
, r
, "Failed to set up standard error output: %m");
3175 if (context
->oom_score_adjust_set
) {
3176 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3177 * prohibit write access to this file, and we shouldn't trip up over that. */
3178 r
= set_oom_score_adjust(context
->oom_score_adjust
);
3179 if (IN_SET(r
, -EPERM
, -EACCES
))
3180 log_unit_debug_errno(unit
, r
, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3182 *exit_status
= EXIT_OOM_ADJUST
;
3183 return log_unit_error_errno(unit
, r
, "Failed to adjust OOM setting: %m");
3187 if (context
->nice_set
)
3188 if (setpriority(PRIO_PROCESS
, 0, context
->nice
) < 0) {
3189 *exit_status
= EXIT_NICE
;
3190 return log_unit_error_errno(unit
, errno
, "Failed to set up process scheduling priority (nice level): %m");
3193 if (context
->cpu_sched_set
) {
3194 struct sched_param param
= {
3195 .sched_priority
= context
->cpu_sched_priority
,
3198 r
= sched_setscheduler(0,
3199 context
->cpu_sched_policy
|
3200 (context
->cpu_sched_reset_on_fork
?
3201 SCHED_RESET_ON_FORK
: 0),
3204 *exit_status
= EXIT_SETSCHEDULER
;
3205 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU scheduling: %m");
3209 if (context
->cpu_set
.set
)
3210 if (sched_setaffinity(0, context
->cpu_set
.allocated
, context
->cpu_set
.set
) < 0) {
3211 *exit_status
= EXIT_CPUAFFINITY
;
3212 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU affinity: %m");
3215 if (mpol_is_valid(numa_policy_get_type(&context
->numa_policy
))) {
3216 r
= apply_numa_policy(&context
->numa_policy
);
3217 if (r
== -EOPNOTSUPP
)
3218 log_unit_debug_errno(unit
, r
, "NUMA support not available, ignoring.");
3220 *exit_status
= EXIT_NUMA_POLICY
;
3221 return log_unit_error_errno(unit
, r
, "Failed to set NUMA memory policy: %m");
3225 if (context
->ioprio_set
)
3226 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
3227 *exit_status
= EXIT_IOPRIO
;
3228 return log_unit_error_errno(unit
, errno
, "Failed to set up IO scheduling priority: %m");
3231 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
3232 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
3233 *exit_status
= EXIT_TIMERSLACK
;
3234 return log_unit_error_errno(unit
, errno
, "Failed to set up timer slack: %m");
3237 if (context
->personality
!= PERSONALITY_INVALID
) {
3238 r
= safe_personality(context
->personality
);
3240 *exit_status
= EXIT_PERSONALITY
;
3241 return log_unit_error_errno(unit
, r
, "Failed to set up execution domain (personality): %m");
3245 if (context
->utmp_id
)
3246 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
3248 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
3249 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
3253 if (uid_is_valid(uid
)) {
3254 r
= chown_terminal(STDIN_FILENO
, uid
);
3256 *exit_status
= EXIT_STDIN
;
3257 return log_unit_error_errno(unit
, r
, "Failed to change ownership of terminal: %m");
3261 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3262 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3263 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3264 * touch a single hierarchy too. */
3265 if (params
->cgroup_path
&& context
->user
&& (params
->flags
& EXEC_CGROUP_DELEGATE
)) {
3266 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, uid
, gid
);
3268 *exit_status
= EXIT_CGROUP
;
3269 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
3273 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
3274 r
= setup_exec_directory(context
, params
, uid
, gid
, dt
, exit_status
);
3276 return log_unit_error_errno(unit
, r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
3279 r
= build_environment(
3291 *exit_status
= EXIT_MEMORY
;
3295 r
= build_pass_environment(context
, &pass_env
);
3297 *exit_status
= EXIT_MEMORY
;
3301 accum_env
= strv_env_merge(5,
3302 params
->environment
,
3305 context
->environment
,
3309 *exit_status
= EXIT_MEMORY
;
3312 accum_env
= strv_env_clean(accum_env
);
3314 (void) umask(context
->umask
);
3316 r
= setup_keyring(unit
, context
, params
, uid
, gid
);
3318 *exit_status
= EXIT_KEYRING
;
3319 return log_unit_error_errno(unit
, r
, "Failed to set up kernel keyring: %m");
3322 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3323 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
3325 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3326 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
3328 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3329 if (needs_ambient_hack
)
3330 needs_setuid
= false;
3332 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
3334 if (needs_sandboxing
) {
3335 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3336 * present. The actual MAC context application will happen later, as late as possible, to avoid
3337 * impacting our own code paths. */
3340 use_selinux
= mac_selinux_use();
3343 use_smack
= mac_smack_use();
3346 use_apparmor
= mac_apparmor_use();
3350 if (needs_sandboxing
) {
3353 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3354 * is set here. (See below.) */
3356 r
= setrlimit_closest_all((const struct rlimit
* const *) context
->rlimit
, &which_failed
);
3358 *exit_status
= EXIT_LIMITS
;
3359 return log_unit_error_errno(unit
, r
, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed
));
3365 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3366 * wins here. (See above.) */
3368 if (context
->pam_name
&& username
) {
3369 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
3371 *exit_status
= EXIT_PAM
;
3372 return log_unit_error_errno(unit
, r
, "Failed to set up PAM session: %m");
3377 if ((context
->private_network
|| context
->network_namespace_path
) && runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
3379 if (ns_type_supported(NAMESPACE_NET
)) {
3380 r
= setup_netns(runtime
->netns_storage_socket
);
3382 *exit_status
= EXIT_NETWORK
;
3383 return log_unit_error_errno(unit
, r
, "Failed to set up network namespacing: %m");
3385 } else if (context
->network_namespace_path
) {
3386 *exit_status
= EXIT_NETWORK
;
3387 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
), "NetworkNamespacePath= is not supported, refusing.");
3389 log_unit_warning(unit
, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3392 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
3393 if (needs_mount_namespace
) {
3394 _cleanup_free_
char *error_path
= NULL
;
3396 r
= apply_mount_namespace(unit
, command
, context
, params
, runtime
, &error_path
);
3398 *exit_status
= EXIT_NAMESPACE
;
3399 return log_unit_error_errno(unit
, r
, "Failed to set up mount namespacing%s%s: %m",
3400 error_path
? ": " : "", strempty(error_path
));
3404 if (context
->protect_hostname
) {
3405 if (ns_type_supported(NAMESPACE_UTS
)) {
3406 if (unshare(CLONE_NEWUTS
) < 0) {
3407 *exit_status
= EXIT_NAMESPACE
;
3408 return log_unit_error_errno(unit
, errno
, "Failed to set up UTS namespacing: %m");
3411 log_unit_warning(unit
, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
3413 r
= seccomp_protect_hostname();
3415 *exit_status
= EXIT_SECCOMP
;
3416 return log_unit_error_errno(unit
, r
, "Failed to apply hostname restrictions: %m");
3421 /* Drop groups as early as possible */
3423 r
= enforce_groups(gid
, supplementary_gids
, ngids
);
3425 *exit_status
= EXIT_GROUP
;
3426 return log_unit_error_errno(unit
, r
, "Changing group credentials failed: %m");
3430 if (needs_sandboxing
) {
3432 if (use_selinux
&& params
->selinux_context_net
&& socket_fd
>= 0) {
3433 r
= mac_selinux_get_child_mls_label(socket_fd
, command
->path
, context
->selinux_context
, &mac_selinux_context_net
);
3435 *exit_status
= EXIT_SELINUX_CONTEXT
;
3436 return log_unit_error_errno(unit
, r
, "Failed to determine SELinux context: %m");
3441 if (context
->private_users
) {
3442 r
= setup_private_users(uid
, gid
);
3444 *exit_status
= EXIT_USER
;
3445 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing: %m");
3450 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3451 * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3452 * however if we have it as we want to keep it open until the final execve(). */
3454 if (params
->exec_fd
>= 0) {
3455 exec_fd
= params
->exec_fd
;
3457 if (exec_fd
< 3 + (int) n_fds
) {
3460 /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3461 * process we are about to execute. */
3463 moved_fd
= fcntl(exec_fd
, F_DUPFD_CLOEXEC
, 3 + (int) n_fds
);
3465 *exit_status
= EXIT_FDS
;
3466 return log_unit_error_errno(unit
, errno
, "Couldn't move exec fd up: %m");
3469 safe_close(exec_fd
);
3472 /* This fd should be FD_CLOEXEC already, but let's make sure. */
3473 r
= fd_cloexec(exec_fd
, true);
3475 *exit_status
= EXIT_FDS
;
3476 return log_unit_error_errno(unit
, r
, "Failed to make exec fd FD_CLOEXEC: %m");
3480 fds_with_exec_fd
= newa(int, n_fds
+ 1);
3481 memcpy_safe(fds_with_exec_fd
, fds
, n_fds
* sizeof(int));
3482 fds_with_exec_fd
[n_fds
] = exec_fd
;
3483 n_fds_with_exec_fd
= n_fds
+ 1;
3485 fds_with_exec_fd
= fds
;
3486 n_fds_with_exec_fd
= n_fds
;
3489 r
= close_all_fds(fds_with_exec_fd
, n_fds_with_exec_fd
);
3491 r
= shift_fds(fds
, n_fds
);
3493 r
= flags_fds(fds
, n_socket_fds
, n_storage_fds
, context
->non_blocking
);
3495 *exit_status
= EXIT_FDS
;
3496 return log_unit_error_errno(unit
, r
, "Failed to adjust passed file descriptors: %m");
3499 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3500 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3501 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3504 secure_bits
= context
->secure_bits
;
3506 if (needs_sandboxing
) {
3509 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3510 * requested. (Note this is placed after the general resource limit initialization, see
3511 * above, in order to take precedence.) */
3512 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
3513 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
3514 *exit_status
= EXIT_LIMITS
;
3515 return log_unit_error_errno(unit
, errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3520 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3521 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3523 r
= setup_smack(context
, command
);
3525 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
3526 return log_unit_error_errno(unit
, r
, "Failed to set SMACK process label: %m");
3531 bset
= context
->capability_bounding_set
;
3532 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3533 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3534 * instead of us doing that */
3535 if (needs_ambient_hack
)
3536 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
3537 (UINT64_C(1) << CAP_SETUID
) |
3538 (UINT64_C(1) << CAP_SETGID
);
3540 if (!cap_test_all(bset
)) {
3541 r
= capability_bounding_set_drop(bset
, false);
3543 *exit_status
= EXIT_CAPABILITIES
;
3544 return log_unit_error_errno(unit
, r
, "Failed to drop capabilities: %m");
3548 /* This is done before enforce_user, but ambient set
3549 * does not survive over setresuid() if keep_caps is not set. */
3550 if (!needs_ambient_hack
&&
3551 context
->capability_ambient_set
!= 0) {
3552 r
= capability_ambient_set_apply(context
->capability_ambient_set
, true);
3554 *exit_status
= EXIT_CAPABILITIES
;
3555 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (before UID change): %m");
3560 /* chroot to root directory first, before we lose the ability to chroot */
3561 r
= apply_root_directory(context
, params
, needs_mount_namespace
, exit_status
);
3563 return log_unit_error_errno(unit
, r
, "Chrooting to the requested root directory failed: %m");
3566 if (uid_is_valid(uid
)) {
3567 r
= enforce_user(context
, uid
);
3569 *exit_status
= EXIT_USER
;
3570 return log_unit_error_errno(unit
, r
, "Failed to change UID to " UID_FMT
": %m", uid
);
3573 if (!needs_ambient_hack
&&
3574 context
->capability_ambient_set
!= 0) {
3576 /* Fix the ambient capabilities after user change. */
3577 r
= capability_ambient_set_apply(context
->capability_ambient_set
, false);
3579 *exit_status
= EXIT_CAPABILITIES
;
3580 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (after UID change): %m");
3583 /* If we were asked to change user and ambient capabilities
3584 * were requested, we had to add keep-caps to the securebits
3585 * so that we would maintain the inherited capability set
3586 * through the setresuid(). Make sure that the bit is added
3587 * also to the context secure_bits so that we don't try to
3588 * drop the bit away next. */
3590 secure_bits
|= 1<<SECURE_KEEP_CAPS
;
3595 /* Apply working directory here, because the working directory might be on NFS and only the user running
3596 * this service might have the correct privilege to change to the working directory */
3597 r
= apply_working_directory(context
, params
, home
, exit_status
);
3599 return log_unit_error_errno(unit
, r
, "Changing to the requested working directory failed: %m");
3601 if (needs_sandboxing
) {
3602 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3603 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3604 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3605 * are restricted. */
3609 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
3612 r
= setexeccon(exec_context
);
3614 *exit_status
= EXIT_SELINUX_CONTEXT
;
3615 return log_unit_error_errno(unit
, r
, "Failed to change SELinux context to %s: %m", exec_context
);
3622 if (use_apparmor
&& context
->apparmor_profile
) {
3623 r
= aa_change_onexec(context
->apparmor_profile
);
3624 if (r
< 0 && !context
->apparmor_profile_ignore
) {
3625 *exit_status
= EXIT_APPARMOR_PROFILE
;
3626 return log_unit_error_errno(unit
, errno
, "Failed to prepare AppArmor profile change to %s: %m", context
->apparmor_profile
);
3631 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3632 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3633 if (prctl(PR_GET_SECUREBITS
) != secure_bits
)
3634 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
3635 *exit_status
= EXIT_SECUREBITS
;
3636 return log_unit_error_errno(unit
, errno
, "Failed to set process secure bits: %m");
3639 if (context_has_no_new_privileges(context
))
3640 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
3641 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
3642 return log_unit_error_errno(unit
, errno
, "Failed to disable new privileges: %m");
3646 r
= apply_address_families(unit
, context
);
3648 *exit_status
= EXIT_ADDRESS_FAMILIES
;
3649 return log_unit_error_errno(unit
, r
, "Failed to restrict address families: %m");
3652 r
= apply_memory_deny_write_execute(unit
, context
);
3654 *exit_status
= EXIT_SECCOMP
;
3655 return log_unit_error_errno(unit
, r
, "Failed to disable writing to executable memory: %m");
3658 r
= apply_restrict_realtime(unit
, context
);
3660 *exit_status
= EXIT_SECCOMP
;
3661 return log_unit_error_errno(unit
, r
, "Failed to apply realtime restrictions: %m");
3664 r
= apply_restrict_suid_sgid(unit
, context
);
3666 *exit_status
= EXIT_SECCOMP
;
3667 return log_unit_error_errno(unit
, r
, "Failed to apply SUID/SGID restrictions: %m");
3670 r
= apply_restrict_namespaces(unit
, context
);
3672 *exit_status
= EXIT_SECCOMP
;
3673 return log_unit_error_errno(unit
, r
, "Failed to apply namespace restrictions: %m");
3676 r
= apply_protect_sysctl(unit
, context
);
3678 *exit_status
= EXIT_SECCOMP
;
3679 return log_unit_error_errno(unit
, r
, "Failed to apply sysctl restrictions: %m");
3682 r
= apply_protect_kernel_modules(unit
, context
);
3684 *exit_status
= EXIT_SECCOMP
;
3685 return log_unit_error_errno(unit
, r
, "Failed to apply module loading restrictions: %m");
3688 r
= apply_private_devices(unit
, context
);
3690 *exit_status
= EXIT_SECCOMP
;
3691 return log_unit_error_errno(unit
, r
, "Failed to set up private devices: %m");
3694 r
= apply_syscall_archs(unit
, context
);
3696 *exit_status
= EXIT_SECCOMP
;
3697 return log_unit_error_errno(unit
, r
, "Failed to apply syscall architecture restrictions: %m");
3700 r
= apply_lock_personality(unit
, context
);
3702 *exit_status
= EXIT_SECCOMP
;
3703 return log_unit_error_errno(unit
, r
, "Failed to lock personalities: %m");
3706 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3707 * by the filter as little as possible. */
3708 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
3710 *exit_status
= EXIT_SECCOMP
;
3711 return log_unit_error_errno(unit
, r
, "Failed to apply system call filters: %m");
3716 if (!strv_isempty(context
->unset_environment
)) {
3719 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
3721 *exit_status
= EXIT_MEMORY
;
3725 strv_free_and_replace(accum_env
, ee
);
3728 if (!FLAGS_SET(command
->flags
, EXEC_COMMAND_NO_ENV_EXPAND
)) {
3729 replaced_argv
= replace_env_argv(command
->argv
, accum_env
);
3730 if (!replaced_argv
) {
3731 *exit_status
= EXIT_MEMORY
;
3734 final_argv
= replaced_argv
;
3736 final_argv
= command
->argv
;
3738 if (DEBUG_LOGGING
) {
3739 _cleanup_free_
char *line
;
3741 line
= exec_command_line(final_argv
);
3743 log_struct(LOG_DEBUG
,
3744 "EXECUTABLE=%s", command
->path
,
3745 LOG_UNIT_MESSAGE(unit
, "Executing: %s", line
),
3747 LOG_UNIT_INVOCATION_ID(unit
));
3753 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3754 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3756 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
3757 *exit_status
= EXIT_EXEC
;
3758 return log_unit_error_errno(unit
, errno
, "Failed to enable exec_fd: %m");
3762 execve(command
->path
, final_argv
, accum_env
);
3768 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3769 * that POLLHUP on it no longer means execve() succeeded. */
3771 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
3772 *exit_status
= EXIT_EXEC
;
3773 return log_unit_error_errno(unit
, errno
, "Failed to disable exec_fd: %m");
3777 if (r
== -ENOENT
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
)) {
3778 log_struct_errno(LOG_INFO
, r
,
3779 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3781 LOG_UNIT_INVOCATION_ID(unit
),
3782 LOG_UNIT_MESSAGE(unit
, "Executable %s missing, skipping: %m",
3784 "EXECUTABLE=%s", command
->path
);
3788 *exit_status
= EXIT_EXEC
;
3789 return log_unit_error_errno(unit
, r
, "Failed to execute command: %m");
3792 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***l
);
3793 static int exec_context_named_iofds(const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[static 3]);
/* Spawns the process described by 'command' for 'unit'.
 *
 * Visible steps, in order:
 *   - If stdin, stdout or stderr is configured as a socket, the number of socket fds in 'params' is checked
 *     ("Got more than one socket." / "Got no socket." are logged for the bad cases) and params->fds[0] is
 *     used as socket_fd.
 *   - Named stdio fds and environment files are loaded; a failure is logged and returned.
 *   - An "About to execute" debug record is logged with the command line.
 *   - If params->cgroup_path is set, the cgroup path is resolved and, when a sub-cgroup is in use (r > 0),
 *     that cgroup is created.
 *   - A child is forked; the child runs exec_child() and logs the failing step when that returns.
 *   - The parent logs the forked PID, attaches it to the cgroup (result ignored) and records the start in
 *     command->exec_status.
 *
 * NOTE(review): several lines of this function (error-check conditions, the fork call itself, the remaining
 * parameters and the return statements) are not visible in this excerpt; confirm details against the full
 * source. */
3795 int exec_spawn(Unit
*unit
,
3796 ExecCommand
*command
,
3797 const ExecContext
*context
,
3798 const ExecParameters
*params
,
3799 ExecRuntime
*runtime
,
3800 DynamicCreds
*dcreds
,
3803 int socket_fd
, r
, named_iofds
[3] = { -1, -1, -1 }, *fds
= NULL
;
3804 _cleanup_free_
char *subcgroup_path
= NULL
;
3805 _cleanup_strv_free_
char **files_env
= NULL
;
3806 size_t n_storage_fds
= 0, n_socket_fds
= 0;
3807 _cleanup_free_
char *line
= NULL
;
/* An fd array must be supplied whenever at least one socket or storage fd is announced. */
3815 assert(params
->fds
|| (params
->n_socket_fds
+ params
->n_storage_fds
<= 0));
/* Socket-based stdio requires exactly one socket fd, which then is the first entry of params->fds. */
3817 if (context
->std_input
== EXEC_INPUT_SOCKET
||
3818 context
->std_output
== EXEC_OUTPUT_SOCKET
||
3819 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
3821 if (params
->n_socket_fds
> 1) {
3822 log_unit_error(unit
, "Got more than one socket.");
3826 if (params
->n_socket_fds
== 0) {
3827 log_unit_error(unit
, "Got no socket.");
3831 socket_fd
= params
->fds
[0];
3835 n_socket_fds
= params
->n_socket_fds
;
3836 n_storage_fds
= params
->n_storage_fds
;
3839 r
= exec_context_named_iofds(context
, params
, named_iofds
);
3841 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
3843 r
= exec_context_load_environment(unit
, context
, &files_env
);
3845 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
3847 line
= exec_command_line(command
->argv
);
3851 log_struct(LOG_DEBUG
,
3852 LOG_UNIT_MESSAGE(unit
, "About to execute: %s", line
),
3853 "EXECUTABLE=%s", command
->path
,
3855 LOG_UNIT_INVOCATION_ID(unit
));
3857 if (params
->cgroup_path
) {
3858 r
= exec_parameters_get_cgroup_path(params
, &subcgroup_path
);
3860 return log_unit_error_errno(unit
, r
, "Failed to acquire subcgroup path: %m");
3861 if (r
> 0) { /* We are using a child cgroup */
3862 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
);
3864 return log_unit_error_errno(unit
, r
, "Failed to create control group '%s': %m", subcgroup_path
);
3870 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
3873 int exit_status
= EXIT_SUCCESS
;
3875 r
= exec_child(unit
,
3887 unit
->manager
->user_lookup_fds
[1],
/* exec_child() came back, so the command was not executed: translate the exit status it reported into a
 * step name and log a structured failure record. */
3891 const char *status
=
3892 exit_status_to_string(exit_status
,
3893 EXIT_STATUS_LIBC
| EXIT_STATUS_SYSTEMD
);
3895 log_struct_errno(LOG_ERR
, r
,
3896 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3898 LOG_UNIT_INVOCATION_ID(unit
),
3899 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
3900 status
, command
->path
),
3901 "EXECUTABLE=%s", command
->path
);
3907 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
3909 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
3910 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
3911 * process will be killed too). */
3913 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
, pid
);
3915 exec_status_start(&command
->exec_status
, pid
);
/* Fills an ExecContext with its default settings: I/O priority, CPU scheduling policy, syslog priority and
 * level prefix, SIGPIPE handling, timer slack, personality, per-directory-type mode, clean timeout,
 * capability bounding set, namespace restrictions, maximum log level and NUMA policy.
 *
 * NOTE(review): some statements of this function are not visible in this excerpt. */
3921 void exec_context_init(ExecContext
*c
) {
3922 ExecDirectoryType i
;
3927 c
->ioprio
= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 0);
3928 c
->cpu_sched_policy
= SCHED_OTHER
;
3929 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
3930 c
->syslog_level_prefix
= true;
3931 c
->ignore_sigpipe
= true;
/* NSEC_INFINITY and PERSONALITY_INVALID are the values exec_child() compares against to decide whether
 * timer slack or personality have to be applied at all. */
3932 c
->timer_slack_nsec
= NSEC_INFINITY
;
3933 c
->personality
= PERSONALITY_INVALID
;
/* Every execution directory type starts out with mode 0755. */
3934 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3935 c
->directories
[i
].mode
= 0755;
3936 c
->timeout_clean_usec
= USEC_INFINITY
;
3937 c
->capability_bounding_set
= CAP_ALL
;
/* Compile-time check that the initial namespace flag value is distinct from "all namespaces". */
3938 assert_cc(NAMESPACE_FLAGS_INITIAL
!= NAMESPACE_FLAGS_ALL
);
3939 c
->restrict_namespaces
= NAMESPACE_FLAGS_INITIAL
;
3940 c
->log_level_max
= -1;
3941 numa_policy_reset(&c
->numa_policy
);
/* Releases everything an ExecContext owns and resets the freed members: environment string vectors, resource
 * limits, stdio fd names and files, path and identity strings, supplementary groups, PAM name, path lists,
 * bind mounts, temporary file systems, CPU set, NUMA policy, MAC labels, syscall/architecture/address family
 * sets, execution directory paths, extra log fields, stdin data and the network namespace path. Pointer
 * members are assigned the return value of their free function, and the counters are reset to 0.
 *
 * NOTE(review): some statements of this function are not visible in this excerpt. */
3944 void exec_context_done(ExecContext
*c
) {
3945 ExecDirectoryType i
;
3950 c
->environment
= strv_free(c
->environment
);
3951 c
->environment_files
= strv_free(c
->environment_files
);
3952 c
->pass_environment
= strv_free(c
->pass_environment
);
3953 c
->unset_environment
= strv_free(c
->unset_environment
);
3955 rlimit_free_all(c
->rlimit
);
/* One fd name and one file path per standard stream (indexes 0..2). */
3957 for (l
= 0; l
< 3; l
++) {
3958 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
3959 c
->stdio_file
[l
] = mfree(c
->stdio_file
[l
]);
3962 c
->working_directory
= mfree(c
->working_directory
);
3963 c
->root_directory
= mfree(c
->root_directory
);
3964 c
->root_image
= mfree(c
->root_image
);
3965 c
->tty_path
= mfree(c
->tty_path
);
3966 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
3967 c
->user
= mfree(c
->user
);
3968 c
->group
= mfree(c
->group
);
3970 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
3972 c
->pam_name
= mfree(c
->pam_name
);
3974 c
->read_only_paths
= strv_free(c
->read_only_paths
);
3975 c
->read_write_paths
= strv_free(c
->read_write_paths
);
3976 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
/* The *_free_many() helpers take array and count; both are reset explicitly afterwards. */
3978 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
3979 c
->bind_mounts
= NULL
;
3980 c
->n_bind_mounts
= 0;
3981 temporary_filesystem_free_many(c
->temporary_filesystems
, c
->n_temporary_filesystems
);
3982 c
->temporary_filesystems
= NULL
;
3983 c
->n_temporary_filesystems
= 0;
3985 cpu_set_reset(&c
->cpu_set
);
3986 numa_policy_reset(&c
->numa_policy
);
3988 c
->utmp_id
= mfree(c
->utmp_id
);
3989 c
->selinux_context
= mfree(c
->selinux_context
);
3990 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
3991 c
->smack_process_label
= mfree(c
->smack_process_label
);
3993 c
->syscall_filter
= hashmap_free(c
->syscall_filter
);
3994 c
->syscall_archs
= set_free(c
->syscall_archs
);
3995 c
->address_families
= set_free(c
->address_families
);
3997 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3998 c
->directories
[i
].paths
= strv_free(c
->directories
[i
].paths
);
/* Same value exec_context_init() assigns to log_level_max. */
4000 c
->log_level_max
= -1;
4002 exec_context_free_log_extra_fields(c
);
4004 c
->log_ratelimit_interval_usec
= 0;
4005 c
->log_ratelimit_burst
= 0;
4007 c
->stdin_data
= mfree(c
->stdin_data
);
4008 c
->stdin_data_size
= 0;
4010 c
->network_namespace_path
= mfree(c
->network_namespace_path
);
/* Removes the runtime directories configured in 'c' below 'runtime_prefix'.
 *
 * For every entry of c->directories[EXEC_DIRECTORY_RUNTIME].paths the full path is built with path_join(),
 * with an extra "private" component when exec_directory_is_private() reports the runtime directory as
 * private, and then deleted with rm_rf(REMOVE_ROOT). The result of rm_rf() is deliberately ignored.
 *
 * NOTE(review): the handling of a NULL 'runtime_prefix', of path_join() failures and the return value are
 * on lines not visible in this excerpt. */
4013 int exec_context_destroy_runtime_directory(const ExecContext
*c
, const char *runtime_prefix
) {
4018 if (!runtime_prefix
)
4021 STRV_FOREACH(i
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) {
4022 _cleanup_free_
char *p
;
4024 if (exec_directory_is_private(c
, EXEC_DIRECTORY_RUNTIME
))
4025 p
= path_join(runtime_prefix
, "private", *i
);
4027 p
= path_join(runtime_prefix
, *i
);
4031 /* We execute this synchronously, since we need to be sure this is gone when we start the
4033 (void) rm_rf(p
, REMOVE_ROOT
);
/* Releases the members of a single ExecCommand: c->path is passed to mfree() and c->argv to strv_free(), and
 * each member is overwritten with the value the respective helper returns. The ExecCommand object itself is not
 * freed by the visible code. */
4039 static void exec_command_done(ExecCommand
*c
) {
4042 c
->path
= mfree(c
->path
);
4043 c
->argv
= strv_free(c
->argv
);
/* Calls exec_command_done() on each of the n ExecCommand elements of the array starting at c. */
4046 void exec_command_done_array(ExecCommand
*c
, size_t n
) {
4049 for (i
= 0; i
< n
; i
++)
4050 exec_command_done(c
+i
);
/* Tears down the "command" list headed by c: entries are unlinked with LIST_REMOVE() and handed to
 * exec_command_done().
 * NOTE(review): the loop construct, the release of the entry itself and the returned pointer are not visible
 * here - presumably every entry is freed and NULL is returned; confirm. */
4053 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
4057 LIST_REMOVE(command
, c
, i
);
4058 exec_command_done(i
);
/* For each of the n list heads in c, runs exec_command_free_list() on the list and stores the value it returns
 * back into the slot. */
4065 void exec_command_free_array(ExecCommand
**c
, size_t n
) {
4068 for (i
= 0; i
< n
; i
++)
4069 c
[i
] = exec_command_free_list(c
[i
]);
/* Calls exec_status_reset() on the exec_status member of each of the n ExecCommand elements in array c. */
4072 void exec_command_reset_status_array(ExecCommand
*c
, size_t n
) {
4075 for (i
= 0; i
< n
; i
++)
4076 exec_status_reset(&c
[i
].exec_status
);
/* Like exec_command_reset_status_array(), but c is an array of n list heads: each list is walked with
 * LIST_FOREACH(command, ...) and exec_status_reset() is called on the exec_status of every entry. */
4079 void exec_command_reset_status_list_array(ExecCommand
**c
, size_t n
) {
4082 for (i
= 0; i
< n
; i
++) {
4085 LIST_FOREACH(command
, z
, c
[i
])
4086 exec_status_reset(&z
->exec_status
);
/* Context object passed as userdata to invalid_env(), which reads its "unit" and "path" members.
 * NOTE(review): the member declarations are not visible here - confirm their types. */
4090 typedef struct InvalidEnvInfo
{
/* Callback reporting an environment assignment that is being ignored.
 *
 * p is the offending assignment; userdata must point to an InvalidEnvInfo. The message is logged through
 * log_unit_error() for info->unit, with info->path printed after the assignment. */
4095 static void invalid_env(const char *p
, void *userdata
) {
4096 InvalidEnvInfo
*info
= userdata
;
4098 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
/* Returns the file descriptor name associated with one of the three stdio slots of c.
 *
 * For stdin the slot must be configured as EXEC_INPUT_NAMED_FD, for stdout/stderr as EXEC_OUTPUT_NAMED_FD; in
 * that case the matching c->stdio_fdname[] entry is returned, falling back to the literal "stdin", "stdout" or
 * "stderr" when the entry is unset.
 * NOTE(review): the dispatch on fd_index and the value returned for slots that are not named fds are not
 * visible here - presumably NULL; confirm. */
4101 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
4107 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
4110 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
4113 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
4116 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
4119 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
4122 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
/* Looks up the file descriptors that back the stdio slots configured as named fds.
 *
 * c            execution context; std_input/std_output/std_error select which slots are named fds
 * p            parameters providing the parallel arrays p->fds[] and p->fd_names[]
 * named_iofds  three-element result array indexed by STDIN_FILENO/STDOUT_FILENO/STDERR_FILENO; a slot is only
 *              filled while its current value is negative
 *
 * Returns 0 when targets is 0 at the end, -ENOENT otherwise. */
4129 static int exec_context_named_iofds(
4130 const ExecContext
*c
,
4131 const ExecParameters
*p
,
4132 int named_iofds
[static 3]) {
4135 const char* stdio_fdname
[3];
4140 assert(named_iofds
);
/* Number of stdio slots configured as named fds (each comparison contributes 0 or 1). */
4142 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
4143 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
4144 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
4146 for (i
= 0; i
< 3; i
++)
4147 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
/* The fd arrays hold the socket fds and the storage fds together. */
4149 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
/* Scan the passed fds until every target is resolved; at most one slot is assigned per fd because the
 * checks form an if/else-if chain.
 * NOTE(review): the statement that decrements targets is not visible here - confirm. */
4151 for (i
= 0; i
< n_fds
&& targets
> 0; i
++)
4152 if (named_iofds
[STDIN_FILENO
] < 0 &&
4153 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
4154 stdio_fdname
[STDIN_FILENO
] &&
4155 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
4157 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
4160 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
4161 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
4162 stdio_fdname
[STDOUT_FILENO
] &&
4163 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
4165 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
4168 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
4169 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
4170 stdio_fdname
[STDERR_FILENO
] &&
4171 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
4173 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
4177 return targets
== 0 ? 0 : -ENOENT
;
/* Builds an environment block from the files listed in c->environment_files.
 *
 * Each entry is checked with path_is_absolute(), expanded with safe_glob(), and every matching file is read
 * with load_env_file(). The loaded variables are cleaned with strv_env_clean_with_callback(), which reports
 * rejected assignments through invalid_env(), and then merged into the accumulated result with
 * strv_env_merge().
 * NOTE(review): how the "ignore" flag is set, the error paths and the hand-over of the result through l are not
 * visible here - confirm. */
4180 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***l
) {
4181 char **i
, **r
= NULL
;
4186 STRV_FOREACH(i
, c
->environment_files
) {
4190 bool ignore
= false;
4192 _cleanup_globfree_ glob_t pglob
= {};
4201 if (!path_is_absolute(fn
)) {
4209 /* Filename supports globbing, take all matching files */
4210 k
= safe_glob(fn
, 0, &pglob
);
4219 /* When we don't match anything, -ENOENT should be returned */
4220 assert(pglob
.gl_pathc
> 0);
4222 for (n
= 0; n
< pglob
.gl_pathc
; n
++) {
4223 k
= load_env_file(NULL
, pglob
.gl_pathv
[n
], &p
);
4231 /* Log invalid environment variables with filename */
4233 InvalidEnvInfo info
= {
4235 .path
= pglob
.gl_pathv
[n
]
4238 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
/* Combine what was collected so far (r) with the variables of this file (p). */
4246 m
= strv_env_merge(2, r
, p
);
/* Tells whether the terminal named by tty may be the device behind /dev/console.
 *
 * The name is first passed through skip_dev_prefix(). It is then compared with the literal "console" and with
 * the device reported by resolve_dev_console(). If that resolution fails the answer is true, i.e. the function
 * errs on the side of "may match". */
4262 static bool tty_may_match_dev_console(const char *tty
) {
4263 _cleanup_free_
char *resolved
= NULL
;
4268 tty
= skip_dev_prefix(tty
);
4270 /* trivial identity? */
4271 if (streq(tty
, "console"))
4274 if (resolve_dev_console(&resolved
) < 0)
4275 return true; /* if we could not resolve, assume it may */
4277 /* "tty0" means the active VC, so it may be the same sometimes */
4278 return path_equal(resolved
, tty
) || (streq(resolved
, "tty0") && tty_is_vc(tty
));
/* Returns true if the context is configured in a way that involves a TTY: one of the visible TTY flags
 * (tty_reset, tty_vt_disallocate) is set, or stdin, stdout or stderr is a terminal according to
 * is_terminal_input()/is_terminal_output().
 * NOTE(review): one operand of the || chain is not visible here - confirm the full list. */
4281 static bool exec_context_may_touch_tty(const ExecContext
*ec
) {
4284 return ec
->tty_reset
||
4286 ec
->tty_vt_disallocate
||
4287 is_terminal_input(ec
->std_input
) ||
4288 is_terminal_output(ec
->std_output
) ||
4289 is_terminal_output(ec
->std_error
);
/* Returns true if the context may touch a TTY (exec_context_may_touch_tty()) and the TTY path returned by
 * exec_context_tty_path() may refer to the console (tty_may_match_dev_console()). */
4292 bool exec_context_may_touch_console(const ExecContext
*ec
) {
4294 return exec_context_may_touch_tty(ec
) &&
4295 tty_may_match_dev_console(exec_context_tty_path(ec
));
/* Writes strings to f, each preceded by a single space; no newline is emitted by the visible code.
 * NOTE(review): the iteration is not visible here - presumably once per entry of l; confirm. */
4298 static void strv_fprintf(FILE *f
, char **l
) {
4304 fprintf(f
, " %s", *g
);
/* Writes a human-readable, line-oriented description of the execution context c to f.
 *
 * c       context to describe (only read)
 * f       output stream
 * prefix  string put in front of every line; it is first passed through strempty()
 *
 * Most settings are printed as "<prefix><Name>: <value>". Many of them are only printed when the corresponding
 * setting is configured, see the individual conditions below. */
4307 void exec_context_dump(const ExecContext
*c
, FILE* f
, const char *prefix
) {
4308 char **e
, **d
, buf_clean
[FORMAT_TIMESPAN_MAX
];
4309 ExecDirectoryType dt
;
4316 prefix
= strempty(prefix
);
/* Fixed block: working/root directory (shown as "/" when unset) plus the boolean and enum sandboxing
 * switches. */
4320 "%sWorkingDirectory: %s\n"
4321 "%sRootDirectory: %s\n"
4322 "%sNonBlocking: %s\n"
4323 "%sPrivateTmp: %s\n"
4324 "%sPrivateDevices: %s\n"
4325 "%sProtectKernelTunables: %s\n"
4326 "%sProtectKernelModules: %s\n"
4327 "%sProtectControlGroups: %s\n"
4328 "%sPrivateNetwork: %s\n"
4329 "%sPrivateUsers: %s\n"
4330 "%sProtectHome: %s\n"
4331 "%sProtectSystem: %s\n"
4332 "%sMountAPIVFS: %s\n"
4333 "%sIgnoreSIGPIPE: %s\n"
4334 "%sMemoryDenyWriteExecute: %s\n"
4335 "%sRestrictRealtime: %s\n"
4336 "%sRestrictSUIDSGID: %s\n"
4337 "%sKeyringMode: %s\n"
4338 "%sProtectHostname: %s\n",
4340 prefix
, c
->working_directory
? c
->working_directory
: "/",
4341 prefix
, c
->root_directory
? c
->root_directory
: "/",
4342 prefix
, yes_no(c
->non_blocking
),
4343 prefix
, yes_no(c
->private_tmp
),
4344 prefix
, yes_no(c
->private_devices
),
4345 prefix
, yes_no(c
->protect_kernel_tunables
),
4346 prefix
, yes_no(c
->protect_kernel_modules
),
4347 prefix
, yes_no(c
->protect_control_groups
),
4348 prefix
, yes_no(c
->private_network
),
4349 prefix
, yes_no(c
->private_users
),
4350 prefix
, protect_home_to_string(c
->protect_home
),
4351 prefix
, protect_system_to_string(c
->protect_system
),
4352 prefix
, yes_no(c
->mount_apivfs
),
4353 prefix
, yes_no(c
->ignore_sigpipe
),
4354 prefix
, yes_no(c
->memory_deny_write_execute
),
4355 prefix
, yes_no(c
->restrict_realtime
),
4356 prefix
, yes_no(c
->restrict_suid_sgid
),
4357 prefix
, exec_keyring_mode_to_string(c
->keyring_mode
),
4358 prefix
, yes_no(c
->protect_hostname
));
4361 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
/* One output line per entry of each environment-related string list. */
4363 STRV_FOREACH(e
, c
->environment
)
4364 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
4366 STRV_FOREACH(e
, c
->environment_files
)
4367 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
4369 STRV_FOREACH(e
, c
->pass_environment
)
4370 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
4372 STRV_FOREACH(e
, c
->unset_environment
)
4373 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
4375 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
/* Per directory type: the mode (printed in octal) followed by one line per configured path. The setting name
 * comes from exec_directory_type_to_string(). */
4377 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
4378 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
4380 STRV_FOREACH(d
, c
->directories
[dt
].paths
)
4381 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), *d
);
4385 "%sTimeoutCleanSec: %s\n",
4386 prefix
, format_timespan(buf_clean
, sizeof(buf_clean
), c
->timeout_clean_usec
, USEC_PER_SEC
));
4393 if (c
->oom_score_adjust_set
)
4395 "%sOOMScoreAdjust: %i\n",
4396 prefix
, c
->oom_score_adjust
);
/* Resource limits: the hard limit (rlim_max) as "Limit<name>" and the soft limit (rlim_cur) as
 * "Limit<name>Soft".
 * NOTE(review): c->rlimit[i] is dereferenced; a NULL guard is not visible here - confirm one exists. */
4398 for (i
= 0; i
< RLIM_NLIMITS
; i
++)
4400 fprintf(f
, "%sLimit%s: " RLIM_FMT
"\n",
4401 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
4402 fprintf(f
, "%sLimit%sSoft: " RLIM_FMT
"\n",
4403 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
/* I/O scheduling class and priority, both extracted from the combined c->ioprio value. */
4406 if (c
->ioprio_set
) {
4407 _cleanup_free_
char *class_str
= NULL
;
4409 r
= ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c
->ioprio
), &class_str
);
4411 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
4413 fprintf(f
, "%sIOPriority: %lu\n", prefix
, IOPRIO_PRIO_DATA(c
->ioprio
));
4416 if (c
->cpu_sched_set
) {
4417 _cleanup_free_
char *policy_str
= NULL
;
4419 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
4421 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
4424 "%sCPUSchedulingPriority: %i\n"
4425 "%sCPUSchedulingResetOnFork: %s\n",
4426 prefix
, c
->cpu_sched_priority
,
4427 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
4430 if (c
->cpu_set
.set
) {
4431 _cleanup_free_
char *affinity
= NULL
;
4433 affinity
= cpu_set_to_range_string(&c
->cpu_set
);
4434 fprintf(f
, "%sCPUAffinity: %s\n", prefix
, affinity
);
4437 if (mpol_is_valid(numa_policy_get_type(&c
->numa_policy
))) {
4438 _cleanup_free_
char *nodes
= NULL
;
4440 nodes
= cpu_set_to_range_string(&c
->numa_policy
.nodes
);
4441 fprintf(f
, "%sNUMAPolicy: %s\n", prefix
, mpol_to_string(numa_policy_get_type(&c
->numa_policy
)));
4442 fprintf(f
, "%sNUMAMask: %s\n", prefix
, strnull(nodes
));
4445 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
4446 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
4449 "%sStandardInput: %s\n"
4450 "%sStandardOutput: %s\n"
4451 "%sStandardError: %s\n",
4452 prefix
, exec_input_to_string(c
->std_input
),
4453 prefix
, exec_output_to_string(c
->std_output
),
4454 prefix
, exec_output_to_string(c
->std_error
));
/* Fd names and file paths backing stdin/stdout/stderr, each printed only for the matching input/output
 * mode. */
4456 if (c
->std_input
== EXEC_INPUT_NAMED_FD
)
4457 fprintf(f
, "%sStandardInputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDIN_FILENO
]);
4458 if (c
->std_output
== EXEC_OUTPUT_NAMED_FD
)
4459 fprintf(f
, "%sStandardOutputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDOUT_FILENO
]);
4460 if (c
->std_error
== EXEC_OUTPUT_NAMED_FD
)
4461 fprintf(f
, "%sStandardErrorFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDERR_FILENO
]);
4463 if (c
->std_input
== EXEC_INPUT_FILE
)
4464 fprintf(f
, "%sStandardInputFile: %s\n", prefix
, c
->stdio_file
[STDIN_FILENO
]);
4465 if (c
->std_output
== EXEC_OUTPUT_FILE
)
4466 fprintf(f
, "%sStandardOutputFile: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
4467 if (c
->std_output
== EXEC_OUTPUT_FILE_APPEND
)
4468 fprintf(f
, "%sStandardOutputFileToAppend: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
4469 if (c
->std_error
== EXEC_OUTPUT_FILE
)
4470 fprintf(f
, "%sStandardErrorFile: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
4471 if (c
->std_error
== EXEC_OUTPUT_FILE_APPEND
)
4472 fprintf(f
, "%sStandardErrorFileToAppend: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
4478 "%sTTYVHangup: %s\n"
4479 "%sTTYVTDisallocate: %s\n",
4480 prefix
, c
->tty_path
,
4481 prefix
, yes_no(c
->tty_reset
),
4482 prefix
, yes_no(c
->tty_vhangup
),
4483 prefix
, yes_no(c
->tty_vt_disallocate
));
/* Syslog facility and level are only printed when stdout or stderr is connected to one of the listed log
 * targets. */
4485 if (IN_SET(c
->std_output
,
4488 EXEC_OUTPUT_JOURNAL
,
4489 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
4490 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
4491 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
4492 IN_SET(c
->std_error
,
4495 EXEC_OUTPUT_JOURNAL
,
4496 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
4497 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
4498 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
4500 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
/* syslog_priority combines both values: the facility is taken from the bits above the lowest three, the
 * level via LOG_PRI(). */
4502 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
4504 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
4506 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
4508 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
/* A negative log_level_max is not printed. */
4511 if (c
->log_level_max
>= 0) {
4512 _cleanup_free_
char *t
= NULL
;
4514 (void) log_level_to_string_alloc(c
->log_level_max
, &t
);
4516 fprintf(f
, "%sLogLevelMax: %s\n", prefix
, strna(t
));
4519 if (c
->log_ratelimit_interval_usec
> 0) {
4520 char buf_timespan
[FORMAT_TIMESPAN_MAX
];
4523 "%sLogRateLimitIntervalSec: %s\n",
4524 prefix
, format_timespan(buf_timespan
, sizeof(buf_timespan
), c
->log_ratelimit_interval_usec
, USEC_PER_SEC
));
4527 if (c
->log_ratelimit_burst
> 0)
4528 fprintf(f
, "%sLogRateLimitBurst: %u\n", prefix
, c
->log_ratelimit_burst
);
/* Extra log fields are written with fwrite() using the explicit iov_len rather than as C strings. */
4530 if (c
->n_log_extra_fields
> 0) {
4533 for (j
= 0; j
< c
->n_log_extra_fields
; j
++) {
4534 fprintf(f
, "%sLogExtraFields: ", prefix
);
4535 fwrite(c
->log_extra_fields
[j
].iov_base
,
4536 1, c
->log_extra_fields
[j
].iov_len
,
/* Secure bits and capability sets are printed only when they differ from 0, CAP_ALL and 0 respectively. */
4542 if (c
->secure_bits
) {
4543 _cleanup_free_
char *str
= NULL
;
4545 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
4547 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
4550 if (c
->capability_bounding_set
!= CAP_ALL
) {
4551 _cleanup_free_
char *str
= NULL
;
4553 r
= capability_set_to_string_alloc(c
->capability_bounding_set
, &str
);
4555 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
4558 if (c
->capability_ambient_set
!= 0) {
4559 _cleanup_free_
char *str
= NULL
;
4561 r
= capability_set_to_string_alloc(c
->capability_ambient_set
, &str
);
4563 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
4567 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
4569 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
4571 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
4573 if (!strv_isempty(c
->supplementary_groups
)) {
4574 fprintf(f
, "%sSupplementaryGroups:", prefix
);
4575 strv_fprintf(f
, c
->supplementary_groups
);
4580 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
/* Path lists are printed on a single line each, via strv_fprintf(), and only when non-empty. */
4582 if (!strv_isempty(c
->read_write_paths
)) {
4583 fprintf(f
, "%sReadWritePaths:", prefix
);
4584 strv_fprintf(f
, c
->read_write_paths
);
4588 if (!strv_isempty(c
->read_only_paths
)) {
4589 fprintf(f
, "%sReadOnlyPaths:", prefix
);
4590 strv_fprintf(f
, c
->read_only_paths
);
4594 if (!strv_isempty(c
->inaccessible_paths
)) {
4595 fprintf(f
, "%sInaccessiblePaths:", prefix
);
4596 strv_fprintf(f
, c
->inaccessible_paths
);
/* Bind mounts are printed as "[-]source:destination:rbind|norbind"; the setting name depends on the
 * read_only flag and "-" marks ignore_enoent. */
4600 if (c
->n_bind_mounts
> 0)
4601 for (i
= 0; i
< c
->n_bind_mounts
; i
++)
4602 fprintf(f
, "%s%s: %s%s:%s:%s\n", prefix
,
4603 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
4604 c
->bind_mounts
[i
].ignore_enoent
? "-": "",
4605 c
->bind_mounts
[i
].source
,
4606 c
->bind_mounts
[i
].destination
,
4607 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
4609 if (c
->n_temporary_filesystems
> 0)
4610 for (i
= 0; i
< c
->n_temporary_filesystems
; i
++) {
4611 TemporaryFileSystem
*t
= c
->temporary_filesystems
+ i
;
4613 fprintf(f
, "%sTemporaryFileSystem: %s%s%s\n", prefix
,
4615 isempty(t
->options
) ? "" : ":",
4616 strempty(t
->options
));
4621 "%sUtmpIdentifier: %s\n",
4622 prefix
, c
->utmp_id
);
/* Security labels: a leading "-" is printed when the corresponding *_ignore flag is set. */
4624 if (c
->selinux_context
)
4626 "%sSELinuxContext: %s%s\n",
4627 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
4629 if (c
->apparmor_profile
)
4631 "%sAppArmorProfile: %s%s\n",
4632 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
4634 if (c
->smack_process_label
)
4636 "%sSmackProcessLabel: %s%s\n",
4637 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
4639 if (c
->personality
!= PERSONALITY_INVALID
)
4641 "%sPersonality: %s\n",
4642 prefix
, strna(personality_to_string(c
->personality
)));
4645 "%sLockPersonality: %s\n",
4646 prefix
, yes_no(c
->lock_personality
));
/* System call filter: each hashmap entry is printed as the syscall name, optionally followed by ":" and an
 * errno name or number. The stored key is the syscall number plus one, hence the "- 1". */
4648 if (c
->syscall_filter
) {
4656 "%sSystemCallFilter: ",
4659 if (!c
->syscall_whitelist
)
4663 HASHMAP_FOREACH_KEY(val
, id
, c
->syscall_filter
, j
) {
4664 _cleanup_free_
char *name
= NULL
;
4665 const char *errno_name
= NULL
;
4666 int num
= PTR_TO_INT(val
);
4673 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
4674 fputs(strna(name
), f
);
4677 errno_name
= errno_to_name(num
);
4679 fprintf(f
, ":%s", errno_name
);
4681 fprintf(f
, ":%d", num
);
/* Architectures are likewise stored with an offset of one. */
4689 if (c
->syscall_archs
) {
4696 "%sSystemCallArchitectures:",
4700 SET_FOREACH(id
, c
->syscall_archs
, j
)
4701 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
4706 if (exec_context_restrict_namespaces_set(c
)) {
4707 _cleanup_free_
char *s
= NULL
;
4709 r
= namespace_flags_to_string(c
->restrict_namespaces
, &s
);
4711 fprintf(f
, "%sRestrictNamespaces: %s\n",
4715 if (c
->network_namespace_path
)
4717 "%sNetworkNamespacePath: %s\n",
4718 prefix
, c
->network_namespace_path
);
/* The default syscall error is printed by name when errno_to_name() knows it, numerically otherwise.
 * NOTE(review): the condition choosing between the two fprintf() calls is not visible here - confirm. */
4720 if (c
->syscall_errno
> 0) {
4721 const char *errno_name
;
4723 fprintf(f
, "%sSystemCallErrorNumber: ", prefix
);
4725 errno_name
= errno_to_name(c
->syscall_errno
);
4727 fprintf(f
, "%s\n", errno_name
);
4729 fprintf(f
, "%d\n", c
->syscall_errno
);
/* Tells whether processes started with this context keep their privileges. The visible check treats a
 * configured user of "root" or "0" as privileged.
 * NOTE(review): the handling of an unset c->user is not visible here - confirm it is checked before streq(). */
4733 bool exec_context_maintains_privileges(const ExecContext
*c
) {
4736 /* Returns true if the process forked off would run under
4737 * an unchanged UID or as root. */
4742 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
/* Returns the I/O priority that applies to this context.
 *
 * The visible code queries the priority of the calling process with ioprio_get(IOPRIO_WHO_PROCESS, 0) and has
 * a fallback value of best-effort class, priority 4.
 * NOTE(review): the conditions selecting between a configured value, the queried value and the fallback are
 * not visible here - confirm. */
4748 int exec_context_get_effective_ioprio(const ExecContext
*c
) {
4756 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
4758 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 4);
/* Drops all extra log fields of c: frees the iov_base buffer of each of the n_log_extra_fields entries, then
 * the log_extra_fields array itself (via mfree(), whose return value is stored back), and resets the count
 * to 0. */
4763 void exec_context_free_log_extra_fields(ExecContext
*c
) {
4768 for (l
= 0; l
< c
->n_log_extra_fields
; l
++)
4769 free(c
->log_extra_fields
[l
].iov_base
);
4770 c
->log_extra_fields
= mfree(c
->log_extra_fields
);
4771 c
->n_log_extra_fields
= 0;
/* Puts the TTY used by the context back into a default state.
 *
 * The TTY is reset through exec_context_tty_reset(c, NULL). If the context may touch a TTY at all, the mode and
 * ownership of the path from exec_context_tty_path() are then set to TTY_MODE, uid 0 and TTY_GID with
 * chmod_and_chown(). Failures other than -ENOENT are logged as warnings and otherwise ignored. */
4774 void exec_context_revert_tty(ExecContext
*c
) {
4779 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
4780 exec_context_tty_reset(c
, NULL
);
4782 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
4783 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
4784 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
4786 if (exec_context_may_touch_tty(c
)) {
4789 path
= exec_context_tty_path(c
);
4791 r
= chmod_and_chown(path
, TTY_MODE
, 0, TTY_GID
);
4792 if (r
< 0 && r
!= -ENOENT
)
4793 log_warning_errno(r
, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path
);
/* Collects the directories to remove when cleaning up the resources of a context.
 *
 * For every directory type whose bit (1U << t) is set in mask, each configured path is joined with prefix[t]
 * and appended to the result list with strv_consume(). For all types except EXEC_DIRECTORY_CONFIGURATION the
 * variant below the "private" subdirectory is appended as well.
 * NOTE(review): the parameter list, the error handling and the way the list is returned are not visible
 * here - confirm. */
4798 int exec_context_get_clean_directories(
4804 _cleanup_strv_free_
char **l
= NULL
;
4805 ExecDirectoryType t
;
4812 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
4815 if (!FLAGS_SET(mask
, 1U << t
))
4821 STRV_FOREACH(i
, c
->directories
[t
].paths
) {
4824 j
= path_join(prefix
[t
], *i
);
4828 r
= strv_consume(&l
, j
);
4832 /* Also remove private directories unconditionally. */
4833 if (t
!= EXEC_DIRECTORY_CONFIGURATION
) {
4834 j
= path_join(prefix
[t
], "private", *i
);
4838 r
= strv_consume(&l
, j
);
/* Computes an ExecCleanMask for c by checking, for every directory type, whether any path is configured
 * (c->directories[t].paths is non-empty).
 * NOTE(review): the statement that sets the bit and the assignment to *ret are not visible here - presumably
 * bit (1U << t) is set for each non-empty type; confirm. */
4849 int exec_context_get_clean_mask(ExecContext
*c
, ExecCleanMask
*ret
) {
4850 ExecCleanMask mask
= 0;
4855 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
4856 if (!strv_isempty(c
->directories
[t
].paths
))
/* Records the start of a process in s: the start timestamp is taken with dual_timestamp_get().
 * NOTE(review): how pid is stored and whether other members are cleared is not visible here - confirm. */
4863 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
4870 dual_timestamp_get(&s
->start_timestamp
);
/* Records the termination of process pid in s.
 *
 * s->pid is compared with pid first, the exit timestamp is taken with dual_timestamp_get(), and if a context
 * with a utmp_id is given the process is reported as dead through utmp_put_dead_process(); the result of that
 * call is ignored.
 * NOTE(review): what happens when s->pid differs from pid and where code/status are stored is not visible
 * here - confirm. */
4873 void exec_status_exit(ExecStatus
*s
, const ExecContext
*context
, pid_t pid
, int code
, int status
) {
4876 if (s
->pid
!= pid
) {
4882 dual_timestamp_get(&s
->exit_timestamp
);
4887 if (context
&& context
->utmp_id
)
4888 (void) utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
/* Resets *s to an all-zero ExecStatus by assigning an empty compound literal. */
4891 void exec_status_reset(ExecStatus
*s
) {
4894 *s
= (ExecStatus
) {};
/* Writes a human-readable description of s to f, every line prefixed with prefix (first passed through
 * strempty()).
 *
 * The PID is printed; the start timestamp only if it is set, and the exit timestamp together with exit code
 * and status only if the exit timestamp is set. Both timestamps use the realtime value, and both are
 * formatted into the same local buffer, each in its own output call. */
4897 void exec_status_dump(const ExecStatus
*s
, FILE *f
, const char *prefix
) {
4898 char buf
[FORMAT_TIMESTAMP_MAX
];
4906 prefix
= strempty(prefix
);
4909 "%sPID: "PID_FMT
"\n",
4912 if (dual_timestamp_is_set(&s
->start_timestamp
))
4914 "%sStart Timestamp: %s\n",
4915 prefix
, format_timestamp(buf
, sizeof(buf
), s
->start_timestamp
.realtime
));
4917 if (dual_timestamp_is_set(&s
->exit_timestamp
))
4919 "%sExit Timestamp: %s\n"
4921 "%sExit Status: %i\n",
4922 prefix
, format_timestamp(buf
, sizeof(buf
), s
->exit_timestamp
.realtime
),
4923 prefix
, sigchld_code_to_string(s
->code
),
/* Builds a single string from the argument vector argv for display purposes.
 *
 * argv is traversed twice with STRV_FOREACH(), and arguments containing a WHITESPACE character (strpbrk()) are
 * treated specially. The existing FIXME notes that arguments containing both spaces and ticks are not handled
 * properly.
 * NOTE(review): the size computation, the quoting characters and the ownership of the returned string are not
 * visible here - presumably a heap string the caller frees; confirm. */
4927 static char *exec_command_line(char **argv
) {
4935 STRV_FOREACH(a
, argv
)
4943 STRV_FOREACH(a
, argv
) {
4950 if (strpbrk(*a
, WHITESPACE
)) {
4961 /* FIXME: this doesn't really handle arguments that have
4962 * spaces and ticks in them */
/* Writes a description of a single command to f: the command line built by exec_command_line(), followed by
 * the command's exec_status, which is dumped with prefix plus a tab as its prefix.
 *
 * If exec_command_line() returns NULL, the text for ENOMEM from strerror_safe() is printed in place of the
 * command line. */
4967 static void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4968 _cleanup_free_
char *cmd
= NULL
;
4969 const char *prefix2
;
4974 prefix
= strempty(prefix
);
4975 prefix2
= strjoina(prefix
, "\t");
4977 cmd
= exec_command_line(c
->argv
);
4979 "%sCommand Line: %s\n",
4980 prefix
, cmd
? cmd
: strerror_safe(ENOMEM
));
4982 exec_status_dump(&c
->exec_status
, f
, prefix2
);
/* Dumps every command of the "command" list starting at c to f with exec_command_dump(). The parameter c
 * itself is reused as the iteration variable of LIST_FOREACH(). */
4985 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4988 prefix
= strempty(prefix
);
4990 LIST_FOREACH(command
, c
, c
)
4991 exec_command_dump(c
, f
, prefix
);
/* Appends command e to the end of the "command" list *l: the current tail is located with LIST_FIND_TAIL() and
 * e is inserted after it, so the existing order is preserved.
 * NOTE(review): the handling of an empty list is not visible here - confirm. */
4994 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
5001 /* It's kind of important, that we keep the order here */
5002 LIST_FIND_TAIL(command
, *l
, end
);
5003 LIST_INSERT_AFTER(command
, *l
, end
, e
);
/* Replaces the path and argument vector of command c.
 *
 * The new argv is built from path and the variadic arguments with strv_new_ap(). c->path is then replaced via
 * free_and_replace() and c->argv via strv_free_and_replace(), whose result is returned.
 * NOTE(review): how the new path string p is obtained and the error paths are not visible here - confirm. */
5008 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
5016 l
= strv_new_ap(path
, ap
);
5028 free_and_replace(c
->path
, p
);
5030 return strv_free_and_replace(c
->argv
, l
);
/* Appends arguments to the argv of command c: a temporary string vector is built from path and the variadic
 * arguments with strv_new_ap() and added to c->argv with strv_extend_strv(); the final false argument is that
 * call's flag (presumably "do not filter duplicates" - confirm). The temporary vector is released
 * automatically through _cleanup_strv_free_. */
5033 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
5034 _cleanup_strv_free_
char **l
= NULL
;
5042 l
= strv_new_ap(path
, ap
);
5048 r
= strv_extend_strv(&c
->argv
, l
, false);
/* Job function that removes the directory tree named by p with rm_rf(REMOVE_ROOT|REMOVE_PHYSICAL); the result
 * is ignored. The function takes ownership of p: it is assigned to a _cleanup_free_ variable and therefore
 * freed when the function returns. */
5055 static void *remove_tmpdir_thread(void *p
) {
5056 _cleanup_free_
char *path
= p
;
5058 (void) rm_rf(path
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
/* Releases an ExecRuntime object.
 *
 * The object is removed from the manager's exec_runtime_by_id hashmap. When destroy is true, tmp_dir and
 * var_tmp_dir are handed to asynchronous_job(remove_tmpdir_thread, ...) for removal; remove_tmpdir_thread()
 * frees the string it receives. Finally id, tmp_dir and var_tmp_dir are freed with mfree() and the netns
 * storage socket pair is closed.
 * NOTE(review): the NULL check on rt, the conditions around the warnings and the returned pointer are not
 * visible here - confirm. */
5062 static ExecRuntime
* exec_runtime_free(ExecRuntime
*rt
, bool destroy
) {
5069 (void) hashmap_remove(rt
->manager
->exec_runtime_by_id
, rt
->id
);
5071 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5072 if (destroy
&& rt
->tmp_dir
) {
5073 log_debug("Spawning thread to nuke %s", rt
->tmp_dir
);
5075 r
= asynchronous_job(remove_tmpdir_thread
, rt
->tmp_dir
);
5077 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->tmp_dir
);
5084 if (destroy
&& rt
->var_tmp_dir
) {
5085 log_debug("Spawning thread to nuke %s", rt
->var_tmp_dir
);
5087 r
= asynchronous_job(remove_tmpdir_thread
, rt
->var_tmp_dir
);
/* Next to the warning the string is freed here, and the pointer is cleared afterwards so that the mfree()
 * below does not free it a second time. */
5089 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->var_tmp_dir
);
5090 free(rt
->var_tmp_dir
);
5093 rt
->var_tmp_dir
= NULL
;
5096 rt
->id
= mfree(rt
->id
);
5097 rt
->tmp_dir
= mfree(rt
->tmp_dir
);
5098 rt
->var_tmp_dir
= mfree(rt
->var_tmp_dir
);
5099 safe_close_pair(rt
->netns_storage_socket
);
/* Cleanup helper for use with _cleanup_(): frees *rt through exec_runtime_free() with destroy set to false,
 * i.e. without requesting removal of the temporary directories. */
5103 static void exec_runtime_freep(ExecRuntime
**rt
) {
5104 (void) exec_runtime_free(*rt
, false);
/* Allocates a new ExecRuntime with new() and initialises it from a compound literal in which both
 * netns_storage_socket entries are -1.
 * NOTE(review): the allocation failure check, the hand-over through *ret and the return value are not visible
 * here - confirm. */
5107 static int exec_runtime_allocate(ExecRuntime
**ret
) {
5112 n
= new(ExecRuntime
, 1);
5116 *n
= (ExecRuntime
) {
5117 .netns_storage_socket
= { -1, -1 },
/* Creates an ExecRuntime and registers it in m->exec_runtime_by_id under its id.
 *
 * tmp_dir              private /tmp directory path, copied with strdup()
 * var_tmp_dir          private /var/tmp directory path, copied with strdup(); asserted to be set in the branch
 *                      that handles tmp_dir
 * netns_storage_socket optional socket pair; when given, both fd numbers are copied into the object (the fds
 *                      are not duplicated)
 * ret                  optional output for the created object (one visible caller passes NULL)
 *
 * The new object is held in a _cleanup_(exec_runtime_freep) variable while it is being set up.
 * NOTE(review): the first parameters (presumably the Manager m and the id) and the error returns are not
 * visible here - confirm. */
5124 static int exec_runtime_add(
5127 const char *tmp_dir
,
5128 const char *var_tmp_dir
,
5129 const int netns_storage_socket
[2],
5130 ExecRuntime
**ret
) {
5132 _cleanup_(exec_runtime_freep
) ExecRuntime
*rt
= NULL
;
5138 r
= hashmap_ensure_allocated(&m
->exec_runtime_by_id
, &string_hash_ops
);
5142 r
= exec_runtime_allocate(&rt
);
5146 rt
->id
= strdup(id
);
5151 rt
->tmp_dir
= strdup(tmp_dir
);
5155 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5156 assert(var_tmp_dir
);
5157 rt
->var_tmp_dir
= strdup(var_tmp_dir
);
5158 if (!rt
->var_tmp_dir
)
5162 if (netns_storage_socket
) {
5163 rt
->netns_storage_socket
[0] = netns_storage_socket
[0];
5164 rt
->netns_storage_socket
[1] = netns_storage_socket
[1];
/* The hashmap key is the object's own id string. */
5167 r
= hashmap_put(m
->exec_runtime_by_id
, rt
->id
, rt
);
5176 /* do not remove created ExecRuntime object when the operation succeeds. */
/* Creates the runtime resources needed by context c and registers them as an ExecRuntime under id.
 *
 * Nothing needs to be created when none of private_network, private_tmp and network_namespace_path is set.
 * With private_tmp the temporary directories are set up with setup_tmp_dirs(). With private_network or
 * network_namespace_path an AF_UNIX datagram socket pair is created as the netns storage socket. The result is
 * passed to exec_runtime_add(); afterwards the local socket variables are set to -1 so that
 * _cleanup_close_pair_ does not close them.
 * NOTE(review): the return values (including the "nothing to create" case) are not visible here - confirm. */
5181 static int exec_runtime_make(Manager
*m
, const ExecContext
*c
, const char *id
, ExecRuntime
**ret
) {
5182 _cleanup_free_
char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
5183 _cleanup_close_pair_
int netns_storage_socket
[2] = { -1, -1 };
5190 /* It is not necessary to create ExecRuntime object. */
5191 if (!c
->private_network
&& !c
->private_tmp
&& !c
->network_namespace_path
)
5194 if (c
->private_tmp
) {
5195 r
= setup_tmp_dirs(id
, &tmp_dir
, &var_tmp_dir
);
5200 if (c
->private_network
|| c
->network_namespace_path
) {
5201 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, netns_storage_socket
) < 0)
5205 r
= exec_runtime_add(m
, id
, tmp_dir
, var_tmp_dir
, netns_storage_socket
, ret
);
5210 netns_storage_socket
[0] = netns_storage_socket
[1] = -1;
/* Obtains the ExecRuntime registered under id in m->exec_runtime_by_id. According to the existing comments an
 * existing object is reused with its reference count increased; otherwise a new one is created with
 * exec_runtime_make(), which returns 0 when no object is needed.
 * NOTE(review): the use of the create flag and the return values are not visible here - confirm. */
5214 int exec_runtime_acquire(Manager
*m
, const ExecContext
*c
, const char *id
, bool create
, ExecRuntime
**ret
) {
5222 rt
= hashmap_get(m
->exec_runtime_by_id
, id
);
5224 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5230 /* If not found, then create a new object. */
5231 r
= exec_runtime_make(m
, c
, id
, &rt
);
5233 /* When r == 0, it is not necessary to create ExecRuntime object. */
5237 /* increment reference counter. */
/* Drops a reference to rt; n_ref is asserted to be positive. On the visible return path the object is released
 * through exec_runtime_free(rt, destroy) and that call's result is returned.
 * NOTE(review): the decrement and the path taken while references remain are not visible here - confirm. */
5243 ExecRuntime
*exec_runtime_unref(ExecRuntime
*rt
, bool destroy
) {
5247 assert(rt
->n_ref
> 0);
5253 return exec_runtime_free(rt
, destroy
);
/* Serializes all ExecRuntime objects of manager m to f.
 *
 * Each object is written as "exec-runtime=<id>" followed by optional " tmp-dir=", " var-tmp-dir=",
 * " netns-socket-0=" and " netns-socket-1=" fields. The socket fields are only written for fds that are >= 0;
 * each such fd is duplicated into fds with fdset_put_dup() and the number of the copy is written. */
5256 int exec_runtime_serialize(const Manager
*m
, FILE *f
, FDSet
*fds
) {
5264 HASHMAP_FOREACH(rt
, m
->exec_runtime_by_id
, i
) {
5265 fprintf(f
, "exec-runtime=%s", rt
->id
);
5268 fprintf(f
, " tmp-dir=%s", rt
->tmp_dir
);
5270 if (rt
->var_tmp_dir
)
5271 fprintf(f
, " var-tmp-dir=%s", rt
->var_tmp_dir
);
5273 if (rt
->netns_storage_socket
[0] >= 0) {
5276 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
5280 fprintf(f
, " netns-socket-0=%i", copy
);
5283 if (rt
->netns_storage_socket
[1] >= 0) {
5286 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
5290 fprintf(f
, " netns-socket-1=%i", copy
);
/* Applies one key/value pair of the old per-unit serialization format to the ExecRuntime of unit u.
 *
 * The object is looked up by u->id in the manager's exec_runtime_by_id hashmap; if none exists a new one is
 * allocated and given a copy of u->id. The recognised keys are "tmp-dir", "var-tmp-dir", "netns-socket-0" and
 * "netns-socket-1". Directory values are copied with strdup(). For socket values the number is parsed with
 * safe_atoi(), must be contained in fds, and the fd is then taken out of fds with fdset_remove() after the
 * previously stored fd has been closed. A newly created object is finally put into the hashmap and bound to
 * u->manager.
 * NOTE(review): the return values and the point where rt is set to rt_create are not visible here - confirm. */
5299 int exec_runtime_deserialize_compat(Unit
*u
, const char *key
, const char *value
, FDSet
*fds
) {
5300 _cleanup_(exec_runtime_freep
) ExecRuntime
*rt_create
= NULL
;
5304 /* This is for the migration from old (v237 or earlier) deserialization text.
5305 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5306 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5307 * so or not from the serialized text, then we always creates a new object owned by this. */
5313 /* Manager manages ExecRuntime objects by the unit id.
5314 * So, we omit the serialized text when the unit does not have id (yet?)... */
5315 if (isempty(u
->id
)) {
5316 log_unit_debug(u
, "Invocation ID not found. Dropping runtime parameter.");
5320 r
= hashmap_ensure_allocated(&u
->manager
->exec_runtime_by_id
, &string_hash_ops
);
5322 log_unit_debug_errno(u
, r
, "Failed to allocate storage for runtime parameter: %m");
5326 rt
= hashmap_get(u
->manager
->exec_runtime_by_id
, u
->id
);
5328 r
= exec_runtime_allocate(&rt_create
);
5332 rt_create
->id
= strdup(u
->id
);
5339 if (streq(key
, "tmp-dir")) {
5342 copy
= strdup(value
);
5346 free_and_replace(rt
->tmp_dir
, copy
);
5348 } else if (streq(key
, "var-tmp-dir")) {
5351 copy
= strdup(value
);
5355 free_and_replace(rt
->var_tmp_dir
, copy
);
5357 } else if (streq(key
, "netns-socket-0")) {
5360 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
)) {
5361 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
5365 safe_close(rt
->netns_storage_socket
[0]);
5366 rt
->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
5368 } else if (streq(key
, "netns-socket-1")) {
5371 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
)) {
5372 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
5376 safe_close(rt
->netns_storage_socket
[1]);
5377 rt
->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
5381 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5383 r
= hashmap_put(u
->manager
->exec_runtime_by_id
, rt_create
->id
, rt_create
);
5385 log_unit_debug_errno(u
, r
, "Failed to put runtime parameter to manager's storage: %m");
5389 rt_create
->manager
= u
->manager
;
/* Parses one serialized "exec-runtime=" value, as written by exec_runtime_serialize(), and registers the
 * resulting object with exec_runtime_add().
 *
 * The value starts with the id, followed by optional space-separated "tmp-dir=", "var-tmp-dir=",
 * "netns-socket-0=" and "netns-socket-1=" fields. Each token extends up to the next space (strcspn()) and is
 * copied with strndupa(). Socket numbers are parsed with safe_atoi(), must be contained in fds, and are then
 * taken out of fds with fdset_remove(). Failures are only logged at debug level; the function returns
 * nothing. */
5398 void exec_runtime_deserialize_one(Manager
*m
, const char *value
, FDSet
*fds
) {
5399 char *id
= NULL
, *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
5400 int r
, fd0
= -1, fd1
= -1;
5401 const char *p
, *v
= value
;
5408 n
= strcspn(v
, " ");
5409 id
= strndupa(v
, n
);
5414 v
= startswith(p
, "tmp-dir=");
5416 n
= strcspn(v
, " ");
5417 tmp_dir
= strndupa(v
, n
);
5423 v
= startswith(p
, "var-tmp-dir=");
5425 n
= strcspn(v
, " ");
5426 var_tmp_dir
= strndupa(v
, n
);
5432 v
= startswith(p
, "netns-socket-0=");
5436 n
= strcspn(v
, " ");
5437 buf
= strndupa(v
, n
);
5438 if (safe_atoi(buf
, &fd0
) < 0 || !fdset_contains(fds
, fd0
)) {
5439 log_debug("Unable to process exec-runtime netns fd specification.");
5442 fd0
= fdset_remove(fds
, fd0
);
5448 v
= startswith(p
, "netns-socket-1=");
5452 n
= strcspn(v
, " ");
5453 buf
= strndupa(v
, n
);
5454 if (safe_atoi(buf
, &fd1
) < 0 || !fdset_contains(fds
, fd1
)) {
5455 log_debug("Unable to process exec-runtime netns fd specification.");
5458 fd1
= fdset_remove(fds
, fd1
);
/* The two fds are passed as a temporary array; no output pointer is requested (NULL). */
5463 r
= exec_runtime_add(m
, id
, tmp_dir
, var_tmp_dir
, (int[]) { fd0
, fd1
}, NULL
);
5465 log_debug_errno(r
, "Failed to add exec-runtime: %m");
/* Walks all ExecRuntime objects of manager m and frees entries with exec_runtime_free(rt, false), i.e. without
 * removing their temporary directories.
 * NOTE(review): the test that skips objects still referenced is not visible here - confirm. */
5468 void exec_runtime_vacuum(Manager
*m
) {
5474 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5476 HASHMAP_FOREACH(rt
, m
->exec_runtime_by_id
, i
) {
5480 (void) exec_runtime_free(rt
, false);
/* Releases data owned by the ExecParameters p: the environment string vector is freed with strv_free(). The
 * member is not reset to NULL by the visible code. */
5484 void exec_params_clear(ExecParameters
*p
) {
5488 strv_free(p
->environment
);
/* String names of the ExecInput values, indexed by enum value. DEFINE_STRING_TABLE_LOOKUP() generates the
 * exec_input lookup helpers from this table (e.g. exec_input_to_string(), used by exec_context_dump()). */
5491 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
5492 [EXEC_INPUT_NULL
] = "null",
5493 [EXEC_INPUT_TTY
] = "tty",
5494 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
5495 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
5496 [EXEC_INPUT_SOCKET
] = "socket",
5497 [EXEC_INPUT_NAMED_FD
] = "fd",
5498 [EXEC_INPUT_DATA
] = "data",
5499 [EXEC_INPUT_FILE
] = "file",
5502 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
/* String names of the ExecOutput values, indexed by enum value. DEFINE_STRING_TABLE_LOOKUP() generates the
 * exec_output lookup helpers from this table (e.g. exec_output_to_string(), used by exec_context_dump()). */
5504 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
5505 [EXEC_OUTPUT_INHERIT
] = "inherit",
5506 [EXEC_OUTPUT_NULL
] = "null",
5507 [EXEC_OUTPUT_TTY
] = "tty",
5508 [EXEC_OUTPUT_SYSLOG
] = "syslog",
5509 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE
] = "syslog+console",
5510 [EXEC_OUTPUT_KMSG
] = "kmsg",
5511 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
5512 [EXEC_OUTPUT_JOURNAL
] = "journal",
5513 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
5514 [EXEC_OUTPUT_SOCKET
] = "socket",
5515 [EXEC_OUTPUT_NAMED_FD
] = "fd",
5516 [EXEC_OUTPUT_FILE
] = "file",
5517 [EXEC_OUTPUT_FILE_APPEND
] = "append",
5520 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
/* String names of the ExecUtmpMode values, indexed by enum value; DEFINE_STRING_TABLE_LOOKUP() generates the
 * exec_utmp_mode lookup helpers from this table. */
5522 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
5523 [EXEC_UTMP_INIT
] = "init",
5524 [EXEC_UTMP_LOGIN
] = "login",
5525 [EXEC_UTMP_USER
] = "user",
5528 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
/* String names of the ExecPreserveMode values, indexed by enum value. The lookup helpers are generated with
 * the _WITH_BOOLEAN variant of the macro, passing EXEC_PRESERVE_YES (presumably the value a boolean "true"
 * string is mapped to - confirm against the macro definition). */
5530 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
5531 [EXEC_PRESERVE_NO
] = "no",
5532 [EXEC_PRESERVE_YES
] = "yes",
5533 [EXEC_PRESERVE_RESTART
] = "restart",
5536 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
5538 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
/* exec_context_dump() uses these names both as "<Name>" and as the stem of "<Name>Mode". */
5539 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
5540 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
5541 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
5542 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
5543 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
5544 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
5547 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);
5549 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
5550 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
5551 * directories, specifically .timer units with their timestamp touch file. */
/* Indexed by the same enum as exec_directory_type_table, but holding lower-case generic names. */
5552 static const char* const exec_resource_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
5553 [EXEC_DIRECTORY_RUNTIME
] = "runtime",
5554 [EXEC_DIRECTORY_STATE
] = "state",
5555 [EXEC_DIRECTORY_CACHE
] = "cache",
5556 [EXEC_DIRECTORY_LOGS
] = "logs",
5557 [EXEC_DIRECTORY_CONFIGURATION
] = "configuration",
5560 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type
, ExecDirectoryType
);
5562 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
5563 * the service payload in. */
/* Only a private to-string helper is generated for this table (DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING). */
5564 static const char* const exec_directory_env_name_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
5565 [EXEC_DIRECTORY_RUNTIME
] = "RUNTIME_DIRECTORY",
5566 [EXEC_DIRECTORY_STATE
] = "STATE_DIRECTORY",
5567 [EXEC_DIRECTORY_CACHE
] = "CACHE_DIRECTORY",
5568 [EXEC_DIRECTORY_LOGS
] = "LOGS_DIRECTORY",
5569 [EXEC_DIRECTORY_CONFIGURATION
] = "CONFIGURATION_DIRECTORY",
5572 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name
, ExecDirectoryType
);
/* String names of the ExecKeyringMode values, indexed by enum value. DEFINE_STRING_TABLE_LOOKUP() generates
 * the exec_keyring_mode lookup helpers from this table (e.g. exec_keyring_mode_to_string(), used by
 * exec_context_dump()). */
5574 static const char* const exec_keyring_mode_table
[_EXEC_KEYRING_MODE_MAX
] = {
5575 [EXEC_KEYRING_INHERIT
] = "inherit",
5576 [EXEC_KEYRING_PRIVATE
] = "private",
5577 [EXEC_KEYRING_SHARED
] = "shared",
5580 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode
, ExecKeyringMode
);