1 /* SPDX-License-Identifier: LGPL-2.1+ */
10 #include <sys/capability.h>
11 #include <sys/eventfd.h>
13 #include <sys/personality.h>
14 #include <sys/prctl.h>
16 #include <sys/socket.h>
18 #include <sys/types.h>
24 #include <security/pam_appl.h>
28 #include <selinux/selinux.h>
36 #include <sys/apparmor.h>
39 #include "sd-messages.h"
42 #include "alloc-util.h"
44 #include "apparmor-util.h"
49 #include "capability-util.h"
50 #include "chown-recursive.h"
51 #include "cpu-set-util.h"
55 #include "errno-list.h"
57 #include "exit-status.h"
59 #include "format-util.h"
61 #include "glob-util.h"
70 #include "namespace.h"
71 #include "parse-util.h"
72 #include "path-util.h"
73 #include "process-util.h"
74 #include "rlimit-util.h"
77 #include "seccomp-util.h"
79 #include "securebits.h"
80 #include "securebits-util.h"
81 #include "selinux-util.h"
82 #include "signal-util.h"
83 #include "smack-util.h"
84 #include "socket-util.h"
86 #include "stat-util.h"
87 #include "string-table.h"
88 #include "string-util.h"
90 #include "syslog-util.h"
91 #include "terminal-util.h"
92 #include "umask-util.h"
94 #include "user-util.h"
96 #include "utmp-wtmp.h"
98 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
99 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
101 /* This assumes there is a 'tty' group */
102 #define TTY_MODE 0620
104 #define SNDBUF_SIZE (8*1024*1024)
106 static int shift_fds(int fds
[], size_t n_fds
) {
107 int start
, restart_from
;
112 /* Modifies the fds array! (sorts it) */
122 for (i
= start
; i
< (int) n_fds
; i
++) {
125 /* Already at right index? */
129 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
136 /* Hmm, the fd we wanted isn't free? Then
137 * let's remember that and try again from here */
138 if (nfd
!= i
+3 && restart_from
< 0)
142 if (restart_from
< 0)
145 start
= restart_from
;
151 static int flags_fds(const int fds
[], size_t n_socket_fds
, size_t n_storage_fds
, bool nonblock
) {
155 n_fds
= n_socket_fds
+ n_storage_fds
;
161 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
162 * O_NONBLOCK only applies to socket activation though. */
164 for (i
= 0; i
< n_fds
; i
++) {
166 if (i
< n_socket_fds
) {
167 r
= fd_nonblock(fds
[i
], nonblock
);
172 /* We unconditionally drop FD_CLOEXEC from the fds,
173 * since after all we want to pass these fds to our
176 r
= fd_cloexec(fds
[i
], false);
184 static const char *exec_context_tty_path(const ExecContext
*context
) {
187 if (context
->stdio_as_fds
)
190 if (context
->tty_path
)
191 return context
->tty_path
;
193 return "/dev/console";
196 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
201 path
= exec_context_tty_path(context
);
203 if (context
->tty_vhangup
) {
204 if (p
&& p
->stdin_fd
>= 0)
205 (void) terminal_vhangup_fd(p
->stdin_fd
);
207 (void) terminal_vhangup(path
);
210 if (context
->tty_reset
) {
211 if (p
&& p
->stdin_fd
>= 0)
212 (void) reset_terminal_fd(p
->stdin_fd
, true);
214 (void) reset_terminal(path
);
217 if (context
->tty_vt_disallocate
&& path
)
218 (void) vt_disallocate(path
);
221 static bool is_terminal_input(ExecInput i
) {
224 EXEC_INPUT_TTY_FORCE
,
225 EXEC_INPUT_TTY_FAIL
);
228 static bool is_terminal_output(ExecOutput o
) {
231 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
232 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
233 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
236 static bool is_syslog_output(ExecOutput o
) {
239 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
);
242 static bool is_kmsg_output(ExecOutput o
) {
245 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
248 static bool exec_context_needs_term(const ExecContext
*c
) {
251 /* Return true if the execution context suggests we should set $TERM to something useful. */
253 if (is_terminal_input(c
->std_input
))
256 if (is_terminal_output(c
->std_output
))
259 if (is_terminal_output(c
->std_error
))
262 return !!c
->tty_path
;
265 static int open_null_as(int flags
, int nfd
) {
270 fd
= open("/dev/null", flags
|O_NOCTTY
);
274 return move_fd(fd
, nfd
, false);
277 static int connect_journal_socket(int fd
, uid_t uid
, gid_t gid
) {
278 static const union sockaddr_union sa
= {
279 .un
.sun_family
= AF_UNIX
,
280 .un
.sun_path
= "/run/systemd/journal/stdout",
282 uid_t olduid
= UID_INVALID
;
283 gid_t oldgid
= GID_INVALID
;
286 if (gid_is_valid(gid
)) {
289 if (setegid(gid
) < 0)
293 if (uid_is_valid(uid
)) {
296 if (seteuid(uid
) < 0) {
302 r
= connect(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
)) < 0 ? -errno
: 0;
304 /* If we fail to restore the uid or gid, things will likely
305 fail later on. This should only happen if an LSM interferes. */
307 if (uid_is_valid(uid
))
308 (void) seteuid(olduid
);
311 if (gid_is_valid(gid
))
312 (void) setegid(oldgid
);
317 static int connect_logger_as(
319 const ExecContext
*context
,
320 const ExecParameters
*params
,
327 _cleanup_close_
int fd
= -1;
332 assert(output
< _EXEC_OUTPUT_MAX
);
336 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
340 r
= connect_journal_socket(fd
, uid
, gid
);
344 if (shutdown(fd
, SHUT_RD
) < 0)
347 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
357 context
->syslog_identifier
?: ident
,
358 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
359 context
->syslog_priority
,
360 !!context
->syslog_level_prefix
,
361 is_syslog_output(output
),
362 is_kmsg_output(output
),
363 is_terminal_output(output
)) < 0)
366 return move_fd(TAKE_FD(fd
), nfd
, false);
369 static int open_terminal_as(const char *path
, int flags
, int nfd
) {
375 fd
= open_terminal(path
, flags
| O_NOCTTY
);
379 return move_fd(fd
, nfd
, false);
382 static int acquire_path(const char *path
, int flags
, mode_t mode
) {
383 union sockaddr_union sa
= {};
384 _cleanup_close_
int fd
= -1;
389 if (IN_SET(flags
& O_ACCMODE
, O_WRONLY
, O_RDWR
))
392 fd
= open(path
, flags
|O_NOCTTY
, mode
);
396 if (errno
!= ENXIO
) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
398 if (strlen(path
) >= sizeof(sa
.un
.sun_path
)) /* Too long, can't be a UNIX socket */
401 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
403 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
407 salen
= sockaddr_un_set_path(&sa
.un
, path
);
411 if (connect(fd
, &sa
.sa
, salen
) < 0)
412 return errno
== EINVAL
? -ENXIO
: -errno
; /* Propagate initial error if we get EINVAL, i.e. we have
413 * indication that this wasn't an AF_UNIX socket after all */
415 if ((flags
& O_ACCMODE
) == O_RDONLY
)
416 r
= shutdown(fd
, SHUT_WR
);
417 else if ((flags
& O_ACCMODE
) == O_WRONLY
)
418 r
= shutdown(fd
, SHUT_RD
);
427 static int fixup_input(
428 const ExecContext
*context
,
430 bool apply_tty_stdin
) {
436 std_input
= context
->std_input
;
438 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
439 return EXEC_INPUT_NULL
;
441 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
442 return EXEC_INPUT_NULL
;
444 if (std_input
== EXEC_INPUT_DATA
&& context
->stdin_data_size
== 0)
445 return EXEC_INPUT_NULL
;
450 static int fixup_output(ExecOutput std_output
, int socket_fd
) {
452 if (std_output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
453 return EXEC_OUTPUT_INHERIT
;
458 static int setup_input(
459 const ExecContext
*context
,
460 const ExecParameters
*params
,
462 int named_iofds
[3]) {
469 if (params
->stdin_fd
>= 0) {
470 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
473 /* Try to make this the controlling tty, if it is a tty, and reset it */
474 if (isatty(STDIN_FILENO
)) {
475 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
476 (void) reset_terminal_fd(STDIN_FILENO
, true);
482 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
486 case EXEC_INPUT_NULL
:
487 return open_null_as(O_RDONLY
, STDIN_FILENO
);
490 case EXEC_INPUT_TTY_FORCE
:
491 case EXEC_INPUT_TTY_FAIL
: {
494 fd
= acquire_terminal(exec_context_tty_path(context
),
495 i
== EXEC_INPUT_TTY_FAIL
? ACQUIRE_TERMINAL_TRY
:
496 i
== EXEC_INPUT_TTY_FORCE
? ACQUIRE_TERMINAL_FORCE
:
497 ACQUIRE_TERMINAL_WAIT
,
502 return move_fd(fd
, STDIN_FILENO
, false);
505 case EXEC_INPUT_SOCKET
:
506 assert(socket_fd
>= 0);
508 return dup2(socket_fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
510 case EXEC_INPUT_NAMED_FD
:
511 assert(named_iofds
[STDIN_FILENO
] >= 0);
513 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
514 return dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
516 case EXEC_INPUT_DATA
: {
519 fd
= acquire_data_fd(context
->stdin_data
, context
->stdin_data_size
, 0);
523 return move_fd(fd
, STDIN_FILENO
, false);
526 case EXEC_INPUT_FILE
: {
530 assert(context
->stdio_file
[STDIN_FILENO
]);
532 rw
= (context
->std_output
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDOUT_FILENO
])) ||
533 (context
->std_error
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDERR_FILENO
]));
535 fd
= acquire_path(context
->stdio_file
[STDIN_FILENO
], rw
? O_RDWR
: O_RDONLY
, 0666 & ~context
->umask
);
539 return move_fd(fd
, STDIN_FILENO
, false);
543 assert_not_reached("Unknown input type");
547 static bool can_inherit_stderr_from_stdout(
548 const ExecContext
*context
,
554 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
557 if (e
== EXEC_OUTPUT_INHERIT
)
562 if (e
== EXEC_OUTPUT_NAMED_FD
)
563 return streq_ptr(context
->stdio_fdname
[STDOUT_FILENO
], context
->stdio_fdname
[STDERR_FILENO
]);
565 if (IN_SET(e
, EXEC_OUTPUT_FILE
, EXEC_OUTPUT_FILE_APPEND
))
566 return streq_ptr(context
->stdio_file
[STDOUT_FILENO
], context
->stdio_file
[STDERR_FILENO
]);
571 static int setup_output(
573 const ExecContext
*context
,
574 const ExecParameters
*params
,
581 dev_t
*journal_stream_dev
,
582 ino_t
*journal_stream_ino
) {
592 assert(journal_stream_dev
);
593 assert(journal_stream_ino
);
595 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
597 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
600 return STDOUT_FILENO
;
603 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
604 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
607 return STDERR_FILENO
;
610 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
611 o
= fixup_output(context
->std_output
, socket_fd
);
613 if (fileno
== STDERR_FILENO
) {
615 e
= fixup_output(context
->std_error
, socket_fd
);
617 /* This expects the input and output are already set up */
619 /* Don't change the stderr file descriptor if we inherit all
620 * the way and are not on a tty */
621 if (e
== EXEC_OUTPUT_INHERIT
&&
622 o
== EXEC_OUTPUT_INHERIT
&&
623 i
== EXEC_INPUT_NULL
&&
624 !is_terminal_input(context
->std_input
) &&
628 /* Duplicate from stdout if possible */
629 if (can_inherit_stderr_from_stdout(context
, o
, e
))
630 return dup2(STDOUT_FILENO
, fileno
) < 0 ? -errno
: fileno
;
634 } else if (o
== EXEC_OUTPUT_INHERIT
) {
635 /* If input got downgraded, inherit the original value */
636 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
637 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
639 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
640 if (!IN_SET(i
, EXEC_INPUT_NULL
, EXEC_INPUT_DATA
))
641 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
643 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
647 /* We need to open /dev/null here anew, to get the right access mode. */
648 return open_null_as(O_WRONLY
, fileno
);
653 case EXEC_OUTPUT_NULL
:
654 return open_null_as(O_WRONLY
, fileno
);
656 case EXEC_OUTPUT_TTY
:
657 if (is_terminal_input(i
))
658 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
660 /* We don't reset the terminal if this is just about output */
661 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
663 case EXEC_OUTPUT_SYSLOG
:
664 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE
:
665 case EXEC_OUTPUT_KMSG
:
666 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
667 case EXEC_OUTPUT_JOURNAL
:
668 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
669 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
671 log_unit_warning_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m", fileno
== STDOUT_FILENO
? "stdout" : "stderr");
672 r
= open_null_as(O_WRONLY
, fileno
);
676 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
677 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
678 * services to detect whether they are connected to the journal or not.
680 * If both stdout and stderr are connected to a stream then let's make sure to store the data
681 * about STDERR as that's usually the best way to do logging. */
683 if (fstat(fileno
, &st
) >= 0 &&
684 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
685 *journal_stream_dev
= st
.st_dev
;
686 *journal_stream_ino
= st
.st_ino
;
691 case EXEC_OUTPUT_SOCKET
:
692 assert(socket_fd
>= 0);
694 return dup2(socket_fd
, fileno
) < 0 ? -errno
: fileno
;
696 case EXEC_OUTPUT_NAMED_FD
:
697 assert(named_iofds
[fileno
] >= 0);
699 (void) fd_nonblock(named_iofds
[fileno
], false);
700 return dup2(named_iofds
[fileno
], fileno
) < 0 ? -errno
: fileno
;
702 case EXEC_OUTPUT_FILE
:
703 case EXEC_OUTPUT_FILE_APPEND
: {
707 assert(context
->stdio_file
[fileno
]);
709 rw
= context
->std_input
== EXEC_INPUT_FILE
&&
710 streq_ptr(context
->stdio_file
[fileno
], context
->stdio_file
[STDIN_FILENO
]);
713 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
716 if (o
== EXEC_OUTPUT_FILE_APPEND
)
719 fd
= acquire_path(context
->stdio_file
[fileno
], flags
, 0666 & ~context
->umask
);
723 return move_fd(fd
, fileno
, 0);
727 assert_not_reached("Unknown error type");
731 static int chown_terminal(int fd
, uid_t uid
) {
736 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
740 /* This might fail. What matters are the results. */
741 (void) fchown(fd
, uid
, -1);
742 (void) fchmod(fd
, TTY_MODE
);
744 if (fstat(fd
, &st
) < 0)
747 if (st
.st_uid
!= uid
|| (st
.st_mode
& 0777) != TTY_MODE
)
753 static int setup_confirm_stdio(const char *vc
, int *_saved_stdin
, int *_saved_stdout
) {
754 _cleanup_close_
int fd
= -1, saved_stdin
= -1, saved_stdout
= -1;
757 assert(_saved_stdin
);
758 assert(_saved_stdout
);
760 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
764 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
765 if (saved_stdout
< 0)
768 fd
= acquire_terminal(vc
, ACQUIRE_TERMINAL_WAIT
, DEFAULT_CONFIRM_USEC
);
772 r
= chown_terminal(fd
, getuid());
776 r
= reset_terminal_fd(fd
, true);
780 r
= rearrange_stdio(fd
, fd
, STDERR_FILENO
);
785 *_saved_stdin
= saved_stdin
;
786 *_saved_stdout
= saved_stdout
;
788 saved_stdin
= saved_stdout
= -1;
793 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
796 if (err
== -ETIMEDOUT
)
797 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
800 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
804 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
805 _cleanup_close_
int fd
= -1;
809 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
813 write_confirm_error_fd(err
, fd
, u
);
816 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
820 assert(saved_stdout
);
824 if (*saved_stdin
>= 0)
825 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
828 if (*saved_stdout
>= 0)
829 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
832 *saved_stdin
= safe_close(*saved_stdin
);
833 *saved_stdout
= safe_close(*saved_stdout
);
839 CONFIRM_PRETEND_FAILURE
= -1,
840 CONFIRM_PRETEND_SUCCESS
= 0,
844 static int ask_for_confirmation(const char *vc
, Unit
*u
, const char *cmdline
) {
845 int saved_stdout
= -1, saved_stdin
= -1, r
;
846 _cleanup_free_
char *e
= NULL
;
849 /* For any internal errors, assume a positive response. */
850 r
= setup_confirm_stdio(vc
, &saved_stdin
, &saved_stdout
);
852 write_confirm_error(r
, vc
, u
);
853 return CONFIRM_EXECUTE
;
856 /* confirm_spawn might have been disabled while we were sleeping. */
857 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
862 e
= ellipsize(cmdline
, 60, 100);
870 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
872 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
879 printf("Resuming normal execution.\n");
880 manager_disable_confirm_spawn();
884 unit_dump(u
, stdout
, " ");
885 continue; /* ask again */
887 printf("Failing execution.\n");
888 r
= CONFIRM_PRETEND_FAILURE
;
891 printf(" c - continue, proceed without asking anymore\n"
892 " D - dump, show the state of the unit\n"
893 " f - fail, don't execute the command and pretend it failed\n"
895 " i - info, show a short summary of the unit\n"
896 " j - jobs, show jobs that are in progress\n"
897 " s - skip, don't execute the command and pretend it succeeded\n"
898 " y - yes, execute the command\n");
899 continue; /* ask again */
901 printf(" Description: %s\n"
904 u
->id
, u
->description
, cmdline
);
905 continue; /* ask again */
907 manager_dump_jobs(u
->manager
, stdout
, " ");
908 continue; /* ask again */
910 /* 'n' was removed in favor of 'f'. */
911 printf("Didn't understand 'n', did you mean 'f'?\n");
912 continue; /* ask again */
914 printf("Skipping execution.\n");
915 r
= CONFIRM_PRETEND_SUCCESS
;
921 assert_not_reached("Unhandled choice");
927 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
931 static int get_fixed_user(const ExecContext
*c
, const char **user
,
932 uid_t
*uid
, gid_t
*gid
,
933 const char **home
, const char **shell
) {
942 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
943 * (i.e. are "/" or "/bin/nologin"). */
946 r
= get_user_creds(&name
, uid
, gid
, home
, shell
, USER_CREDS_CLEAN
);
954 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
964 r
= get_group_creds(&name
, gid
, 0);
972 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
973 const char *group
, gid_t gid
,
974 gid_t
**supplementary_gids
, int *ngids
) {
978 bool keep_groups
= false;
979 gid_t
*groups
= NULL
;
980 _cleanup_free_ gid_t
*l_gids
= NULL
;
985 * If user is given, then lookup GID and supplementary groups list.
986 * We avoid NSS lookups for gid=0. Also we have to initialize groups
987 * here and as early as possible so we keep the list of supplementary
988 * groups of the caller.
990 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
991 /* First step, initialize groups from /etc/group */
992 if (initgroups(user
, gid
) < 0)
998 if (strv_isempty(c
->supplementary_groups
))
1002 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1003 * be positive, otherwise fail.
1006 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
1007 if (ngroups_max
<= 0) {
1011 return -EOPNOTSUPP
; /* For all other values */
1014 l_gids
= new(gid_t
, ngroups_max
);
1020 * Lookup the list of groups that the user belongs to, we
1021 * avoid NSS lookups here too for gid=0.
1024 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
1029 STRV_FOREACH(i
, c
->supplementary_groups
) {
1032 if (k
>= ngroups_max
)
1036 r
= get_group_creds(&g
, l_gids
+k
, 0);
1044 * Sets ngids to zero to drop all supplementary groups, happens
1045 * when we are under root and SupplementaryGroups= is empty.
1052 /* Otherwise get the final list of supplementary groups */
1053 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
1057 *supplementary_gids
= groups
;
1065 static int enforce_groups(gid_t gid
, const gid_t
*supplementary_gids
, int ngids
) {
1068 /* Handle SupplementaryGroups= if it is not empty */
1070 r
= maybe_setgroups(ngids
, supplementary_gids
);
1075 if (gid_is_valid(gid
)) {
1076 /* Then set our gids */
1077 if (setresgid(gid
, gid
, gid
) < 0)
1084 static int enforce_user(const ExecContext
*context
, uid_t uid
) {
1087 if (!uid_is_valid(uid
))
1090 /* Sets (but doesn't look up) the uid and make sure we keep the
1091 * capabilities while doing so. */
1093 if (context
->capability_ambient_set
!= 0) {
1095 /* First step: If we need to keep capabilities but
1096 * drop privileges we need to make sure we keep our
1097 * caps, while we drop privileges. */
1099 int sb
= context
->secure_bits
| 1<<SECURE_KEEP_CAPS
;
1101 if (prctl(PR_GET_SECUREBITS
) != sb
)
1102 if (prctl(PR_SET_SECUREBITS
, sb
) < 0)
1107 /* Second step: actually set the uids */
1108 if (setresuid(uid
, uid
, uid
) < 0)
1111 /* At this point we should have all necessary capabilities but
1112 are otherwise a normal user. However, the caps might have gotten
1113 corrupted due to the setresuid() so we need to clean them up
1114 later. This is done outside of this call. */
1121 static int null_conv(
1123 const struct pam_message
**msg
,
1124 struct pam_response
**resp
,
1125 void *appdata_ptr
) {
1127 /* We don't support conversations */
1129 return PAM_CONV_ERR
;
1134 static int setup_pam(
1141 int fds
[], size_t n_fds
) {
1145 static const struct pam_conv conv
= {
1150 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1151 pam_handle_t
*handle
= NULL
;
1153 int pam_code
= PAM_SUCCESS
, r
;
1154 char **nv
, **e
= NULL
;
1155 bool close_session
= false;
1156 pid_t pam_pid
= 0, parent_pid
;
1163 /* We set up PAM in the parent process, then fork. The child
1164 * will then stay around until killed via PR_SET_PDEATHSIG or
1165 * systemd via the cgroup logic. It will then remove the PAM
1166 * session again. The parent process will exec() the actual
1167 * daemon. We do things this way to ensure that the main PID
1168 * of the daemon is the one we initially fork()ed. */
1170 r
= barrier_create(&barrier
);
1174 if (log_get_max_level() < LOG_DEBUG
)
1175 flags
|= PAM_SILENT
;
1177 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1178 if (pam_code
!= PAM_SUCCESS
) {
1184 _cleanup_free_
char *q
= NULL
;
1186 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1187 * out if that's the case, and read the TTY off it. */
1189 if (getttyname_malloc(STDIN_FILENO
, &q
) >= 0)
1190 tty
= strjoina("/dev/", q
);
1194 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1195 if (pam_code
!= PAM_SUCCESS
)
1199 STRV_FOREACH(nv
, *env
) {
1200 pam_code
= pam_putenv(handle
, *nv
);
1201 if (pam_code
!= PAM_SUCCESS
)
1205 pam_code
= pam_acct_mgmt(handle
, flags
);
1206 if (pam_code
!= PAM_SUCCESS
)
1209 pam_code
= pam_open_session(handle
, flags
);
1210 if (pam_code
!= PAM_SUCCESS
)
1213 close_session
= true;
1215 e
= pam_getenvlist(handle
);
1217 pam_code
= PAM_BUF_ERR
;
1221 /* Block SIGTERM, so that we know that it won't get lost in
1224 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1226 parent_pid
= getpid_cached();
1228 r
= safe_fork("(sd-pam)", 0, &pam_pid
);
1232 int sig
, ret
= EXIT_PAM
;
1234 /* The child's job is to reset the PAM session on
1236 barrier_set_role(&barrier
, BARRIER_CHILD
);
1238 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
1239 * are open here that have been opened by PAM. */
1240 (void) close_many(fds
, n_fds
);
1242 /* Drop privileges - we don't need any to pam_close_session
1243 * and this will make PR_SET_PDEATHSIG work in most cases.
1244 * If this fails, ignore the error - but expect sd-pam threads
1245 * to fail to exit normally */
1247 r
= maybe_setgroups(0, NULL
);
1249 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1250 if (setresgid(gid
, gid
, gid
) < 0)
1251 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1252 if (setresuid(uid
, uid
, uid
) < 0)
1253 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1255 (void) ignore_signals(SIGPIPE
, -1);
1257 /* Wait until our parent died. This will only work if
1258 * the above setresuid() succeeds, otherwise the kernel
1259 * will not allow unprivileged parents to kill their privileged
1260 * children this way. We rely on the control groups kill logic
1261 * to do the rest for us. */
1262 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1265 /* Tell the parent that our setup is done. This is especially
1266 * important regarding dropping privileges. Otherwise, unit
1267 * setup might race against our setresuid(2) call.
1269 * If the parent aborted, we'll detect this below, hence ignore
1270 * return failure here. */
1271 (void) barrier_place(&barrier
);
1273 /* Check if our parent process might already have died? */
1274 if (getppid() == parent_pid
) {
1277 assert_se(sigemptyset(&ss
) >= 0);
1278 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1281 if (sigwait(&ss
, &sig
) < 0) {
1288 assert(sig
== SIGTERM
);
1293 /* If our parent died we'll end the session */
1294 if (getppid() != parent_pid
) {
1295 pam_code
= pam_close_session(handle
, flags
);
1296 if (pam_code
!= PAM_SUCCESS
)
1303 pam_end(handle
, pam_code
| flags
);
1307 barrier_set_role(&barrier
, BARRIER_PARENT
);
1309 /* If the child was forked off successfully it will do all the
1310 * cleanups, so forget about the handle here. */
1313 /* Unblock SIGTERM again in the parent */
1314 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1316 /* We close the log explicitly here, since the PAM modules
1317 * might have opened it, but we don't want this fd around. */
1320 /* Synchronously wait for the child to initialize. We don't care for
1321 * errors as we cannot recover. However, warn loudly if it happens. */
1322 if (!barrier_place_and_sync(&barrier
))
1323 log_error("PAM initialization failed");
1325 return strv_free_and_replace(*env
, e
);
1328 if (pam_code
!= PAM_SUCCESS
) {
1329 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1330 r
= -EPERM
; /* PAM errors do not map to errno */
1332 log_error_errno(r
, "PAM failed: %m");
1336 pam_code
= pam_close_session(handle
, flags
);
1338 pam_end(handle
, pam_code
| flags
);
1350 static void rename_process_from_path(const char *path
) {
1351 char process_name
[11];
1355 /* This resulting string must fit in 10 chars (i.e. the length
1356 * of "/sbin/init") to look pretty in /bin/ps */
1360 rename_process("(...)");
1366 /* The end of the process name is usually more
1367 * interesting, since the first bit might just be
1373 process_name
[0] = '(';
1374 memcpy(process_name
+1, p
, l
);
1375 process_name
[1+l
] = ')';
1376 process_name
[1+l
+1] = 0;
1378 rename_process(process_name
);
1381 static bool context_has_address_families(const ExecContext
*c
) {
1384 return c
->address_families_whitelist
||
1385 !set_isempty(c
->address_families
);
1388 static bool context_has_syscall_filters(const ExecContext
*c
) {
1391 return c
->syscall_whitelist
||
1392 !hashmap_isempty(c
->syscall_filter
);
1395 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1398 if (c
->no_new_privileges
)
1401 if (have_effective_cap(CAP_SYS_ADMIN
)) /* if we are privileged, we don't need NNP */
1404 /* We need NNP if we have any form of seccomp and are unprivileged */
1405 return context_has_address_families(c
) ||
1406 c
->memory_deny_write_execute
||
1407 c
->restrict_realtime
||
1408 exec_context_restrict_namespaces_set(c
) ||
1409 c
->protect_kernel_tunables
||
1410 c
->protect_kernel_modules
||
1411 c
->private_devices
||
1412 context_has_syscall_filters(c
) ||
1413 !set_isempty(c
->syscall_archs
) ||
1414 c
->lock_personality
;
1419 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1421 if (is_seccomp_available())
1424 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1428 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1429 uint32_t negative_action
, default_action
, action
;
1435 if (!context_has_syscall_filters(c
))
1438 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1441 negative_action
= c
->syscall_errno
== 0 ? SCMP_ACT_KILL
: SCMP_ACT_ERRNO(c
->syscall_errno
);
1443 if (c
->syscall_whitelist
) {
1444 default_action
= negative_action
;
1445 action
= SCMP_ACT_ALLOW
;
1447 default_action
= SCMP_ACT_ALLOW
;
1448 action
= negative_action
;
1451 if (needs_ambient_hack
) {
1452 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_whitelist
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1457 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
, false);
1460 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1464 if (set_isempty(c
->syscall_archs
))
1467 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1470 return seccomp_restrict_archs(c
->syscall_archs
);
1473 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1477 if (!context_has_address_families(c
))
1480 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1483 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_whitelist
);
1486 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1490 if (!c
->memory_deny_write_execute
)
1493 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1496 return seccomp_memory_deny_write_execute();
1499 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1503 if (!c
->restrict_realtime
)
1506 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1509 return seccomp_restrict_realtime();
1512 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1516 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1517 * let's protect even those systems where this is left on in the kernel. */
1519 if (!c
->protect_kernel_tunables
)
1522 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1525 return seccomp_protect_sysctl();
1528 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1532 /* Turn off module syscalls on ProtectKernelModules=yes */
1534 if (!c
->protect_kernel_modules
)
1537 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1540 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
), false);
1543 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1547 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1549 if (!c
->private_devices
)
1552 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1555 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
), false);
1558 static int apply_restrict_namespaces(const Unit
*u
, const ExecContext
*c
) {
1562 if (!exec_context_restrict_namespaces_set(c
))
1565 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1568 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
1571 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1572 unsigned long personality
;
1578 if (!c
->lock_personality
)
1581 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1584 personality
= c
->personality
;
1586 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1587 if (personality
== PERSONALITY_INVALID
) {
1589 r
= opinionated_personality(&personality
);
1594 return seccomp_lock_personality(personality
);
1599 static void do_idle_pipe_dance(int idle_pipe
[4]) {
1602 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1603 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1605 if (idle_pipe
[0] >= 0) {
1608 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1610 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1613 /* Signal systemd that we are bored and want to continue. */
1614 n
= write(idle_pipe
[3], "x", 1);
1616 /* Wait for systemd to react to the signal above. */
1617 fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1620 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1624 idle_pipe
[3] = safe_close(idle_pipe
[3]);
1627 static const char *exec_directory_env_name_to_string(ExecDirectoryType t
);
/* Assembles the environment variables set up for the executed process and hands the resulting string
 * vector to the caller through *ret. Variables visible here: LISTEN_PID, LISTEN_FDS, LISTEN_FDNAMES,
 * WATCHDOG_PID, WATCHDOG_USEC, SYSTEMD_NSS_BYPASS_BUS, HOME, LOGNAME, USER, SHELL, INVOCATION_ID, TERM,
 * JOURNAL_STREAM, plus one variable per ExecDirectoryType that has paths configured (name from
 * exec_directory_env_name_to_string(), value: the paths prefixed with p->prefix[t] and joined with ":").
 * NOTE(review): part of the parameter list and the conditions guarding several of the entries are not
 * visible in this listing. */
1629 static int build_environment(
1631 const ExecContext
*c
,
1632 const ExecParameters
*p
,
1635 const char *username
,
1637 dev_t journal_stream_dev
,
1638 ino_t journal_stream_ino
,
1641 _cleanup_strv_free_
char **our_env
= NULL
;
1642 ExecDirectoryType t
;
/* 14 slots: the 13 fixed variables assembled below plus the terminating NULL; the directory variables
 * account for the _EXEC_DIRECTORY_TYPE_MAX part. The assert at the end checks this bound. */
1651 our_env
= new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX
);
1656 _cleanup_free_
char *joined
= NULL
;
1658 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1660 our_env
[n_env
++] = x
;
1662 if (asprintf(&x
, "LISTEN_FDS=%zu", n_fds
) < 0)
1664 our_env
[n_env
++] = x
;
1666 joined
= strv_join(p
->fd_names
, ":");
1670 x
= strjoin("LISTEN_FDNAMES=", joined
);
1673 our_env
[n_env
++] = x
;
1676 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1677 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1679 our_env
[n_env
++] = x
;
1681 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1683 our_env
[n_env
++] = x
;
1686 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus to look up dynamic
1687 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1688 * check the database directly. */
1689 if (p
->flags
& EXEC_NSS_BYPASS_BUS
) {
1690 x
= strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1693 our_env
[n_env
++] = x
;
1697 x
= strappend("HOME=", home
);
1700 our_env
[n_env
++] = x
;
1704 x
= strappend("LOGNAME=", username
);
1707 our_env
[n_env
++] = x
;
1709 x
= strappend("USER=", username
);
1712 our_env
[n_env
++] = x
;
1716 x
= strappend("SHELL=", shell
);
1719 our_env
[n_env
++] = x
;
1722 if (!sd_id128_is_null(u
->invocation_id
)) {
1723 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1726 our_env
[n_env
++] = x
;
1729 if (exec_context_needs_term(c
)) {
1730 const char *tty_path
, *term
= NULL
;
1732 tty_path
= exec_context_tty_path(c
);
1734 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1735 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1736 * passes to PID 1 ends up all the way in the console login shown. */
1738 if (path_equal(tty_path
, "/dev/console") && getppid() == 1)
1739 term
= getenv("TERM");
1741 term
= default_term_for_tty(tty_path
);
1743 x
= strappend("TERM=", term
);
1746 our_env
[n_env
++] = x
;
1749 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
1750 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
1753 our_env
[n_env
++] = x
;
1756 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
1757 _cleanup_free_
char *pre
= NULL
, *joined
= NULL
;
1763 if (strv_isempty(c
->directories
[t
].paths
))
1766 n
= exec_directory_env_name_to_string(t
);
1770 pre
= strjoin(p
->prefix
[t
], "/");
1774 joined
= strv_join_prefix(c
->directories
[t
].paths
, ":", pre
);
1778 x
= strjoin(n
, "=", joined
);
1782 our_env
[n_env
++] = x
;
1785 our_env
[n_env
++] = NULL
;
1786 assert(n_env
<= 14 + _EXEC_DIRECTORY_TYPE_MAX
);
1788 *ret
= TAKE_PTR(our_env
);
/* Builds a string vector of "NAME=value" entries for the names listed in c->pass_environment and hands
 * it to the caller through *ret.
 * NOTE(review): where the value v comes from is not visible in this listing; presumably it is looked up
 * in the environment of the calling process, to be confirmed. */
1793 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
1794 _cleanup_strv_free_
char **pass_env
= NULL
;
1795 size_t n_env
= 0, n_bufsize
= 0;
1798 STRV_FOREACH(i
, c
->pass_environment
) {
1799 _cleanup_free_
char *x
= NULL
;
1805 x
= strjoin(*i
, "=", v
);
/* n_env + 2: room for the entry added next and for the NULL that terminates the vector */
1809 if (!GREEDY_REALLOC(pass_env
, n_bufsize
, n_env
+ 2))
1812 pass_env
[n_env
++] = TAKE_PTR(x
);
1813 pass_env
[n_env
] = NULL
;
1816 *ret
= TAKE_PTR(pass_env
);
/* Checks the settings that call for a mount namespace. The conditions tested are:
 *   - context->root_image
 *   - non-empty read_write_paths, read_only_paths or inaccessible_paths
 *   - bind mounts, temporary file systems, non-zero mount_flags
 *   - private_tmp together with a runtime object that has a tmp_dir or var_tmp_dir
 *   - private_devices, private_mounts, protect_system, protect_home, protect_kernel_tunables,
 *     protect_kernel_modules, protect_control_groups
 *   - with root_directory set: mount_apivfs, or a directory type with a prefix and configured paths
 *   - dynamic_user together with configured state, cache or logs directories
 * NOTE(review): the return statements are not visible in this listing; presumably true as soon as one of
 * the conditions matches and false otherwise, to be confirmed. */
1821 static bool exec_needs_mount_namespace(
1822 const ExecContext
*context
,
1823 const ExecParameters
*params
,
1824 const ExecRuntime
*runtime
) {
1829 if (context
->root_image
)
1832 if (!strv_isempty(context
->read_write_paths
) ||
1833 !strv_isempty(context
->read_only_paths
) ||
1834 !strv_isempty(context
->inaccessible_paths
))
1837 if (context
->n_bind_mounts
> 0)
1840 if (context
->n_temporary_filesystems
> 0)
1843 if (context
->mount_flags
!= 0)
1846 if (context
->private_tmp
&& runtime
&& (runtime
->tmp_dir
|| runtime
->var_tmp_dir
))
1849 if (context
->private_devices
||
1850 context
->private_mounts
||
1851 context
->protect_system
!= PROTECT_SYSTEM_NO
||
1852 context
->protect_home
!= PROTECT_HOME_NO
||
1853 context
->protect_kernel_tunables
||
1854 context
->protect_kernel_modules
||
1855 context
->protect_control_groups
)
1858 if (context
->root_directory
) {
1859 ExecDirectoryType t
;
1861 if (context
->mount_apivfs
)
1864 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
1865 if (!params
->prefix
[t
])
1868 if (!strv_isempty(context
->directories
[t
].paths
))
1873 if (context
->dynamic_user
&&
1874 (!strv_isempty(context
->directories
[EXEC_DIRECTORY_STATE
].paths
) ||
1875 !strv_isempty(context
->directories
[EXEC_DIRECTORY_CACHE
].paths
) ||
1876 !strv_isempty(context
->directories
[EXEC_DIRECTORY_LOGS
].paths
)))
/* Moves the calling process into a new user namespace (unshare(CLONE_NEWUSER)) and has a forked helper
 * process write its setgroups, gid_map and uid_map files. The maps always contain "0 0 1"; a uid/gid that
 * is valid and not 0 gets one additional entry mapping it to itself. The eventfd tells the helper when the
 * namespace exists, the pipe carries an error code back from the helper.
 * NOTE(review): the error paths and the final return value are not visible in this listing. */
1882 static int setup_private_users(uid_t uid
, gid_t gid
) {
1883 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
1884 _cleanup_close_pair_
int errno_pipe
[2] = { -1, -1 };
1885 _cleanup_close_
int unshare_ready_fd
= -1;
1886 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
1891 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1892 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1893 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1894 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1895 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1896 * continues execution normally. */
1898 if (uid
!= 0 && uid_is_valid(uid
)) {
1899 r
= asprintf(&uid_map
,
1900 "0 0 1\n" /* Map root → root */
1901 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
1906 uid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1911 if (gid
!= 0 && gid_is_valid(gid
)) {
1912 r
= asprintf(&gid_map
,
1913 "0 0 1\n" /* Map root → root */
1914 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
1919 gid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1924 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1926 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
1927 if (unshare_ready_fd
< 0)
1930 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1932 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
1935 r
= safe_fork("(sd-userns)", FORK_RESET_SIGNALS
|FORK_DEATHSIG
, &pid
);
1939 _cleanup_close_
int fd
= -1;
1943 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1944 * here, after the parent opened its own user namespace. */
1947 errno_pipe
[0] = safe_close(errno_pipe
[0]);
1949 /* Wait until the parent unshared the user namespace */
1950 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
1955 /* Disable the setgroups() system call in the child user namespace, for good. */
1956 a
= procfs_file_alloca(ppid
, "setgroups");
1957 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1959 if (errno
!= ENOENT
) {
1964 /* If the file is missing the kernel is too old, let's continue anyway. */
1966 if (write(fd
, "deny\n", 5) < 0) {
1971 fd
= safe_close(fd
);
1974 /* First write the GID map */
1975 a
= procfs_file_alloca(ppid
, "gid_map");
1976 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1981 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
1985 fd
= safe_close(fd
);
1987 /* Then write the UID map */
1988 a
= procfs_file_alloca(ppid
, "uid_map");
1989 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1994 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
1999 _exit(EXIT_SUCCESS
);
2002 (void) write(errno_pipe
[1], &r
, sizeof(r
));
2003 _exit(EXIT_FAILURE
);
2006 errno_pipe
[1] = safe_close(errno_pipe
[1]);
2008 if (unshare(CLONE_NEWUSER
) < 0)
2011 /* Let the child know that the namespace is ready now */
2012 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
2015 /* Try to read an error code from the child */
2016 n
= read(errno_pipe
[0], &r
, sizeof(r
));
2019 if (n
== sizeof(r
)) { /* an error code was sent to us */
2024 if (n
!= 0) /* on success we should have read 0 bytes */
2027 r
= wait_for_terminate_and_check("(sd-userns)", pid
, 0);
2031 if (r
!= EXIT_SUCCESS
) /* If something strange happened with the child, let's consider this fatal, too */
/* Creates the directories configured for one ExecDirectoryType (context->directories[type].paths) below
 * params->prefix[type]. For DynamicUser= services every type except the runtime and configuration ones is
 * placed below a "private" subdirectory and symlinked from the regular location (see the long comment in
 * the loop). Ownership of the tree is changed with path_chown_recursive(), except for the configuration
 * directory type.
 * NOTE(review): the remaining parameters, the error paths and the context in which *exit_status is assigned
 * from exit_status_table[] are only partly visible in this listing. */
2037 static int setup_exec_directory(
2038 const ExecContext
*context
,
2039 const ExecParameters
*params
,
2042 ExecDirectoryType type
,
2045 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
2046 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
2047 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
2048 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
2049 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
2050 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
2057 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
2058 assert(exit_status
);
2060 if (!params
->prefix
[type
])
2063 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
2064 if (!uid_is_valid(uid
))
2066 if (!gid_is_valid(gid
))
2070 STRV_FOREACH(rt
, context
->directories
[type
].paths
) {
2071 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
2073 p
= strjoin(params
->prefix
[type
], "/", *rt
);
2079 r
= mkdir_parents_label(p
, 0755);
2083 if (context
->dynamic_user
&&
2084 !IN_SET(type
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
)) {
2085 _cleanup_free_
char *private_root
= NULL
;
2087 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2088 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2089 * whose UID is later on reused. To lock this down we use the same trick used by container
2090 * managers to prohibit host users to get access to files of the same UID in containers: we
2091 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2092 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2093 * to make this directory permeable for the service itself.
2095 * Specifically: for a service which wants a special directory "foo/" we first create a
2096 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2097 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2098 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2099 * into it. Inside of the namespace of the container "private/" is replaced by a more liberally
2100 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2101 * disabling the access boundary for the service and making sure it only gets access to the
2102 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2104 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2105 * owned by the service itself.
2106 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2107 * files or sockets with other services. */
2109 private_root
= strjoin(params
->prefix
[type
], "/private");
2110 if (!private_root
) {
2115 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2116 r
= mkdir_safe_label(private_root
, 0700, 0, 0, MKDIR_WARN_MODE
);
2120 pp
= strjoin(private_root
, "/", *rt
);
2126 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2127 r
= mkdir_parents_label(pp
, 0755);
2131 if (is_dir(p
, false) > 0 &&
2132 (laccess(pp
, F_OK
) < 0 && errno
== ENOENT
)) {
2134 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2135 * it over. Most likely the service has been upgraded from one that didn't use
2136 * DynamicUser=1, to one that does. */
2138 if (rename(p
, pp
) < 0) {
2143 /* Otherwise, create the actual directory for the service */
2145 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
2146 if (r
< 0 && r
!= -EEXIST
)
2150 /* And link it up from the original place */
2151 r
= symlink_idempotent(pp
, p
, true);
2155 /* Lock down the access mode */
2156 if (chmod(pp
, context
->directories
[type
].mode
) < 0) {
2161 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2162 if (r
< 0 && r
!= -EEXIST
)
2164 if (r
== -EEXIST
&& !context
->dynamic_user
)
2168 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2169 * a service, and shall not be writable. */
2170 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
2173 /* Then, change the ownership of the whole tree, if necessary */
2174 r
= path_chown_recursive(pp
?: p
, uid
, gid
);
2182 *exit_status
= exit_status_table
[type
];
/* Applies the SMACK process label via mac_smack_apply_pid(0, ...): context->smack_process_label if that
 * is set; in the SMACK_DEFAULT_PROCESS_LABEL section, the SMACK_ATTR_EXEC label read from command->path,
 * or SMACK_DEFAULT_PROCESS_LABEL when no such label was read. -ENODATA and -EOPNOTSUPP from
 * mac_smack_read() are excluded from the error check that follows it.
 * NOTE(review): the branch structure and the return statements are not visible in this listing. */
2187 static int setup_smack(
2188 const ExecContext
*context
,
2189 const ExecCommand
*command
) {
2196 if (context
->smack_process_label
) {
2197 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
2201 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2203 _cleanup_free_
char *exec_label
= NULL
;
2205 r
= mac_smack_read(command
->path
, SMACK_ATTR_EXEC
, &exec_label
);
2206 if (r
< 0 && !IN_SET(r
, -ENODATA
, -EOPNOTSUPP
))
2209 r
= mac_smack_apply_pid(0, exec_label
? : SMACK_DEFAULT_PROCESS_LABEL
);
/* Compiles the list of bind mounts to establish: copies of the entries in context->bind_mounts, followed
 * by one entry for each configured exec directory whose prefix is set. For DynamicUser= the "private" root
 * of every directory type other than runtime and configuration is additionally added to the list returned
 * via ret_empty_directories, provided neither root_directory nor root_image is set. The array, its size
 * and the empty-directory list are handed out through the three ret_* parameters; the array built so far
 * is released with bind_mount_free_many().
 * NOTE(review): the allocation-failure paths, the remaining BindMount fields and the return statements are
 * not visible in this listing. */
2219 static int compile_bind_mounts(
2220 const ExecContext
*context
,
2221 const ExecParameters
*params
,
2222 BindMount
**ret_bind_mounts
,
2223 size_t *ret_n_bind_mounts
,
2224 char ***ret_empty_directories
) {
2226 _cleanup_strv_free_
char **empty_directories
= NULL
;
2227 BindMount
*bind_mounts
;
2229 ExecDirectoryType t
;
2234 assert(ret_bind_mounts
);
2235 assert(ret_n_bind_mounts
);
2236 assert(ret_empty_directories
);
2238 n
= context
->n_bind_mounts
;
2239 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2240 if (!params
->prefix
[t
])
2243 n
+= strv_length(context
->directories
[t
].paths
);
2247 *ret_bind_mounts
= NULL
;
2248 *ret_n_bind_mounts
= 0;
2249 *ret_empty_directories
= NULL
;
2253 bind_mounts
= new(BindMount
, n
);
2257 for (i
= 0; i
< context
->n_bind_mounts
; i
++) {
2258 BindMount
*item
= context
->bind_mounts
+ i
;
2261 s
= strdup(item
->source
);
2267 d
= strdup(item
->destination
);
2274 bind_mounts
[h
++] = (BindMount
) {
2277 .read_only
= item
->read_only
,
2278 .recursive
= item
->recursive
,
2279 .ignore_enoent
= item
->ignore_enoent
,
2283 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2286 if (!params
->prefix
[t
])
2289 if (strv_isempty(context
->directories
[t
].paths
))
2292 if (context
->dynamic_user
&&
2293 !IN_SET(t
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
) &&
2294 !(context
->root_directory
|| context
->root_image
)) {
2297 /* So this is for a dynamic user, and we need to make sure the process can access its own
2298 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2299 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2301 private_root
= strjoin(params
->prefix
[t
], "/private");
2302 if (!private_root
) {
2307 r
= strv_consume(&empty_directories
, private_root
);
2312 STRV_FOREACH(suffix
, context
->directories
[t
].paths
) {
2315 if (context
->dynamic_user
&&
2316 !IN_SET(t
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
))
2317 s
= strjoin(params
->prefix
[t
], "/private/", *suffix
);
2319 s
= strjoin(params
->prefix
[t
], "/", *suffix
);
2325 if (context
->dynamic_user
&&
2326 !IN_SET(t
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
) &&
2327 (context
->root_directory
|| context
->root_image
))
2328 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2329 * directory is not created on the root directory. So, let's bind-mount the directory
2330 * on the 'non-private' place. */
2331 d
= strjoin(params
->prefix
[t
], "/", *suffix
);
2340 bind_mounts
[h
++] = (BindMount
) {
2345 .ignore_enoent
= false,
2352 *ret_bind_mounts
= bind_mounts
;
2353 *ret_n_bind_mounts
= n
;
2354 *ret_empty_directories
= TAKE_PTR(empty_directories
);
2359 bind_mount_free_many(bind_mounts
, h
);
/* Sets up the mount namespace of the process with setup_namespace(). Before the call it computes the
 * private tmp locations (runtime->tmp_dir and runtime->var_tmp_dir with "/tmp" appended, when
 * private_tmp is set), picks the root image/directory if EXEC_APPLY_CHROOT is set, compiles the bind
 * mounts with compile_bind_mounts() and fills in a NamespaceInfo from the context. Read-only and
 * inaccessible paths, protect_home and protect_system are only passed along when sandboxing applies, i.e.
 * EXEC_APPLY_SANDBOXING is set and the command is not flagged EXEC_COMMAND_FULLY_PRIVILEGED. The compiled
 * bind mounts are released with bind_mount_free_many() after the call.
 * NOTE(review): several arguments of the setup_namespace() call and the error handling around it (including
 * the condition under which the two debug messages are logged) are not visible in this listing. */
2363 static int apply_mount_namespace(
2365 const ExecCommand
*command
,
2366 const ExecContext
*context
,
2367 const ExecParameters
*params
,
2368 const ExecRuntime
*runtime
) {
2370 _cleanup_strv_free_
char **empty_directories
= NULL
;
2371 char *tmp
= NULL
, *var
= NULL
;
2372 const char *root_dir
= NULL
, *root_image
= NULL
;
2373 NamespaceInfo ns_info
;
2374 bool needs_sandboxing
;
2375 BindMount
*bind_mounts
= NULL
;
2376 size_t n_bind_mounts
= 0;
2381 /* The runtime struct only contains the parent of the private /tmp,
2382 * which is non-accessible to world users. Inside of it there's a /tmp
2383 * that is sticky, and that's the one we want to use here. */
2385 if (context
->private_tmp
&& runtime
) {
2386 if (runtime
->tmp_dir
)
2387 tmp
= strjoina(runtime
->tmp_dir
, "/tmp");
2388 if (runtime
->var_tmp_dir
)
2389 var
= strjoina(runtime
->var_tmp_dir
, "/tmp");
2392 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2393 root_image
= context
->root_image
;
2396 root_dir
= context
->root_directory
;
2399 r
= compile_bind_mounts(context
, params
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
2403 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2404 if (needs_sandboxing
)
2405 ns_info
= (NamespaceInfo
) {
2406 .ignore_protect_paths
= false,
2407 .private_dev
= context
->private_devices
,
2408 .protect_control_groups
= context
->protect_control_groups
,
2409 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
2410 .protect_kernel_modules
= context
->protect_kernel_modules
,
2411 .mount_apivfs
= context
->mount_apivfs
,
2412 .private_mounts
= context
->private_mounts
,
2414 else if (!context
->dynamic_user
&& root_dir
)
2416 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2417 * sandbox info, otherwise enforce it, don't ignore protected paths and
2418 * fail if we are enable to apply the sandbox inside the mount namespace.
2420 ns_info
= (NamespaceInfo
) {
2421 .ignore_protect_paths
= true,
2424 ns_info
= (NamespaceInfo
) {};
2426 r
= setup_namespace(root_dir
, root_image
,
2427 &ns_info
, context
->read_write_paths
,
2428 needs_sandboxing
? context
->read_only_paths
: NULL
,
2429 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
2433 context
->temporary_filesystems
,
2434 context
->n_temporary_filesystems
,
2437 needs_sandboxing
? context
->protect_home
: PROTECT_HOME_NO
,
2438 needs_sandboxing
? context
->protect_system
: PROTECT_SYSTEM_NO
,
2439 context
->mount_flags
,
2440 DISSECT_IMAGE_DISCARD_ON_LOOP
);
2442 bind_mount_free_many(bind_mounts
, n_bind_mounts
);
2444 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2445 * that with a special, recognizable error ENOANO. In this case, silently proceeed, but only if exclusively
2446 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2447 * completely different execution environment. */
2449 if (n_bind_mounts
== 0 &&
2450 context
->n_temporary_filesystems
== 0 &&
2451 !root_dir
&& !root_image
&&
2452 !context
->dynamic_user
) {
2453 log_unit_debug(u
, "Failed to set up namespace, assuming containerized execution and ignoring.");
2457 log_unit_debug(u
, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2458 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2459 n_bind_mounts
, context
->n_temporary_filesystems
, yes_no(root_dir
), yes_no(root_image
), yes_no(context
->dynamic_user
));
/* Changes into the working directory of the service. context->working_directory_home and
 * context->working_directory select the directory. With EXEC_APPLY_CHROOT set, and when no mount namespace
 * is in use but context->root_directory is set, chroot() into that directory is done first. A failing
 * chdir() only counts as an error if context->working_directory_missing_ok is unset. *exit_status is set
 * to EXIT_CHROOT or EXIT_CHDIR in the respective failure branches.
 * NOTE(review): the choice made when neither working directory setting is present and the branch that
 * builds the path with prefix_roota() are only partly visible in this listing. */
2467 static int apply_working_directory(
2468 const ExecContext
*context
,
2469 const ExecParameters
*params
,
2471 const bool needs_mount_ns
,
2477 assert(exit_status
);
2479 if (context
->working_directory_home
) {
2482 *exit_status
= EXIT_CHDIR
;
2488 } else if (context
->working_directory
)
2489 wd
= context
->working_directory
;
2493 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2494 if (!needs_mount_ns
&& context
->root_directory
)
2495 if (chroot(context
->root_directory
) < 0) {
2496 *exit_status
= EXIT_CHROOT
;
2502 d
= prefix_roota(context
->root_directory
, wd
);
2504 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
2505 *exit_status
= EXIT_CHDIR
;
/* Sets up a new session keyring for the service (the comments in the body give the rationale). The flag
 * EXEC_NEW_KEYRING in p->flags and a keyring_mode of EXEC_KEYRING_INHERIT are checked up front
 * (NOTE(review): the statements guarded by those checks are not visible in this listing; presumably early
 * returns). The real GID/UID are temporarily switched to gid/uid with setregid()/setreuid() while the
 * keyring is joined, the user keyring is linked into it for EXEC_KEYRING_SHARED, the IDs are switched
 * back, and the invocation ID of the unit is stored as a "user" key named "invocation_id" whose
 * permissions are then restricted with KEYCTL_SETPERM. */
2512 static int setup_keyring(
2514 const ExecContext
*context
,
2515 const ExecParameters
*p
,
2516 uid_t uid
, gid_t gid
) {
2518 key_serial_t keyring
;
2527 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2528 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2529 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2530 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2531 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2532 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2534 if (!(p
->flags
& EXEC_NEW_KEYRING
))
2537 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
2540 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2541 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2542 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2543 * & group is just as nasty as acquiring a reference to the user keyring. */
2545 saved_uid
= getuid();
2546 saved_gid
= getgid();
2548 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2549 if (setregid(gid
, -1) < 0)
2550 return log_unit_error_errno(u
, errno
, "Failed to change GID for user keyring: %m");
2553 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2554 if (setreuid(uid
, -1) < 0) {
2555 r
= log_unit_error_errno(u
, errno
, "Failed to change UID for user keyring: %m");
2560 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
2561 if (keyring
== -1) {
2562 if (errno
== ENOSYS
)
2563 log_unit_debug_errno(u
, errno
, "Kernel keyring not supported, ignoring.");
2564 else if (IN_SET(errno
, EACCES
, EPERM
))
2565 log_unit_debug_errno(u
, errno
, "Kernel keyring access prohibited, ignoring.");
2566 else if (errno
== EDQUOT
)
2567 log_unit_debug_errno(u
, errno
, "Out of kernel keyrings to allocate, ignoring.");
2569 r
= log_unit_error_errno(u
, errno
, "Setting up kernel keyring failed: %m");
2574 /* When requested link the user keyring into the session keyring. */
2575 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
2577 if (keyctl(KEYCTL_LINK
,
2578 KEY_SPEC_USER_KEYRING
,
2579 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
2580 r
= log_unit_error_errno(u
, errno
, "Failed to link user keyring into session keyring: %m");
2585 /* Restore uid/gid back */
2586 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2587 if (setreuid(saved_uid
, -1) < 0) {
2588 r
= log_unit_error_errno(u
, errno
, "Failed to change UID back for user keyring: %m");
2593 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2594 if (setregid(saved_gid
, -1) < 0)
2595 return log_unit_error_errno(u
, errno
, "Failed to change GID back for user keyring: %m");
2598 /* Populate the keyring with the invocation ID by default, as original saved_uid. */
2599 if (!sd_id128_is_null(u
->invocation_id
)) {
2602 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
2604 log_unit_debug_errno(u
, errno
, "Failed to add invocation ID to keyring, ignoring: %m");
2606 if (keyctl(KEYCTL_SETPERM
, key
,
2607 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
2608 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
2609 r
= log_unit_error_errno(u
, errno
, "Failed to restrict invocation ID permission: %m");
2614 /* Revert back uid & gid for the last time, and exit */
2615 /* no extra logging, as only the first already reported error matters */
2616 if (getuid() != saved_uid
)
2617 (void) setreuid(saved_uid
, -1);
2619 if (getgid() != saved_gid
)
2620 (void) setregid(saved_gid
, -1);
/* Appends pair[0] and pair[1] to array, using *n as the next free index and advancing it for each entry
 * stored.
 * NOTE(review): any checks guarding the two stores are not visible in this listing. */
2625 static void append_socket_pair(int *array
, size_t *n
, const int pair
[2]) {
2633 array
[(*n
)++] = pair
[0];
2635 array
[(*n
)++] = pair
[1];
/* Closes all file descriptors except those collected in dont_close[]: stdin_fd/stdout_fd/stderr_fd from
 * params (when not negative), socket_fd, exec_fd, the n_fds entries of fds, the netns storage socket pair
 * of the runtime, the storage socket pairs of the dynamic user and group, and user_lookup_fd (when not
 * negative). Returns what close_all_fds() returns.
 * NOTE(review): the checks guarding socket_fd, exec_fd, fds, runtime and dcreds are not visible in this
 * listing. */
2638 static int close_remaining_fds(
2639 const ExecParameters
*params
,
2640 const ExecRuntime
*runtime
,
2641 const DynamicCreds
*dcreds
,
2645 int *fds
, size_t n_fds
) {
2647 size_t n_dont_close
= 0;
/* 12 extra slots besides fds[]: stdin, stdout, stderr, socket_fd, exec_fd, user_lookup_fd and three
 * socket pairs of two descriptors each */
2648 int dont_close
[n_fds
+ 12];
2652 if (params
->stdin_fd
>= 0)
2653 dont_close
[n_dont_close
++] = params
->stdin_fd
;
2654 if (params
->stdout_fd
>= 0)
2655 dont_close
[n_dont_close
++] = params
->stdout_fd
;
2656 if (params
->stderr_fd
>= 0)
2657 dont_close
[n_dont_close
++] = params
->stderr_fd
;
2660 dont_close
[n_dont_close
++] = socket_fd
;
2662 dont_close
[n_dont_close
++] = exec_fd
;
2664 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
2665 n_dont_close
+= n_fds
;
2669 append_socket_pair(dont_close
, &n_dont_close
, runtime
->netns_storage_socket
);
2673 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->user
->storage_socket
);
2675 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->group
->storage_socket
);
2678 if (user_lookup_fd
>= 0)
2679 dont_close
[n_dont_close
++] = user_lookup_fd
;
2681 return close_all_fds(dont_close
, n_dont_close
);
/* Writes uid, gid and unit->id to user_lookup_fd with a single writev() of three iovecs. A negative
 * user_lookup_fd and the case where neither uid nor gid is valid are checked first.
 * NOTE(review): the parameter list, the statements guarded by those checks and the return statements are
 * not visible in this listing. */
2684 static int send_user_lookup(
2692 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2693 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2696 if (user_lookup_fd
< 0)
2699 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
2702 if (writev(user_lookup_fd
,
2704 IOVEC_INIT(&uid
, sizeof(uid
)),
2705 IOVEC_INIT(&gid
, sizeof(gid
)),
2706 IOVEC_INIT_STRING(unit
->id
) }, 3) < 0)
/* Determines the home directory needed for WorkingDirectory=~. c->working_directory_home is checked
 * first; UID 0 is special-cased with a hard-coded directory, and get_home_dir() is used to fill in *buf.
 * NOTE(review): how *home is assigned and what the function returns is not visible in this listing. */
2712 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
2719 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2724 if (!c
->working_directory_home
)
2728 /* Hardcode /root as home directory for UID 0 */
2733 r
= get_home_dir(buf
);
/* For a unit with c->dynamic_user set (asserted), compiles a list of the configured exec directory paths
 * and hands it to the caller through *ret. Runtime directory entries are built as p->prefix[t] + "/" +
 * path, entries of the other types as p->prefix[t] + "/private/" + path. The configuration directory type
 * is singled out by an explicit check.
 * NOTE(review): the statement guarded by that check and the return statements are not visible in this
 * listing. */
2741 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
2742 _cleanup_strv_free_
char ** list
= NULL
;
2743 ExecDirectoryType t
;
2750 assert(c
->dynamic_user
);
2752 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2753 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2756 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2759 if (t
== EXEC_DIRECTORY_CONFIGURATION
)
2765 STRV_FOREACH(i
, c
->directories
[t
].paths
) {
2768 if (t
== EXEC_DIRECTORY_RUNTIME
)
2769 e
= strjoin(p
->prefix
[t
], "/", *i
);
2771 e
= strjoin(p
->prefix
[t
], "/private/", *i
);
2775 r
= strv_consume(&list
, e
);
2781 *ret
= TAKE_PTR(list
);
2786 static char *exec_command_line(char **argv
);
/* Determines the cgroup path to use for the process and returns whether the ".control" subcgroup is in
 * use. When FLAGS_SET() reports EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL for params->flags
 * the path is params->cgroup_path + "/.control", otherwise a copy of params->cgroup_path.
 * NOTE(review): the handling of an unset cgroup_path, of allocation failure and the assignment to *ret are
 * not visible in this listing. */
2788 static int exec_parameters_get_cgroup_path(const ExecParameters
*params
, char **ret
) {
2789 bool using_subcgroup
;
2795 if (!params
->cgroup_path
)
2798 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
2799 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
2800 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
2801 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
2802 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
2803 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
2804 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
2805 * flag, which is only passed for the former statements, not for the latter. */
2807 using_subcgroup
= FLAGS_SET(params
->flags
, EXEC_CONTROL_CGROUP
|EXEC_CGROUP_DELEGATE
|EXEC_IS_CONTROL
);
2808 if (using_subcgroup
)
2809 p
= strjoin(params
->cgroup_path
, "/.control");
2811 p
= strdup(params
->cgroup_path
);
2816 return using_subcgroup
;
2819 static int exec_child(
2821 const ExecCommand
*command
,
2822 const ExecContext
*context
,
2823 const ExecParameters
*params
,
2824 ExecRuntime
*runtime
,
2825 DynamicCreds
*dcreds
,
2829 size_t n_socket_fds
,
2830 size_t n_storage_fds
,
2835 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **accum_env
= NULL
, **final_argv
= NULL
;
2836 int *fds_with_exec_fd
, n_fds_with_exec_fd
, r
, ngids
= 0, exec_fd
= -1;
2837 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
2838 const char *username
= NULL
, *groupname
= NULL
;
2839 _cleanup_free_
char *home_buffer
= NULL
;
2840 const char *home
= NULL
, *shell
= NULL
;
2841 dev_t journal_stream_dev
= 0;
2842 ino_t journal_stream_ino
= 0;
2843 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2844 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
2845 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
2846 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
2848 _cleanup_free_
char *mac_selinux_context_net
= NULL
;
2849 bool use_selinux
= false;
2852 bool use_smack
= false;
2855 bool use_apparmor
= false;
2857 uid_t uid
= UID_INVALID
;
2858 gid_t gid
= GID_INVALID
;
2860 ExecDirectoryType dt
;
2867 assert(exit_status
);
2869 rename_process_from_path(command
->path
);
2871 /* We reset exactly these signals, since they are the
2872 * only ones we set to SIG_IGN in the main daemon. All
2873 * others we leave untouched because we set them to
2874 * SIG_DFL or a valid handler initially, both of which
2875 * will be demoted to SIG_DFL. */
2876 (void) default_signals(SIGNALS_CRASH_HANDLER
,
2877 SIGNALS_IGNORE
, -1);
2879 if (context
->ignore_sigpipe
)
2880 (void) ignore_signals(SIGPIPE
, -1);
2882 r
= reset_signal_mask();
2884 *exit_status
= EXIT_SIGNAL_MASK
;
2885 return log_unit_error_errno(unit
, r
, "Failed to set process signal mask: %m");
2888 if (params
->idle_pipe
)
2889 do_idle_pipe_dance(params
->idle_pipe
);
2891 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2892 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2893 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2894 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2897 log_set_open_when_needed(true);
2899 /* In case anything used libc syslog(), close this here, too */
2902 n_fds
= n_socket_fds
+ n_storage_fds
;
2903 r
= close_remaining_fds(params
, runtime
, dcreds
, user_lookup_fd
, socket_fd
, params
->exec_fd
, fds
, n_fds
);
2905 *exit_status
= EXIT_FDS
;
2906 return log_unit_error_errno(unit
, r
, "Failed to close unwanted file descriptors: %m");
2909 if (!context
->same_pgrp
)
2911 *exit_status
= EXIT_SETSID
;
2912 return log_unit_error_errno(unit
, errno
, "Failed to create new process session: %m");
2915 exec_context_tty_reset(context
, params
);
2917 if (unit_shall_confirm_spawn(unit
)) {
2918 const char *vc
= params
->confirm_spawn
;
2919 _cleanup_free_
char *cmdline
= NULL
;
2921 cmdline
= exec_command_line(command
->argv
);
2923 *exit_status
= EXIT_MEMORY
;
2927 r
= ask_for_confirmation(vc
, unit
, cmdline
);
2928 if (r
!= CONFIRM_EXECUTE
) {
2929 if (r
== CONFIRM_PRETEND_SUCCESS
) {
2930 *exit_status
= EXIT_SUCCESS
;
2933 *exit_status
= EXIT_CONFIRM
;
2934 log_unit_error(unit
, "Execution cancelled by the user");
2939 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
2940 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
2941 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
2942 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
2943 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
2944 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit
->id
, true) != 0 ||
2945 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit
->manager
) ? "system" : "user", true) != 0) {
2946 *exit_status
= EXIT_MEMORY
;
2947 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
2950 if (context
->dynamic_user
&& dcreds
) {
2951 _cleanup_strv_free_
char **suggested_paths
= NULL
;
2953 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
2954 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
2955 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2956 *exit_status
= EXIT_USER
;
2957 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
2960 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
2962 *exit_status
= EXIT_MEMORY
;
2966 r
= dynamic_creds_realize(dcreds
, suggested_paths
, &uid
, &gid
);
2968 *exit_status
= EXIT_USER
;
2970 log_unit_error(unit
, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2973 return log_unit_error_errno(unit
, r
, "Failed to update dynamic user credentials: %m");
2976 if (!uid_is_valid(uid
)) {
2977 *exit_status
= EXIT_USER
;
2978 log_unit_error(unit
, "UID validation failed for \""UID_FMT
"\"", uid
);
2982 if (!gid_is_valid(gid
)) {
2983 *exit_status
= EXIT_USER
;
2984 log_unit_error(unit
, "GID validation failed for \""GID_FMT
"\"", gid
);
2989 username
= dcreds
->user
->name
;
2992 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
2994 *exit_status
= EXIT_USER
;
2995 return log_unit_error_errno(unit
, r
, "Failed to determine user credentials: %m");
2998 r
= get_fixed_group(context
, &groupname
, &gid
);
3000 *exit_status
= EXIT_GROUP
;
3001 return log_unit_error_errno(unit
, r
, "Failed to determine group credentials: %m");
3005 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3006 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
3007 &supplementary_gids
, &ngids
);
3009 *exit_status
= EXIT_GROUP
;
3010 return log_unit_error_errno(unit
, r
, "Failed to determine supplementary groups: %m");
3013 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
3015 *exit_status
= EXIT_USER
;
3016 return log_unit_error_errno(unit
, r
, "Failed to send user credentials to PID1: %m");
3019 user_lookup_fd
= safe_close(user_lookup_fd
);
3021 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
3023 *exit_status
= EXIT_CHDIR
;
3024 return log_unit_error_errno(unit
, r
, "Failed to determine $HOME for user: %m");
3027 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3028 * must be sure to drop O_NONBLOCK */
3030 (void) fd_nonblock(socket_fd
, false);
3032 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
3034 *exit_status
= EXIT_STDIN
;
3035 return log_unit_error_errno(unit
, r
, "Failed to set up standard input: %m");
3038 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
3040 *exit_status
= EXIT_STDOUT
;
3041 return log_unit_error_errno(unit
, r
, "Failed to set up standard output: %m");
3044 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
3046 *exit_status
= EXIT_STDERR
;
3047 return log_unit_error_errno(unit
, r
, "Failed to set up standard error output: %m");
3050 if (params
->cgroup_path
) {
3051 _cleanup_free_
char *p
= NULL
;
3053 r
= exec_parameters_get_cgroup_path(params
, &p
);
3055 *exit_status
= EXIT_CGROUP
;
3056 return log_unit_error_errno(unit
, r
, "Failed to acquire cgroup path: %m");
3059 r
= cg_attach_everywhere(params
->cgroup_supported
, p
, 0, NULL
, NULL
);
3061 *exit_status
= EXIT_CGROUP
;
3062 return log_unit_error_errno(unit
, r
, "Failed to attach to cgroup %s: %m", p
);
3066 if (context
->oom_score_adjust_set
) {
3067 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3068 * prohibit write access to this file, and we shouldn't trip up over that. */
3069 r
= set_oom_score_adjust(context
->oom_score_adjust
);
3070 if (IN_SET(r
, -EPERM
, -EACCES
))
3071 log_unit_debug_errno(unit
, r
, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3073 *exit_status
= EXIT_OOM_ADJUST
;
3074 return log_unit_error_errno(unit
, r
, "Failed to adjust OOM setting: %m");
3078 if (context
->nice_set
)
3079 if (setpriority(PRIO_PROCESS
, 0, context
->nice
) < 0) {
3080 *exit_status
= EXIT_NICE
;
3081 return log_unit_error_errno(unit
, errno
, "Failed to set up process scheduling priority (nice level): %m");
3084 if (context
->cpu_sched_set
) {
3085 struct sched_param param
= {
3086 .sched_priority
= context
->cpu_sched_priority
,
3089 r
= sched_setscheduler(0,
3090 context
->cpu_sched_policy
|
3091 (context
->cpu_sched_reset_on_fork
?
3092 SCHED_RESET_ON_FORK
: 0),
3095 *exit_status
= EXIT_SETSCHEDULER
;
3096 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU scheduling: %m");
3100 if (context
->cpuset
)
3101 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context
->cpuset_ncpus
), context
->cpuset
) < 0) {
3102 *exit_status
= EXIT_CPUAFFINITY
;
3103 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU affinity: %m");
3106 if (context
->ioprio_set
)
3107 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
3108 *exit_status
= EXIT_IOPRIO
;
3109 return log_unit_error_errno(unit
, errno
, "Failed to set up IO scheduling priority: %m");
3112 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
3113 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
3114 *exit_status
= EXIT_TIMERSLACK
;
3115 return log_unit_error_errno(unit
, errno
, "Failed to set up timer slack: %m");
3118 if (context
->personality
!= PERSONALITY_INVALID
) {
3119 r
= safe_personality(context
->personality
);
3121 *exit_status
= EXIT_PERSONALITY
;
3122 return log_unit_error_errno(unit
, r
, "Failed to set up execution domain (personality): %m");
3126 if (context
->utmp_id
)
3127 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
3129 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
3130 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
3134 if (context
->user
) {
3135 r
= chown_terminal(STDIN_FILENO
, uid
);
3137 *exit_status
= EXIT_STDIN
;
3138 return log_unit_error_errno(unit
, r
, "Failed to change ownership of terminal: %m");
3142 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3143 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3144 * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3145 * touch a single hierarchy too. */
3146 if (params
->cgroup_path
&& context
->user
&& (params
->flags
& EXEC_CGROUP_DELEGATE
)) {
3147 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, uid
, gid
);
3149 *exit_status
= EXIT_CGROUP
;
3150 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
3154 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
3155 r
= setup_exec_directory(context
, params
, uid
, gid
, dt
, exit_status
);
3157 return log_unit_error_errno(unit
, r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
3160 r
= build_environment(
3172 *exit_status
= EXIT_MEMORY
;
3176 r
= build_pass_environment(context
, &pass_env
);
3178 *exit_status
= EXIT_MEMORY
;
3182 accum_env
= strv_env_merge(5,
3183 params
->environment
,
3186 context
->environment
,
3190 *exit_status
= EXIT_MEMORY
;
3193 accum_env
= strv_env_clean(accum_env
);
3195 (void) umask(context
->umask
);
3197 r
= setup_keyring(unit
, context
, params
, uid
, gid
);
3199 *exit_status
= EXIT_KEYRING
;
3200 return log_unit_error_errno(unit
, r
, "Failed to set up kernel keyring: %m");
3203 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3204 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
3206 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3207 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
3209 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3210 if (needs_ambient_hack
)
3211 needs_setuid
= false;
3213 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
3215 if (needs_sandboxing
) {
3216 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3217 * present. The actual MAC context application will happen later, as late as possible, to avoid
3218 * impacting our own code paths. */
3221 use_selinux
= mac_selinux_use();
3224 use_smack
= mac_smack_use();
3227 use_apparmor
= mac_apparmor_use();
3232 if (context
->pam_name
&& username
) {
3233 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
3235 *exit_status
= EXIT_PAM
;
3236 return log_unit_error_errno(unit
, r
, "Failed to set up PAM session: %m");
3241 if (context
->private_network
&& runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
3242 if (ns_type_supported(NAMESPACE_NET
)) {
3243 r
= setup_netns(runtime
->netns_storage_socket
);
3245 *exit_status
= EXIT_NETWORK
;
3246 return log_unit_error_errno(unit
, r
, "Failed to set up network namespacing: %m");
3249 log_unit_warning(unit
, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3252 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
3253 if (needs_mount_namespace
) {
3254 r
= apply_mount_namespace(unit
, command
, context
, params
, runtime
);
3256 *exit_status
= EXIT_NAMESPACE
;
3257 return log_unit_error_errno(unit
, r
, "Failed to set up mount namespacing: %m");
3261 /* Drop groups as early as possible */
3263 r
= enforce_groups(gid
, supplementary_gids
, ngids
);
3265 *exit_status
= EXIT_GROUP
;
3266 return log_unit_error_errno(unit
, r
, "Changing group credentials failed: %m");
3270 if (needs_sandboxing
) {
3272 if (use_selinux
&& params
->selinux_context_net
&& socket_fd
>= 0) {
3273 r
= mac_selinux_get_child_mls_label(socket_fd
, command
->path
, context
->selinux_context
, &mac_selinux_context_net
);
3275 *exit_status
= EXIT_SELINUX_CONTEXT
;
3276 return log_unit_error_errno(unit
, r
, "Failed to determine SELinux context: %m");
3281 if (context
->private_users
) {
3282 r
= setup_private_users(uid
, gid
);
3284 *exit_status
= EXIT_USER
;
3285 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing: %m");
3290 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3291 * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3292 * however if we have it as we want to keep it open until the final execve(). */
3294 if (params
->exec_fd
>= 0) {
3295 exec_fd
= params
->exec_fd
;
3297 if (exec_fd
< 3 + (int) n_fds
) {
3300 /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3301 * process we are about to execute. */
3303 moved_fd
= fcntl(exec_fd
, F_DUPFD_CLOEXEC
, 3 + (int) n_fds
);
3305 *exit_status
= EXIT_FDS
;
3306 return log_unit_error_errno(unit
, errno
, "Couldn't move exec fd up: %m");
3309 safe_close(exec_fd
);
3312 /* This fd should be FD_CLOEXEC already, but let's make sure. */
3313 r
= fd_cloexec(exec_fd
, true);
3315 *exit_status
= EXIT_FDS
;
3316 return log_unit_error_errno(unit
, r
, "Failed to make exec fd FD_CLOEXEC: %m");
3320 fds_with_exec_fd
= newa(int, n_fds
+ 1);
3321 memcpy_safe(fds_with_exec_fd
, fds
, n_fds
* sizeof(int));
3322 fds_with_exec_fd
[n_fds
] = exec_fd
;
3323 n_fds_with_exec_fd
= n_fds
+ 1;
3325 fds_with_exec_fd
= fds
;
3326 n_fds_with_exec_fd
= n_fds
;
3329 r
= close_all_fds(fds_with_exec_fd
, n_fds_with_exec_fd
);
3331 r
= shift_fds(fds
, n_fds
);
3333 r
= flags_fds(fds
, n_socket_fds
, n_storage_fds
, context
->non_blocking
);
3335 *exit_status
= EXIT_FDS
;
3336 return log_unit_error_errno(unit
, r
, "Failed to adjust passed file descriptors: %m");
3339 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3340 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3341 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3344 secure_bits
= context
->secure_bits
;
3346 if (needs_sandboxing
) {
3350 r
= setrlimit_closest_all((const struct rlimit
* const *) context
->rlimit
, &which_failed
);
3352 *exit_status
= EXIT_LIMITS
;
3353 return log_unit_error_errno(unit
, r
, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed
));
3356 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3357 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
3358 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
3359 *exit_status
= EXIT_LIMITS
;
3360 return log_unit_error_errno(unit
, errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3365 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3366 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3368 r
= setup_smack(context
, command
);
3370 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
3371 return log_unit_error_errno(unit
, r
, "Failed to set SMACK process label: %m");
3376 bset
= context
->capability_bounding_set
;
3377 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3378 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3379 * instead of us doing that */
3380 if (needs_ambient_hack
)
3381 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
3382 (UINT64_C(1) << CAP_SETUID
) |
3383 (UINT64_C(1) << CAP_SETGID
);
3385 if (!cap_test_all(bset
)) {
3386 r
= capability_bounding_set_drop(bset
, false);
3388 *exit_status
= EXIT_CAPABILITIES
;
3389 return log_unit_error_errno(unit
, r
, "Failed to drop capabilities: %m");
3393 /* This is done before enforce_user, but ambient set
3394 * does not survive over setresuid() if keep_caps is not set. */
3395 if (!needs_ambient_hack
&&
3396 context
->capability_ambient_set
!= 0) {
3397 r
= capability_ambient_set_apply(context
->capability_ambient_set
, true);
3399 *exit_status
= EXIT_CAPABILITIES
;
3400 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (before UID change): %m");
3406 if (context
->user
) {
3407 r
= enforce_user(context
, uid
);
3409 *exit_status
= EXIT_USER
;
3410 return log_unit_error_errno(unit
, r
, "Failed to change UID to " UID_FMT
": %m", uid
);
3413 if (!needs_ambient_hack
&&
3414 context
->capability_ambient_set
!= 0) {
3416 /* Fix the ambient capabilities after user change. */
3417 r
= capability_ambient_set_apply(context
->capability_ambient_set
, false);
3419 *exit_status
= EXIT_CAPABILITIES
;
3420 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (after UID change): %m");
3423 /* If we were asked to change user and ambient capabilities
3424 * were requested, we had to add keep-caps to the securebits
3425 * so that we would maintain the inherited capability set
3426 * through the setresuid(). Make sure that the bit is added
3427 * also to the context secure_bits so that we don't try to
3428 * drop the bit away next. */
3430 secure_bits
|= 1<<SECURE_KEEP_CAPS
;
3435 /* Apply working directory here, because the working directory might be on NFS and only the user running
3436 * this service might have the correct privilege to change to the working directory */
3437 r
= apply_working_directory(context
, params
, home
, needs_mount_namespace
, exit_status
);
3439 return log_unit_error_errno(unit
, r
, "Changing to the requested working directory failed: %m");
3441 if (needs_sandboxing
) {
3442 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3443 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3444 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3445 * are restricted. */
3449 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
3452 r
= setexeccon(exec_context
);
3454 *exit_status
= EXIT_SELINUX_CONTEXT
;
3455 return log_unit_error_errno(unit
, r
, "Failed to change SELinux context to %s: %m", exec_context
);
3462 if (use_apparmor
&& context
->apparmor_profile
) {
3463 r
= aa_change_onexec(context
->apparmor_profile
);
3464 if (r
< 0 && !context
->apparmor_profile_ignore
) {
3465 *exit_status
= EXIT_APPARMOR_PROFILE
;
3466 return log_unit_error_errno(unit
, errno
, "Failed to prepare AppArmor profile change to %s: %m", context
->apparmor_profile
);
3471 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3472 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3473 if (prctl(PR_GET_SECUREBITS
) != secure_bits
)
3474 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
3475 *exit_status
= EXIT_SECUREBITS
;
3476 return log_unit_error_errno(unit
, errno
, "Failed to set process secure bits: %m");
3479 if (context_has_no_new_privileges(context
))
3480 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
3481 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
3482 return log_unit_error_errno(unit
, errno
, "Failed to disable new privileges: %m");
3486 r
= apply_address_families(unit
, context
);
3488 *exit_status
= EXIT_ADDRESS_FAMILIES
;
3489 return log_unit_error_errno(unit
, r
, "Failed to restrict address families: %m");
3492 r
= apply_memory_deny_write_execute(unit
, context
);
3494 *exit_status
= EXIT_SECCOMP
;
3495 return log_unit_error_errno(unit
, r
, "Failed to disable writing to executable memory: %m");
3498 r
= apply_restrict_realtime(unit
, context
);
3500 *exit_status
= EXIT_SECCOMP
;
3501 return log_unit_error_errno(unit
, r
, "Failed to apply realtime restrictions: %m");
3504 r
= apply_restrict_namespaces(unit
, context
);
3506 *exit_status
= EXIT_SECCOMP
;
3507 return log_unit_error_errno(unit
, r
, "Failed to apply namespace restrictions: %m");
3510 r
= apply_protect_sysctl(unit
, context
);
3512 *exit_status
= EXIT_SECCOMP
;
3513 return log_unit_error_errno(unit
, r
, "Failed to apply sysctl restrictions: %m");
3516 r
= apply_protect_kernel_modules(unit
, context
);
3518 *exit_status
= EXIT_SECCOMP
;
3519 return log_unit_error_errno(unit
, r
, "Failed to apply module loading restrictions: %m");
3522 r
= apply_private_devices(unit
, context
);
3524 *exit_status
= EXIT_SECCOMP
;
3525 return log_unit_error_errno(unit
, r
, "Failed to set up private devices: %m");
3528 r
= apply_syscall_archs(unit
, context
);
3530 *exit_status
= EXIT_SECCOMP
;
3531 return log_unit_error_errno(unit
, r
, "Failed to apply syscall architecture restrictions: %m");
3534 r
= apply_lock_personality(unit
, context
);
3536 *exit_status
= EXIT_SECCOMP
;
3537 return log_unit_error_errno(unit
, r
, "Failed to lock personalities: %m");
3540 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3541 * by the filter as little as possible. */
3542 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
3544 *exit_status
= EXIT_SECCOMP
;
3545 return log_unit_error_errno(unit
, r
, "Failed to apply system call filters: %m");
3550 if (!strv_isempty(context
->unset_environment
)) {
3553 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
3555 *exit_status
= EXIT_MEMORY
;
3559 strv_free_and_replace(accum_env
, ee
);
3562 final_argv
= replace_env_argv(command
->argv
, accum_env
);
3564 *exit_status
= EXIT_MEMORY
;
3568 if (DEBUG_LOGGING
) {
3569 _cleanup_free_
char *line
;
3571 line
= exec_command_line(final_argv
);
3573 log_struct(LOG_DEBUG
,
3574 "EXECUTABLE=%s", command
->path
,
3575 LOG_UNIT_MESSAGE(unit
, "Executing: %s", line
),
3577 LOG_UNIT_INVOCATION_ID(unit
));
3583 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3584 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3586 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
3587 *exit_status
= EXIT_EXEC
;
3588 return log_unit_error_errno(unit
, errno
, "Failed to enable exec_fd: %m");
3592 execve(command
->path
, final_argv
, accum_env
);
3598 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3599 * that POLLHUP on it no longer means execve() succeeded. */
3601 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
3602 *exit_status
= EXIT_EXEC
;
3603 return log_unit_error_errno(unit
, errno
, "Failed to disable exec_fd: %m");
3607 if (r
== -ENOENT
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
)) {
3608 log_struct_errno(LOG_INFO
, r
,
3609 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3611 LOG_UNIT_INVOCATION_ID(unit
),
3612 LOG_UNIT_MESSAGE(unit
, "Executable %s missing, skipping: %m",
3614 "EXECUTABLE=%s", command
->path
);
3618 *exit_status
= EXIT_EXEC
;
3619 return log_unit_error_errno(unit
, r
, "Failed to execute command: %m");
/* Forward declarations: both helpers are defined further down in this file, but exec_spawn() below already
 * calls them. */
3622 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***l
);
3623 static int exec_context_named_iofds(const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[3]);
/* Prepares and launches one command of a unit. From what is visible in this listing the function:
 *
 *   - picks the socket to wire to stdio: if stdin, stdout or stderr is configured as a socket, the socket count in
 *     params is checked ("Got more than one socket." / "Got no socket." are logged) and params->fds[0] becomes
 *     socket_fd;
 *   - resolves the named stdio fds and loads the environment files, reporting a failure of either helper through
 *     log_unit_error_errno();
 *   - when params->cgroup_path is set, looks up the sub-cgroup path and creates that cgroup if a child cgroup is
 *     in use;
 *   - runs exec_child() and emits a structured "Failed at step ... spawning ..." record for its error;
 *   - logs the forked PID, attaches that PID to the cgroup and records the start in command->exec_status.
 *
 * NOTE(review): the line numbering embedded in this listing has gaps inside this function (e.g. 3630 -> 3633,
 * 3694 -> 3700, 3705 -> 3717, 3741 -> 3747), so some statements are not visible here - presumably the remaining
 * parameters, the fork() call, most exec_child() arguments and the return statements. Confirm against the complete
 * source before relying on this summary. */
3625 int exec_spawn(Unit
*unit
,
3626 ExecCommand
*command
,
3627 const ExecContext
*context
,
3628 const ExecParameters
*params
,
3629 ExecRuntime
*runtime
,
3630 DynamicCreds
*dcreds
,
3633 int socket_fd
, r
, named_iofds
[3] = { -1, -1, -1 }, *fds
= NULL
;
3634 _cleanup_free_
char *subcgroup_path
= NULL
;
3635 _cleanup_strv_free_
char **files_env
= NULL
;
3636 size_t n_storage_fds
= 0, n_socket_fds
= 0;
3637 _cleanup_free_
char *line
= NULL
;
/* An fd array must be supplied whenever the socket and storage fd counts add up to more than zero. */
3645 assert(params
->fds
|| (params
->n_socket_fds
+ params
->n_storage_fds
<= 0));
/* Socket-backed stdio: the visible checks reject both "more than one" and "no" socket. */
3647 if (context
->std_input
== EXEC_INPUT_SOCKET
||
3648 context
->std_output
== EXEC_OUTPUT_SOCKET
||
3649 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
3651 if (params
->n_socket_fds
> 1) {
3652 log_unit_error(unit
, "Got more than one socket.");
3656 if (params
->n_socket_fds
== 0) {
3657 log_unit_error(unit
, "Got no socket.");
3661 socket_fd
= params
->fds
[0];
3665 n_socket_fds
= params
->n_socket_fds
;
3666 n_storage_fds
= params
->n_storage_fds
;
/* Look up the fds that were configured by name for stdin/stdout/stderr; named_iofds[] starts out as -1/-1/-1. */
3669 r
= exec_context_named_iofds(context
, params
, named_iofds
);
3671 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
3673 r
= exec_context_load_environment(unit
, context
, &files_env
);
3675 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
/* Render argv as a single string for the debug record below. */
3677 line
= exec_command_line(command
->argv
);
3681 log_struct(LOG_DEBUG
,
3682 LOG_UNIT_MESSAGE(unit
, "About to execute: %s", line
),
3683 "EXECUTABLE=%s", command
->path
,
3685 LOG_UNIT_INVOCATION_ID(unit
));
3687 if (params
->cgroup_path
) {
3688 r
= exec_parameters_get_cgroup_path(params
, &subcgroup_path
);
3690 return log_unit_error_errno(unit
, r
, "Failed to acquire subcgroup path: %m");
3691 if (r
> 0) { /* We are using a child cgroup */
3692 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
);
3694 return log_unit_error_errno(unit
, r
, "Failed to create control group '%s': %m", subcgroup_path
);
3700 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
/* exit_status starts out as EXIT_SUCCESS; its value is turned into the step name in the failure record below. */
3703 int exit_status
= EXIT_SUCCESS
;
3705 r
= exec_child(unit
,
3717 unit
->manager
->user_lookup_fds
[1],
3721 log_struct_errno(LOG_ERR
, r
,
3722 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3724 LOG_UNIT_INVOCATION_ID(unit
),
3725 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
3726 exit_status_to_string(exit_status
, EXIT_STATUS_SYSTEMD
),
3728 "EXECUTABLE=%s", command
->path
);
3733 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
3735 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
3736 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
3737 * process will be killed too). */
3739 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
, pid
);
3741 exec_status_start(&command
->exec_status
, pid
);
/* Fills an ExecContext with its default settings. Only the assignments visible below are described here.
 *
 * NOTE(review): the embedded numbering jumps from 3748 to 3753, so one or more statements at the top of the
 * body are not part of this listing; check the complete source for further defaults. */
3747 void exec_context_init(ExecContext
*c
) {
3748 ExecDirectoryType i
;
3753 c
->ioprio
= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 0);
3754 c
->cpu_sched_policy
= SCHED_OTHER
;
3755 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
3756 c
->syslog_level_prefix
= true;
3757 c
->ignore_sigpipe
= true;
/* NSEC_INFINITY and PERSONALITY_INVALID act as "not configured" markers: exec_child() only applies the timer
 * slack and the personality when the fields differ from exactly these values. */
3758 c
->timer_slack_nsec
= NSEC_INFINITY
;
3759 c
->personality
= PERSONALITY_INVALID
;
/* Every directory type starts out with mode 0755. */
3760 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3761 c
->directories
[i
].mode
= 0755;
3762 c
->capability_bounding_set
= CAP_ALL
;
/* Compile-time check that the initial namespace flag value is distinct from the "all" mask. */
3763 assert_cc(NAMESPACE_FLAGS_INITIAL
!= NAMESPACE_FLAGS_ALL
);
3764 c
->restrict_namespaces
= NAMESPACE_FLAGS_INITIAL
;
3765 c
->log_level_max
= -1;
/* Releases the resources referenced by an ExecContext and resets the matching fields. Each pointer member is
 * overwritten with the return value of its release helper (strv_free(), mfree(), hashmap_free(), set_free(),
 * cpu_set_mfree()) - presumably NULL, confirm against those helpers - while arrays released through a
 * *_free_many() call are explicitly set to NULL and their element counts to 0.
 *
 * NOTE(review): the embedded numbering jumps from 3769 to 3774; the declaration of the index 'l' used by the
 * first loop is among the lines missing from this listing. */
3768 void exec_context_done(ExecContext
*c
) {
3769 ExecDirectoryType i
;
/* Environment related string vectors. */
3774 c
->environment
= strv_free(c
->environment
);
3775 c
->environment_files
= strv_free(c
->environment_files
);
3776 c
->pass_environment
= strv_free(c
->pass_environment
);
3777 c
->unset_environment
= strv_free(c
->unset_environment
);
3779 rlimit_free_all(c
->rlimit
);
/* Three slots each, indexed like stdin/stdout/stderr elsewhere in this file. */
3781 for (l
= 0; l
< 3; l
++) {
3782 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
3783 c
->stdio_file
[l
] = mfree(c
->stdio_file
[l
]);
3786 c
->working_directory
= mfree(c
->working_directory
);
3787 c
->root_directory
= mfree(c
->root_directory
);
3788 c
->root_image
= mfree(c
->root_image
);
3789 c
->tty_path
= mfree(c
->tty_path
);
3790 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
3791 c
->user
= mfree(c
->user
);
3792 c
->group
= mfree(c
->group
);
3794 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
3796 c
->pam_name
= mfree(c
->pam_name
);
3798 c
->read_only_paths
= strv_free(c
->read_only_paths
);
3799 c
->read_write_paths
= strv_free(c
->read_write_paths
);
3800 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
/* Arrays with an explicit element count: release, then reset both the pointer and the count. */
3802 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
3803 c
->bind_mounts
= NULL
;
3804 c
->n_bind_mounts
= 0;
3805 temporary_filesystem_free_many(c
->temporary_filesystems
, c
->n_temporary_filesystems
);
3806 c
->temporary_filesystems
= NULL
;
3807 c
->n_temporary_filesystems
= 0;
3809 c
->cpuset
= cpu_set_mfree(c
->cpuset
);
3811 c
->utmp_id
= mfree(c
->utmp_id
);
3812 c
->selinux_context
= mfree(c
->selinux_context
);
3813 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
3814 c
->smack_process_label
= mfree(c
->smack_process_label
);
3816 c
->syscall_filter
= hashmap_free(c
->syscall_filter
);
3817 c
->syscall_archs
= set_free(c
->syscall_archs
);
3818 c
->address_families
= set_free(c
->address_families
);
/* Only the path lists of the per-type directories are released in this loop. */
3820 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3821 c
->directories
[i
].paths
= strv_free(c
->directories
[i
].paths
);
/* Same value exec_context_init() assigns to log_level_max. */
3823 c
->log_level_max
= -1;
3825 exec_context_free_log_extra_fields(c
);
3827 c
->log_rate_limit_interval_usec
= 0;
3828 c
->log_rate_limit_burst
= 0;
3830 c
->stdin_data
= mfree(c
->stdin_data
);
3831 c
->stdin_data_size
= 0;
/* Removes the runtime directories of an execution context: every entry of
 * c->directories[EXEC_DIRECTORY_RUNTIME].paths is joined to runtime_prefix with a "/" separator and the resulting
 * path is handed to rm_rf() with REMOVE_ROOT. The rm_rf() result is deliberately discarded via the (void) cast.
 * A missing runtime_prefix is tested for explicitly before the loop.
 *
 * NOTE(review): this listing has numbering gaps (3834 -> 3839, 3839 -> 3842, 3845 -> 3849, 3849 -> 3851); the
 * declaration of 'i', the statement guarded by the runtime_prefix check, any check of the strjoin() result, the
 * end of the last comment and the final return are not visible. Note also that 'p' carries _cleanup_free_ but no
 * initializer; in the visible code it is assigned by strjoin() right after its declaration. */
3834 int exec_context_destroy_runtime_directory(const ExecContext
*c
, const char *runtime_prefix
) {
3839 if (!runtime_prefix
)
3842 STRV_FOREACH(i
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) {
3843 _cleanup_free_
char *p
;
3845 p
= strjoin(runtime_prefix
, "/", *i
);
3849 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3851 (void) rm_rf(p
, REMOVE_ROOT
);
/* Releases the members owned by a single ExecCommand: the path string goes through mfree() and the argument
 * vector through strv_free(), with each field overwritten by the helper's return value. The ExecCommand
 * structure itself is not released by the statements visible here.
 *
 * NOTE(review): line 3858 of the original numbering is missing from this listing. */
3857 static void exec_command_done(ExecCommand
*c
) {
3860 c
->path
= mfree(c
->path
);
3861 c
->argv
= strv_free(c
->argv
);
3864 void exec_command_done_array(ExecCommand
*c
, size_t n
) {
3867 for (i
= 0; i
< n
; i
++)
3868 exec_command_done(c
+i
);
/* Tears down a list of ExecCommand entries. In the visible statements an entry 'i' is unlinked from the list
 * headed by 'c' with LIST_REMOVE() and its members are released with exec_command_done().
 *
 * NOTE(review): the surrounding loop, the declaration of 'i', the release of the entry itself and the return
 * statement are missing from this listing (numbering gaps 3871 -> 3875 and after 3876). exec_command_free_array()
 * stores the return value back into the list head, so presumably an empty-list value is returned - confirm. */
3871 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
3875 LIST_REMOVE(command
, c
, i
);
3876 exec_command_done(i
);
3883 void exec_command_free_array(ExecCommand
**c
, size_t n
) {
3886 for (i
= 0; i
< n
; i
++)
3887 c
[i
] = exec_command_free_list(c
[i
]);
3890 void exec_command_reset_status_array(ExecCommand
*c
, size_t n
) {
3893 for (i
= 0; i
< n
; i
++)
3894 exec_status_reset(&c
[i
].exec_status
);
3897 void exec_command_reset_status_list_array(ExecCommand
**c
, size_t n
) {
3900 for (i
= 0; i
< n
; i
++) {
3903 LIST_FOREACH(command
, z
, c
[i
])
3904 exec_status_reset(&z
->exec_status
);
/* Context object handed to invalid_env() as its userdata pointer. invalid_env() reads the members 'unit' and
 * 'path' from it.
 *
 * NOTE(review): the member declarations and the closing of this typedef are missing from this listing. */
3908 typedef struct InvalidEnvInfo
{
3913 static void invalid_env(const char *p
, void *userdata
) {
3914 InvalidEnvInfo
*info
= userdata
;
3916 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
/* Yields the fd name associated with one of the three stdio slots. For each slot the visible code first tests
 * whether the stream is configured in named-fd mode (EXEC_INPUT_NAMED_FD / EXEC_OUTPUT_NAMED_FD) and otherwise
 * returns the configured c->stdio_fdname[] entry, falling back to the literal "stdin", "stdout" or "stderr" when
 * that entry is unset (GNU "?:" operator with omitted middle operand).
 *
 * NOTE(review): the dispatch on fd_index and the statements taken when a slot is not in named-fd mode are
 * missing from this listing (gaps 3919 -> 3925, 3925 -> 3928, ...). exec_context_named_iofds() tests the result
 * for being non-NULL, so presumably NULL is returned in that case - confirm against the complete source. */
3919 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
3925 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
3928 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
3931 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
3934 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
3937 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
3940 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
/* Resolves the fds that stdin/stdout/stderr refer to by name.
 *
 * 'targets' starts as the number of streams configured in named-fd mode. The fds passed in p (storage fds plus
 * socket fds) are then scanned in order; an fd whose name in p->fd_names[] equals the name of a stream that is in
 * named-fd mode and still unresolved (named_iofds[] slot < 0) is stored in that slot. Because the three tests form
 * an if / else-if chain, a single fd fills at most one slot per iteration, with stdin checked first.
 *
 * Returns 0 if 'targets' is 0 at the end, -ENOENT otherwise.
 *
 * NOTE(review): the declarations of 'i', 'targets' and 'n_fds' and the statements following each assignment are
 * missing from this listing (gaps such as 3970 -> 3973); given the loop condition and the return expression,
 * presumably 'targets' is decremented there - confirm against the complete source. */
3947 static int exec_context_named_iofds(const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[3]) {
3949 const char* stdio_fdname
[3];
/* Each comparison contributes 0 or 1, so this counts the streams that need a named fd. */
3955 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
3956 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
3957 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
/* Look up the name to search for, once per stdio slot. */
3959 for (i
= 0; i
< 3; i
++)
3960 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
3962 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
/* Stop early once nothing is left to resolve. */
3964 for (i
= 0; i
< n_fds
&& targets
> 0; i
++)
3965 if (named_iofds
[STDIN_FILENO
] < 0 &&
3966 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
3967 stdio_fdname
[STDIN_FILENO
] &&
3968 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
3970 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
3973 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
3974 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
3975 stdio_fdname
[STDOUT_FILENO
] &&
3976 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
3978 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
3981 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
3982 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
3983 stdio_fdname
[STDERR_FILENO
] &&
3984 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
3986 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
3990 return targets
== 0 ? 0 : -ENOENT
;
/* Builds the environment contributed by the files listed in c->environment_files. Visible steps per entry: the
 * file name is tested with path_is_absolute(), expanded with safe_glob(), every match is parsed with
 * load_env_file(), the parsed vector is filtered with strv_env_clean_with_callback() (invalid_env() logs each
 * rejected assignment together with the file it came from) and merged with the accumulated vector 'r' through
 * strv_env_merge().
 *
 * NOTE(review): large parts of this function are missing from this listing (numbering gaps such as
 * 3994 -> 3999, 4005 -> 4014, 4023 -> 4032, 4059 -> 4075): the declarations of 'fn', 'k', 'n', 'p' and 'm', the
 * use of the 'ignore' flag, all error paths and the hand-over of the result through 'l' cannot be seen here. */
3993 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***l
) {
3994 char **i
, **r
= NULL
;
3999 STRV_FOREACH(i
, c
->environment_files
) {
4003 bool ignore
= false;
4005 _cleanup_globfree_ glob_t pglob
= {};
4014 if (!path_is_absolute(fn
)) {
4022 /* Filename supports globbing, take all matching files */
4023 k
= safe_glob(fn
, 0, &pglob
);
4032 /* When we don't match anything, -ENOENT should be returned */
4033 assert(pglob
.gl_pathc
> 0);
4035 for (n
= 0; n
< pglob
.gl_pathc
; n
++) {
4036 k
= load_env_file(NULL
, pglob
.gl_pathv
[n
], &p
);
4044 /* Log invalid environment variables with filename */
4046 InvalidEnvInfo info
= {
4048 .path
= pglob
.gl_pathv
[n
]
4051 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
4059 m
= strv_env_merge(2, r
, p
);
/* Tells whether the given tty path may refer to the same device as /dev/console. The name is first passed through
 * skip_dev_prefix(), then compared against "console"; otherwise the console is resolved with
 * resolve_dev_console() and the answer is true if resolution fails, if the resolved name equals the tty, or if the
 * console resolves to "tty0" and the tty is a virtual console according to tty_is_vc().
 *
 * NOTE(review): lines between 4076 and 4081 and after 4084 are missing from this listing; the statement executed
 * when the name equals "console" (presumably "return true") is not visible. */
4075 static bool tty_may_match_dev_console(const char *tty
) {
4076 _cleanup_free_
char *resolved
= NULL
;
4081 tty
= skip_dev_prefix(tty
);
4083 /* trivial identity? */
4084 if (streq(tty
, "console"))
4087 if (resolve_dev_console(&resolved
) < 0)
4088 return true; /* if we could not resolve, assume it may */
4090 /* "tty0" means the active VC, so it may be the same sometimes */
4091 return streq(resolved
, tty
) || (streq(resolved
, "tty0") && tty_is_vc(tty
));
4094 bool exec_context_may_touch_console(const ExecContext
*ec
) {
4096 return (ec
->tty_reset
||
4098 ec
->tty_vt_disallocate
||
4099 is_terminal_input(ec
->std_input
) ||
4100 is_terminal_output(ec
->std_output
) ||
4101 is_terminal_output(ec
->std_error
)) &&
4102 tty_may_match_dev_console(exec_context_tty_path(ec
));
4105 static void strv_fprintf(FILE *f
, char **l
) {
4111 fprintf(f
, " %s", *g
);
4114 void exec_context_dump(const ExecContext
*c
, FILE* f
, const char *prefix
) {
4115 ExecDirectoryType dt
;
4123 prefix
= strempty(prefix
);
4127 "%sWorkingDirectory: %s\n"
4128 "%sRootDirectory: %s\n"
4129 "%sNonBlocking: %s\n"
4130 "%sPrivateTmp: %s\n"
4131 "%sPrivateDevices: %s\n"
4132 "%sProtectKernelTunables: %s\n"
4133 "%sProtectKernelModules: %s\n"
4134 "%sProtectControlGroups: %s\n"
4135 "%sPrivateNetwork: %s\n"
4136 "%sPrivateUsers: %s\n"
4137 "%sProtectHome: %s\n"
4138 "%sProtectSystem: %s\n"
4139 "%sMountAPIVFS: %s\n"
4140 "%sIgnoreSIGPIPE: %s\n"
4141 "%sMemoryDenyWriteExecute: %s\n"
4142 "%sRestrictRealtime: %s\n"
4143 "%sKeyringMode: %s\n",
4145 prefix
, c
->working_directory
? c
->working_directory
: "/",
4146 prefix
, c
->root_directory
? c
->root_directory
: "/",
4147 prefix
, yes_no(c
->non_blocking
),
4148 prefix
, yes_no(c
->private_tmp
),
4149 prefix
, yes_no(c
->private_devices
),
4150 prefix
, yes_no(c
->protect_kernel_tunables
),
4151 prefix
, yes_no(c
->protect_kernel_modules
),
4152 prefix
, yes_no(c
->protect_control_groups
),
4153 prefix
, yes_no(c
->private_network
),
4154 prefix
, yes_no(c
->private_users
),
4155 prefix
, protect_home_to_string(c
->protect_home
),
4156 prefix
, protect_system_to_string(c
->protect_system
),
4157 prefix
, yes_no(c
->mount_apivfs
),
4158 prefix
, yes_no(c
->ignore_sigpipe
),
4159 prefix
, yes_no(c
->memory_deny_write_execute
),
4160 prefix
, yes_no(c
->restrict_realtime
),
4161 prefix
, exec_keyring_mode_to_string(c
->keyring_mode
));
4164 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
4166 STRV_FOREACH(e
, c
->environment
)
4167 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
4169 STRV_FOREACH(e
, c
->environment_files
)
4170 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
4172 STRV_FOREACH(e
, c
->pass_environment
)
4173 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
4175 STRV_FOREACH(e
, c
->unset_environment
)
4176 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
4178 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
4180 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
4181 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
4183 STRV_FOREACH(d
, c
->directories
[dt
].paths
)
4184 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), *d
);
4192 if (c
->oom_score_adjust_set
)
4194 "%sOOMScoreAdjust: %i\n",
4195 prefix
, c
->oom_score_adjust
);
4197 for (i
= 0; i
< RLIM_NLIMITS
; i
++)
4199 fprintf(f
, "%sLimit%s: " RLIM_FMT
"\n",
4200 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
4201 fprintf(f
, "%sLimit%sSoft: " RLIM_FMT
"\n",
4202 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
4205 if (c
->ioprio_set
) {
4206 _cleanup_free_
char *class_str
= NULL
;
4208 r
= ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c
->ioprio
), &class_str
);
4210 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
4212 fprintf(f
, "%sIOPriority: %lu\n", prefix
, IOPRIO_PRIO_DATA(c
->ioprio
));
4215 if (c
->cpu_sched_set
) {
4216 _cleanup_free_
char *policy_str
= NULL
;
4218 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
4220 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
4223 "%sCPUSchedulingPriority: %i\n"
4224 "%sCPUSchedulingResetOnFork: %s\n",
4225 prefix
, c
->cpu_sched_priority
,
4226 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
4230 fprintf(f
, "%sCPUAffinity:", prefix
);
4231 for (i
= 0; i
< c
->cpuset_ncpus
; i
++)
4232 if (CPU_ISSET_S(i
, CPU_ALLOC_SIZE(c
->cpuset_ncpus
), c
->cpuset
))
4233 fprintf(f
, " %u", i
);
4237 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
4238 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
4241 "%sStandardInput: %s\n"
4242 "%sStandardOutput: %s\n"
4243 "%sStandardError: %s\n",
4244 prefix
, exec_input_to_string(c
->std_input
),
4245 prefix
, exec_output_to_string(c
->std_output
),
4246 prefix
, exec_output_to_string(c
->std_error
));
4248 if (c
->std_input
== EXEC_INPUT_NAMED_FD
)
4249 fprintf(f
, "%sStandardInputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDIN_FILENO
]);
4250 if (c
->std_output
== EXEC_OUTPUT_NAMED_FD
)
4251 fprintf(f
, "%sStandardOutputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDOUT_FILENO
]);
4252 if (c
->std_error
== EXEC_OUTPUT_NAMED_FD
)
4253 fprintf(f
, "%sStandardErrorFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDERR_FILENO
]);
4255 if (c
->std_input
== EXEC_INPUT_FILE
)
4256 fprintf(f
, "%sStandardInputFile: %s\n", prefix
, c
->stdio_file
[STDIN_FILENO
]);
4257 if (c
->std_output
== EXEC_OUTPUT_FILE
)
4258 fprintf(f
, "%sStandardOutputFile: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
4259 if (c
->std_output
== EXEC_OUTPUT_FILE_APPEND
)
4260 fprintf(f
, "%sStandardOutputFileToAppend: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
4261 if (c
->std_error
== EXEC_OUTPUT_FILE
)
4262 fprintf(f
, "%sStandardErrorFile: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
4263 if (c
->std_error
== EXEC_OUTPUT_FILE_APPEND
)
4264 fprintf(f
, "%sStandardErrorFileToAppend: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
4270 "%sTTYVHangup: %s\n"
4271 "%sTTYVTDisallocate: %s\n",
4272 prefix
, c
->tty_path
,
4273 prefix
, yes_no(c
->tty_reset
),
4274 prefix
, yes_no(c
->tty_vhangup
),
4275 prefix
, yes_no(c
->tty_vt_disallocate
));
4277 if (IN_SET(c
->std_output
,
4280 EXEC_OUTPUT_JOURNAL
,
4281 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
4282 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
4283 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
4284 IN_SET(c
->std_error
,
4287 EXEC_OUTPUT_JOURNAL
,
4288 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
4289 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
4290 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
4292 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
4294 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
4296 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
4298 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
4300 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
4303 if (c
->log_level_max
>= 0) {
4304 _cleanup_free_
char *t
= NULL
;
4306 (void) log_level_to_string_alloc(c
->log_level_max
, &t
);
4308 fprintf(f
, "%sLogLevelMax: %s\n", prefix
, strna(t
));
4311 if (c
->log_rate_limit_interval_usec
> 0) {
4312 char buf_timespan
[FORMAT_TIMESPAN_MAX
];
4315 "%sLogRateLimitIntervalSec: %s\n",
4316 prefix
, format_timespan(buf_timespan
, sizeof(buf_timespan
), c
->log_rate_limit_interval_usec
, USEC_PER_SEC
));
4319 if (c
->log_rate_limit_burst
> 0)
4320 fprintf(f
, "%sLogRateLimitBurst: %u\n", prefix
, c
->log_rate_limit_burst
);
4322 if (c
->n_log_extra_fields
> 0) {
4325 for (j
= 0; j
< c
->n_log_extra_fields
; j
++) {
4326 fprintf(f
, "%sLogExtraFields: ", prefix
);
4327 fwrite(c
->log_extra_fields
[j
].iov_base
,
4328 1, c
->log_extra_fields
[j
].iov_len
,
4334 if (c
->secure_bits
) {
4335 _cleanup_free_
char *str
= NULL
;
4337 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
4339 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
4342 if (c
->capability_bounding_set
!= CAP_ALL
) {
4343 _cleanup_free_
char *str
= NULL
;
4345 r
= capability_set_to_string_alloc(c
->capability_bounding_set
, &str
);
4347 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
4350 if (c
->capability_ambient_set
!= 0) {
4351 _cleanup_free_
char *str
= NULL
;
4353 r
= capability_set_to_string_alloc(c
->capability_ambient_set
, &str
);
4355 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
4359 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
4361 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
4363 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
4365 if (!strv_isempty(c
->supplementary_groups
)) {
4366 fprintf(f
, "%sSupplementaryGroups:", prefix
);
4367 strv_fprintf(f
, c
->supplementary_groups
);
4372 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
4374 if (!strv_isempty(c
->read_write_paths
)) {
4375 fprintf(f
, "%sReadWritePaths:", prefix
);
4376 strv_fprintf(f
, c
->read_write_paths
);
4380 if (!strv_isempty(c
->read_only_paths
)) {
4381 fprintf(f
, "%sReadOnlyPaths:", prefix
);
4382 strv_fprintf(f
, c
->read_only_paths
);
4386 if (!strv_isempty(c
->inaccessible_paths
)) {
4387 fprintf(f
, "%sInaccessiblePaths:", prefix
);
4388 strv_fprintf(f
, c
->inaccessible_paths
);
4392 if (c
->n_bind_mounts
> 0)
4393 for (i
= 0; i
< c
->n_bind_mounts
; i
++)
4394 fprintf(f
, "%s%s: %s%s:%s:%s\n", prefix
,
4395 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
4396 c
->bind_mounts
[i
].ignore_enoent
? "-": "",
4397 c
->bind_mounts
[i
].source
,
4398 c
->bind_mounts
[i
].destination
,
4399 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
4401 if (c
->n_temporary_filesystems
> 0)
4402 for (i
= 0; i
< c
->n_temporary_filesystems
; i
++) {
4403 TemporaryFileSystem
*t
= c
->temporary_filesystems
+ i
;
4405 fprintf(f
, "%sTemporaryFileSystem: %s%s%s\n", prefix
,
4407 isempty(t
->options
) ? "" : ":",
4408 strempty(t
->options
));
4413 "%sUtmpIdentifier: %s\n",
4414 prefix
, c
->utmp_id
);
4416 if (c
->selinux_context
)
4418 "%sSELinuxContext: %s%s\n",
4419 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
4421 if (c
->apparmor_profile
)
4423 "%sAppArmorProfile: %s%s\n",
4424 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
4426 if (c
->smack_process_label
)
4428 "%sSmackProcessLabel: %s%s\n",
4429 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
4431 if (c
->personality
!= PERSONALITY_INVALID
)
4433 "%sPersonality: %s\n",
4434 prefix
, strna(personality_to_string(c
->personality
)));
4437 "%sLockPersonality: %s\n",
4438 prefix
, yes_no(c
->lock_personality
));
4440 if (c
->syscall_filter
) {
4448 "%sSystemCallFilter: ",
4451 if (!c
->syscall_whitelist
)
4455 HASHMAP_FOREACH_KEY(val
, id
, c
->syscall_filter
, j
) {
4456 _cleanup_free_
char *name
= NULL
;
4457 const char *errno_name
= NULL
;
4458 int num
= PTR_TO_INT(val
);
4465 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
4466 fputs(strna(name
), f
);
4469 errno_name
= errno_to_name(num
);
4471 fprintf(f
, ":%s", errno_name
);
4473 fprintf(f
, ":%d", num
);
4481 if (c
->syscall_archs
) {
4488 "%sSystemCallArchitectures:",
4492 SET_FOREACH(id
, c
->syscall_archs
, j
)
4493 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
4498 if (exec_context_restrict_namespaces_set(c
)) {
4499 _cleanup_free_
char *s
= NULL
;
4501 r
= namespace_flags_to_string(c
->restrict_namespaces
, &s
);
4503 fprintf(f
, "%sRestrictNamespaces: %s\n",
4507 if (c
->syscall_errno
> 0) {
4508 const char *errno_name
;
4510 fprintf(f
, "%sSystemCallErrorNumber: ", prefix
);
4512 errno_name
= errno_to_name(c
->syscall_errno
);
4514 fprintf(f
, "%s\n", errno_name
);
4516 fprintf(f
, "%d\n", c
->syscall_errno
);
4519 if (c
->apparmor_profile
)
4521 "%sAppArmorProfile: %s%s\n",
4522 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
4525 bool exec_context_maintains_privileges(const ExecContext
*c
) {
4528 /* Returns true if the process forked off would run under
4529 * an unchanged UID or as root. */
4534 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
4540 int exec_context_get_effective_ioprio(const ExecContext
*c
) {
4548 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
4550 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 4);
4555 void exec_context_free_log_extra_fields(ExecContext
*c
) {
4560 for (l
= 0; l
< c
->n_log_extra_fields
; l
++)
4561 free(c
->log_extra_fields
[l
].iov_base
);
4562 c
->log_extra_fields
= mfree(c
->log_extra_fields
);
4563 c
->n_log_extra_fields
= 0;
4566 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
4573 dual_timestamp_get(&s
->start_timestamp
);
4576 void exec_status_exit(ExecStatus
*s
, const ExecContext
*context
, pid_t pid
, int code
, int status
) {
4579 if (s
->pid
!= pid
) {
4585 dual_timestamp_get(&s
->exit_timestamp
);
4591 if (context
->utmp_id
)
4592 (void) utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
4594 exec_context_tty_reset(context
, NULL
);
4598 void exec_status_reset(ExecStatus
*s
) {
4601 *s
= (ExecStatus
) {};
4604 void exec_status_dump(const ExecStatus
*s
, FILE *f
, const char *prefix
) {
4605 char buf
[FORMAT_TIMESTAMP_MAX
];
4613 prefix
= strempty(prefix
);
4616 "%sPID: "PID_FMT
"\n",
4619 if (dual_timestamp_is_set(&s
->start_timestamp
))
4621 "%sStart Timestamp: %s\n",
4622 prefix
, format_timestamp(buf
, sizeof(buf
), s
->start_timestamp
.realtime
));
4624 if (dual_timestamp_is_set(&s
->exit_timestamp
))
4626 "%sExit Timestamp: %s\n"
4628 "%sExit Status: %i\n",
4629 prefix
, format_timestamp(buf
, sizeof(buf
), s
->exit_timestamp
.realtime
),
4630 prefix
, sigchld_code_to_string(s
->code
),
4634 static char *exec_command_line(char **argv
) {
4642 STRV_FOREACH(a
, argv
)
4650 STRV_FOREACH(a
, argv
) {
4657 if (strpbrk(*a
, WHITESPACE
)) {
4668 /* FIXME: this doesn't really handle arguments that have
4669 * spaces and ticks in them */
4674 static void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4675 _cleanup_free_
char *cmd
= NULL
;
4676 const char *prefix2
;
4681 prefix
= strempty(prefix
);
4682 prefix2
= strjoina(prefix
, "\t");
4684 cmd
= exec_command_line(c
->argv
);
4686 "%sCommand Line: %s\n",
4687 prefix
, cmd
? cmd
: strerror(ENOMEM
));
4689 exec_status_dump(&c
->exec_status
, f
, prefix2
);
4692 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4695 prefix
= strempty(prefix
);
4697 LIST_FOREACH(command
, c
, c
)
4698 exec_command_dump(c
, f
, prefix
);
4701 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
4708 /* It's kind of important, that we keep the order here */
4709 LIST_FIND_TAIL(command
, *l
, end
);
4710 LIST_INSERT_AFTER(command
, *l
, end
, e
);
4715 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
4723 l
= strv_new_ap(path
, ap
);
4735 free_and_replace(c
->path
, p
);
4737 return strv_free_and_replace(c
->argv
, l
);
4740 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
4741 _cleanup_strv_free_
char **l
= NULL
;
4749 l
= strv_new_ap(path
, ap
);
4755 r
= strv_extend_strv(&c
->argv
, l
, false);
4762 static void *remove_tmpdir_thread(void *p
) {
4763 _cleanup_free_
char *path
= p
;
4765 (void) rm_rf(path
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
4769 static ExecRuntime
* exec_runtime_free(ExecRuntime
*rt
, bool destroy
) {
4776 (void) hashmap_remove(rt
->manager
->exec_runtime_by_id
, rt
->id
);
4778 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4779 if (destroy
&& rt
->tmp_dir
) {
4780 log_debug("Spawning thread to nuke %s", rt
->tmp_dir
);
4782 r
= asynchronous_job(remove_tmpdir_thread
, rt
->tmp_dir
);
4784 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->tmp_dir
);
4791 if (destroy
&& rt
->var_tmp_dir
) {
4792 log_debug("Spawning thread to nuke %s", rt
->var_tmp_dir
);
4794 r
= asynchronous_job(remove_tmpdir_thread
, rt
->var_tmp_dir
);
4796 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->var_tmp_dir
);
4797 free(rt
->var_tmp_dir
);
4800 rt
->var_tmp_dir
= NULL
;
4803 rt
->id
= mfree(rt
->id
);
4804 rt
->tmp_dir
= mfree(rt
->tmp_dir
);
4805 rt
->var_tmp_dir
= mfree(rt
->var_tmp_dir
);
4806 safe_close_pair(rt
->netns_storage_socket
);
4810 static void exec_runtime_freep(ExecRuntime
**rt
) {
4812 (void) exec_runtime_free(*rt
, false);
4815 static int exec_runtime_allocate(ExecRuntime
**rt
) {
4818 *rt
= new0(ExecRuntime
, 1);
4822 (*rt
)->netns_storage_socket
[0] = (*rt
)->netns_storage_socket
[1] = -1;
4826 static int exec_runtime_add(
4829 const char *tmp_dir
,
4830 const char *var_tmp_dir
,
4831 const int netns_storage_socket
[2],
4832 ExecRuntime
**ret
) {
4834 _cleanup_(exec_runtime_freep
) ExecRuntime
*rt
= NULL
;
4840 r
= hashmap_ensure_allocated(&m
->exec_runtime_by_id
, &string_hash_ops
);
4844 r
= exec_runtime_allocate(&rt
);
4848 rt
->id
= strdup(id
);
4853 rt
->tmp_dir
= strdup(tmp_dir
);
4857 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4858 assert(var_tmp_dir
);
4859 rt
->var_tmp_dir
= strdup(var_tmp_dir
);
4860 if (!rt
->var_tmp_dir
)
4864 if (netns_storage_socket
) {
4865 rt
->netns_storage_socket
[0] = netns_storage_socket
[0];
4866 rt
->netns_storage_socket
[1] = netns_storage_socket
[1];
4869 r
= hashmap_put(m
->exec_runtime_by_id
, rt
->id
, rt
);
4878 /* do not remove created ExecRuntime object when the operation succeeds. */
4883 static int exec_runtime_make(Manager
*m
, const ExecContext
*c
, const char *id
, ExecRuntime
**ret
) {
4884 _cleanup_free_
char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
4885 _cleanup_close_pair_
int netns_storage_socket
[2] = {-1, -1};
4892 /* It is not necessary to create ExecRuntime object. */
4893 if (!c
->private_network
&& !c
->private_tmp
)
4896 if (c
->private_tmp
) {
4897 r
= setup_tmp_dirs(id
, &tmp_dir
, &var_tmp_dir
);
4902 if (c
->private_network
) {
4903 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, netns_storage_socket
) < 0)
4907 r
= exec_runtime_add(m
, id
, tmp_dir
, var_tmp_dir
, netns_storage_socket
, ret
);
4912 netns_storage_socket
[0] = -1;
4913 netns_storage_socket
[1] = -1;
4917 int exec_runtime_acquire(Manager
*m
, const ExecContext
*c
, const char *id
, bool create
, ExecRuntime
**ret
) {
4925 rt
= hashmap_get(m
->exec_runtime_by_id
, id
);
4927 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
4933 /* If not found, then create a new object. */
4934 r
= exec_runtime_make(m
, c
, id
, &rt
);
4936 /* When r == 0, it is not necessary to create ExecRuntime object. */
4940 /* increment reference counter. */
4946 ExecRuntime
*exec_runtime_unref(ExecRuntime
*rt
, bool destroy
) {
4950 assert(rt
->n_ref
> 0);
4956 return exec_runtime_free(rt
, destroy
);
4959 int exec_runtime_serialize(const Manager
*m
, FILE *f
, FDSet
*fds
) {
4967 HASHMAP_FOREACH(rt
, m
->exec_runtime_by_id
, i
) {
4968 fprintf(f
, "exec-runtime=%s", rt
->id
);
4971 fprintf(f
, " tmp-dir=%s", rt
->tmp_dir
);
4973 if (rt
->var_tmp_dir
)
4974 fprintf(f
, " var-tmp-dir=%s", rt
->var_tmp_dir
);
4976 if (rt
->netns_storage_socket
[0] >= 0) {
4979 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
4983 fprintf(f
, " netns-socket-0=%i", copy
);
4986 if (rt
->netns_storage_socket
[1] >= 0) {
4989 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
4993 fprintf(f
, " netns-socket-1=%i", copy
);
5002 int exec_runtime_deserialize_compat(Unit
*u
, const char *key
, const char *value
, FDSet
*fds
) {
5003 _cleanup_(exec_runtime_freep
) ExecRuntime
*rt_create
= NULL
;
5007 /* This is for the migration from old (v237 or earlier) deserialization text.
5008 * Due to bug #7790, this may not work for units that use JoinsNamespaceOf=.
5009 * Even if the ExecRuntime object was originally created by another unit, we cannot tell
5010 * that from the serialized text, so we always create a new object owned by this unit. */
5016 /* Manager manages ExecRuntime objects by the unit id.
5017 * So, we omit the serialized text when the unit does not have id (yet?)... */
5018 if (isempty(u
->id
)) {
5019 log_unit_debug(u
, "Invocation ID not found. Dropping runtime parameter.");
5023 r
= hashmap_ensure_allocated(&u
->manager
->exec_runtime_by_id
, &string_hash_ops
);
5025 log_unit_debug_errno(u
, r
, "Failed to allocate storage for runtime parameter: %m");
5029 rt
= hashmap_get(u
->manager
->exec_runtime_by_id
, u
->id
);
5031 r
= exec_runtime_allocate(&rt_create
);
5035 rt_create
->id
= strdup(u
->id
);
5042 if (streq(key
, "tmp-dir")) {
5045 copy
= strdup(value
);
5049 free_and_replace(rt
->tmp_dir
, copy
);
5051 } else if (streq(key
, "var-tmp-dir")) {
5054 copy
= strdup(value
);
5058 free_and_replace(rt
->var_tmp_dir
, copy
);
5060 } else if (streq(key
, "netns-socket-0")) {
5063 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
)) {
5064 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
5068 safe_close(rt
->netns_storage_socket
[0]);
5069 rt
->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
5071 } else if (streq(key
, "netns-socket-1")) {
5074 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
)) {
5075 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
5079 safe_close(rt
->netns_storage_socket
[1]);
5080 rt
->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
5084 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5086 r
= hashmap_put(u
->manager
->exec_runtime_by_id
, rt_create
->id
, rt_create
);
5088 log_unit_debug_errno(u
, r
, "Failed to put runtime parameter to manager's storage: %m");
5092 rt_create
->manager
= u
->manager
;
5101 void exec_runtime_deserialize_one(Manager
*m
, const char *value
, FDSet
*fds
) {
5102 char *id
= NULL
, *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
5103 int r
, fd0
= -1, fd1
= -1;
5104 const char *p
, *v
= value
;
5111 n
= strcspn(v
, " ");
5112 id
= strndupa(v
, n
);
5117 v
= startswith(p
, "tmp-dir=");
5119 n
= strcspn(v
, " ");
5120 tmp_dir
= strndupa(v
, n
);
5126 v
= startswith(p
, "var-tmp-dir=");
5128 n
= strcspn(v
, " ");
5129 var_tmp_dir
= strndupa(v
, n
);
5135 v
= startswith(p
, "netns-socket-0=");
5139 n
= strcspn(v
, " ");
5140 buf
= strndupa(v
, n
);
5141 if (safe_atoi(buf
, &fd0
) < 0 || !fdset_contains(fds
, fd0
)) {
5142 log_debug("Unable to process exec-runtime netns fd specification.");
5145 fd0
= fdset_remove(fds
, fd0
);
5151 v
= startswith(p
, "netns-socket-1=");
5155 n
= strcspn(v
, " ");
5156 buf
= strndupa(v
, n
);
5157 if (safe_atoi(buf
, &fd1
) < 0 || !fdset_contains(fds
, fd1
)) {
5158 log_debug("Unable to process exec-runtime netns fd specification.");
5161 fd1
= fdset_remove(fds
, fd1
);
5166 r
= exec_runtime_add(m
, id
, tmp_dir
, var_tmp_dir
, (int[]) { fd0
, fd1
}, NULL
);
5168 log_debug_errno(r
, "Failed to add exec-runtime: %m");
5171 void exec_runtime_vacuum(Manager
*m
) {
5177 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5179 HASHMAP_FOREACH(rt
, m
->exec_runtime_by_id
, i
) {
5183 (void) exec_runtime_free(rt
, false);
5187 void exec_params_clear(ExecParameters
*p
) {
5191 strv_free(p
->environment
);
5194 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
5195 [EXEC_INPUT_NULL
] = "null",
5196 [EXEC_INPUT_TTY
] = "tty",
5197 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
5198 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
5199 [EXEC_INPUT_SOCKET
] = "socket",
5200 [EXEC_INPUT_NAMED_FD
] = "fd",
5201 [EXEC_INPUT_DATA
] = "data",
5202 [EXEC_INPUT_FILE
] = "file",
5205 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
5207 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
5208 [EXEC_OUTPUT_INHERIT
] = "inherit",
5209 [EXEC_OUTPUT_NULL
] = "null",
5210 [EXEC_OUTPUT_TTY
] = "tty",
5211 [EXEC_OUTPUT_SYSLOG
] = "syslog",
5212 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE
] = "syslog+console",
5213 [EXEC_OUTPUT_KMSG
] = "kmsg",
5214 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
5215 [EXEC_OUTPUT_JOURNAL
] = "journal",
5216 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
5217 [EXEC_OUTPUT_SOCKET
] = "socket",
5218 [EXEC_OUTPUT_NAMED_FD
] = "fd",
5219 [EXEC_OUTPUT_FILE
] = "file",
5220 [EXEC_OUTPUT_FILE_APPEND
] = "append",
5223 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
5225 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
5226 [EXEC_UTMP_INIT
] = "init",
5227 [EXEC_UTMP_LOGIN
] = "login",
5228 [EXEC_UTMP_USER
] = "user",
5231 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
5233 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
5234 [EXEC_PRESERVE_NO
] = "no",
5235 [EXEC_PRESERVE_YES
] = "yes",
5236 [EXEC_PRESERVE_RESTART
] = "restart",
5239 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
5241 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
5242 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
5243 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
5244 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
5245 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
5246 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
5249 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);
5251 static const char* const exec_directory_env_name_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
5252 [EXEC_DIRECTORY_RUNTIME
] = "RUNTIME_DIRECTORY",
5253 [EXEC_DIRECTORY_STATE
] = "STATE_DIRECTORY",
5254 [EXEC_DIRECTORY_CACHE
] = "CACHE_DIRECTORY",
5255 [EXEC_DIRECTORY_LOGS
] = "LOGS_DIRECTORY",
5256 [EXEC_DIRECTORY_CONFIGURATION
] = "CONFIGURATION_DIRECTORY",
5259 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name
, ExecDirectoryType
);
5261 static const char* const exec_keyring_mode_table
[_EXEC_KEYRING_MODE_MAX
] = {
5262 [EXEC_KEYRING_INHERIT
] = "inherit",
5263 [EXEC_KEYRING_PRIVATE
] = "private",
5264 [EXEC_KEYRING_SHARED
] = "shared",
5267 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode
, ExecKeyringMode
);