1 /* SPDX-License-Identifier: LGPL-2.1+ */
10 #include <sys/capability.h>
11 #include <sys/eventfd.h>
13 #include <sys/personality.h>
14 #include <sys/prctl.h>
16 #include <sys/socket.h>
18 #include <sys/types.h>
24 #include <security/pam_appl.h>
28 #include <selinux/selinux.h>
36 #include <sys/apparmor.h>
39 #include "sd-messages.h"
42 #include "alloc-util.h"
44 #include "apparmor-util.h"
49 #include "capability-util.h"
50 #include "chown-recursive.h"
51 #include "cpu-set-util.h"
55 #include "errno-list.h"
57 #include "exit-status.h"
59 #include "format-util.h"
61 #include "glob-util.h"
68 #include "memory-util.h"
71 #include "namespace.h"
72 #include "parse-util.h"
73 #include "path-util.h"
74 #include "process-util.h"
75 #include "rlimit-util.h"
78 #include "seccomp-util.h"
80 #include "securebits-util.h"
81 #include "selinux-util.h"
82 #include "signal-util.h"
83 #include "smack-util.h"
84 #include "socket-util.h"
86 #include "stat-util.h"
87 #include "string-table.h"
88 #include "string-util.h"
90 #include "syslog-util.h"
91 #include "terminal-util.h"
92 #include "umask-util.h"
94 #include "user-util.h"
95 #include "utmp-wtmp.h"
97 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
98 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
100 #define SNDBUF_SIZE (8*1024*1024)
102 static int shift_fds(int fds
[], size_t n_fds
) {
103 int start
, restart_from
;
108 /* Modifies the fds array! (sorts it) */
118 for (i
= start
; i
< (int) n_fds
; i
++) {
121 /* Already at right index? */
125 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
132 /* Hmm, the fd we wanted isn't free? Then
133 * let's remember that and try again from here */
134 if (nfd
!= i
+3 && restart_from
< 0)
138 if (restart_from
< 0)
141 start
= restart_from
;
147 static int flags_fds(const int fds
[], size_t n_socket_fds
, size_t n_storage_fds
, bool nonblock
) {
151 n_fds
= n_socket_fds
+ n_storage_fds
;
157 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
158 * O_NONBLOCK only applies to socket activation though. */
160 for (i
= 0; i
< n_fds
; i
++) {
162 if (i
< n_socket_fds
) {
163 r
= fd_nonblock(fds
[i
], nonblock
);
168 /* We unconditionally drop FD_CLOEXEC from the fds,
169 * since after all we want to pass these fds to our
172 r
= fd_cloexec(fds
[i
], false);
180 static const char *exec_context_tty_path(const ExecContext
*context
) {
183 if (context
->stdio_as_fds
)
186 if (context
->tty_path
)
187 return context
->tty_path
;
189 return "/dev/console";
192 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
197 path
= exec_context_tty_path(context
);
199 if (context
->tty_vhangup
) {
200 if (p
&& p
->stdin_fd
>= 0)
201 (void) terminal_vhangup_fd(p
->stdin_fd
);
203 (void) terminal_vhangup(path
);
206 if (context
->tty_reset
) {
207 if (p
&& p
->stdin_fd
>= 0)
208 (void) reset_terminal_fd(p
->stdin_fd
, true);
210 (void) reset_terminal(path
);
213 if (context
->tty_vt_disallocate
&& path
)
214 (void) vt_disallocate(path
);
217 static bool is_terminal_input(ExecInput i
) {
220 EXEC_INPUT_TTY_FORCE
,
221 EXEC_INPUT_TTY_FAIL
);
224 static bool is_terminal_output(ExecOutput o
) {
227 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
228 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
229 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
232 static bool is_syslog_output(ExecOutput o
) {
235 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
);
238 static bool is_kmsg_output(ExecOutput o
) {
241 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
244 static bool exec_context_needs_term(const ExecContext
*c
) {
247 /* Return true if the execution context suggests we should set $TERM to something useful. */
249 if (is_terminal_input(c
->std_input
))
252 if (is_terminal_output(c
->std_output
))
255 if (is_terminal_output(c
->std_error
))
258 return !!c
->tty_path
;
261 static int open_null_as(int flags
, int nfd
) {
266 fd
= open("/dev/null", flags
|O_NOCTTY
);
270 return move_fd(fd
, nfd
, false);
273 static int connect_journal_socket(int fd
, uid_t uid
, gid_t gid
) {
274 static const union sockaddr_union sa
= {
275 .un
.sun_family
= AF_UNIX
,
276 .un
.sun_path
= "/run/systemd/journal/stdout",
278 uid_t olduid
= UID_INVALID
;
279 gid_t oldgid
= GID_INVALID
;
282 if (gid_is_valid(gid
)) {
285 if (setegid(gid
) < 0)
289 if (uid_is_valid(uid
)) {
292 if (seteuid(uid
) < 0) {
298 r
= connect(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
)) < 0 ? -errno
: 0;
300 /* If we fail to restore the uid or gid, things will likely
301 fail later on. This should only happen if an LSM interferes. */
303 if (uid_is_valid(uid
))
304 (void) seteuid(olduid
);
307 if (gid_is_valid(gid
))
308 (void) setegid(oldgid
);
313 static int connect_logger_as(
315 const ExecContext
*context
,
316 const ExecParameters
*params
,
323 _cleanup_close_
int fd
= -1;
328 assert(output
< _EXEC_OUTPUT_MAX
);
332 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
336 r
= connect_journal_socket(fd
, uid
, gid
);
340 if (shutdown(fd
, SHUT_RD
) < 0)
343 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
353 context
->syslog_identifier
?: ident
,
354 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
355 context
->syslog_priority
,
356 !!context
->syslog_level_prefix
,
357 is_syslog_output(output
),
358 is_kmsg_output(output
),
359 is_terminal_output(output
)) < 0)
362 return move_fd(TAKE_FD(fd
), nfd
, false);
365 static int open_terminal_as(const char *path
, int flags
, int nfd
) {
371 fd
= open_terminal(path
, flags
| O_NOCTTY
);
375 return move_fd(fd
, nfd
, false);
378 static int acquire_path(const char *path
, int flags
, mode_t mode
) {
379 union sockaddr_union sa
= {};
380 _cleanup_close_
int fd
= -1;
385 if (IN_SET(flags
& O_ACCMODE
, O_WRONLY
, O_RDWR
))
388 fd
= open(path
, flags
|O_NOCTTY
, mode
);
392 if (errno
!= ENXIO
) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
394 if (strlen(path
) >= sizeof(sa
.un
.sun_path
)) /* Too long, can't be a UNIX socket */
397 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
399 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
403 salen
= sockaddr_un_set_path(&sa
.un
, path
);
407 if (connect(fd
, &sa
.sa
, salen
) < 0)
408 return errno
== EINVAL
? -ENXIO
: -errno
; /* Propagate initial error if we get EINVAL, i.e. we have
409 * indication that this wasn't an AF_UNIX socket after all */
411 if ((flags
& O_ACCMODE
) == O_RDONLY
)
412 r
= shutdown(fd
, SHUT_WR
);
413 else if ((flags
& O_ACCMODE
) == O_WRONLY
)
414 r
= shutdown(fd
, SHUT_RD
);
423 static int fixup_input(
424 const ExecContext
*context
,
426 bool apply_tty_stdin
) {
432 std_input
= context
->std_input
;
434 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
435 return EXEC_INPUT_NULL
;
437 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
438 return EXEC_INPUT_NULL
;
440 if (std_input
== EXEC_INPUT_DATA
&& context
->stdin_data_size
== 0)
441 return EXEC_INPUT_NULL
;
446 static int fixup_output(ExecOutput std_output
, int socket_fd
) {
448 if (std_output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
449 return EXEC_OUTPUT_INHERIT
;
454 static int setup_input(
455 const ExecContext
*context
,
456 const ExecParameters
*params
,
458 const int named_iofds
[static 3]) {
466 if (params
->stdin_fd
>= 0) {
467 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
470 /* Try to make this the controlling tty, if it is a tty, and reset it */
471 if (isatty(STDIN_FILENO
)) {
472 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
473 (void) reset_terminal_fd(STDIN_FILENO
, true);
479 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
483 case EXEC_INPUT_NULL
:
484 return open_null_as(O_RDONLY
, STDIN_FILENO
);
487 case EXEC_INPUT_TTY_FORCE
:
488 case EXEC_INPUT_TTY_FAIL
: {
491 fd
= acquire_terminal(exec_context_tty_path(context
),
492 i
== EXEC_INPUT_TTY_FAIL
? ACQUIRE_TERMINAL_TRY
:
493 i
== EXEC_INPUT_TTY_FORCE
? ACQUIRE_TERMINAL_FORCE
:
494 ACQUIRE_TERMINAL_WAIT
,
499 return move_fd(fd
, STDIN_FILENO
, false);
502 case EXEC_INPUT_SOCKET
:
503 assert(socket_fd
>= 0);
505 return dup2(socket_fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
507 case EXEC_INPUT_NAMED_FD
:
508 assert(named_iofds
[STDIN_FILENO
] >= 0);
510 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
511 return dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
513 case EXEC_INPUT_DATA
: {
516 fd
= acquire_data_fd(context
->stdin_data
, context
->stdin_data_size
, 0);
520 return move_fd(fd
, STDIN_FILENO
, false);
523 case EXEC_INPUT_FILE
: {
527 assert(context
->stdio_file
[STDIN_FILENO
]);
529 rw
= (context
->std_output
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDOUT_FILENO
])) ||
530 (context
->std_error
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDERR_FILENO
]));
532 fd
= acquire_path(context
->stdio_file
[STDIN_FILENO
], rw
? O_RDWR
: O_RDONLY
, 0666 & ~context
->umask
);
536 return move_fd(fd
, STDIN_FILENO
, false);
540 assert_not_reached("Unknown input type");
544 static bool can_inherit_stderr_from_stdout(
545 const ExecContext
*context
,
551 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
554 if (e
== EXEC_OUTPUT_INHERIT
)
559 if (e
== EXEC_OUTPUT_NAMED_FD
)
560 return streq_ptr(context
->stdio_fdname
[STDOUT_FILENO
], context
->stdio_fdname
[STDERR_FILENO
]);
562 if (IN_SET(e
, EXEC_OUTPUT_FILE
, EXEC_OUTPUT_FILE_APPEND
))
563 return streq_ptr(context
->stdio_file
[STDOUT_FILENO
], context
->stdio_file
[STDERR_FILENO
]);
568 static int setup_output(
570 const ExecContext
*context
,
571 const ExecParameters
*params
,
574 const int named_iofds
[static 3],
578 dev_t
*journal_stream_dev
,
579 ino_t
*journal_stream_ino
) {
589 assert(journal_stream_dev
);
590 assert(journal_stream_ino
);
592 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
594 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
597 return STDOUT_FILENO
;
600 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
601 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
604 return STDERR_FILENO
;
607 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
608 o
= fixup_output(context
->std_output
, socket_fd
);
610 if (fileno
== STDERR_FILENO
) {
612 e
= fixup_output(context
->std_error
, socket_fd
);
614 /* This expects the input and output are already set up */
616 /* Don't change the stderr file descriptor if we inherit all
617 * the way and are not on a tty */
618 if (e
== EXEC_OUTPUT_INHERIT
&&
619 o
== EXEC_OUTPUT_INHERIT
&&
620 i
== EXEC_INPUT_NULL
&&
621 !is_terminal_input(context
->std_input
) &&
625 /* Duplicate from stdout if possible */
626 if (can_inherit_stderr_from_stdout(context
, o
, e
))
627 return dup2(STDOUT_FILENO
, fileno
) < 0 ? -errno
: fileno
;
631 } else if (o
== EXEC_OUTPUT_INHERIT
) {
632 /* If input got downgraded, inherit the original value */
633 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
634 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
636 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
637 if (!IN_SET(i
, EXEC_INPUT_NULL
, EXEC_INPUT_DATA
))
638 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
640 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
644 /* We need to open /dev/null here anew, to get the right access mode. */
645 return open_null_as(O_WRONLY
, fileno
);
650 case EXEC_OUTPUT_NULL
:
651 return open_null_as(O_WRONLY
, fileno
);
653 case EXEC_OUTPUT_TTY
:
654 if (is_terminal_input(i
))
655 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
657 /* We don't reset the terminal if this is just about output */
658 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
660 case EXEC_OUTPUT_SYSLOG
:
661 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE
:
662 case EXEC_OUTPUT_KMSG
:
663 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
664 case EXEC_OUTPUT_JOURNAL
:
665 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
666 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
668 log_unit_warning_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m", fileno
== STDOUT_FILENO
? "stdout" : "stderr");
669 r
= open_null_as(O_WRONLY
, fileno
);
673 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
674 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
675 * services to detect whether they are connected to the journal or not.
677 * If both stdout and stderr are connected to a stream then let's make sure to store the data
678 * about STDERR as that's usually the best way to do logging. */
680 if (fstat(fileno
, &st
) >= 0 &&
681 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
682 *journal_stream_dev
= st
.st_dev
;
683 *journal_stream_ino
= st
.st_ino
;
688 case EXEC_OUTPUT_SOCKET
:
689 assert(socket_fd
>= 0);
691 return dup2(socket_fd
, fileno
) < 0 ? -errno
: fileno
;
693 case EXEC_OUTPUT_NAMED_FD
:
694 assert(named_iofds
[fileno
] >= 0);
696 (void) fd_nonblock(named_iofds
[fileno
], false);
697 return dup2(named_iofds
[fileno
], fileno
) < 0 ? -errno
: fileno
;
699 case EXEC_OUTPUT_FILE
:
700 case EXEC_OUTPUT_FILE_APPEND
: {
704 assert(context
->stdio_file
[fileno
]);
706 rw
= context
->std_input
== EXEC_INPUT_FILE
&&
707 streq_ptr(context
->stdio_file
[fileno
], context
->stdio_file
[STDIN_FILENO
]);
710 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
713 if (o
== EXEC_OUTPUT_FILE_APPEND
)
716 fd
= acquire_path(context
->stdio_file
[fileno
], flags
, 0666 & ~context
->umask
);
720 return move_fd(fd
, fileno
, 0);
724 assert_not_reached("Unknown error type");
728 static int chown_terminal(int fd
, uid_t uid
) {
733 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
734 if (isatty(fd
) < 1) {
735 if (IN_SET(errno
, EINVAL
, ENOTTY
))
736 return 0; /* not a tty */
741 /* This might fail. What matters are the results. */
742 r
= fchmod_and_chown(fd
, TTY_MODE
, uid
, -1);
749 static int setup_confirm_stdio(const char *vc
, int *_saved_stdin
, int *_saved_stdout
) {
750 _cleanup_close_
int fd
= -1, saved_stdin
= -1, saved_stdout
= -1;
753 assert(_saved_stdin
);
754 assert(_saved_stdout
);
756 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
760 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
761 if (saved_stdout
< 0)
764 fd
= acquire_terminal(vc
, ACQUIRE_TERMINAL_WAIT
, DEFAULT_CONFIRM_USEC
);
768 r
= chown_terminal(fd
, getuid());
772 r
= reset_terminal_fd(fd
, true);
776 r
= rearrange_stdio(fd
, fd
, STDERR_FILENO
);
781 *_saved_stdin
= saved_stdin
;
782 *_saved_stdout
= saved_stdout
;
784 saved_stdin
= saved_stdout
= -1;
789 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
792 if (err
== -ETIMEDOUT
)
793 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
796 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
800 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
801 _cleanup_close_
int fd
= -1;
805 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
809 write_confirm_error_fd(err
, fd
, u
);
812 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
816 assert(saved_stdout
);
820 if (*saved_stdin
>= 0)
821 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
824 if (*saved_stdout
>= 0)
825 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
828 *saved_stdin
= safe_close(*saved_stdin
);
829 *saved_stdout
= safe_close(*saved_stdout
);
835 CONFIRM_PRETEND_FAILURE
= -1,
836 CONFIRM_PRETEND_SUCCESS
= 0,
840 static int ask_for_confirmation(const char *vc
, Unit
*u
, const char *cmdline
) {
841 int saved_stdout
= -1, saved_stdin
= -1, r
;
842 _cleanup_free_
char *e
= NULL
;
845 /* For any internal errors, assume a positive response. */
846 r
= setup_confirm_stdio(vc
, &saved_stdin
, &saved_stdout
);
848 write_confirm_error(r
, vc
, u
);
849 return CONFIRM_EXECUTE
;
852 /* confirm_spawn might have been disabled while we were sleeping. */
853 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
858 e
= ellipsize(cmdline
, 60, 100);
866 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
868 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
875 printf("Resuming normal execution.\n");
876 manager_disable_confirm_spawn();
880 unit_dump(u
, stdout
, " ");
881 continue; /* ask again */
883 printf("Failing execution.\n");
884 r
= CONFIRM_PRETEND_FAILURE
;
887 printf(" c - continue, proceed without asking anymore\n"
888 " D - dump, show the state of the unit\n"
889 " f - fail, don't execute the command and pretend it failed\n"
891 " i - info, show a short summary of the unit\n"
892 " j - jobs, show jobs that are in progress\n"
893 " s - skip, don't execute the command and pretend it succeeded\n"
894 " y - yes, execute the command\n");
895 continue; /* ask again */
897 printf(" Description: %s\n"
900 u
->id
, u
->description
, cmdline
);
901 continue; /* ask again */
903 manager_dump_jobs(u
->manager
, stdout
, " ");
904 continue; /* ask again */
906 /* 'n' was removed in favor of 'f'. */
907 printf("Didn't understand 'n', did you mean 'f'?\n");
908 continue; /* ask again */
910 printf("Skipping execution.\n");
911 r
= CONFIRM_PRETEND_SUCCESS
;
917 assert_not_reached("Unhandled choice");
923 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
927 static int get_fixed_user(const ExecContext
*c
, const char **user
,
928 uid_t
*uid
, gid_t
*gid
,
929 const char **home
, const char **shell
) {
938 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
939 * (i.e. are "/" or "/bin/nologin"). */
942 r
= get_user_creds(&name
, uid
, gid
, home
, shell
, USER_CREDS_CLEAN
);
950 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
960 r
= get_group_creds(&name
, gid
, 0);
968 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
969 const char *group
, gid_t gid
,
970 gid_t
**supplementary_gids
, int *ngids
) {
974 bool keep_groups
= false;
975 gid_t
*groups
= NULL
;
976 _cleanup_free_ gid_t
*l_gids
= NULL
;
981 * If user is given, then lookup GID and supplementary groups list.
982 * We avoid NSS lookups for gid=0. Also we have to initialize groups
983 * here and as early as possible so we keep the list of supplementary
984 * groups of the caller.
986 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
987 /* First step, initialize groups from /etc/group */
988 if (initgroups(user
, gid
) < 0)
994 if (strv_isempty(c
->supplementary_groups
))
998 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
999 * be positive, otherwise fail.
1002 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
1003 if (ngroups_max
<= 0)
1004 return errno_or_else(EOPNOTSUPP
);
1006 l_gids
= new(gid_t
, ngroups_max
);
1012 * Lookup the list of groups that the user belongs to, we
1013 * avoid NSS lookups here too for gid=0.
1016 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
1021 STRV_FOREACH(i
, c
->supplementary_groups
) {
1024 if (k
>= ngroups_max
)
1028 r
= get_group_creds(&g
, l_gids
+k
, 0);
1036 * Sets ngids to zero to drop all supplementary groups, happens
1037 * when we are under root and SupplementaryGroups= is empty.
1044 /* Otherwise get the final list of supplementary groups */
1045 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
1049 *supplementary_gids
= groups
;
1057 static int enforce_groups(gid_t gid
, const gid_t
*supplementary_gids
, int ngids
) {
1060 /* Handle SupplementaryGroups= if it is not empty */
1062 r
= maybe_setgroups(ngids
, supplementary_gids
);
1067 if (gid_is_valid(gid
)) {
1068 /* Then set our gids */
1069 if (setresgid(gid
, gid
, gid
) < 0)
1076 static int enforce_user(const ExecContext
*context
, uid_t uid
) {
1079 if (!uid_is_valid(uid
))
1082 /* Sets (but doesn't look up) the uid and make sure we keep the
1083 * capabilities while doing so. */
1085 if (context
->capability_ambient_set
!= 0) {
1087 /* First step: If we need to keep capabilities but
1088 * drop privileges we need to make sure we keep our
1089 * caps, while we drop privileges. */
1091 int sb
= context
->secure_bits
| 1<<SECURE_KEEP_CAPS
;
1093 if (prctl(PR_GET_SECUREBITS
) != sb
)
1094 if (prctl(PR_SET_SECUREBITS
, sb
) < 0)
1099 /* Second step: actually set the uids */
1100 if (setresuid(uid
, uid
, uid
) < 0)
1103 /* At this point we should have all necessary capabilities but
1104 are otherwise a normal user. However, the caps might have got
1105 corrupted due to the setresuid() so we need to clean them up
1106 later. This is done outside of this call. */
1113 static int null_conv(
1115 const struct pam_message
**msg
,
1116 struct pam_response
**resp
,
1117 void *appdata_ptr
) {
1119 /* We don't support conversations */
1121 return PAM_CONV_ERR
;
1126 static int setup_pam(
1133 int fds
[], size_t n_fds
) {
1137 static const struct pam_conv conv
= {
1142 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1143 pam_handle_t
*handle
= NULL
;
1145 int pam_code
= PAM_SUCCESS
, r
;
1146 char **nv
, **e
= NULL
;
1147 bool close_session
= false;
1148 pid_t pam_pid
= 0, parent_pid
;
1155 /* We set up PAM in the parent process, then fork. The child
1156 * will then stay around until killed via PR_SET_PDEATHSIG or
1157 * systemd via the cgroup logic. It will then remove the PAM
1158 * session again. The parent process will exec() the actual
1159 * daemon. We do things this way to ensure that the main PID
1160 * of the daemon is the one we initially fork()ed. */
1162 r
= barrier_create(&barrier
);
1166 if (log_get_max_level() < LOG_DEBUG
)
1167 flags
|= PAM_SILENT
;
1169 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1170 if (pam_code
!= PAM_SUCCESS
) {
1176 _cleanup_free_
char *q
= NULL
;
1178 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1179 * out if that's the case, and read the TTY off it. */
1181 if (getttyname_malloc(STDIN_FILENO
, &q
) >= 0)
1182 tty
= strjoina("/dev/", q
);
1186 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1187 if (pam_code
!= PAM_SUCCESS
)
1191 STRV_FOREACH(nv
, *env
) {
1192 pam_code
= pam_putenv(handle
, *nv
);
1193 if (pam_code
!= PAM_SUCCESS
)
1197 pam_code
= pam_acct_mgmt(handle
, flags
);
1198 if (pam_code
!= PAM_SUCCESS
)
1201 pam_code
= pam_open_session(handle
, flags
);
1202 if (pam_code
!= PAM_SUCCESS
)
1205 close_session
= true;
1207 e
= pam_getenvlist(handle
);
1209 pam_code
= PAM_BUF_ERR
;
1213 /* Block SIGTERM, so that we know that it won't get lost in
1216 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1218 parent_pid
= getpid_cached();
1220 r
= safe_fork("(sd-pam)", 0, &pam_pid
);
1224 int sig
, ret
= EXIT_PAM
;
1226 /* The child's job is to reset the PAM session on
1228 barrier_set_role(&barrier
, BARRIER_CHILD
);
1230 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
1231 * are open here that have been opened by PAM. */
1232 (void) close_many(fds
, n_fds
);
1234 /* Drop privileges - we don't need any to pam_close_session
1235 * and this will make PR_SET_PDEATHSIG work in most cases.
1236 * If this fails, ignore the error - but expect sd-pam threads
1237 * to fail to exit normally */
1239 r
= maybe_setgroups(0, NULL
);
1241 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1242 if (setresgid(gid
, gid
, gid
) < 0)
1243 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1244 if (setresuid(uid
, uid
, uid
) < 0)
1245 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1247 (void) ignore_signals(SIGPIPE
, -1);
1249 /* Wait until our parent died. This will only work if
1250 * the above setresuid() succeeds, otherwise the kernel
1251 * will not allow unprivileged parents kill their privileged
1252 * children this way. We rely on the control groups kill logic
1253 * to do the rest for us. */
1254 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1257 /* Tell the parent that our setup is done. This is especially
1258 * important regarding dropping privileges. Otherwise, unit
1259 * setup might race against our setresuid(2) call.
1261 * If the parent aborted, we'll detect this below, hence ignore
1262 * return failure here. */
1263 (void) barrier_place(&barrier
);
1265 /* Check if our parent process might already have died? */
1266 if (getppid() == parent_pid
) {
1269 assert_se(sigemptyset(&ss
) >= 0);
1270 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1273 if (sigwait(&ss
, &sig
) < 0) {
1280 assert(sig
== SIGTERM
);
1285 /* If our parent died we'll end the session */
1286 if (getppid() != parent_pid
) {
1287 pam_code
= pam_close_session(handle
, flags
);
1288 if (pam_code
!= PAM_SUCCESS
)
1295 pam_end(handle
, pam_code
| flags
);
1299 barrier_set_role(&barrier
, BARRIER_PARENT
);
1301 /* If the child was forked off successfully it will do all the
1302 * cleanups, so forget about the handle here. */
1305 /* Unblock SIGTERM again in the parent */
1306 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1308 /* We close the log explicitly here, since the PAM modules
1309 * might have opened it, but we don't want this fd around. */
1312 /* Synchronously wait for the child to initialize. We don't care for
1313 * errors as we cannot recover. However, warn loudly if it happens. */
1314 if (!barrier_place_and_sync(&barrier
))
1315 log_error("PAM initialization failed");
1317 return strv_free_and_replace(*env
, e
);
1320 if (pam_code
!= PAM_SUCCESS
) {
1321 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1322 r
= -EPERM
; /* PAM errors do not map to errno */
1324 log_error_errno(r
, "PAM failed: %m");
1328 pam_code
= pam_close_session(handle
, flags
);
1330 pam_end(handle
, pam_code
| flags
);
1342 static void rename_process_from_path(const char *path
) {
1343 char process_name
[11];
1347 /* This resulting string must fit in 10 chars (i.e. the length
1348 * of "/sbin/init") to look pretty in /bin/ps */
1352 rename_process("(...)");
1358 /* The end of the process name is usually more
1359 * interesting, since the first bit might just be
1365 process_name
[0] = '(';
1366 memcpy(process_name
+1, p
, l
);
1367 process_name
[1+l
] = ')';
1368 process_name
[1+l
+1] = 0;
1370 rename_process(process_name
);
1373 static bool context_has_address_families(const ExecContext
*c
) {
1376 return c
->address_families_whitelist
||
1377 !set_isempty(c
->address_families
);
1380 static bool context_has_syscall_filters(const ExecContext
*c
) {
1383 return c
->syscall_whitelist
||
1384 !hashmap_isempty(c
->syscall_filter
);
1387 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1390 if (c
->no_new_privileges
)
1393 if (have_effective_cap(CAP_SYS_ADMIN
)) /* if we are privileged, we don't need NNP */
1396 /* We need NNP if we have any form of seccomp and are unprivileged */
1397 return context_has_address_families(c
) ||
1398 c
->memory_deny_write_execute
||
1399 c
->restrict_realtime
||
1400 c
->restrict_suid_sgid
||
1401 exec_context_restrict_namespaces_set(c
) ||
1402 c
->protect_kernel_tunables
||
1403 c
->protect_kernel_modules
||
1404 c
->private_devices
||
1405 context_has_syscall_filters(c
) ||
1406 !set_isempty(c
->syscall_archs
) ||
1407 c
->lock_personality
||
1408 c
->protect_hostname
;
1413 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1415 if (is_seccomp_available())
1418 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1422 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1423 uint32_t negative_action
, default_action
, action
;
1429 if (!context_has_syscall_filters(c
))
1432 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1435 negative_action
= c
->syscall_errno
== 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c
->syscall_errno
);
1437 if (c
->syscall_whitelist
) {
1438 default_action
= negative_action
;
1439 action
= SCMP_ACT_ALLOW
;
1441 default_action
= SCMP_ACT_ALLOW
;
1442 action
= negative_action
;
1445 if (needs_ambient_hack
) {
1446 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_whitelist
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1451 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
, false);
1454 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1458 if (set_isempty(c
->syscall_archs
))
1461 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1464 return seccomp_restrict_archs(c
->syscall_archs
);
1467 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1471 if (!context_has_address_families(c
))
1474 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1477 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_whitelist
);
1480 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1484 if (!c
->memory_deny_write_execute
)
1487 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1490 return seccomp_memory_deny_write_execute();
1493 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1497 if (!c
->restrict_realtime
)
1500 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1503 return seccomp_restrict_realtime();
1506 static int apply_restrict_suid_sgid(const Unit
* u
, const ExecContext
*c
) {
1510 if (!c
->restrict_suid_sgid
)
1513 if (skip_seccomp_unavailable(u
, "RestrictSUIDSGID="))
1516 return seccomp_restrict_suid_sgid();
1519 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1523 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1524 * let's protect even those systems where this is left on in the kernel. */
1526 if (!c
->protect_kernel_tunables
)
1529 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1532 return seccomp_protect_sysctl();
1535 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1539 /* Turn off module syscalls on ProtectKernelModules=yes */
1541 if (!c
->protect_kernel_modules
)
1544 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1547 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
), false);
1550 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1554 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1556 if (!c
->private_devices
)
1559 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1562 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
), false);
1565 static int apply_restrict_namespaces(const Unit
*u
, const ExecContext
*c
) {
1569 if (!exec_context_restrict_namespaces_set(c
))
1572 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1575 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
1578 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1579 unsigned long personality
;
1585 if (!c
->lock_personality
)
1588 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1591 personality
= c
->personality
;
1593 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1594 if (personality
== PERSONALITY_INVALID
) {
1596 r
= opinionated_personality(&personality
);
1601 return seccomp_lock_personality(personality
);
1606 static void do_idle_pipe_dance(int idle_pipe
[static 4]) {
1609 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1610 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1612 if (idle_pipe
[0] >= 0) {
1615 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1617 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1620 /* Signal systemd that we are bored and want to continue. */
1621 n
= write(idle_pipe
[3], "x", 1);
1623 /* Wait for systemd to react to the signal above. */
1624 fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1627 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1631 idle_pipe
[3] = safe_close(idle_pipe
[3]);
1634 static const char *exec_directory_env_name_to_string(ExecDirectoryType t
);
1636 static int build_environment(
1638 const ExecContext
*c
,
1639 const ExecParameters
*p
,
1642 const char *username
,
1644 dev_t journal_stream_dev
,
1645 ino_t journal_stream_ino
,
1648 _cleanup_strv_free_
char **our_env
= NULL
;
1649 ExecDirectoryType t
;
1658 our_env
= new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX
);
1663 _cleanup_free_
char *joined
= NULL
;
1665 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1667 our_env
[n_env
++] = x
;
1669 if (asprintf(&x
, "LISTEN_FDS=%zu", n_fds
) < 0)
1671 our_env
[n_env
++] = x
;
1673 joined
= strv_join(p
->fd_names
, ":");
1677 x
= strjoin("LISTEN_FDNAMES=", joined
);
1680 our_env
[n_env
++] = x
;
1683 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1684 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1686 our_env
[n_env
++] = x
;
1688 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1690 our_env
[n_env
++] = x
;
1693 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus to look up dynamic
1694 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1695 * check the database directly. */
1696 if (p
->flags
& EXEC_NSS_BYPASS_BUS
) {
1697 x
= strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1700 our_env
[n_env
++] = x
;
1704 x
= strjoin("HOME=", home
);
1708 path_simplify(x
+ 5, true);
1709 our_env
[n_env
++] = x
;
1713 x
= strjoin("LOGNAME=", username
);
1716 our_env
[n_env
++] = x
;
1718 x
= strjoin("USER=", username
);
1721 our_env
[n_env
++] = x
;
1725 x
= strjoin("SHELL=", shell
);
1729 path_simplify(x
+ 6, true);
1730 our_env
[n_env
++] = x
;
1733 if (!sd_id128_is_null(u
->invocation_id
)) {
1734 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1737 our_env
[n_env
++] = x
;
1740 if (exec_context_needs_term(c
)) {
1741 const char *tty_path
, *term
= NULL
;
1743 tty_path
= exec_context_tty_path(c
);
1745 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1746 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1747 * passes to PID 1 ends up all the way in the console login shown. */
1749 if (path_equal(tty_path
, "/dev/console") && getppid() == 1)
1750 term
= getenv("TERM");
1752 term
= default_term_for_tty(tty_path
);
1754 x
= strjoin("TERM=", term
);
1757 our_env
[n_env
++] = x
;
1760 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
1761 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
1764 our_env
[n_env
++] = x
;
1767 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
1768 _cleanup_free_
char *pre
= NULL
, *joined
= NULL
;
1774 if (strv_isempty(c
->directories
[t
].paths
))
1777 n
= exec_directory_env_name_to_string(t
);
1781 pre
= strjoin(p
->prefix
[t
], "/");
1785 joined
= strv_join_prefix(c
->directories
[t
].paths
, ":", pre
);
1789 x
= strjoin(n
, "=", joined
);
1793 our_env
[n_env
++] = x
;
1796 our_env
[n_env
++] = NULL
;
1797 assert(n_env
<= 14 + _EXEC_DIRECTORY_TYPE_MAX
);
1799 *ret
= TAKE_PTR(our_env
);
1804 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
1805 _cleanup_strv_free_
char **pass_env
= NULL
;
1806 size_t n_env
= 0, n_bufsize
= 0;
1809 STRV_FOREACH(i
, c
->pass_environment
) {
1810 _cleanup_free_
char *x
= NULL
;
1816 x
= strjoin(*i
, "=", v
);
1820 if (!GREEDY_REALLOC(pass_env
, n_bufsize
, n_env
+ 2))
1823 pass_env
[n_env
++] = TAKE_PTR(x
);
1824 pass_env
[n_env
] = NULL
;
1827 *ret
= TAKE_PTR(pass_env
);
1832 static bool exec_needs_mount_namespace(
1833 const ExecContext
*context
,
1834 const ExecParameters
*params
,
1835 const ExecRuntime
*runtime
) {
1840 if (context
->root_image
)
1843 if (!strv_isempty(context
->read_write_paths
) ||
1844 !strv_isempty(context
->read_only_paths
) ||
1845 !strv_isempty(context
->inaccessible_paths
))
1848 if (context
->n_bind_mounts
> 0)
1851 if (context
->n_temporary_filesystems
> 0)
1854 if (!IN_SET(context
->mount_flags
, 0, MS_SHARED
))
1857 if (context
->private_tmp
&& runtime
&& (runtime
->tmp_dir
|| runtime
->var_tmp_dir
))
1860 if (context
->private_devices
||
1861 context
->private_mounts
||
1862 context
->protect_system
!= PROTECT_SYSTEM_NO
||
1863 context
->protect_home
!= PROTECT_HOME_NO
||
1864 context
->protect_kernel_tunables
||
1865 context
->protect_kernel_modules
||
1866 context
->protect_control_groups
)
1869 if (context
->root_directory
) {
1870 ExecDirectoryType t
;
1872 if (context
->mount_apivfs
)
1875 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
1876 if (!params
->prefix
[t
])
1879 if (!strv_isempty(context
->directories
[t
].paths
))
1884 if (context
->dynamic_user
&&
1885 (!strv_isempty(context
->directories
[EXEC_DIRECTORY_STATE
].paths
) ||
1886 !strv_isempty(context
->directories
[EXEC_DIRECTORY_CACHE
].paths
) ||
1887 !strv_isempty(context
->directories
[EXEC_DIRECTORY_LOGS
].paths
)))
1893 static int setup_private_users(uid_t uid
, gid_t gid
) {
1894 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
1895 _cleanup_close_pair_
int errno_pipe
[2] = { -1, -1 };
1896 _cleanup_close_
int unshare_ready_fd
= -1;
1897 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
1902 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1903 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1904 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1905 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1906 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1907 * continues execution normally. */
1909 if (uid
!= 0 && uid_is_valid(uid
)) {
1910 r
= asprintf(&uid_map
,
1911 "0 0 1\n" /* Map root → root */
1912 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
1917 uid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1922 if (gid
!= 0 && gid_is_valid(gid
)) {
1923 r
= asprintf(&gid_map
,
1924 "0 0 1\n" /* Map root → root */
1925 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
1930 gid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1935 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1937 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
1938 if (unshare_ready_fd
< 0)
1941 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1943 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
1946 r
= safe_fork("(sd-userns)", FORK_RESET_SIGNALS
|FORK_DEATHSIG
, &pid
);
1950 _cleanup_close_
int fd
= -1;
1954 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1955 * here, after the parent opened its own user namespace. */
1958 errno_pipe
[0] = safe_close(errno_pipe
[0]);
1960 /* Wait until the parent unshared the user namespace */
1961 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
1966 /* Disable the setgroups() system call in the child user namespace, for good. */
1967 a
= procfs_file_alloca(ppid
, "setgroups");
1968 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1970 if (errno
!= ENOENT
) {
1975 /* If the file is missing the kernel is too old, let's continue anyway. */
1977 if (write(fd
, "deny\n", 5) < 0) {
1982 fd
= safe_close(fd
);
1985 /* First write the GID map */
1986 a
= procfs_file_alloca(ppid
, "gid_map");
1987 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1992 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
1996 fd
= safe_close(fd
);
1998 /* Then write the UID map */
1999 a
= procfs_file_alloca(ppid
, "uid_map");
2000 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2005 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
2010 _exit(EXIT_SUCCESS
);
2013 (void) write(errno_pipe
[1], &r
, sizeof(r
));
2014 _exit(EXIT_FAILURE
);
2017 errno_pipe
[1] = safe_close(errno_pipe
[1]);
2019 if (unshare(CLONE_NEWUSER
) < 0)
2022 /* Let the child know that the namespace is ready now */
2023 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
2026 /* Try to read an error code from the child */
2027 n
= read(errno_pipe
[0], &r
, sizeof(r
));
2030 if (n
== sizeof(r
)) { /* an error code was sent to us */
2035 if (n
!= 0) /* on success we should have read 0 bytes */
2038 r
= wait_for_terminate_and_check("(sd-userns)", pid
, 0);
2042 if (r
!= EXIT_SUCCESS
) /* If something strange happened with the child, let's consider this fatal, too */
2048 static bool exec_directory_is_private(const ExecContext
*context
, ExecDirectoryType type
) {
2049 if (!context
->dynamic_user
)
2052 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
2055 if (type
== EXEC_DIRECTORY_RUNTIME
&& context
->runtime_directory_preserve_mode
== EXEC_PRESERVE_NO
)
2061 static int setup_exec_directory(
2062 const ExecContext
*context
,
2063 const ExecParameters
*params
,
2066 ExecDirectoryType type
,
2069 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
2070 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
2071 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
2072 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
2073 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
2074 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
2081 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
2082 assert(exit_status
);
2084 if (!params
->prefix
[type
])
2087 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
2088 if (!uid_is_valid(uid
))
2090 if (!gid_is_valid(gid
))
2094 STRV_FOREACH(rt
, context
->directories
[type
].paths
) {
2095 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
2097 p
= path_join(params
->prefix
[type
], *rt
);
2103 r
= mkdir_parents_label(p
, 0755);
2107 if (exec_directory_is_private(context
, type
)) {
2108 _cleanup_free_
char *private_root
= NULL
;
2110 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2111 * case we want to avoid leaving a directory around fully accessible that is owned by
2112 * a dynamic user whose UID is later on reused. To lock this down we use the same
2113 * trick used by container managers to prohibit host users to get access to files of
2114 * the same UID in containers: we place everything inside a directory that has an
2115 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2116 * for unprivileged host code. We then use fs namespacing to make this directory
2117 * permeable for the service itself.
2119 * Specifically: for a service which wants a special directory "foo/" we first create
2120 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2121 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2122 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2123 * unprivileged host users can't look into it. Inside of the namespace of the unit
2124 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2125 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2126 * for the service and making sure it only gets access to the dirs it needs but no
2127 * others. Tricky? Yes, absolutely, but it works!
2129 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2130 * to be owned by the service itself.
2132 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2133 * for sharing files or sockets with other services. */
2135 private_root
= path_join(params
->prefix
[type
], "private");
2136 if (!private_root
) {
2141 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2142 r
= mkdir_safe_label(private_root
, 0700, 0, 0, MKDIR_WARN_MODE
);
2146 pp
= path_join(private_root
, *rt
);
2152 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2153 r
= mkdir_parents_label(pp
, 0755);
2157 if (is_dir(p
, false) > 0 &&
2158 (laccess(pp
, F_OK
) < 0 && errno
== ENOENT
)) {
2160 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2161 * it over. Most likely the service has been upgraded from one that didn't use
2162 * DynamicUser=1, to one that does. */
2164 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2165 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2166 exec_directory_type_to_string(type
), p
, pp
);
2168 if (rename(p
, pp
) < 0) {
2173 /* Otherwise, create the actual directory for the service */
2175 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
2176 if (r
< 0 && r
!= -EEXIST
)
2180 /* And link it up from the original place */
2181 r
= symlink_idempotent(pp
, p
, true);
2186 _cleanup_free_
char *target
= NULL
;
2188 if (type
!= EXEC_DIRECTORY_CONFIGURATION
&&
2189 readlink_and_make_absolute(p
, &target
) >= 0) {
2190 _cleanup_free_
char *q
= NULL
;
2192 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193 * by DynamicUser=1 (see above)?
2195 * We do this for all directory types except for ConfigurationDirectory=,
2196 * since they all support the private/ symlink logic at least in some
2197 * configurations, see above. */
2199 q
= path_join(params
->prefix
[type
], "private", *rt
);
2205 if (path_equal(q
, target
)) {
2207 /* Hmm, apparently DynamicUser= was once turned on for this service,
2208 * but is no longer. Let's move the directory back up. */
2210 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2211 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2212 exec_directory_type_to_string(type
), q
, p
);
2214 if (unlink(p
) < 0) {
2219 if (rename(q
, p
) < 0) {
2226 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2231 if (type
== EXEC_DIRECTORY_CONFIGURATION
) {
2234 /* Don't change the owner/access mode of the configuration directory,
2235 * as in the common case it is not written to by a service, and shall
2236 * not be writable. */
2238 if (stat(p
, &st
) < 0) {
2243 /* Still complain if the access mode doesn't match */
2244 if (((st
.st_mode
^ context
->directories
[type
].mode
) & 07777) != 0)
2245 log_warning("%s \'%s\' already exists but the mode is different. "
2246 "(File system: %o %sMode: %o)",
2247 exec_directory_type_to_string(type
), *rt
,
2248 st
.st_mode
& 07777, exec_directory_type_to_string(type
), context
->directories
[type
].mode
& 07777);
2255 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2256 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2257 * current UID/GID ownership.) */
2258 r
= chmod_and_chown(pp
?: p
, context
->directories
[type
].mode
, UID_INVALID
, GID_INVALID
);
2262 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2263 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2264 * assignments to exist.*/
2265 r
= path_chown_recursive(pp
?: p
, uid
, gid
, context
->dynamic_user
? 01777 : 07777);
2273 *exit_status
= exit_status_table
[type
];
2278 static int setup_smack(
2279 const ExecContext
*context
,
2280 const ExecCommand
*command
) {
2287 if (context
->smack_process_label
) {
2288 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
2292 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2294 _cleanup_free_
char *exec_label
= NULL
;
2296 r
= mac_smack_read(command
->path
, SMACK_ATTR_EXEC
, &exec_label
);
2297 if (r
< 0 && !IN_SET(r
, -ENODATA
, -EOPNOTSUPP
))
2300 r
= mac_smack_apply_pid(0, exec_label
? : SMACK_DEFAULT_PROCESS_LABEL
);
2310 static int compile_bind_mounts(
2311 const ExecContext
*context
,
2312 const ExecParameters
*params
,
2313 BindMount
**ret_bind_mounts
,
2314 size_t *ret_n_bind_mounts
,
2315 char ***ret_empty_directories
) {
2317 _cleanup_strv_free_
char **empty_directories
= NULL
;
2318 BindMount
*bind_mounts
;
2320 ExecDirectoryType t
;
2325 assert(ret_bind_mounts
);
2326 assert(ret_n_bind_mounts
);
2327 assert(ret_empty_directories
);
2329 n
= context
->n_bind_mounts
;
2330 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2331 if (!params
->prefix
[t
])
2334 n
+= strv_length(context
->directories
[t
].paths
);
2338 *ret_bind_mounts
= NULL
;
2339 *ret_n_bind_mounts
= 0;
2340 *ret_empty_directories
= NULL
;
2344 bind_mounts
= new(BindMount
, n
);
2348 for (i
= 0; i
< context
->n_bind_mounts
; i
++) {
2349 BindMount
*item
= context
->bind_mounts
+ i
;
2352 s
= strdup(item
->source
);
2358 d
= strdup(item
->destination
);
2365 bind_mounts
[h
++] = (BindMount
) {
2368 .read_only
= item
->read_only
,
2369 .recursive
= item
->recursive
,
2370 .ignore_enoent
= item
->ignore_enoent
,
2374 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2377 if (!params
->prefix
[t
])
2380 if (strv_isempty(context
->directories
[t
].paths
))
2383 if (exec_directory_is_private(context
, t
) &&
2384 !(context
->root_directory
|| context
->root_image
)) {
2387 /* So this is for a dynamic user, and we need to make sure the process can access its own
2388 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2389 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2391 private_root
= path_join(params
->prefix
[t
], "private");
2392 if (!private_root
) {
2397 r
= strv_consume(&empty_directories
, private_root
);
2402 STRV_FOREACH(suffix
, context
->directories
[t
].paths
) {
2405 if (exec_directory_is_private(context
, t
))
2406 s
= path_join(params
->prefix
[t
], "private", *suffix
);
2408 s
= path_join(params
->prefix
[t
], *suffix
);
2414 if (exec_directory_is_private(context
, t
) &&
2415 (context
->root_directory
|| context
->root_image
))
2416 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2417 * directory is not created on the root directory. So, let's bind-mount the directory
2418 * on the 'non-private' place. */
2419 d
= path_join(params
->prefix
[t
], *suffix
);
2428 bind_mounts
[h
++] = (BindMount
) {
2432 .nosuid
= context
->dynamic_user
, /* don't allow suid/sgid when DynamicUser= is on */
2434 .ignore_enoent
= false,
2441 *ret_bind_mounts
= bind_mounts
;
2442 *ret_n_bind_mounts
= n
;
2443 *ret_empty_directories
= TAKE_PTR(empty_directories
);
2448 bind_mount_free_many(bind_mounts
, h
);
2452 static int apply_mount_namespace(
2454 const ExecCommand
*command
,
2455 const ExecContext
*context
,
2456 const ExecParameters
*params
,
2457 const ExecRuntime
*runtime
,
2458 char **error_path
) {
2460 _cleanup_strv_free_
char **empty_directories
= NULL
;
2461 char *tmp
= NULL
, *var
= NULL
;
2462 const char *root_dir
= NULL
, *root_image
= NULL
;
2463 NamespaceInfo ns_info
;
2464 bool needs_sandboxing
;
2465 BindMount
*bind_mounts
= NULL
;
2466 size_t n_bind_mounts
= 0;
2471 /* The runtime struct only contains the parent of the private /tmp,
2472 * which is non-accessible to world users. Inside of it there's a /tmp
2473 * that is sticky, and that's the one we want to use here. */
2475 if (context
->private_tmp
&& runtime
) {
2476 if (runtime
->tmp_dir
)
2477 tmp
= strjoina(runtime
->tmp_dir
, "/tmp");
2478 if (runtime
->var_tmp_dir
)
2479 var
= strjoina(runtime
->var_tmp_dir
, "/tmp");
2482 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2483 root_image
= context
->root_image
;
2486 root_dir
= context
->root_directory
;
2489 r
= compile_bind_mounts(context
, params
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
2493 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2494 if (needs_sandboxing
)
2495 ns_info
= (NamespaceInfo
) {
2496 .ignore_protect_paths
= false,
2497 .private_dev
= context
->private_devices
,
2498 .protect_control_groups
= context
->protect_control_groups
,
2499 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
2500 .protect_kernel_modules
= context
->protect_kernel_modules
,
2501 .protect_hostname
= context
->protect_hostname
,
2502 .mount_apivfs
= context
->mount_apivfs
,
2503 .private_mounts
= context
->private_mounts
,
2505 else if (!context
->dynamic_user
&& root_dir
)
2507 * If DynamicUser=no and RootDirectory= is set then let's pass a relaxed
2508 * sandbox info, otherwise enforce it, don't ignore protected paths and
2509 * fail if we are unable to apply the sandbox inside the mount namespace.
2511 ns_info
= (NamespaceInfo
) {
2512 .ignore_protect_paths
= true,
2515 ns_info
= (NamespaceInfo
) {};
2517 if (context
->mount_flags
== MS_SHARED
)
2518 log_unit_debug(u
, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2520 r
= setup_namespace(root_dir
, root_image
,
2521 &ns_info
, context
->read_write_paths
,
2522 needs_sandboxing
? context
->read_only_paths
: NULL
,
2523 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
2527 context
->temporary_filesystems
,
2528 context
->n_temporary_filesystems
,
2531 needs_sandboxing
? context
->protect_home
: PROTECT_HOME_NO
,
2532 needs_sandboxing
? context
->protect_system
: PROTECT_SYSTEM_NO
,
2533 context
->mount_flags
,
2534 DISSECT_IMAGE_DISCARD_ON_LOOP
,
2537 bind_mount_free_many(bind_mounts
, n_bind_mounts
);
2539 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2540 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2541 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2542 * completely different execution environment. */
2544 if (n_bind_mounts
== 0 &&
2545 context
->n_temporary_filesystems
== 0 &&
2546 !root_dir
&& !root_image
&&
2547 !context
->dynamic_user
) {
2548 log_unit_debug(u
, "Failed to set up namespace, assuming containerized execution and ignoring.");
2552 log_unit_debug(u
, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2553 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2554 n_bind_mounts
, context
->n_temporary_filesystems
, yes_no(root_dir
), yes_no(root_image
), yes_no(context
->dynamic_user
));
2562 static int apply_working_directory(
2563 const ExecContext
*context
,
2564 const ExecParameters
*params
,
2571 assert(exit_status
);
2573 if (context
->working_directory_home
) {
2576 *exit_status
= EXIT_CHDIR
;
2582 } else if (context
->working_directory
)
2583 wd
= context
->working_directory
;
2587 if (params
->flags
& EXEC_APPLY_CHROOT
)
2590 d
= prefix_roota(context
->root_directory
, wd
);
2592 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
2593 *exit_status
= EXIT_CHDIR
;
2600 static int apply_root_directory(
2601 const ExecContext
*context
,
2602 const ExecParameters
*params
,
2603 const bool needs_mount_ns
,
2607 assert(exit_status
);
2609 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2610 if (!needs_mount_ns
&& context
->root_directory
)
2611 if (chroot(context
->root_directory
) < 0) {
2612 *exit_status
= EXIT_CHROOT
;
2620 static int setup_keyring(
2622 const ExecContext
*context
,
2623 const ExecParameters
*p
,
2624 uid_t uid
, gid_t gid
) {
2626 key_serial_t keyring
;
2635 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2636 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2637 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2638 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2639 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2640 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2642 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
2645 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2646 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2647 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2648 * & group is just as nasty as acquiring a reference to the user keyring. */
2650 saved_uid
= getuid();
2651 saved_gid
= getgid();
2653 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2654 if (setregid(gid
, -1) < 0)
2655 return log_unit_error_errno(u
, errno
, "Failed to change GID for user keyring: %m");
2658 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2659 if (setreuid(uid
, -1) < 0) {
2660 r
= log_unit_error_errno(u
, errno
, "Failed to change UID for user keyring: %m");
2665 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
2666 if (keyring
== -1) {
2667 if (errno
== ENOSYS
)
2668 log_unit_debug_errno(u
, errno
, "Kernel keyring not supported, ignoring.");
2669 else if (IN_SET(errno
, EACCES
, EPERM
))
2670 log_unit_debug_errno(u
, errno
, "Kernel keyring access prohibited, ignoring.");
2671 else if (errno
== EDQUOT
)
2672 log_unit_debug_errno(u
, errno
, "Out of kernel keyrings to allocate, ignoring.");
2674 r
= log_unit_error_errno(u
, errno
, "Setting up kernel keyring failed: %m");
2679 /* When requested link the user keyring into the session keyring. */
2680 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
2682 if (keyctl(KEYCTL_LINK
,
2683 KEY_SPEC_USER_KEYRING
,
2684 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
2685 r
= log_unit_error_errno(u
, errno
, "Failed to link user keyring into session keyring: %m");
2690 /* Restore uid/gid back */
2691 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2692 if (setreuid(saved_uid
, -1) < 0) {
2693 r
= log_unit_error_errno(u
, errno
, "Failed to change UID back for user keyring: %m");
2698 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2699 if (setregid(saved_gid
, -1) < 0)
2700 return log_unit_error_errno(u
, errno
, "Failed to change GID back for user keyring: %m");
2703 /* Populate the keyring with the invocation ID by default, as original saved_uid. */
2704 if (!sd_id128_is_null(u
->invocation_id
)) {
2707 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
2709 log_unit_debug_errno(u
, errno
, "Failed to add invocation ID to keyring, ignoring: %m");
2711 if (keyctl(KEYCTL_SETPERM
, key
,
2712 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
2713 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
2714 r
= log_unit_error_errno(u
, errno
, "Failed to restrict invocation ID permission: %m");
2719 /* Revert back uid & gid for the last time, and exit */
2720 /* no extra logging, as only the first already reported error matters */
2721 if (getuid() != saved_uid
)
2722 (void) setreuid(saved_uid
, -1);
2724 if (getgid() != saved_gid
)
2725 (void) setregid(saved_gid
, -1);
2730 static void append_socket_pair(int *array
, size_t *n
, const int pair
[static 2]) {
2736 array
[(*n
)++] = pair
[0];
2738 array
[(*n
)++] = pair
[1];
2741 static int close_remaining_fds(
2742 const ExecParameters
*params
,
2743 const ExecRuntime
*runtime
,
2744 const DynamicCreds
*dcreds
,
2748 int *fds
, size_t n_fds
) {
2750 size_t n_dont_close
= 0;
2751 int dont_close
[n_fds
+ 12];
2755 if (params
->stdin_fd
>= 0)
2756 dont_close
[n_dont_close
++] = params
->stdin_fd
;
2757 if (params
->stdout_fd
>= 0)
2758 dont_close
[n_dont_close
++] = params
->stdout_fd
;
2759 if (params
->stderr_fd
>= 0)
2760 dont_close
[n_dont_close
++] = params
->stderr_fd
;
2763 dont_close
[n_dont_close
++] = socket_fd
;
2765 dont_close
[n_dont_close
++] = exec_fd
;
2767 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
2768 n_dont_close
+= n_fds
;
2772 append_socket_pair(dont_close
, &n_dont_close
, runtime
->netns_storage_socket
);
2776 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->user
->storage_socket
);
2778 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->group
->storage_socket
);
2781 if (user_lookup_fd
>= 0)
2782 dont_close
[n_dont_close
++] = user_lookup_fd
;
2784 return close_all_fds(dont_close
, n_dont_close
);
2787 static int send_user_lookup(
2795 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2796 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2799 if (user_lookup_fd
< 0)
2802 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
2805 if (writev(user_lookup_fd
,
2807 IOVEC_INIT(&uid
, sizeof(uid
)),
2808 IOVEC_INIT(&gid
, sizeof(gid
)),
2809 IOVEC_INIT_STRING(unit
->id
) }, 3) < 0)
2815 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
2822 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2827 if (!c
->working_directory_home
)
2830 r
= get_home_dir(buf
);
2838 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
2839 _cleanup_strv_free_
char ** list
= NULL
;
2840 ExecDirectoryType t
;
2847 assert(c
->dynamic_user
);
2849 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2850 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2853 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2856 if (t
== EXEC_DIRECTORY_CONFIGURATION
)
2862 STRV_FOREACH(i
, c
->directories
[t
].paths
) {
2865 if (exec_directory_is_private(c
, t
))
2866 e
= path_join(p
->prefix
[t
], "private", *i
);
2868 e
= path_join(p
->prefix
[t
], *i
);
2872 r
= strv_consume(&list
, e
);
2878 *ret
= TAKE_PTR(list
);
2883 static char *exec_command_line(char **argv
);
2885 static int exec_parameters_get_cgroup_path(const ExecParameters
*params
, char **ret
) {
2886 bool using_subcgroup
;
2892 if (!params
->cgroup_path
)
2895 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
2896 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
2897 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
2898 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
2899 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
2900 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
2901 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
2902 * flag, which is only passed for the former statements, not for the latter. */
2904 using_subcgroup
= FLAGS_SET(params
->flags
, EXEC_CONTROL_CGROUP
|EXEC_CGROUP_DELEGATE
|EXEC_IS_CONTROL
);
2905 if (using_subcgroup
)
2906 p
= path_join(params
->cgroup_path
, ".control");
2908 p
= strdup(params
->cgroup_path
);
2913 return using_subcgroup
;
2916 static int exec_child(
2918 const ExecCommand
*command
,
2919 const ExecContext
*context
,
2920 const ExecParameters
*params
,
2921 ExecRuntime
*runtime
,
2922 DynamicCreds
*dcreds
,
2924 const int named_iofds
[static 3],
2926 size_t n_socket_fds
,
2927 size_t n_storage_fds
,
2932 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **accum_env
= NULL
, **replaced_argv
= NULL
;
2933 int *fds_with_exec_fd
, n_fds_with_exec_fd
, r
, ngids
= 0, exec_fd
= -1;
2934 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
2935 const char *username
= NULL
, *groupname
= NULL
;
2936 _cleanup_free_
char *home_buffer
= NULL
;
2937 const char *home
= NULL
, *shell
= NULL
;
2938 char **final_argv
= NULL
;
2939 dev_t journal_stream_dev
= 0;
2940 ino_t journal_stream_ino
= 0;
2941 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2942 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
2943 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
2944 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
2946 _cleanup_free_
char *mac_selinux_context_net
= NULL
;
2947 bool use_selinux
= false;
2950 bool use_smack
= false;
2953 bool use_apparmor
= false;
2955 uid_t uid
= UID_INVALID
;
2956 gid_t gid
= GID_INVALID
;
2958 ExecDirectoryType dt
;
2965 assert(exit_status
);
2967 rename_process_from_path(command
->path
);
2969 /* We reset exactly these signals, since they are the
2970 * only ones we set to SIG_IGN in the main daemon. All
2971 * others we leave untouched because we set them to
2972 * SIG_DFL or a valid handler initially, both of which
2973 * will be demoted to SIG_DFL. */
2974 (void) default_signals(SIGNALS_CRASH_HANDLER
,
2975 SIGNALS_IGNORE
, -1);
2977 if (context
->ignore_sigpipe
)
2978 (void) ignore_signals(SIGPIPE
, -1);
2980 r
= reset_signal_mask();
2982 *exit_status
= EXIT_SIGNAL_MASK
;
2983 return log_unit_error_errno(unit
, r
, "Failed to set process signal mask: %m");
2986 if (params
->idle_pipe
)
2987 do_idle_pipe_dance(params
->idle_pipe
);
2989 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2990 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2991 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2992 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2995 log_set_open_when_needed(true);
2997 /* In case anything used libc syslog(), close this here, too */
3000 n_fds
= n_socket_fds
+ n_storage_fds
;
3001 r
= close_remaining_fds(params
, runtime
, dcreds
, user_lookup_fd
, socket_fd
, params
->exec_fd
, fds
, n_fds
);
3003 *exit_status
= EXIT_FDS
;
3004 return log_unit_error_errno(unit
, r
, "Failed to close unwanted file descriptors: %m");
3007 if (!context
->same_pgrp
)
3009 *exit_status
= EXIT_SETSID
;
3010 return log_unit_error_errno(unit
, errno
, "Failed to create new process session: %m");
3013 exec_context_tty_reset(context
, params
);
3015 if (unit_shall_confirm_spawn(unit
)) {
3016 const char *vc
= params
->confirm_spawn
;
3017 _cleanup_free_
char *cmdline
= NULL
;
3019 cmdline
= exec_command_line(command
->argv
);
3021 *exit_status
= EXIT_MEMORY
;
3025 r
= ask_for_confirmation(vc
, unit
, cmdline
);
3026 if (r
!= CONFIRM_EXECUTE
) {
3027 if (r
== CONFIRM_PRETEND_SUCCESS
) {
3028 *exit_status
= EXIT_SUCCESS
;
3031 *exit_status
= EXIT_CONFIRM
;
3032 log_unit_error(unit
, "Execution cancelled by the user");
3037 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3038 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3039 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3040 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3041 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3042 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit
->id
, true) != 0 ||
3043 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit
->manager
) ? "system" : "user", true) != 0) {
3044 *exit_status
= EXIT_MEMORY
;
3045 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
3048 if (context
->dynamic_user
&& dcreds
) {
3049 _cleanup_strv_free_
char **suggested_paths
= NULL
;
3051 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3052 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3053 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3054 *exit_status
= EXIT_USER
;
3055 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
3058 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
3060 *exit_status
= EXIT_MEMORY
;
3064 r
= dynamic_creds_realize(dcreds
, suggested_paths
, &uid
, &gid
);
3066 *exit_status
= EXIT_USER
;
3068 log_unit_error(unit
, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3071 return log_unit_error_errno(unit
, r
, "Failed to update dynamic user credentials: %m");
3074 if (!uid_is_valid(uid
)) {
3075 *exit_status
= EXIT_USER
;
3076 log_unit_error(unit
, "UID validation failed for \""UID_FMT
"\"", uid
);
3080 if (!gid_is_valid(gid
)) {
3081 *exit_status
= EXIT_USER
;
3082 log_unit_error(unit
, "GID validation failed for \""GID_FMT
"\"", gid
);
3087 username
= dcreds
->user
->name
;
3090 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
3092 *exit_status
= EXIT_USER
;
3093 return log_unit_error_errno(unit
, r
, "Failed to determine user credentials: %m");
3096 r
= get_fixed_group(context
, &groupname
, &gid
);
3098 *exit_status
= EXIT_GROUP
;
3099 return log_unit_error_errno(unit
, r
, "Failed to determine group credentials: %m");
3103 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3104 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
3105 &supplementary_gids
, &ngids
);
3107 *exit_status
= EXIT_GROUP
;
3108 return log_unit_error_errno(unit
, r
, "Failed to determine supplementary groups: %m");
3111 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
3113 *exit_status
= EXIT_USER
;
3114 return log_unit_error_errno(unit
, r
, "Failed to send user credentials to PID1: %m");
3117 user_lookup_fd
= safe_close(user_lookup_fd
);
3119 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
3121 *exit_status
= EXIT_CHDIR
;
3122 return log_unit_error_errno(unit
, r
, "Failed to determine $HOME for user: %m");
3125 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3126 * must make sure to drop O_NONBLOCK */
3128 (void) fd_nonblock(socket_fd
, false);
3130 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3131 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3132 if (params
->cgroup_path
) {
3133 _cleanup_free_
char *p
= NULL
;
3135 r
= exec_parameters_get_cgroup_path(params
, &p
);
3137 *exit_status
= EXIT_CGROUP
;
3138 return log_unit_error_errno(unit
, r
, "Failed to acquire cgroup path: %m");
3141 r
= cg_attach_everywhere(params
->cgroup_supported
, p
, 0, NULL
, NULL
);
3143 *exit_status
= EXIT_CGROUP
;
3144 return log_unit_error_errno(unit
, r
, "Failed to attach to cgroup %s: %m", p
);
3148 if (context
->network_namespace_path
&& runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
3149 r
= open_netns_path(runtime
->netns_storage_socket
, context
->network_namespace_path
);
3151 *exit_status
= EXIT_NETWORK
;
3152 return log_unit_error_errno(unit
, r
, "Failed to open network namespace path %s: %m", context
->network_namespace_path
);
3156 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
3158 *exit_status
= EXIT_STDIN
;
3159 return log_unit_error_errno(unit
, r
, "Failed to set up standard input: %m");
3162 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
3164 *exit_status
= EXIT_STDOUT
;
3165 return log_unit_error_errno(unit
, r
, "Failed to set up standard output: %m");
3168 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
3170 *exit_status
= EXIT_STDERR
;
3171 return log_unit_error_errno(unit
, r
, "Failed to set up standard error output: %m");
3174 if (context
->oom_score_adjust_set
) {
3175 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3176 * prohibit write access to this file, and we shouldn't trip up over that. */
3177 r
= set_oom_score_adjust(context
->oom_score_adjust
);
3178 if (IN_SET(r
, -EPERM
, -EACCES
))
3179 log_unit_debug_errno(unit
, r
, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3181 *exit_status
= EXIT_OOM_ADJUST
;
3182 return log_unit_error_errno(unit
, r
, "Failed to adjust OOM setting: %m");
3186 if (context
->nice_set
)
3187 if (setpriority(PRIO_PROCESS
, 0, context
->nice
) < 0) {
3188 *exit_status
= EXIT_NICE
;
3189 return log_unit_error_errno(unit
, errno
, "Failed to set up process scheduling priority (nice level): %m");
3192 if (context
->cpu_sched_set
) {
3193 struct sched_param param
= {
3194 .sched_priority
= context
->cpu_sched_priority
,
3197 r
= sched_setscheduler(0,
3198 context
->cpu_sched_policy
|
3199 (context
->cpu_sched_reset_on_fork
?
3200 SCHED_RESET_ON_FORK
: 0),
3203 *exit_status
= EXIT_SETSCHEDULER
;
3204 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU scheduling: %m");
3208 if (context
->cpu_set
.set
)
3209 if (sched_setaffinity(0, context
->cpu_set
.allocated
, context
->cpu_set
.set
) < 0) {
3210 *exit_status
= EXIT_CPUAFFINITY
;
3211 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU affinity: %m");
3214 if (mpol_is_valid(numa_policy_get_type(&context
->numa_policy
))) {
3215 r
= apply_numa_policy(&context
->numa_policy
);
3216 if (r
== -EOPNOTSUPP
)
3217 log_unit_debug_errno(unit
, r
, "NUMA support not available, ignoring.");
3219 *exit_status
= EXIT_NUMA_POLICY
;
3220 return log_unit_error_errno(unit
, r
, "Failed to set NUMA memory policy: %m");
3224 if (context
->ioprio_set
)
3225 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
3226 *exit_status
= EXIT_IOPRIO
;
3227 return log_unit_error_errno(unit
, errno
, "Failed to set up IO scheduling priority: %m");
3230 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
3231 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
3232 *exit_status
= EXIT_TIMERSLACK
;
3233 return log_unit_error_errno(unit
, errno
, "Failed to set up timer slack: %m");
3236 if (context
->personality
!= PERSONALITY_INVALID
) {
3237 r
= safe_personality(context
->personality
);
3239 *exit_status
= EXIT_PERSONALITY
;
3240 return log_unit_error_errno(unit
, r
, "Failed to set up execution domain (personality): %m");
3244 if (context
->utmp_id
)
3245 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
3247 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
3248 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
3252 if (uid_is_valid(uid
)) {
3253 r
= chown_terminal(STDIN_FILENO
, uid
);
3255 *exit_status
= EXIT_STDIN
;
3256 return log_unit_error_errno(unit
, r
, "Failed to change ownership of terminal: %m");
3260 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3261 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3262 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3263 * touch a single hierarchy too. */
3264 if (params
->cgroup_path
&& context
->user
&& (params
->flags
& EXEC_CGROUP_DELEGATE
)) {
3265 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, uid
, gid
);
3267 *exit_status
= EXIT_CGROUP
;
3268 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
3272 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
3273 r
= setup_exec_directory(context
, params
, uid
, gid
, dt
, exit_status
);
3275 return log_unit_error_errno(unit
, r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
3278 r
= build_environment(
3290 *exit_status
= EXIT_MEMORY
;
3294 r
= build_pass_environment(context
, &pass_env
);
3296 *exit_status
= EXIT_MEMORY
;
3300 accum_env
= strv_env_merge(5,
3301 params
->environment
,
3304 context
->environment
,
3308 *exit_status
= EXIT_MEMORY
;
3311 accum_env
= strv_env_clean(accum_env
);
3313 (void) umask(context
->umask
);
3315 r
= setup_keyring(unit
, context
, params
, uid
, gid
);
3317 *exit_status
= EXIT_KEYRING
;
3318 return log_unit_error_errno(unit
, r
, "Failed to set up kernel keyring: %m");
3321 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3322 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
3324 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3325 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
3327 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3328 if (needs_ambient_hack
)
3329 needs_setuid
= false;
3331 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
3333 if (needs_sandboxing
) {
3334 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3335 * present. The actual MAC context application will happen later, as late as possible, to avoid
3336 * impacting our own code paths. */
3339 use_selinux
= mac_selinux_use();
3342 use_smack
= mac_smack_use();
3345 use_apparmor
= mac_apparmor_use();
3349 if (needs_sandboxing
) {
3352 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3353 * is set here. (See below.) */
3355 r
= setrlimit_closest_all((const struct rlimit
* const *) context
->rlimit
, &which_failed
);
3357 *exit_status
= EXIT_LIMITS
;
3358 return log_unit_error_errno(unit
, r
, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed
));
3364 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3365 * wins here. (See above.) */
3367 if (context
->pam_name
&& username
) {
3368 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
3370 *exit_status
= EXIT_PAM
;
3371 return log_unit_error_errno(unit
, r
, "Failed to set up PAM session: %m");
3376 if ((context
->private_network
|| context
->network_namespace_path
) && runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
3378 if (ns_type_supported(NAMESPACE_NET
)) {
3379 r
= setup_netns(runtime
->netns_storage_socket
);
3381 *exit_status
= EXIT_NETWORK
;
3382 return log_unit_error_errno(unit
, r
, "Failed to set up network namespacing: %m");
3384 } else if (context
->network_namespace_path
) {
3385 *exit_status
= EXIT_NETWORK
;
3386 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
), "NetworkNamespacePath= is not supported, refusing.");
3388 log_unit_warning(unit
, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3391 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
3392 if (needs_mount_namespace
) {
3393 _cleanup_free_
char *error_path
= NULL
;
3395 r
= apply_mount_namespace(unit
, command
, context
, params
, runtime
, &error_path
);
3397 *exit_status
= EXIT_NAMESPACE
;
3398 return log_unit_error_errno(unit
, r
, "Failed to set up mount namespacing%s%s: %m",
3399 error_path
? ": " : "", strempty(error_path
));
3403 if (context
->protect_hostname
) {
3404 if (ns_type_supported(NAMESPACE_UTS
)) {
3405 if (unshare(CLONE_NEWUTS
) < 0) {
3406 *exit_status
= EXIT_NAMESPACE
;
3407 return log_unit_error_errno(unit
, errno
, "Failed to set up UTS namespacing: %m");
3410 log_unit_warning(unit
, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
3412 r
= seccomp_protect_hostname();
3414 *exit_status
= EXIT_SECCOMP
;
3415 return log_unit_error_errno(unit
, r
, "Failed to apply hostname restrictions: %m");
3420 /* Drop groups as early as possible */
3422 r
= enforce_groups(gid
, supplementary_gids
, ngids
);
3424 *exit_status
= EXIT_GROUP
;
3425 return log_unit_error_errno(unit
, r
, "Changing group credentials failed: %m");
3429 if (needs_sandboxing
) {
3431 if (use_selinux
&& params
->selinux_context_net
&& socket_fd
>= 0) {
3432 r
= mac_selinux_get_child_mls_label(socket_fd
, command
->path
, context
->selinux_context
, &mac_selinux_context_net
);
3434 *exit_status
= EXIT_SELINUX_CONTEXT
;
3435 return log_unit_error_errno(unit
, r
, "Failed to determine SELinux context: %m");
3440 if (context
->private_users
) {
3441 r
= setup_private_users(uid
, gid
);
3443 *exit_status
= EXIT_USER
;
3444 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing: %m");
3449 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3450 * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3451 * however if we have it as we want to keep it open until the final execve(). */
3453 if (params
->exec_fd
>= 0) {
3454 exec_fd
= params
->exec_fd
;
3456 if (exec_fd
< 3 + (int) n_fds
) {
3459 /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3460 * process we are about to execute. */
3462 moved_fd
= fcntl(exec_fd
, F_DUPFD_CLOEXEC
, 3 + (int) n_fds
);
3464 *exit_status
= EXIT_FDS
;
3465 return log_unit_error_errno(unit
, errno
, "Couldn't move exec fd up: %m");
3468 safe_close(exec_fd
);
3471 /* This fd should be FD_CLOEXEC already, but let's make sure. */
3472 r
= fd_cloexec(exec_fd
, true);
3474 *exit_status
= EXIT_FDS
;
3475 return log_unit_error_errno(unit
, r
, "Failed to make exec fd FD_CLOEXEC: %m");
3479 fds_with_exec_fd
= newa(int, n_fds
+ 1);
3480 memcpy_safe(fds_with_exec_fd
, fds
, n_fds
* sizeof(int));
3481 fds_with_exec_fd
[n_fds
] = exec_fd
;
3482 n_fds_with_exec_fd
= n_fds
+ 1;
3484 fds_with_exec_fd
= fds
;
3485 n_fds_with_exec_fd
= n_fds
;
3488 r
= close_all_fds(fds_with_exec_fd
, n_fds_with_exec_fd
);
3490 r
= shift_fds(fds
, n_fds
);
3492 r
= flags_fds(fds
, n_socket_fds
, n_storage_fds
, context
->non_blocking
);
3494 *exit_status
= EXIT_FDS
;
3495 return log_unit_error_errno(unit
, r
, "Failed to adjust passed file descriptors: %m");
3498 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3499 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3500 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3503 secure_bits
= context
->secure_bits
;
3505 if (needs_sandboxing
) {
3508 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3509 * requested. (Note this is placed after the general resource limit initialization, see
3510 * above, in order to take precedence.) */
3511 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
3512 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
3513 *exit_status
= EXIT_LIMITS
;
3514 return log_unit_error_errno(unit
, errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3519 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3520 * process. This is the latest place before dropping capabilities. Other MAC contexts are set later. */
3522 r
= setup_smack(context
, command
);
3524 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
3525 return log_unit_error_errno(unit
, r
, "Failed to set SMACK process label: %m");
3530 bset
= context
->capability_bounding_set
;
3531 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3532 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3533 * instead of us doing that */
3534 if (needs_ambient_hack
)
3535 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
3536 (UINT64_C(1) << CAP_SETUID
) |
3537 (UINT64_C(1) << CAP_SETGID
);
3539 if (!cap_test_all(bset
)) {
3540 r
= capability_bounding_set_drop(bset
, false);
3542 *exit_status
= EXIT_CAPABILITIES
;
3543 return log_unit_error_errno(unit
, r
, "Failed to drop capabilities: %m");
3547 /* This is done before enforce_user, but ambient set
3548 * does not survive over setresuid() if keep_caps is not set. */
3549 if (!needs_ambient_hack
&&
3550 context
->capability_ambient_set
!= 0) {
3551 r
= capability_ambient_set_apply(context
->capability_ambient_set
, true);
3553 *exit_status
= EXIT_CAPABILITIES
;
3554 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (before UID change): %m");
3559 /* chroot to root directory first, before we lose the ability to chroot */
3560 r
= apply_root_directory(context
, params
, needs_mount_namespace
, exit_status
);
3562 return log_unit_error_errno(unit
, r
, "Chrooting to the requested root directory failed: %m");
3565 if (uid_is_valid(uid
)) {
3566 r
= enforce_user(context
, uid
);
3568 *exit_status
= EXIT_USER
;
3569 return log_unit_error_errno(unit
, r
, "Failed to change UID to " UID_FMT
": %m", uid
);
3572 if (!needs_ambient_hack
&&
3573 context
->capability_ambient_set
!= 0) {
3575 /* Fix the ambient capabilities after user change. */
3576 r
= capability_ambient_set_apply(context
->capability_ambient_set
, false);
3578 *exit_status
= EXIT_CAPABILITIES
;
3579 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (after UID change): %m");
3582 /* If we were asked to change user and ambient capabilities
3583 * were requested, we had to add keep-caps to the securebits
3584 * so that we would maintain the inherited capability set
3585 * through the setresuid(). Make sure that the bit is added
3586 * also to the context secure_bits so that we don't try to
3587 * drop the bit away next. */
3589 secure_bits
|= 1<<SECURE_KEEP_CAPS
;
3594 /* Apply working directory here, because the working directory might be on NFS and only the user running
3595 * this service might have the correct privilege to change to the working directory */
3596 r
= apply_working_directory(context
, params
, home
, exit_status
);
3598 return log_unit_error_errno(unit
, r
, "Changing to the requested working directory failed: %m");
3600 if (needs_sandboxing
) {
3601 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3602 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3603 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3604 * are restricted. */
3608 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
3611 r
= setexeccon(exec_context
);
3613 *exit_status
= EXIT_SELINUX_CONTEXT
;
3614 return log_unit_error_errno(unit
, r
, "Failed to change SELinux context to %s: %m", exec_context
);
3621 if (use_apparmor
&& context
->apparmor_profile
) {
3622 r
= aa_change_onexec(context
->apparmor_profile
);
3623 if (r
< 0 && !context
->apparmor_profile_ignore
) {
3624 *exit_status
= EXIT_APPARMOR_PROFILE
;
3625 return log_unit_error_errno(unit
, errno
, "Failed to prepare AppArmor profile change to %s: %m", context
->apparmor_profile
);
3630 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3631 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3632 if (prctl(PR_GET_SECUREBITS
) != secure_bits
)
3633 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
3634 *exit_status
= EXIT_SECUREBITS
;
3635 return log_unit_error_errno(unit
, errno
, "Failed to set process secure bits: %m");
3638 if (context_has_no_new_privileges(context
))
3639 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
3640 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
3641 return log_unit_error_errno(unit
, errno
, "Failed to disable new privileges: %m");
3645 r
= apply_address_families(unit
, context
);
3647 *exit_status
= EXIT_ADDRESS_FAMILIES
;
3648 return log_unit_error_errno(unit
, r
, "Failed to restrict address families: %m");
3651 r
= apply_memory_deny_write_execute(unit
, context
);
3653 *exit_status
= EXIT_SECCOMP
;
3654 return log_unit_error_errno(unit
, r
, "Failed to disable writing to executable memory: %m");
3657 r
= apply_restrict_realtime(unit
, context
);
3659 *exit_status
= EXIT_SECCOMP
;
3660 return log_unit_error_errno(unit
, r
, "Failed to apply realtime restrictions: %m");
3663 r
= apply_restrict_suid_sgid(unit
, context
);
3665 *exit_status
= EXIT_SECCOMP
;
3666 return log_unit_error_errno(unit
, r
, "Failed to apply SUID/SGID restrictions: %m");
3669 r
= apply_restrict_namespaces(unit
, context
);
3671 *exit_status
= EXIT_SECCOMP
;
3672 return log_unit_error_errno(unit
, r
, "Failed to apply namespace restrictions: %m");
3675 r
= apply_protect_sysctl(unit
, context
);
3677 *exit_status
= EXIT_SECCOMP
;
3678 return log_unit_error_errno(unit
, r
, "Failed to apply sysctl restrictions: %m");
3681 r
= apply_protect_kernel_modules(unit
, context
);
3683 *exit_status
= EXIT_SECCOMP
;
3684 return log_unit_error_errno(unit
, r
, "Failed to apply module loading restrictions: %m");
3687 r
= apply_private_devices(unit
, context
);
3689 *exit_status
= EXIT_SECCOMP
;
3690 return log_unit_error_errno(unit
, r
, "Failed to set up private devices: %m");
3693 r
= apply_syscall_archs(unit
, context
);
3695 *exit_status
= EXIT_SECCOMP
;
3696 return log_unit_error_errno(unit
, r
, "Failed to apply syscall architecture restrictions: %m");
3699 r
= apply_lock_personality(unit
, context
);
3701 *exit_status
= EXIT_SECCOMP
;
3702 return log_unit_error_errno(unit
, r
, "Failed to lock personalities: %m");
3705 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3706 * by the filter as little as possible. */
3707 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
3709 *exit_status
= EXIT_SECCOMP
;
3710 return log_unit_error_errno(unit
, r
, "Failed to apply system call filters: %m");
3715 if (!strv_isempty(context
->unset_environment
)) {
3718 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
3720 *exit_status
= EXIT_MEMORY
;
3724 strv_free_and_replace(accum_env
, ee
);
3727 if (!FLAGS_SET(command
->flags
, EXEC_COMMAND_NO_ENV_EXPAND
)) {
3728 replaced_argv
= replace_env_argv(command
->argv
, accum_env
);
3729 if (!replaced_argv
) {
3730 *exit_status
= EXIT_MEMORY
;
3733 final_argv
= replaced_argv
;
3735 final_argv
= command
->argv
;
3737 if (DEBUG_LOGGING
) {
3738 _cleanup_free_
char *line
;
3740 line
= exec_command_line(final_argv
);
3742 log_struct(LOG_DEBUG
,
3743 "EXECUTABLE=%s", command
->path
,
3744 LOG_UNIT_MESSAGE(unit
, "Executing: %s", line
),
3746 LOG_UNIT_INVOCATION_ID(unit
));
3752 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3753 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3755 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
3756 *exit_status
= EXIT_EXEC
;
3757 return log_unit_error_errno(unit
, errno
, "Failed to enable exec_fd: %m");
3761 execve(command
->path
, final_argv
, accum_env
);
3767 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3768 * that POLLHUP on it no longer means execve() succeeded. */
3770 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
3771 *exit_status
= EXIT_EXEC
;
3772 return log_unit_error_errno(unit
, errno
, "Failed to disable exec_fd: %m");
3776 if (r
== -ENOENT
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
)) {
3777 log_struct_errno(LOG_INFO
, r
,
3778 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3780 LOG_UNIT_INVOCATION_ID(unit
),
3781 LOG_UNIT_MESSAGE(unit
, "Executable %s missing, skipping: %m",
3783 "EXECUTABLE=%s", command
->path
);
3787 *exit_status
= EXIT_EXEC
;
3788 return log_unit_error_errno(unit
, r
, "Failed to execute command: %m");
3791 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***l
);
3792 static int exec_context_named_iofds(const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[static 3]);
3794 int exec_spawn(Unit
*unit
,
3795 ExecCommand
*command
,
3796 const ExecContext
*context
,
3797 const ExecParameters
*params
,
3798 ExecRuntime
*runtime
,
3799 DynamicCreds
*dcreds
,
3802 int socket_fd
, r
, named_iofds
[3] = { -1, -1, -1 }, *fds
= NULL
;
3803 _cleanup_free_
char *subcgroup_path
= NULL
;
3804 _cleanup_strv_free_
char **files_env
= NULL
;
3805 size_t n_storage_fds
= 0, n_socket_fds
= 0;
3806 _cleanup_free_
char *line
= NULL
;
3814 assert(params
->fds
|| (params
->n_socket_fds
+ params
->n_storage_fds
<= 0));
3816 if (context
->std_input
== EXEC_INPUT_SOCKET
||
3817 context
->std_output
== EXEC_OUTPUT_SOCKET
||
3818 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
3820 if (params
->n_socket_fds
> 1) {
3821 log_unit_error(unit
, "Got more than one socket.");
3825 if (params
->n_socket_fds
== 0) {
3826 log_unit_error(unit
, "Got no socket.");
3830 socket_fd
= params
->fds
[0];
3834 n_socket_fds
= params
->n_socket_fds
;
3835 n_storage_fds
= params
->n_storage_fds
;
3838 r
= exec_context_named_iofds(context
, params
, named_iofds
);
3840 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
3842 r
= exec_context_load_environment(unit
, context
, &files_env
);
3844 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
3846 line
= exec_command_line(command
->argv
);
3850 log_struct(LOG_DEBUG
,
3851 LOG_UNIT_MESSAGE(unit
, "About to execute: %s", line
),
3852 "EXECUTABLE=%s", command
->path
,
3854 LOG_UNIT_INVOCATION_ID(unit
));
3856 if (params
->cgroup_path
) {
3857 r
= exec_parameters_get_cgroup_path(params
, &subcgroup_path
);
3859 return log_unit_error_errno(unit
, r
, "Failed to acquire subcgroup path: %m");
3860 if (r
> 0) { /* We are using a child cgroup */
3861 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
);
3863 return log_unit_error_errno(unit
, r
, "Failed to create control group '%s': %m", subcgroup_path
);
3869 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
3872 int exit_status
= EXIT_SUCCESS
;
3874 r
= exec_child(unit
,
3886 unit
->manager
->user_lookup_fds
[1],
3890 const char *status
=
3891 exit_status_to_string(exit_status
,
3892 EXIT_STATUS_LIBC
| EXIT_STATUS_SYSTEMD
);
3894 log_struct_errno(LOG_ERR
, r
,
3895 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3897 LOG_UNIT_INVOCATION_ID(unit
),
3898 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
3899 status
, command
->path
),
3900 "EXECUTABLE=%s", command
->path
);
3906 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
3908 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
3909 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
3910 * process will be killed too). */
3912 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
, pid
);
3914 exec_status_start(&command
->exec_status
, pid
);
3920 void exec_context_init(ExecContext
*c
) {
3921 ExecDirectoryType i
;
3926 c
->ioprio
= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 0);
3927 c
->cpu_sched_policy
= SCHED_OTHER
;
3928 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
3929 c
->syslog_level_prefix
= true;
3930 c
->ignore_sigpipe
= true;
3931 c
->timer_slack_nsec
= NSEC_INFINITY
;
3932 c
->personality
= PERSONALITY_INVALID
;
3933 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3934 c
->directories
[i
].mode
= 0755;
3935 c
->timeout_clean_usec
= USEC_INFINITY
;
3936 c
->capability_bounding_set
= CAP_ALL
;
3937 assert_cc(NAMESPACE_FLAGS_INITIAL
!= NAMESPACE_FLAGS_ALL
);
3938 c
->restrict_namespaces
= NAMESPACE_FLAGS_INITIAL
;
3939 c
->log_level_max
= -1;
3940 numa_policy_reset(&c
->numa_policy
);
3943 void exec_context_done(ExecContext
*c
) {
3944 ExecDirectoryType i
;
3949 c
->environment
= strv_free(c
->environment
);
3950 c
->environment_files
= strv_free(c
->environment_files
);
3951 c
->pass_environment
= strv_free(c
->pass_environment
);
3952 c
->unset_environment
= strv_free(c
->unset_environment
);
3954 rlimit_free_all(c
->rlimit
);
3956 for (l
= 0; l
< 3; l
++) {
3957 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
3958 c
->stdio_file
[l
] = mfree(c
->stdio_file
[l
]);
3961 c
->working_directory
= mfree(c
->working_directory
);
3962 c
->root_directory
= mfree(c
->root_directory
);
3963 c
->root_image
= mfree(c
->root_image
);
3964 c
->tty_path
= mfree(c
->tty_path
);
3965 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
3966 c
->user
= mfree(c
->user
);
3967 c
->group
= mfree(c
->group
);
3969 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
3971 c
->pam_name
= mfree(c
->pam_name
);
3973 c
->read_only_paths
= strv_free(c
->read_only_paths
);
3974 c
->read_write_paths
= strv_free(c
->read_write_paths
);
3975 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
3977 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
3978 c
->bind_mounts
= NULL
;
3979 c
->n_bind_mounts
= 0;
3980 temporary_filesystem_free_many(c
->temporary_filesystems
, c
->n_temporary_filesystems
);
3981 c
->temporary_filesystems
= NULL
;
3982 c
->n_temporary_filesystems
= 0;
3984 cpu_set_reset(&c
->cpu_set
);
3985 numa_policy_reset(&c
->numa_policy
);
3987 c
->utmp_id
= mfree(c
->utmp_id
);
3988 c
->selinux_context
= mfree(c
->selinux_context
);
3989 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
3990 c
->smack_process_label
= mfree(c
->smack_process_label
);
3992 c
->syscall_filter
= hashmap_free(c
->syscall_filter
);
3993 c
->syscall_archs
= set_free(c
->syscall_archs
);
3994 c
->address_families
= set_free(c
->address_families
);
3996 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3997 c
->directories
[i
].paths
= strv_free(c
->directories
[i
].paths
);
3999 c
->log_level_max
= -1;
4001 exec_context_free_log_extra_fields(c
);
4003 c
->log_rate_limit_interval_usec
= 0;
4004 c
->log_rate_limit_burst
= 0;
4006 c
->stdin_data
= mfree(c
->stdin_data
);
4007 c
->stdin_data_size
= 0;
4009 c
->network_namespace_path
= mfree(c
->network_namespace_path
);
4012 int exec_context_destroy_runtime_directory(const ExecContext
*c
, const char *runtime_prefix
) {
4017 if (!runtime_prefix
)
4020 STRV_FOREACH(i
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) {
4021 _cleanup_free_
char *p
;
4023 if (exec_directory_is_private(c
, EXEC_DIRECTORY_RUNTIME
))
4024 p
= path_join(runtime_prefix
, "private", *i
);
4026 p
= path_join(runtime_prefix
, *i
);
4030 /* We execute this synchronously, since we need to be sure this is gone when we start the
4032 (void) rm_rf(p
, REMOVE_ROOT
);
4038 static void exec_command_done(ExecCommand
*c
) {
4041 c
->path
= mfree(c
->path
);
4042 c
->argv
= strv_free(c
->argv
);
4045 void exec_command_done_array(ExecCommand
*c
, size_t n
) {
4048 for (i
= 0; i
< n
; i
++)
4049 exec_command_done(c
+i
);
4052 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
4056 LIST_REMOVE(command
, c
, i
);
4057 exec_command_done(i
);
4064 void exec_command_free_array(ExecCommand
**c
, size_t n
) {
4067 for (i
= 0; i
< n
; i
++)
4068 c
[i
] = exec_command_free_list(c
[i
]);
4071 void exec_command_reset_status_array(ExecCommand
*c
, size_t n
) {
4074 for (i
= 0; i
< n
; i
++)
4075 exec_status_reset(&c
[i
].exec_status
);
4078 void exec_command_reset_status_list_array(ExecCommand
**c
, size_t n
) {
4081 for (i
= 0; i
< n
; i
++) {
4084 LIST_FOREACH(command
, z
, c
[i
])
4085 exec_status_reset(&z
->exec_status
);
4089 typedef struct InvalidEnvInfo
{
4094 static void invalid_env(const char *p
, void *userdata
) {
4095 InvalidEnvInfo
*info
= userdata
;
4097 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
4100 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
4106 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
4109 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
4112 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
4115 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
4118 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
4121 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
4128 static int exec_context_named_iofds(
4129 const ExecContext
*c
,
4130 const ExecParameters
*p
,
4131 int named_iofds
[static 3]) {
4134 const char* stdio_fdname
[3];
4139 assert(named_iofds
);
4141 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
4142 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
4143 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
4145 for (i
= 0; i
< 3; i
++)
4146 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
4148 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
4150 for (i
= 0; i
< n_fds
&& targets
> 0; i
++)
4151 if (named_iofds
[STDIN_FILENO
] < 0 &&
4152 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
4153 stdio_fdname
[STDIN_FILENO
] &&
4154 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
4156 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
4159 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
4160 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
4161 stdio_fdname
[STDOUT_FILENO
] &&
4162 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
4164 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
4167 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
4168 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
4169 stdio_fdname
[STDERR_FILENO
] &&
4170 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
4172 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
4176 return targets
== 0 ? 0 : -ENOENT
;
4179 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***l
) {
4180 char **i
, **r
= NULL
;
4185 STRV_FOREACH(i
, c
->environment_files
) {
4189 bool ignore
= false;
4191 _cleanup_globfree_ glob_t pglob
= {};
4200 if (!path_is_absolute(fn
)) {
4208 /* Filename supports globbing, take all matching files */
4209 k
= safe_glob(fn
, 0, &pglob
);
4218 /* When we don't match anything, -ENOENT should be returned */
4219 assert(pglob
.gl_pathc
> 0);
4221 for (n
= 0; n
< pglob
.gl_pathc
; n
++) {
4222 k
= load_env_file(NULL
, pglob
.gl_pathv
[n
], &p
);
4230 /* Log invalid environment variables with filename */
4232 InvalidEnvInfo info
= {
4234 .path
= pglob
.gl_pathv
[n
]
4237 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
4245 m
= strv_env_merge(2, r
, p
);
4261 static bool tty_may_match_dev_console(const char *tty
) {
4262 _cleanup_free_
char *resolved
= NULL
;
4267 tty
= skip_dev_prefix(tty
);
4269 /* trivial identity? */
4270 if (streq(tty
, "console"))
4273 if (resolve_dev_console(&resolved
) < 0)
4274 return true; /* if we could not resolve, assume it may */
4276 /* "tty0" means the active VC, so it may be the same sometimes */
4277 return path_equal(resolved
, tty
) || (streq(resolved
, "tty0") && tty_is_vc(tty
));
4280 static bool exec_context_may_touch_tty(const ExecContext
*ec
) {
4283 return ec
->tty_reset
||
4285 ec
->tty_vt_disallocate
||
4286 is_terminal_input(ec
->std_input
) ||
4287 is_terminal_output(ec
->std_output
) ||
4288 is_terminal_output(ec
->std_error
);
4291 bool exec_context_may_touch_console(const ExecContext
*ec
) {
4293 return exec_context_may_touch_tty(ec
) &&
4294 tty_may_match_dev_console(exec_context_tty_path(ec
));
4297 static void strv_fprintf(FILE *f
, char **l
) {
4303 fprintf(f
, " %s", *g
);
4306 void exec_context_dump(const ExecContext
*c
, FILE* f
, const char *prefix
) {
4307 char **e
, **d
, buf_clean
[FORMAT_TIMESPAN_MAX
];
4308 ExecDirectoryType dt
;
4315 prefix
= strempty(prefix
);
4319 "%sWorkingDirectory: %s\n"
4320 "%sRootDirectory: %s\n"
4321 "%sNonBlocking: %s\n"
4322 "%sPrivateTmp: %s\n"
4323 "%sPrivateDevices: %s\n"
4324 "%sProtectKernelTunables: %s\n"
4325 "%sProtectKernelModules: %s\n"
4326 "%sProtectControlGroups: %s\n"
4327 "%sPrivateNetwork: %s\n"
4328 "%sPrivateUsers: %s\n"
4329 "%sProtectHome: %s\n"
4330 "%sProtectSystem: %s\n"
4331 "%sMountAPIVFS: %s\n"
4332 "%sIgnoreSIGPIPE: %s\n"
4333 "%sMemoryDenyWriteExecute: %s\n"
4334 "%sRestrictRealtime: %s\n"
4335 "%sRestrictSUIDSGID: %s\n"
4336 "%sKeyringMode: %s\n"
4337 "%sProtectHostname: %s\n",
4339 prefix
, c
->working_directory
? c
->working_directory
: "/",
4340 prefix
, c
->root_directory
? c
->root_directory
: "/",
4341 prefix
, yes_no(c
->non_blocking
),
4342 prefix
, yes_no(c
->private_tmp
),
4343 prefix
, yes_no(c
->private_devices
),
4344 prefix
, yes_no(c
->protect_kernel_tunables
),
4345 prefix
, yes_no(c
->protect_kernel_modules
),
4346 prefix
, yes_no(c
->protect_control_groups
),
4347 prefix
, yes_no(c
->private_network
),
4348 prefix
, yes_no(c
->private_users
),
4349 prefix
, protect_home_to_string(c
->protect_home
),
4350 prefix
, protect_system_to_string(c
->protect_system
),
4351 prefix
, yes_no(c
->mount_apivfs
),
4352 prefix
, yes_no(c
->ignore_sigpipe
),
4353 prefix
, yes_no(c
->memory_deny_write_execute
),
4354 prefix
, yes_no(c
->restrict_realtime
),
4355 prefix
, yes_no(c
->restrict_suid_sgid
),
4356 prefix
, exec_keyring_mode_to_string(c
->keyring_mode
),
4357 prefix
, yes_no(c
->protect_hostname
));
4360 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
4362 STRV_FOREACH(e
, c
->environment
)
4363 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
4365 STRV_FOREACH(e
, c
->environment_files
)
4366 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
4368 STRV_FOREACH(e
, c
->pass_environment
)
4369 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
4371 STRV_FOREACH(e
, c
->unset_environment
)
4372 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
4374 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
4376 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
4377 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
4379 STRV_FOREACH(d
, c
->directories
[dt
].paths
)
4380 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), *d
);
4384 "%sTimeoutCleanSec: %s\n",
4385 prefix
, format_timespan(buf_clean
, sizeof(buf_clean
), c
->timeout_clean_usec
, USEC_PER_SEC
));
4392 if (c
->oom_score_adjust_set
)
4394 "%sOOMScoreAdjust: %i\n",
4395 prefix
, c
->oom_score_adjust
);
4397 for (i
= 0; i
< RLIM_NLIMITS
; i
++)
4399 fprintf(f
, "%sLimit%s: " RLIM_FMT
"\n",
4400 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
4401 fprintf(f
, "%sLimit%sSoft: " RLIM_FMT
"\n",
4402 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
4405 if (c
->ioprio_set
) {
4406 _cleanup_free_
char *class_str
= NULL
;
4408 r
= ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c
->ioprio
), &class_str
);
4410 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
4412 fprintf(f
, "%sIOPriority: %lu\n", prefix
, IOPRIO_PRIO_DATA(c
->ioprio
));
4415 if (c
->cpu_sched_set
) {
4416 _cleanup_free_
char *policy_str
= NULL
;
4418 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
4420 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
4423 "%sCPUSchedulingPriority: %i\n"
4424 "%sCPUSchedulingResetOnFork: %s\n",
4425 prefix
, c
->cpu_sched_priority
,
4426 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
4429 if (c
->cpu_set
.set
) {
4430 _cleanup_free_
char *affinity
= NULL
;
4432 affinity
= cpu_set_to_range_string(&c
->cpu_set
);
4433 fprintf(f
, "%sCPUAffinity: %s\n", prefix
, affinity
);
4436 if (mpol_is_valid(numa_policy_get_type(&c
->numa_policy
))) {
4437 _cleanup_free_
char *nodes
= NULL
;
4439 nodes
= cpu_set_to_range_string(&c
->numa_policy
.nodes
);
4440 fprintf(f
, "%sNUMAPolicy: %s\n", prefix
, mpol_to_string(numa_policy_get_type(&c
->numa_policy
)));
4441 fprintf(f
, "%sNUMAMask: %s\n", prefix
, strnull(nodes
));
4444 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
4445 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
4448 "%sStandardInput: %s\n"
4449 "%sStandardOutput: %s\n"
4450 "%sStandardError: %s\n",
4451 prefix
, exec_input_to_string(c
->std_input
),
4452 prefix
, exec_output_to_string(c
->std_output
),
4453 prefix
, exec_output_to_string(c
->std_error
));
4455 if (c
->std_input
== EXEC_INPUT_NAMED_FD
)
4456 fprintf(f
, "%sStandardInputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDIN_FILENO
]);
4457 if (c
->std_output
== EXEC_OUTPUT_NAMED_FD
)
4458 fprintf(f
, "%sStandardOutputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDOUT_FILENO
]);
4459 if (c
->std_error
== EXEC_OUTPUT_NAMED_FD
)
4460 fprintf(f
, "%sStandardErrorFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDERR_FILENO
]);
4462 if (c
->std_input
== EXEC_INPUT_FILE
)
4463 fprintf(f
, "%sStandardInputFile: %s\n", prefix
, c
->stdio_file
[STDIN_FILENO
]);
4464 if (c
->std_output
== EXEC_OUTPUT_FILE
)
4465 fprintf(f
, "%sStandardOutputFile: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
4466 if (c
->std_output
== EXEC_OUTPUT_FILE_APPEND
)
4467 fprintf(f
, "%sStandardOutputFileToAppend: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
4468 if (c
->std_error
== EXEC_OUTPUT_FILE
)
4469 fprintf(f
, "%sStandardErrorFile: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
4470 if (c
->std_error
== EXEC_OUTPUT_FILE_APPEND
)
4471 fprintf(f
, "%sStandardErrorFileToAppend: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
4477 "%sTTYVHangup: %s\n"
4478 "%sTTYVTDisallocate: %s\n",
4479 prefix
, c
->tty_path
,
4480 prefix
, yes_no(c
->tty_reset
),
4481 prefix
, yes_no(c
->tty_vhangup
),
4482 prefix
, yes_no(c
->tty_vt_disallocate
));
4484 if (IN_SET(c
->std_output
,
4487 EXEC_OUTPUT_JOURNAL
,
4488 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
4489 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
4490 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
4491 IN_SET(c
->std_error
,
4494 EXEC_OUTPUT_JOURNAL
,
4495 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
4496 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
4497 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
4499 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
4501 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
4503 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
4505 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
4507 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
4510 if (c
->log_level_max
>= 0) {
4511 _cleanup_free_
char *t
= NULL
;
4513 (void) log_level_to_string_alloc(c
->log_level_max
, &t
);
4515 fprintf(f
, "%sLogLevelMax: %s\n", prefix
, strna(t
));
4518 if (c
->log_rate_limit_interval_usec
> 0) {
4519 char buf_timespan
[FORMAT_TIMESPAN_MAX
];
4522 "%sLogRateLimitIntervalSec: %s\n",
4523 prefix
, format_timespan(buf_timespan
, sizeof(buf_timespan
), c
->log_rate_limit_interval_usec
, USEC_PER_SEC
));
4526 if (c
->log_rate_limit_burst
> 0)
4527 fprintf(f
, "%sLogRateLimitBurst: %u\n", prefix
, c
->log_rate_limit_burst
);
4529 if (c
->n_log_extra_fields
> 0) {
4532 for (j
= 0; j
< c
->n_log_extra_fields
; j
++) {
4533 fprintf(f
, "%sLogExtraFields: ", prefix
);
4534 fwrite(c
->log_extra_fields
[j
].iov_base
,
4535 1, c
->log_extra_fields
[j
].iov_len
,
4541 if (c
->secure_bits
) {
4542 _cleanup_free_
char *str
= NULL
;
4544 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
4546 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
4549 if (c
->capability_bounding_set
!= CAP_ALL
) {
4550 _cleanup_free_
char *str
= NULL
;
4552 r
= capability_set_to_string_alloc(c
->capability_bounding_set
, &str
);
4554 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
4557 if (c
->capability_ambient_set
!= 0) {
4558 _cleanup_free_
char *str
= NULL
;
4560 r
= capability_set_to_string_alloc(c
->capability_ambient_set
, &str
);
4562 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
4566 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
4568 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
4570 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
4572 if (!strv_isempty(c
->supplementary_groups
)) {
4573 fprintf(f
, "%sSupplementaryGroups:", prefix
);
4574 strv_fprintf(f
, c
->supplementary_groups
);
4579 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
4581 if (!strv_isempty(c
->read_write_paths
)) {
4582 fprintf(f
, "%sReadWritePaths:", prefix
);
4583 strv_fprintf(f
, c
->read_write_paths
);
4587 if (!strv_isempty(c
->read_only_paths
)) {
4588 fprintf(f
, "%sReadOnlyPaths:", prefix
);
4589 strv_fprintf(f
, c
->read_only_paths
);
4593 if (!strv_isempty(c
->inaccessible_paths
)) {
4594 fprintf(f
, "%sInaccessiblePaths:", prefix
);
4595 strv_fprintf(f
, c
->inaccessible_paths
);
4599 if (c
->n_bind_mounts
> 0)
4600 for (i
= 0; i
< c
->n_bind_mounts
; i
++)
4601 fprintf(f
, "%s%s: %s%s:%s:%s\n", prefix
,
4602 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
4603 c
->bind_mounts
[i
].ignore_enoent
? "-": "",
4604 c
->bind_mounts
[i
].source
,
4605 c
->bind_mounts
[i
].destination
,
4606 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
4608 if (c
->n_temporary_filesystems
> 0)
4609 for (i
= 0; i
< c
->n_temporary_filesystems
; i
++) {
4610 TemporaryFileSystem
*t
= c
->temporary_filesystems
+ i
;
4612 fprintf(f
, "%sTemporaryFileSystem: %s%s%s\n", prefix
,
4614 isempty(t
->options
) ? "" : ":",
4615 strempty(t
->options
));
4620 "%sUtmpIdentifier: %s\n",
4621 prefix
, c
->utmp_id
);
4623 if (c
->selinux_context
)
4625 "%sSELinuxContext: %s%s\n",
4626 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
4628 if (c
->apparmor_profile
)
4630 "%sAppArmorProfile: %s%s\n",
4631 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
4633 if (c
->smack_process_label
)
4635 "%sSmackProcessLabel: %s%s\n",
4636 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
4638 if (c
->personality
!= PERSONALITY_INVALID
)
4640 "%sPersonality: %s\n",
4641 prefix
, strna(personality_to_string(c
->personality
)));
4644 "%sLockPersonality: %s\n",
4645 prefix
, yes_no(c
->lock_personality
));
4647 if (c
->syscall_filter
) {
4655 "%sSystemCallFilter: ",
4658 if (!c
->syscall_whitelist
)
4662 HASHMAP_FOREACH_KEY(val
, id
, c
->syscall_filter
, j
) {
4663 _cleanup_free_
char *name
= NULL
;
4664 const char *errno_name
= NULL
;
4665 int num
= PTR_TO_INT(val
);
4672 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
4673 fputs(strna(name
), f
);
4676 errno_name
= errno_to_name(num
);
4678 fprintf(f
, ":%s", errno_name
);
4680 fprintf(f
, ":%d", num
);
4688 if (c
->syscall_archs
) {
4695 "%sSystemCallArchitectures:",
4699 SET_FOREACH(id
, c
->syscall_archs
, j
)
4700 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
4705 if (exec_context_restrict_namespaces_set(c
)) {
4706 _cleanup_free_
char *s
= NULL
;
4708 r
= namespace_flags_to_string(c
->restrict_namespaces
, &s
);
4710 fprintf(f
, "%sRestrictNamespaces: %s\n",
4714 if (c
->network_namespace_path
)
4716 "%sNetworkNamespacePath: %s\n",
4717 prefix
, c
->network_namespace_path
);
4719 if (c
->syscall_errno
> 0) {
4720 const char *errno_name
;
4722 fprintf(f
, "%sSystemCallErrorNumber: ", prefix
);
4724 errno_name
= errno_to_name(c
->syscall_errno
);
4726 fprintf(f
, "%s\n", errno_name
);
4728 fprintf(f
, "%d\n", c
->syscall_errno
);
4732 bool exec_context_maintains_privileges(const ExecContext
*c
) {
4735 /* Returns true if the process forked off would run under
4736 * an unchanged UID or as root. */
4741 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
4747 int exec_context_get_effective_ioprio(const ExecContext
*c
) {
4755 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
4757 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 4);
4762 void exec_context_free_log_extra_fields(ExecContext
*c
) {
4767 for (l
= 0; l
< c
->n_log_extra_fields
; l
++)
4768 free(c
->log_extra_fields
[l
].iov_base
);
4769 c
->log_extra_fields
= mfree(c
->log_extra_fields
);
4770 c
->n_log_extra_fields
= 0;
4773 void exec_context_revert_tty(ExecContext
*c
) {
4778 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
4779 exec_context_tty_reset(c
, NULL
);
4781 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
4782 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
4783 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
4785 if (exec_context_may_touch_tty(c
)) {
4788 path
= exec_context_tty_path(c
);
4790 r
= chmod_and_chown(path
, TTY_MODE
, 0, TTY_GID
);
4791 if (r
< 0 && r
!= -ENOENT
)
4792 log_warning_errno(r
, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path
);
4797 int exec_context_get_clean_directories(
4803 _cleanup_strv_free_
char **l
= NULL
;
4804 ExecDirectoryType t
;
4811 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
4814 if (!FLAGS_SET(mask
, 1U << t
))
4820 STRV_FOREACH(i
, c
->directories
[t
].paths
) {
4823 j
= path_join(prefix
[t
], *i
);
4827 r
= strv_consume(&l
, j
);
4831 /* Also remove private directories unconditionally. */
4832 if (t
!= EXEC_DIRECTORY_CONFIGURATION
) {
4833 j
= path_join(prefix
[t
], "private", *i
);
4837 r
= strv_consume(&l
, j
);
4848 int exec_context_get_clean_mask(ExecContext
*c
, ExecCleanMask
*ret
) {
4849 ExecCleanMask mask
= 0;
4854 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
4855 if (!strv_isempty(c
->directories
[t
].paths
))
4862 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
4869 dual_timestamp_get(&s
->start_timestamp
);
4872 void exec_status_exit(ExecStatus
*s
, const ExecContext
*context
, pid_t pid
, int code
, int status
) {
4875 if (s
->pid
!= pid
) {
4881 dual_timestamp_get(&s
->exit_timestamp
);
4886 if (context
&& context
->utmp_id
)
4887 (void) utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
4890 void exec_status_reset(ExecStatus
*s
) {
4893 *s
= (ExecStatus
) {};
4896 void exec_status_dump(const ExecStatus
*s
, FILE *f
, const char *prefix
) {
4897 char buf
[FORMAT_TIMESTAMP_MAX
];
4905 prefix
= strempty(prefix
);
4908 "%sPID: "PID_FMT
"\n",
4911 if (dual_timestamp_is_set(&s
->start_timestamp
))
4913 "%sStart Timestamp: %s\n",
4914 prefix
, format_timestamp(buf
, sizeof(buf
), s
->start_timestamp
.realtime
));
4916 if (dual_timestamp_is_set(&s
->exit_timestamp
))
4918 "%sExit Timestamp: %s\n"
4920 "%sExit Status: %i\n",
4921 prefix
, format_timestamp(buf
, sizeof(buf
), s
->exit_timestamp
.realtime
),
4922 prefix
, sigchld_code_to_string(s
->code
),
4926 static char *exec_command_line(char **argv
) {
4934 STRV_FOREACH(a
, argv
)
4942 STRV_FOREACH(a
, argv
) {
4949 if (strpbrk(*a
, WHITESPACE
)) {
4960 /* FIXME: this doesn't really handle arguments that have
4961 * spaces and ticks in them */
4966 static void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4967 _cleanup_free_
char *cmd
= NULL
;
4968 const char *prefix2
;
4973 prefix
= strempty(prefix
);
4974 prefix2
= strjoina(prefix
, "\t");
4976 cmd
= exec_command_line(c
->argv
);
4978 "%sCommand Line: %s\n",
4979 prefix
, cmd
? cmd
: strerror_safe(ENOMEM
));
4981 exec_status_dump(&c
->exec_status
, f
, prefix2
);
4984 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4987 prefix
= strempty(prefix
);
4989 LIST_FOREACH(command
, c
, c
)
4990 exec_command_dump(c
, f
, prefix
);
4993 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
5000 /* It's kind of important, that we keep the order here */
5001 LIST_FIND_TAIL(command
, *l
, end
);
5002 LIST_INSERT_AFTER(command
, *l
, end
, e
);
5007 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
5015 l
= strv_new_ap(path
, ap
);
5027 free_and_replace(c
->path
, p
);
5029 return strv_free_and_replace(c
->argv
, l
);
5032 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
5033 _cleanup_strv_free_
char **l
= NULL
;
5041 l
= strv_new_ap(path
, ap
);
5047 r
= strv_extend_strv(&c
->argv
, l
, false);
5054 static void *remove_tmpdir_thread(void *p
) {
5055 _cleanup_free_
char *path
= p
;
5057 (void) rm_rf(path
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
5061 static ExecRuntime
* exec_runtime_free(ExecRuntime
*rt
, bool destroy
) {
5068 (void) hashmap_remove(rt
->manager
->exec_runtime_by_id
, rt
->id
);
5070 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5071 if (destroy
&& rt
->tmp_dir
) {
5072 log_debug("Spawning thread to nuke %s", rt
->tmp_dir
);
5074 r
= asynchronous_job(remove_tmpdir_thread
, rt
->tmp_dir
);
5076 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->tmp_dir
);
5083 if (destroy
&& rt
->var_tmp_dir
) {
5084 log_debug("Spawning thread to nuke %s", rt
->var_tmp_dir
);
5086 r
= asynchronous_job(remove_tmpdir_thread
, rt
->var_tmp_dir
);
5088 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->var_tmp_dir
);
5089 free(rt
->var_tmp_dir
);
5092 rt
->var_tmp_dir
= NULL
;
5095 rt
->id
= mfree(rt
->id
);
5096 rt
->tmp_dir
= mfree(rt
->tmp_dir
);
5097 rt
->var_tmp_dir
= mfree(rt
->var_tmp_dir
);
5098 safe_close_pair(rt
->netns_storage_socket
);
5102 static void exec_runtime_freep(ExecRuntime
**rt
) {
5103 (void) exec_runtime_free(*rt
, false);
5106 static int exec_runtime_allocate(ExecRuntime
**ret
) {
5111 n
= new(ExecRuntime
, 1);
5115 *n
= (ExecRuntime
) {
5116 .netns_storage_socket
= { -1, -1 },
5123 static int exec_runtime_add(
5126 const char *tmp_dir
,
5127 const char *var_tmp_dir
,
5128 const int netns_storage_socket
[2],
5129 ExecRuntime
**ret
) {
5131 _cleanup_(exec_runtime_freep
) ExecRuntime
*rt
= NULL
;
5137 r
= hashmap_ensure_allocated(&m
->exec_runtime_by_id
, &string_hash_ops
);
5141 r
= exec_runtime_allocate(&rt
);
5145 rt
->id
= strdup(id
);
5150 rt
->tmp_dir
= strdup(tmp_dir
);
5154 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5155 assert(var_tmp_dir
);
5156 rt
->var_tmp_dir
= strdup(var_tmp_dir
);
5157 if (!rt
->var_tmp_dir
)
5161 if (netns_storage_socket
) {
5162 rt
->netns_storage_socket
[0] = netns_storage_socket
[0];
5163 rt
->netns_storage_socket
[1] = netns_storage_socket
[1];
5166 r
= hashmap_put(m
->exec_runtime_by_id
, rt
->id
, rt
);
5175 /* do not remove created ExecRuntime object when the operation succeeds. */
5180 static int exec_runtime_make(Manager
*m
, const ExecContext
*c
, const char *id
, ExecRuntime
**ret
) {
5181 _cleanup_free_
char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
5182 _cleanup_close_pair_
int netns_storage_socket
[2] = { -1, -1 };
5189 /* It is not necessary to create ExecRuntime object. */
5190 if (!c
->private_network
&& !c
->private_tmp
&& !c
->network_namespace_path
)
5193 if (c
->private_tmp
) {
5194 r
= setup_tmp_dirs(id
, &tmp_dir
, &var_tmp_dir
);
5199 if (c
->private_network
|| c
->network_namespace_path
) {
5200 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, netns_storage_socket
) < 0)
5204 r
= exec_runtime_add(m
, id
, tmp_dir
, var_tmp_dir
, netns_storage_socket
, ret
);
5209 netns_storage_socket
[0] = netns_storage_socket
[1] = -1;
5213 int exec_runtime_acquire(Manager
*m
, const ExecContext
*c
, const char *id
, bool create
, ExecRuntime
**ret
) {
5221 rt
= hashmap_get(m
->exec_runtime_by_id
, id
);
5223 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5229 /* If not found, then create a new object. */
5230 r
= exec_runtime_make(m
, c
, id
, &rt
);
5232 /* When r == 0, it is not necessary to create ExecRuntime object. */
5236 /* increment reference counter. */
5242 ExecRuntime
*exec_runtime_unref(ExecRuntime
*rt
, bool destroy
) {
5246 assert(rt
->n_ref
> 0);
5252 return exec_runtime_free(rt
, destroy
);
5255 int exec_runtime_serialize(const Manager
*m
, FILE *f
, FDSet
*fds
) {
5263 HASHMAP_FOREACH(rt
, m
->exec_runtime_by_id
, i
) {
5264 fprintf(f
, "exec-runtime=%s", rt
->id
);
5267 fprintf(f
, " tmp-dir=%s", rt
->tmp_dir
);
5269 if (rt
->var_tmp_dir
)
5270 fprintf(f
, " var-tmp-dir=%s", rt
->var_tmp_dir
);
5272 if (rt
->netns_storage_socket
[0] >= 0) {
5275 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
5279 fprintf(f
, " netns-socket-0=%i", copy
);
5282 if (rt
->netns_storage_socket
[1] >= 0) {
5285 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
5289 fprintf(f
, " netns-socket-1=%i", copy
);
5298 int exec_runtime_deserialize_compat(Unit
*u
, const char *key
, const char *value
, FDSet
*fds
) {
5299 _cleanup_(exec_runtime_freep
) ExecRuntime
*rt_create
= NULL
;
5303 /* This is for the migration from old (v237 or earlier) deserialization text.
5304 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5305 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5306 * so or not from the serialized text, then we always creates a new object owned by this. */
5312 /* Manager manages ExecRuntime objects by the unit id.
5313 * So, we omit the serialized text when the unit does not have id (yet?)... */
5314 if (isempty(u
->id
)) {
5315 log_unit_debug(u
, "Invocation ID not found. Dropping runtime parameter.");
5319 r
= hashmap_ensure_allocated(&u
->manager
->exec_runtime_by_id
, &string_hash_ops
);
5321 log_unit_debug_errno(u
, r
, "Failed to allocate storage for runtime parameter: %m");
5325 rt
= hashmap_get(u
->manager
->exec_runtime_by_id
, u
->id
);
5327 r
= exec_runtime_allocate(&rt_create
);
5331 rt_create
->id
= strdup(u
->id
);
5338 if (streq(key
, "tmp-dir")) {
5341 copy
= strdup(value
);
5345 free_and_replace(rt
->tmp_dir
, copy
);
5347 } else if (streq(key
, "var-tmp-dir")) {
5350 copy
= strdup(value
);
5354 free_and_replace(rt
->var_tmp_dir
, copy
);
5356 } else if (streq(key
, "netns-socket-0")) {
5359 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
)) {
5360 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
5364 safe_close(rt
->netns_storage_socket
[0]);
5365 rt
->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
5367 } else if (streq(key
, "netns-socket-1")) {
5370 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
)) {
5371 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
5375 safe_close(rt
->netns_storage_socket
[1]);
5376 rt
->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
5380 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5382 r
= hashmap_put(u
->manager
->exec_runtime_by_id
, rt_create
->id
, rt_create
);
5384 log_unit_debug_errno(u
, r
, "Failed to put runtime parameter to manager's storage: %m");
5388 rt_create
->manager
= u
->manager
;
5397 void exec_runtime_deserialize_one(Manager
*m
, const char *value
, FDSet
*fds
) {
5398 char *id
= NULL
, *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
5399 int r
, fd0
= -1, fd1
= -1;
5400 const char *p
, *v
= value
;
5407 n
= strcspn(v
, " ");
5408 id
= strndupa(v
, n
);
5413 v
= startswith(p
, "tmp-dir=");
5415 n
= strcspn(v
, " ");
5416 tmp_dir
= strndupa(v
, n
);
5422 v
= startswith(p
, "var-tmp-dir=");
5424 n
= strcspn(v
, " ");
5425 var_tmp_dir
= strndupa(v
, n
);
5431 v
= startswith(p
, "netns-socket-0=");
5435 n
= strcspn(v
, " ");
5436 buf
= strndupa(v
, n
);
5437 if (safe_atoi(buf
, &fd0
) < 0 || !fdset_contains(fds
, fd0
)) {
5438 log_debug("Unable to process exec-runtime netns fd specification.");
5441 fd0
= fdset_remove(fds
, fd0
);
5447 v
= startswith(p
, "netns-socket-1=");
5451 n
= strcspn(v
, " ");
5452 buf
= strndupa(v
, n
);
5453 if (safe_atoi(buf
, &fd1
) < 0 || !fdset_contains(fds
, fd1
)) {
5454 log_debug("Unable to process exec-runtime netns fd specification.");
5457 fd1
= fdset_remove(fds
, fd1
);
5462 r
= exec_runtime_add(m
, id
, tmp_dir
, var_tmp_dir
, (int[]) { fd0
, fd1
}, NULL
);
5464 log_debug_errno(r
, "Failed to add exec-runtime: %m");
5467 void exec_runtime_vacuum(Manager
*m
) {
5473 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5475 HASHMAP_FOREACH(rt
, m
->exec_runtime_by_id
, i
) {
5479 (void) exec_runtime_free(rt
, false);
5483 void exec_params_clear(ExecParameters
*p
) {
5487 strv_free(p
->environment
);
5490 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
5491 [EXEC_INPUT_NULL
] = "null",
5492 [EXEC_INPUT_TTY
] = "tty",
5493 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
5494 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
5495 [EXEC_INPUT_SOCKET
] = "socket",
5496 [EXEC_INPUT_NAMED_FD
] = "fd",
5497 [EXEC_INPUT_DATA
] = "data",
5498 [EXEC_INPUT_FILE
] = "file",
5501 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
5503 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
5504 [EXEC_OUTPUT_INHERIT
] = "inherit",
5505 [EXEC_OUTPUT_NULL
] = "null",
5506 [EXEC_OUTPUT_TTY
] = "tty",
5507 [EXEC_OUTPUT_SYSLOG
] = "syslog",
5508 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE
] = "syslog+console",
5509 [EXEC_OUTPUT_KMSG
] = "kmsg",
5510 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
5511 [EXEC_OUTPUT_JOURNAL
] = "journal",
5512 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
5513 [EXEC_OUTPUT_SOCKET
] = "socket",
5514 [EXEC_OUTPUT_NAMED_FD
] = "fd",
5515 [EXEC_OUTPUT_FILE
] = "file",
5516 [EXEC_OUTPUT_FILE_APPEND
] = "append",
5519 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
5521 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
5522 [EXEC_UTMP_INIT
] = "init",
5523 [EXEC_UTMP_LOGIN
] = "login",
5524 [EXEC_UTMP_USER
] = "user",
5527 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
5529 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
5530 [EXEC_PRESERVE_NO
] = "no",
5531 [EXEC_PRESERVE_YES
] = "yes",
5532 [EXEC_PRESERVE_RESTART
] = "restart",
5535 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
5537 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
5538 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
5539 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
5540 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
5541 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
5542 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
5543 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
5546 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);
5548 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
5549 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
5550 * directories, specifically .timer units with their timestamp touch file. */
5551 static const char* const exec_resource_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
5552 [EXEC_DIRECTORY_RUNTIME
] = "runtime",
5553 [EXEC_DIRECTORY_STATE
] = "state",
5554 [EXEC_DIRECTORY_CACHE
] = "cache",
5555 [EXEC_DIRECTORY_LOGS
] = "logs",
5556 [EXEC_DIRECTORY_CONFIGURATION
] = "configuration",
5559 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type
, ExecDirectoryType
);
5561 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
5562 * the service payload in. */
5563 static const char* const exec_directory_env_name_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
5564 [EXEC_DIRECTORY_RUNTIME
] = "RUNTIME_DIRECTORY",
5565 [EXEC_DIRECTORY_STATE
] = "STATE_DIRECTORY",
5566 [EXEC_DIRECTORY_CACHE
] = "CACHE_DIRECTORY",
5567 [EXEC_DIRECTORY_LOGS
] = "LOGS_DIRECTORY",
5568 [EXEC_DIRECTORY_CONFIGURATION
] = "CONFIGURATION_DIRECTORY",
5571 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name
, ExecDirectoryType
);
5573 static const char* const exec_keyring_mode_table
[_EXEC_KEYRING_MODE_MAX
] = {
5574 [EXEC_KEYRING_INHERIT
] = "inherit",
5575 [EXEC_KEYRING_PRIVATE
] = "private",
5576 [EXEC_KEYRING_SHARED
] = "shared",
5579 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode
, ExecKeyringMode
);