1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2010 Lennart Poettering
15 #include <sys/capability.h>
16 #include <sys/eventfd.h>
18 #include <sys/personality.h>
19 #include <sys/prctl.h>
21 #include <sys/socket.h>
23 #include <sys/types.h>
29 #include <security/pam_appl.h>
33 #include <selinux/selinux.h>
41 #include <sys/apparmor.h>
44 #include "sd-messages.h"
47 #include "alloc-util.h"
49 #include "apparmor-util.h"
54 #include "capability-util.h"
55 #include "chown-recursive.h"
56 #include "cpu-set-util.h"
59 #include "errno-list.h"
61 #include "exit-status.h"
64 #include "format-util.h"
66 #include "glob-util.h"
75 #include "namespace.h"
76 #include "parse-util.h"
77 #include "path-util.h"
78 #include "process-util.h"
79 #include "rlimit-util.h"
82 #include "seccomp-util.h"
84 #include "securebits.h"
85 #include "securebits-util.h"
86 #include "selinux-util.h"
87 #include "signal-util.h"
88 #include "smack-util.h"
89 #include "socket-util.h"
91 #include "stat-util.h"
92 #include "string-table.h"
93 #include "string-util.h"
95 #include "syslog-util.h"
96 #include "terminal-util.h"
98 #include "user-util.h"
100 #include "utmp-wtmp.h"
102 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
103 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
105 /* This assumes there is a 'tty' group */
106 #define TTY_MODE 0620
108 #define SNDBUF_SIZE (8*1024*1024)
110 static int shift_fds(int fds
[], size_t n_fds
) {
111 int start
, restart_from
;
116 /* Modifies the fds array! (sorts it) */
126 for (i
= start
; i
< (int) n_fds
; i
++) {
129 /* Already at right index? */
133 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
140 /* Hmm, the fd we wanted isn't free? Then
141 * let's remember that and try again from here */
142 if (nfd
!= i
+3 && restart_from
< 0)
146 if (restart_from
< 0)
149 start
= restart_from
;
155 static int flags_fds(const int fds
[], size_t n_storage_fds
, size_t n_socket_fds
, bool nonblock
) {
159 n_fds
= n_storage_fds
+ n_socket_fds
;
165 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
166 * O_NONBLOCK only applies to socket activation though. */
168 for (i
= 0; i
< n_fds
; i
++) {
170 if (i
< n_socket_fds
) {
171 r
= fd_nonblock(fds
[i
], nonblock
);
176 /* We unconditionally drop FD_CLOEXEC from the fds,
177 * since after all we want to pass these fds to our
180 r
= fd_cloexec(fds
[i
], false);
188 static const char *exec_context_tty_path(const ExecContext
*context
) {
191 if (context
->stdio_as_fds
)
194 if (context
->tty_path
)
195 return context
->tty_path
;
197 return "/dev/console";
200 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
205 path
= exec_context_tty_path(context
);
207 if (context
->tty_vhangup
) {
208 if (p
&& p
->stdin_fd
>= 0)
209 (void) terminal_vhangup_fd(p
->stdin_fd
);
211 (void) terminal_vhangup(path
);
214 if (context
->tty_reset
) {
215 if (p
&& p
->stdin_fd
>= 0)
216 (void) reset_terminal_fd(p
->stdin_fd
, true);
218 (void) reset_terminal(path
);
221 if (context
->tty_vt_disallocate
&& path
)
222 (void) vt_disallocate(path
);
225 static bool is_terminal_input(ExecInput i
) {
228 EXEC_INPUT_TTY_FORCE
,
229 EXEC_INPUT_TTY_FAIL
);
232 static bool is_terminal_output(ExecOutput o
) {
235 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
236 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
237 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
240 static bool is_syslog_output(ExecOutput o
) {
243 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
);
246 static bool is_kmsg_output(ExecOutput o
) {
249 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
252 static bool exec_context_needs_term(const ExecContext
*c
) {
255 /* Return true if the execution context suggests we should set $TERM to something useful. */
257 if (is_terminal_input(c
->std_input
))
260 if (is_terminal_output(c
->std_output
))
263 if (is_terminal_output(c
->std_error
))
266 return !!c
->tty_path
;
269 static int open_null_as(int flags
, int nfd
) {
274 fd
= open("/dev/null", flags
|O_NOCTTY
);
278 return move_fd(fd
, nfd
, false);
281 static int connect_journal_socket(int fd
, uid_t uid
, gid_t gid
) {
282 static const union sockaddr_union sa
= {
283 .un
.sun_family
= AF_UNIX
,
284 .un
.sun_path
= "/run/systemd/journal/stdout",
286 uid_t olduid
= UID_INVALID
;
287 gid_t oldgid
= GID_INVALID
;
290 if (gid_is_valid(gid
)) {
293 if (setegid(gid
) < 0)
297 if (uid_is_valid(uid
)) {
300 if (seteuid(uid
) < 0) {
306 r
= connect(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
)) < 0 ? -errno
: 0;
308 /* If we fail to restore the uid or gid, things will likely
309 fail later on. This should only happen if an LSM interferes. */
311 if (uid_is_valid(uid
))
312 (void) seteuid(olduid
);
315 if (gid_is_valid(gid
))
316 (void) setegid(oldgid
);
321 static int connect_logger_as(
323 const ExecContext
*context
,
324 const ExecParameters
*params
,
335 assert(output
< _EXEC_OUTPUT_MAX
);
339 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
343 r
= connect_journal_socket(fd
, uid
, gid
);
347 if (shutdown(fd
, SHUT_RD
) < 0) {
352 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
362 context
->syslog_identifier
?: ident
,
363 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
364 context
->syslog_priority
,
365 !!context
->syslog_level_prefix
,
366 is_syslog_output(output
),
367 is_kmsg_output(output
),
368 is_terminal_output(output
));
370 return move_fd(fd
, nfd
, false);
372 static int open_terminal_as(const char *path
, int flags
, int nfd
) {
378 fd
= open_terminal(path
, flags
| O_NOCTTY
);
382 return move_fd(fd
, nfd
, false);
385 static int acquire_path(const char *path
, int flags
, mode_t mode
) {
386 union sockaddr_union sa
= {
387 .sa
.sa_family
= AF_UNIX
,
393 if (IN_SET(flags
& O_ACCMODE
, O_WRONLY
, O_RDWR
))
396 fd
= open(path
, flags
|O_NOCTTY
, mode
);
400 if (errno
!= ENXIO
) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
402 if (strlen(path
) > sizeof(sa
.un
.sun_path
)) /* Too long, can't be a UNIX socket */
405 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
407 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
411 strncpy(sa
.un
.sun_path
, path
, sizeof(sa
.un
.sun_path
));
412 if (connect(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
)) < 0) {
414 return errno
== EINVAL
? -ENXIO
: -errno
; /* Propagate initial error if we get EINVAL, i.e. we have
415 * indication that this wasn't an AF_UNIX socket after all */
418 if ((flags
& O_ACCMODE
) == O_RDONLY
)
419 r
= shutdown(fd
, SHUT_WR
);
420 else if ((flags
& O_ACCMODE
) == O_WRONLY
)
421 r
= shutdown(fd
, SHUT_RD
);
432 static int fixup_input(
433 const ExecContext
*context
,
435 bool apply_tty_stdin
) {
441 std_input
= context
->std_input
;
443 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
444 return EXEC_INPUT_NULL
;
446 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
447 return EXEC_INPUT_NULL
;
449 if (std_input
== EXEC_INPUT_DATA
&& context
->stdin_data_size
== 0)
450 return EXEC_INPUT_NULL
;
455 static int fixup_output(ExecOutput std_output
, int socket_fd
) {
457 if (std_output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
458 return EXEC_OUTPUT_INHERIT
;
463 static int setup_input(
464 const ExecContext
*context
,
465 const ExecParameters
*params
,
467 int named_iofds
[3]) {
474 if (params
->stdin_fd
>= 0) {
475 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
478 /* Try to make this the controlling tty, if it is a tty, and reset it */
479 if (isatty(STDIN_FILENO
)) {
480 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
481 (void) reset_terminal_fd(STDIN_FILENO
, true);
487 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
491 case EXEC_INPUT_NULL
:
492 return open_null_as(O_RDONLY
, STDIN_FILENO
);
495 case EXEC_INPUT_TTY_FORCE
:
496 case EXEC_INPUT_TTY_FAIL
: {
499 fd
= acquire_terminal(exec_context_tty_path(context
),
500 i
== EXEC_INPUT_TTY_FAIL
? ACQUIRE_TERMINAL_TRY
:
501 i
== EXEC_INPUT_TTY_FORCE
? ACQUIRE_TERMINAL_FORCE
:
502 ACQUIRE_TERMINAL_WAIT
,
507 return move_fd(fd
, STDIN_FILENO
, false);
510 case EXEC_INPUT_SOCKET
:
511 assert(socket_fd
>= 0);
513 return dup2(socket_fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
515 case EXEC_INPUT_NAMED_FD
:
516 assert(named_iofds
[STDIN_FILENO
] >= 0);
518 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
519 return dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
521 case EXEC_INPUT_DATA
: {
524 fd
= acquire_data_fd(context
->stdin_data
, context
->stdin_data_size
, 0);
528 return move_fd(fd
, STDIN_FILENO
, false);
531 case EXEC_INPUT_FILE
: {
535 assert(context
->stdio_file
[STDIN_FILENO
]);
537 rw
= (context
->std_output
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDOUT_FILENO
])) ||
538 (context
->std_error
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDERR_FILENO
]));
540 fd
= acquire_path(context
->stdio_file
[STDIN_FILENO
], rw
? O_RDWR
: O_RDONLY
, 0666 & ~context
->umask
);
544 return move_fd(fd
, STDIN_FILENO
, false);
548 assert_not_reached("Unknown input type");
552 static int setup_output(
554 const ExecContext
*context
,
555 const ExecParameters
*params
,
562 dev_t
*journal_stream_dev
,
563 ino_t
*journal_stream_ino
) {
573 assert(journal_stream_dev
);
574 assert(journal_stream_ino
);
576 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
578 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
581 return STDOUT_FILENO
;
584 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
585 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
588 return STDERR_FILENO
;
591 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
592 o
= fixup_output(context
->std_output
, socket_fd
);
594 if (fileno
== STDERR_FILENO
) {
596 e
= fixup_output(context
->std_error
, socket_fd
);
598 /* This expects the input and output are already set up */
600 /* Don't change the stderr file descriptor if we inherit all
601 * the way and are not on a tty */
602 if (e
== EXEC_OUTPUT_INHERIT
&&
603 o
== EXEC_OUTPUT_INHERIT
&&
604 i
== EXEC_INPUT_NULL
&&
605 !is_terminal_input(context
->std_input
) &&
609 /* Duplicate from stdout if possible */
610 if ((e
== o
&& e
!= EXEC_OUTPUT_NAMED_FD
) || e
== EXEC_OUTPUT_INHERIT
)
611 return dup2(STDOUT_FILENO
, fileno
) < 0 ? -errno
: fileno
;
615 } else if (o
== EXEC_OUTPUT_INHERIT
) {
616 /* If input got downgraded, inherit the original value */
617 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
618 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
620 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
621 if (!IN_SET(i
, EXEC_INPUT_NULL
, EXEC_INPUT_DATA
))
622 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
624 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
628 /* We need to open /dev/null here anew, to get the right access mode. */
629 return open_null_as(O_WRONLY
, fileno
);
634 case EXEC_OUTPUT_NULL
:
635 return open_null_as(O_WRONLY
, fileno
);
637 case EXEC_OUTPUT_TTY
:
638 if (is_terminal_input(i
))
639 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
641 /* We don't reset the terminal if this is just about output */
642 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
644 case EXEC_OUTPUT_SYSLOG
:
645 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE
:
646 case EXEC_OUTPUT_KMSG
:
647 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
648 case EXEC_OUTPUT_JOURNAL
:
649 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
650 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
652 log_unit_warning_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m", fileno
== STDOUT_FILENO
? "stdout" : "stderr");
653 r
= open_null_as(O_WRONLY
, fileno
);
657 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
658 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
659 * services to detect whether they are connected to the journal or not.
661 * If both stdout and stderr are connected to a stream then let's make sure to store the data
662 * about STDERR as that's usually the best way to do logging. */
664 if (fstat(fileno
, &st
) >= 0 &&
665 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
666 *journal_stream_dev
= st
.st_dev
;
667 *journal_stream_ino
= st
.st_ino
;
672 case EXEC_OUTPUT_SOCKET
:
673 assert(socket_fd
>= 0);
675 return dup2(socket_fd
, fileno
) < 0 ? -errno
: fileno
;
677 case EXEC_OUTPUT_NAMED_FD
:
678 assert(named_iofds
[fileno
] >= 0);
680 (void) fd_nonblock(named_iofds
[fileno
], false);
681 return dup2(named_iofds
[fileno
], fileno
) < 0 ? -errno
: fileno
;
683 case EXEC_OUTPUT_FILE
: {
687 assert(context
->stdio_file
[fileno
]);
689 rw
= context
->std_input
== EXEC_INPUT_FILE
&&
690 streq_ptr(context
->stdio_file
[fileno
], context
->stdio_file
[STDIN_FILENO
]);
693 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
695 fd
= acquire_path(context
->stdio_file
[fileno
], O_WRONLY
, 0666 & ~context
->umask
);
699 return move_fd(fd
, fileno
, false);
703 assert_not_reached("Unknown error type");
707 static int chown_terminal(int fd
, uid_t uid
) {
712 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
716 /* This might fail. What matters are the results. */
717 (void) fchown(fd
, uid
, -1);
718 (void) fchmod(fd
, TTY_MODE
);
720 if (fstat(fd
, &st
) < 0)
723 if (st
.st_uid
!= uid
|| (st
.st_mode
& 0777) != TTY_MODE
)
729 static int setup_confirm_stdio(const char *vc
, int *_saved_stdin
, int *_saved_stdout
) {
730 _cleanup_close_
int fd
= -1, saved_stdin
= -1, saved_stdout
= -1;
733 assert(_saved_stdin
);
734 assert(_saved_stdout
);
736 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
740 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
741 if (saved_stdout
< 0)
744 fd
= acquire_terminal(vc
, ACQUIRE_TERMINAL_WAIT
, DEFAULT_CONFIRM_USEC
);
748 r
= chown_terminal(fd
, getuid());
752 r
= reset_terminal_fd(fd
, true);
756 r
= rearrange_stdio(fd
, fd
, STDERR_FILENO
);
761 *_saved_stdin
= saved_stdin
;
762 *_saved_stdout
= saved_stdout
;
764 saved_stdin
= saved_stdout
= -1;
769 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
772 if (err
== -ETIMEDOUT
)
773 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
776 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
780 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
781 _cleanup_close_
int fd
= -1;
785 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
789 write_confirm_error_fd(err
, fd
, u
);
792 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
796 assert(saved_stdout
);
800 if (*saved_stdin
>= 0)
801 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
804 if (*saved_stdout
>= 0)
805 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
808 *saved_stdin
= safe_close(*saved_stdin
);
809 *saved_stdout
= safe_close(*saved_stdout
);
815 CONFIRM_PRETEND_FAILURE
= -1,
816 CONFIRM_PRETEND_SUCCESS
= 0,
820 static int ask_for_confirmation(const char *vc
, Unit
*u
, const char *cmdline
) {
821 int saved_stdout
= -1, saved_stdin
= -1, r
;
822 _cleanup_free_
char *e
= NULL
;
825 /* For any internal errors, assume a positive response. */
826 r
= setup_confirm_stdio(vc
, &saved_stdin
, &saved_stdout
);
828 write_confirm_error(r
, vc
, u
);
829 return CONFIRM_EXECUTE
;
832 /* confirm_spawn might have been disabled while we were sleeping. */
833 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
838 e
= ellipsize(cmdline
, 60, 100);
846 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
848 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
855 printf("Resuming normal execution.\n");
856 manager_disable_confirm_spawn();
860 unit_dump(u
, stdout
, " ");
861 continue; /* ask again */
863 printf("Failing execution.\n");
864 r
= CONFIRM_PRETEND_FAILURE
;
867 printf(" c - continue, proceed without asking anymore\n"
868 " D - dump, show the state of the unit\n"
869 " f - fail, don't execute the command and pretend it failed\n"
871 " i - info, show a short summary of the unit\n"
872 " j - jobs, show jobs that are in progress\n"
873 " s - skip, don't execute the command and pretend it succeeded\n"
874 " y - yes, execute the command\n");
875 continue; /* ask again */
877 printf(" Description: %s\n"
880 u
->id
, u
->description
, cmdline
);
881 continue; /* ask again */
883 manager_dump_jobs(u
->manager
, stdout
, " ");
884 continue; /* ask again */
886 /* 'n' was removed in favor of 'f'. */
887 printf("Didn't understand 'n', did you mean 'f'?\n");
888 continue; /* ask again */
890 printf("Skipping execution.\n");
891 r
= CONFIRM_PRETEND_SUCCESS
;
897 assert_not_reached("Unhandled choice");
903 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
907 static int get_fixed_user(const ExecContext
*c
, const char **user
,
908 uid_t
*uid
, gid_t
*gid
,
909 const char **home
, const char **shell
) {
918 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
919 * (i.e. are "/" or "/bin/nologin"). */
922 r
= get_user_creds_clean(&name
, uid
, gid
, home
, shell
);
930 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
940 r
= get_group_creds(&name
, gid
);
948 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
949 const char *group
, gid_t gid
,
950 gid_t
**supplementary_gids
, int *ngids
) {
954 bool keep_groups
= false;
955 gid_t
*groups
= NULL
;
956 _cleanup_free_ gid_t
*l_gids
= NULL
;
961 * If user is given, then lookup GID and supplementary groups list.
962 * We avoid NSS lookups for gid=0. Also we have to initialize groups
963 * here and as early as possible so we keep the list of supplementary
964 * groups of the caller.
966 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
967 /* First step, initialize groups from /etc/group */
968 if (initgroups(user
, gid
) < 0)
974 if (strv_isempty(c
->supplementary_groups
))
978 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
979 * be positive, otherwise fail.
982 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
983 if (ngroups_max
<= 0) {
987 return -EOPNOTSUPP
; /* For all other values */
990 l_gids
= new(gid_t
, ngroups_max
);
996 * Lookup the list of groups that the user belongs to, we
997 * avoid NSS lookups here too for gid=0.
1000 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
1005 STRV_FOREACH(i
, c
->supplementary_groups
) {
1008 if (k
>= ngroups_max
)
1012 r
= get_group_creds(&g
, l_gids
+k
);
1020 * Sets ngids to zero to drop all supplementary groups, happens
1021 * when we are under root and SupplementaryGroups= is empty.
1028 /* Otherwise get the final list of supplementary groups */
1029 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
1033 *supplementary_gids
= groups
;
1041 static int enforce_groups(gid_t gid
, const gid_t
*supplementary_gids
, int ngids
) {
1044 /* Handle SupplementaryGroups= if it is not empty */
1046 r
= maybe_setgroups(ngids
, supplementary_gids
);
1051 if (gid_is_valid(gid
)) {
1052 /* Then set our gids */
1053 if (setresgid(gid
, gid
, gid
) < 0)
1060 static int enforce_user(const ExecContext
*context
, uid_t uid
) {
1063 if (!uid_is_valid(uid
))
1066 /* Sets (but doesn't look up) the uid and makes sure we keep the
1067 * capabilities while doing so. */
1069 if (context
->capability_ambient_set
!= 0) {
1071 /* First step: If we need to keep capabilities but
1072 * drop privileges we need to make sure we keep our
1073 * caps, while we drop privileges. */
1075 int sb
= context
->secure_bits
| 1<<SECURE_KEEP_CAPS
;
1077 if (prctl(PR_GET_SECUREBITS
) != sb
)
1078 if (prctl(PR_SET_SECUREBITS
, sb
) < 0)
1083 /* Second step: actually set the uids */
1084 if (setresuid(uid
, uid
, uid
) < 0)
1087 /* At this point we should have all necessary capabilities but
1088 are otherwise a normal user. However, the caps might have gotten
1089 corrupted due to the setresuid(), so we need to clean them up
1090 later. This is done outside of this call. */
1097 static int null_conv(
1099 const struct pam_message
**msg
,
1100 struct pam_response
**resp
,
1101 void *appdata_ptr
) {
1103 /* We don't support conversations */
1105 return PAM_CONV_ERR
;
1110 static int setup_pam(
1117 int fds
[], size_t n_fds
) {
1121 static const struct pam_conv conv
= {
1126 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1127 pam_handle_t
*handle
= NULL
;
1129 int pam_code
= PAM_SUCCESS
, r
;
1130 char **nv
, **e
= NULL
;
1131 bool close_session
= false;
1132 pid_t pam_pid
= 0, parent_pid
;
1139 /* We set up PAM in the parent process, then fork. The child
1140 * will then stay around until killed via PR_SET_PDEATHSIG or
1141 * systemd via the cgroup logic. It will then remove the PAM
1142 * session again. The parent process will exec() the actual
1143 * daemon. We do things this way to ensure that the main PID
1144 * of the daemon is the one we initially fork()ed. */
1146 r
= barrier_create(&barrier
);
1150 if (log_get_max_level() < LOG_DEBUG
)
1151 flags
|= PAM_SILENT
;
1153 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1154 if (pam_code
!= PAM_SUCCESS
) {
1160 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1161 if (pam_code
!= PAM_SUCCESS
)
1165 STRV_FOREACH(nv
, *env
) {
1166 pam_code
= pam_putenv(handle
, *nv
);
1167 if (pam_code
!= PAM_SUCCESS
)
1171 pam_code
= pam_acct_mgmt(handle
, flags
);
1172 if (pam_code
!= PAM_SUCCESS
)
1175 pam_code
= pam_open_session(handle
, flags
);
1176 if (pam_code
!= PAM_SUCCESS
)
1179 close_session
= true;
1181 e
= pam_getenvlist(handle
);
1183 pam_code
= PAM_BUF_ERR
;
1187 /* Block SIGTERM, so that we know that it won't get lost in
1190 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1192 parent_pid
= getpid_cached();
1194 r
= safe_fork("(sd-pam)", 0, &pam_pid
);
1198 int sig
, ret
= EXIT_PAM
;
1200 /* The child's job is to reset the PAM session on
1202 barrier_set_role(&barrier
, BARRIER_CHILD
);
1204 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
1205 * are open here that have been opened by PAM. */
1206 (void) close_many(fds
, n_fds
);
1208 /* Drop privileges - we don't need any to pam_close_session
1209 * and this will make PR_SET_PDEATHSIG work in most cases.
1210 * If this fails, ignore the error - but expect sd-pam threads
1211 * to fail to exit normally */
1213 r
= maybe_setgroups(0, NULL
);
1215 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1216 if (setresgid(gid
, gid
, gid
) < 0)
1217 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1218 if (setresuid(uid
, uid
, uid
) < 0)
1219 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1221 (void) ignore_signals(SIGPIPE
, -1);
1223 /* Wait until our parent died. This will only work if
1224 * the above setresuid() succeeds, otherwise the kernel
1225 * will not allow unprivileged parents to kill their privileged
1226 * children this way. We rely on the control groups kill logic
1227 * to do the rest for us. */
1228 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1231 /* Tell the parent that our setup is done. This is especially
1232 * important regarding dropping privileges. Otherwise, unit
1233 * setup might race against our setresuid(2) call.
1235 * If the parent aborted, we'll detect this below, hence ignore
1236 * return failure here. */
1237 (void) barrier_place(&barrier
);
1239 /* Check if our parent process might already have died? */
1240 if (getppid() == parent_pid
) {
1243 assert_se(sigemptyset(&ss
) >= 0);
1244 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1247 if (sigwait(&ss
, &sig
) < 0) {
1254 assert(sig
== SIGTERM
);
1259 /* If our parent died we'll end the session */
1260 if (getppid() != parent_pid
) {
1261 pam_code
= pam_close_session(handle
, flags
);
1262 if (pam_code
!= PAM_SUCCESS
)
1269 pam_end(handle
, pam_code
| flags
);
1273 barrier_set_role(&barrier
, BARRIER_PARENT
);
1275 /* If the child was forked off successfully it will do all the
1276 * cleanups, so forget about the handle here. */
1279 /* Unblock SIGTERM again in the parent */
1280 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1282 /* We close the log explicitly here, since the PAM modules
1283 * might have opened it, but we don't want this fd around. */
1286 /* Synchronously wait for the child to initialize. We don't care for
1287 * errors as we cannot recover. However, warn loudly if it happens. */
1288 if (!barrier_place_and_sync(&barrier
))
1289 log_error("PAM initialization failed");
1291 return strv_free_and_replace(*env
, e
);
1294 if (pam_code
!= PAM_SUCCESS
) {
1295 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1296 r
= -EPERM
; /* PAM errors do not map to errno */
1298 log_error_errno(r
, "PAM failed: %m");
1302 pam_code
= pam_close_session(handle
, flags
);
1304 pam_end(handle
, pam_code
| flags
);
1316 static void rename_process_from_path(const char *path
) {
1317 char process_name
[11];
1321 /* This resulting string must fit in 10 chars (i.e. the length
1322 * of "/sbin/init") to look pretty in /bin/ps */
1326 rename_process("(...)");
1332 /* The end of the process name is usually more
1333 * interesting, since the first bit might just be
1339 process_name
[0] = '(';
1340 memcpy(process_name
+1, p
, l
);
1341 process_name
[1+l
] = ')';
1342 process_name
[1+l
+1] = 0;
1344 rename_process(process_name
);
1347 static bool context_has_address_families(const ExecContext
*c
) {
1350 return c
->address_families_whitelist
||
1351 !set_isempty(c
->address_families
);
1354 static bool context_has_syscall_filters(const ExecContext
*c
) {
1357 return c
->syscall_whitelist
||
1358 !hashmap_isempty(c
->syscall_filter
);
1361 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1364 if (c
->no_new_privileges
)
1367 if (have_effective_cap(CAP_SYS_ADMIN
)) /* if we are privileged, we don't need NNP */
1370 /* We need NNP if we have any form of seccomp and are unprivileged */
1371 return context_has_address_families(c
) ||
1372 c
->memory_deny_write_execute
||
1373 c
->restrict_realtime
||
1374 exec_context_restrict_namespaces_set(c
) ||
1375 c
->protect_kernel_tunables
||
1376 c
->protect_kernel_modules
||
1377 c
->private_devices
||
1378 context_has_syscall_filters(c
) ||
1379 !set_isempty(c
->syscall_archs
) ||
1380 c
->lock_personality
;
1385 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1387 if (is_seccomp_available())
1390 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1394 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1395 uint32_t negative_action
, default_action
, action
;
1401 if (!context_has_syscall_filters(c
))
1404 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1407 negative_action
= c
->syscall_errno
== 0 ? SCMP_ACT_KILL
: SCMP_ACT_ERRNO(c
->syscall_errno
);
1409 if (c
->syscall_whitelist
) {
1410 default_action
= negative_action
;
1411 action
= SCMP_ACT_ALLOW
;
1413 default_action
= SCMP_ACT_ALLOW
;
1414 action
= negative_action
;
1417 if (needs_ambient_hack
) {
1418 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_whitelist
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1423 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
);
1426 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1430 if (set_isempty(c
->syscall_archs
))
1433 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1436 return seccomp_restrict_archs(c
->syscall_archs
);
1439 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1443 if (!context_has_address_families(c
))
1446 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1449 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_whitelist
);
1452 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1456 if (!c
->memory_deny_write_execute
)
1459 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1462 return seccomp_memory_deny_write_execute();
1465 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1469 if (!c
->restrict_realtime
)
1472 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1475 return seccomp_restrict_realtime();
1478 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1482 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1483 * let's protect even those systems where this is left on in the kernel. */
1485 if (!c
->protect_kernel_tunables
)
1488 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1491 return seccomp_protect_sysctl();
1494 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1498 /* Turn off module syscalls on ProtectKernelModules=yes */
1500 if (!c
->protect_kernel_modules
)
1503 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1506 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
));
1509 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1513 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1515 if (!c
->private_devices
)
1518 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1521 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
));
1524 static int apply_restrict_namespaces(const Unit
*u
, const ExecContext
*c
) {
1528 if (!exec_context_restrict_namespaces_set(c
))
1531 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1534 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
1537 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1538 unsigned long personality
;
1544 if (!c
->lock_personality
)
1547 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1550 personality
= c
->personality
;
1552 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1553 if (personality
== PERSONALITY_INVALID
) {
1555 r
= opinionated_personality(&personality
);
1560 return seccomp_lock_personality(personality
);
1565 static void do_idle_pipe_dance(int idle_pipe
[4]) {
1568 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1569 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1571 if (idle_pipe
[0] >= 0) {
1574 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1576 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1579 /* Signal systemd that we are bored and want to continue. */
1580 n
= write(idle_pipe
[3], "x", 1);
1582 /* Wait for systemd to react to the signal above. */
1583 fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1586 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1590 idle_pipe
[3] = safe_close(idle_pipe
[3]);
1593 static int build_environment(
1595 const ExecContext
*c
,
1596 const ExecParameters
*p
,
1599 const char *username
,
1601 dev_t journal_stream_dev
,
1602 ino_t journal_stream_ino
,
1605 _cleanup_strv_free_
char **our_env
= NULL
;
1613 our_env
= new0(char*, 14);
1618 _cleanup_free_
char *joined
= NULL
;
1620 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1622 our_env
[n_env
++] = x
;
1624 if (asprintf(&x
, "LISTEN_FDS=%zu", n_fds
) < 0)
1626 our_env
[n_env
++] = x
;
1628 joined
= strv_join(p
->fd_names
, ":");
1632 x
= strjoin("LISTEN_FDNAMES=", joined
);
1635 our_env
[n_env
++] = x
;
1638 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1639 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1641 our_env
[n_env
++] = x
;
1643 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1645 our_env
[n_env
++] = x
;
1648 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1649 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1650 * check the database directly. */
1651 if (p
->flags
& EXEC_NSS_BYPASS_BUS
) {
1652 x
= strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1655 our_env
[n_env
++] = x
;
1659 x
= strappend("HOME=", home
);
1662 our_env
[n_env
++] = x
;
1666 x
= strappend("LOGNAME=", username
);
1669 our_env
[n_env
++] = x
;
1671 x
= strappend("USER=", username
);
1674 our_env
[n_env
++] = x
;
1678 x
= strappend("SHELL=", shell
);
1681 our_env
[n_env
++] = x
;
1684 if (!sd_id128_is_null(u
->invocation_id
)) {
1685 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1688 our_env
[n_env
++] = x
;
1691 if (exec_context_needs_term(c
)) {
1692 const char *tty_path
, *term
= NULL
;
1694 tty_path
= exec_context_tty_path(c
);
1696 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1697 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1698 * passes to PID 1 ends up all the way in the console login shown. */
1700 if (path_equal(tty_path
, "/dev/console") && getppid() == 1)
1701 term
= getenv("TERM");
1703 term
= default_term_for_tty(tty_path
);
1705 x
= strappend("TERM=", term
);
1708 our_env
[n_env
++] = x
;
1711 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
1712 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
1715 our_env
[n_env
++] = x
;
1718 our_env
[n_env
++] = NULL
;
1719 assert(n_env
<= 12);
1721 *ret
= TAKE_PTR(our_env
);
1726 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
1727 _cleanup_strv_free_
char **pass_env
= NULL
;
1728 size_t n_env
= 0, n_bufsize
= 0;
1731 STRV_FOREACH(i
, c
->pass_environment
) {
1732 _cleanup_free_
char *x
= NULL
;
1738 x
= strjoin(*i
, "=", v
);
1742 if (!GREEDY_REALLOC(pass_env
, n_bufsize
, n_env
+ 2))
1745 pass_env
[n_env
++] = TAKE_PTR(x
);
1746 pass_env
[n_env
] = NULL
;
1749 *ret
= TAKE_PTR(pass_env
);
1754 static bool exec_needs_mount_namespace(
1755 const ExecContext
*context
,
1756 const ExecParameters
*params
,
1757 const ExecRuntime
*runtime
) {
1762 if (context
->root_image
)
1765 if (!strv_isempty(context
->read_write_paths
) ||
1766 !strv_isempty(context
->read_only_paths
) ||
1767 !strv_isempty(context
->inaccessible_paths
))
1770 if (context
->n_bind_mounts
> 0)
1773 if (context
->n_temporary_filesystems
> 0)
1776 if (context
->mount_flags
!= 0)
1779 if (context
->private_tmp
&& runtime
&& (runtime
->tmp_dir
|| runtime
->var_tmp_dir
))
1782 if (context
->private_devices
||
1783 context
->protect_system
!= PROTECT_SYSTEM_NO
||
1784 context
->protect_home
!= PROTECT_HOME_NO
||
1785 context
->protect_kernel_tunables
||
1786 context
->protect_kernel_modules
||
1787 context
->protect_control_groups
)
1790 if (context
->mount_apivfs
&& (context
->root_image
|| context
->root_directory
))
1793 if (context
->dynamic_user
&&
1794 (!strv_isempty(context
->directories
[EXEC_DIRECTORY_STATE
].paths
) ||
1795 !strv_isempty(context
->directories
[EXEC_DIRECTORY_CACHE
].paths
) ||
1796 !strv_isempty(context
->directories
[EXEC_DIRECTORY_LOGS
].paths
)))
1802 static int setup_private_users(uid_t uid
, gid_t gid
) {
1803 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
1804 _cleanup_close_pair_
int errno_pipe
[2] = { -1, -1 };
1805 _cleanup_close_
int unshare_ready_fd
= -1;
1806 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
1811 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1812 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1813 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1814 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1815 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1816 * continues execution normally. */
1818 if (uid
!= 0 && uid_is_valid(uid
)) {
1819 r
= asprintf(&uid_map
,
1820 "0 0 1\n" /* Map root → root */
1821 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
1826 uid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1831 if (gid
!= 0 && gid_is_valid(gid
)) {
1832 r
= asprintf(&gid_map
,
1833 "0 0 1\n" /* Map root → root */
1834 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
1839 gid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1844 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1846 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
1847 if (unshare_ready_fd
< 0)
1850 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1852 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
1855 r
= safe_fork("(sd-userns)", FORK_RESET_SIGNALS
|FORK_DEATHSIG
, &pid
);
1859 _cleanup_close_
int fd
= -1;
1863 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1864 * here, after the parent opened its own user namespace. */
1867 errno_pipe
[0] = safe_close(errno_pipe
[0]);
1869 /* Wait until the parent unshared the user namespace */
1870 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
1875 /* Disable the setgroups() system call in the child user namespace, for good. */
1876 a
= procfs_file_alloca(ppid
, "setgroups");
1877 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1879 if (errno
!= ENOENT
) {
1884 /* If the file is missing the kernel is too old, let's continue anyway. */
1886 if (write(fd
, "deny\n", 5) < 0) {
1891 fd
= safe_close(fd
);
1894 /* First write the GID map */
1895 a
= procfs_file_alloca(ppid
, "gid_map");
1896 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1901 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
1905 fd
= safe_close(fd
);
1907 /* The write the UID map */
1908 a
= procfs_file_alloca(ppid
, "uid_map");
1909 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1914 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
1919 _exit(EXIT_SUCCESS
);
1922 (void) write(errno_pipe
[1], &r
, sizeof(r
));
1923 _exit(EXIT_FAILURE
);
1926 errno_pipe
[1] = safe_close(errno_pipe
[1]);
1928 if (unshare(CLONE_NEWUSER
) < 0)
1931 /* Let the child know that the namespace is ready now */
1932 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
1935 /* Try to read an error code from the child */
1936 n
= read(errno_pipe
[0], &r
, sizeof(r
));
1939 if (n
== sizeof(r
)) { /* an error code was sent to us */
1944 if (n
!= 0) /* on success we should have read 0 bytes */
1947 r
= wait_for_terminate_and_check("(sd-userns)", pid
, 0);
1951 if (r
!= EXIT_SUCCESS
) /* If something strange happened with the child, let's consider this fatal, too */
1957 static int setup_exec_directory(
1958 const ExecContext
*context
,
1959 const ExecParameters
*params
,
1962 ExecDirectoryType type
,
1965 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
1966 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
1967 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
1968 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
1969 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
1970 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
1977 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
1978 assert(exit_status
);
1980 if (!params
->prefix
[type
])
1983 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
1984 if (!uid_is_valid(uid
))
1986 if (!gid_is_valid(gid
))
1990 STRV_FOREACH(rt
, context
->directories
[type
].paths
) {
1991 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
1993 p
= strjoin(params
->prefix
[type
], "/", *rt
);
1999 r
= mkdir_parents_label(p
, 0755);
2003 if (context
->dynamic_user
&&
2004 !IN_SET(type
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
)) {
2005 _cleanup_free_
char *private_root
= NULL
, *relative
= NULL
, *parent
= NULL
;
2007 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2008 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2009 * whose UID is later on reused. To lock this down we use the same trick used by container
2010 * managers to prohibit host users to get access to files of the same UID in containers: we
2011 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2012 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2013 * to make this directory permeable for the service itself.
2015 * Specifically: for a service which wants a special directory "foo/" we first create a
2016 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2017 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2018 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2019 * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2020 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2021 * disabling the access boundary for the service and making sure it only gets access to the
2022 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2024 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2025 * owned by the service itself.
2026 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2027 * files or sockets with other services. */
2029 private_root
= strjoin(params
->prefix
[type
], "/private");
2030 if (!private_root
) {
2035 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2036 r
= mkdir_safe_label(private_root
, 0700, 0, 0, MKDIR_WARN_MODE
);
2040 pp
= strjoin(private_root
, "/", *rt
);
2046 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2047 r
= mkdir_parents_label(pp
, 0755);
2051 if (is_dir(p
, false) > 0 &&
2052 (laccess(pp
, F_OK
) < 0 && errno
== ENOENT
)) {
2054 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2055 * it over. Most likely the service has been upgraded from one that didn't use
2056 * DynamicUser=1, to one that does. */
2058 if (rename(p
, pp
) < 0) {
2063 /* Otherwise, create the actual directory for the service */
2065 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
2066 if (r
< 0 && r
!= -EEXIST
)
2070 parent
= dirname_malloc(p
);
2076 r
= path_make_relative(parent
, pp
, &relative
);
2080 /* And link it up from the original place */
2081 r
= symlink_idempotent(relative
, p
);
2085 /* Lock down the access mode */
2086 if (chmod(pp
, context
->directories
[type
].mode
) < 0) {
2091 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2098 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2099 * a service, and shall not be writable. */
2100 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
2103 /* Then, change the ownership of the whole tree, if necessary */
2104 r
= path_chown_recursive(pp
?: p
, uid
, gid
);
2112 *exit_status
= exit_status_table
[type
];
2117 static int setup_smack(
2118 const ExecContext
*context
,
2119 const ExecCommand
*command
) {
2126 if (context
->smack_process_label
) {
2127 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
2131 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2133 _cleanup_free_
char *exec_label
= NULL
;
2135 r
= mac_smack_read(command
->path
, SMACK_ATTR_EXEC
, &exec_label
);
2136 if (r
< 0 && !IN_SET(r
, -ENODATA
, -EOPNOTSUPP
))
2139 r
= mac_smack_apply_pid(0, exec_label
? : SMACK_DEFAULT_PROCESS_LABEL
);
2149 static int compile_bind_mounts(
2150 const ExecContext
*context
,
2151 const ExecParameters
*params
,
2152 BindMount
**ret_bind_mounts
,
2153 size_t *ret_n_bind_mounts
,
2154 char ***ret_empty_directories
) {
2156 _cleanup_strv_free_
char **empty_directories
= NULL
;
2157 BindMount
*bind_mounts
;
2159 ExecDirectoryType t
;
2164 assert(ret_bind_mounts
);
2165 assert(ret_n_bind_mounts
);
2166 assert(ret_empty_directories
);
2168 n
= context
->n_bind_mounts
;
2169 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2170 if (!params
->prefix
[t
])
2173 n
+= strv_length(context
->directories
[t
].paths
);
2177 *ret_bind_mounts
= NULL
;
2178 *ret_n_bind_mounts
= 0;
2179 *ret_empty_directories
= NULL
;
2183 bind_mounts
= new(BindMount
, n
);
2187 for (i
= 0; i
< context
->n_bind_mounts
; i
++) {
2188 BindMount
*item
= context
->bind_mounts
+ i
;
2191 s
= strdup(item
->source
);
2197 d
= strdup(item
->destination
);
2204 bind_mounts
[h
++] = (BindMount
) {
2207 .read_only
= item
->read_only
,
2208 .recursive
= item
->recursive
,
2209 .ignore_enoent
= item
->ignore_enoent
,
2213 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2216 if (!params
->prefix
[t
])
2219 if (strv_isempty(context
->directories
[t
].paths
))
2222 if (context
->dynamic_user
&&
2223 !IN_SET(t
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
)) {
2226 /* So this is for a dynamic user, and we need to make sure the process can access its own
2227 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2228 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2230 private_root
= strjoin(params
->prefix
[t
], "/private");
2231 if (!private_root
) {
2236 r
= strv_consume(&empty_directories
, private_root
);
2241 STRV_FOREACH(suffix
, context
->directories
[t
].paths
) {
2244 if (context
->dynamic_user
&&
2245 !IN_SET(t
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
))
2246 s
= strjoin(params
->prefix
[t
], "/private/", *suffix
);
2248 s
= strjoin(params
->prefix
[t
], "/", *suffix
);
2261 bind_mounts
[h
++] = (BindMount
) {
2266 .ignore_enoent
= false,
2273 *ret_bind_mounts
= bind_mounts
;
2274 *ret_n_bind_mounts
= n
;
2275 *ret_empty_directories
= TAKE_PTR(empty_directories
);
2280 bind_mount_free_many(bind_mounts
, h
);
2284 static int apply_mount_namespace(
2286 const ExecCommand
*command
,
2287 const ExecContext
*context
,
2288 const ExecParameters
*params
,
2289 const ExecRuntime
*runtime
) {
2291 _cleanup_strv_free_
char **empty_directories
= NULL
;
2292 char *tmp
= NULL
, *var
= NULL
;
2293 const char *root_dir
= NULL
, *root_image
= NULL
;
2294 NamespaceInfo ns_info
= {};
2295 bool needs_sandboxing
;
2296 BindMount
*bind_mounts
= NULL
;
2297 size_t n_bind_mounts
= 0;
2302 /* The runtime struct only contains the parent of the private /tmp,
2303 * which is non-accessible to world users. Inside of it there's a /tmp
2304 * that is sticky, and that's the one we want to use here. */
2306 if (context
->private_tmp
&& runtime
) {
2307 if (runtime
->tmp_dir
)
2308 tmp
= strjoina(runtime
->tmp_dir
, "/tmp");
2309 if (runtime
->var_tmp_dir
)
2310 var
= strjoina(runtime
->var_tmp_dir
, "/tmp");
2313 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2314 root_image
= context
->root_image
;
2317 root_dir
= context
->root_directory
;
2320 r
= compile_bind_mounts(context
, params
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
2325 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2326 * sandbox info, otherwise enforce it, don't ignore protected paths and
2327 * fail if we are enable to apply the sandbox inside the mount namespace.
2329 if (!context
->dynamic_user
&& root_dir
)
2330 ns_info
.ignore_protect_paths
= true;
2332 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2334 if (needs_sandboxing
)
2335 ns_info
= (NamespaceInfo
) {
2336 .ignore_protect_paths
= false,
2337 .private_dev
= context
->private_devices
,
2338 .protect_control_groups
= context
->protect_control_groups
,
2339 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
2340 .protect_kernel_modules
= context
->protect_kernel_modules
,
2341 .mount_apivfs
= context
->mount_apivfs
,
2344 r
= setup_namespace(root_dir
, root_image
,
2345 &ns_info
, context
->read_write_paths
,
2346 needs_sandboxing
? context
->read_only_paths
: NULL
,
2347 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
2351 context
->temporary_filesystems
,
2352 context
->n_temporary_filesystems
,
2355 needs_sandboxing
? context
->protect_home
: PROTECT_HOME_NO
,
2356 needs_sandboxing
? context
->protect_system
: PROTECT_SYSTEM_NO
,
2357 context
->mount_flags
,
2358 DISSECT_IMAGE_DISCARD_ON_LOOP
);
2360 bind_mount_free_many(bind_mounts
, n_bind_mounts
);
2362 /* If we couldn't set up the namespace this is probably due to a
2363 * missing capability. In this case, silently proceeed. */
2364 if (IN_SET(r
, -EPERM
, -EACCES
)) {
2365 log_unit_debug_errno(u
, r
, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2372 static int apply_working_directory(
2373 const ExecContext
*context
,
2374 const ExecParameters
*params
,
2376 const bool needs_mount_ns
,
2382 assert(exit_status
);
2384 if (context
->working_directory_home
) {
2387 *exit_status
= EXIT_CHDIR
;
2393 } else if (context
->working_directory
)
2394 wd
= context
->working_directory
;
2398 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2399 if (!needs_mount_ns
&& context
->root_directory
)
2400 if (chroot(context
->root_directory
) < 0) {
2401 *exit_status
= EXIT_CHROOT
;
2407 d
= prefix_roota(context
->root_directory
, wd
);
2409 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
2410 *exit_status
= EXIT_CHDIR
;
2417 static int setup_keyring(
2419 const ExecContext
*context
,
2420 const ExecParameters
*p
,
2421 uid_t uid
, gid_t gid
) {
2423 key_serial_t keyring
;
2432 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2433 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2434 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2435 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2436 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2437 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2439 if (!(p
->flags
& EXEC_NEW_KEYRING
))
2442 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
2445 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2446 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2447 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2448 * & group is just as nasty as acquiring a reference to the user keyring. */
2450 saved_uid
= getuid();
2451 saved_gid
= getgid();
2453 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2454 if (setregid(gid
, -1) < 0)
2455 return log_unit_error_errno(u
, errno
, "Failed to change GID for user keyring: %m");
2458 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2459 if (setreuid(uid
, -1) < 0) {
2460 r
= log_unit_error_errno(u
, errno
, "Failed to change UID for user keyring: %m");
2465 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
2466 if (keyring
== -1) {
2467 if (errno
== ENOSYS
)
2468 log_unit_debug_errno(u
, errno
, "Kernel keyring not supported, ignoring.");
2469 else if (IN_SET(errno
, EACCES
, EPERM
))
2470 log_unit_debug_errno(u
, errno
, "Kernel keyring access prohibited, ignoring.");
2471 else if (errno
== EDQUOT
)
2472 log_unit_debug_errno(u
, errno
, "Out of kernel keyrings to allocate, ignoring.");
2474 r
= log_unit_error_errno(u
, errno
, "Setting up kernel keyring failed: %m");
2479 /* When requested link the user keyring into the session keyring. */
2480 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
2482 if (keyctl(KEYCTL_LINK
,
2483 KEY_SPEC_USER_KEYRING
,
2484 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
2485 r
= log_unit_error_errno(u
, errno
, "Failed to link user keyring into session keyring: %m");
2490 /* Restore uid/gid back */
2491 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2492 if (setreuid(saved_uid
, -1) < 0) {
2493 r
= log_unit_error_errno(u
, errno
, "Failed to change UID back for user keyring: %m");
2498 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2499 if (setregid(saved_gid
, -1) < 0)
2500 return log_unit_error_errno(u
, errno
, "Failed to change GID back for user keyring: %m");
2503 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2504 if (!sd_id128_is_null(u
->invocation_id
)) {
2507 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
2509 log_unit_debug_errno(u
, errno
, "Failed to add invocation ID to keyring, ignoring: %m");
2511 if (keyctl(KEYCTL_SETPERM
, key
,
2512 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
2513 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
2514 r
= log_unit_error_errno(u
, errno
, "Failed to restrict invocation ID permission: %m");
2519 /* Revert back uid & gid for the the last time, and exit */
2520 /* no extra logging, as only the first already reported error matters */
2521 if (getuid() != saved_uid
)
2522 (void) setreuid(saved_uid
, -1);
2524 if (getgid() != saved_gid
)
2525 (void) setregid(saved_gid
, -1);
2530 static void append_socket_pair(int *array
, size_t *n
, const int pair
[2]) {
2538 array
[(*n
)++] = pair
[0];
2540 array
[(*n
)++] = pair
[1];
2543 static int close_remaining_fds(
2544 const ExecParameters
*params
,
2545 const ExecRuntime
*runtime
,
2546 const DynamicCreds
*dcreds
,
2549 int *fds
, size_t n_fds
) {
2551 size_t n_dont_close
= 0;
2552 int dont_close
[n_fds
+ 12];
2556 if (params
->stdin_fd
>= 0)
2557 dont_close
[n_dont_close
++] = params
->stdin_fd
;
2558 if (params
->stdout_fd
>= 0)
2559 dont_close
[n_dont_close
++] = params
->stdout_fd
;
2560 if (params
->stderr_fd
>= 0)
2561 dont_close
[n_dont_close
++] = params
->stderr_fd
;
2564 dont_close
[n_dont_close
++] = socket_fd
;
2566 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
2567 n_dont_close
+= n_fds
;
2571 append_socket_pair(dont_close
, &n_dont_close
, runtime
->netns_storage_socket
);
2575 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->user
->storage_socket
);
2577 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->group
->storage_socket
);
2580 if (user_lookup_fd
>= 0)
2581 dont_close
[n_dont_close
++] = user_lookup_fd
;
2583 return close_all_fds(dont_close
, n_dont_close
);
2586 static int send_user_lookup(
2594 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2595 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2598 if (user_lookup_fd
< 0)
2601 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
2604 if (writev(user_lookup_fd
,
2606 IOVEC_INIT(&uid
, sizeof(uid
)),
2607 IOVEC_INIT(&gid
, sizeof(gid
)),
2608 IOVEC_INIT_STRING(unit
->id
) }, 3) < 0)
2614 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
2621 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2626 if (!c
->working_directory_home
)
2630 /* Hardcode /root as home directory for UID 0 */
2635 r
= get_home_dir(buf
);
2643 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
2644 _cleanup_strv_free_
char ** list
= NULL
;
2645 ExecDirectoryType t
;
2652 assert(c
->dynamic_user
);
2654 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2655 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2658 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2661 if (t
== EXEC_DIRECTORY_CONFIGURATION
)
2667 STRV_FOREACH(i
, c
->directories
[t
].paths
) {
2670 if (t
== EXEC_DIRECTORY_RUNTIME
)
2671 e
= strjoin(p
->prefix
[t
], "/", *i
);
2673 e
= strjoin(p
->prefix
[t
], "/private/", *i
);
2677 r
= strv_consume(&list
, e
);
2683 *ret
= TAKE_PTR(list
);
2688 static char *exec_command_line(char **argv
);
2690 static int exec_child(
2692 const ExecCommand
*command
,
2693 const ExecContext
*context
,
2694 const ExecParameters
*params
,
2695 ExecRuntime
*runtime
,
2696 DynamicCreds
*dcreds
,
2701 size_t n_storage_fds
,
2702 size_t n_socket_fds
,
2707 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **accum_env
= NULL
, **final_argv
= NULL
;
2708 _cleanup_free_
char *home_buffer
= NULL
;
2709 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
2710 const char *username
= NULL
, *groupname
= NULL
;
2711 const char *home
= NULL
, *shell
= NULL
;
2712 dev_t journal_stream_dev
= 0;
2713 ino_t journal_stream_ino
= 0;
2714 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2715 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
2716 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
2717 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
2719 _cleanup_free_
char *mac_selinux_context_net
= NULL
;
2720 bool use_selinux
= false;
2723 bool use_smack
= false;
2726 bool use_apparmor
= false;
2728 uid_t uid
= UID_INVALID
;
2729 gid_t gid
= GID_INVALID
;
2730 int i
, r
, ngids
= 0;
2732 ExecDirectoryType dt
;
2739 assert(exit_status
);
2741 rename_process_from_path(command
->path
);
2743 /* We reset exactly these signals, since they are the
2744 * only ones we set to SIG_IGN in the main daemon. All
2745 * others we leave untouched because we set them to
2746 * SIG_DFL or a valid handler initially, both of which
2747 * will be demoted to SIG_DFL. */
2748 (void) default_signals(SIGNALS_CRASH_HANDLER
,
2749 SIGNALS_IGNORE
, -1);
2751 if (context
->ignore_sigpipe
)
2752 (void) ignore_signals(SIGPIPE
, -1);
2754 r
= reset_signal_mask();
2756 *exit_status
= EXIT_SIGNAL_MASK
;
2757 return log_unit_error_errno(unit
, r
, "Failed to set process signal mask: %m");
2760 if (params
->idle_pipe
)
2761 do_idle_pipe_dance(params
->idle_pipe
);
2763 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2764 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2765 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2766 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2769 log_set_open_when_needed(true);
2771 /* In case anything used libc syslog(), close this here, too */
2774 n_fds
= n_storage_fds
+ n_socket_fds
;
2775 r
= close_remaining_fds(params
, runtime
, dcreds
, user_lookup_fd
, socket_fd
, fds
, n_fds
);
2777 *exit_status
= EXIT_FDS
;
2778 return log_unit_error_errno(unit
, r
, "Failed to close unwanted file descriptors: %m");
2781 if (!context
->same_pgrp
)
2783 *exit_status
= EXIT_SETSID
;
2784 return log_unit_error_errno(unit
, errno
, "Failed to create new process session: %m");
2787 exec_context_tty_reset(context
, params
);
2789 if (unit_shall_confirm_spawn(unit
)) {
2790 const char *vc
= params
->confirm_spawn
;
2791 _cleanup_free_
char *cmdline
= NULL
;
2793 cmdline
= exec_command_line(argv
);
2795 *exit_status
= EXIT_MEMORY
;
2799 r
= ask_for_confirmation(vc
, unit
, cmdline
);
2800 if (r
!= CONFIRM_EXECUTE
) {
2801 if (r
== CONFIRM_PRETEND_SUCCESS
) {
2802 *exit_status
= EXIT_SUCCESS
;
2805 *exit_status
= EXIT_CONFIRM
;
2806 log_unit_error(unit
, "Execution cancelled by the user");
2811 if (context
->dynamic_user
&& dcreds
) {
2812 _cleanup_strv_free_
char **suggested_paths
= NULL
;
2814 /* Make sure we bypass our own NSS module for any NSS checks */
2815 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2816 *exit_status
= EXIT_USER
;
2817 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
2820 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
2822 *exit_status
= EXIT_MEMORY
;
2826 r
= dynamic_creds_realize(dcreds
, suggested_paths
, &uid
, &gid
);
2828 *exit_status
= EXIT_USER
;
2830 log_unit_error(unit
, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2833 return log_unit_error_errno(unit
, r
, "Failed to update dynamic user credentials: %m");
2836 if (!uid_is_valid(uid
)) {
2837 *exit_status
= EXIT_USER
;
2838 log_unit_error(unit
, "UID validation failed for \""UID_FMT
"\"", uid
);
2842 if (!gid_is_valid(gid
)) {
2843 *exit_status
= EXIT_USER
;
2844 log_unit_error(unit
, "GID validation failed for \""GID_FMT
"\"", gid
);
2849 username
= dcreds
->user
->name
;
2852 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
2854 *exit_status
= EXIT_USER
;
2855 return log_unit_error_errno(unit
, r
, "Failed to determine user credentials: %m");
2858 r
= get_fixed_group(context
, &groupname
, &gid
);
2860 *exit_status
= EXIT_GROUP
;
2861 return log_unit_error_errno(unit
, r
, "Failed to determine group credentials: %m");
2865 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2866 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
2867 &supplementary_gids
, &ngids
);
2869 *exit_status
= EXIT_GROUP
;
2870 return log_unit_error_errno(unit
, r
, "Failed to determine supplementary groups: %m");
2873 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
2875 *exit_status
= EXIT_USER
;
2876 return log_unit_error_errno(unit
, r
, "Failed to send user credentials to PID1: %m");
2879 user_lookup_fd
= safe_close(user_lookup_fd
);
2881 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
2883 *exit_status
= EXIT_CHDIR
;
2884 return log_unit_error_errno(unit
, r
, "Failed to determine $HOME for user: %m");
2887 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2888 * must be sure to drop O_NONBLOCK */
2890 (void) fd_nonblock(socket_fd
, false);
2892 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
2894 *exit_status
= EXIT_STDIN
;
2895 return log_unit_error_errno(unit
, r
, "Failed to set up standard input: %m");
2898 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2900 *exit_status
= EXIT_STDOUT
;
2901 return log_unit_error_errno(unit
, r
, "Failed to set up standard output: %m");
2904 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2906 *exit_status
= EXIT_STDERR
;
2907 return log_unit_error_errno(unit
, r
, "Failed to set up standard error output: %m");
2910 if (params
->cgroup_path
) {
2911 r
= cg_attach_everywhere(params
->cgroup_supported
, params
->cgroup_path
, 0, NULL
, NULL
);
2913 *exit_status
= EXIT_CGROUP
;
2914 return log_unit_error_errno(unit
, r
, "Failed to attach to cgroup %s: %m", params
->cgroup_path
);
2918 if (context
->oom_score_adjust_set
) {
2919 char t
[DECIMAL_STR_MAX(context
->oom_score_adjust
)];
2921 /* When we can't make this change due to EPERM, then
2922 * let's silently skip over it. User namespaces
2923 * prohibit write access to this file, and we
2924 * shouldn't trip up over that. */
2926 sprintf(t
, "%i", context
->oom_score_adjust
);
2927 r
= write_string_file("/proc/self/oom_score_adj", t
, 0);
2928 if (IN_SET(r
, -EPERM
, -EACCES
))
2929 log_unit_debug_errno(unit
, r
, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2931 *exit_status
= EXIT_OOM_ADJUST
;
2932 return log_unit_error_errno(unit
, r
, "Failed to adjust OOM setting: %m");
2936 if (context
->nice_set
)
2937 if (setpriority(PRIO_PROCESS
, 0, context
->nice
) < 0) {
2938 *exit_status
= EXIT_NICE
;
2939 return log_unit_error_errno(unit
, errno
, "Failed to set up process scheduling priority (nice level): %m");
2942 if (context
->cpu_sched_set
) {
2943 struct sched_param param
= {
2944 .sched_priority
= context
->cpu_sched_priority
,
2947 r
= sched_setscheduler(0,
2948 context
->cpu_sched_policy
|
2949 (context
->cpu_sched_reset_on_fork
?
2950 SCHED_RESET_ON_FORK
: 0),
2953 *exit_status
= EXIT_SETSCHEDULER
;
2954 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU scheduling: %m");
2958 if (context
->cpuset
)
2959 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context
->cpuset_ncpus
), context
->cpuset
) < 0) {
2960 *exit_status
= EXIT_CPUAFFINITY
;
2961 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU affinity: %m");
2964 if (context
->ioprio_set
)
2965 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
2966 *exit_status
= EXIT_IOPRIO
;
2967 return log_unit_error_errno(unit
, errno
, "Failed to set up IO scheduling priority: %m");
2970 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
2971 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
2972 *exit_status
= EXIT_TIMERSLACK
;
2973 return log_unit_error_errno(unit
, errno
, "Failed to set up timer slack: %m");
2976 if (context
->personality
!= PERSONALITY_INVALID
) {
2977 r
= safe_personality(context
->personality
);
2979 *exit_status
= EXIT_PERSONALITY
;
2980 return log_unit_error_errno(unit
, r
, "Failed to set up execution domain (personality): %m");
2984 if (context
->utmp_id
)
2985 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
2987 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
2988 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
2992 if (context
->user
) {
2993 r
= chown_terminal(STDIN_FILENO
, uid
);
2995 *exit_status
= EXIT_STDIN
;
2996 return log_unit_error_errno(unit
, r
, "Failed to change ownership of terminal: %m");
3000 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3001 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3002 * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3003 * touch a single hierarchy too. */
3004 if (params
->cgroup_path
&& context
->user
&& (params
->flags
& EXEC_CGROUP_DELEGATE
)) {
3005 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, uid
, gid
);
3007 *exit_status
= EXIT_CGROUP
;
3008 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
3012 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
3013 r
= setup_exec_directory(context
, params
, uid
, gid
, dt
, exit_status
);
3015 return log_unit_error_errno(unit
, r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
3018 r
= build_environment(
3030 *exit_status
= EXIT_MEMORY
;
3034 r
= build_pass_environment(context
, &pass_env
);
3036 *exit_status
= EXIT_MEMORY
;
3040 accum_env
= strv_env_merge(5,
3041 params
->environment
,
3044 context
->environment
,
3048 *exit_status
= EXIT_MEMORY
;
3051 accum_env
= strv_env_clean(accum_env
);
3053 (void) umask(context
->umask
);
3055 r
= setup_keyring(unit
, context
, params
, uid
, gid
);
3057 *exit_status
= EXIT_KEYRING
;
3058 return log_unit_error_errno(unit
, r
, "Failed to set up kernel keyring: %m");
3061 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3062 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
3064 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3065 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
3067 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3068 if (needs_ambient_hack
)
3069 needs_setuid
= false;
3071 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
3073 if (needs_sandboxing
) {
3074 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3075 * present. The actual MAC context application will happen later, as late as possible, to avoid
3076 * impacting our own code paths. */
3079 use_selinux
= mac_selinux_use();
3082 use_smack
= mac_smack_use();
3085 use_apparmor
= mac_apparmor_use();
3090 if (context
->pam_name
&& username
) {
3091 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
3093 *exit_status
= EXIT_PAM
;
3094 return log_unit_error_errno(unit
, r
, "Failed to set up PAM session: %m");
3099 if (context
->private_network
&& runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
3100 if (ns_type_supported(NAMESPACE_NET
)) {
3101 r
= setup_netns(runtime
->netns_storage_socket
);
3103 *exit_status
= EXIT_NETWORK
;
3104 return log_unit_error_errno(unit
, r
, "Failed to set up network namespacing: %m");
3107 log_unit_warning(unit
, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3110 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
3111 if (needs_mount_namespace
) {
3112 r
= apply_mount_namespace(unit
, command
, context
, params
, runtime
);
3114 *exit_status
= EXIT_NAMESPACE
;
3115 return log_unit_error_errno(unit
, r
, "Failed to set up mount namespacing: %m");
3119 /* Apply just after mount namespace setup */
3120 r
= apply_working_directory(context
, params
, home
, needs_mount_namespace
, exit_status
);
3122 return log_unit_error_errno(unit
, r
, "Changing to the requested working directory failed: %m");
3124 /* Drop groups as early as possible */
3126 r
= enforce_groups(gid
, supplementary_gids
, ngids
);
3128 *exit_status
= EXIT_GROUP
;
3129 return log_unit_error_errno(unit
, r
, "Changing group credentials failed: %m");
3133 if (needs_sandboxing
) {
3135 if (use_selinux
&& params
->selinux_context_net
&& socket_fd
>= 0) {
3136 r
= mac_selinux_get_child_mls_label(socket_fd
, command
->path
, context
->selinux_context
, &mac_selinux_context_net
);
3138 *exit_status
= EXIT_SELINUX_CONTEXT
;
3139 return log_unit_error_errno(unit
, r
, "Failed to determine SELinux context: %m");
3144 if (context
->private_users
) {
3145 r
= setup_private_users(uid
, gid
);
3147 *exit_status
= EXIT_USER
;
3148 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing: %m");
3153 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3154 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3155 * was needed to upload the policy and can now be closed as well. */
3156 r
= close_all_fds(fds
, n_fds
);
3158 r
= shift_fds(fds
, n_fds
);
3160 r
= flags_fds(fds
, n_storage_fds
, n_socket_fds
, context
->non_blocking
);
3162 *exit_status
= EXIT_FDS
;
3163 return log_unit_error_errno(unit
, r
, "Failed to adjust passed file descriptors: %m");
3166 secure_bits
= context
->secure_bits
;
3168 if (needs_sandboxing
) {
3171 for (i
= 0; i
< _RLIMIT_MAX
; i
++) {
3173 if (!context
->rlimit
[i
])
3176 r
= setrlimit_closest(i
, context
->rlimit
[i
]);
3178 *exit_status
= EXIT_LIMITS
;
3179 return log_unit_error_errno(unit
, r
, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(i
));
3183 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3184 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
3185 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
3186 *exit_status
= EXIT_LIMITS
;
3187 return log_unit_error_errno(unit
, errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3192 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3193 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3195 r
= setup_smack(context
, command
);
3197 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
3198 return log_unit_error_errno(unit
, r
, "Failed to set SMACK process label: %m");
3203 bset
= context
->capability_bounding_set
;
3204 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3205 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3206 * instead of us doing that */
3207 if (needs_ambient_hack
)
3208 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
3209 (UINT64_C(1) << CAP_SETUID
) |
3210 (UINT64_C(1) << CAP_SETGID
);
3212 if (!cap_test_all(bset
)) {
3213 r
= capability_bounding_set_drop(bset
, false);
3215 *exit_status
= EXIT_CAPABILITIES
;
3216 return log_unit_error_errno(unit
, r
, "Failed to drop capabilities: %m");
3220 /* This is done before enforce_user, but ambient set
3221 * does not survive over setresuid() if keep_caps is not set. */
3222 if (!needs_ambient_hack
&&
3223 context
->capability_ambient_set
!= 0) {
3224 r
= capability_ambient_set_apply(context
->capability_ambient_set
, true);
3226 *exit_status
= EXIT_CAPABILITIES
;
3227 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (before UID change): %m");
3233 if (context
->user
) {
3234 r
= enforce_user(context
, uid
);
3236 *exit_status
= EXIT_USER
;
3237 return log_unit_error_errno(unit
, r
, "Failed to change UID to " UID_FMT
": %m", uid
);
3240 if (!needs_ambient_hack
&&
3241 context
->capability_ambient_set
!= 0) {
3243 /* Fix the ambient capabilities after user change. */
3244 r
= capability_ambient_set_apply(context
->capability_ambient_set
, false);
3246 *exit_status
= EXIT_CAPABILITIES
;
3247 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (after UID change): %m");
3250 /* If we were asked to change user and ambient capabilities
3251 * were requested, we had to add keep-caps to the securebits
3252 * so that we would maintain the inherited capability set
3253 * through the setresuid(). Make sure that the bit is added
3254 * also to the context secure_bits so that we don't try to
3255 * drop the bit away next. */
3257 secure_bits
|= 1<<SECURE_KEEP_CAPS
;
3262 if (needs_sandboxing
) {
3263 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3264 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3265 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3266 * are restricted. */
3270 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
3273 r
= setexeccon(exec_context
);
3275 *exit_status
= EXIT_SELINUX_CONTEXT
;
3276 return log_unit_error_errno(unit
, r
, "Failed to change SELinux context to %s: %m", exec_context
);
3283 if (use_apparmor
&& context
->apparmor_profile
) {
3284 r
= aa_change_onexec(context
->apparmor_profile
);
3285 if (r
< 0 && !context
->apparmor_profile_ignore
) {
3286 *exit_status
= EXIT_APPARMOR_PROFILE
;
3287 return log_unit_error_errno(unit
, errno
, "Failed to prepare AppArmor profile change to %s: %m", context
->apparmor_profile
);
3292 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3293 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3294 if (prctl(PR_GET_SECUREBITS
) != secure_bits
)
3295 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
3296 *exit_status
= EXIT_SECUREBITS
;
3297 return log_unit_error_errno(unit
, errno
, "Failed to set process secure bits: %m");
3300 if (context_has_no_new_privileges(context
))
3301 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
3302 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
3303 return log_unit_error_errno(unit
, errno
, "Failed to disable new privileges: %m");
3307 r
= apply_address_families(unit
, context
);
3309 *exit_status
= EXIT_ADDRESS_FAMILIES
;
3310 return log_unit_error_errno(unit
, r
, "Failed to restrict address families: %m");
3313 r
= apply_memory_deny_write_execute(unit
, context
);
3315 *exit_status
= EXIT_SECCOMP
;
3316 return log_unit_error_errno(unit
, r
, "Failed to disable writing to executable memory: %m");
3319 r
= apply_restrict_realtime(unit
, context
);
3321 *exit_status
= EXIT_SECCOMP
;
3322 return log_unit_error_errno(unit
, r
, "Failed to apply realtime restrictions: %m");
3325 r
= apply_restrict_namespaces(unit
, context
);
3327 *exit_status
= EXIT_SECCOMP
;
3328 return log_unit_error_errno(unit
, r
, "Failed to apply namespace restrictions: %m");
3331 r
= apply_protect_sysctl(unit
, context
);
3333 *exit_status
= EXIT_SECCOMP
;
3334 return log_unit_error_errno(unit
, r
, "Failed to apply sysctl restrictions: %m");
3337 r
= apply_protect_kernel_modules(unit
, context
);
3339 *exit_status
= EXIT_SECCOMP
;
3340 return log_unit_error_errno(unit
, r
, "Failed to apply module loading restrictions: %m");
3343 r
= apply_private_devices(unit
, context
);
3345 *exit_status
= EXIT_SECCOMP
;
3346 return log_unit_error_errno(unit
, r
, "Failed to set up private devices: %m");
3349 r
= apply_syscall_archs(unit
, context
);
3351 *exit_status
= EXIT_SECCOMP
;
3352 return log_unit_error_errno(unit
, r
, "Failed to apply syscall architecture restrictions: %m");
3355 r
= apply_lock_personality(unit
, context
);
3357 *exit_status
= EXIT_SECCOMP
;
3358 return log_unit_error_errno(unit
, r
, "Failed to lock personalities: %m");
3361 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3362 * by the filter as little as possible. */
3363 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
3365 *exit_status
= EXIT_SECCOMP
;
3366 return log_unit_error_errno(unit
, r
, "Failed to apply system call filters: %m");
3371 if (!strv_isempty(context
->unset_environment
)) {
3374 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
3376 *exit_status
= EXIT_MEMORY
;
3380 strv_free_and_replace(accum_env
, ee
);
3383 final_argv
= replace_env_argv(argv
, accum_env
);
3385 *exit_status
= EXIT_MEMORY
;
3389 if (DEBUG_LOGGING
) {
3390 _cleanup_free_
char *line
;
3392 line
= exec_command_line(final_argv
);
3394 log_struct(LOG_DEBUG
,
3395 "EXECUTABLE=%s", command
->path
,
3396 LOG_UNIT_MESSAGE(unit
, "Executing: %s", line
),
3398 LOG_UNIT_INVOCATION_ID(unit
),
3403 execve(command
->path
, final_argv
, accum_env
);
3405 if (errno
== ENOENT
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
)) {
3407 log_struct_errno(LOG_INFO
, errno
,
3408 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3410 LOG_UNIT_INVOCATION_ID(unit
),
3411 LOG_UNIT_MESSAGE(unit
, "Executable %s missing, skipping: %m",
3413 "EXECUTABLE=%s", command
->path
,
3419 *exit_status
= EXIT_EXEC
;
3420 return log_unit_error_errno(unit
, errno
, "Failed to execute command: %m");
/* Forward declarations: both helpers are defined further down in this file, but exec_spawn() right
 * below already calls them. */
3423 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***l
);
3424 static int exec_context_named_iofds(const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[3]);
/* Spawns one command of a unit. The visible steps are: sanity-check the passed file descriptors,
 * resolve named stdio fds (exec_context_named_iofds()) and environment files
 * (exec_context_load_environment()), log the command line about to be executed, fork, and run
 * exec_child() with the prepared state. After a successful fork the new PID is logged, attached to
 * params->cgroup_path if one is set (the result of cg_attach() is deliberately ignored) and recorded
 * via exec_status_start().
 *
 * NOTE(review): this listing skips source lines (the embedded numbering jumps), so the remaining
 * parameters, several local declarations, the fork call itself, most exec_child() arguments and the
 * return statements are not visible here; confirm details against the complete file. */
3426 int exec_spawn(Unit
*unit
,
3427 ExecCommand
*command
,
3428 const ExecContext
*context
,
3429 const ExecParameters
*params
,
3430 ExecRuntime
*runtime
,
3431 DynamicCreds
*dcreds
,
3434 _cleanup_strv_free_
char **files_env
= NULL
;
3436 size_t n_storage_fds
= 0, n_socket_fds
= 0;
3437 _cleanup_free_
char *line
= NULL
;
3439 int named_iofds
[3] = { -1, -1, -1 };
/* An fd array is mandatory as soon as any storage or socket fds are announced. */
3448 assert(params
->fds
|| (params
->n_storage_fds
+ params
->n_socket_fds
<= 0));
/* Socket-backed stdin/stdout/stderr: zero sockets or more than one socket is logged as an error;
 * otherwise the socket is taken from the first slot of params->fds. */
3450 if (context
->std_input
== EXEC_INPUT_SOCKET
||
3451 context
->std_output
== EXEC_OUTPUT_SOCKET
||
3452 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
3454 if (params
->n_socket_fds
> 1) {
3455 log_unit_error(unit
, "Got more than one socket.");
3459 if (params
->n_socket_fds
== 0) {
3460 log_unit_error(unit
, "Got no socket.");
3464 socket_fd
= params
->fds
[0];
3468 n_storage_fds
= params
->n_storage_fds
;
3469 n_socket_fds
= params
->n_socket_fds
;
3472 r
= exec_context_named_iofds(context
, params
, named_iofds
);
3474 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
3476 r
= exec_context_load_environment(unit
, context
, &files_env
);
3478 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
/* An argv supplied through params takes precedence over the command's own argv. */
3480 argv
= params
->argv
?: command
->argv
;
3481 line
= exec_command_line(argv
);
3485 log_struct(LOG_DEBUG
,
3486 LOG_UNIT_MESSAGE(unit
, "About to execute: %s", line
),
3487 "EXECUTABLE=%s", command
->path
,
3489 LOG_UNIT_INVOCATION_ID(unit
),
3494 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
/* NOTE(review): presumably the child branch of the fork; exit_status names the step reported in the
 * "Failed at step" message below when exec_child() fails. */
3497 int exit_status
= EXIT_SUCCESS
;
3499 r
= exec_child(unit
,
3512 unit
->manager
->user_lookup_fds
[1],
3516 log_struct_errno(LOG_ERR
, r
,
3517 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3519 LOG_UNIT_INVOCATION_ID(unit
),
3520 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
3521 exit_status_to_string(exit_status
, EXIT_STATUS_SYSTEMD
),
3523 "EXECUTABLE=%s", command
->path
,
3530 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
3532 /* We add the new process to the cgroup both in the child (so
3533 * that we can be sure that no user code is ever executed
3534 * outside of the cgroup) and in the parent (so that we can be
3535 * sure that when we kill the cgroup the process will be
3537 if (params
->cgroup_path
)
3538 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, pid
);
3540 exec_status_start(&command
->exec_status
, pid
);
/* Fills an ExecContext with its default settings: best-effort I/O priority class at level 0,
 * SCHED_OTHER CPU scheduling, LOG_DAEMON|LOG_INFO syslog priority with level prefixes enabled,
 * SIGPIPE ignored, mode 0755 for every exec directory type, the full capability bounding set, all
 * namespaces permitted and log_level_max of -1.
 *
 * NOTE(review): lines 3548-3551 of the original are missing from this listing, so further
 * initialisation done there cannot be confirmed here. */
3546 void exec_context_init(ExecContext
*c
) {
3547 ExecDirectoryType i
;
3552 c
->ioprio
= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 0);
3553 c
->cpu_sched_policy
= SCHED_OTHER
;
3554 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
3555 c
->syslog_level_prefix
= true;
3556 c
->ignore_sigpipe
= true;
/* NSEC_INFINITY acts as "not configured": exec_child() only sets the timer slack when the value
 * differs from it. */
3557 c
->timer_slack_nsec
= NSEC_INFINITY
;
/* Likewise, exec_child() only changes the personality when it is not PERSONALITY_INVALID. */
3558 c
->personality
= PERSONALITY_INVALID
;
3559 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3560 c
->directories
[i
].mode
= 0755;
3561 c
->capability_bounding_set
= CAP_ALL
;
3562 c
->restrict_namespaces
= NAMESPACE_FLAGS_ALL
;
3563 c
->log_level_max
= -1;
/* Releases everything an ExecContext owns. Each string, string vector, set and hashmap field is
 * passed to its free helper and the helper's return value is stored back into the field
 * (presumably NULL, so that a second call is harmless; confirm against the helpers). Array-style
 * members (bind mounts, temporary filesystems, stdin data) additionally get their element
 * count/size reset to 0, and log_level_max goes back to -1.
 *
 * NOTE(review): some original lines (e.g. the declaration of the loop index "l") are missing from
 * this listing. */
3566 void exec_context_done(ExecContext
*c
) {
3567 ExecDirectoryType i
;
/* Environment related string vectors. */
3572 c
->environment
= strv_free(c
->environment
);
3573 c
->environment_files
= strv_free(c
->environment_files
);
3574 c
->pass_environment
= strv_free(c
->pass_environment
);
3575 c
->unset_environment
= strv_free(c
->unset_environment
);
/* Per-resource limit entries are individually allocated. */
3577 for (l
= 0; l
< ELEMENTSOF(c
->rlimit
); l
++)
3578 c
->rlimit
[l
] = mfree(c
->rlimit
[l
]);
/* One fd name and one file path per stdio stream (stdin, stdout, stderr). */
3580 for (l
= 0; l
< 3; l
++) {
3581 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
3582 c
->stdio_file
[l
] = mfree(c
->stdio_file
[l
]);
3585 c
->working_directory
= mfree(c
->working_directory
);
3586 c
->root_directory
= mfree(c
->root_directory
);
3587 c
->root_image
= mfree(c
->root_image
);
3588 c
->tty_path
= mfree(c
->tty_path
);
3589 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
3590 c
->user
= mfree(c
->user
);
3591 c
->group
= mfree(c
->group
);
3593 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
3595 c
->pam_name
= mfree(c
->pam_name
);
3597 c
->read_only_paths
= strv_free(c
->read_only_paths
);
3598 c
->read_write_paths
= strv_free(c
->read_write_paths
);
3599 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
/* These free helpers return nothing, hence pointer and count are reset by hand. */
3601 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
3602 c
->bind_mounts
= NULL
;
3603 c
->n_bind_mounts
= 0;
3604 temporary_filesystem_free_many(c
->temporary_filesystems
, c
->n_temporary_filesystems
);
3605 c
->temporary_filesystems
= NULL
;
3606 c
->n_temporary_filesystems
= 0;
3608 c
->cpuset
= cpu_set_mfree(c
->cpuset
);
3610 c
->utmp_id
= mfree(c
->utmp_id
);
3611 c
->selinux_context
= mfree(c
->selinux_context
);
3612 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
3613 c
->smack_process_label
= mfree(c
->smack_process_label
);
3615 c
->syscall_filter
= hashmap_free(c
->syscall_filter
);
3616 c
->syscall_archs
= set_free(c
->syscall_archs
);
3617 c
->address_families
= set_free(c
->address_families
);
3619 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3620 c
->directories
[i
].paths
= strv_free(c
->directories
[i
].paths
);
3622 c
->log_level_max
= -1;
3624 exec_context_free_log_extra_fields(c
);
3626 c
->stdin_data
= mfree(c
->stdin_data
);
3627 c
->stdin_data_size
= 0;
/* Removes the runtime directories configured in c->directories[EXEC_DIRECTORY_RUNTIME].paths. Each
 * entry is joined onto runtime_prefix as "<runtime_prefix>/<entry>" and passed to rm_rf() with
 * REMOVE_ROOT; the rm_rf() result is deliberately ignored.
 *
 * NOTE(review): what is returned when runtime_prefix is NULL, the handling of a failed strjoin()
 * and the final return value are on lines missing from this listing. */
3630 int exec_context_destroy_runtime_directory(const ExecContext
*c
, const char *runtime_prefix
) {
3635 if (!runtime_prefix
)
3638 STRV_FOREACH(i
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) {
3639 _cleanup_free_
char *p
;
3641 p
= strjoin(runtime_prefix
, "/", *i
);
3645 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3647 (void) rm_rf(p
, REMOVE_ROOT
);
/* Releases the path string and the argument vector of one ExecCommand, storing the free helpers'
 * return values back into the fields. The ExecCommand structure itself is not freed here.
 * NOTE(review): line 3654 of the original is missing from this listing. */
3653 static void exec_command_done(ExecCommand
*c
) {
3656 c
->path
= mfree(c
->path
);
3658 c
->argv
= strv_free(c
->argv
);
3661 void exec_command_done_array(ExecCommand
*c
, size_t n
) {
3664 for (i
= 0; i
< n
; i
++)
3665 exec_command_done(c
+i
);
/* Tears down a linked list of ExecCommand entries: the visible statements unlink entry "i" from the
 * list headed by c (LIST_REMOVE on the "command" link) and release its contents with
 * exec_command_done().
 * NOTE(review): the loop header, the release of the entry itself and the return statement are on
 * lines missing from this listing; exec_command_free_array() stores the return value back into the
 * list head, so presumably NULL is returned. */
3668 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
3672 LIST_REMOVE(command
, c
, i
);
3673 exec_command_done(i
);
3680 void exec_command_free_array(ExecCommand
**c
, size_t n
) {
3683 for (i
= 0; i
< n
; i
++)
3684 c
[i
] = exec_command_free_list(c
[i
]);
/* Context handed as userdata to invalid_env(); that callback reads its ->unit and ->path members.
 * NOTE(review): the member declarations and the end of this typedef are on lines missing from this
 * listing. */
3687 typedef struct InvalidEnvInfo
{
3692 static void invalid_env(const char *p
, void *userdata
) {
3693 InvalidEnvInfo
*info
= userdata
;
3695 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
/* Returns the fd name associated with one stdio stream of the context. For a stream configured as a
 * named fd (EXEC_INPUT_NAMED_FD / EXEC_OUTPUT_NAMED_FD) this is the name stored in
 * c->stdio_fdname[], falling back to "stdin", "stdout" or "stderr" when no name was stored.
 *
 * NOTE(review): the dispatch on fd_index and the value returned for a stream that is not a named fd
 * are on lines missing from this listing; exec_context_named_iofds() NULL-checks the result, so
 * presumably NULL is returned in that case. */
3698 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
3704 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
3707 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
3710 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
3713 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
3716 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
3719 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
/* Resolves the stdio streams that are configured as named fds to actual file descriptors. The names
 * of the passed fds (p->fd_names[]) are compared against the name wanted for stdin, stdout and
 * stderr; on a match the corresponding p->fds[] entry is stored in named_iofds[]. A slot is only
 * filled while it is still negative, so the first matching fd wins.
 *
 * Returns 0 when "targets" is zero at the end, -ENOENT otherwise.
 *
 * NOTE(review): the declarations of i, targets and n_fds and the statements that update "targets"
 * after a match are on lines missing from this listing; presumably it is decremented once per
 * resolved stream. */
3726 static int exec_context_named_iofds(const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[3]) {
3728 const char* stdio_fdname
[3];
/* Number of stdio streams that ask for a named fd (each comparison contributes 0 or 1). */
3734 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
3735 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
3736 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
3738 for (i
= 0; i
< 3; i
++)
3739 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
3741 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
/* Stop scanning early once nothing is left to resolve. */
3743 for (i
= 0; i
< n_fds
&& targets
> 0; i
++)
3744 if (named_iofds
[STDIN_FILENO
] < 0 &&
3745 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
3746 stdio_fdname
[STDIN_FILENO
] &&
3747 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
3749 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
3752 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
3753 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
3754 stdio_fdname
[STDOUT_FILENO
] &&
3755 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
3757 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
3760 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
3761 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
3762 stdio_fdname
[STDERR_FILENO
] &&
3763 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
3765 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
3769 return targets
== 0 ? 0 : -ENOENT
;
/* Loads the environment files listed in c->environment_files. Visible steps per entry: the file
 * name is checked with path_is_absolute(), expanded with safe_glob(), and every match is read with
 * load_env_file(). The loaded assignments are filtered through strv_env_clean_with_callback(), with
 * invalid_env() as the callback, and merged into the accumulated vector r with strv_env_merge().
 *
 * NOTE(review): a large part of this function is missing from this listing (declarations of fn, k,
 * n, p and m, all error handling, the use of "ignore", and the final hand-over through *l), so the
 * error and return semantics cannot be confirmed from here. */
3772 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***l
) {
3773 char **i
, **r
= NULL
;
3778 STRV_FOREACH(i
, c
->environment_files
) {
3782 bool ignore
= false;
3784 _cleanup_globfree_ glob_t pglob
= {};
3793 if (!path_is_absolute(fn
)) {
3801 /* Filename supports globbing, take all matching files */
3802 k
= safe_glob(fn
, 0, &pglob
);
3811 /* When we don't match anything, -ENOENT should be returned */
3812 assert(pglob
.gl_pathc
> 0);
3814 for (n
= 0; n
< pglob
.gl_pathc
; n
++) {
3815 k
= load_env_file(NULL
, pglob
.gl_pathv
[n
], NULL
, &p
);
3823 /* Log invalid environment variables with filename */
3825 InvalidEnvInfo info
= {
3827 .path
= pglob
.gl_pathv
[n
]
3830 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
3838 m
= strv_env_merge(2, r
, p
);
/* Conservative check whether the given tty may be the device behind /dev/console. The name is first
 * passed through skip_dev_prefix(). If the console cannot be resolved the answer is "true" (assume
 * it may match). Otherwise the tty matches when it equals the resolved console, or when the console
 * resolves to "tty0" and the tty is a virtual console.
 *
 * NOTE(review): lines 3856-3859 and the statement executed when the name equals "console" are
 * missing from this listing (presumably "return true"). */
3854 static bool tty_may_match_dev_console(const char *tty
) {
3855 _cleanup_free_
char *resolved
= NULL
;
3860 tty
= skip_dev_prefix(tty
);
3862 /* trivial identity? */
3863 if (streq(tty
, "console"))
3866 if (resolve_dev_console(&resolved
) < 0)
3867 return true; /* if we could not resolve, assume it may */
3869 /* "tty0" means the active VC, so it may be the same sometimes */
3870 return streq(resolved
, tty
) || (streq(resolved
, "tty0") && tty_is_vc(tty
));
/* Tells whether executing with this context may touch the console: true when the context does
 * anything terminal-related (TTY reset, VT disallocation, or terminal-backed stdin, stdout or
 * stderr) and its TTY path may refer to /dev/console according to tty_may_match_dev_console().
 *
 * NOTE(review): line 3876 of the original, which sits inside the condition list, is missing from
 * this listing, so at least one further condition is not visible here. */
3873 bool exec_context_may_touch_console(const ExecContext
*ec
) {
3875 return (ec
->tty_reset
||
3877 ec
->tty_vt_disallocate
||
3878 is_terminal_input(ec
->std_input
) ||
3879 is_terminal_output(ec
->std_output
) ||
3880 is_terminal_output(ec
->std_error
)) &&
3881 tty_may_match_dev_console(exec_context_tty_path(ec
));
/* Writes strings to f, each one preceded by a single space and without a trailing newline.
 * NOTE(review): the declaration of "g" and the loop header are on lines missing from this listing;
 * presumably g iterates over the string vector l. */
3884 static void strv_fprintf(FILE *f
, char **l
) {
3890 fprintf(f
, " %s", *g
);
3893 void exec_context_dump(const ExecContext
*c
, FILE* f
, const char *prefix
) {
3894 ExecDirectoryType dt
;
3902 prefix
= strempty(prefix
);
3906 "%sWorkingDirectory: %s\n"
3907 "%sRootDirectory: %s\n"
3908 "%sNonBlocking: %s\n"
3909 "%sPrivateTmp: %s\n"
3910 "%sPrivateDevices: %s\n"
3911 "%sProtectKernelTunables: %s\n"
3912 "%sProtectKernelModules: %s\n"
3913 "%sProtectControlGroups: %s\n"
3914 "%sPrivateNetwork: %s\n"
3915 "%sPrivateUsers: %s\n"
3916 "%sProtectHome: %s\n"
3917 "%sProtectSystem: %s\n"
3918 "%sMountAPIVFS: %s\n"
3919 "%sIgnoreSIGPIPE: %s\n"
3920 "%sMemoryDenyWriteExecute: %s\n"
3921 "%sRestrictRealtime: %s\n"
3922 "%sKeyringMode: %s\n",
3924 prefix
, c
->working_directory
? c
->working_directory
: "/",
3925 prefix
, c
->root_directory
? c
->root_directory
: "/",
3926 prefix
, yes_no(c
->non_blocking
),
3927 prefix
, yes_no(c
->private_tmp
),
3928 prefix
, yes_no(c
->private_devices
),
3929 prefix
, yes_no(c
->protect_kernel_tunables
),
3930 prefix
, yes_no(c
->protect_kernel_modules
),
3931 prefix
, yes_no(c
->protect_control_groups
),
3932 prefix
, yes_no(c
->private_network
),
3933 prefix
, yes_no(c
->private_users
),
3934 prefix
, protect_home_to_string(c
->protect_home
),
3935 prefix
, protect_system_to_string(c
->protect_system
),
3936 prefix
, yes_no(c
->mount_apivfs
),
3937 prefix
, yes_no(c
->ignore_sigpipe
),
3938 prefix
, yes_no(c
->memory_deny_write_execute
),
3939 prefix
, yes_no(c
->restrict_realtime
),
3940 prefix
, exec_keyring_mode_to_string(c
->keyring_mode
));
3943 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
3945 STRV_FOREACH(e
, c
->environment
)
3946 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
3948 STRV_FOREACH(e
, c
->environment_files
)
3949 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
3951 STRV_FOREACH(e
, c
->pass_environment
)
3952 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
3954 STRV_FOREACH(e
, c
->unset_environment
)
3955 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
3957 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
3959 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
3960 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
3962 STRV_FOREACH(d
, c
->directories
[dt
].paths
)
3963 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), *d
);
3971 if (c
->oom_score_adjust_set
)
3973 "%sOOMScoreAdjust: %i\n",
3974 prefix
, c
->oom_score_adjust
);
3976 for (i
= 0; i
< RLIM_NLIMITS
; i
++)
3978 fprintf(f
, "Limit%s%s: " RLIM_FMT
"\n",
3979 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
3980 fprintf(f
, "Limit%s%sSoft: " RLIM_FMT
"\n",
3981 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
3984 if (c
->ioprio_set
) {
3985 _cleanup_free_
char *class_str
= NULL
;
3987 r
= ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c
->ioprio
), &class_str
);
3989 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
3991 fprintf(f
, "%sIOPriority: %lu\n", prefix
, IOPRIO_PRIO_DATA(c
->ioprio
));
3994 if (c
->cpu_sched_set
) {
3995 _cleanup_free_
char *policy_str
= NULL
;
3997 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
3999 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
4002 "%sCPUSchedulingPriority: %i\n"
4003 "%sCPUSchedulingResetOnFork: %s\n",
4004 prefix
, c
->cpu_sched_priority
,
4005 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
4009 fprintf(f
, "%sCPUAffinity:", prefix
);
4010 for (i
= 0; i
< c
->cpuset_ncpus
; i
++)
4011 if (CPU_ISSET_S(i
, CPU_ALLOC_SIZE(c
->cpuset_ncpus
), c
->cpuset
))
4012 fprintf(f
, " %u", i
);
4016 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
4017 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
4020 "%sStandardInput: %s\n"
4021 "%sStandardOutput: %s\n"
4022 "%sStandardError: %s\n",
4023 prefix
, exec_input_to_string(c
->std_input
),
4024 prefix
, exec_output_to_string(c
->std_output
),
4025 prefix
, exec_output_to_string(c
->std_error
));
4027 if (c
->std_input
== EXEC_INPUT_NAMED_FD
)
4028 fprintf(f
, "%sStandardInputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDIN_FILENO
]);
4029 if (c
->std_output
== EXEC_OUTPUT_NAMED_FD
)
4030 fprintf(f
, "%sStandardOutputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDOUT_FILENO
]);
4031 if (c
->std_error
== EXEC_OUTPUT_NAMED_FD
)
4032 fprintf(f
, "%sStandardErrorFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDERR_FILENO
]);
4034 if (c
->std_input
== EXEC_INPUT_FILE
)
4035 fprintf(f
, "%sStandardInputFile: %s\n", prefix
, c
->stdio_file
[STDIN_FILENO
]);
4036 if (c
->std_output
== EXEC_OUTPUT_FILE
)
4037 fprintf(f
, "%sStandardOutputFile: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
4038 if (c
->std_error
== EXEC_OUTPUT_FILE
)
4039 fprintf(f
, "%sStandardErrorFile: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
4045 "%sTTYVHangup: %s\n"
4046 "%sTTYVTDisallocate: %s\n",
4047 prefix
, c
->tty_path
,
4048 prefix
, yes_no(c
->tty_reset
),
4049 prefix
, yes_no(c
->tty_vhangup
),
4050 prefix
, yes_no(c
->tty_vt_disallocate
));
4052 if (IN_SET(c
->std_output
,
4055 EXEC_OUTPUT_JOURNAL
,
4056 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
4057 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
4058 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
4059 IN_SET(c
->std_error
,
4062 EXEC_OUTPUT_JOURNAL
,
4063 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
4064 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
4065 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
4067 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
4069 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
4071 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
4073 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
4075 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
4078 if (c
->log_level_max
>= 0) {
4079 _cleanup_free_
char *t
= NULL
;
4081 (void) log_level_to_string_alloc(c
->log_level_max
, &t
);
4083 fprintf(f
, "%sLogLevelMax: %s\n", prefix
, strna(t
));
4086 if (c
->n_log_extra_fields
> 0) {
4089 for (j
= 0; j
< c
->n_log_extra_fields
; j
++) {
4090 fprintf(f
, "%sLogExtraFields: ", prefix
);
4091 fwrite(c
->log_extra_fields
[j
].iov_base
,
4092 1, c
->log_extra_fields
[j
].iov_len
,
4098 if (c
->secure_bits
) {
4099 _cleanup_free_
char *str
= NULL
;
4101 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
4103 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
4106 if (c
->capability_bounding_set
!= CAP_ALL
) {
4107 _cleanup_free_
char *str
= NULL
;
4109 r
= capability_set_to_string_alloc(c
->capability_bounding_set
, &str
);
4111 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
4114 if (c
->capability_ambient_set
!= 0) {
4115 _cleanup_free_
char *str
= NULL
;
4117 r
= capability_set_to_string_alloc(c
->capability_ambient_set
, &str
);
4119 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
4123 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
4125 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
4127 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
4129 if (!strv_isempty(c
->supplementary_groups
)) {
4130 fprintf(f
, "%sSupplementaryGroups:", prefix
);
4131 strv_fprintf(f
, c
->supplementary_groups
);
4136 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
4138 if (!strv_isempty(c
->read_write_paths
)) {
4139 fprintf(f
, "%sReadWritePaths:", prefix
);
4140 strv_fprintf(f
, c
->read_write_paths
);
4144 if (!strv_isempty(c
->read_only_paths
)) {
4145 fprintf(f
, "%sReadOnlyPaths:", prefix
);
4146 strv_fprintf(f
, c
->read_only_paths
);
4150 if (!strv_isempty(c
->inaccessible_paths
)) {
4151 fprintf(f
, "%sInaccessiblePaths:", prefix
);
4152 strv_fprintf(f
, c
->inaccessible_paths
);
4156 if (c
->n_bind_mounts
> 0)
4157 for (i
= 0; i
< c
->n_bind_mounts
; i
++)
4158 fprintf(f
, "%s%s: %s%s:%s:%s\n", prefix
,
4159 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
4160 c
->bind_mounts
[i
].ignore_enoent
? "-": "",
4161 c
->bind_mounts
[i
].source
,
4162 c
->bind_mounts
[i
].destination
,
4163 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
4165 if (c
->n_temporary_filesystems
> 0)
4166 for (i
= 0; i
< c
->n_temporary_filesystems
; i
++) {
4167 TemporaryFileSystem
*t
= c
->temporary_filesystems
+ i
;
4169 fprintf(f
, "%sTemporaryFileSystem: %s%s%s\n", prefix
,
4171 isempty(t
->options
) ? "" : ":",
4172 strempty(t
->options
));
4177 "%sUtmpIdentifier: %s\n",
4178 prefix
, c
->utmp_id
);
4180 if (c
->selinux_context
)
4182 "%sSELinuxContext: %s%s\n",
4183 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
4185 if (c
->apparmor_profile
)
4187 "%sAppArmorProfile: %s%s\n",
4188 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
4190 if (c
->smack_process_label
)
4192 "%sSmackProcessLabel: %s%s\n",
4193 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
4195 if (c
->personality
!= PERSONALITY_INVALID
)
4197 "%sPersonality: %s\n",
4198 prefix
, strna(personality_to_string(c
->personality
)));
4201 "%sLockPersonality: %s\n",
4202 prefix
, yes_no(c
->lock_personality
));
4204 if (c
->syscall_filter
) {
4212 "%sSystemCallFilter: ",
4215 if (!c
->syscall_whitelist
)
4219 HASHMAP_FOREACH_KEY(val
, id
, c
->syscall_filter
, j
) {
4220 _cleanup_free_
char *name
= NULL
;
4221 const char *errno_name
= NULL
;
4222 int num
= PTR_TO_INT(val
);
4229 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
4230 fputs(strna(name
), f
);
4233 errno_name
= errno_to_name(num
);
4235 fprintf(f
, ":%s", errno_name
);
4237 fprintf(f
, ":%d", num
);
4245 if (c
->syscall_archs
) {
4252 "%sSystemCallArchitectures:",
4256 SET_FOREACH(id
, c
->syscall_archs
, j
)
4257 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
4262 if (exec_context_restrict_namespaces_set(c
)) {
4263 _cleanup_free_
char *s
= NULL
;
4265 r
= namespace_flag_to_string_many(c
->restrict_namespaces
, &s
);
4267 fprintf(f
, "%sRestrictNamespaces: %s\n",
4271 if (c
->syscall_errno
> 0) {
4272 const char *errno_name
;
4274 fprintf(f
, "%sSystemCallErrorNumber: ", prefix
);
4276 errno_name
= errno_to_name(c
->syscall_errno
);
4278 fprintf(f
, "%s\n", errno_name
);
4280 fprintf(f
, "%d\n", c
->syscall_errno
);
4283 if (c
->apparmor_profile
)
4285 "%sAppArmorProfile: %s%s\n",
4286 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
/* Privilege check for an execution context, keyed on the configured user.
 * The only test visible in this listing compares c->user against the literal
 * strings "root" and "0"; the statements around that test (including every
 * return) are not shown here, so the remaining paths cannot be described
 * from this view. */
4289 bool exec_context_maintains_privileges(const ExecContext
*c
) {
4292 /* Returns true if the process forked off would run under
4293 * an unchanged UID or as root. */
/* The superuser is recognised both by name and by numeric UID string. */
4298 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
/* Computes the I/O priority value that applies to this context.
 * Visible in this listing: a query of the calling process's own priority via
 * ioprio_get(IOPRIO_WHO_PROCESS, 0), and a return of
 * IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4).
 * NOTE(review): the conditions choosing between these results, and any use of
 * the context's own ioprio setting, are not visible here -- confirm against
 * the full body. */
4304 int exec_context_get_effective_ioprio(const ExecContext
*c
) {
4312 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
/* Fixed fallback value: class IOPRIO_CLASS_BE with priority data 4. */
4314 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 4);
/* Releases the extra log fields stored in the context.
 * Frees the iov_base buffer of every entry in c->log_extra_fields, then frees
 * the array itself and resets the element count to zero, leaving the context
 * with an empty field list. */
4319 void exec_context_free_log_extra_fields(ExecContext
*c
) {
4324 for (l
= 0; l
< c
->n_log_extra_fields
; l
++)
4325 free(c
->log_extra_fields
[l
].iov_base
);
/* The pointer is overwritten with mfree()'s return value so it does not keep
 * referring to the released array. */
4326 c
->log_extra_fields
= mfree(c
->log_extra_fields
);
4327 c
->n_log_extra_fields
= 0;
/* Marks the start of a process in an ExecStatus record.
 * The one statement visible in this listing stamps s->start_timestamp with
 * the current time.
 * NOTE(review): how pid is stored is not visible here. */
4330 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
4335 dual_timestamp_get(&s
->start_timestamp
);
/* Records the termination of a process in an ExecStatus record.
 *   s       - status record to update
 *   context - execution context of the process
 *   pid     - PID of the process that exited
 *   code    - exit code classification, forwarded to utmp_put_dead_process()
 *   status  - exit status, forwarded to utmp_put_dead_process()
 * Visible steps: a check whether the record already refers to a different
 * non-zero PID, stamping s->exit_timestamp, writing a utmp "dead process"
 * entry when context->utmp_id is set, and resetting the context's TTY.
 * NOTE(review): the action taken on a PID mismatch and the stores of
 * pid/code/status into *s are not visible in this listing. */
4338 void exec_status_exit(ExecStatus
*s
, const ExecContext
*context
, pid_t pid
, int code
, int status
) {
4341 if (s
->pid
&& s
->pid
!= pid
)
4345 dual_timestamp_get(&s
->exit_timestamp
);
/* utmp bookkeeping only happens for contexts that configured an utmp id. */
4351 if (context
->utmp_id
)
4352 utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
4354 exec_context_tty_reset(context
, NULL
);
/* Writes a human-readable description of an ExecStatus record to f.
 *   s      - status record to print
 *   f      - output stream
 *   prefix - string put in front of every line; NULL is replaced via strempty()
 * Output visible in this listing: a "PID:" line, a "Start Timestamp:" line
 * when the start timestamp is set, and "Exit Timestamp:" / "Exit Status:"
 * lines when the exit timestamp is set. Timestamps are formatted from the
 * realtime member into a local FORMAT_TIMESTAMP_MAX-sized buffer.
 * NOTE(review): some format-string lines and arguments of these fprintf()
 * calls are not visible here. */
4358 void exec_status_dump(const ExecStatus
*s
, FILE *f
, const char *prefix
) {
4359 char buf
[FORMAT_TIMESTAMP_MAX
];
4367 prefix
= strempty(prefix
);
4370 "%sPID: "PID_FMT
"\n",
4373 if (dual_timestamp_is_set(&s
->start_timestamp
))
4375 "%sStart Timestamp: %s\n",
4376 prefix
, format_timestamp(buf
, sizeof(buf
), s
->start_timestamp
.realtime
));
4378 if (dual_timestamp_is_set(&s
->exit_timestamp
))
4380 "%sExit Timestamp: %s\n"
4382 "%sExit Status: %i\n",
4383 prefix
, format_timestamp(buf
, sizeof(buf
), s
->exit_timestamp
.realtime
),
4384 prefix
, sigchld_code_to_string(s
->code
),
/* Builds a single display string from an argument vector.
 * The listing shows two passes over argv with STRV_FOREACH and a per-argument
 * test for whitespace (strpbrk(*a, WHITESPACE)), which presumably decides
 * whether the argument needs quoting -- TODO confirm, the branch bodies are
 * not visible here. The caller in this file (exec_command_dump) treats a NULL
 * result as an out-of-memory condition and owns the returned string. */
4388 static char *exec_command_line(char **argv
) {
4396 STRV_FOREACH(a
, argv
)
4404 STRV_FOREACH(a
, argv
) {
4411 if (strpbrk(*a
, WHITESPACE
)) {
4422 /* FIXME: this doesn't really handle arguments that have
4423 * spaces and ticks in them */
/* Prints one ExecCommand: its command line followed by its exec status.
 *   c      - command to print
 *   f      - output stream
 *   prefix - string put in front of every line; NULL is replaced via strempty()
 * The status block is printed with the prefix extended by one tab so it
 * appears nested under the command line. */
4428 static void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4429 _cleanup_free_
char *cmd
= NULL
;
4430 const char *prefix2
;
4435 prefix
= strempty(prefix
);
4436 prefix2
= strjoina(prefix
, "\t");
4438 cmd
= exec_command_line(c
->argv
);
/* If the command line could not be built, the ENOMEM error text is printed
 * in its place instead of aborting the dump. */
4440 "%sCommand Line: %s\n",
4441 prefix
, cmd
? cmd
: strerror(ENOMEM
));
4443 exec_status_dump(&c
->exec_status
, f
, prefix2
);
/* Prints every command of a linked list of ExecCommand objects.
 *   c      - head of the list; also reused as the iteration cursor
 *   f      - output stream
 *   prefix - string put in front of every line; NULL is replaced via strempty()
 * Each element is printed with exec_command_dump(). */
4446 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4449 prefix
= strempty(prefix
);
4451 LIST_FOREACH(command
, c
, c
)
4452 exec_command_dump(c
, f
, prefix
);
/* Appends command e to the end of the list whose head pointer is *l.
 * The tail is looked up first and e is inserted after it, so existing
 * commands keep their relative order and e becomes the last element.
 * NOTE(review): handling of an empty list is not visible in this listing. */
4455 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
4462 /* It's kind of important, that we keep the order here */
4463 LIST_FIND_TAIL(command
, *l
, end
);
4464 LIST_INSERT_AFTER(command
, *l
, end
, e
);
/* Replaces the argument vector of command c with one built from the variadic
 * arguments, starting with path.
 * A new string vector is created with strv_new_ap() and swapped into c->argv
 * via strv_free_and_replace(), whose result is returned.
 * NOTE(review): the va_start/va_end handling, the allocation-failure path and
 * any update of other ExecCommand fields are not visible in this listing. */
4469 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
4477 l
= strv_new_ap(path
, ap
);
4492 return strv_free_and_replace(c
->argv
, l
);
/* Appends further arguments to the argument vector of command c.
 * The variadic arguments (starting with path) are collected into a temporary
 * string vector, which is then added to c->argv with strv_extend_strv(); the
 * final `false` argument is passed through to that helper unchanged.
 * The temporary vector is declared _cleanup_strv_free_.
 * NOTE(review): error checks and the return statements are not visible in
 * this listing. */
4495 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
4496 _cleanup_strv_free_
char **l
= NULL
;
4504 l
= strv_new_ap(path
, ap
);
4510 r
= strv_extend_strv(&c
->argv
, l
, false);
/* Job body used to delete a temporary directory tree.
 *   p - path string; it is adopted into a _cleanup_free_ pointer, so this
 *       function takes over the string from the caller.
 * Removal is best-effort: the result of rm_rf() is explicitly discarded.
 * NOTE(review): the return statement is not visible in this listing. */
4517 static void *remove_tmpdir_thread(void *p
) {
4518 _cleanup_free_
char *path
= p
;
4520 (void) rm_rf(path
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
/* Tears down an ExecRuntime object and releases what it owns.
 *   rt      - object to free
 *   destroy - when true, the private tmp directories are also deleted
 * Steps visible in this listing:
 *   - remove the entry keyed by rt->id from rt->manager->exec_runtime_by_id;
 *   - when destroy is set, pass tmp_dir and var_tmp_dir to
 *     remove_tmpdir_thread() through asynchronous_job(), logging a warning
 *     when that fails;
 *   - free id, tmp_dir and var_tmp_dir with mfree() and close both ends of
 *     netns_storage_socket.
 * remove_tmpdir_thread() adopts its argument into a _cleanup_free_ pointer,
 * which is presumably why the path fields are reset to NULL after the
 * hand-off rather than freed here.
 * NOTE(review): several lines of the body (guards, error checks, the return
 * value) are not visible in this listing. */
4524 static ExecRuntime
* exec_runtime_free(ExecRuntime
*rt
, bool destroy
) {
4531 (void) hashmap_remove(rt
->manager
->exec_runtime_by_id
, rt
->id
);
4533 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4534 if (destroy
&& rt
->tmp_dir
) {
4535 log_debug("Spawning thread to nuke %s", rt
->tmp_dir
);
4537 r
= asynchronous_job(remove_tmpdir_thread
, rt
->tmp_dir
);
4539 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->tmp_dir
);
4546 if (destroy
&& rt
->var_tmp_dir
) {
4547 log_debug("Spawning thread to nuke %s", rt
->var_tmp_dir
);
4549 r
= asynchronous_job(remove_tmpdir_thread
, rt
->var_tmp_dir
);
4551 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->var_tmp_dir
);
4552 free(rt
->var_tmp_dir
);
4555 rt
->var_tmp_dir
= NULL
;
/* Whatever is still owned at this point is released unconditionally. */
4558 rt
->id
= mfree(rt
->id
);
4559 rt
->tmp_dir
= mfree(rt
->tmp_dir
);
4560 rt
->var_tmp_dir
= mfree(rt
->var_tmp_dir
);
4561 safe_close_pair(rt
->netns_storage_socket
);
/* Cleanup helper taking a pointer to an ExecRuntime pointer; it is used with
 * _cleanup_() in this file. Frees the object with destroy=false, i.e. without
 * deleting the tmp directories on disk.
 * NOTE(review): a NULL guard, if any, is not visible in this listing. */
4565 static void exec_runtime_freep(ExecRuntime
**rt
) {
4567 (void) exec_runtime_free(*rt
, false);
/* Allocates a zero-initialised ExecRuntime and stores it in *rt.
 * Both netns_storage_socket entries are set to -1 so that no file descriptor
 * is recorded as open in a fresh object.
 * NOTE(review): the allocation-failure check and the return values are not
 * visible in this listing. */
4570 static int exec_runtime_allocate(ExecRuntime
**rt
) {
4573 *rt
= new0(ExecRuntime
, 1);
4577 (*rt
)->netns_storage_socket
[0] = (*rt
)->netns_storage_socket
[1] = -1;
/* Creates an ExecRuntime object from the given pieces and registers it in the
 * manager's exec_runtime_by_id hashmap under its id.
 *   tmp_dir, var_tmp_dir - private tmp directory paths; copied with strdup()
 *   netns_storage_socket - pair of fds; the values are copied into the object
 *   ret                  - output pointer for the new object
 * The body also uses `m` (manager) and `id`, whose parameter declarations are
 * not visible in this listing.
 * The new object is held in a _cleanup_(exec_runtime_freep) variable while it
 * is being built.
 * NOTE(review): error checks, the conditions guarding the tmp_dir copy, the
 * assignment to *ret and the return values are not visible here. */
4581 static int exec_runtime_add(
4584 const char *tmp_dir
,
4585 const char *var_tmp_dir
,
4586 const int netns_storage_socket
[2],
4587 ExecRuntime
**ret
) {
4589 _cleanup_(exec_runtime_freep
) ExecRuntime
*rt
= NULL
;
4595 r
= hashmap_ensure_allocated(&m
->exec_runtime_by_id
, &string_hash_ops
);
4599 r
= exec_runtime_allocate(&rt
);
4603 rt
->id
= strdup(id
);
4608 rt
->tmp_dir
= strdup(tmp_dir
);
4612 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4613 assert(var_tmp_dir
);
4614 rt
->var_tmp_dir
= strdup(var_tmp_dir
);
4615 if (!rt
->var_tmp_dir
)
4619 if (netns_storage_socket
) {
4620 rt
->netns_storage_socket
[0] = netns_storage_socket
[0];
4621 rt
->netns_storage_socket
[1] = netns_storage_socket
[1];
/* The hashmap key is the object's own id string, not the caller's copy. */
4624 r
= hashmap_put(m
->exec_runtime_by_id
, rt
->id
, rt
);
4633 /* do not remove created ExecRuntime object when the operation succeeds. */
/* Sets up the runtime resources a context needs and wraps them in a new
 * ExecRuntime object.
 *   m   - manager that will track the object
 *   c   - execution context; only private_tmp and private_network are read in
 *         the lines visible here
 *   id  - identifier for the object, also passed to setup_tmp_dirs()
 *   ret - output pointer, forwarded to exec_runtime_add()
 * When neither private_network nor private_tmp is set no object is needed.
 * Otherwise private tmp directories are prepared and/or an AF_UNIX datagram
 * socket pair (close-on-exec) is created, and everything is handed to
 * exec_runtime_add().
 * NOTE(review): error checks and return values are not visible in this
 * listing. */
4638 static int exec_runtime_make(Manager
*m
, const ExecContext
*c
, const char *id
, ExecRuntime
**ret
) {
4639 _cleanup_free_
char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
4640 _cleanup_close_pair_
int netns_storage_socket
[2] = {-1, -1};
4647 /* It is not necessary to create ExecRuntime object. */
4648 if (!c
->private_network
&& !c
->private_tmp
)
4651 if (c
->private_tmp
) {
4652 r
= setup_tmp_dirs(id
, &tmp_dir
, &var_tmp_dir
);
4657 if (c
->private_network
) {
4658 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, netns_storage_socket
) < 0)
4662 r
= exec_runtime_add(m
, id
, tmp_dir
, var_tmp_dir
, netns_storage_socket
, ret
);
/* exec_runtime_add() copied the fd numbers into the object; the local pair is
 * reset to -1, presumably so the _cleanup_close_pair_ handler leaves the
 * descriptors open. */
4667 netns_storage_socket
[0] = -1;
4668 netns_storage_socket
[1] = -1;
/* Looks up the ExecRuntime registered under id, creating one if needed, and
 * takes a reference on it.
 *   m      - manager holding the exec_runtime_by_id hashmap
 *   c      - execution context used when a new object has to be made
 *   id     - lookup key
 *   create - NOTE(review): presumably controls whether a missing object may
 *            be created; its use is not visible in this listing
 *   ret    - output pointer for the object
 * NOTE(review): the branch conditions, the reference-count update and the
 * return values are not visible here; the existing comments describe them. */
4672 int exec_runtime_acquire(Manager
*m
, const ExecContext
*c
, const char *id
, bool create
, ExecRuntime
**ret
) {
4680 rt
= hashmap_get(m
->exec_runtime_by_id
, id
);
4682 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
4688 /* If not found, then create a new object. */
4689 r
= exec_runtime_make(m
, c
, id
, &rt
);
4691 /* When r == 0, it is not necessary to create ExecRuntime object. */
4695 /* increment reference counter. */
/* Drops a reference to an ExecRuntime object.
 *   rt      - object to release; its n_ref must be positive (asserted)
 *   destroy - forwarded to exec_runtime_free()
 * The visible return path frees the object and returns whatever
 * exec_runtime_free() returns.
 * NOTE(review): the counter decrement and the path taken while references
 * remain are not visible in this listing. */
4701 ExecRuntime
*exec_runtime_unref(ExecRuntime
*rt
, bool destroy
) {
4705 assert(rt
->n_ref
> 0);
4711 return exec_runtime_free(rt
, destroy
);
/* Writes every ExecRuntime object known to the manager to f.
 *   m   - manager whose exec_runtime_by_id hashmap is iterated
 *   f   - output stream
 *   fds - fd set that receives duplicates of the netns socket fds
 * Per object the text starts with "exec-runtime=<id>" followed by optional
 * space-separated fields: " tmp-dir=", " var-tmp-dir=", " netns-socket-0="
 * and " netns-socket-1=". For the sockets the number written is the duplicate
 * returned by fdset_put_dup(), not the original descriptor. These field names
 * are the ones exec_runtime_deserialize_one() parses.
 * NOTE(review): the guard for tmp-dir, the error checks on the duplicates,
 * the line terminator and the return value are not visible in this listing. */
4714 int exec_runtime_serialize(const Manager
*m
, FILE *f
, FDSet
*fds
) {
4722 HASHMAP_FOREACH(rt
, m
->exec_runtime_by_id
, i
) {
4723 fprintf(f
, "exec-runtime=%s", rt
->id
);
4726 fprintf(f
, " tmp-dir=%s", rt
->tmp_dir
);
4728 if (rt
->var_tmp_dir
)
4729 fprintf(f
, " var-tmp-dir=%s", rt
->var_tmp_dir
);
4731 if (rt
->netns_storage_socket
[0] >= 0) {
4734 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
4738 fprintf(f
, " netns-socket-0=%i", copy
);
4741 if (rt
->netns_storage_socket
[1] >= 0) {
4744 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
4748 fprintf(f
, " netns-socket-1=%i", copy
);
/* Applies one key/value pair of the old per-unit serialization format to the
 * ExecRuntime object of unit u.
 *   u     - unit the pair belongs to; u->id is the key in the manager's
 *           exec_runtime_by_id hashmap
 *   key   - one of "tmp-dir", "var-tmp-dir", "netns-socket-0",
 *           "netns-socket-1"
 *   value - path string, or the decimal number of an fd contained in fds
 *   fds   - fd set the socket descriptors are taken out of
 * An existing object is looked up first; a fresh one is allocated into
 * rt_create, which is _cleanup_(exec_runtime_freep), and registered in the
 * hashmap at the end.
 * NOTE(review): the branch conditions between lookup and allocation, the
 * error returns and the final hand-over of rt_create are not visible in this
 * listing. */
4757 int exec_runtime_deserialize_compat(Unit
*u
, const char *key
, const char *value
, FDSet
*fds
) {
4758 _cleanup_(exec_runtime_freep
) ExecRuntime
*rt_create
= NULL
;
4762 /* This is for the migration from old (v237 or earlier) deserialization text.
4763 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
4764 * Even if the ExecRuntime object was originally created by another unit, we cannot tell
4765 * that from the serialized text, so we always create a new object owned by this unit. */
4771 /* Manager manages ExecRuntime objects by the unit id.
4772 * So, we omit the serialized text when the unit does not have id (yet?)... */
4773 if (isempty(u
->id
)) {
/* NOTE(review): the check is on the unit id, while the message speaks of an
 * invocation ID -- confirm which wording is intended. */
4774 log_unit_debug(u
, "Invocation ID not found. Dropping runtime parameter.");
4778 r
= hashmap_ensure_allocated(&u
->manager
->exec_runtime_by_id
, &string_hash_ops
);
4780 log_unit_debug_errno(u
, r
, "Failed to allocate storage for runtime parameter: %m");
4784 rt
= hashmap_get(u
->manager
->exec_runtime_by_id
, u
->id
);
4786 r
= exec_runtime_allocate(&rt_create
);
4790 rt_create
->id
= strdup(u
->id
);
/* Path values are duplicated before they replace the stored string. */
4797 if (streq(key
, "tmp-dir")) {
4800 copy
= strdup(value
);
4804 free_and_replace(rt
->tmp_dir
, copy
);
4806 } else if (streq(key
, "var-tmp-dir")) {
4809 copy
= strdup(value
);
4813 free_and_replace(rt
->var_tmp_dir
, copy
);
/* Socket values must parse as an integer and name an fd present in fds; the
 * previously stored fd is closed before the new one is taken out of the set. */
4815 } else if (streq(key
, "netns-socket-0")) {
4818 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
)) {
4819 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
4823 safe_close(rt
->netns_storage_socket
[0]);
4824 rt
->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
4826 } else if (streq(key
, "netns-socket-1")) {
4829 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
)) {
4830 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
4834 safe_close(rt
->netns_storage_socket
[1]);
4835 rt
->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
4839 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
4841 r
= hashmap_put(u
->manager
->exec_runtime_by_id
, rt_create
->id
, rt_create
);
4843 log_unit_debug_errno(u
, r
, "Failed to put runtime parameter to manager's storage: %m");
4847 rt_create
->manager
= u
->manager
;
/* Parses one serialized "exec-runtime=" value and registers the described
 * ExecRuntime object with the manager.
 *   m     - manager that receives the object
 *   value - text of the form "<id>[ tmp-dir=..][ var-tmp-dir=..]
 *           [ netns-socket-0=N][ netns-socket-1=N]"; fields are separated by
 *           single spaces and matched with startswith() in this fixed order
 *   fds   - fd set that must contain the socket fds named in the text
 * Each token is copied with strndupa(), i.e. the copies are stack
 * allocations. A socket number that does not parse or is not in fds is
 * logged at debug level. Accepted fds are taken out of fds and passed, with
 * the other fields, to exec_runtime_add(); failure there is logged at debug
 * level only.
 * NOTE(review): the advancing of p between fields and the early-exit paths
 * are not visible in this listing. */
4856 void exec_runtime_deserialize_one(Manager
*m
, const char *value
, FDSet
*fds
) {
4857 char *id
= NULL
, *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
4858 int r
, fd0
= -1, fd1
= -1;
4859 const char *p
, *v
= value
;
/* The id is everything up to the first space. */
4866 n
= strcspn(v
, " ");
4867 id
= strndupa(v
, n
);
4872 v
= startswith(p
, "tmp-dir=");
4874 n
= strcspn(v
, " ");
4875 tmp_dir
= strndupa(v
, n
);
4881 v
= startswith(p
, "var-tmp-dir=");
4883 n
= strcspn(v
, " ");
4884 var_tmp_dir
= strndupa(v
, n
);
4890 v
= startswith(p
, "netns-socket-0=");
4894 n
= strcspn(v
, " ");
4895 buf
= strndupa(v
, n
);
4896 if (safe_atoi(buf
, &fd0
) < 0 || !fdset_contains(fds
, fd0
)) {
4897 log_debug("Unable to process exec-runtime netns fd specification.");
4900 fd0
= fdset_remove(fds
, fd0
);
4906 v
= startswith(p
, "netns-socket-1=");
4910 n
= strcspn(v
, " ");
4911 buf
= strndupa(v
, n
);
4912 if (safe_atoi(buf
, &fd1
) < 0 || !fdset_contains(fds
, fd1
)) {
4913 log_debug("Unable to process exec-runtime netns fd specification.");
4916 fd1
= fdset_remove(fds
, fd1
);
/* ret is NULL: the caller does not need the new object back. */
4921 r
= exec_runtime_add(m
, id
, tmp_dir
, var_tmp_dir
, (int[]) { fd0
, fd1
}, NULL
);
4923 log_debug_errno(r
, "Failed to add exec-runtime: %m");
/* Walks all ExecRuntime objects registered with the manager and frees them
 * with destroy=false, so the tmp directories on disk are left alone.
 * NOTE(review): per the existing comment only unreferenced objects are meant
 * to be freed; the check that skips referenced ones is not visible in this
 * listing. */
4928 void exec_runtime_vacuum(Manager
*m
) {
4934 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
4936 HASHMAP_FOREACH(rt
, m
->exec_runtime_by_id
, i
) {
4940 (void) exec_runtime_free(rt
, false);
/* Names of the ExecInput values in text form, indexed by enum value.
 * DEFINE_STRING_TABLE_LOOKUP presumably generates the conversion helpers,
 * such as the exec_input_to_string() used by exec_context_dump() -- confirm
 * in string-table.h. */
4944 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
4945 [EXEC_INPUT_NULL
] = "null",
4946 [EXEC_INPUT_TTY
] = "tty",
4947 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
4948 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
4949 [EXEC_INPUT_SOCKET
] = "socket",
4950 [EXEC_INPUT_NAMED_FD
] = "fd",
4951 [EXEC_INPUT_DATA
] = "data",
4952 [EXEC_INPUT_FILE
] = "file",
4955 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
/* Names of the ExecOutput values in text form, indexed by enum value.
 * The "+console" variants are separate enum values, not combinations built
 * at run time. DEFINE_STRING_TABLE_LOOKUP presumably generates the conversion
 * helpers, such as the exec_output_to_string() used by exec_context_dump()
 * -- confirm in string-table.h. */
4957 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
4958 [EXEC_OUTPUT_INHERIT
] = "inherit",
4959 [EXEC_OUTPUT_NULL
] = "null",
4960 [EXEC_OUTPUT_TTY
] = "tty",
4961 [EXEC_OUTPUT_SYSLOG
] = "syslog",
4962 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE
] = "syslog+console",
4963 [EXEC_OUTPUT_KMSG
] = "kmsg",
4964 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
4965 [EXEC_OUTPUT_JOURNAL
] = "journal",
4966 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
4967 [EXEC_OUTPUT_SOCKET
] = "socket",
4968 [EXEC_OUTPUT_NAMED_FD
] = "fd",
4969 [EXEC_OUTPUT_FILE
] = "file",
4972 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
/* Names of the ExecUtmpMode values in text form, indexed by enum value.
 * DEFINE_STRING_TABLE_LOOKUP presumably generates the matching conversion
 * helpers -- confirm in string-table.h. */
4974 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
4975 [EXEC_UTMP_INIT
] = "init",
4976 [EXEC_UTMP_LOGIN
] = "login",
4977 [EXEC_UTMP_USER
] = "user",
4980 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
/* Names of the ExecPreserveMode values in text form, indexed by enum value.
 * The _WITH_BOOLEAN lookup variant is given EXEC_PRESERVE_YES as its extra
 * argument; presumably boolean spellings are accepted on parsing and a true
 * value maps to EXEC_PRESERVE_YES -- confirm in string-table.h. */
4982 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
4983 [EXEC_PRESERVE_NO
] = "no",
4984 [EXEC_PRESERVE_YES
] = "yes",
4985 [EXEC_PRESERVE_RESTART
] = "restart",
4988 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
/* Names of the ExecDirectoryType values in text form, indexed by enum value.
 * exec_context_dump() prints these names both on their own and with a "Mode"
 * suffix appended. DEFINE_STRING_TABLE_LOOKUP presumably generates the
 * exec_directory_type_to_string() helper used there -- confirm in
 * string-table.h. */
4990 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
4991 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
4992 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
4993 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
4994 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
4995 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
4998 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);
/* Names of the ExecKeyringMode values in text form, indexed by enum value.
 * DEFINE_STRING_TABLE_LOOKUP presumably generates the conversion helpers,
 * such as the exec_keyring_mode_to_string() used by exec_context_dump() --
 * confirm in string-table.h. */
5000 static const char* const exec_keyring_mode_table
[_EXEC_KEYRING_MODE_MAX
] = {
5001 [EXEC_KEYRING_INHERIT
] = "inherit",
5002 [EXEC_KEYRING_PRIVATE
] = "private",
5003 [EXEC_KEYRING_SHARED
] = "shared",
5006 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode
, ExecKeyringMode
);