2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/capability.h>
28 #include <sys/eventfd.h>
30 #include <sys/personality.h>
31 #include <sys/prctl.h>
33 #include <sys/socket.h>
35 #include <sys/types.h>
41 #include <security/pam_appl.h>
45 #include <selinux/selinux.h>
53 #include <sys/apparmor.h>
56 #include "sd-messages.h"
59 #include "alloc-util.h"
61 #include "apparmor-util.h"
66 #include "capability-util.h"
69 #include "errno-list.h"
71 #include "exit-status.h"
74 #include "format-util.h"
76 #include "glob-util.h"
83 #include "namespace.h"
84 #include "parse-util.h"
85 #include "path-util.h"
86 #include "process-util.h"
87 #include "rlimit-util.h"
90 #include "seccomp-util.h"
92 #include "securebits.h"
93 #include "securebits-util.h"
94 #include "selinux-util.h"
95 #include "signal-util.h"
96 #include "smack-util.h"
98 #include "string-table.h"
99 #include "string-util.h"
101 #include "syslog-util.h"
102 #include "terminal-util.h"
104 #include "user-util.h"
106 #include "utmp-wtmp.h"
108 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
109 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
111 /* This assumes there is a 'tty' group */
112 #define TTY_MODE 0620
114 #define SNDBUF_SIZE (8*1024*1024)
116 static int shift_fds(int fds
[], unsigned n_fds
) {
117 int start
, restart_from
;
122 /* Modifies the fds array! (sorts it) */
132 for (i
= start
; i
< (int) n_fds
; i
++) {
135 /* Already at right index? */
139 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
146 /* Hmm, the fd we wanted isn't free? Then
147 * let's remember that and try again from here */
148 if (nfd
!= i
+3 && restart_from
< 0)
152 if (restart_from
< 0)
155 start
= restart_from
;
161 static int flags_fds(const int fds
[], unsigned n_storage_fds
, unsigned n_socket_fds
, bool nonblock
) {
165 n_fds
= n_storage_fds
+ n_socket_fds
;
171 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
172 * O_NONBLOCK only applies to socket activation though. */
174 for (i
= 0; i
< n_fds
; i
++) {
176 if (i
< n_socket_fds
) {
177 r
= fd_nonblock(fds
[i
], nonblock
);
182 /* We unconditionally drop FD_CLOEXEC from the fds,
183 * since after all we want to pass these fds to our
186 r
= fd_cloexec(fds
[i
], false);
194 static const char *exec_context_tty_path(const ExecContext
*context
) {
197 if (context
->stdio_as_fds
)
200 if (context
->tty_path
)
201 return context
->tty_path
;
203 return "/dev/console";
206 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
211 path
= exec_context_tty_path(context
);
213 if (context
->tty_vhangup
) {
214 if (p
&& p
->stdin_fd
>= 0)
215 (void) terminal_vhangup_fd(p
->stdin_fd
);
217 (void) terminal_vhangup(path
);
220 if (context
->tty_reset
) {
221 if (p
&& p
->stdin_fd
>= 0)
222 (void) reset_terminal_fd(p
->stdin_fd
, true);
224 (void) reset_terminal(path
);
227 if (context
->tty_vt_disallocate
&& path
)
228 (void) vt_disallocate(path
);
231 static bool is_terminal_input(ExecInput i
) {
234 EXEC_INPUT_TTY_FORCE
,
235 EXEC_INPUT_TTY_FAIL
);
238 static bool is_terminal_output(ExecOutput o
) {
241 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
242 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
243 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
246 static bool is_syslog_output(ExecOutput o
) {
249 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
);
252 static bool is_kmsg_output(ExecOutput o
) {
255 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
258 static bool exec_context_needs_term(const ExecContext
*c
) {
261 /* Return true if the execution context suggests we should set $TERM to something useful. */
263 if (is_terminal_input(c
->std_input
))
266 if (is_terminal_output(c
->std_output
))
269 if (is_terminal_output(c
->std_error
))
272 return !!c
->tty_path
;
275 static int open_null_as(int flags
, int nfd
) {
280 fd
= open("/dev/null", flags
|O_NOCTTY
);
285 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
293 static int connect_journal_socket(int fd
, uid_t uid
, gid_t gid
) {
294 static const union sockaddr_union sa
= {
295 .un
.sun_family
= AF_UNIX
,
296 .un
.sun_path
= "/run/systemd/journal/stdout",
298 uid_t olduid
= UID_INVALID
;
299 gid_t oldgid
= GID_INVALID
;
302 if (gid_is_valid(gid
)) {
305 if (setegid(gid
) < 0)
309 if (uid_is_valid(uid
)) {
312 if (seteuid(uid
) < 0) {
318 r
= connect(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
)) < 0 ? -errno
: 0;
320 /* If we fail to restore the uid or gid, things will likely
321 fail later on. This should only happen if an LSM interferes. */
323 if (uid_is_valid(uid
))
324 (void) seteuid(olduid
);
327 if (gid_is_valid(gid
))
328 (void) setegid(oldgid
);
333 static int connect_logger_as(
335 const ExecContext
*context
,
336 const ExecParameters
*params
,
347 assert(output
< _EXEC_OUTPUT_MAX
);
351 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
355 r
= connect_journal_socket(fd
, uid
, gid
);
359 if (shutdown(fd
, SHUT_RD
) < 0) {
364 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
374 context
->syslog_identifier
?: ident
,
375 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
376 context
->syslog_priority
,
377 !!context
->syslog_level_prefix
,
378 is_syslog_output(output
),
379 is_kmsg_output(output
),
380 is_terminal_output(output
));
385 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
390 static int open_terminal_as(const char *path
, mode_t mode
, int nfd
) {
396 fd
= open_terminal(path
, mode
| O_NOCTTY
);
401 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
409 static int fixup_input(ExecInput std_input
, int socket_fd
, bool apply_tty_stdin
) {
411 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
412 return EXEC_INPUT_NULL
;
414 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
415 return EXEC_INPUT_NULL
;
420 static int fixup_output(ExecOutput std_output
, int socket_fd
) {
422 if (std_output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
423 return EXEC_OUTPUT_INHERIT
;
428 static int setup_input(
429 const ExecContext
*context
,
430 const ExecParameters
*params
,
432 int named_iofds
[3]) {
439 if (params
->stdin_fd
>= 0) {
440 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
443 /* Try to make this the controlling tty, if it is a tty, and reset it */
444 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
445 (void) reset_terminal_fd(STDIN_FILENO
, true);
450 i
= fixup_input(context
->std_input
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
454 case EXEC_INPUT_NULL
:
455 return open_null_as(O_RDONLY
, STDIN_FILENO
);
458 case EXEC_INPUT_TTY_FORCE
:
459 case EXEC_INPUT_TTY_FAIL
: {
462 fd
= acquire_terminal(exec_context_tty_path(context
),
463 i
== EXEC_INPUT_TTY_FAIL
,
464 i
== EXEC_INPUT_TTY_FORCE
,
470 if (fd
!= STDIN_FILENO
) {
471 r
= dup2(fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
479 case EXEC_INPUT_SOCKET
:
480 return dup2(socket_fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
482 case EXEC_INPUT_NAMED_FD
:
483 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
484 return dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
487 assert_not_reached("Unknown input type");
491 static int setup_output(
493 const ExecContext
*context
,
494 const ExecParameters
*params
,
501 dev_t
*journal_stream_dev
,
502 ino_t
*journal_stream_ino
) {
512 assert(journal_stream_dev
);
513 assert(journal_stream_ino
);
515 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
517 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
520 return STDOUT_FILENO
;
523 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
524 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
527 return STDERR_FILENO
;
530 i
= fixup_input(context
->std_input
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
531 o
= fixup_output(context
->std_output
, socket_fd
);
533 if (fileno
== STDERR_FILENO
) {
535 e
= fixup_output(context
->std_error
, socket_fd
);
537 /* This expects the input and output are already set up */
539 /* Don't change the stderr file descriptor if we inherit all
540 * the way and are not on a tty */
541 if (e
== EXEC_OUTPUT_INHERIT
&&
542 o
== EXEC_OUTPUT_INHERIT
&&
543 i
== EXEC_INPUT_NULL
&&
544 !is_terminal_input(context
->std_input
) &&
548 /* Duplicate from stdout if possible */
549 if ((e
== o
&& e
!= EXEC_OUTPUT_NAMED_FD
) || e
== EXEC_OUTPUT_INHERIT
)
550 return dup2(STDOUT_FILENO
, fileno
) < 0 ? -errno
: fileno
;
554 } else if (o
== EXEC_OUTPUT_INHERIT
) {
555 /* If input got downgraded, inherit the original value */
556 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
557 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
559 /* If the input is connected to anything that's not a /dev/null, inherit that... */
560 if (i
!= EXEC_INPUT_NULL
)
561 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
563 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
567 /* We need to open /dev/null here anew, to get the right access mode. */
568 return open_null_as(O_WRONLY
, fileno
);
573 case EXEC_OUTPUT_NULL
:
574 return open_null_as(O_WRONLY
, fileno
);
576 case EXEC_OUTPUT_TTY
:
577 if (is_terminal_input(i
))
578 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
580 /* We don't reset the terminal if this is just about output */
581 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
583 case EXEC_OUTPUT_SYSLOG
:
584 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE
:
585 case EXEC_OUTPUT_KMSG
:
586 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
587 case EXEC_OUTPUT_JOURNAL
:
588 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
589 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
591 log_unit_error_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m", fileno
== STDOUT_FILENO
? "stdout" : "stderr");
592 r
= open_null_as(O_WRONLY
, fileno
);
596 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
597 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
598 * services to detect whether they are connected to the journal or not.
600 * If both stdout and stderr are connected to a stream then let's make sure to store the data
601 * about STDERR as that's usually the best way to do logging. */
603 if (fstat(fileno
, &st
) >= 0 &&
604 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
605 *journal_stream_dev
= st
.st_dev
;
606 *journal_stream_ino
= st
.st_ino
;
611 case EXEC_OUTPUT_SOCKET
:
612 assert(socket_fd
>= 0);
613 return dup2(socket_fd
, fileno
) < 0 ? -errno
: fileno
;
615 case EXEC_OUTPUT_NAMED_FD
:
616 (void) fd_nonblock(named_iofds
[fileno
], false);
617 return dup2(named_iofds
[fileno
], fileno
) < 0 ? -errno
: fileno
;
620 assert_not_reached("Unknown error type");
624 static int chown_terminal(int fd
, uid_t uid
) {
629 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
633 /* This might fail. What matters are the results. */
634 (void) fchown(fd
, uid
, -1);
635 (void) fchmod(fd
, TTY_MODE
);
637 if (fstat(fd
, &st
) < 0)
640 if (st
.st_uid
!= uid
|| (st
.st_mode
& 0777) != TTY_MODE
)
646 static int setup_confirm_stdio(const char *vc
, int *_saved_stdin
, int *_saved_stdout
) {
647 _cleanup_close_
int fd
= -1, saved_stdin
= -1, saved_stdout
= -1;
650 assert(_saved_stdin
);
651 assert(_saved_stdout
);
653 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
657 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
658 if (saved_stdout
< 0)
661 fd
= acquire_terminal(vc
, false, false, false, DEFAULT_CONFIRM_USEC
);
665 r
= chown_terminal(fd
, getuid());
669 r
= reset_terminal_fd(fd
, true);
673 if (dup2(fd
, STDIN_FILENO
) < 0)
676 if (dup2(fd
, STDOUT_FILENO
) < 0)
683 *_saved_stdin
= saved_stdin
;
684 *_saved_stdout
= saved_stdout
;
686 saved_stdin
= saved_stdout
= -1;
691 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
694 if (err
== -ETIMEDOUT
)
695 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
698 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
702 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
703 _cleanup_close_
int fd
= -1;
707 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
711 write_confirm_error_fd(err
, fd
, u
);
714 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
718 assert(saved_stdout
);
722 if (*saved_stdin
>= 0)
723 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
726 if (*saved_stdout
>= 0)
727 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
730 *saved_stdin
= safe_close(*saved_stdin
);
731 *saved_stdout
= safe_close(*saved_stdout
);
737 CONFIRM_PRETEND_FAILURE
= -1,
738 CONFIRM_PRETEND_SUCCESS
= 0,
742 static int ask_for_confirmation(const char *vc
, Unit
*u
, const char *cmdline
) {
743 int saved_stdout
= -1, saved_stdin
= -1, r
;
744 _cleanup_free_
char *e
= NULL
;
747 /* For any internal errors, assume a positive response. */
748 r
= setup_confirm_stdio(vc
, &saved_stdin
, &saved_stdout
);
750 write_confirm_error(r
, vc
, u
);
751 return CONFIRM_EXECUTE
;
754 /* confirm_spawn might have been disabled while we were sleeping. */
755 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
760 e
= ellipsize(cmdline
, 60, 100);
768 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
770 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
777 printf("Resuming normal execution.\n");
778 manager_disable_confirm_spawn();
782 unit_dump(u
, stdout
, " ");
783 continue; /* ask again */
785 printf("Failing execution.\n");
786 r
= CONFIRM_PRETEND_FAILURE
;
789 printf(" c - continue, proceed without asking anymore\n"
790 " D - dump, show the state of the unit\n"
791 " f - fail, don't execute the command and pretend it failed\n"
793 " i - info, show a short summary of the unit\n"
794 " j - jobs, show jobs that are in progress\n"
795 " s - skip, don't execute the command and pretend it succeeded\n"
796 " y - yes, execute the command\n");
797 continue; /* ask again */
799 printf(" Description: %s\n"
802 u
->id
, u
->description
, cmdline
);
803 continue; /* ask again */
805 manager_dump_jobs(u
->manager
, stdout
, " ");
806 continue; /* ask again */
808 /* 'n' was removed in favor of 'f'. */
809 printf("Didn't understand 'n', did you mean 'f'?\n");
810 continue; /* ask again */
812 printf("Skipping execution.\n");
813 r
= CONFIRM_PRETEND_SUCCESS
;
819 assert_not_reached("Unhandled choice");
825 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
829 static int get_fixed_user(const ExecContext
*c
, const char **user
,
830 uid_t
*uid
, gid_t
*gid
,
831 const char **home
, const char **shell
) {
840 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
841 * (i.e. are "/" or "/bin/nologin"). */
844 r
= get_user_creds_clean(&name
, uid
, gid
, home
, shell
);
852 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
862 r
= get_group_creds(&name
, gid
);
870 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
871 const char *group
, gid_t gid
,
872 gid_t
**supplementary_gids
, int *ngids
) {
876 bool keep_groups
= false;
877 gid_t
*groups
= NULL
;
878 _cleanup_free_ gid_t
*l_gids
= NULL
;
883 * If user is given, then lookup GID and supplementary groups list.
884 * We avoid NSS lookups for gid=0. Also we have to initialize groups
885 * here and as early as possible so we keep the list of supplementary
886 * groups of the caller.
888 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
889 /* First step, initialize groups from /etc/group */
890 if (initgroups(user
, gid
) < 0)
896 if (!c
->supplementary_groups
)
900 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
901 * be positive, otherwise fail.
904 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
905 if (ngroups_max
<= 0) {
909 return -EOPNOTSUPP
; /* For all other values */
912 l_gids
= new(gid_t
, ngroups_max
);
918 * Lookup the list of groups that the user belongs to, we
919 * avoid NSS lookups here too for gid=0.
922 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
927 STRV_FOREACH(i
, c
->supplementary_groups
) {
930 if (k
>= ngroups_max
)
934 r
= get_group_creds(&g
, l_gids
+k
);
942 * Sets ngids to zero to drop all supplementary groups, happens
943 * when we are under root and SupplementaryGroups= is empty.
950 /* Otherwise get the final list of supplementary groups */
951 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
955 *supplementary_gids
= groups
;
963 static int enforce_groups(const ExecContext
*context
, gid_t gid
,
964 gid_t
*supplementary_gids
, int ngids
) {
969 /* Handle SupplementaryGroups= even if it is empty */
970 if (context
->supplementary_groups
) {
971 r
= maybe_setgroups(ngids
, supplementary_gids
);
976 if (gid_is_valid(gid
)) {
977 /* Then set our gids */
978 if (setresgid(gid
, gid
, gid
) < 0)
985 static int enforce_user(const ExecContext
*context
, uid_t uid
) {
988 if (!uid_is_valid(uid
))
991 /* Sets (but doesn't look up) the uid and make sure we keep the
992 * capabilities while doing so. */
994 if (context
->capability_ambient_set
!= 0) {
996 /* First step: If we need to keep capabilities but
997 * drop privileges we need to make sure we keep our
998 * caps, while we drop privileges. */
1000 int sb
= context
->secure_bits
| 1<<SECURE_KEEP_CAPS
;
1002 if (prctl(PR_GET_SECUREBITS
) != sb
)
1003 if (prctl(PR_SET_SECUREBITS
, sb
) < 0)
1008 /* Second step: actually set the uids */
1009 if (setresuid(uid
, uid
, uid
) < 0)
1012 /* At this point we should have all necessary capabilities but
1013 are otherwise a normal user. However, the caps might have been
1014 corrupted due to the setresuid(), so we need to clean them up
1015 later. This is done outside of this call. */
1022 static int null_conv(
1024 const struct pam_message
**msg
,
1025 struct pam_response
**resp
,
1026 void *appdata_ptr
) {
1028 /* We don't support conversations */
1030 return PAM_CONV_ERR
;
1035 static int setup_pam(
1042 int fds
[], unsigned n_fds
) {
1046 static const struct pam_conv conv
= {
1051 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1052 pam_handle_t
*handle
= NULL
;
1054 int pam_code
= PAM_SUCCESS
, r
;
1055 char **nv
, **e
= NULL
;
1056 bool close_session
= false;
1057 pid_t pam_pid
= 0, parent_pid
;
1064 /* We set up PAM in the parent process, then fork. The child
1065 * will then stay around until killed via PR_SET_PDEATHSIG or
1066 * systemd via the cgroup logic. It will then remove the PAM
1067 * session again. The parent process will exec() the actual
1068 * daemon. We do things this way to ensure that the main PID
1069 * of the daemon is the one we initially fork()ed. */
1071 r
= barrier_create(&barrier
);
1075 if (log_get_max_level() < LOG_DEBUG
)
1076 flags
|= PAM_SILENT
;
1078 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1079 if (pam_code
!= PAM_SUCCESS
) {
1085 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1086 if (pam_code
!= PAM_SUCCESS
)
1090 STRV_FOREACH(nv
, *env
) {
1091 pam_code
= pam_putenv(handle
, *nv
);
1092 if (pam_code
!= PAM_SUCCESS
)
1096 pam_code
= pam_acct_mgmt(handle
, flags
);
1097 if (pam_code
!= PAM_SUCCESS
)
1100 pam_code
= pam_open_session(handle
, flags
);
1101 if (pam_code
!= PAM_SUCCESS
)
1104 close_session
= true;
1106 e
= pam_getenvlist(handle
);
1108 pam_code
= PAM_BUF_ERR
;
1112 /* Block SIGTERM, so that we know that it won't get lost in
1115 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1117 parent_pid
= getpid_cached();
1126 int sig
, ret
= EXIT_PAM
;
1128 /* The child's job is to reset the PAM session on
1130 barrier_set_role(&barrier
, BARRIER_CHILD
);
1132 /* This string must fit in 10 chars (i.e. the length
1133 * of "/sbin/init"), to look pretty in /bin/ps */
1134 rename_process("(sd-pam)");
1136 /* Make sure we don't keep open the passed fds in this
1137 child. We assume that otherwise only those fds are
1138 open here that have been opened by PAM. */
1139 close_many(fds
, n_fds
);
1141 /* Drop privileges - we don't need any to pam_close_session
1142 * and this will make PR_SET_PDEATHSIG work in most cases.
1143 * If this fails, ignore the error - but expect sd-pam threads
1144 * to fail to exit normally */
1146 r
= maybe_setgroups(0, NULL
);
1148 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1149 if (setresgid(gid
, gid
, gid
) < 0)
1150 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1151 if (setresuid(uid
, uid
, uid
) < 0)
1152 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1154 (void) ignore_signals(SIGPIPE
, -1);
1156 /* Wait until our parent died. This will only work if
1157 * the above setresuid() succeeds, otherwise the kernel
1158 * will not allow unprivileged parents to kill their privileged
1159 * children this way. We rely on the control groups kill logic
1160 * to do the rest for us. */
1161 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1164 /* Tell the parent that our setup is done. This is especially
1165 * important regarding dropping privileges. Otherwise, unit
1166 * setup might race against our setresuid(2) call.
1168 * If the parent aborted, we'll detect this below, hence ignore
1169 * return failure here. */
1170 (void) barrier_place(&barrier
);
1172 /* Check if our parent process might already have died? */
1173 if (getppid() == parent_pid
) {
1176 assert_se(sigemptyset(&ss
) >= 0);
1177 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1180 if (sigwait(&ss
, &sig
) < 0) {
1187 assert(sig
== SIGTERM
);
1192 /* If our parent died we'll end the session */
1193 if (getppid() != parent_pid
) {
1194 pam_code
= pam_close_session(handle
, flags
);
1195 if (pam_code
!= PAM_SUCCESS
)
1202 pam_end(handle
, pam_code
| flags
);
1206 barrier_set_role(&barrier
, BARRIER_PARENT
);
1208 /* If the child was forked off successfully it will do all the
1209 * cleanups, so forget about the handle here. */
1212 /* Unblock SIGTERM again in the parent */
1213 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1215 /* We close the log explicitly here, since the PAM modules
1216 * might have opened it, but we don't want this fd around. */
1219 /* Synchronously wait for the child to initialize. We don't care for
1220 * errors as we cannot recover. However, warn loudly if it happens. */
1221 if (!barrier_place_and_sync(&barrier
))
1222 log_error("PAM initialization failed");
1230 if (pam_code
!= PAM_SUCCESS
) {
1231 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1232 r
= -EPERM
; /* PAM errors do not map to errno */
1234 log_error_errno(r
, "PAM failed: %m");
1238 pam_code
= pam_close_session(handle
, flags
);
1240 pam_end(handle
, pam_code
| flags
);
1252 static void rename_process_from_path(const char *path
) {
1253 char process_name
[11];
1257 /* This resulting string must fit in 10 chars (i.e. the length
1258 * of "/sbin/init") to look pretty in /bin/ps */
1262 rename_process("(...)");
1268 /* The end of the process name is usually more
1269 * interesting, since the first bit might just be
1275 process_name
[0] = '(';
1276 memcpy(process_name
+1, p
, l
);
1277 process_name
[1+l
] = ')';
1278 process_name
[1+l
+1] = 0;
1280 rename_process(process_name
);
1283 static bool context_has_address_families(const ExecContext
*c
) {
1286 return c
->address_families_whitelist
||
1287 !set_isempty(c
->address_families
);
1290 static bool context_has_syscall_filters(const ExecContext
*c
) {
1293 return c
->syscall_whitelist
||
1294 !set_isempty(c
->syscall_filter
);
1297 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1300 if (c
->no_new_privileges
)
1303 if (have_effective_cap(CAP_SYS_ADMIN
)) /* if we are privileged, we don't need NNP */
1306 /* We need NNP if we have any form of seccomp and are unprivileged */
1307 return context_has_address_families(c
) ||
1308 c
->memory_deny_write_execute
||
1309 c
->restrict_realtime
||
1310 exec_context_restrict_namespaces_set(c
) ||
1311 c
->protect_kernel_tunables
||
1312 c
->protect_kernel_modules
||
1313 c
->private_devices
||
1314 context_has_syscall_filters(c
) ||
1315 !set_isempty(c
->syscall_archs
) ||
1316 c
->lock_personality
;
1321 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1323 if (is_seccomp_available())
1327 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1332 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1333 uint32_t negative_action
, default_action
, action
;
1339 if (!context_has_syscall_filters(c
))
1342 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1345 negative_action
= c
->syscall_errno
== 0 ? SCMP_ACT_KILL
: SCMP_ACT_ERRNO(c
->syscall_errno
);
1347 if (c
->syscall_whitelist
) {
1348 default_action
= negative_action
;
1349 action
= SCMP_ACT_ALLOW
;
1351 default_action
= SCMP_ACT_ALLOW
;
1352 action
= negative_action
;
1355 if (needs_ambient_hack
) {
1356 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_whitelist
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1361 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
);
1364 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1368 if (set_isempty(c
->syscall_archs
))
1371 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1374 return seccomp_restrict_archs(c
->syscall_archs
);
1377 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1381 if (!context_has_address_families(c
))
1384 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1387 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_whitelist
);
1390 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1394 if (!c
->memory_deny_write_execute
)
1397 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1400 return seccomp_memory_deny_write_execute();
1403 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1407 if (!c
->restrict_realtime
)
1410 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1413 return seccomp_restrict_realtime();
1416 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1420 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1421 * let's protect even those systems where this is left on in the kernel. */
1423 if (!c
->protect_kernel_tunables
)
1426 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1429 return seccomp_protect_sysctl();
1432 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1436 /* Turn off module syscalls on ProtectKernelModules=yes */
1438 if (!c
->protect_kernel_modules
)
1441 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1444 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
));
1447 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1451 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1453 if (!c
->private_devices
)
1456 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1459 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
));
1462 static int apply_restrict_namespaces(Unit
*u
, const ExecContext
*c
) {
1466 if (!exec_context_restrict_namespaces_set(c
))
1469 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1472 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
1475 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1476 unsigned long personality
;
1482 if (!c
->lock_personality
)
1485 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1488 personality
= c
->personality
;
1490 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1491 if (personality
== PERSONALITY_INVALID
) {
1493 r
= opinionated_personality(&personality
);
1498 return seccomp_lock_personality(personality
);
1503 static void do_idle_pipe_dance(int idle_pipe
[4]) {
1506 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1507 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1509 if (idle_pipe
[0] >= 0) {
1512 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1514 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1517 /* Signal systemd that we are bored and want to continue. */
1518 n
= write(idle_pipe
[3], "x", 1);
1520 /* Wait for systemd to react to the signal above. */
1521 fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1524 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1528 idle_pipe
[3] = safe_close(idle_pipe
[3]);
1531 static int build_environment(
1533 const ExecContext
*c
,
1534 const ExecParameters
*p
,
1537 const char *username
,
1539 dev_t journal_stream_dev
,
1540 ino_t journal_stream_ino
,
1543 _cleanup_strv_free_
char **our_env
= NULL
;
1551 our_env
= new0(char*, 14);
1556 _cleanup_free_
char *joined
= NULL
;
1558 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1560 our_env
[n_env
++] = x
;
1562 if (asprintf(&x
, "LISTEN_FDS=%u", n_fds
) < 0)
1564 our_env
[n_env
++] = x
;
1566 joined
= strv_join(p
->fd_names
, ":");
1570 x
= strjoin("LISTEN_FDNAMES=", joined
);
1573 our_env
[n_env
++] = x
;
1576 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1577 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1579 our_env
[n_env
++] = x
;
1581 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1583 our_env
[n_env
++] = x
;
1586 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus to look up dynamic
1587 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1588 * check the database directly. */
1589 if (p
->flags
& EXEC_NSS_BYPASS_BUS
) {
1590 x
= strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1593 our_env
[n_env
++] = x
;
1597 x
= strappend("HOME=", home
);
1600 our_env
[n_env
++] = x
;
1604 x
= strappend("LOGNAME=", username
);
1607 our_env
[n_env
++] = x
;
1609 x
= strappend("USER=", username
);
1612 our_env
[n_env
++] = x
;
1616 x
= strappend("SHELL=", shell
);
1619 our_env
[n_env
++] = x
;
1622 if (!sd_id128_is_null(u
->invocation_id
)) {
1623 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1626 our_env
[n_env
++] = x
;
1629 if (exec_context_needs_term(c
)) {
1630 const char *tty_path
, *term
= NULL
;
1632 tty_path
= exec_context_tty_path(c
);
1634 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1635 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1636 * passes to PID 1 ends up all the way in the console login shown. */
1638 if (path_equal(tty_path
, "/dev/console") && getppid() == 1)
1639 term
= getenv("TERM");
1641 term
= default_term_for_tty(tty_path
);
1643 x
= strappend("TERM=", term
);
1646 our_env
[n_env
++] = x
;
1649 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
1650 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
1653 our_env
[n_env
++] = x
;
1656 our_env
[n_env
++] = NULL
;
1657 assert(n_env
<= 12);
/* Builds a string vector of "NAME=value" entries, one for each variable name listed in c->pass_environment.
 * NOTE(review): the lookup that yields the value v and the hand-over of the vector through ret are not part
 * of this excerpt; confirm against the full definition. */
1665 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
1666 _cleanup_strv_free_
char **pass_env
= NULL
;
1667 size_t n_env
= 0, n_bufsize
= 0;
1670 STRV_FOREACH(i
, c
->pass_environment
) {
1671 _cleanup_free_
char *x
= NULL
;
/* Compose the "NAME=value" assignment for this entry. */
1677 x
= strjoin(*i
, "=", v
);
/* Grow the vector so that it fits the new entry plus the terminating NULL, then store the entry and
 * re-terminate the vector. */
1681 if (!GREEDY_REALLOC(pass_env
, n_bufsize
, n_env
+ 2))
1684 pass_env
[n_env
++] = x
;
1685 pass_env
[n_env
] = NULL
;
/* Tells whether executing with the given context requires a mount namespace. The visible checks cover: a root
 * image, non-empty read-write/read-only/inaccessible path lists, bind mounts, mount flags, private temporary
 * directories provided by the runtime object, the private_devices and protect_... settings, and API VFS
 * mounting combined with a root image or root directory.
 * NOTE(review): the return statements are not part of this excerpt; presumably each matching check yields
 * true and the fall-through yields false. */
1695 static bool exec_needs_mount_namespace(
1696 const ExecContext
*context
,
1697 const ExecParameters
*params
,
1698 ExecRuntime
*runtime
) {
1703 if (context
->root_image
)
1706 if (!strv_isempty(context
->read_write_paths
) ||
1707 !strv_isempty(context
->read_only_paths
) ||
1708 !strv_isempty(context
->inaccessible_paths
))
1711 if (context
->n_bind_mounts
> 0)
1714 if (context
->mount_flags
!= 0)
/* private_tmp only counts when the runtime object actually carries a tmp_dir or var_tmp_dir. */
1717 if (context
->private_tmp
&& runtime
&& (runtime
->tmp_dir
|| runtime
->var_tmp_dir
))
1720 if (context
->private_devices
||
1721 context
->protect_system
!= PROTECT_SYSTEM_NO
||
1722 context
->protect_home
!= PROTECT_HOME_NO
||
1723 context
->protect_kernel_tunables
||
1724 context
->protect_kernel_modules
||
1725 context
->protect_control_groups
)
/* mount_apivfs is only considered together with a root image or a root directory. */
1728 if (context
->mount_apivfs
&& (context
->root_image
|| context
->root_directory
))
/* Moves the calling process into a new user namespace (unshare(CLONE_NEWUSER)) and has a helper child process
 * write the setgroups, gid_map and uid_map files of the parent. An eventfd tells the child when the namespace
 * exists; a pipe carries an error code from the child back to the parent.
 * NOTE(review): the fork() call, the error returns and several declarations are not part of this excerpt. */
1734 static int setup_private_users(uid_t uid
, gid_t gid
) {
1735 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
1736 _cleanup_close_pair_
int errno_pipe
[2] = { -1, -1 };
1737 _cleanup_close_
int unshare_ready_fd
= -1;
1738 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
1744 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1745 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1746 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1747 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1748 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1749 * continues execution normally. */
/* Prepare the text that the child later writes to the parent's uid_map file. */
1751 if (uid
!= 0 && uid_is_valid(uid
)) {
1752 r
= asprintf(&uid_map
,
1753 "0 0 1\n" /* Map root → root */
1754 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
1759 uid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
/* Same for the text that goes into the parent's gid_map file. */
1764 if (gid
!= 0 && gid_is_valid(gid
)) {
1765 r
= asprintf(&gid_map
,
1766 "0 0 1\n" /* Map root → root */
1767 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
1772 gid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1777 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1779 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
1780 if (unshare_ready_fd
< 0)
1783 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1785 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
1793 _cleanup_close_
int fd
= -1;
1797 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1798 * here, after the parent opened its own user namespace. */
/* The read end of the error pipe is only read by the parent (see the read() further down). */
1801 errno_pipe
[0] = safe_close(errno_pipe
[0]);
1803 /* Wait until the parent unshared the user namespace */
1804 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
1809 /* Disable the setgroups() system call in the child user namespace, for good. */
1810 a
= procfs_file_alloca(ppid
, "setgroups");
1811 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1813 if (errno
!= ENOENT
) {
1818 /* If the file is missing the kernel is too old, let's continue anyway. */
1820 if (write(fd
, "deny\n", 5) < 0) {
1825 fd
= safe_close(fd
);
1828 /* First write the GID map */
1829 a
= procfs_file_alloca(ppid
, "gid_map");
1830 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1835 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
1839 fd
= safe_close(fd
);
1841 /* Then write the UID map */
1842 a
= procfs_file_alloca(ppid
, "uid_map");
1843 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1848 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
1853 _exit(EXIT_SUCCESS
);
/* Failure exit of the child: hand the error code to the parent through the pipe (best effort, the result
 * of write() is deliberately ignored) and terminate with a failure status. */
1856 (void) write(errno_pipe
[1], &r
, sizeof(r
));
1857 _exit(EXIT_FAILURE
);
/* From here on the code operates on the read end of the pipe only; the write end is closed first. */
1860 errno_pipe
[1] = safe_close(errno_pipe
[1]);
1862 if (unshare(CLONE_NEWUSER
) < 0)
1865 /* Let the child know that the namespace is ready now */
1866 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
1869 /* Try to read an error code from the child */
1870 n
= read(errno_pipe
[0], &r
, sizeof(r
));
1873 if (n
== sizeof(r
)) { /* an error code was sent to us */
1878 if (n
!= 0) /* on success we should have read 0 bytes */
/* Collect the exit information of the child. */
1881 r
= wait_for_terminate(pid
, &si
);
1886 /* If something strange happened with the child, let's consider this fatal, too */
1887 if (si
.si_code
!= CLD_EXITED
|| si
.si_status
!= 0)
/* Creates the directories configured in context->directories[type].paths below params->prefix[type] and, except
 * for configuration directories, applies mode and ownership with chmod_and_chown(). On the visible failure path
 * *exit_status receives the exit status that exit_status_table assigns to the directory type.
 * NOTE(review): the uid/gid/exit_status parameter declarations, the error checks and the return statements are
 * not part of this excerpt. */
1893 static int setup_exec_directory(
1894 const ExecContext
*context
,
1895 const ExecParameters
*params
,
1898 ExecDirectoryType type
,
/* Exit status to report for each directory type. */
1901 static const int exit_status_table
[_EXEC_DIRECTORY_MAX
] = {
1902 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
1903 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
1904 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
1905 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
1906 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
1913 assert(type
>= 0 && type
< _EXEC_DIRECTORY_MAX
);
1914 assert(exit_status
);
/* A directory type without a prefix is handled separately from the creation loop below. */
1916 if (!params
->prefix
[type
])
/* With EXEC_CHOWN_DIRECTORIES set, invalid uid/gid values are special-cased before anything is created. */
1919 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
1920 if (!uid_is_valid(uid
))
1922 if (!gid_is_valid(gid
))
1926 STRV_FOREACH(rt
, context
->directories
[type
].paths
) {
1927 _cleanup_free_
char *p
;
/* Full path: the prefix of this directory type joined with the configured entry. */
1929 p
= strjoin(params
->prefix
[type
], "/", *rt
);
/* Parent directories are created with mode 0755, the directory itself with the configured mode. */
1935 r
= mkdir_parents_label(p
, 0755);
1939 r
= mkdir_p_label(p
, context
->directories
[type
].mode
);
1943 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
1944 * a service, and shall not be writable. */
1945 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
1948 r
= chmod_and_chown(p
, context
->directories
[type
].mode
, uid
, gid
);
1956 *exit_status
= exit_status_table
[type
];
/* Applies a SMACK process label through mac_smack_apply_pid(): context->smack_process_label when it is set.
 * The code following the SMACK_DEFAULT_PROCESS_LABEL check reads the SMACK_ATTR_EXEC label of command->path and
 * applies it, falling back to SMACK_DEFAULT_PROCESS_LABEL when no label was read.
 * NOTE(review): the branch structure between the two cases, the error returns and the matching #endif are not
 * part of this excerpt. */
1961 static int setup_smack(
1962 const ExecContext
*context
,
1963 const ExecCommand
*command
) {
1970 if (context
->smack_process_label
) {
1971 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
1975 #ifdef SMACK_DEFAULT_PROCESS_LABEL
1977 _cleanup_free_
char *exec_label
= NULL
;
1979 r
= mac_smack_read(command
->path
, SMACK_ATTR_EXEC
, &exec_label
);
/* -ENODATA and -EOPNOTSUPP are excluded from the failures singled out here. */
1980 if (r
< 0 && r
!= -ENODATA
&& r
!= -EOPNOTSUPP
)
/* Use the label read from the executable if there is one, the compile-time default otherwise. */
1983 r
= mac_smack_apply_pid(0, exec_label
? : SMACK_DEFAULT_PROCESS_LABEL
);
/* Builds the list of writable paths: a copy of context->read_write_paths extended by the configured directories
 * of every directory type that has a prefix in params. When neither kind of path is configured, *ret is set to
 * NULL.
 * NOTE(review): the ret parameter declaration, the error returns and the final hand-over of the list are not
 * part of this excerpt. */
1992 static int compile_read_write_paths(
1993 const ExecContext
*context
,
1994 const ExecParameters
*params
,
1997 _cleanup_strv_free_
char **l
= NULL
;
1999 ExecDirectoryType i
;
2001 /* Compile the list of writable paths. This is the combination of
2002 * the explicitly configured paths, plus all runtime directories. */
/* Without explicit read-write paths, scan the directory types for any configured path. i reaching
 * _EXEC_DIRECTORY_MAX is treated below as "nothing configured at all". */
2004 if (strv_isempty(context
->read_write_paths
)) {
2005 for (i
= 0; i
< _EXEC_DIRECTORY_MAX
; i
++)
2006 if (!strv_isempty(context
->directories
[i
].paths
))
2009 if (i
== _EXEC_DIRECTORY_MAX
) {
2010 *ret
= NULL
; /* NOP if neither is set */
/* Start from a copy of the explicitly configured paths. */
2015 l
= strv_copy(context
->read_write_paths
);
/* Then add every configured directory, prefixed per directory type. */
2019 for (i
= 0; i
< _EXEC_DIRECTORY_MAX
; i
++) {
2020 if (!params
->prefix
[i
])
2023 STRV_FOREACH(rt
, context
->directories
[i
].paths
) {
2026 s
= strjoin(params
->prefix
[i
], "/", *rt
);
/* strv_consume() is handed the freshly joined string s to append to the list. */
2030 if (strv_consume(&l
, s
) < 0)
/* Collects the namespace-related settings of the context (private /tmp directories, writable paths, root
 * directory or image, protection flags) and passes them to setup_namespace(). A result of -EPERM or -EACCES from
 * that call is logged at debug level as "assuming containerized execution, ignoring".
 * NOTE(review): the unit parameter (u), some setup_namespace() arguments and the return statements are not part
 * of this excerpt. */
2041 static int apply_mount_namespace(
2043 ExecCommand
*command
,
2044 const ExecContext
*context
,
2045 const ExecParameters
*params
,
2046 ExecRuntime
*runtime
) {
2048 _cleanup_strv_free_
char **rw
= NULL
;
2049 char *tmp
= NULL
, *var
= NULL
;
2050 const char *root_dir
= NULL
, *root_image
= NULL
;
/* Flags handed to setup_namespace(); ignore_protect_paths may be switched on further down. */
2051 NameSpaceInfo ns_info
= {
2052 .ignore_protect_paths
= false,
2053 .private_dev
= context
->private_devices
,
2054 .protect_control_groups
= context
->protect_control_groups
,
2055 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
2056 .protect_kernel_modules
= context
->protect_kernel_modules
,
2057 .mount_apivfs
= context
->mount_apivfs
,
2059 bool needs_sandboxing
;
2064 /* The runtime struct only contains the parent of the private /tmp,
2065 * which is non-accessible to world users. Inside of it there's a /tmp
2066 * that is sticky, and that's the one we want to use here. */
2068 if (context
->private_tmp
&& runtime
) {
2069 if (runtime
->tmp_dir
)
2070 tmp
= strjoina(runtime
->tmp_dir
, "/tmp");
2071 if (runtime
->var_tmp_dir
)
2072 var
= strjoina(runtime
->var_tmp_dir
, "/tmp");
/* Compile the list of writable paths into rw. */
2075 r
= compile_read_write_paths(context
, params
, &rw
);
/* Root image and root directory are only taken from the context when EXEC_APPLY_CHROOT is set. */
2079 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2080 root_image
= context
->root_image
;
2083 root_dir
= context
->root_directory
;
2087 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2088 * sandbox info, otherwise enforce it, don't ignore protected paths and
2089 * fail if we are enable to apply the sandbox inside the mount namespace.
2091 if (!context
->dynamic_user
&& root_dir
)
2092 ns_info
.ignore_protect_paths
= true;
2094 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2096 r
= setup_namespace(root_dir
, root_image
,
2098 needs_sandboxing
? context
->read_only_paths
: NULL
,
2099 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
2100 context
->bind_mounts
,
2101 context
->n_bind_mounts
,
2104 needs_sandboxing
? context
->protect_home
: PROTECT_HOME_NO
,
2105 needs_sandboxing
? context
->protect_system
: PROTECT_SYSTEM_NO
,
2106 context
->mount_flags
,
2107 DISSECT_IMAGE_DISCARD_ON_LOOP
);
2109 /* If we couldn't set up the namespace this is probably due to a
2110 * missing capability. In this case, silently proceeed. */
2111 if (IN_SET(r
, -EPERM
, -EACCES
)) {
2113 log_unit_debug_errno(u
, r
, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
/* Enters the working directory selected by the context, calling chroot() first when EXEC_APPLY_CHROOT is set,
 * a root directory is configured and no mount namespace is in use. On the failures visible below *exit_status is
 * set to EXIT_CHDIR or EXIT_CHROOT.
 * NOTE(review): the home and exit_status parameter declarations, the fallback choice of wd and the return
 * statements are not part of this excerpt. */
2121 static int apply_working_directory(
2122 const ExecContext
*context
,
2123 const ExecParameters
*params
,
2125 const bool needs_mount_ns
,
2131 assert(exit_status
);
/* Pick the directory to change into: the home directory case first, an explicit working directory next. */
2133 if (context
->working_directory_home
) {
2136 *exit_status
= EXIT_CHDIR
;
2142 } else if (context
->working_directory
)
2143 wd
= context
->working_directory
;
2147 if (params
->flags
& EXEC_APPLY_CHROOT
) {
/* When no mount namespace is used, the configured root directory is entered with chroot(). */
2148 if (!needs_mount_ns
&& context
->root_directory
)
2149 if (chroot(context
->root_directory
) < 0) {
2150 *exit_status
= EXIT_CHROOT
;
2156 d
= prefix_roota(context
->root_directory
, wd
);
/* A failing chdir() only counts as an error if working_directory_missing_ok is not set. */
2158 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
2159 *exit_status
= EXIT_CHDIR
;
/* Joins a new session keyring for the service, stores the invocation ID of the unit in it as a "user" key with
 * restricted permissions, changes the keyring ownership to uid/gid when those are valid and, in
 * EXEC_KEYRING_SHARED mode, links the user keyring into the session keyring while temporarily running under the
 * target uid/gid.
 * NOTE(review): the unit parameter (u), several declarations and most return statements are not part of this
 * excerpt. */
2166 static int setup_keyring(
2168 const ExecContext
*context
,
2169 const ExecParameters
*p
,
2170 uid_t uid
, gid_t gid
) {
2172 key_serial_t keyring
;
2179 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2180 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2181 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2182 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2183 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2184 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
/* Both the EXEC_NEW_KEYRING flag and a keyring mode other than "inherit" are checked before a keyring is
 * joined. */
2186 if (!(p
->flags
& EXEC_NEW_KEYRING
))
2189 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
2192 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
/* ENOSYS, EACCES/EPERM and EDQUOT are only logged at debug level ("ignoring"); any other errno is logged
 * as an error and returned. */
2193 if (keyring
== -1) {
2194 if (errno
== ENOSYS
)
2195 log_debug_errno(errno
, "Kernel keyring not supported, ignoring.");
2196 else if (IN_SET(errno
, EACCES
, EPERM
))
2197 log_debug_errno(errno
, "Kernel keyring access prohibited, ignoring.");
2198 else if (errno
== EDQUOT
)
2199 log_debug_errno(errno
, "Out of kernel keyrings to allocate, ignoring.");
2201 return log_error_errno(errno
, "Setting up kernel keyring failed: %m");
2206 /* Populate the keyring with the invocation ID by default. */
2207 if (!sd_id128_is_null(u
->invocation_id
)) {
2210 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
2212 log_debug_errno(errno
, "Failed to add invocation ID to keyring, ignoring: %m");
/* Limit the key to view/read/search permissions for possessor and user. */
2214 if (keyctl(KEYCTL_SETPERM
, key
,
2215 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
2216 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
2217 return log_error_errno(errno
, "Failed to restrict invocation ID permission: %m");
2221 /* And now, make the keyring owned by the service's user */
2222 if (uid_is_valid(uid
) || gid_is_valid(gid
))
2223 if (keyctl(KEYCTL_CHOWN
, keyring
, uid
, gid
, 0) < 0)
2224 return log_error_errno(errno
, "Failed to change ownership of session keyring: %m");
2226 /* When requested link the user keyring into the session keyring. */
2227 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
2231 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2232 * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2233 * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
/* Remember the current real UID/GID so that they can be restored afterwards. */
2235 saved_uid
= getuid();
2236 saved_gid
= getgid();
/* Switch the real GID first, then the real UID; on a UID failure the GID change is rolled back. */
2238 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2239 if (setregid(gid
, -1) < 0)
2240 return log_error_errno(errno
, "Failed to change GID for user keyring: %m");
2243 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2244 if (setreuid(uid
, -1) < 0) {
2245 (void) setregid(saved_gid
, -1);
2246 return log_error_errno(errno
, "Failed to change UID for user keyring: %m");
2250 if (keyctl(KEYCTL_LINK
,
2251 KEY_SPEC_USER_KEYRING
,
2252 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
/* Linking failed: restore the saved identity (best effort) before reporting the error. */
2256 (void) setreuid(saved_uid
, -1);
2257 (void) setregid(saved_gid
, -1);
2259 return log_error_errno(r
, "Failed to link user keyring into session keyring: %m");
/* Success: switch back, UID first and GID second (the reverse of the order used above). */
2262 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2263 if (setreuid(saved_uid
, -1) < 0) {
2264 (void) setregid(saved_gid
, -1);
2265 return log_error_errno(errno
, "Failed to change UID back for user keyring: %m");
2269 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2270 if (setregid(saved_gid
, -1) < 0)
2271 return log_error_errno(errno
, "Failed to change GID back for user keyring: %m");
/* Appends the two descriptors of pair to array, advancing the element counter *n for each one stored.
 * NOTE(review): the lines between the visible statements are not part of this excerpt; presumably each store is
 * guarded by a validity check on the descriptor. Confirm against the full definition. */
2278 static void append_socket_pair(int *array
, unsigned *n
, int pair
[2]) {
2286 array
[(*n
)++] = pair
[0];
2288 array
[(*n
)++] = pair
[1];
/* Collects every file descriptor that has to stay open (stdio descriptors from params, socket_fd, the passed
 * fds array, the storage socket pairs of the runtime and dynamic credential objects, and user_lookup_fd) and
 * hands that list to close_all_fds(), returning its result.
 * NOTE(review): the user_lookup_fd/socket_fd parameter declarations and the conditions guarding the socket_fd,
 * runtime and dcreds entries are not part of this excerpt. */
2291 static int close_remaining_fds(
2292 const ExecParameters
*params
,
2293 ExecRuntime
*runtime
,
2294 DynamicCreds
*dcreds
,
2297 int *fds
, unsigned n_fds
) {
2299 unsigned n_dont_close
= 0;
/* Room for the passed fds plus the individually appended descriptors below (at most 11 of them). */
2300 int dont_close
[n_fds
+ 12];
2304 if (params
->stdin_fd
>= 0)
2305 dont_close
[n_dont_close
++] = params
->stdin_fd
;
2306 if (params
->stdout_fd
>= 0)
2307 dont_close
[n_dont_close
++] = params
->stdout_fd
;
2308 if (params
->stderr_fd
>= 0)
2309 dont_close
[n_dont_close
++] = params
->stderr_fd
;
2312 dont_close
[n_dont_close
++] = socket_fd
;
/* Copy the caller's fds array in one go. */
2314 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
2315 n_dont_close
+= n_fds
;
/* Keep the storage socket pairs of the network namespace and of the dynamic user/group. */
2319 append_socket_pair(dont_close
, &n_dont_close
, runtime
->netns_storage_socket
);
2323 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->user
->storage_socket
);
2325 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->group
->storage_socket
);
2328 if (user_lookup_fd
>= 0)
2329 dont_close
[n_dont_close
++] = user_lookup_fd
;
2331 return close_all_fds(dont_close
, n_dont_close
);
/* Writes one message to user_lookup_fd with writev(), made up of three parts: the uid, the gid and the unit id
 * string (without a trailing NUL, as its length is taken with strlen()). Two conditions are checked before the
 * write: user_lookup_fd being negative, and both uid and gid being invalid.
 * NOTE(review): the parameter list, the outcome of those two checks and the return statements are not part of
 * this excerpt. */
2334 static int send_user_lookup(
2342 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2343 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2346 if (user_lookup_fd
< 0)
2349 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
2352 if (writev(user_lookup_fd
,
2354 { .iov_base
= &uid
, .iov_len
= sizeof(uid
) },
2355 { .iov_base
= &gid
, .iov_len
= sizeof(gid
) },
2356 { .iov_base
= unit
->id
, .iov_len
= strlen(unit
->id
) }}, 3) < 0)
/* Determines a home directory for the case that the context asks for the home directory as working directory
 * (c->working_directory_home).
 * NOTE(review): how *home is assigned, the UID 0 special case announced by the comment below and the return
 * values are not part of this excerpt. */
2362 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
2369 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2374 if (!c
->working_directory_home
)
2378 /* Hardcode /root as home directory for UID 0 */
/* get_home_dir() is handed the caller-provided buf. */
2383 r
= get_home_dir(buf
);
2391 static int exec_child(
2393 ExecCommand
*command
,
2394 const ExecContext
*context
,
2395 const ExecParameters
*params
,
2396 ExecRuntime
*runtime
,
2397 DynamicCreds
*dcreds
,
2402 unsigned n_storage_fds
,
2403 unsigned n_socket_fds
,
2407 char **error_message
) {
2409 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **accum_env
= NULL
, **final_argv
= NULL
;
2410 _cleanup_free_
char *mac_selinux_context_net
= NULL
, *home_buffer
= NULL
;
2411 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
2412 const char *username
= NULL
, *groupname
= NULL
;
2413 const char *home
= NULL
, *shell
= NULL
;
2414 dev_t journal_stream_dev
= 0;
2415 ino_t journal_stream_ino
= 0;
2416 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2417 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
2418 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
2419 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
2421 bool use_selinux
= false;
2424 bool use_smack
= false;
2426 #ifdef HAVE_APPARMOR
2427 bool use_apparmor
= false;
2429 uid_t uid
= UID_INVALID
;
2430 gid_t gid
= GID_INVALID
;
2431 int i
, r
, ngids
= 0;
2433 ExecDirectoryType dt
;
2440 assert(exit_status
);
2441 assert(error_message
);
2442 /* We don't always set error_message, hence it must be initialized */
2443 assert(*error_message
== NULL
);
2445 rename_process_from_path(command
->path
);
2447 /* We reset exactly these signals, since they are the
2448 * only ones we set to SIG_IGN in the main daemon. All
2449 * others we leave untouched because we set them to
2450 * SIG_DFL or a valid handler initially, both of which
2451 * will be demoted to SIG_DFL. */
2452 (void) default_signals(SIGNALS_CRASH_HANDLER
,
2453 SIGNALS_IGNORE
, -1);
2455 if (context
->ignore_sigpipe
)
2456 (void) ignore_signals(SIGPIPE
, -1);
2458 r
= reset_signal_mask();
2460 *exit_status
= EXIT_SIGNAL_MASK
;
2461 *error_message
= strdup("Failed to set process signal mask");
2462 /* If strdup fails, here and below, we will just print the generic error message. */
2466 if (params
->idle_pipe
)
2467 do_idle_pipe_dance(params
->idle_pipe
);
2469 /* Close sockets very early to make sure we don't
2470 * block init reexecution because it cannot bind its
2475 n_fds
= n_storage_fds
+ n_socket_fds
;
2476 r
= close_remaining_fds(params
, runtime
, dcreds
, user_lookup_fd
, socket_fd
, fds
, n_fds
);
2478 *exit_status
= EXIT_FDS
;
2479 *error_message
= strdup("Failed to close unwanted file descriptors");
2483 if (!context
->same_pgrp
)
2485 *exit_status
= EXIT_SETSID
;
2486 *error_message
= strdup("Failed to create new process session");
2490 exec_context_tty_reset(context
, params
);
2492 if (unit_shall_confirm_spawn(unit
)) {
2493 const char *vc
= params
->confirm_spawn
;
2494 _cleanup_free_
char *cmdline
= NULL
;
2496 cmdline
= exec_command_line(argv
);
2498 *exit_status
= EXIT_MEMORY
;
2502 r
= ask_for_confirmation(vc
, unit
, cmdline
);
2503 if (r
!= CONFIRM_EXECUTE
) {
2504 if (r
== CONFIRM_PRETEND_SUCCESS
) {
2505 *exit_status
= EXIT_SUCCESS
;
2508 *exit_status
= EXIT_CONFIRM
;
2509 *error_message
= strdup("Execution cancelled by the user");
2514 if (context
->dynamic_user
&& dcreds
) {
2516 /* Make sure we bypass our own NSS module for any NSS checks */
2517 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2518 *exit_status
= EXIT_USER
;
2519 *error_message
= strdup("Failed to update environment");
2523 r
= dynamic_creds_realize(dcreds
, &uid
, &gid
);
2525 *exit_status
= EXIT_USER
;
2526 *error_message
= strdup("Failed to update dynamic user credentials");
2530 if (!uid_is_valid(uid
)) {
2531 *exit_status
= EXIT_USER
;
2532 (void) asprintf(error_message
, "UID validation failed for \""UID_FMT
"\"", uid
);
2533 /* If asprintf fails, here and below, we will just print the generic error message. */
2537 if (!gid_is_valid(gid
)) {
2538 *exit_status
= EXIT_USER
;
2539 (void) asprintf(error_message
, "GID validation failed for \""GID_FMT
"\"", gid
);
2544 username
= dcreds
->user
->name
;
2547 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
2549 *exit_status
= EXIT_USER
;
2550 *error_message
= strdup("Failed to determine user credentials");
2554 r
= get_fixed_group(context
, &groupname
, &gid
);
2556 *exit_status
= EXIT_GROUP
;
2557 *error_message
= strdup("Failed to determine group credentials");
2562 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2563 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
2564 &supplementary_gids
, &ngids
);
2566 *exit_status
= EXIT_GROUP
;
2567 *error_message
= strdup("Failed to determine supplementary groups");
2571 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
2573 *exit_status
= EXIT_USER
;
2574 *error_message
= strdup("Failed to send user credentials to PID1");
2578 user_lookup_fd
= safe_close(user_lookup_fd
);
2580 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
2582 *exit_status
= EXIT_CHDIR
;
2583 *error_message
= strdup("Failed to determine $HOME for user");
2587 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2588 * must be sure to drop O_NONBLOCK */
2590 (void) fd_nonblock(socket_fd
, false);
2592 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
2594 *exit_status
= EXIT_STDIN
;
2595 *error_message
= strdup("Failed to set up standard input");
2599 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2601 *exit_status
= EXIT_STDOUT
;
2602 *error_message
= strdup("Failed to set up standard output");
2606 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2608 *exit_status
= EXIT_STDERR
;
2609 *error_message
= strdup("Failed to set up standard error output");
2613 if (params
->cgroup_path
) {
2614 r
= cg_attach_everywhere(params
->cgroup_supported
, params
->cgroup_path
, 0, NULL
, NULL
);
2616 *exit_status
= EXIT_CGROUP
;
2617 (void) asprintf(error_message
, "Failed to attach to cgroup %s", params
->cgroup_path
);
2622 if (context
->oom_score_adjust_set
) {
2623 char t
[DECIMAL_STR_MAX(context
->oom_score_adjust
)];
2625 /* When we can't make this change due to EPERM, then
2626 * let's silently skip over it. User namespaces
2627 * prohibit write access to this file, and we
2628 * shouldn't trip up over that. */
2630 sprintf(t
, "%i", context
->oom_score_adjust
);
2631 r
= write_string_file("/proc/self/oom_score_adj", t
, 0);
2632 if (r
== -EPERM
|| r
== -EACCES
) {
2634 log_unit_debug_errno(unit
, r
, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2637 *exit_status
= EXIT_OOM_ADJUST
;
2638 *error_message
= strdup("Failed to adjust OOM setting");
2643 if (context
->nice_set
)
2644 if (setpriority(PRIO_PROCESS
, 0, context
->nice
) < 0) {
2645 *exit_status
= EXIT_NICE
;
2646 *error_message
= strdup("Failed to set up process scheduling priority (nice level)");
2650 if (context
->cpu_sched_set
) {
2651 struct sched_param param
= {
2652 .sched_priority
= context
->cpu_sched_priority
,
2655 r
= sched_setscheduler(0,
2656 context
->cpu_sched_policy
|
2657 (context
->cpu_sched_reset_on_fork
?
2658 SCHED_RESET_ON_FORK
: 0),
2661 *exit_status
= EXIT_SETSCHEDULER
;
2662 *error_message
= strdup("Failed to set up CPU scheduling");
2667 if (context
->cpuset
)
2668 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context
->cpuset_ncpus
), context
->cpuset
) < 0) {
2669 *exit_status
= EXIT_CPUAFFINITY
;
2670 *error_message
= strdup("Failed to set up CPU affinity");
2674 if (context
->ioprio_set
)
2675 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
2676 *exit_status
= EXIT_IOPRIO
;
2677 *error_message
= strdup("Failed to set up IO scheduling priority");
2681 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
2682 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
2683 *exit_status
= EXIT_TIMERSLACK
;
2684 *error_message
= strdup("Failed to set up timer slack");
2688 if (context
->personality
!= PERSONALITY_INVALID
) {
2689 r
= safe_personality(context
->personality
);
2691 *exit_status
= EXIT_PERSONALITY
;
2692 *error_message
= strdup("Failed to set up execution domain (personality)");
2697 if (context
->utmp_id
)
2698 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
2700 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
2701 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
2705 if (context
->user
) {
2706 r
= chown_terminal(STDIN_FILENO
, uid
);
2708 *exit_status
= EXIT_STDIN
;
2709 *error_message
= strdup("Failed to change ownership of terminal");
2714 /* If delegation is enabled we'll pass ownership of the cgroup
2715 * (but only in systemd's own controller hierarchy!) to the
2716 * user of the new process. */
2717 if (params
->cgroup_path
&& context
->user
&& (params
->flags
& EXEC_CGROUP_DELEGATE
)) {
2718 r
= cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, 0644, uid
, gid
);
2720 *exit_status
= EXIT_CGROUP
;
2721 *error_message
= strdup("Failed to adjust control group access");
2726 r
= cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, 0755, uid
, gid
);
2728 *exit_status
= EXIT_CGROUP
;
2729 *error_message
= strdup("Failed to adjust control group access");
2734 for (dt
= 0; dt
< _EXEC_DIRECTORY_MAX
; dt
++) {
2735 r
= setup_exec_directory(context
, params
, uid
, gid
, dt
, exit_status
);
2737 *error_message
= strdup("Failed to set up special execution directory");
2742 r
= build_environment(
2754 *exit_status
= EXIT_MEMORY
;
2758 r
= build_pass_environment(context
, &pass_env
);
2760 *exit_status
= EXIT_MEMORY
;
2764 accum_env
= strv_env_merge(5,
2765 params
->environment
,
2768 context
->environment
,
2772 *exit_status
= EXIT_MEMORY
;
2775 accum_env
= strv_env_clean(accum_env
);
2777 (void) umask(context
->umask
);
2779 r
= setup_keyring(unit
, context
, params
, uid
, gid
);
2781 *exit_status
= EXIT_KEYRING
;
2782 *error_message
= strdup("Failed to set up kernel keyring");
2786 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
2787 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2789 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
2790 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
2792 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
2793 if (needs_ambient_hack
)
2794 needs_setuid
= false;
2796 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
2798 if (needs_sandboxing
) {
2799 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
2800 * present. The actual MAC context application will happen later, as late as possible, to avoid
2801 * impacting our own code paths. */
2804 use_selinux
= mac_selinux_use();
2807 use_smack
= mac_smack_use();
2809 #ifdef HAVE_APPARMOR
2810 use_apparmor
= mac_apparmor_use();
2815 if (context
->pam_name
&& username
) {
2816 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
2818 *exit_status
= EXIT_PAM
;
2819 *error_message
= strdup("Failed to set up PAM session");
2825 if (context
->private_network
&& runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
2826 r
= setup_netns(runtime
->netns_storage_socket
);
2828 *exit_status
= EXIT_NETWORK
;
2829 *error_message
= strdup("Failed to set up network namespacing");
2834 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
2835 if (needs_mount_namespace
) {
2836 r
= apply_mount_namespace(unit
, command
, context
, params
, runtime
);
2838 *exit_status
= EXIT_NAMESPACE
;
2839 *error_message
= strdup("Failed to set up mount namespacing");
2844 /* Apply just after mount namespace setup */
2845 r
= apply_working_directory(context
, params
, home
, needs_mount_namespace
, exit_status
);
2847 *error_message
= strdup("Changing to the requested working directory failed");
2851 /* Drop groups as early as possible */
2853 r
= enforce_groups(context
, gid
, supplementary_gids
, ngids
);
2855 *error_message
= strdup("Changing group credentials failed");
2856 *exit_status
= EXIT_GROUP
;
2861 if (needs_sandboxing
) {
2863 if (use_selinux
&& params
->selinux_context_net
&& socket_fd
>= 0) {
2864 r
= mac_selinux_get_child_mls_label(socket_fd
, command
->path
, context
->selinux_context
, &mac_selinux_context_net
);
2866 *error_message
= strdup("Failed to determine SELinux context");
2867 *exit_status
= EXIT_SELINUX_CONTEXT
;
2873 if (context
->private_users
) {
2874 r
= setup_private_users(uid
, gid
);
2876 *error_message
= strdup("Failed to set up user namespacing");
2877 *exit_status
= EXIT_USER
;
2883 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
2884 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
2885 * was needed to upload the policy and can now be closed as well. */
2886 r
= close_all_fds(fds
, n_fds
);
2888 r
= shift_fds(fds
, n_fds
);
2890 r
= flags_fds(fds
, n_storage_fds
, n_socket_fds
, context
->non_blocking
);
2892 *error_message
= strdup("Failed to adjust passed file descriptors");
2893 *exit_status
= EXIT_FDS
;
2897 secure_bits
= context
->secure_bits
;
2899 if (needs_sandboxing
) {
2902 for (i
= 0; i
< _RLIMIT_MAX
; i
++) {
2904 if (!context
->rlimit
[i
])
2907 r
= setrlimit_closest(i
, context
->rlimit
[i
]);
2909 *error_message
= strdup("Failed to adjust resource limits");
2910 *exit_status
= EXIT_LIMITS
;
2915 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
2916 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
2917 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
2918 *error_message
= strdup("Failed to adjust RLIMIT_RTPRIO resource limit");
2919 *exit_status
= EXIT_LIMITS
;
2924 bset
= context
->capability_bounding_set
;
2925 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
2926 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
2927 * instead of us doing that */
2928 if (needs_ambient_hack
)
2929 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
2930 (UINT64_C(1) << CAP_SETUID
) |
2931 (UINT64_C(1) << CAP_SETGID
);
2933 if (!cap_test_all(bset
)) {
2934 r
= capability_bounding_set_drop(bset
, false);
2936 *exit_status
= EXIT_CAPABILITIES
;
2937 *error_message
= strdup("Failed to drop capabilities");
2942 /* This is done before enforce_user, but ambient set
2943 * does not survive over setresuid() if keep_caps is not set. */
2944 if (!needs_ambient_hack
&&
2945 context
->capability_ambient_set
!= 0) {
2946 r
= capability_ambient_set_apply(context
->capability_ambient_set
, true);
2948 *exit_status
= EXIT_CAPABILITIES
;
2949 *error_message
= strdup("Failed to apply ambient capabilities (before UID change)");
2956 if (context
->user
) {
2957 r
= enforce_user(context
, uid
);
2959 *exit_status
= EXIT_USER
;
2960 (void) asprintf(error_message
, "Failed to change UID to "UID_FMT
, uid
);
2964 if (!needs_ambient_hack
&&
2965 context
->capability_ambient_set
!= 0) {
2967 /* Fix the ambient capabilities after user change. */
2968 r
= capability_ambient_set_apply(context
->capability_ambient_set
, false);
2970 *exit_status
= EXIT_CAPABILITIES
;
2971 *error_message
= strdup("Failed to apply ambient capabilities (after UID change)");
2975 /* If we were asked to change user and ambient capabilities
2976 * were requested, we had to add keep-caps to the securebits
2977 * so that we would maintain the inherited capability set
2978 * through the setresuid(). Make sure that the bit is added
2979 * also to the context secure_bits so that we don't try to
2980 * drop the bit away next. */
2982 secure_bits
|= 1<<SECURE_KEEP_CAPS
;
2987 if (needs_sandboxing
) {
2988 /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
2989 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
2990 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
2991 * are restricted. */
2995 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
2998 r
= setexeccon(exec_context
);
3000 *exit_status
= EXIT_SELINUX_CONTEXT
;
3001 (void) asprintf(error_message
, "Failed to change SELinux context to %s", exec_context
);
3010 r
= setup_smack(context
, command
);
3012 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
3013 *error_message
= strdup("Failed to set SMACK process label");
3019 #ifdef HAVE_APPARMOR
3020 if (use_apparmor
&& context
->apparmor_profile
) {
3021 r
= aa_change_onexec(context
->apparmor_profile
);
3022 if (r
< 0 && !context
->apparmor_profile_ignore
) {
3023 *exit_status
= EXIT_APPARMOR_PROFILE
;
3024 (void) asprintf(error_message
,
3025 "Failed to prepare AppArmor profile change to %s",
3026 context
->apparmor_profile
);
3032 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3033 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3034 if (prctl(PR_GET_SECUREBITS
) != secure_bits
)
3035 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
3036 *exit_status
= EXIT_SECUREBITS
;
3037 *error_message
= strdup("Failed to set process secure bits");
3041 if (context_has_no_new_privileges(context
))
3042 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
3043 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
3044 *error_message
= strdup("Failed to disable new privileges");
3049 r
= apply_address_families(unit
, context
);
3051 *exit_status
= EXIT_ADDRESS_FAMILIES
;
3052 *error_message
= strdup("Failed to restrict address families");
3056 r
= apply_memory_deny_write_execute(unit
, context
);
3058 *exit_status
= EXIT_SECCOMP
;
3059 *error_message
= strdup("Failed to disable writing to executable memory");
3063 r
= apply_restrict_realtime(unit
, context
);
3065 *exit_status
= EXIT_SECCOMP
;
3066 *error_message
= strdup("Failed to apply realtime restrictions");
3070 r
= apply_restrict_namespaces(unit
, context
);
3072 *exit_status
= EXIT_SECCOMP
;
3073 *error_message
= strdup("Failed to apply namespace restrictions");
3077 r
= apply_protect_sysctl(unit
, context
);
3079 *exit_status
= EXIT_SECCOMP
;
3080 *error_message
= strdup("Failed to apply sysctl restrictions");
3084 r
= apply_protect_kernel_modules(unit
, context
);
3086 *exit_status
= EXIT_SECCOMP
;
3087 *error_message
= strdup("Failed to apply module loading restrictions");
3091 r
= apply_private_devices(unit
, context
);
3093 *exit_status
= EXIT_SECCOMP
;
3094 *error_message
= strdup("Failed to set up private devices");
3098 r
= apply_syscall_archs(unit
, context
);
3100 *exit_status
= EXIT_SECCOMP
;
3101 *error_message
= strdup("Failed to apply syscall architecture restrictions");
3105 r
= apply_lock_personality(unit
, context
);
3107 *exit_status
= EXIT_SECCOMP
;
3108 *error_message
= strdup("Failed to lock personalities");
3112 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3113 * by the filter as little as possible. */
3114 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
3116 *exit_status
= EXIT_SECCOMP
;
3117 *error_message
= strdup("Failed to apply system call filters");
3123 if (!strv_isempty(context
->unset_environment
)) {
3126 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
3128 *exit_status
= EXIT_MEMORY
;
3132 strv_free(accum_env
);
3136 final_argv
= replace_env_argv(argv
, accum_env
);
3138 *exit_status
= EXIT_MEMORY
;
3139 *error_message
= strdup("Failed to prepare process arguments");
3143 if (_unlikely_(log_get_max_level() >= LOG_DEBUG
)) {
3144 _cleanup_free_
char *line
;
3146 line
= exec_command_line(final_argv
);
3149 log_struct(LOG_DEBUG
,
3150 "EXECUTABLE=%s", command
->path
,
3151 LOG_UNIT_MESSAGE(unit
, "Executing: %s", line
),
3158 execve(command
->path
, final_argv
, accum_env
);
3159 *exit_status
= EXIT_EXEC
;
3163 int exec_spawn(Unit
*unit
,
3164 ExecCommand
*command
,
3165 const ExecContext
*context
,
3166 const ExecParameters
*params
,
3167 ExecRuntime
*runtime
,
3168 DynamicCreds
*dcreds
,
3171 _cleanup_strv_free_
char **files_env
= NULL
;
3173 unsigned n_storage_fds
= 0, n_socket_fds
= 0;
3174 _cleanup_free_
char *line
= NULL
;
3176 int named_iofds
[3] = { -1, -1, -1 };
3185 assert(params
->fds
|| (params
->n_storage_fds
+ params
->n_socket_fds
<= 0));
3187 if (context
->std_input
== EXEC_INPUT_SOCKET
||
3188 context
->std_output
== EXEC_OUTPUT_SOCKET
||
3189 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
3191 if (params
->n_socket_fds
> 1) {
3192 log_unit_error(unit
, "Got more than one socket.");
3196 if (params
->n_socket_fds
== 0) {
3197 log_unit_error(unit
, "Got no socket.");
3201 socket_fd
= params
->fds
[0];
3205 n_storage_fds
= params
->n_storage_fds
;
3206 n_socket_fds
= params
->n_socket_fds
;
3209 r
= exec_context_named_iofds(unit
, context
, params
, named_iofds
);
3211 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
3213 r
= exec_context_load_environment(unit
, context
, &files_env
);
3215 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
3217 argv
= params
->argv
?: command
->argv
;
3218 line
= exec_command_line(argv
);
3222 log_struct(LOG_DEBUG
,
3223 LOG_UNIT_MESSAGE(unit
, "About to execute: %s", line
),
3224 "EXECUTABLE=%s", command
->path
,
3229 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
3233 _cleanup_free_
char *error_message
= NULL
;
3235 r
= exec_child(unit
,
3248 unit
->manager
->user_lookup_fds
[1],
3254 log_struct_errno(LOG_ERR
, r
,
3255 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3257 LOG_UNIT_MESSAGE(unit
, "%s: %m",
3259 "EXECUTABLE=%s", command
->path
,
3261 else if (r
== -ENOENT
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
))
3262 log_struct_errno(LOG_INFO
, r
,
3263 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3265 LOG_UNIT_MESSAGE(unit
, "Skipped spawning %s: %m",
3267 "EXECUTABLE=%s", command
->path
,
3270 log_struct_errno(LOG_ERR
, r
,
3271 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3273 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
3274 exit_status_to_string(exit_status
, EXIT_STATUS_SYSTEMD
),
3276 "EXECUTABLE=%s", command
->path
,
3283 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
3285 /* We add the new process to the cgroup both in the child (so
3286 * that we can be sure that no user code is ever executed
3287 * outside of the cgroup) and in the parent (so that we can be
3288 * sure that when we kill the cgroup the process will be
3290 if (params
->cgroup_path
)
3291 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, pid
);
3293 exec_status_start(&command
->exec_status
, pid
);
3299 void exec_context_init(ExecContext
*c
) {
3300 ExecDirectoryType i
;
3305 c
->ioprio
= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 0);
3306 c
->cpu_sched_policy
= SCHED_OTHER
;
3307 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
3308 c
->syslog_level_prefix
= true;
3309 c
->ignore_sigpipe
= true;
3310 c
->timer_slack_nsec
= NSEC_INFINITY
;
3311 c
->personality
= PERSONALITY_INVALID
;
3312 for (i
= 0; i
< _EXEC_DIRECTORY_MAX
; i
++)
3313 c
->directories
[i
].mode
= 0755;
3314 c
->capability_bounding_set
= CAP_ALL
;
3315 c
->restrict_namespaces
= NAMESPACE_FLAGS_ALL
;
3318 void exec_context_done(ExecContext
*c
) {
3320 ExecDirectoryType i
;
3324 c
->environment
= strv_free(c
->environment
);
3325 c
->environment_files
= strv_free(c
->environment_files
);
3326 c
->pass_environment
= strv_free(c
->pass_environment
);
3327 c
->unset_environment
= strv_free(c
->unset_environment
);
3329 for (l
= 0; l
< ELEMENTSOF(c
->rlimit
); l
++)
3330 c
->rlimit
[l
] = mfree(c
->rlimit
[l
]);
3332 for (l
= 0; l
< 3; l
++)
3333 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
3335 c
->working_directory
= mfree(c
->working_directory
);
3336 c
->root_directory
= mfree(c
->root_directory
);
3337 c
->root_image
= mfree(c
->root_image
);
3338 c
->tty_path
= mfree(c
->tty_path
);
3339 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
3340 c
->user
= mfree(c
->user
);
3341 c
->group
= mfree(c
->group
);
3343 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
3345 c
->pam_name
= mfree(c
->pam_name
);
3347 c
->read_only_paths
= strv_free(c
->read_only_paths
);
3348 c
->read_write_paths
= strv_free(c
->read_write_paths
);
3349 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
3351 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
3354 CPU_FREE(c
->cpuset
);
3356 c
->utmp_id
= mfree(c
->utmp_id
);
3357 c
->selinux_context
= mfree(c
->selinux_context
);
3358 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
3359 c
->smack_process_label
= mfree(c
->smack_process_label
);
3361 c
->syscall_filter
= set_free(c
->syscall_filter
);
3362 c
->syscall_archs
= set_free(c
->syscall_archs
);
3363 c
->address_families
= set_free(c
->address_families
);
3365 for (i
= 0; i
< _EXEC_DIRECTORY_MAX
; i
++)
3366 c
->directories
[i
].paths
= strv_free(c
->directories
[i
].paths
);
3369 int exec_context_destroy_runtime_directory(ExecContext
*c
, const char *runtime_prefix
) {
3374 if (!runtime_prefix
)
3377 STRV_FOREACH(i
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) {
3378 _cleanup_free_
char *p
;
3380 p
= strjoin(runtime_prefix
, "/", *i
);
3384 /* We execute this synchronously, since we need to be
3385 * sure this is gone when we start the service
3387 (void) rm_rf(p
, REMOVE_ROOT
);
3393 void exec_command_done(ExecCommand
*c
) {
3396 c
->path
= mfree(c
->path
);
3398 c
->argv
= strv_free(c
->argv
);
3401 void exec_command_done_array(ExecCommand
*c
, unsigned n
) {
3404 for (i
= 0; i
< n
; i
++)
3405 exec_command_done(c
+i
);
3408 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
3412 LIST_REMOVE(command
, c
, i
);
3413 exec_command_done(i
);
3420 void exec_command_free_array(ExecCommand
**c
, unsigned n
) {
3423 for (i
= 0; i
< n
; i
++)
3424 c
[i
] = exec_command_free_list(c
[i
]);
3427 typedef struct InvalidEnvInfo
{
3432 static void invalid_env(const char *p
, void *userdata
) {
3433 InvalidEnvInfo
*info
= userdata
;
3435 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
3438 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
3443 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
3445 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
3447 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
3449 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
3451 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
3453 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
3459 int exec_context_named_iofds(Unit
*unit
, const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[3]) {
3460 unsigned i
, targets
;
3461 const char* stdio_fdname
[3];
3467 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
3468 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
3469 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
3471 for (i
= 0; i
< 3; i
++)
3472 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
3474 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
3476 for (i
= 0; i
< n_fds
&& targets
> 0; i
++)
3477 if (named_iofds
[STDIN_FILENO
] < 0 &&
3478 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
3479 stdio_fdname
[STDIN_FILENO
] &&
3480 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
3482 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
3485 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
3486 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
3487 stdio_fdname
[STDOUT_FILENO
] &&
3488 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
3490 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
3493 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
3494 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
3495 stdio_fdname
[STDERR_FILENO
] &&
3496 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
3498 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
3502 return targets
== 0 ? 0 : -ENOENT
;
3505 int exec_context_load_environment(Unit
*unit
, const ExecContext
*c
, char ***l
) {
3506 char **i
, **r
= NULL
;
3511 STRV_FOREACH(i
, c
->environment_files
) {
3515 bool ignore
= false;
3517 _cleanup_globfree_ glob_t pglob
= {};
3526 if (!path_is_absolute(fn
)) {
3534 /* Filename supports globbing, take all matching files */
3535 k
= safe_glob(fn
, 0, &pglob
);
3544 /* When we don't match anything, -ENOENT should be returned */
3545 assert(pglob
.gl_pathc
> 0);
3547 for (n
= 0; n
< pglob
.gl_pathc
; n
++) {
3548 k
= load_env_file(NULL
, pglob
.gl_pathv
[n
], NULL
, &p
);
3556 /* Log invalid environment variables with filename */
3558 InvalidEnvInfo info
= {
3560 .path
= pglob
.gl_pathv
[n
]
3563 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
3571 m
= strv_env_merge(2, r
, p
);
3587 static bool tty_may_match_dev_console(const char *tty
) {
3588 _cleanup_free_
char *active
= NULL
;
3594 tty
= skip_dev_prefix(tty
);
3596 /* trivial identity? */
3597 if (streq(tty
, "console"))
3600 console
= resolve_dev_console(&active
);
3601 /* if we could not resolve, assume it may */
3605 /* "tty0" means the active VC, so it may be the same sometimes */
3606 return streq(console
, tty
) || (streq(console
, "tty0") && tty_is_vc(tty
));
3609 bool exec_context_may_touch_console(ExecContext
*ec
) {
3611 return (ec
->tty_reset
||
3613 ec
->tty_vt_disallocate
||
3614 is_terminal_input(ec
->std_input
) ||
3615 is_terminal_output(ec
->std_output
) ||
3616 is_terminal_output(ec
->std_error
)) &&
3617 tty_may_match_dev_console(exec_context_tty_path(ec
));
/* Writes every string of the NULL-terminated string vector l to f, each one preceded by a single space.
 * A NULL vector is treated like an empty one; nothing is written in that case. No trailing newline is added. */
static void strv_fprintf(FILE *f, char **l) {
        char **g;

        assert(f);

        if (!l)
                return;

        for (g = l; *g; g++)
                fprintf(f, " %s", *g);
}
3629 void exec_context_dump(ExecContext
*c
, FILE* f
, const char *prefix
) {
3632 ExecDirectoryType dt
;
3638 prefix
= strempty(prefix
);
3642 "%sWorkingDirectory: %s\n"
3643 "%sRootDirectory: %s\n"
3644 "%sNonBlocking: %s\n"
3645 "%sPrivateTmp: %s\n"
3646 "%sPrivateDevices: %s\n"
3647 "%sProtectKernelTunables: %s\n"
3648 "%sProtectKernelModules: %s\n"
3649 "%sProtectControlGroups: %s\n"
3650 "%sPrivateNetwork: %s\n"
3651 "%sPrivateUsers: %s\n"
3652 "%sProtectHome: %s\n"
3653 "%sProtectSystem: %s\n"
3654 "%sMountAPIVFS: %s\n"
3655 "%sIgnoreSIGPIPE: %s\n"
3656 "%sMemoryDenyWriteExecute: %s\n"
3657 "%sRestrictRealtime: %s\n"
3658 "%sKeyringMode: %s\n",
3660 prefix
, c
->working_directory
? c
->working_directory
: "/",
3661 prefix
, c
->root_directory
? c
->root_directory
: "/",
3662 prefix
, yes_no(c
->non_blocking
),
3663 prefix
, yes_no(c
->private_tmp
),
3664 prefix
, yes_no(c
->private_devices
),
3665 prefix
, yes_no(c
->protect_kernel_tunables
),
3666 prefix
, yes_no(c
->protect_kernel_modules
),
3667 prefix
, yes_no(c
->protect_control_groups
),
3668 prefix
, yes_no(c
->private_network
),
3669 prefix
, yes_no(c
->private_users
),
3670 prefix
, protect_home_to_string(c
->protect_home
),
3671 prefix
, protect_system_to_string(c
->protect_system
),
3672 prefix
, yes_no(c
->mount_apivfs
),
3673 prefix
, yes_no(c
->ignore_sigpipe
),
3674 prefix
, yes_no(c
->memory_deny_write_execute
),
3675 prefix
, yes_no(c
->restrict_realtime
),
3676 prefix
, exec_keyring_mode_to_string(c
->keyring_mode
));
3679 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
3681 STRV_FOREACH(e
, c
->environment
)
3682 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
3684 STRV_FOREACH(e
, c
->environment_files
)
3685 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
3687 STRV_FOREACH(e
, c
->pass_environment
)
3688 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
3690 STRV_FOREACH(e
, c
->unset_environment
)
3691 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
3693 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
3695 for (dt
= 0; dt
< _EXEC_DIRECTORY_MAX
; dt
++) {
3696 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
3698 STRV_FOREACH(d
, c
->directories
[dt
].paths
)
3699 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), *d
);
3707 if (c
->oom_score_adjust_set
)
3709 "%sOOMScoreAdjust: %i\n",
3710 prefix
, c
->oom_score_adjust
);
3712 for (i
= 0; i
< RLIM_NLIMITS
; i
++)
3714 fprintf(f
, "%s%s: " RLIM_FMT
"\n",
3715 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
3716 fprintf(f
, "%s%sSoft: " RLIM_FMT
"\n",
3717 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
3720 if (c
->ioprio_set
) {
3721 _cleanup_free_
char *class_str
= NULL
;
3723 r
= ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c
->ioprio
), &class_str
);
3725 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
3727 fprintf(f
, "%sIOPriority: %lu\n", prefix
, IOPRIO_PRIO_DATA(c
->ioprio
));
3730 if (c
->cpu_sched_set
) {
3731 _cleanup_free_
char *policy_str
= NULL
;
3733 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
3735 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
3738 "%sCPUSchedulingPriority: %i\n"
3739 "%sCPUSchedulingResetOnFork: %s\n",
3740 prefix
, c
->cpu_sched_priority
,
3741 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
3745 fprintf(f
, "%sCPUAffinity:", prefix
);
3746 for (i
= 0; i
< c
->cpuset_ncpus
; i
++)
3747 if (CPU_ISSET_S(i
, CPU_ALLOC_SIZE(c
->cpuset_ncpus
), c
->cpuset
))
3748 fprintf(f
, " %u", i
);
3752 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
3753 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
3756 "%sStandardInput: %s\n"
3757 "%sStandardOutput: %s\n"
3758 "%sStandardError: %s\n",
3759 prefix
, exec_input_to_string(c
->std_input
),
3760 prefix
, exec_output_to_string(c
->std_output
),
3761 prefix
, exec_output_to_string(c
->std_error
));
3767 "%sTTYVHangup: %s\n"
3768 "%sTTYVTDisallocate: %s\n",
3769 prefix
, c
->tty_path
,
3770 prefix
, yes_no(c
->tty_reset
),
3771 prefix
, yes_no(c
->tty_vhangup
),
3772 prefix
, yes_no(c
->tty_vt_disallocate
));
3774 if (IN_SET(c
->std_output
,
3777 EXEC_OUTPUT_JOURNAL
,
3778 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
3779 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
3780 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
3781 IN_SET(c
->std_error
,
3784 EXEC_OUTPUT_JOURNAL
,
3785 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
3786 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
3787 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
3789 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
3791 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
3793 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
3795 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
3797 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
3800 if (c
->secure_bits
) {
3801 _cleanup_free_
char *str
= NULL
;
3803 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
3805 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
3808 if (c
->capability_bounding_set
!= CAP_ALL
) {
3809 _cleanup_free_
char *str
= NULL
;
3811 r
= capability_set_to_string_alloc(c
->capability_bounding_set
, &str
);
3813 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
3816 if (c
->capability_ambient_set
!= 0) {
3817 _cleanup_free_
char *str
= NULL
;
3819 r
= capability_set_to_string_alloc(c
->capability_ambient_set
, &str
);
3821 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
3825 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
3827 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
3829 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
3831 if (strv_length(c
->supplementary_groups
) > 0) {
3832 fprintf(f
, "%sSupplementaryGroups:", prefix
);
3833 strv_fprintf(f
, c
->supplementary_groups
);
3838 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
3840 if (strv_length(c
->read_write_paths
) > 0) {
3841 fprintf(f
, "%sReadWritePaths:", prefix
);
3842 strv_fprintf(f
, c
->read_write_paths
);
3846 if (strv_length(c
->read_only_paths
) > 0) {
3847 fprintf(f
, "%sReadOnlyPaths:", prefix
);
3848 strv_fprintf(f
, c
->read_only_paths
);
3852 if (strv_length(c
->inaccessible_paths
) > 0) {
3853 fprintf(f
, "%sInaccessiblePaths:", prefix
);
3854 strv_fprintf(f
, c
->inaccessible_paths
);
3858 if (c
->n_bind_mounts
> 0)
3859 for (i
= 0; i
< c
->n_bind_mounts
; i
++) {
3860 fprintf(f
, "%s%s: %s:%s:%s\n", prefix
,
3861 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
3862 c
->bind_mounts
[i
].source
,
3863 c
->bind_mounts
[i
].destination
,
3864 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
3869 "%sUtmpIdentifier: %s\n",
3870 prefix
, c
->utmp_id
);
3872 if (c
->selinux_context
)
3874 "%sSELinuxContext: %s%s\n",
3875 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
3877 if (c
->apparmor_profile
)
3879 "%sAppArmorProfile: %s%s\n",
3880 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
3882 if (c
->smack_process_label
)
3884 "%sSmackProcessLabel: %s%s\n",
3885 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
3887 if (c
->personality
!= PERSONALITY_INVALID
)
3889 "%sPersonality: %s\n",
3890 prefix
, strna(personality_to_string(c
->personality
)));
3893 "%sLockPersonality: %s\n",
3894 prefix
, yes_no(c
->lock_personality
));
3896 if (c
->syscall_filter
) {
3904 "%sSystemCallFilter: ",
3907 if (!c
->syscall_whitelist
)
3911 SET_FOREACH(id
, c
->syscall_filter
, j
) {
3912 _cleanup_free_
char *name
= NULL
;
3919 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
3920 fputs(strna(name
), f
);
3927 if (c
->syscall_archs
) {
3934 "%sSystemCallArchitectures:",
3938 SET_FOREACH(id
, c
->syscall_archs
, j
)
3939 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
3944 if (exec_context_restrict_namespaces_set(c
)) {
3945 _cleanup_free_
char *s
= NULL
;
3947 r
= namespace_flag_to_string_many(c
->restrict_namespaces
, &s
);
3949 fprintf(f
, "%sRestrictNamespaces: %s\n",
3953 if (c
->syscall_errno
> 0)
3955 "%sSystemCallErrorNumber: %s\n",
3956 prefix
, strna(errno_to_name(c
->syscall_errno
)));
3958 if (c
->apparmor_profile
)
3960 "%sAppArmorProfile: %s%s\n",
3961 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
3964 bool exec_context_maintains_privileges(ExecContext
*c
) {
3967 /* Returns true if the process forked off would run under
3968 * an unchanged UID or as root. */
3973 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
3979 int exec_context_get_effective_ioprio(ExecContext
*c
) {
3987 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
3989 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 4);
3994 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
3999 dual_timestamp_get(&s
->start_timestamp
);
4002 void exec_status_exit(ExecStatus
*s
, ExecContext
*context
, pid_t pid
, int code
, int status
) {
4005 if (s
->pid
&& s
->pid
!= pid
)
4009 dual_timestamp_get(&s
->exit_timestamp
);
4015 if (context
->utmp_id
)
4016 utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
4018 exec_context_tty_reset(context
, NULL
);
4022 void exec_status_dump(ExecStatus
*s
, FILE *f
, const char *prefix
) {
4023 char buf
[FORMAT_TIMESTAMP_MAX
];
4031 prefix
= strempty(prefix
);
4034 "%sPID: "PID_FMT
"\n",
4037 if (dual_timestamp_is_set(&s
->start_timestamp
))
4039 "%sStart Timestamp: %s\n",
4040 prefix
, format_timestamp(buf
, sizeof(buf
), s
->start_timestamp
.realtime
));
4042 if (dual_timestamp_is_set(&s
->exit_timestamp
))
4044 "%sExit Timestamp: %s\n"
4046 "%sExit Status: %i\n",
4047 prefix
, format_timestamp(buf
, sizeof(buf
), s
->exit_timestamp
.realtime
),
4048 prefix
, sigchld_code_to_string(s
->code
),
4052 char *exec_command_line(char **argv
) {
4060 STRV_FOREACH(a
, argv
)
4068 STRV_FOREACH(a
, argv
) {
4075 if (strpbrk(*a
, WHITESPACE
)) {
4086 /* FIXME: this doesn't really handle arguments that have
4087 * spaces and ticks in them */
4092 void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4093 _cleanup_free_
char *cmd
= NULL
;
4094 const char *prefix2
;
4099 prefix
= strempty(prefix
);
4100 prefix2
= strjoina(prefix
, "\t");
4102 cmd
= exec_command_line(c
->argv
);
4104 "%sCommand Line: %s\n",
4105 prefix
, cmd
? cmd
: strerror(ENOMEM
));
4107 exec_status_dump(&c
->exec_status
, f
, prefix2
);
4110 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4113 prefix
= strempty(prefix
);
4115 LIST_FOREACH(command
, c
, c
)
4116 exec_command_dump(c
, f
, prefix
);
4119 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
4126 /* It's kind of important, that we keep the order here */
4127 LIST_FIND_TAIL(command
, *l
, end
);
4128 LIST_INSERT_AFTER(command
, *l
, end
, e
);
4133 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
4141 l
= strv_new_ap(path
, ap
);
4162 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
4163 _cleanup_strv_free_
char **l
= NULL
;
4171 l
= strv_new_ap(path
, ap
);
4177 r
= strv_extend_strv(&c
->argv
, l
, false);
4185 static int exec_runtime_allocate(ExecRuntime
**rt
) {
4190 *rt
= new0(ExecRuntime
, 1);
4195 (*rt
)->netns_storage_socket
[0] = (*rt
)->netns_storage_socket
[1] = -1;
4200 int exec_runtime_make(ExecRuntime
**rt
, ExecContext
*c
, const char *id
) {
4210 if (!c
->private_network
&& !c
->private_tmp
)
4213 r
= exec_runtime_allocate(rt
);
4217 if (c
->private_network
&& (*rt
)->netns_storage_socket
[0] < 0) {
4218 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, (*rt
)->netns_storage_socket
) < 0)
4222 if (c
->private_tmp
&& !(*rt
)->tmp_dir
) {
4223 r
= setup_tmp_dirs(id
, &(*rt
)->tmp_dir
, &(*rt
)->var_tmp_dir
);
4231 ExecRuntime
*exec_runtime_ref(ExecRuntime
*r
) {
4233 assert(r
->n_ref
> 0);
4239 ExecRuntime
*exec_runtime_unref(ExecRuntime
*r
) {
4244 assert(r
->n_ref
> 0);
4251 free(r
->var_tmp_dir
);
4252 safe_close_pair(r
->netns_storage_socket
);
4256 int exec_runtime_serialize(Unit
*u
, ExecRuntime
*rt
, FILE *f
, FDSet
*fds
) {
4265 unit_serialize_item(u
, f
, "tmp-dir", rt
->tmp_dir
);
4267 if (rt
->var_tmp_dir
)
4268 unit_serialize_item(u
, f
, "var-tmp-dir", rt
->var_tmp_dir
);
4270 if (rt
->netns_storage_socket
[0] >= 0) {
4273 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
4277 unit_serialize_item_format(u
, f
, "netns-socket-0", "%i", copy
);
4280 if (rt
->netns_storage_socket
[1] >= 0) {
4283 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
4287 unit_serialize_item_format(u
, f
, "netns-socket-1", "%i", copy
);
4293 int exec_runtime_deserialize_item(Unit
*u
, ExecRuntime
**rt
, const char *key
, const char *value
, FDSet
*fds
) {
4300 if (streq(key
, "tmp-dir")) {
4303 r
= exec_runtime_allocate(rt
);
4307 copy
= strdup(value
);
4311 free((*rt
)->tmp_dir
);
4312 (*rt
)->tmp_dir
= copy
;
4314 } else if (streq(key
, "var-tmp-dir")) {
4317 r
= exec_runtime_allocate(rt
);
4321 copy
= strdup(value
);
4325 free((*rt
)->var_tmp_dir
);
4326 (*rt
)->var_tmp_dir
= copy
;
4328 } else if (streq(key
, "netns-socket-0")) {
4331 r
= exec_runtime_allocate(rt
);
4335 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
))
4336 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
4338 safe_close((*rt
)->netns_storage_socket
[0]);
4339 (*rt
)->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
4341 } else if (streq(key
, "netns-socket-1")) {
4344 r
= exec_runtime_allocate(rt
);
4348 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
))
4349 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
4351 safe_close((*rt
)->netns_storage_socket
[1]);
4352 (*rt
)->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
4360 static void *remove_tmpdir_thread(void *p
) {
4361 _cleanup_free_
char *path
= p
;
4363 (void) rm_rf(path
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
4367 void exec_runtime_destroy(ExecRuntime
*rt
) {
4373 /* If there are multiple users of this, let's leave the stuff around */
4378 log_debug("Spawning thread to nuke %s", rt
->tmp_dir
);
4380 r
= asynchronous_job(remove_tmpdir_thread
, rt
->tmp_dir
);
4382 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->tmp_dir
);
4389 if (rt
->var_tmp_dir
) {
4390 log_debug("Spawning thread to nuke %s", rt
->var_tmp_dir
);
4392 r
= asynchronous_job(remove_tmpdir_thread
, rt
->var_tmp_dir
);
4394 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->var_tmp_dir
);
4395 free(rt
->var_tmp_dir
);
4398 rt
->var_tmp_dir
= NULL
;
4401 safe_close_pair(rt
->netns_storage_socket
);
4404 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
4405 [EXEC_INPUT_NULL
] = "null",
4406 [EXEC_INPUT_TTY
] = "tty",
4407 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
4408 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
4409 [EXEC_INPUT_SOCKET
] = "socket",
4410 [EXEC_INPUT_NAMED_FD
] = "fd",
4413 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
4415 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
4416 [EXEC_OUTPUT_INHERIT
] = "inherit",
4417 [EXEC_OUTPUT_NULL
] = "null",
4418 [EXEC_OUTPUT_TTY
] = "tty",
4419 [EXEC_OUTPUT_SYSLOG
] = "syslog",
4420 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE
] = "syslog+console",
4421 [EXEC_OUTPUT_KMSG
] = "kmsg",
4422 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
4423 [EXEC_OUTPUT_JOURNAL
] = "journal",
4424 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
4425 [EXEC_OUTPUT_SOCKET
] = "socket",
4426 [EXEC_OUTPUT_NAMED_FD
] = "fd",
4429 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
4431 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
4432 [EXEC_UTMP_INIT
] = "init",
4433 [EXEC_UTMP_LOGIN
] = "login",
4434 [EXEC_UTMP_USER
] = "user",
4437 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
4439 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
4440 [EXEC_PRESERVE_NO
] = "no",
4441 [EXEC_PRESERVE_YES
] = "yes",
4442 [EXEC_PRESERVE_RESTART
] = "restart",
4445 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
4447 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_MAX
] = {
4448 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
4449 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
4450 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
4451 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
4452 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
4455 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);
4457 static const char* const exec_keyring_mode_table
[_EXEC_KEYRING_MODE_MAX
] = {
4458 [EXEC_KEYRING_INHERIT
] = "inherit",
4459 [EXEC_KEYRING_PRIVATE
] = "private",
4460 [EXEC_KEYRING_SHARED
] = "shared",
4463 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode
, ExecKeyringMode
);