2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/capability.h>
28 #include <sys/eventfd.h>
30 #include <sys/personality.h>
31 #include <sys/prctl.h>
33 #include <sys/socket.h>
35 #include <sys/types.h>
41 #include <security/pam_appl.h>
45 #include <selinux/selinux.h>
53 #include <sys/apparmor.h>
56 #include "sd-messages.h"
59 #include "alloc-util.h"
61 #include "apparmor-util.h"
66 #include "capability-util.h"
69 #include "errno-list.h"
71 #include "exit-status.h"
74 #include "format-util.h"
76 #include "glob-util.h"
83 #include "namespace.h"
84 #include "parse-util.h"
85 #include "path-util.h"
86 #include "process-util.h"
87 #include "rlimit-util.h"
90 #include "seccomp-util.h"
92 #include "securebits.h"
93 #include "securebits-util.h"
94 #include "selinux-util.h"
95 #include "signal-util.h"
96 #include "smack-util.h"
98 #include "string-table.h"
99 #include "string-util.h"
101 #include "syslog-util.h"
102 #include "terminal-util.h"
104 #include "user-util.h"
106 #include "utmp-wtmp.h"
108 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
109 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
111 /* This assumes there is a 'tty' group */
112 #define TTY_MODE 0620
114 #define SNDBUF_SIZE (8*1024*1024)
116 static int shift_fds(int fds
[], unsigned n_fds
) {
117 int start
, restart_from
;
122 /* Modifies the fds array! (sorts it) */
132 for (i
= start
; i
< (int) n_fds
; i
++) {
135 /* Already at right index? */
139 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
146 /* Hmm, the fd we wanted isn't free? Then
147 * let's remember that and try again from here */
148 if (nfd
!= i
+3 && restart_from
< 0)
152 if (restart_from
< 0)
155 start
= restart_from
;
161 static int flags_fds(const int fds
[], unsigned n_storage_fds
, unsigned n_socket_fds
, bool nonblock
) {
165 n_fds
= n_storage_fds
+ n_socket_fds
;
171 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
172 * O_NONBLOCK only applies to socket activation though. */
174 for (i
= 0; i
< n_fds
; i
++) {
176 if (i
< n_socket_fds
) {
177 r
= fd_nonblock(fds
[i
], nonblock
);
182 /* We unconditionally drop FD_CLOEXEC from the fds,
183 * since after all we want to pass these fds to our children */
186 r
= fd_cloexec(fds
[i
], false);
194 static const char *exec_context_tty_path(const ExecContext
*context
) {
197 if (context
->stdio_as_fds
)
200 if (context
->tty_path
)
201 return context
->tty_path
;
203 return "/dev/console";
206 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
211 path
= exec_context_tty_path(context
);
213 if (context
->tty_vhangup
) {
214 if (p
&& p
->stdin_fd
>= 0)
215 (void) terminal_vhangup_fd(p
->stdin_fd
);
217 (void) terminal_vhangup(path
);
220 if (context
->tty_reset
) {
221 if (p
&& p
->stdin_fd
>= 0)
222 (void) reset_terminal_fd(p
->stdin_fd
, true);
224 (void) reset_terminal(path
);
227 if (context
->tty_vt_disallocate
&& path
)
228 (void) vt_disallocate(path
);
231 static bool is_terminal_input(ExecInput i
) {
234 EXEC_INPUT_TTY_FORCE
,
235 EXEC_INPUT_TTY_FAIL
);
238 static bool is_terminal_output(ExecOutput o
) {
241 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
242 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
243 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
246 static bool is_syslog_output(ExecOutput o
) {
249 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
);
252 static bool is_kmsg_output(ExecOutput o
) {
255 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
258 static bool exec_context_needs_term(const ExecContext
*c
) {
261 /* Return true if the execution context suggests we should set $TERM to something useful. */
263 if (is_terminal_input(c
->std_input
))
266 if (is_terminal_output(c
->std_output
))
269 if (is_terminal_output(c
->std_error
))
272 return !!c
->tty_path
;
275 static int open_null_as(int flags
, int nfd
) {
280 fd
= open("/dev/null", flags
|O_NOCTTY
);
285 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
293 static int connect_journal_socket(int fd
, uid_t uid
, gid_t gid
) {
294 static const union sockaddr_union sa
= {
295 .un
.sun_family
= AF_UNIX
,
296 .un
.sun_path
= "/run/systemd/journal/stdout",
298 uid_t olduid
= UID_INVALID
;
299 gid_t oldgid
= GID_INVALID
;
302 if (gid_is_valid(gid
)) {
305 if (setegid(gid
) < 0)
309 if (uid_is_valid(uid
)) {
312 if (seteuid(uid
) < 0) {
318 r
= connect(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
)) < 0 ? -errno
: 0;
320 /* If we fail to restore the uid or gid, things will likely
321 fail later on. This should only happen if an LSM interferes. */
323 if (uid_is_valid(uid
))
324 (void) seteuid(olduid
);
327 if (gid_is_valid(gid
))
328 (void) setegid(oldgid
);
333 static int connect_logger_as(
335 const ExecContext
*context
,
336 const ExecParameters
*params
,
347 assert(output
< _EXEC_OUTPUT_MAX
);
351 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
355 r
= connect_journal_socket(fd
, uid
, gid
);
359 if (shutdown(fd
, SHUT_RD
) < 0) {
364 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
374 context
->syslog_identifier
?: ident
,
375 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
376 context
->syslog_priority
,
377 !!context
->syslog_level_prefix
,
378 is_syslog_output(output
),
379 is_kmsg_output(output
),
380 is_terminal_output(output
));
385 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
390 static int open_terminal_as(const char *path
, mode_t mode
, int nfd
) {
396 fd
= open_terminal(path
, mode
| O_NOCTTY
);
401 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
409 static int fixup_input(ExecInput std_input
, int socket_fd
, bool apply_tty_stdin
) {
411 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
412 return EXEC_INPUT_NULL
;
414 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
415 return EXEC_INPUT_NULL
;
420 static int fixup_output(ExecOutput std_output
, int socket_fd
) {
422 if (std_output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
423 return EXEC_OUTPUT_INHERIT
;
428 static int setup_input(
429 const ExecContext
*context
,
430 const ExecParameters
*params
,
432 int named_iofds
[3]) {
439 if (params
->stdin_fd
>= 0) {
440 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
443 /* Try to make this the controlling tty, if it is a tty, and reset it */
444 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
445 (void) reset_terminal_fd(STDIN_FILENO
, true);
450 i
= fixup_input(context
->std_input
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
454 case EXEC_INPUT_NULL
:
455 return open_null_as(O_RDONLY
, STDIN_FILENO
);
458 case EXEC_INPUT_TTY_FORCE
:
459 case EXEC_INPUT_TTY_FAIL
: {
462 fd
= acquire_terminal(exec_context_tty_path(context
),
463 i
== EXEC_INPUT_TTY_FAIL
,
464 i
== EXEC_INPUT_TTY_FORCE
,
470 if (fd
!= STDIN_FILENO
) {
471 r
= dup2(fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
479 case EXEC_INPUT_SOCKET
:
480 return dup2(socket_fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
482 case EXEC_INPUT_NAMED_FD
:
483 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
484 return dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
487 assert_not_reached("Unknown input type");
491 static int setup_output(
493 const ExecContext
*context
,
494 const ExecParameters
*params
,
501 dev_t
*journal_stream_dev
,
502 ino_t
*journal_stream_ino
) {
512 assert(journal_stream_dev
);
513 assert(journal_stream_ino
);
515 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
517 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
520 return STDOUT_FILENO
;
523 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
524 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
527 return STDERR_FILENO
;
530 i
= fixup_input(context
->std_input
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
531 o
= fixup_output(context
->std_output
, socket_fd
);
533 if (fileno
== STDERR_FILENO
) {
535 e
= fixup_output(context
->std_error
, socket_fd
);
537 /* This expects that the input and output are already set up */
539 /* Don't change the stderr file descriptor if we inherit all
540 * the way and are not on a tty */
541 if (e
== EXEC_OUTPUT_INHERIT
&&
542 o
== EXEC_OUTPUT_INHERIT
&&
543 i
== EXEC_INPUT_NULL
&&
544 !is_terminal_input(context
->std_input
) &&
548 /* Duplicate from stdout if possible */
549 if ((e
== o
&& e
!= EXEC_OUTPUT_NAMED_FD
) || e
== EXEC_OUTPUT_INHERIT
)
550 return dup2(STDOUT_FILENO
, fileno
) < 0 ? -errno
: fileno
;
554 } else if (o
== EXEC_OUTPUT_INHERIT
) {
555 /* If input got downgraded, inherit the original value */
556 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
557 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
559 /* If the input is connected to anything that's not /dev/null, inherit that... */
560 if (i
!= EXEC_INPUT_NULL
)
561 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
563 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
567 /* We need to open /dev/null here anew, to get the right access mode. */
568 return open_null_as(O_WRONLY
, fileno
);
573 case EXEC_OUTPUT_NULL
:
574 return open_null_as(O_WRONLY
, fileno
);
576 case EXEC_OUTPUT_TTY
:
577 if (is_terminal_input(i
))
578 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
580 /* We don't reset the terminal if this is just about output */
581 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
583 case EXEC_OUTPUT_SYSLOG
:
584 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE
:
585 case EXEC_OUTPUT_KMSG
:
586 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
587 case EXEC_OUTPUT_JOURNAL
:
588 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
589 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
591 log_unit_error_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m", fileno
== STDOUT_FILENO
? "stdout" : "stderr");
592 r
= open_null_as(O_WRONLY
, fileno
);
596 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
597 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
598 * services to detect whether they are connected to the journal or not. */
600 if (fstat(fileno
, &st
) >= 0) {
601 *journal_stream_dev
= st
.st_dev
;
602 *journal_stream_ino
= st
.st_ino
;
607 case EXEC_OUTPUT_SOCKET
:
608 assert(socket_fd
>= 0);
609 return dup2(socket_fd
, fileno
) < 0 ? -errno
: fileno
;
611 case EXEC_OUTPUT_NAMED_FD
:
612 (void) fd_nonblock(named_iofds
[fileno
], false);
613 return dup2(named_iofds
[fileno
], fileno
) < 0 ? -errno
: fileno
;
616 assert_not_reached("Unknown error type");
620 static int chown_terminal(int fd
, uid_t uid
) {
625 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
629 /* This might fail. What matters are the results. */
630 (void) fchown(fd
, uid
, -1);
631 (void) fchmod(fd
, TTY_MODE
);
633 if (fstat(fd
, &st
) < 0)
636 if (st
.st_uid
!= uid
|| (st
.st_mode
& 0777) != TTY_MODE
)
642 static int setup_confirm_stdio(const char *vc
, int *_saved_stdin
, int *_saved_stdout
) {
643 _cleanup_close_
int fd
= -1, saved_stdin
= -1, saved_stdout
= -1;
646 assert(_saved_stdin
);
647 assert(_saved_stdout
);
649 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
653 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
654 if (saved_stdout
< 0)
657 fd
= acquire_terminal(vc
, false, false, false, DEFAULT_CONFIRM_USEC
);
661 r
= chown_terminal(fd
, getuid());
665 r
= reset_terminal_fd(fd
, true);
669 if (dup2(fd
, STDIN_FILENO
) < 0)
672 if (dup2(fd
, STDOUT_FILENO
) < 0)
679 *_saved_stdin
= saved_stdin
;
680 *_saved_stdout
= saved_stdout
;
682 saved_stdin
= saved_stdout
= -1;
687 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
690 if (err
== -ETIMEDOUT
)
691 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
694 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
698 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
699 _cleanup_close_
int fd
= -1;
703 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
707 write_confirm_error_fd(err
, fd
, u
);
710 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
714 assert(saved_stdout
);
718 if (*saved_stdin
>= 0)
719 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
722 if (*saved_stdout
>= 0)
723 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
726 *saved_stdin
= safe_close(*saved_stdin
);
727 *saved_stdout
= safe_close(*saved_stdout
);
733 CONFIRM_PRETEND_FAILURE
= -1,
734 CONFIRM_PRETEND_SUCCESS
= 0,
738 static int ask_for_confirmation(const char *vc
, Unit
*u
, const char *cmdline
) {
739 int saved_stdout
= -1, saved_stdin
= -1, r
;
740 _cleanup_free_
char *e
= NULL
;
743 /* For any internal errors, assume a positive response. */
744 r
= setup_confirm_stdio(vc
, &saved_stdin
, &saved_stdout
);
746 write_confirm_error(r
, vc
, u
);
747 return CONFIRM_EXECUTE
;
750 /* confirm_spawn might have been disabled while we were sleeping. */
751 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
756 e
= ellipsize(cmdline
, 60, 100);
764 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
766 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
773 printf("Resuming normal execution.\n");
774 manager_disable_confirm_spawn();
778 unit_dump(u
, stdout
, " ");
779 continue; /* ask again */
781 printf("Failing execution.\n");
782 r
= CONFIRM_PRETEND_FAILURE
;
785 printf(" c - continue, proceed without asking anymore\n"
786 " D - dump, show the state of the unit\n"
787 " f - fail, don't execute the command and pretend it failed\n"
789 " i - info, show a short summary of the unit\n"
790 " j - jobs, show jobs that are in progress\n"
791 " s - skip, don't execute the command and pretend it succeeded\n"
792 " y - yes, execute the command\n");
793 continue; /* ask again */
795 printf(" Description: %s\n"
798 u
->id
, u
->description
, cmdline
);
799 continue; /* ask again */
801 manager_dump_jobs(u
->manager
, stdout
, " ");
802 continue; /* ask again */
804 /* 'n' was removed in favor of 'f'. */
805 printf("Didn't understand 'n', did you mean 'f'?\n");
806 continue; /* ask again */
808 printf("Skipping execution.\n");
809 r
= CONFIRM_PRETEND_SUCCESS
;
815 assert_not_reached("Unhandled choice");
821 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
825 static int get_fixed_user(const ExecContext
*c
, const char **user
,
826 uid_t
*uid
, gid_t
*gid
,
827 const char **home
, const char **shell
) {
836 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
837 * (i.e. are "/" or "/bin/nologin"). */
840 r
= get_user_creds_clean(&name
, uid
, gid
, home
, shell
);
848 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
858 r
= get_group_creds(&name
, gid
);
866 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
867 const char *group
, gid_t gid
,
868 gid_t
**supplementary_gids
, int *ngids
) {
872 bool keep_groups
= false;
873 gid_t
*groups
= NULL
;
874 _cleanup_free_ gid_t
*l_gids
= NULL
;
879 * If user is given, then lookup GID and supplementary groups list.
880 * We avoid NSS lookups for gid=0. Also we have to initialize groups
881 * here and as early as possible so we keep the list of supplementary
882 * groups of the caller.
884 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
885 /* First step, initialize groups from /etc/group */
886 if (initgroups(user
, gid
) < 0)
892 if (!c
->supplementary_groups
)
896 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
897 * be positive, otherwise fail.
900 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
901 if (ngroups_max
<= 0) {
905 return -EOPNOTSUPP
; /* For all other values */
908 l_gids
= new(gid_t
, ngroups_max
);
914 * Lookup the list of groups that the user belongs to, we
915 * avoid NSS lookups here too for gid=0.
918 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
923 STRV_FOREACH(i
, c
->supplementary_groups
) {
926 if (k
>= ngroups_max
)
930 r
= get_group_creds(&g
, l_gids
+k
);
938 * Sets ngids to zero to drop all supplementary groups, happens
939 * when we are under root and SupplementaryGroups= is empty.
946 /* Otherwise get the final list of supplementary groups */
947 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
951 *supplementary_gids
= groups
;
959 static int enforce_groups(const ExecContext
*context
, gid_t gid
,
960 gid_t
*supplementary_gids
, int ngids
) {
965 /* Handle SupplementaryGroups= even if it is empty */
966 if (context
->supplementary_groups
) {
967 r
= maybe_setgroups(ngids
, supplementary_gids
);
972 if (gid_is_valid(gid
)) {
973 /* Then set our gids */
974 if (setresgid(gid
, gid
, gid
) < 0)
981 static int enforce_user(const ExecContext
*context
, uid_t uid
) {
984 if (!uid_is_valid(uid
))
987 /* Sets (but doesn't look up) the uid and makes sure we keep the
988 * capabilities while doing so. */
990 if (context
->capability_ambient_set
!= 0) {
992 /* First step: If we need to keep capabilities but
993 * drop privileges we need to make sure we keep our
994 * caps, while we drop privileges. */
996 int sb
= context
->secure_bits
| 1<<SECURE_KEEP_CAPS
;
998 if (prctl(PR_GET_SECUREBITS
) != sb
)
999 if (prctl(PR_SET_SECUREBITS
, sb
) < 0)
1004 /* Second step: actually set the uids */
1005 if (setresuid(uid
, uid
, uid
) < 0)
1008 /* At this point we should have all necessary capabilities but
1009 are otherwise a normal user. However, the caps might have gotten
1010 corrupted due to the setresuid() so we need to clean them up
1011 later. This is done outside of this call. */
1018 static int null_conv(
1020 const struct pam_message
**msg
,
1021 struct pam_response
**resp
,
1022 void *appdata_ptr
) {
1024 /* We don't support conversations */
1026 return PAM_CONV_ERR
;
1031 static int setup_pam(
1038 int fds
[], unsigned n_fds
) {
1042 static const struct pam_conv conv
= {
1047 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1048 pam_handle_t
*handle
= NULL
;
1050 int pam_code
= PAM_SUCCESS
, r
;
1051 char **nv
, **e
= NULL
;
1052 bool close_session
= false;
1053 pid_t pam_pid
= 0, parent_pid
;
1060 /* We set up PAM in the parent process, then fork. The child
1061 * will then stay around until killed via PR_SET_PDEATHSIG or
1062 * by systemd via the cgroup logic. It will then remove the PAM
1063 * session again. The parent process will exec() the actual
1064 * daemon. We do things this way to ensure that the main PID
1065 * of the daemon is the one we initially fork()ed. */
1067 r
= barrier_create(&barrier
);
1071 if (log_get_max_level() < LOG_DEBUG
)
1072 flags
|= PAM_SILENT
;
1074 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1075 if (pam_code
!= PAM_SUCCESS
) {
1081 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1082 if (pam_code
!= PAM_SUCCESS
)
1086 STRV_FOREACH(nv
, *env
) {
1087 pam_code
= pam_putenv(handle
, *nv
);
1088 if (pam_code
!= PAM_SUCCESS
)
1092 pam_code
= pam_acct_mgmt(handle
, flags
);
1093 if (pam_code
!= PAM_SUCCESS
)
1096 pam_code
= pam_open_session(handle
, flags
);
1097 if (pam_code
!= PAM_SUCCESS
)
1100 close_session
= true;
1102 e
= pam_getenvlist(handle
);
1104 pam_code
= PAM_BUF_ERR
;
1108 /* Block SIGTERM, so that we know that it won't get lost in the child */
1111 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1113 parent_pid
= getpid_cached();
1122 int sig
, ret
= EXIT_PAM
;
1124 /* The child's job is to reset the PAM session on termination */
1126 barrier_set_role(&barrier
, BARRIER_CHILD
);
1128 /* This string must fit in 10 chars (i.e. the length
1129 * of "/sbin/init"), to look pretty in /bin/ps */
1130 rename_process("(sd-pam)");
1132 /* Make sure we don't keep open the passed fds in this
1133 child. We assume that otherwise only those fds are
1134 open here that have been opened by PAM. */
1135 close_many(fds
, n_fds
);
1137 /* Drop privileges - we don't need any to call pam_close_session()
1138 * and this will make PR_SET_PDEATHSIG work in most cases.
1139 * If this fails, ignore the error - but expect sd-pam threads
1140 * to fail to exit normally */
1142 r
= maybe_setgroups(0, NULL
);
1144 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1145 if (setresgid(gid
, gid
, gid
) < 0)
1146 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1147 if (setresuid(uid
, uid
, uid
) < 0)
1148 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1150 (void) ignore_signals(SIGPIPE
, -1);
1152 /* Wait until our parent died. This will only work if
1153 * the above setresuid() succeeds, otherwise the kernel
1154 * will not allow unprivileged parents to kill their privileged
1155 * children this way. We rely on the control groups kill logic
1156 * to do the rest for us. */
1157 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1160 /* Tell the parent that our setup is done. This is especially
1161 * important regarding dropping privileges. Otherwise, unit
1162 * setup might race against our setresuid(2) call.
1164 * If the parent aborted, we'll detect this below, hence ignore
1165 * return failure here. */
1166 (void) barrier_place(&barrier
);
1168 /* Check if our parent process might already have died? */
1169 if (getppid() == parent_pid
) {
1172 assert_se(sigemptyset(&ss
) >= 0);
1173 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1176 if (sigwait(&ss
, &sig
) < 0) {
1183 assert(sig
== SIGTERM
);
1188 /* If our parent died we'll end the session */
1189 if (getppid() != parent_pid
) {
1190 pam_code
= pam_close_session(handle
, flags
);
1191 if (pam_code
!= PAM_SUCCESS
)
1198 pam_end(handle
, pam_code
| flags
);
1202 barrier_set_role(&barrier
, BARRIER_PARENT
);
1204 /* If the child was forked off successfully it will do all the
1205 * cleanups, so forget about the handle here. */
1208 /* Unblock SIGTERM again in the parent */
1209 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1211 /* We close the log explicitly here, since the PAM modules
1212 * might have opened it, but we don't want this fd around. */
1215 /* Synchronously wait for the child to initialize. We don't care for
1216 * errors as we cannot recover. However, warn loudly if it happens. */
1217 if (!barrier_place_and_sync(&barrier
))
1218 log_error("PAM initialization failed");
1226 if (pam_code
!= PAM_SUCCESS
) {
1227 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1228 r
= -EPERM
; /* PAM errors do not map to errno */
1230 log_error_errno(r
, "PAM failed: %m");
1234 pam_code
= pam_close_session(handle
, flags
);
1236 pam_end(handle
, pam_code
| flags
);
1248 static void rename_process_from_path(const char *path
) {
1249 char process_name
[11];
1253 /* This resulting string must fit in 10 chars (i.e. the length
1254 * of "/sbin/init") to look pretty in /bin/ps */
1258 rename_process("(...)");
1264 /* The end of the process name is usually more
1265 * interesting, since the first bit might just be
1271 process_name
[0] = '(';
1272 memcpy(process_name
+1, p
, l
);
1273 process_name
[1+l
] = ')';
1274 process_name
[1+l
+1] = 0;
1276 rename_process(process_name
);
1279 static bool context_has_address_families(const ExecContext
*c
) {
1282 return c
->address_families_whitelist
||
1283 !set_isempty(c
->address_families
);
1286 static bool context_has_syscall_filters(const ExecContext
*c
) {
1289 return c
->syscall_whitelist
||
1290 !set_isempty(c
->syscall_filter
);
1293 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1296 if (c
->no_new_privileges
)
1299 if (have_effective_cap(CAP_SYS_ADMIN
)) /* if we are privileged, we don't need NNP */
1302 /* We need NNP if we have any form of seccomp and are unprivileged */
1303 return context_has_address_families(c
) ||
1304 c
->memory_deny_write_execute
||
1305 c
->restrict_realtime
||
1306 exec_context_restrict_namespaces_set(c
) ||
1307 c
->protect_kernel_tunables
||
1308 c
->protect_kernel_modules
||
1309 c
->private_devices
||
1310 context_has_syscall_filters(c
) ||
1311 !set_isempty(c
->syscall_archs
) ||
1312 c
->lock_personality
;
1317 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1319 if (is_seccomp_available())
1323 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1328 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1329 uint32_t negative_action
, default_action
, action
;
1335 if (!context_has_syscall_filters(c
))
1338 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1341 negative_action
= c
->syscall_errno
== 0 ? SCMP_ACT_KILL
: SCMP_ACT_ERRNO(c
->syscall_errno
);
1343 if (c
->syscall_whitelist
) {
1344 default_action
= negative_action
;
1345 action
= SCMP_ACT_ALLOW
;
1347 default_action
= SCMP_ACT_ALLOW
;
1348 action
= negative_action
;
1351 if (needs_ambient_hack
) {
1352 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_whitelist
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1357 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
);
1360 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1364 if (set_isempty(c
->syscall_archs
))
1367 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1370 return seccomp_restrict_archs(c
->syscall_archs
);
1373 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1377 if (!context_has_address_families(c
))
1380 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1383 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_whitelist
);
1386 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1390 if (!c
->memory_deny_write_execute
)
1393 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1396 return seccomp_memory_deny_write_execute();
1399 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1403 if (!c
->restrict_realtime
)
1406 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1409 return seccomp_restrict_realtime();
1412 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1416 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1417 * let's protect even those systems where this is left on in the kernel. */
1419 if (!c
->protect_kernel_tunables
)
1422 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1425 return seccomp_protect_sysctl();
1428 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1432 /* Turn off module syscalls on ProtectKernelModules=yes */
1434 if (!c
->protect_kernel_modules
)
1437 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1440 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
));
1443 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1447 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1449 if (!c
->private_devices
)
1452 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1455 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
));
1458 static int apply_restrict_namespaces(Unit
*u
, const ExecContext
*c
) {
1462 if (!exec_context_restrict_namespaces_set(c
))
1465 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1468 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
1471 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1472 unsigned long personality
;
1478 if (!c
->lock_personality
)
1481 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1484 personality
= c
->personality
;
1486 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1487 if (personality
== PERSONALITY_INVALID
) {
1489 r
= opinionated_personality(&personality
);
1494 return seccomp_lock_personality(personality
);
1499 static void do_idle_pipe_dance(int idle_pipe
[4]) {
1502 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1503 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1505 if (idle_pipe
[0] >= 0) {
1508 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1510 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1513 /* Signal systemd that we are bored and want to continue. */
1514 n
= write(idle_pipe
[3], "x", 1);
1516 /* Wait for systemd to react to the signal above. */
1517 fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1520 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1524 idle_pipe
[3] = safe_close(idle_pipe
[3]);
1527 static int build_environment(
1529 const ExecContext
*c
,
1530 const ExecParameters
*p
,
1533 const char *username
,
1535 dev_t journal_stream_dev
,
1536 ino_t journal_stream_ino
,
1539 _cleanup_strv_free_
char **our_env
= NULL
;
1547 our_env
= new0(char*, 14);
1552 _cleanup_free_
char *joined
= NULL
;
1554 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1556 our_env
[n_env
++] = x
;
1558 if (asprintf(&x
, "LISTEN_FDS=%u", n_fds
) < 0)
1560 our_env
[n_env
++] = x
;
1562 joined
= strv_join(p
->fd_names
, ":");
1566 x
= strjoin("LISTEN_FDNAMES=", joined
);
1569 our_env
[n_env
++] = x
;
1572 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1573 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1575 our_env
[n_env
++] = x
;
1577 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1579 our_env
[n_env
++] = x
;
1582 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus to look up dynamic
1583 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1584 * check the database directly. */
1585 if (p
->flags
& EXEC_NSS_BYPASS_BUS
) {
1586 x
= strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1589 our_env
[n_env
++] = x
;
1593 x
= strappend("HOME=", home
);
1596 our_env
[n_env
++] = x
;
1600 x
= strappend("LOGNAME=", username
);
1603 our_env
[n_env
++] = x
;
1605 x
= strappend("USER=", username
);
1608 our_env
[n_env
++] = x
;
1612 x
= strappend("SHELL=", shell
);
1615 our_env
[n_env
++] = x
;
1618 if (!sd_id128_is_null(u
->invocation_id
)) {
1619 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1622 our_env
[n_env
++] = x
;
1625 if (exec_context_needs_term(c
)) {
1626 const char *tty_path
, *term
= NULL
;
1628 tty_path
= exec_context_tty_path(c
);
1630 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1631 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1632 * passes to PID 1 ends up all the way in the console login shown. */
1634 if (path_equal(tty_path
, "/dev/console") && getppid() == 1)
1635 term
= getenv("TERM");
1637 term
= default_term_for_tty(tty_path
);
1639 x
= strappend("TERM=", term
);
1642 our_env
[n_env
++] = x
;
1645 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
1646 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
1649 our_env
[n_env
++] = x
;
1652 our_env
[n_env
++] = NULL
;
1653 assert(n_env
<= 12);
1661 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
1662 _cleanup_strv_free_
char **pass_env
= NULL
;
1663 size_t n_env
= 0, n_bufsize
= 0;
1666 STRV_FOREACH(i
, c
->pass_environment
) {
1667 _cleanup_free_
char *x
= NULL
;
1673 x
= strjoin(*i
, "=", v
);
1677 if (!GREEDY_REALLOC(pass_env
, n_bufsize
, n_env
+ 2))
1680 pass_env
[n_env
++] = x
;
1681 pass_env
[n_env
] = NULL
;
1691 static bool exec_needs_mount_namespace(
1692 const ExecContext
*context
,
1693 const ExecParameters
*params
,
1694 ExecRuntime
*runtime
) {
1699 if (context
->root_image
)
1702 if (!strv_isempty(context
->read_write_paths
) ||
1703 !strv_isempty(context
->read_only_paths
) ||
1704 !strv_isempty(context
->inaccessible_paths
))
1707 if (context
->n_bind_mounts
> 0)
1710 if (context
->mount_flags
!= 0)
1713 if (context
->private_tmp
&& runtime
&& (runtime
->tmp_dir
|| runtime
->var_tmp_dir
))
1716 if (context
->private_devices
||
1717 context
->protect_system
!= PROTECT_SYSTEM_NO
||
1718 context
->protect_home
!= PROTECT_HOME_NO
||
1719 context
->protect_kernel_tunables
||
1720 context
->protect_kernel_modules
||
1721 context
->protect_control_groups
)
1724 if (context
->mount_apivfs
&& (context
->root_image
|| context
->root_directory
))
1730 static int setup_private_users(uid_t uid
, gid_t gid
) {
1731 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
1732 _cleanup_close_pair_
int errno_pipe
[2] = { -1, -1 };
1733 _cleanup_close_
int unshare_ready_fd
= -1;
1734 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
1740 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1741 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1742 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1743 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1744 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1745 * continues execution normally. */
1747 if (uid
!= 0 && uid_is_valid(uid
)) {
1748 r
= asprintf(&uid_map
,
1749 "0 0 1\n" /* Map root → root */
1750 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
1755 uid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1760 if (gid
!= 0 && gid_is_valid(gid
)) {
1761 r
= asprintf(&gid_map
,
1762 "0 0 1\n" /* Map root → root */
1763 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
1768 gid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1773 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1775 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
1776 if (unshare_ready_fd
< 0)
1779 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1781 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
1789 _cleanup_close_
int fd
= -1;
1793 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1794 * here, after the parent opened its own user namespace. */
1797 errno_pipe
[0] = safe_close(errno_pipe
[0]);
1799 /* Wait until the parent unshared the user namespace */
1800 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
1805 /* Disable the setgroups() system call in the child user namespace, for good. */
1806 a
= procfs_file_alloca(ppid
, "setgroups");
1807 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1809 if (errno
!= ENOENT
) {
1814 /* If the file is missing the kernel is too old, let's continue anyway. */
1816 if (write(fd
, "deny\n", 5) < 0) {
1821 fd
= safe_close(fd
);
1824 /* First write the GID map */
1825 a
= procfs_file_alloca(ppid
, "gid_map");
1826 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1831 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
1835 fd
= safe_close(fd
);
1837 /* Then write the UID map */
1838 a
= procfs_file_alloca(ppid
, "uid_map");
1839 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1844 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
1849 _exit(EXIT_SUCCESS
);
1852 (void) write(errno_pipe
[1], &r
, sizeof(r
));
1853 _exit(EXIT_FAILURE
);
1856 errno_pipe
[1] = safe_close(errno_pipe
[1]);
1858 if (unshare(CLONE_NEWUSER
) < 0)
1861 /* Let the child know that the namespace is ready now */
1862 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
1865 /* Try to read an error code from the child */
1866 n
= read(errno_pipe
[0], &r
, sizeof(r
));
1869 if (n
== sizeof(r
)) { /* an error code was sent to us */
1874 if (n
!= 0) /* on success we should have read 0 bytes */
1877 r
= wait_for_terminate(pid
, &si
);
1882 /* If something strange happened with the child, let's consider this fatal, too */
1883 if (si
.si_code
!= CLD_EXITED
|| si
.si_status
!= 0)
1889 static int setup_exec_directory(
1890 const ExecContext
*context
,
1891 const ExecParameters
*params
,
1894 ExecDirectoryType type
,
1897 static const int exit_status_table
[_EXEC_DIRECTORY_MAX
] = {
1898 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
1899 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
1900 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
1901 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
1902 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
1909 assert(type
>= 0 && type
< _EXEC_DIRECTORY_MAX
);
1910 assert(exit_status
);
1912 if (!params
->prefix
[type
])
1915 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
1916 if (!uid_is_valid(uid
))
1918 if (!gid_is_valid(gid
))
1922 STRV_FOREACH(rt
, context
->directories
[type
].paths
) {
1923 _cleanup_free_
char *p
;
1925 p
= strjoin(params
->prefix
[type
], "/", *rt
);
1931 r
= mkdir_parents_label(p
, 0755);
1935 r
= mkdir_p_label(p
, context
->directories
[type
].mode
);
1939 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
1940 * a service, and shall not be writable. */
1941 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
1944 r
= chmod_and_chown(p
, context
->directories
[type
].mode
, uid
, gid
);
1952 *exit_status
= exit_status_table
[type
];
1957 static int setup_smack(
1958 const ExecContext
*context
,
1959 const ExecCommand
*command
) {
1966 if (context
->smack_process_label
) {
1967 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
1971 #ifdef SMACK_DEFAULT_PROCESS_LABEL
1973 _cleanup_free_
char *exec_label
= NULL
;
1975 r
= mac_smack_read(command
->path
, SMACK_ATTR_EXEC
, &exec_label
);
1976 if (r
< 0 && r
!= -ENODATA
&& r
!= -EOPNOTSUPP
)
1979 r
= mac_smack_apply_pid(0, exec_label
? : SMACK_DEFAULT_PROCESS_LABEL
);
1988 static int compile_read_write_paths(
1989 const ExecContext
*context
,
1990 const ExecParameters
*params
,
1993 _cleanup_strv_free_
char **l
= NULL
;
1995 ExecDirectoryType i
;
1997 /* Compile the list of writable paths. This is the combination of
1998 * the explicitly configured paths, plus all runtime directories. */
2000 if (strv_isempty(context
->read_write_paths
)) {
2001 for (i
= 0; i
< _EXEC_DIRECTORY_MAX
; i
++)
2002 if (!strv_isempty(context
->directories
[i
].paths
))
2005 if (i
== _EXEC_DIRECTORY_MAX
) {
2006 *ret
= NULL
; /* NOP if neither is set */
2011 l
= strv_copy(context
->read_write_paths
);
2015 for (i
= 0; i
< _EXEC_DIRECTORY_MAX
; i
++) {
2016 if (!params
->prefix
[i
])
2019 STRV_FOREACH(rt
, context
->directories
[i
].paths
) {
2022 s
= strjoin(params
->prefix
[i
], "/", *rt
);
2026 if (strv_consume(&l
, s
) < 0)
2037 static int apply_mount_namespace(
2039 ExecCommand
*command
,
2040 const ExecContext
*context
,
2041 const ExecParameters
*params
,
2042 ExecRuntime
*runtime
) {
2044 _cleanup_strv_free_
char **rw
= NULL
;
2045 char *tmp
= NULL
, *var
= NULL
;
2046 const char *root_dir
= NULL
, *root_image
= NULL
;
2047 NameSpaceInfo ns_info
= {
2048 .ignore_protect_paths
= false,
2049 .private_dev
= context
->private_devices
,
2050 .protect_control_groups
= context
->protect_control_groups
,
2051 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
2052 .protect_kernel_modules
= context
->protect_kernel_modules
,
2053 .mount_apivfs
= context
->mount_apivfs
,
2055 bool needs_sandboxing
;
2060 /* The runtime struct only contains the parent of the private /tmp,
2061 * which is non-accessible to world users. Inside of it there's a /tmp
2062 * that is sticky, and that's the one we want to use here. */
2064 if (context
->private_tmp
&& runtime
) {
2065 if (runtime
->tmp_dir
)
2066 tmp
= strjoina(runtime
->tmp_dir
, "/tmp");
2067 if (runtime
->var_tmp_dir
)
2068 var
= strjoina(runtime
->var_tmp_dir
, "/tmp");
2071 r
= compile_read_write_paths(context
, params
, &rw
);
2075 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2076 root_image
= context
->root_image
;
2079 root_dir
= context
->root_directory
;
2083 * If DynamicUser=no and RootDirectory= is set then let's pass a relaxed
2084 * sandbox info, otherwise enforce it, don't ignore protected paths and
2085 * fail if we are unable to apply the sandbox inside the mount namespace.
2087 if (!context
->dynamic_user
&& root_dir
)
2088 ns_info
.ignore_protect_paths
= true;
2090 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2092 r
= setup_namespace(root_dir
, root_image
,
2094 needs_sandboxing
? context
->read_only_paths
: NULL
,
2095 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
2096 context
->bind_mounts
,
2097 context
->n_bind_mounts
,
2100 needs_sandboxing
? context
->protect_home
: PROTECT_HOME_NO
,
2101 needs_sandboxing
? context
->protect_system
: PROTECT_SYSTEM_NO
,
2102 context
->mount_flags
,
2103 DISSECT_IMAGE_DISCARD_ON_LOOP
);
2105 /* If we couldn't set up the namespace this is probably due to a
2106 * missing capability. In this case, silently proceed. */
2107 if (IN_SET(r
, -EPERM
, -EACCES
)) {
2109 log_unit_debug_errno(u
, r
, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2117 static int apply_working_directory(
2118 const ExecContext
*context
,
2119 const ExecParameters
*params
,
2121 const bool needs_mount_ns
,
2127 assert(exit_status
);
2129 if (context
->working_directory_home
) {
2132 *exit_status
= EXIT_CHDIR
;
2138 } else if (context
->working_directory
)
2139 wd
= context
->working_directory
;
2143 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2144 if (!needs_mount_ns
&& context
->root_directory
)
2145 if (chroot(context
->root_directory
) < 0) {
2146 *exit_status
= EXIT_CHROOT
;
2152 d
= prefix_roota(context
->root_directory
, wd
);
2154 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
2155 *exit_status
= EXIT_CHDIR
;
2162 static int setup_keyring(Unit
*u
, const ExecParameters
*p
, uid_t uid
, gid_t gid
) {
2163 key_serial_t keyring
;
2168 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2169 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2170 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2171 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2172 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2173 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2175 if (!(p
->flags
& EXEC_NEW_KEYRING
))
2178 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
2179 if (keyring
== -1) {
2180 if (errno
== ENOSYS
)
2181 log_debug_errno(errno
, "Kernel keyring not supported, ignoring.");
2182 else if (IN_SET(errno
, EACCES
, EPERM
))
2183 log_debug_errno(errno
, "Kernel keyring access prohibited, ignoring.");
2184 else if (errno
== EDQUOT
)
2185 log_debug_errno(errno
, "Out of kernel keyrings to allocate, ignoring.");
2187 return log_error_errno(errno
, "Setting up kernel keyring failed: %m");
2192 /* Populate the keyring with the invocation ID by default. */
2193 if (!sd_id128_is_null(u
->invocation_id
)) {
2196 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
2198 log_debug_errno(errno
, "Failed to add invocation ID to keyring, ignoring: %m");
2200 if (keyctl(KEYCTL_SETPERM
, key
,
2201 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
2202 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
2203 return log_error_errno(errno
, "Failed to restrict invocation ID permission: %m");
2207 /* And now, make the keyring owned by the service's user */
2208 if (uid_is_valid(uid
) || gid_is_valid(gid
))
2209 if (keyctl(KEYCTL_CHOWN
, keyring
, uid
, gid
, 0) < 0)
2210 return log_error_errno(errno
, "Failed to change ownership of session keyring: %m");
2215 static void append_socket_pair(int *array
, unsigned *n
, int pair
[2]) {
2223 array
[(*n
)++] = pair
[0];
2225 array
[(*n
)++] = pair
[1];
2228 static int close_remaining_fds(
2229 const ExecParameters
*params
,
2230 ExecRuntime
*runtime
,
2231 DynamicCreds
*dcreds
,
2234 int *fds
, unsigned n_fds
) {
2236 unsigned n_dont_close
= 0;
2237 int dont_close
[n_fds
+ 12];
2241 if (params
->stdin_fd
>= 0)
2242 dont_close
[n_dont_close
++] = params
->stdin_fd
;
2243 if (params
->stdout_fd
>= 0)
2244 dont_close
[n_dont_close
++] = params
->stdout_fd
;
2245 if (params
->stderr_fd
>= 0)
2246 dont_close
[n_dont_close
++] = params
->stderr_fd
;
2249 dont_close
[n_dont_close
++] = socket_fd
;
2251 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
2252 n_dont_close
+= n_fds
;
2256 append_socket_pair(dont_close
, &n_dont_close
, runtime
->netns_storage_socket
);
2260 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->user
->storage_socket
);
2262 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->group
->storage_socket
);
2265 if (user_lookup_fd
>= 0)
2266 dont_close
[n_dont_close
++] = user_lookup_fd
;
2268 return close_all_fds(dont_close
, n_dont_close
);
2271 static int send_user_lookup(
2279 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2280 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2283 if (user_lookup_fd
< 0)
2286 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
2289 if (writev(user_lookup_fd
,
2291 { .iov_base
= &uid
, .iov_len
= sizeof(uid
) },
2292 { .iov_base
= &gid
, .iov_len
= sizeof(gid
) },
2293 { .iov_base
= unit
->id
, .iov_len
= strlen(unit
->id
) }}, 3) < 0)
2299 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
2306 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2311 if (!c
->working_directory_home
)
2315 /* Hardcode /root as home directory for UID 0 */
2320 r
= get_home_dir(buf
);
2328 static int exec_child(
2330 ExecCommand
*command
,
2331 const ExecContext
*context
,
2332 const ExecParameters
*params
,
2333 ExecRuntime
*runtime
,
2334 DynamicCreds
*dcreds
,
2339 unsigned n_storage_fds
,
2340 unsigned n_socket_fds
,
2344 char **error_message
) {
2346 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **accum_env
= NULL
, **final_argv
= NULL
;
2347 _cleanup_free_
char *mac_selinux_context_net
= NULL
, *home_buffer
= NULL
;
2348 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
2349 const char *username
= NULL
, *groupname
= NULL
;
2350 const char *home
= NULL
, *shell
= NULL
;
2351 dev_t journal_stream_dev
= 0;
2352 ino_t journal_stream_ino
= 0;
2353 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2354 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
2355 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
2356 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
2358 bool use_selinux
= false;
2361 bool use_smack
= false;
2363 #ifdef HAVE_APPARMOR
2364 bool use_apparmor
= false;
2366 uid_t uid
= UID_INVALID
;
2367 gid_t gid
= GID_INVALID
;
2368 int i
, r
, ngids
= 0;
2370 ExecDirectoryType dt
;
2377 assert(exit_status
);
2378 assert(error_message
);
2379 /* We don't always set error_message, hence it must be initialized */
2380 assert(*error_message
== NULL
);
2382 rename_process_from_path(command
->path
);
2384 /* We reset exactly these signals, since they are the
2385 * only ones we set to SIG_IGN in the main daemon. All
2386 * others we leave untouched because we set them to
2387 * SIG_DFL or a valid handler initially, both of which
2388 * will be demoted to SIG_DFL. */
2389 (void) default_signals(SIGNALS_CRASH_HANDLER
,
2390 SIGNALS_IGNORE
, -1);
2392 if (context
->ignore_sigpipe
)
2393 (void) ignore_signals(SIGPIPE
, -1);
2395 r
= reset_signal_mask();
2397 *exit_status
= EXIT_SIGNAL_MASK
;
2398 *error_message
= strdup("Failed to reset signal mask");
2399 /* If strdup fails, here and below, we will just print the generic error message. */
2403 if (params
->idle_pipe
)
2404 do_idle_pipe_dance(params
->idle_pipe
);
2406 /* Close sockets very early to make sure we don't
2407 * block init reexecution because it cannot bind its
2412 n_fds
= n_storage_fds
+ n_socket_fds
;
2413 r
= close_remaining_fds(params
, runtime
, dcreds
, user_lookup_fd
, socket_fd
, fds
, n_fds
);
2415 *exit_status
= EXIT_FDS
;
2416 *error_message
= strdup("Failed to close remaining fds");
2420 if (!context
->same_pgrp
)
2422 *exit_status
= EXIT_SETSID
;
2426 exec_context_tty_reset(context
, params
);
2428 if (unit_shall_confirm_spawn(unit
)) {
2429 const char *vc
= params
->confirm_spawn
;
2430 _cleanup_free_
char *cmdline
= NULL
;
2432 cmdline
= exec_command_line(argv
);
2434 *exit_status
= EXIT_CONFIRM
;
2438 r
= ask_for_confirmation(vc
, unit
, cmdline
);
2439 if (r
!= CONFIRM_EXECUTE
) {
2440 if (r
== CONFIRM_PRETEND_SUCCESS
) {
2441 *exit_status
= EXIT_SUCCESS
;
2444 *exit_status
= EXIT_CONFIRM
;
2445 *error_message
= strdup("Execution cancelled");
2450 if (context
->dynamic_user
&& dcreds
) {
2452 /* Make sure we bypass our own NSS module for any NSS checks */
2453 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2454 *exit_status
= EXIT_USER
;
2455 *error_message
= strdup("Failed to update environment");
2459 r
= dynamic_creds_realize(dcreds
, &uid
, &gid
);
2461 *exit_status
= EXIT_USER
;
2462 *error_message
= strdup("Failed to update dynamic user credentials");
2466 if (!uid_is_valid(uid
)) {
2467 *exit_status
= EXIT_USER
;
2468 (void) asprintf(error_message
, "UID validation failed for \""UID_FMT
"\"", uid
);
2469 /* If asprintf fails, here and below, we will just print the generic error message. */
2473 if (!gid_is_valid(gid
)) {
2474 *exit_status
= EXIT_USER
;
2475 (void) asprintf(error_message
, "GID validation failed for \""GID_FMT
"\"", gid
);
2480 username
= dcreds
->user
->name
;
2483 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
2485 *exit_status
= EXIT_USER
;
2486 *error_message
= strdup("Failed to determine user credentials");
2490 r
= get_fixed_group(context
, &groupname
, &gid
);
2492 *exit_status
= EXIT_GROUP
;
2493 *error_message
= strdup("Failed to determine group credentials");
2498 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2499 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
2500 &supplementary_gids
, &ngids
);
2502 *exit_status
= EXIT_GROUP
;
2503 *error_message
= strdup("Failed to determine supplementary groups");
2507 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
2509 *exit_status
= EXIT_USER
;
2510 *error_message
= strdup("Failed to send user credentials to PID1");
2514 user_lookup_fd
= safe_close(user_lookup_fd
);
2516 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
2518 *exit_status
= EXIT_CHDIR
;
2519 *error_message
= strdup("Failed to determine $HOME for user");
2523 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2524 * must be sure to drop O_NONBLOCK */
2526 (void) fd_nonblock(socket_fd
, false);
2528 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
2530 *exit_status
= EXIT_STDIN
;
2531 *error_message
= strdup("Failed to set up stdin");
2535 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2537 *exit_status
= EXIT_STDOUT
;
2538 *error_message
= strdup("Failed to set up stdout");
2542 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2544 *exit_status
= EXIT_STDERR
;
2545 *error_message
= strdup("Failed to set up stderr");
2549 if (params
->cgroup_path
) {
2550 r
= cg_attach_everywhere(params
->cgroup_supported
, params
->cgroup_path
, 0, NULL
, NULL
);
2552 *exit_status
= EXIT_CGROUP
;
2553 (void) asprintf(error_message
, "Failed to attach to cgroup %s", params
->cgroup_path
);
2558 if (context
->oom_score_adjust_set
) {
2559 char t
[DECIMAL_STR_MAX(context
->oom_score_adjust
)];
2561 /* When we can't make this change due to EPERM, then
2562 * let's silently skip over it. User namespaces
2563 * prohibit write access to this file, and we
2564 * shouldn't trip up over that. */
2566 sprintf(t
, "%i", context
->oom_score_adjust
);
2567 r
= write_string_file("/proc/self/oom_score_adj", t
, 0);
2568 if (r
== -EPERM
|| r
== -EACCES
) {
2570 log_unit_debug_errno(unit
, r
, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2573 *exit_status
= EXIT_OOM_ADJUST
;
2574 *error_message
= strdup("Failed to write /proc/self/oom_score_adj");
2579 if (context
->nice_set
)
2580 if (setpriority(PRIO_PROCESS
, 0, context
->nice
) < 0) {
2581 *exit_status
= EXIT_NICE
;
2585 if (context
->cpu_sched_set
) {
2586 struct sched_param param
= {
2587 .sched_priority
= context
->cpu_sched_priority
,
2590 r
= sched_setscheduler(0,
2591 context
->cpu_sched_policy
|
2592 (context
->cpu_sched_reset_on_fork
?
2593 SCHED_RESET_ON_FORK
: 0),
2596 *exit_status
= EXIT_SETSCHEDULER
;
2601 if (context
->cpuset
)
2602 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context
->cpuset_ncpus
), context
->cpuset
) < 0) {
2603 *exit_status
= EXIT_CPUAFFINITY
;
2607 if (context
->ioprio_set
)
2608 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
2609 *exit_status
= EXIT_IOPRIO
;
2613 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
2614 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
2615 *exit_status
= EXIT_TIMERSLACK
;
2619 if (context
->personality
!= PERSONALITY_INVALID
) {
2620 r
= safe_personality(context
->personality
);
2622 *exit_status
= EXIT_PERSONALITY
;
2627 if (context
->utmp_id
)
2628 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
2630 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
2631 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
2635 if (context
->user
) {
2636 r
= chown_terminal(STDIN_FILENO
, uid
);
2638 *exit_status
= EXIT_STDIN
;
2643 /* If delegation is enabled we'll pass ownership of the cgroup
2644 * (but only in systemd's own controller hierarchy!) to the
2645 * user of the new process. */
2646 if (params
->cgroup_path
&& context
->user
&& (params
->flags
& EXEC_CGROUP_DELEGATE
)) {
2647 r
= cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, 0644, uid
, gid
);
2649 *exit_status
= EXIT_CGROUP
;
2654 r
= cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, 0755, uid
, gid
);
2656 *exit_status
= EXIT_CGROUP
;
2661 for (dt
= 0; dt
< _EXEC_DIRECTORY_MAX
; dt
++) {
2662 r
= setup_exec_directory(context
, params
, uid
, gid
, dt
, exit_status
);
2667 r
= build_environment(
2679 *exit_status
= EXIT_MEMORY
;
2683 r
= build_pass_environment(context
, &pass_env
);
2685 *exit_status
= EXIT_MEMORY
;
2689 accum_env
= strv_env_merge(5,
2690 params
->environment
,
2693 context
->environment
,
2697 *exit_status
= EXIT_MEMORY
;
2700 accum_env
= strv_env_clean(accum_env
);
2702 (void) umask(context
->umask
);
2704 r
= setup_keyring(unit
, params
, uid
, gid
);
2706 *exit_status
= EXIT_KEYRING
;
2710 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
2711 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2713 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
2714 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
2716 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
2717 if (needs_ambient_hack
)
2718 needs_setuid
= false;
2720 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
2722 if (needs_sandboxing
) {
2723 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
2724 * present. The actual MAC context application will happen later, as late as possible, to avoid
2725 * impacting our own code paths. */
2728 use_selinux
= mac_selinux_use();
2731 use_smack
= mac_smack_use();
2733 #ifdef HAVE_APPARMOR
2734 use_apparmor
= mac_apparmor_use();
2739 if (context
->pam_name
&& username
) {
2740 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
2742 *exit_status
= EXIT_PAM
;
2748 if (context
->private_network
&& runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
2749 r
= setup_netns(runtime
->netns_storage_socket
);
2751 *exit_status
= EXIT_NETWORK
;
2756 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
2757 if (needs_mount_namespace
) {
2758 r
= apply_mount_namespace(unit
, command
, context
, params
, runtime
);
2760 *exit_status
= EXIT_NAMESPACE
;
2765 /* Apply just after mount namespace setup */
2766 r
= apply_working_directory(context
, params
, home
, needs_mount_namespace
, exit_status
);
2770 /* Drop groups as early as possible */
2772 r
= enforce_groups(context
, gid
, supplementary_gids
, ngids
);
2774 *exit_status
= EXIT_GROUP
;
2779 if (needs_sandboxing
) {
2781 if (use_selinux
&& params
->selinux_context_net
&& socket_fd
>= 0) {
2782 r
= mac_selinux_get_child_mls_label(socket_fd
, command
->path
, context
->selinux_context
, &mac_selinux_context_net
);
2784 *exit_status
= EXIT_SELINUX_CONTEXT
;
2790 if (context
->private_users
) {
2791 r
= setup_private_users(uid
, gid
);
2793 *exit_status
= EXIT_USER
;
2799 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
2800 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
2801 * was needed to upload the policy and can now be closed as well. */
2802 r
= close_all_fds(fds
, n_fds
);
2804 r
= shift_fds(fds
, n_fds
);
2806 r
= flags_fds(fds
, n_storage_fds
, n_socket_fds
, context
->non_blocking
);
2808 *exit_status
= EXIT_FDS
;
2812 secure_bits
= context
->secure_bits
;
2814 if (needs_sandboxing
) {
2817 for (i
= 0; i
< _RLIMIT_MAX
; i
++) {
2819 if (!context
->rlimit
[i
])
2822 r
= setrlimit_closest(i
, context
->rlimit
[i
]);
2824 *exit_status
= EXIT_LIMITS
;
2829 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
2830 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
2831 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
2832 *exit_status
= EXIT_LIMITS
;
2837 bset
= context
->capability_bounding_set
;
2838 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
2839 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
2840 * instead of us doing that */
2841 if (needs_ambient_hack
)
2842 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
2843 (UINT64_C(1) << CAP_SETUID
) |
2844 (UINT64_C(1) << CAP_SETGID
);
2846 if (!cap_test_all(bset
)) {
2847 r
= capability_bounding_set_drop(bset
, false);
2849 *exit_status
= EXIT_CAPABILITIES
;
2850 *error_message
= strdup("Failed to drop capabilities");
2855 /* This is done before enforce_user, but ambient set
2856 * does not survive over setresuid() if keep_caps is not set. */
2857 if (!needs_ambient_hack
&&
2858 context
->capability_ambient_set
!= 0) {
2859 r
= capability_ambient_set_apply(context
->capability_ambient_set
, true);
2861 *exit_status
= EXIT_CAPABILITIES
;
2862 *error_message
= strdup("Failed to apply ambient capabilities (before UID change)");
2869 if (context
->user
) {
2870 r
= enforce_user(context
, uid
);
2872 *exit_status
= EXIT_USER
;
2873 (void) asprintf(error_message
, "Failed to change UID to "UID_FMT
, uid
);
2877 if (!needs_ambient_hack
&&
2878 context
->capability_ambient_set
!= 0) {
2880 /* Fix the ambient capabilities after user change. */
2881 r
= capability_ambient_set_apply(context
->capability_ambient_set
, false);
2883 *exit_status
= EXIT_CAPABILITIES
;
2884 *error_message
= strdup("Failed to apply ambient capabilities (after UID change)");
2888 /* If we were asked to change user and ambient capabilities
2889 * were requested, we had to add keep-caps to the securebits
2890 * so that we would maintain the inherited capability set
2891 * through the setresuid(). Make sure that the bit is added
2892 * also to the context secure_bits so that we don't try to
2893 * drop the bit away next. */
2895 secure_bits
|= 1<<SECURE_KEEP_CAPS
;
2900 if (needs_sandboxing
) {
2901 /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
2902 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
2903 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
2904 * are restricted. */
2908 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
2911 r
= setexeccon(exec_context
);
2913 *exit_status
= EXIT_SELINUX_CONTEXT
;
2914 (void) asprintf(error_message
, "Failed to set SELinux context to %s", exec_context
);
2923 r
= setup_smack(context
, command
);
2925 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
2926 *error_message
= strdup("Failed to set SMACK process label");
2932 #ifdef HAVE_APPARMOR
2933 if (use_apparmor
&& context
->apparmor_profile
) {
2934 r
= aa_change_onexec(context
->apparmor_profile
);
2935 if (r
< 0 && !context
->apparmor_profile_ignore
) {
2936 *exit_status
= EXIT_APPARMOR_PROFILE
;
2937 (void) asprintf(error_message
,
2938 "Failed to prepare AppArmor profile change to %s",
2939 context
->apparmor_profile
);
2945 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
2946 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
2947 if (prctl(PR_GET_SECUREBITS
) != secure_bits
)
2948 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
2949 *exit_status
= EXIT_SECUREBITS
;
2950 *error_message
= strdup("Failed to set secure bits");
2954 if (context_has_no_new_privileges(context
))
2955 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
2956 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
2957 *error_message
= strdup("Failed to disable new privileges");
2962 r
= apply_address_families(unit
, context
);
2964 *exit_status
= EXIT_ADDRESS_FAMILIES
;
2965 *error_message
= strdup("Failed to restrict address families");
2969 r
= apply_memory_deny_write_execute(unit
, context
);
2971 *exit_status
= EXIT_SECCOMP
;
2972 *error_message
= strdup("Failed to disable writing to executable memory");
2976 r
= apply_restrict_realtime(unit
, context
);
2978 *exit_status
= EXIT_SECCOMP
;
2979 *error_message
= strdup("Failed to apply realtime restrictions");
2983 r
= apply_restrict_namespaces(unit
, context
);
2985 *exit_status
= EXIT_SECCOMP
;
2986 *error_message
= strdup("Failed to apply namespace restrictions");
2990 r
= apply_protect_sysctl(unit
, context
);
2992 *exit_status
= EXIT_SECCOMP
;
2993 *error_message
= strdup("Failed to apply sysctl restrictions");
2997 r
= apply_protect_kernel_modules(unit
, context
);
2999 *exit_status
= EXIT_SECCOMP
;
3000 *error_message
= strdup("Failed to apply module loading restrictions");
3004 r
= apply_private_devices(unit
, context
);
3006 *exit_status
= EXIT_SECCOMP
;
3007 *error_message
= strdup("Failed to set up private devices");
3011 r
= apply_syscall_archs(unit
, context
);
3013 *exit_status
= EXIT_SECCOMP
;
3014 *error_message
= strdup("Failed to apply syscall architecture restrictions");
3018 r
= apply_lock_personality(unit
, context
);
3020 *exit_status
= EXIT_SECCOMP
;
3021 *error_message
= strdup("Failed to lock personalities");
3025 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3026 * by the filter as little as possible. */
3027 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
3029 *exit_status
= EXIT_SECCOMP
;
3030 *error_message
= strdup("Failed to apply syscall filters");
3036 if (!strv_isempty(context
->unset_environment
)) {
3039 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
3041 *exit_status
= EXIT_MEMORY
;
3045 strv_free(accum_env
);
3049 final_argv
= replace_env_argv(argv
, accum_env
);
3051 *exit_status
= EXIT_MEMORY
;
3052 *error_message
= strdup("Failed to prepare process arguments");
3056 if (_unlikely_(log_get_max_level() >= LOG_DEBUG
)) {
3057 _cleanup_free_
char *line
;
3059 line
= exec_command_line(final_argv
);
3062 log_struct(LOG_DEBUG
,
3063 "EXECUTABLE=%s", command
->path
,
3064 LOG_UNIT_MESSAGE(unit
, "Executing: %s", line
),
3071 execve(command
->path
, final_argv
, accum_env
);
3072 *exit_status
= EXIT_EXEC
;
/* Starts `command` on behalf of `unit`.
 *
 * Visible steps: validate the socket fd set when any stdio channel is bound to a socket, resolve
 * named stdio fds (exec_context_named_iofds), load the environment files
 * (exec_context_load_environment), log the command line, hand over to exec_child(), and afterwards
 * log the new PID, attach it to params->cgroup_path when that is set, and record it in
 * command->exec_status.
 *
 * NOTE(review): the fork call itself and the return statements are not visible in this extract;
 * only the "Failed to fork" error path is. */
3076 int exec_spawn(Unit
*unit
,
3077 ExecCommand
*command
,
3078 const ExecContext
*context
,
3079 const ExecParameters
*params
,
3080 ExecRuntime
*runtime
,
3081 DynamicCreds
*dcreds
,
3084 _cleanup_strv_free_
char **files_env
= NULL
;
3086 unsigned n_storage_fds
= 0, n_socket_fds
= 0;
3087 _cleanup_free_
char *line
= NULL
;
3089 int named_iofds
[3] = { -1, -1, -1 };
/* An fd array is only required when at least one storage or socket fd is announced. */
3098 assert(params
->fds
|| (params
->n_storage_fds
+ params
->n_socket_fds
<= 0));
/* If stdin, stdout or stderr is bound to a socket, exactly one socket fd must have been passed;
 * params->fds[0] is then used as that socket. */
3100 if (context
->std_input
== EXEC_INPUT_SOCKET
||
3101 context
->std_output
== EXEC_OUTPUT_SOCKET
||
3102 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
3104 if (params
->n_socket_fds
> 1) {
3105 log_unit_error(unit
, "Got more than one socket.");
3109 if (params
->n_socket_fds
== 0) {
3110 log_unit_error(unit
, "Got no socket.");
3114 socket_fd
= params
->fds
[0];
3118 n_storage_fds
= params
->n_storage_fds
;
3119 n_socket_fds
= params
->n_socket_fds
;
/* Map named fds onto stdin/stdout/stderr; named_iofds[] starts out as { -1, -1, -1 }. */
3122 r
= exec_context_named_iofds(unit
, context
, params
, named_iofds
);
3124 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
3126 r
= exec_context_load_environment(unit
, context
, &files_env
);
3128 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
/* An argv supplied in params takes precedence over the command's own argv. */
3130 argv
= params
->argv
?: command
->argv
;
3131 line
= exec_command_line(argv
);
3135 log_struct(LOG_DEBUG
,
3136 LOG_UNIT_MESSAGE(unit
, "About to execute: %s", line
),
3137 "EXECUTABLE=%s", command
->path
,
3142 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
3146 _cleanup_free_
char *error_message
= NULL
;
3148 r
= exec_child(unit
,
3161 unit
->manager
->user_lookup_fds
[1],
/* Three ways of reporting a failed exec_child(): one message at LOG_ERR, "Skipped spawning" at
 * LOG_INFO for -ENOENT on a command flagged EXEC_COMMAND_IGNORE_FAILURE, and a generic
 * "Failed at step ..." message at LOG_ERR that names the exit status. */
3167 log_struct_errno(LOG_ERR
, r
,
3168 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3170 LOG_UNIT_MESSAGE(unit
, "%s: %m",
3172 "EXECUTABLE=%s", command
->path
,
3174 else if (r
== -ENOENT
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
))
3175 log_struct_errno(LOG_INFO
, r
,
3176 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3178 LOG_UNIT_MESSAGE(unit
, "Skipped spawning %s: %m",
3180 "EXECUTABLE=%s", command
->path
,
3183 log_struct_errno(LOG_ERR
, r
,
3184 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3186 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
3187 exit_status_to_string(exit_status
, EXIT_STATUS_SYSTEMD
),
3189 "EXECUTABLE=%s", command
->path
,
3196 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
3198 /* We add the new process to the cgroup both in the child (so
3199 * that we can be sure that no user code is ever executed
3200 * outside of the cgroup) and in the parent (so that we can be
3201 * sure that when we kill the cgroup the process will be
3203 if (params
->cgroup_path
)
3204 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, pid
);
3206 exec_status_start(&command
->exec_status
, pid
);
3212 void exec_context_init(ExecContext
*c
) {
3213 ExecDirectoryType i
;
3218 c
->ioprio
= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 0);
3219 c
->cpu_sched_policy
= SCHED_OTHER
;
3220 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
3221 c
->syslog_level_prefix
= true;
3222 c
->ignore_sigpipe
= true;
3223 c
->timer_slack_nsec
= NSEC_INFINITY
;
3224 c
->personality
= PERSONALITY_INVALID
;
3225 for (i
= 0; i
< _EXEC_DIRECTORY_MAX
; i
++)
3226 c
->directories
[i
].mode
= 0755;
3227 c
->capability_bounding_set
= CAP_ALL
;
3228 c
->restrict_namespaces
= NAMESPACE_FLAGS_ALL
;
3231 void exec_context_done(ExecContext
*c
) {
3233 ExecDirectoryType i
;
3237 c
->environment
= strv_free(c
->environment
);
3238 c
->environment_files
= strv_free(c
->environment_files
);
3239 c
->pass_environment
= strv_free(c
->pass_environment
);
3240 c
->unset_environment
= strv_free(c
->unset_environment
);
3242 for (l
= 0; l
< ELEMENTSOF(c
->rlimit
); l
++)
3243 c
->rlimit
[l
] = mfree(c
->rlimit
[l
]);
3245 for (l
= 0; l
< 3; l
++)
3246 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
3248 c
->working_directory
= mfree(c
->working_directory
);
3249 c
->root_directory
= mfree(c
->root_directory
);
3250 c
->root_image
= mfree(c
->root_image
);
3251 c
->tty_path
= mfree(c
->tty_path
);
3252 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
3253 c
->user
= mfree(c
->user
);
3254 c
->group
= mfree(c
->group
);
3256 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
3258 c
->pam_name
= mfree(c
->pam_name
);
3260 c
->read_only_paths
= strv_free(c
->read_only_paths
);
3261 c
->read_write_paths
= strv_free(c
->read_write_paths
);
3262 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
3264 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
3267 CPU_FREE(c
->cpuset
);
3269 c
->utmp_id
= mfree(c
->utmp_id
);
3270 c
->selinux_context
= mfree(c
->selinux_context
);
3271 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
3272 c
->smack_process_label
= mfree(c
->smack_process_label
);
3274 c
->syscall_filter
= set_free(c
->syscall_filter
);
3275 c
->syscall_archs
= set_free(c
->syscall_archs
);
3276 c
->address_families
= set_free(c
->address_families
);
3278 for (i
= 0; i
< _EXEC_DIRECTORY_MAX
; i
++)
3279 c
->directories
[i
].paths
= strv_free(c
->directories
[i
].paths
);
3282 int exec_context_destroy_runtime_directory(ExecContext
*c
, const char *runtime_prefix
) {
3287 if (!runtime_prefix
)
3290 STRV_FOREACH(i
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) {
3291 _cleanup_free_
char *p
;
3293 p
= strjoin(runtime_prefix
, "/", *i
);
3297 /* We execute this synchronously, since we need to be
3298 * sure this is gone when we start the service
3300 (void) rm_rf(p
, REMOVE_ROOT
);
3306 void exec_command_done(ExecCommand
*c
) {
3309 c
->path
= mfree(c
->path
);
3311 c
->argv
= strv_free(c
->argv
);
3314 void exec_command_done_array(ExecCommand
*c
, unsigned n
) {
3317 for (i
= 0; i
< n
; i
++)
3318 exec_command_done(c
+i
);
3321 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
3325 LIST_REMOVE(command
, c
, i
);
3326 exec_command_done(i
);
3333 void exec_command_free_array(ExecCommand
**c
, unsigned n
) {
3336 for (i
= 0; i
< n
; i
++)
3337 c
[i
] = exec_command_free_list(c
[i
]);
3340 typedef struct InvalidEnvInfo
{
3345 static void invalid_env(const char *p
, void *userdata
) {
3346 InvalidEnvInfo
*info
= userdata
;
3348 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
3351 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
3356 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
3358 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
3360 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
3362 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
3364 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
3366 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
3372 int exec_context_named_iofds(Unit
*unit
, const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[3]) {
3373 unsigned i
, targets
;
3374 const char* stdio_fdname
[3];
3380 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
3381 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
3382 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
3384 for (i
= 0; i
< 3; i
++)
3385 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
3387 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
3389 for (i
= 0; i
< n_fds
&& targets
> 0; i
++)
3390 if (named_iofds
[STDIN_FILENO
] < 0 &&
3391 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
3392 stdio_fdname
[STDIN_FILENO
] &&
3393 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
3395 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
3398 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
3399 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
3400 stdio_fdname
[STDOUT_FILENO
] &&
3401 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
3403 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
3406 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
3407 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
3408 stdio_fdname
[STDERR_FILENO
] &&
3409 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
3411 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
3415 return targets
== 0 ? 0 : -ENOENT
;
3418 int exec_context_load_environment(Unit
*unit
, const ExecContext
*c
, char ***l
) {
3419 char **i
, **r
= NULL
;
3424 STRV_FOREACH(i
, c
->environment_files
) {
3428 bool ignore
= false;
3430 _cleanup_globfree_ glob_t pglob
= {};
3439 if (!path_is_absolute(fn
)) {
3447 /* Filename supports globbing, take all matching files */
3448 k
= safe_glob(fn
, 0, &pglob
);
3457 /* When we don't match anything, -ENOENT should be returned */
3458 assert(pglob
.gl_pathc
> 0);
3460 for (n
= 0; n
< pglob
.gl_pathc
; n
++) {
3461 k
= load_env_file(NULL
, pglob
.gl_pathv
[n
], NULL
, &p
);
3469 /* Log invalid environment variables with filename */
3471 InvalidEnvInfo info
= {
3473 .path
= pglob
.gl_pathv
[n
]
3476 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
3484 m
= strv_env_merge(2, r
, p
);
3500 static bool tty_may_match_dev_console(const char *tty
) {
3501 _cleanup_free_
char *active
= NULL
;
3507 tty
= skip_dev_prefix(tty
);
3509 /* trivial identity? */
3510 if (streq(tty
, "console"))
3513 console
= resolve_dev_console(&active
);
3514 /* if we could not resolve, assume it may */
3518 /* "tty0" means the active VC, so it may be the same sometimes */
3519 return streq(console
, tty
) || (streq(console
, "tty0") && tty_is_vc(tty
));
3522 bool exec_context_may_touch_console(ExecContext
*ec
) {
3524 return (ec
->tty_reset
||
3526 ec
->tty_vt_disallocate
||
3527 is_terminal_input(ec
->std_input
) ||
3528 is_terminal_output(ec
->std_output
) ||
3529 is_terminal_output(ec
->std_error
)) &&
3530 tty_may_match_dev_console(exec_context_tty_path(ec
));
/* Writes every string of the NULL-terminated vector l to f, each one preceded by a single space.
 * A NULL vector writes nothing. */
static void strv_fprintf(FILE *f, char **l) {
        char **item;

        assert(f);

        for (item = l; item && *item; item++) {
                fputc(' ', f);
                fputs(*item, f);
        }
}
/* Writes a human-readable dump of the settings stored in an ExecContext to f.
 *
 * Output is one "Name: value" line per setting (list-valued settings are printed space-separated
 * on one line), and every line starts with prefix, which is passed through strempty() first.
 * Most settings are printed only when set or when different from their default. */
3542 void exec_context_dump(ExecContext
*c
, FILE* f
, const char *prefix
) {
3545 ExecDirectoryType dt
;
3551 prefix
= strempty(prefix
);
3555 "%sWorkingDirectory: %s\n"
3556 "%sRootDirectory: %s\n"
3557 "%sNonBlocking: %s\n"
3558 "%sPrivateTmp: %s\n"
3559 "%sPrivateDevices: %s\n"
3560 "%sProtectKernelTunables: %s\n"
3561 "%sProtectKernelModules: %s\n"
3562 "%sProtectControlGroups: %s\n"
3563 "%sPrivateNetwork: %s\n"
3564 "%sPrivateUsers: %s\n"
3565 "%sProtectHome: %s\n"
3566 "%sProtectSystem: %s\n"
3567 "%sMountAPIVFS: %s\n"
3568 "%sIgnoreSIGPIPE: %s\n"
3569 "%sMemoryDenyWriteExecute: %s\n"
3570 "%sRestrictRealtime: %s\n",
/* An unset working or root directory is shown as "/". */
3572 prefix
, c
->working_directory
? c
->working_directory
: "/",
3573 prefix
, c
->root_directory
? c
->root_directory
: "/",
3574 prefix
, yes_no(c
->non_blocking
),
3575 prefix
, yes_no(c
->private_tmp
),
3576 prefix
, yes_no(c
->private_devices
),
3577 prefix
, yes_no(c
->protect_kernel_tunables
),
3578 prefix
, yes_no(c
->protect_kernel_modules
),
3579 prefix
, yes_no(c
->protect_control_groups
),
3580 prefix
, yes_no(c
->private_network
),
3581 prefix
, yes_no(c
->private_users
),
3582 prefix
, protect_home_to_string(c
->protect_home
),
3583 prefix
, protect_system_to_string(c
->protect_system
),
3584 prefix
, yes_no(c
->mount_apivfs
),
3585 prefix
, yes_no(c
->ignore_sigpipe
),
3586 prefix
, yes_no(c
->memory_deny_write_execute
),
3587 prefix
, yes_no(c
->restrict_realtime
));
3590 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
3592 STRV_FOREACH(e
, c
->environment
)
3593 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
3595 STRV_FOREACH(e
, c
->environment_files
)
3596 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
3598 STRV_FOREACH(e
, c
->pass_environment
)
3599 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
3601 STRV_FOREACH(e
, c
->unset_environment
)
3602 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
3604 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
/* Per directory type: "<Type>Mode:" followed by one "<Type>:" line for each configured path. */
3606 for (dt
= 0; dt
< _EXEC_DIRECTORY_MAX
; dt
++) {
3607 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
3609 STRV_FOREACH(d
, c
->directories
[dt
].paths
)
3610 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), *d
);
3618 if (c
->oom_score_adjust_set
)
3620 "%sOOMScoreAdjust: %i\n",
3621 prefix
, c
->oom_score_adjust
);
/* Resource limits: the hard limit under the plain name, the soft limit under "<name>Soft". */
3623 for (i
= 0; i
< RLIM_NLIMITS
; i
++)
3625 fprintf(f
, "%s%s: " RLIM_FMT
"\n",
3626 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
3627 fprintf(f
, "%s%sSoft: " RLIM_FMT
"\n",
3628 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
3631 if (c
->ioprio_set
) {
3632 _cleanup_free_
char *class_str
= NULL
;
3634 r
= ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c
->ioprio
), &class_str
);
3636 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
3638 fprintf(f
, "%sIOPriority: %lu\n", prefix
, IOPRIO_PRIO_DATA(c
->ioprio
));
3641 if (c
->cpu_sched_set
) {
3642 _cleanup_free_
char *policy_str
= NULL
;
3644 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
3646 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
3649 "%sCPUSchedulingPriority: %i\n"
3650 "%sCPUSchedulingResetOnFork: %s\n",
3651 prefix
, c
->cpu_sched_priority
,
3652 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
/* CPU affinity is printed as the list of CPU indices that are set in the mask. */
3656 fprintf(f
, "%sCPUAffinity:", prefix
);
3657 for (i
= 0; i
< c
->cpuset_ncpus
; i
++)
3658 if (CPU_ISSET_S(i
, CPU_ALLOC_SIZE(c
->cpuset_ncpus
), c
->cpuset
))
3659 fprintf(f
, " %u", i
);
3663 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
3664 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
3667 "%sStandardInput: %s\n"
3668 "%sStandardOutput: %s\n"
3669 "%sStandardError: %s\n",
3670 prefix
, exec_input_to_string(c
->std_input
),
3671 prefix
, exec_output_to_string(c
->std_output
),
3672 prefix
, exec_output_to_string(c
->std_error
));
3678 "%sTTYVHangup: %s\n"
3679 "%sTTYVTDisallocate: %s\n",
3680 prefix
, c
->tty_path
,
3681 prefix
, yes_no(c
->tty_reset
),
3682 prefix
, yes_no(c
->tty_vhangup
),
3683 prefix
, yes_no(c
->tty_vt_disallocate
));
/* Syslog facility and level are printed only when stdout or stderr uses one of the output
 * types tested here. */
3685 if (IN_SET(c
->std_output
,
3688 EXEC_OUTPUT_JOURNAL
,
3689 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
3690 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
3691 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
3692 IN_SET(c
->std_error
,
3695 EXEC_OUTPUT_JOURNAL
,
3696 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
3697 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
3698 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
3700 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
3702 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
3704 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
3706 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
3708 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
3711 if (c
->secure_bits
) {
3712 _cleanup_free_
char *str
= NULL
;
3714 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
3716 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
3719 if (c
->capability_bounding_set
!= CAP_ALL
) {
3720 _cleanup_free_
char *str
= NULL
;
3722 r
= capability_set_to_string_alloc(c
->capability_bounding_set
, &str
);
3724 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
3727 if (c
->capability_ambient_set
!= 0) {
3728 _cleanup_free_
char *str
= NULL
;
3730 r
= capability_set_to_string_alloc(c
->capability_ambient_set
, &str
);
3732 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
3736 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
3738 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
3740 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
3742 if (strv_length(c
->supplementary_groups
) > 0) {
3743 fprintf(f
, "%sSupplementaryGroups:", prefix
);
3744 strv_fprintf(f
, c
->supplementary_groups
);
3749 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
3751 if (strv_length(c
->read_write_paths
) > 0) {
3752 fprintf(f
, "%sReadWritePaths:", prefix
);
3753 strv_fprintf(f
, c
->read_write_paths
);
3757 if (strv_length(c
->read_only_paths
) > 0) {
3758 fprintf(f
, "%sReadOnlyPaths:", prefix
);
3759 strv_fprintf(f
, c
->read_only_paths
);
3763 if (strv_length(c
->inaccessible_paths
) > 0) {
3764 fprintf(f
, "%sInaccessiblePaths:", prefix
);
3765 strv_fprintf(f
, c
->inaccessible_paths
);
/* Bind mounts are printed as "source:destination:rbind|norbind", under BindReadOnlyPaths or
 * BindPaths depending on the read_only flag. */
3769 if (c
->n_bind_mounts
> 0)
3770 for (i
= 0; i
< c
->n_bind_mounts
; i
++) {
3771 fprintf(f
, "%s%s: %s:%s:%s\n", prefix
,
3772 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
3773 c
->bind_mounts
[i
].source
,
3774 c
->bind_mounts
[i
].destination
,
3775 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
3780 "%sUtmpIdentifier: %s\n",
3781 prefix
, c
->utmp_id
);
/* For the security labels below, a "-" in front of the value reflects the matching *_ignore flag. */
3783 if (c
->selinux_context
)
3785 "%sSELinuxContext: %s%s\n",
3786 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
3788 if (c
->apparmor_profile
)
3790 "%sAppArmorProfile: %s%s\n",
3791 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
3793 if (c
->smack_process_label
)
3795 "%sSmackProcessLabel: %s%s\n",
3796 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
3798 if (c
->personality
!= PERSONALITY_INVALID
)
3800 "%sPersonality: %s\n",
3801 prefix
, strna(personality_to_string(c
->personality
)));
3804 "%sLockPersonality: %s\n",
3805 prefix
, yes_no(c
->lock_personality
));
3807 if (c
->syscall_filter
) {
3815 "%sSystemCallFilter: ",
3818 if (!c
->syscall_whitelist
)
3822 SET_FOREACH(id
, c
->syscall_filter
, j
) {
3823 _cleanup_free_
char *name
= NULL
;
/* Set entries are stored with an offset of one; it is subtracted again before resolving the name. */
3830 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
3831 fputs(strna(name
), f
);
3838 if (c
->syscall_archs
) {
3845 "%sSystemCallArchitectures:",
3849 SET_FOREACH(id
, c
->syscall_archs
, j
)
3850 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
3855 if (exec_context_restrict_namespaces_set(c
)) {
3856 _cleanup_free_
char *s
= NULL
;
3858 r
= namespace_flag_to_string_many(c
->restrict_namespaces
, &s
);
3860 fprintf(f
, "%sRestrictNamespaces: %s\n",
3864 if (c
->syscall_errno
> 0)
3866 "%sSystemCallErrorNumber: %s\n",
3867 prefix
, strna(errno_to_name(c
->syscall_errno
)));
/* NOTE(review): AppArmorProfile was already written further up in this function (right after
 * SELinuxContext), so this second block emits the same line again. It looks like an accidental
 * duplicate; confirm before removing. */
3869 if (c
->apparmor_profile
)
3871 "%sAppArmorProfile: %s%s\n",
3872 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
3875 bool exec_context_maintains_privileges(ExecContext
*c
) {
3878 /* Returns true if the process forked off would run under
3879 * an unchanged UID or as root. */
3884 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
3890 int exec_context_get_effective_ioprio(ExecContext
*c
) {
3898 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
3900 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 4);
3905 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
3910 dual_timestamp_get(&s
->start_timestamp
);
3913 void exec_status_exit(ExecStatus
*s
, ExecContext
*context
, pid_t pid
, int code
, int status
) {
3916 if (s
->pid
&& s
->pid
!= pid
)
3920 dual_timestamp_get(&s
->exit_timestamp
);
3926 if (context
->utmp_id
)
3927 utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
3929 exec_context_tty_reset(context
, NULL
);
3933 void exec_status_dump(ExecStatus
*s
, FILE *f
, const char *prefix
) {
3934 char buf
[FORMAT_TIMESTAMP_MAX
];
3942 prefix
= strempty(prefix
);
3945 "%sPID: "PID_FMT
"\n",
3948 if (dual_timestamp_is_set(&s
->start_timestamp
))
3950 "%sStart Timestamp: %s\n",
3951 prefix
, format_timestamp(buf
, sizeof(buf
), s
->start_timestamp
.realtime
));
3953 if (dual_timestamp_is_set(&s
->exit_timestamp
))
3955 "%sExit Timestamp: %s\n"
3957 "%sExit Status: %i\n",
3958 prefix
, format_timestamp(buf
, sizeof(buf
), s
->exit_timestamp
.realtime
),
3959 prefix
, sigchld_code_to_string(s
->code
),
3963 char *exec_command_line(char **argv
) {
3971 STRV_FOREACH(a
, argv
)
3979 STRV_FOREACH(a
, argv
) {
3986 if (strpbrk(*a
, WHITESPACE
)) {
3997 /* FIXME: this doesn't really handle arguments that have
3998 * spaces and ticks in them */
4003 void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4004 _cleanup_free_
char *cmd
= NULL
;
4005 const char *prefix2
;
4010 prefix
= strempty(prefix
);
4011 prefix2
= strjoina(prefix
, "\t");
4013 cmd
= exec_command_line(c
->argv
);
4015 "%sCommand Line: %s\n",
4016 prefix
, cmd
? cmd
: strerror(ENOMEM
));
4018 exec_status_dump(&c
->exec_status
, f
, prefix2
);
4021 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4024 prefix
= strempty(prefix
);
4026 LIST_FOREACH(command
, c
, c
)
4027 exec_command_dump(c
, f
, prefix
);
4030 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
4037 /* It's kind of important, that we keep the order here */
4038 LIST_FIND_TAIL(command
, *l
, end
);
4039 LIST_INSERT_AFTER(command
, *l
, end
, e
);
4044 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
4052 l
= strv_new_ap(path
, ap
);
4073 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
4074 _cleanup_strv_free_
char **l
= NULL
;
4082 l
= strv_new_ap(path
, ap
);
4088 r
= strv_extend_strv(&c
->argv
, l
, false);
4096 static int exec_runtime_allocate(ExecRuntime
**rt
) {
4101 *rt
= new0(ExecRuntime
, 1);
4106 (*rt
)->netns_storage_socket
[0] = (*rt
)->netns_storage_socket
[1] = -1;
4111 int exec_runtime_make(ExecRuntime
**rt
, ExecContext
*c
, const char *id
) {
4121 if (!c
->private_network
&& !c
->private_tmp
)
4124 r
= exec_runtime_allocate(rt
);
4128 if (c
->private_network
&& (*rt
)->netns_storage_socket
[0] < 0) {
4129 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, (*rt
)->netns_storage_socket
) < 0)
4133 if (c
->private_tmp
&& !(*rt
)->tmp_dir
) {
4134 r
= setup_tmp_dirs(id
, &(*rt
)->tmp_dir
, &(*rt
)->var_tmp_dir
);
4142 ExecRuntime
*exec_runtime_ref(ExecRuntime
*r
) {
4144 assert(r
->n_ref
> 0);
4150 ExecRuntime
*exec_runtime_unref(ExecRuntime
*r
) {
4155 assert(r
->n_ref
> 0);
4162 free(r
->var_tmp_dir
);
4163 safe_close_pair(r
->netns_storage_socket
);
4167 int exec_runtime_serialize(Unit
*u
, ExecRuntime
*rt
, FILE *f
, FDSet
*fds
) {
4176 unit_serialize_item(u
, f
, "tmp-dir", rt
->tmp_dir
);
4178 if (rt
->var_tmp_dir
)
4179 unit_serialize_item(u
, f
, "var-tmp-dir", rt
->var_tmp_dir
);
4181 if (rt
->netns_storage_socket
[0] >= 0) {
4184 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
4188 unit_serialize_item_format(u
, f
, "netns-socket-0", "%i", copy
);
4191 if (rt
->netns_storage_socket
[1] >= 0) {
4194 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
4198 unit_serialize_item_format(u
, f
, "netns-socket-1", "%i", copy
);
4204 int exec_runtime_deserialize_item(Unit
*u
, ExecRuntime
**rt
, const char *key
, const char *value
, FDSet
*fds
) {
4211 if (streq(key
, "tmp-dir")) {
4214 r
= exec_runtime_allocate(rt
);
4218 copy
= strdup(value
);
4222 free((*rt
)->tmp_dir
);
4223 (*rt
)->tmp_dir
= copy
;
4225 } else if (streq(key
, "var-tmp-dir")) {
4228 r
= exec_runtime_allocate(rt
);
4232 copy
= strdup(value
);
4236 free((*rt
)->var_tmp_dir
);
4237 (*rt
)->var_tmp_dir
= copy
;
4239 } else if (streq(key
, "netns-socket-0")) {
4242 r
= exec_runtime_allocate(rt
);
4246 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
))
4247 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
4249 safe_close((*rt
)->netns_storage_socket
[0]);
4250 (*rt
)->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
4252 } else if (streq(key
, "netns-socket-1")) {
4255 r
= exec_runtime_allocate(rt
);
4259 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
))
4260 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
4262 safe_close((*rt
)->netns_storage_socket
[1]);
4263 (*rt
)->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
4271 static void *remove_tmpdir_thread(void *p
) {
4272 _cleanup_free_
char *path
= p
;
4274 (void) rm_rf(path
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
4278 void exec_runtime_destroy(ExecRuntime
*rt
) {
4284 /* If there are multiple users of this, let's leave the stuff around */
4289 log_debug("Spawning thread to nuke %s", rt
->tmp_dir
);
4291 r
= asynchronous_job(remove_tmpdir_thread
, rt
->tmp_dir
);
4293 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->tmp_dir
);
4300 if (rt
->var_tmp_dir
) {
4301 log_debug("Spawning thread to nuke %s", rt
->var_tmp_dir
);
4303 r
= asynchronous_job(remove_tmpdir_thread
, rt
->var_tmp_dir
);
4305 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->var_tmp_dir
);
4306 free(rt
->var_tmp_dir
);
4309 rt
->var_tmp_dir
= NULL
;
4312 safe_close_pair(rt
->netns_storage_socket
);
4315 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
4316 [EXEC_INPUT_NULL
] = "null",
4317 [EXEC_INPUT_TTY
] = "tty",
4318 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
4319 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
4320 [EXEC_INPUT_SOCKET
] = "socket",
4321 [EXEC_INPUT_NAMED_FD
] = "fd",
4324 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
4326 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
4327 [EXEC_OUTPUT_INHERIT
] = "inherit",
4328 [EXEC_OUTPUT_NULL
] = "null",
4329 [EXEC_OUTPUT_TTY
] = "tty",
4330 [EXEC_OUTPUT_SYSLOG
] = "syslog",
4331 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE
] = "syslog+console",
4332 [EXEC_OUTPUT_KMSG
] = "kmsg",
4333 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
4334 [EXEC_OUTPUT_JOURNAL
] = "journal",
4335 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
4336 [EXEC_OUTPUT_SOCKET
] = "socket",
4337 [EXEC_OUTPUT_NAMED_FD
] = "fd",
4340 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
4342 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
4343 [EXEC_UTMP_INIT
] = "init",
4344 [EXEC_UTMP_LOGIN
] = "login",
4345 [EXEC_UTMP_USER
] = "user",
4348 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
4350 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
4351 [EXEC_PRESERVE_NO
] = "no",
4352 [EXEC_PRESERVE_YES
] = "yes",
4353 [EXEC_PRESERVE_RESTART
] = "restart",
4356 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
4358 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_MAX
] = {
4359 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
4360 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
4361 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
4362 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
4363 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
4366 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);