2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/capability.h>
28 #include <sys/eventfd.h>
30 #include <sys/personality.h>
31 #include <sys/prctl.h>
33 #include <sys/socket.h>
35 #include <sys/types.h>
41 #include <security/pam_appl.h>
45 #include <selinux/selinux.h>
53 #include <sys/apparmor.h>
56 #include "sd-messages.h"
59 #include "alloc-util.h"
61 #include "apparmor-util.h"
66 #include "capability-util.h"
69 #include "errno-list.h"
71 #include "exit-status.h"
74 #include "format-util.h"
76 #include "glob-util.h"
83 #include "namespace.h"
84 #include "parse-util.h"
85 #include "path-util.h"
86 #include "process-util.h"
87 #include "rlimit-util.h"
90 #include "seccomp-util.h"
92 #include "securebits.h"
93 #include "securebits-util.h"
94 #include "selinux-util.h"
95 #include "signal-util.h"
96 #include "smack-util.h"
98 #include "string-table.h"
99 #include "string-util.h"
101 #include "syslog-util.h"
102 #include "terminal-util.h"
104 #include "user-util.h"
106 #include "utmp-wtmp.h"
108 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
109 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
111 /* This assumes there is a 'tty' group */
112 #define TTY_MODE 0620
114 #define SNDBUF_SIZE (8*1024*1024)
116 static int shift_fds(int fds
[], unsigned n_fds
) {
117 int start
, restart_from
;
122 /* Modifies the fds array! (sorts it) */
132 for (i
= start
; i
< (int) n_fds
; i
++) {
135 /* Already at right index? */
139 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
146 /* Hmm, the fd we wanted isn't free? Then
147 * let's remember that and try again from here */
148 if (nfd
!= i
+3 && restart_from
< 0)
152 if (restart_from
< 0)
155 start
= restart_from
;
161 static int flags_fds(const int fds
[], unsigned n_storage_fds
, unsigned n_socket_fds
, bool nonblock
) {
165 n_fds
= n_storage_fds
+ n_socket_fds
;
171 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
172 * O_NONBLOCK only applies to socket activation though. */
174 for (i
= 0; i
< n_fds
; i
++) {
176 if (i
< n_socket_fds
) {
177 r
= fd_nonblock(fds
[i
], nonblock
);
182 /* We unconditionally drop FD_CLOEXEC from the fds,
183 * since after all we want to pass these fds to our
186 r
= fd_cloexec(fds
[i
], false);
194 static const char *exec_context_tty_path(const ExecContext
*context
) {
197 if (context
->stdio_as_fds
)
200 if (context
->tty_path
)
201 return context
->tty_path
;
203 return "/dev/console";
206 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
211 path
= exec_context_tty_path(context
);
213 if (context
->tty_vhangup
) {
214 if (p
&& p
->stdin_fd
>= 0)
215 (void) terminal_vhangup_fd(p
->stdin_fd
);
217 (void) terminal_vhangup(path
);
220 if (context
->tty_reset
) {
221 if (p
&& p
->stdin_fd
>= 0)
222 (void) reset_terminal_fd(p
->stdin_fd
, true);
224 (void) reset_terminal(path
);
227 if (context
->tty_vt_disallocate
&& path
)
228 (void) vt_disallocate(path
);
231 static bool is_terminal_input(ExecInput i
) {
234 EXEC_INPUT_TTY_FORCE
,
235 EXEC_INPUT_TTY_FAIL
);
238 static bool is_terminal_output(ExecOutput o
) {
241 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
242 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
243 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
246 static bool is_syslog_output(ExecOutput o
) {
249 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
);
252 static bool is_kmsg_output(ExecOutput o
) {
255 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
258 static bool exec_context_needs_term(const ExecContext
*c
) {
261 /* Return true if the execution context suggests we should set $TERM to something useful. */
263 if (is_terminal_input(c
->std_input
))
266 if (is_terminal_output(c
->std_output
))
269 if (is_terminal_output(c
->std_error
))
272 return !!c
->tty_path
;
275 static int open_null_as(int flags
, int nfd
) {
280 fd
= open("/dev/null", flags
|O_NOCTTY
);
285 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
293 static int connect_journal_socket(int fd
, uid_t uid
, gid_t gid
) {
294 static const union sockaddr_union sa
= {
295 .un
.sun_family
= AF_UNIX
,
296 .un
.sun_path
= "/run/systemd/journal/stdout",
298 uid_t olduid
= UID_INVALID
;
299 gid_t oldgid
= GID_INVALID
;
302 if (gid_is_valid(gid
)) {
305 if (setegid(gid
) < 0)
309 if (uid_is_valid(uid
)) {
312 if (seteuid(uid
) < 0) {
318 r
= connect(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
)) < 0 ? -errno
: 0;
320 /* If we fail to restore the uid or gid, things will likely
321 fail later on. This should only happen if an LSM interferes. */
323 if (uid_is_valid(uid
))
324 (void) seteuid(olduid
);
327 if (gid_is_valid(gid
))
328 (void) setegid(oldgid
);
333 static int connect_logger_as(
335 const ExecContext
*context
,
336 const ExecParameters
*params
,
347 assert(output
< _EXEC_OUTPUT_MAX
);
351 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
355 r
= connect_journal_socket(fd
, uid
, gid
);
359 if (shutdown(fd
, SHUT_RD
) < 0) {
364 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
374 context
->syslog_identifier
?: ident
,
375 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
376 context
->syslog_priority
,
377 !!context
->syslog_level_prefix
,
378 is_syslog_output(output
),
379 is_kmsg_output(output
),
380 is_terminal_output(output
));
385 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
390 static int open_terminal_as(const char *path
, mode_t mode
, int nfd
) {
396 fd
= open_terminal(path
, mode
| O_NOCTTY
);
401 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
409 static int fixup_input(ExecInput std_input
, int socket_fd
, bool apply_tty_stdin
) {
411 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
412 return EXEC_INPUT_NULL
;
414 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
415 return EXEC_INPUT_NULL
;
420 static int fixup_output(ExecOutput std_output
, int socket_fd
) {
422 if (std_output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
423 return EXEC_OUTPUT_INHERIT
;
428 static int setup_input(
429 const ExecContext
*context
,
430 const ExecParameters
*params
,
432 int named_iofds
[3]) {
439 if (params
->stdin_fd
>= 0) {
440 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
443 /* Try to make this the controlling tty, if it is a tty, and reset it */
444 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
445 (void) reset_terminal_fd(STDIN_FILENO
, true);
450 i
= fixup_input(context
->std_input
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
454 case EXEC_INPUT_NULL
:
455 return open_null_as(O_RDONLY
, STDIN_FILENO
);
458 case EXEC_INPUT_TTY_FORCE
:
459 case EXEC_INPUT_TTY_FAIL
: {
462 fd
= acquire_terminal(exec_context_tty_path(context
),
463 i
== EXEC_INPUT_TTY_FAIL
,
464 i
== EXEC_INPUT_TTY_FORCE
,
470 if (fd
!= STDIN_FILENO
) {
471 r
= dup2(fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
479 case EXEC_INPUT_SOCKET
:
480 return dup2(socket_fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
482 case EXEC_INPUT_NAMED_FD
:
483 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
484 return dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
487 assert_not_reached("Unknown input type");
491 static int setup_output(
493 const ExecContext
*context
,
494 const ExecParameters
*params
,
501 dev_t
*journal_stream_dev
,
502 ino_t
*journal_stream_ino
) {
512 assert(journal_stream_dev
);
513 assert(journal_stream_ino
);
515 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
517 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
520 return STDOUT_FILENO
;
523 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
524 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
527 return STDERR_FILENO
;
530 i
= fixup_input(context
->std_input
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
531 o
= fixup_output(context
->std_output
, socket_fd
);
533 if (fileno
== STDERR_FILENO
) {
535 e
= fixup_output(context
->std_error
, socket_fd
);
537 /* This expects the input and output are already set up */
539 /* Don't change the stderr file descriptor if we inherit all
540 * the way and are not on a tty */
541 if (e
== EXEC_OUTPUT_INHERIT
&&
542 o
== EXEC_OUTPUT_INHERIT
&&
543 i
== EXEC_INPUT_NULL
&&
544 !is_terminal_input(context
->std_input
) &&
548 /* Duplicate from stdout if possible */
549 if ((e
== o
&& e
!= EXEC_OUTPUT_NAMED_FD
) || e
== EXEC_OUTPUT_INHERIT
)
550 return dup2(STDOUT_FILENO
, fileno
) < 0 ? -errno
: fileno
;
554 } else if (o
== EXEC_OUTPUT_INHERIT
) {
555 /* If input got downgraded, inherit the original value */
556 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
557 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
559 /* If the input is connected to anything that's not a /dev/null, inherit that... */
560 if (i
!= EXEC_INPUT_NULL
)
561 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
563 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
567 /* We need to open /dev/null here anew, to get the right access mode. */
568 return open_null_as(O_WRONLY
, fileno
);
573 case EXEC_OUTPUT_NULL
:
574 return open_null_as(O_WRONLY
, fileno
);
576 case EXEC_OUTPUT_TTY
:
577 if (is_terminal_input(i
))
578 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
580 /* We don't reset the terminal if this is just about output */
581 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
583 case EXEC_OUTPUT_SYSLOG
:
584 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE
:
585 case EXEC_OUTPUT_KMSG
:
586 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
587 case EXEC_OUTPUT_JOURNAL
:
588 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
589 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
591 log_unit_error_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m", fileno
== STDOUT_FILENO
? "stdout" : "stderr");
592 r
= open_null_as(O_WRONLY
, fileno
);
596 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
597 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
598 * services to detect whether they are connected to the journal or not. */
600 if (fstat(fileno
, &st
) >= 0) {
601 *journal_stream_dev
= st
.st_dev
;
602 *journal_stream_ino
= st
.st_ino
;
607 case EXEC_OUTPUT_SOCKET
:
608 assert(socket_fd
>= 0);
609 return dup2(socket_fd
, fileno
) < 0 ? -errno
: fileno
;
611 case EXEC_OUTPUT_NAMED_FD
:
612 (void) fd_nonblock(named_iofds
[fileno
], false);
613 return dup2(named_iofds
[fileno
], fileno
) < 0 ? -errno
: fileno
;
616 assert_not_reached("Unknown error type");
620 static int chown_terminal(int fd
, uid_t uid
) {
625 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
629 /* This might fail. What matters are the results. */
630 (void) fchown(fd
, uid
, -1);
631 (void) fchmod(fd
, TTY_MODE
);
633 if (fstat(fd
, &st
) < 0)
636 if (st
.st_uid
!= uid
|| (st
.st_mode
& 0777) != TTY_MODE
)
642 static int setup_confirm_stdio(const char *vc
, int *_saved_stdin
, int *_saved_stdout
) {
643 _cleanup_close_
int fd
= -1, saved_stdin
= -1, saved_stdout
= -1;
646 assert(_saved_stdin
);
647 assert(_saved_stdout
);
649 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
653 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
654 if (saved_stdout
< 0)
657 fd
= acquire_terminal(vc
, false, false, false, DEFAULT_CONFIRM_USEC
);
661 r
= chown_terminal(fd
, getuid());
665 r
= reset_terminal_fd(fd
, true);
669 if (dup2(fd
, STDIN_FILENO
) < 0)
672 if (dup2(fd
, STDOUT_FILENO
) < 0)
679 *_saved_stdin
= saved_stdin
;
680 *_saved_stdout
= saved_stdout
;
682 saved_stdin
= saved_stdout
= -1;
687 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
690 if (err
== -ETIMEDOUT
)
691 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
694 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
698 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
699 _cleanup_close_
int fd
= -1;
703 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
707 write_confirm_error_fd(err
, fd
, u
);
710 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
714 assert(saved_stdout
);
718 if (*saved_stdin
>= 0)
719 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
722 if (*saved_stdout
>= 0)
723 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
726 *saved_stdin
= safe_close(*saved_stdin
);
727 *saved_stdout
= safe_close(*saved_stdout
);
733 CONFIRM_PRETEND_FAILURE
= -1,
734 CONFIRM_PRETEND_SUCCESS
= 0,
738 static int ask_for_confirmation(const char *vc
, Unit
*u
, const char *cmdline
) {
739 int saved_stdout
= -1, saved_stdin
= -1, r
;
740 _cleanup_free_
char *e
= NULL
;
743 /* For any internal errors, assume a positive response. */
744 r
= setup_confirm_stdio(vc
, &saved_stdin
, &saved_stdout
);
746 write_confirm_error(r
, vc
, u
);
747 return CONFIRM_EXECUTE
;
750 /* confirm_spawn might have been disabled while we were sleeping. */
751 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
756 e
= ellipsize(cmdline
, 60, 100);
764 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
766 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
773 printf("Resuming normal execution.\n");
774 manager_disable_confirm_spawn();
778 unit_dump(u
, stdout
, " ");
779 continue; /* ask again */
781 printf("Failing execution.\n");
782 r
= CONFIRM_PRETEND_FAILURE
;
785 printf(" c - continue, proceed without asking anymore\n"
786 " D - dump, show the state of the unit\n"
787 " f - fail, don't execute the command and pretend it failed\n"
789 " i - info, show a short summary of the unit\n"
790 " j - jobs, show jobs that are in progress\n"
791 " s - skip, don't execute the command and pretend it succeeded\n"
792 " y - yes, execute the command\n");
793 continue; /* ask again */
795 printf(" Description: %s\n"
798 u
->id
, u
->description
, cmdline
);
799 continue; /* ask again */
801 manager_dump_jobs(u
->manager
, stdout
, " ");
802 continue; /* ask again */
804 /* 'n' was removed in favor of 'f'. */
805 printf("Didn't understand 'n', did you mean 'f'?\n");
806 continue; /* ask again */
808 printf("Skipping execution.\n");
809 r
= CONFIRM_PRETEND_SUCCESS
;
815 assert_not_reached("Unhandled choice");
821 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
825 static int get_fixed_user(const ExecContext
*c
, const char **user
,
826 uid_t
*uid
, gid_t
*gid
,
827 const char **home
, const char **shell
) {
836 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
837 * (i.e. are "/" or "/bin/nologin"). */
840 r
= get_user_creds_clean(&name
, uid
, gid
, home
, shell
);
848 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
858 r
= get_group_creds(&name
, gid
);
866 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
867 const char *group
, gid_t gid
,
868 gid_t
**supplementary_gids
, int *ngids
) {
872 bool keep_groups
= false;
873 gid_t
*groups
= NULL
;
874 _cleanup_free_ gid_t
*l_gids
= NULL
;
879 * If user is given, then look up GID and supplementary groups list.
880 * We avoid NSS lookups for gid=0. Also we have to initialize groups
881 * here and as early as possible so we keep the list of supplementary
882 * groups of the caller.
884 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
885 /* First step, initialize groups from /etc/group */
886 if (initgroups(user
, gid
) < 0)
892 if (!c
->supplementary_groups
)
896 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
897 * be positive, otherwise fail.
900 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
901 if (ngroups_max
<= 0) {
905 return -EOPNOTSUPP
; /* For all other values */
908 l_gids
= new(gid_t
, ngroups_max
);
914 * Look up the list of groups that the user belongs to, we
915 * avoid NSS lookups here too for gid=0.
918 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
923 STRV_FOREACH(i
, c
->supplementary_groups
) {
926 if (k
>= ngroups_max
)
930 r
= get_group_creds(&g
, l_gids
+k
);
938 * Sets ngids to zero to drop all supplementary groups, happens
939 * when we are under root and SupplementaryGroups= is empty.
946 /* Otherwise get the final list of supplementary groups */
947 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
951 *supplementary_gids
= groups
;
959 static int enforce_groups(const ExecContext
*context
, gid_t gid
,
960 gid_t
*supplementary_gids
, int ngids
) {
965 /* Handle SupplementaryGroups= even if it is empty */
966 if (context
->supplementary_groups
) {
967 r
= maybe_setgroups(ngids
, supplementary_gids
);
972 if (gid_is_valid(gid
)) {
973 /* Then set our gids */
974 if (setresgid(gid
, gid
, gid
) < 0)
981 static int enforce_user(const ExecContext
*context
, uid_t uid
) {
984 if (!uid_is_valid(uid
))
987 /* Sets (but doesn't look up) the uid and makes sure we keep the
988 * capabilities while doing so. */
990 if (context
->capability_ambient_set
!= 0) {
992 /* First step: If we need to keep capabilities but
993 * drop privileges we need to make sure we keep our
994 * caps, while we drop privileges. */
996 int sb
= context
->secure_bits
| 1<<SECURE_KEEP_CAPS
;
998 if (prctl(PR_GET_SECUREBITS
) != sb
)
999 if (prctl(PR_SET_SECUREBITS
, sb
) < 0)
1004 /* Second step: actually set the uids */
1005 if (setresuid(uid
, uid
, uid
) < 0)
1008 /* At this point we should have all necessary capabilities but
1009 are otherwise a normal user. However, the caps might have got
1010 corrupted due to the setresuid() so we need to clean them up
1011 later. This is done outside of this call. */
1018 static int null_conv(
1020 const struct pam_message
**msg
,
1021 struct pam_response
**resp
,
1022 void *appdata_ptr
) {
1024 /* We don't support conversations */
1026 return PAM_CONV_ERR
;
1031 static int setup_pam(
1038 int fds
[], unsigned n_fds
) {
1042 static const struct pam_conv conv
= {
1047 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1048 pam_handle_t
*handle
= NULL
;
1050 int pam_code
= PAM_SUCCESS
, r
;
1051 char **nv
, **e
= NULL
;
1052 bool close_session
= false;
1053 pid_t pam_pid
= 0, parent_pid
;
1060 /* We set up PAM in the parent process, then fork. The child
1061 * will then stay around until killed via PR_SET_PDEATHSIG or
1062 * systemd via the cgroup logic. It will then remove the PAM
1063 * session again. The parent process will exec() the actual
1064 * daemon. We do things this way to ensure that the main PID
1065 * of the daemon is the one we initially fork()ed. */
1067 r
= barrier_create(&barrier
);
1071 if (log_get_max_level() < LOG_DEBUG
)
1072 flags
|= PAM_SILENT
;
1074 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1075 if (pam_code
!= PAM_SUCCESS
) {
1081 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1082 if (pam_code
!= PAM_SUCCESS
)
1086 STRV_FOREACH(nv
, *env
) {
1087 pam_code
= pam_putenv(handle
, *nv
);
1088 if (pam_code
!= PAM_SUCCESS
)
1092 pam_code
= pam_acct_mgmt(handle
, flags
);
1093 if (pam_code
!= PAM_SUCCESS
)
1096 pam_code
= pam_open_session(handle
, flags
);
1097 if (pam_code
!= PAM_SUCCESS
)
1100 close_session
= true;
1102 e
= pam_getenvlist(handle
);
1104 pam_code
= PAM_BUF_ERR
;
1108 /* Block SIGTERM, so that we know that it won't get lost in
1111 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1113 parent_pid
= getpid_cached();
1122 int sig
, ret
= EXIT_PAM
;
1124 /* The child's job is to reset the PAM session on
1126 barrier_set_role(&barrier
, BARRIER_CHILD
);
1128 /* This string must fit in 10 chars (i.e. the length
1129 * of "/sbin/init"), to look pretty in /bin/ps */
1130 rename_process("(sd-pam)");
1132 /* Make sure we don't keep open the passed fds in this
1133 child. We assume that otherwise only those fds are
1134 open here that have been opened by PAM. */
1135 close_many(fds
, n_fds
);
1137 /* Drop privileges - we don't need any to call pam_close_session()
1138 * and this will make PR_SET_PDEATHSIG work in most cases.
1139 * If this fails, ignore the error - but expect sd-pam threads
1140 * to fail to exit normally */
1142 r
= maybe_setgroups(0, NULL
);
1144 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1145 if (setresgid(gid
, gid
, gid
) < 0)
1146 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1147 if (setresuid(uid
, uid
, uid
) < 0)
1148 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1150 (void) ignore_signals(SIGPIPE
, -1);
1152 /* Wait until our parent died. This will only work if
1153 * the above setresuid() succeeds, otherwise the kernel
1154 * will not allow unprivileged parents to kill their privileged
1155 * children this way. We rely on the control groups kill logic
1156 * to do the rest for us. */
1157 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1160 /* Tell the parent that our setup is done. This is especially
1161 * important regarding dropping privileges. Otherwise, unit
1162 * setup might race against our setresuid(2) call.
1164 * If the parent aborted, we'll detect this below, hence ignore
1165 * return failure here. */
1166 (void) barrier_place(&barrier
);
1168 /* Check if our parent process might already have died? */
1169 if (getppid() == parent_pid
) {
1172 assert_se(sigemptyset(&ss
) >= 0);
1173 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1176 if (sigwait(&ss
, &sig
) < 0) {
1183 assert(sig
== SIGTERM
);
1188 /* If our parent died we'll end the session */
1189 if (getppid() != parent_pid
) {
1190 pam_code
= pam_close_session(handle
, flags
);
1191 if (pam_code
!= PAM_SUCCESS
)
1198 pam_end(handle
, pam_code
| flags
);
1202 barrier_set_role(&barrier
, BARRIER_PARENT
);
1204 /* If the child was forked off successfully it will do all the
1205 * cleanups, so forget about the handle here. */
1208 /* Unblock SIGTERM again in the parent */
1209 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1211 /* We close the log explicitly here, since the PAM modules
1212 * might have opened it, but we don't want this fd around. */
1215 /* Synchronously wait for the child to initialize. We don't care for
1216 * errors as we cannot recover. However, warn loudly if it happens. */
1217 if (!barrier_place_and_sync(&barrier
))
1218 log_error("PAM initialization failed");
1226 if (pam_code
!= PAM_SUCCESS
) {
1227 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1228 r
= -EPERM
; /* PAM errors do not map to errno */
1230 log_error_errno(r
, "PAM failed: %m");
1234 pam_code
= pam_close_session(handle
, flags
);
1236 pam_end(handle
, pam_code
| flags
);
1248 static void rename_process_from_path(const char *path
) {
1249 char process_name
[11];
1253 /* This resulting string must fit in 10 chars (i.e. the length
1254 * of "/sbin/init") to look pretty in /bin/ps */
1258 rename_process("(...)");
1264 /* The end of the process name is usually more
1265 * interesting, since the first bit might just be
1271 process_name
[0] = '(';
1272 memcpy(process_name
+1, p
, l
);
1273 process_name
[1+l
] = ')';
1274 process_name
[1+l
+1] = 0;
1276 rename_process(process_name
);
1279 static bool context_has_address_families(const ExecContext
*c
) {
1282 return c
->address_families_whitelist
||
1283 !set_isempty(c
->address_families
);
1286 static bool context_has_syscall_filters(const ExecContext
*c
) {
1289 return c
->syscall_whitelist
||
1290 !set_isempty(c
->syscall_filter
);
1293 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1296 if (c
->no_new_privileges
)
1299 if (have_effective_cap(CAP_SYS_ADMIN
)) /* if we are privileged, we don't need NNP */
1302 /* We need NNP if we have any form of seccomp and are unprivileged */
1303 return context_has_address_families(c
) ||
1304 c
->memory_deny_write_execute
||
1305 c
->restrict_realtime
||
1306 exec_context_restrict_namespaces_set(c
) ||
1307 c
->protect_kernel_tunables
||
1308 c
->protect_kernel_modules
||
1309 c
->private_devices
||
1310 context_has_syscall_filters(c
) ||
1311 !set_isempty(c
->syscall_archs
) ||
1312 c
->lock_personality
;
1317 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1319 if (is_seccomp_available())
1323 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1328 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1329 uint32_t negative_action
, default_action
, action
;
1335 if (!context_has_syscall_filters(c
))
1338 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1341 negative_action
= c
->syscall_errno
== 0 ? SCMP_ACT_KILL
: SCMP_ACT_ERRNO(c
->syscall_errno
);
1343 if (c
->syscall_whitelist
) {
1344 default_action
= negative_action
;
1345 action
= SCMP_ACT_ALLOW
;
1347 default_action
= SCMP_ACT_ALLOW
;
1348 action
= negative_action
;
1351 if (needs_ambient_hack
) {
1352 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_whitelist
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1357 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
);
1360 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1364 if (set_isempty(c
->syscall_archs
))
1367 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1370 return seccomp_restrict_archs(c
->syscall_archs
);
1373 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1377 if (!context_has_address_families(c
))
1380 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1383 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_whitelist
);
1386 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1390 if (!c
->memory_deny_write_execute
)
1393 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1396 return seccomp_memory_deny_write_execute();
1399 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1403 if (!c
->restrict_realtime
)
1406 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1409 return seccomp_restrict_realtime();
1412 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1416 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1417 * let's protect even those systems where this is left on in the kernel. */
1419 if (!c
->protect_kernel_tunables
)
1422 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1425 return seccomp_protect_sysctl();
1428 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1432 /* Turn off module syscalls on ProtectKernelModules=yes */
1434 if (!c
->protect_kernel_modules
)
1437 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1440 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
));
1443 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1447 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1449 if (!c
->private_devices
)
1452 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1455 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
));
1458 static int apply_restrict_namespaces(Unit
*u
, const ExecContext
*c
) {
1462 if (!exec_context_restrict_namespaces_set(c
))
1465 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1468 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
1471 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1472 unsigned long personality
;
1478 if (!c
->lock_personality
)
1481 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1484 personality
= c
->personality
;
1486 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1487 if (personality
== PERSONALITY_INVALID
) {
1489 r
= opinionated_personality(&personality
);
1494 return seccomp_lock_personality(personality
);
1499 static void do_idle_pipe_dance(int idle_pipe
[4]) {
1502 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1503 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1505 if (idle_pipe
[0] >= 0) {
1508 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1510 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1513 /* Signal systemd that we are bored and want to continue. */
1514 n
= write(idle_pipe
[3], "x", 1);
1516 /* Wait for systemd to react to the signal above. */
1517 fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1520 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1524 idle_pipe
[3] = safe_close(idle_pipe
[3]);
/* Assembles the environment block that the service manager itself provides to the process being
 * spawned: LISTEN_PID/LISTEN_FDS/LISTEN_FDNAMES, WATCHDOG_PID/WATCHDOG_USEC, SYSTEMD_NSS_BYPASS_BUS,
 * HOME, LOGNAME, USER, SHELL, INVOCATION_ID, TERM and JOURNAL_STREAM. Each entry is a freshly
 * allocated "NAME=value" string stored in our_env[], which is NULL-terminated at the end.
 *
 * our_env[] is allocated with 14 slots: at most 13 variables plus the terminating NULL. Keep the
 * allocation size and the final bound check in sync whenever a variable is added or removed. */
1527 static int build_environment(
1529 const ExecContext
*c
,
1530 const ExecParameters
*p
,
1533 const char *username
,
1535 dev_t journal_stream_dev
,
1536 ino_t journal_stream_ino
,
1539 _cleanup_strv_free_
char **our_env
= NULL
;
/* 13 possible variables + 1 NULL terminator */
1547 our_env
= new0(char*, 14);
1552 _cleanup_free_
char *joined
= NULL
;
/* Socket activation: LISTEN_PID, LISTEN_FDS and the colon-separated LISTEN_FDNAMES */
1554 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1556 our_env
[n_env
++] = x
;
1558 if (asprintf(&x
, "LISTEN_FDS=%u", n_fds
) < 0)
1560 our_env
[n_env
++] = x
;
1562 joined
= strv_join(p
->fd_names
, ":");
1566 x
= strjoin("LISTEN_FDNAMES=", joined
);
1569 our_env
[n_env
++] = x
;
/* Watchdog: only exported when requested via EXEC_SET_WATCHDOG and a non-zero timeout is set */
1572 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1573 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1575 our_env
[n_env
++] = x
;
1577 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1579 our_env
[n_env
++] = x
;
1582 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus to look up dynamic
1583 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1584 * check the database directly. */
1585 if (p
->flags
& EXEC_NSS_BYPASS_BUS
) {
1586 x
= strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1589 our_env
[n_env
++] = x
;
/* User identity: HOME, LOGNAME, USER (both from username) and SHELL */
1593 x
= strappend("HOME=", home
);
1596 our_env
[n_env
++] = x
;
1600 x
= strappend("LOGNAME=", username
);
1603 our_env
[n_env
++] = x
;
1605 x
= strappend("USER=", username
);
1608 our_env
[n_env
++] = x
;
1612 x
= strappend("SHELL=", shell
);
1615 our_env
[n_env
++] = x
;
/* INVOCATION_ID is only exported if the unit has a non-null invocation ID */
1618 if (!sd_id128_is_null(u
->invocation_id
)) {
1619 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1622 our_env
[n_env
++] = x
;
1625 if (exec_context_needs_term(c
)) {
1626 const char *tty_path
, *term
= NULL
;
1628 tty_path
= exec_context_tty_path(c
);
1630 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1631 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1632 * passes to PID 1 ends up all the way in the console login shown. */
1634 if (path_equal(tty_path
, "/dev/console") && getppid() == 1)
1635 term
= getenv("TERM");
1637 term
= default_term_for_tty(tty_path
);
1639 x
= strappend("TERM=", term
);
1642 our_env
[n_env
++] = x
;
/* JOURNAL_STREAM is only exported when both the device and inode numbers are non-zero */
1645 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
1646 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
1649 our_env
[n_env
++] = x
;
1652 our_env
[n_env
++] = NULL
;
/* n_env was post-incremented for the NULL terminator as well, so it can legitimately reach
 * 13 variables + 1 terminator = 14, which is exactly the number of slots allocated above.
 * The bound must therefore be 14; a smaller bound aborts on valid configurations in which
 * (nearly) all variables are set. */
1653 assert(n_env
<= 14);
/* Builds a NULL-terminated vector of "NAME=value" strings for the variable names listed in
 * c->pass_environment. Each entry is formed with strjoin(*i, "=", v) and stored in pass_env, which is
 * grown with GREEDY_REALLOC() before every insertion.
 *
 * NOTE(review): the lines that obtain "v", handle allocation failures and hand the vector over through
 * "ret" are not visible in this extract. Presumably v is the value of *i in our own environment and unset
 * variables are skipped; confirm against the complete file. */
1661 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
1662 _cleanup_strv_free_
char **pass_env
= NULL
;
1663 size_t n_env
= 0, n_bufsize
= 0;
1666 STRV_FOREACH(i
, c
->pass_environment
) {
1667 _cleanup_free_
char *x
= NULL
;
1673 x
= strjoin(*i
, "=", v
);
/* Reserve room for the new entry plus the terminating NULL */
1676 if (!GREEDY_REALLOC(pass_env
, n_bufsize
, n_env
+ 2))
1678 pass_env
[n_env
++] = x
;
1679 pass_env
[n_env
] = NULL
;
/* Decides whether the command has to be run in a mount namespace of its own. The conditions visible here are:
 *   - context->root_image is set;
 *   - any of read_write_paths, read_only_paths or inaccessible_paths is non-empty;
 *   - bind mounts are configured (n_bind_mounts > 0);
 *   - mount_flags is non-zero;
 *   - private_tmp is set and the runtime object carries a tmp_dir or var_tmp_dir;
 *   - private_devices, protect_system, protect_home, protect_kernel_tunables, protect_kernel_modules or
 *     protect_control_groups is enabled;
 *   - mount_apivfs is set together with a root image or root directory.
 *
 * NOTE(review): the statements guarded by these conditions are not visible in this extract. Presumably
 * each of them returns true and the function ends with "return false"; confirm against the complete file. */
1689 static bool exec_needs_mount_namespace(
1690 const ExecContext
*context
,
1691 const ExecParameters
*params
,
1692 ExecRuntime
*runtime
) {
1697 if (context
->root_image
)
1700 if (!strv_isempty(context
->read_write_paths
) ||
1701 !strv_isempty(context
->read_only_paths
) ||
1702 !strv_isempty(context
->inaccessible_paths
))
1705 if (context
->n_bind_mounts
> 0)
1708 if (context
->mount_flags
!= 0)
1711 if (context
->private_tmp
&& runtime
&& (runtime
->tmp_dir
|| runtime
->var_tmp_dir
))
1714 if (context
->private_devices
||
1715 context
->protect_system
!= PROTECT_SYSTEM_NO
||
1716 context
->protect_home
!= PROTECT_HOME_NO
||
1717 context
->protect_kernel_tunables
||
1718 context
->protect_kernel_modules
||
1719 context
->protect_control_groups
)
1722 if (context
->mount_apivfs
&& (context
->root_image
|| context
->root_directory
))
/* Moves the calling process into a new user namespace (unshare(CLONE_NEWUSER)) and has a temporary child
 * process write the setgroups, gid_map and uid_map files of the parent, as explained in the comment below.
 *
 * uid, gid: when non-zero and valid, the UID/GID is mapped to itself in addition to the "0 0 1" root
 * mapping; the strdup("0 0 1\n") lines cover the remaining case.
 *
 * Synchronization visible here: unshare_ready_fd (an eventfd) is written by the parent once the namespace
 * exists and read by the child before it touches the map files; errno_pipe carries an error code from the
 * child back to the parent, and the parent finally waits for the child with wait_for_terminate().
 *
 * NOTE(review): the fork() call, several declarations and all error returns are not visible in this
 * extract; confirm the exact error handling against the complete file. */
1728 static int setup_private_users(uid_t uid
, gid_t gid
) {
1729 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
1730 _cleanup_close_pair_
int errno_pipe
[2] = { -1, -1 };
1731 _cleanup_close_
int unshare_ready_fd
= -1;
1732 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
1738 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1739 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1740 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1741 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1742 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1743 * continues execution normally. */
1745 if (uid
!= 0 && uid_is_valid(uid
)) {
1746 r
= asprintf(&uid_map
,
1747 "0 0 1\n" /* Map root → root */
1748 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
1753 uid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1758 if (gid
!= 0 && gid_is_valid(gid
)) {
1759 r
= asprintf(&gid_map
,
1760 "0 0 1\n" /* Map root → root */
1761 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
1766 gid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1771 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1773 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
1774 if (unshare_ready_fd
< 0)
1777 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1779 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
1787 _cleanup_close_
int fd
= -1;
1791 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1792 * here, after the parent opened its own user namespace. */
1795 errno_pipe
[0] = safe_close(errno_pipe
[0]);
1797 /* Wait until the parent unshared the user namespace */
1798 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
1803 /* Disable the setgroups() system call in the child user namespace, for good. */
1804 a
= procfs_file_alloca(ppid
, "setgroups");
1805 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1807 if (errno
!= ENOENT
) {
1812 /* If the file is missing the kernel is too old, let's continue anyway. */
1814 if (write(fd
, "deny\n", 5) < 0) {
1819 fd
= safe_close(fd
);
1822 /* First write the GID map */
1823 a
= procfs_file_alloca(ppid
, "gid_map");
1824 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1829 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
1833 fd
= safe_close(fd
);
1835 /* Then write the UID map */
1836 a
= procfs_file_alloca(ppid
, "uid_map");
1837 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1842 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
1847 _exit(EXIT_SUCCESS
);
1850 (void) write(errno_pipe
[1], &r
, sizeof(r
));
1851 _exit(EXIT_FAILURE
);
1854 errno_pipe
[1] = safe_close(errno_pipe
[1]);
1856 if (unshare(CLONE_NEWUSER
) < 0)
1859 /* Let the child know that the namespace is ready now */
1860 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
1863 /* Try to read an error code from the child */
1864 n
= read(errno_pipe
[0], &r
, sizeof(r
));
1867 if (n
== sizeof(r
)) { /* an error code was sent to us */
1872 if (n
!= 0) /* on success we should have read 0 bytes */
1875 r
= wait_for_terminate(pid
, &si
);
1880 /* If something strange happened with the child, let's consider this fatal, too */
1881 if (si
.si_code
!= CLD_EXITED
|| si
.si_status
!= 0)
/* Creates the directories of one type (runtime, state, cache, logs or configuration) that are listed in
 * context->directories[type].paths, each one below params->prefix[type].
 *
 * For every path, "<prefix>/<path>" is built with strjoin(), its parents are created with mode 0755 through
 * mkdir_parents_label(), the directory itself is created with the configured mode through mkdir_p_label(),
 * and chmod_and_chown() is applied with the configured mode and the given uid/gid. A check for
 * EXEC_DIRECTORY_CONFIGURATION precedes the chmod_and_chown() call; according to the comment next to it the
 * owner of configuration directories is not changed.
 *
 * When params->flags contains EXEC_CHOWN_DIRECTORIES, uid and gid are checked for validity up front.
 * *exit_status is assigned exit_status_table[type], i.e. the exit code that belongs to the directory type.
 *
 * NOTE(review): the statements guarded by the checks, the error handling and the return statements are not
 * visible in this extract. Presumably *exit_status is only written on the failure path; confirm. */
1887 static int setup_exec_directory(
1888 const ExecContext
*context
,
1889 const ExecParameters
*params
,
1892 ExecDirectoryType type
,
1895 static const int exit_status_table
[_EXEC_DIRECTORY_MAX
] = {
1896 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
1897 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
1898 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
1899 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
1900 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
1907 assert(type
>= 0 && type
< _EXEC_DIRECTORY_MAX
);
1908 assert(exit_status
);
1910 if (!params
->prefix
[type
])
1913 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
1914 if (!uid_is_valid(uid
))
1916 if (!gid_is_valid(gid
))
1920 STRV_FOREACH(rt
, context
->directories
[type
].paths
) {
1921 _cleanup_free_
char *p
;
1923 p
= strjoin(params
->prefix
[type
], "/", *rt
);
1929 r
= mkdir_parents_label(p
, 0755);
1933 r
= mkdir_p_label(p
, context
->directories
[type
].mode
);
1937 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
1938 * a service, and shall not be writable. */
1939 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
1942 r
= chmod_and_chown(p
, context
->directories
[type
].mode
, uid
, gid
);
1950 *exit_status
= exit_status_table
[type
];
/* Applies a SMACK label through mac_smack_apply_pid() with PID 0 (presumably meaning the calling process;
 * confirm against that helper).
 *
 * If context->smack_process_label is set, that label is applied. The code under SMACK_DEFAULT_PROCESS_LABEL
 * reads the SMACK_ATTR_EXEC label of command->path and applies it, falling back to
 * SMACK_DEFAULT_PROCESS_LABEL when no label was read; the check on the read result excludes -ENODATA and
 * -EOPNOTSUPP.
 *
 * NOTE(review): the return statements and the else/#endif structure are not visible in this extract. */
1955 static int setup_smack(
1956 const ExecContext
*context
,
1957 const ExecCommand
*command
) {
1964 if (context
->smack_process_label
) {
1965 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
1969 #ifdef SMACK_DEFAULT_PROCESS_LABEL
1971 _cleanup_free_
char *exec_label
= NULL
;
1973 r
= mac_smack_read(command
->path
, SMACK_ATTR_EXEC
, &exec_label
);
1974 if (r
< 0 && r
!= -ENODATA
&& r
!= -EOPNOTSUPP
)
1977 r
= mac_smack_apply_pid(0, exec_label
? : SMACK_DEFAULT_PROCESS_LABEL
);
/* Compiles the list of writable paths for the mount namespace: a copy of context->read_write_paths, extended
 * by "<params->prefix[i]>/<path>" for the paths configured in context->directories[i].paths of every
 * directory type i.
 *
 * If context->read_write_paths is empty and no directory type has any path configured, *ret is set to NULL.
 *
 * NOTE(review): the statement guarded by the "!params->prefix[i]" check, the allocation-failure handling and
 * the final hand-over of the list through "ret" are not visible in this extract. Presumably directory types
 * without a prefix are skipped; confirm against the complete file. */
1986 static int compile_read_write_paths(
1987 const ExecContext
*context
,
1988 const ExecParameters
*params
,
1991 _cleanup_strv_free_
char **l
= NULL
;
1993 ExecDirectoryType i
;
1995 /* Compile the list of writable paths. This is the combination of
1996 * the explicitly configured paths, plus all runtime directories. */
1998 if (strv_isempty(context
->read_write_paths
)) {
1999 for (i
= 0; i
< _EXEC_DIRECTORY_MAX
; i
++)
2000 if (!strv_isempty(context
->directories
[i
].paths
))
2003 if (i
== _EXEC_DIRECTORY_MAX
) {
2004 *ret
= NULL
; /* NOP if neither is set */
2009 l
= strv_copy(context
->read_write_paths
);
2013 for (i
= 0; i
< _EXEC_DIRECTORY_MAX
; i
++) {
2014 if (!params
->prefix
[i
])
2017 STRV_FOREACH(rt
, context
->directories
[i
].paths
) {
2020 s
= strjoin(params
->prefix
[i
], "/", *rt
);
2024 if (strv_consume(&l
, s
) < 0)
/* Collects the arguments for setup_namespace() from the execution context, the parameters and the runtime
 * object, and invokes it:
 *   - ns_info is initialized from the private_devices, protect_control_groups, protect_kernel_tunables,
 *     protect_kernel_modules and mount_apivfs settings of the context; ignore_protect_paths is switched on
 *     when dynamic_user is off and a root directory is in use;
 *   - tmp/var are set to the "/tmp" subdirectory of runtime->tmp_dir / runtime->var_tmp_dir when
 *     context->private_tmp is set and a runtime object is present;
 *   - rw receives the writable path list from compile_read_write_paths();
 *   - root_image and root_dir are taken from the context under the EXEC_APPLY_CHROOT check;
 *   - read_only_paths, inaccessible_paths, protect_home and protect_system are only passed on when
 *     needs_sandboxing is true (EXEC_APPLY_SANDBOXING set and the command not marked
 *     EXEC_COMMAND_FULLY_PRIVILEGED); otherwise NULL, PROTECT_HOME_NO and PROTECT_SYSTEM_NO are passed.
 *
 * A result of -EPERM or -EACCES from setup_namespace() is logged at debug level with an "ignoring" message.
 *
 * NOTE(review): several lines (declarations, some setup_namespace() arguments, the exact branch structure
 * around root_image/root_dir, the return statements) are not visible in this extract; confirm them
 * against the complete file. */
2035 static int apply_mount_namespace(
2037 ExecCommand
*command
,
2038 const ExecContext
*context
,
2039 const ExecParameters
*params
,
2040 ExecRuntime
*runtime
) {
2042 _cleanup_strv_free_
char **rw
= NULL
;
2043 char *tmp
= NULL
, *var
= NULL
;
2044 const char *root_dir
= NULL
, *root_image
= NULL
;
2045 NameSpaceInfo ns_info
= {
2046 .ignore_protect_paths
= false,
2047 .private_dev
= context
->private_devices
,
2048 .protect_control_groups
= context
->protect_control_groups
,
2049 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
2050 .protect_kernel_modules
= context
->protect_kernel_modules
,
2051 .mount_apivfs
= context
->mount_apivfs
,
2053 bool needs_sandboxing
;
2058 /* The runtime struct only contains the parent of the private /tmp,
2059 * which is non-accessible to world users. Inside of it there's a /tmp
2060 * that is sticky, and that's the one we want to use here. */
2062 if (context
->private_tmp
&& runtime
) {
2063 if (runtime
->tmp_dir
)
2064 tmp
= strjoina(runtime
->tmp_dir
, "/tmp");
2065 if (runtime
->var_tmp_dir
)
2066 var
= strjoina(runtime
->var_tmp_dir
, "/tmp");
2069 r
= compile_read_write_paths(context
, params
, &rw
);
2073 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2074 root_image
= context
->root_image
;
2077 root_dir
= context
->root_directory
;
2081 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2082 * sandbox info, otherwise enforce it, don't ignore protected paths and
2083 * fail if we are enable to apply the sandbox inside the mount namespace.
2085 if (!context
->dynamic_user
&& root_dir
)
2086 ns_info
.ignore_protect_paths
= true;
2088 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2090 r
= setup_namespace(root_dir
, root_image
,
2092 needs_sandboxing
? context
->read_only_paths
: NULL
,
2093 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
2094 context
->bind_mounts
,
2095 context
->n_bind_mounts
,
2098 needs_sandboxing
? context
->protect_home
: PROTECT_HOME_NO
,
2099 needs_sandboxing
? context
->protect_system
: PROTECT_SYSTEM_NO
,
2100 context
->mount_flags
,
2101 DISSECT_IMAGE_DISCARD_ON_LOOP
);
2103 /* If we couldn't set up the namespace this is probably due to a
2104 * missing capability. In this case, silently proceed. */
2105 if (IN_SET(r
, -EPERM
, -EACCES
)) {
2107 log_unit_debug_errno(u
, r
, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
/* Enters the root directory (where applicable) and the working directory of the command.
 *
 * The working directory "wd" is chosen from the context: there is a separate branch for
 * context->working_directory_home, and otherwise context->working_directory is used when set.
 *
 * Under the EXEC_APPLY_CHROOT check, chroot(context->root_directory) is called when no mount namespace is
 * in use (needs_mount_ns is false) and a root directory is configured; on failure *exit_status becomes
 * EXIT_CHROOT. "d" is computed with prefix_roota(context->root_directory, wd). A failing chdir(d) sets
 * *exit_status to EXIT_CHDIR unless context->working_directory_missing_ok is set.
 *
 * NOTE(review): the body of the working_directory_home branch, the fallback value of wd, the branch that
 * selects between wd and the prefixed path, and all return statements are not visible in this extract. */
2115 static int apply_working_directory(
2116 const ExecContext
*context
,
2117 const ExecParameters
*params
,
2119 const bool needs_mount_ns
,
2125 assert(exit_status
);
2127 if (context
->working_directory_home
) {
2130 *exit_status
= EXIT_CHDIR
;
2136 } else if (context
->working_directory
)
2137 wd
= context
->working_directory
;
2141 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2142 if (!needs_mount_ns
&& context
->root_directory
)
2143 if (chroot(context
->root_directory
) < 0) {
2144 *exit_status
= EXIT_CHROOT
;
2150 d
= prefix_roota(context
->root_directory
, wd
);
2152 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
2153 *exit_status
= EXIT_CHDIR
;
/* Joins a fresh session keyring (see the rationale below), stores the invocation ID of the unit in it as a
 * "user" key named "invocation_id", restricts the permissions of that key to view/read/search for possessor
 * and user, and finally changes the ownership of the keyring to uid/gid if either of them is valid.
 *
 * The function is gated on EXEC_NEW_KEYRING in p->flags. When joining the keyring fails, ENOSYS,
 * EACCES/EPERM and EDQUOT are only logged at debug level ("ignoring"); any other errno is reported through
 * log_error_errno(). Failures of KEYCTL_SETPERM and KEYCTL_CHOWN are reported through log_error_errno() too,
 * while a failure to add the invocation ID key is merely logged at debug level.
 *
 * NOTE(review): the statements following the flag check and the ignored error cases (presumably
 * "return 0") are not visible in this extract; confirm against the complete file. */
2160 static int setup_keyring(Unit
*u
, const ExecParameters
*p
, uid_t uid
, gid_t gid
) {
2161 key_serial_t keyring
;
2166 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2167 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2168 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2169 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2170 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2171 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2173 if (!(p
->flags
& EXEC_NEW_KEYRING
))
2176 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
2177 if (keyring
== -1) {
2178 if (errno
== ENOSYS
)
2179 log_debug_errno(errno
, "Kernel keyring not supported, ignoring.");
2180 else if (IN_SET(errno
, EACCES
, EPERM
))
2181 log_debug_errno(errno
, "Kernel keyring access prohibited, ignoring.");
2182 else if (errno
== EDQUOT
)
2183 log_debug_errno(errno
, "Out of kernel keyrings to allocate, ignoring.");
2185 return log_error_errno(errno
, "Setting up kernel keyring failed: %m");
2190 /* Populate the keyring with the invocation ID by default. */
2191 if (!sd_id128_is_null(u
->invocation_id
)) {
2194 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
2196 log_debug_errno(errno
, "Failed to add invocation ID to keyring, ignoring: %m");
2198 if (keyctl(KEYCTL_SETPERM
, key
,
2199 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
2200 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
2201 return log_error_errno(errno
, "Failed to restrict invocation ID permission: %m");
2205 /* And now, make the keyring owned by the service's user */
2206 if (uid_is_valid(uid
) || gid_is_valid(gid
))
2207 if (keyctl(KEYCTL_CHOWN
, keyring
, uid
, gid
, 0) < 0)
2208 return log_error_errno(errno
, "Failed to change ownership of session keyring: %m");
/* Appends the two file descriptors of "pair" to "array", advancing the element counter *n for each one.
 *
 * NOTE(review): the lines between the signature and the two stores are not visible in this extract.
 * Presumably each descriptor is only appended when it is >= 0; confirm against the complete file. */
2213 static void append_socket_pair(int *array
, unsigned *n
, int pair
[2]) {
2221 array
[(*n
)++] = pair
[0];
2223 array
[(*n
)++] = pair
[1];
/* Closes all file descriptors except the ones that are still needed, by collecting the latter in dont_close
 * and passing that list to close_all_fds(). The descriptors kept open, as visible here, are:
 *   - params->stdin_fd, stdout_fd and stderr_fd, each when >= 0;
 *   - socket_fd;
 *   - the n_fds entries of fds;
 *   - runtime->netns_storage_socket;
 *   - the storage sockets of dcreds->user and dcreds->group;
 *   - user_lookup_fd when >= 0.
 *
 * dont_close is sized n_fds + 12, which leaves room for the fixed entries listed above next to the fds.
 *
 * NOTE(review): the conditions guarding the socket_fd, runtime and dcreds entries are not visible in this
 * extract. Presumably they check for >= 0 / non-NULL; confirm against the complete file. */
2226 static int close_remaining_fds(
2227 const ExecParameters
*params
,
2228 ExecRuntime
*runtime
,
2229 DynamicCreds
*dcreds
,
2232 int *fds
, unsigned n_fds
) {
2234 unsigned n_dont_close
= 0;
2235 int dont_close
[n_fds
+ 12];
2239 if (params
->stdin_fd
>= 0)
2240 dont_close
[n_dont_close
++] = params
->stdin_fd
;
2241 if (params
->stdout_fd
>= 0)
2242 dont_close
[n_dont_close
++] = params
->stdout_fd
;
2243 if (params
->stderr_fd
>= 0)
2244 dont_close
[n_dont_close
++] = params
->stderr_fd
;
2247 dont_close
[n_dont_close
++] = socket_fd
;
2249 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
2250 n_dont_close
+= n_fds
;
2254 append_socket_pair(dont_close
, &n_dont_close
, runtime
->netns_storage_socket
);
2258 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->user
->storage_socket
);
2260 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->group
->storage_socket
);
2263 if (user_lookup_fd
>= 0)
2264 dont_close
[n_dont_close
++] = user_lookup_fd
;
2266 return close_all_fds(dont_close
, n_dont_close
);
/* Writes the resolved uid, the resolved gid and the unit ID (unit->id, without a trailing NUL since its
 * length is taken with strlen()) to user_lookup_fd in one writev() call of three iovecs.
 *
 * Two checks precede the write: user_lookup_fd < 0, and uid and gid both being invalid.
 *
 * NOTE(review): the parameter list, the statements guarded by the two checks (presumably early returns)
 * and the return statements are not visible in this extract; confirm against the complete file. */
2269 static int send_user_lookup(
2277 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2278 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2281 if (user_lookup_fd
< 0)
2284 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
2287 if (writev(user_lookup_fd
,
2289 { .iov_base
= &uid
, .iov_len
= sizeof(uid
) },
2290 { .iov_base
= &gid
, .iov_len
= sizeof(gid
) },
2291 { .iov_base
= unit
->id
, .iov_len
= strlen(unit
->id
) }}, 3) < 0)
2297 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
2304 /* If WorkingDirectory=~ is set, try to acquire a usable home directory.
 *
 * The check on c->working_directory_home comes first. UID 0 is special-cased (see the comment further
 * down), and otherwise the home directory is looked up with get_home_dir(), which stores its result
 * in *buf.
 *
 * NOTE(review): the statements guarded by the checks, the assignments to *home and the return
 * statements are not visible in this extract. Presumably *home is left alone when it is already
 * set, and is pointed at *buf after a successful lookup; confirm against the complete file. */
2309 if (!c
->working_directory_home
)
2313 /* Hardcode /root as home directory for UID 0 */
2318 r
= get_home_dir(buf
);
/* Prepares the process environment for a command and finally execve()s it.
 *
 * The steps visible here, in order:
 *   1. rename the process after command->path, reset signal dispositions and the signal mask;
 *   2. idle pipe handling, closing of all fds that are not needed, tty reset, optional spawn confirmation;
 *   3. user/group resolution (dynamic credentials or the fixed user/group of the context), supplementary
 *      groups, reporting of the resolved UID/GID via send_user_lookup(), home directory lookup;
 *   4. stdin/stdout/stderr setup;
 *   5. cgroup attachment, OOM score adjustment, nice level, CPU scheduler and affinity, I/O priority, timer
 *      slack, personality, utmp registration, terminal ownership, cgroup delegation;
 *   6. creation of the exec directories, assembly of the environment block, umask, session keyring;
 *   7. computation of needs_sandboxing / needs_ambient_hack / needs_setuid, MAC availability checks, PAM,
 *      network namespace, mount namespace, working directory, group enforcement, user namespace;
 *   8. a second round of fd closing plus shift_fds()/flags_fds() on the fds to pass;
 *   9. resource limits, capability bounding and ambient sets, UID change, MAC labels, secure bits,
 *      no-new-privileges and the various apply_*() restrictions, ending with the syscall filter;
 *  10. expansion of environment variables in argv, a debug log entry, and execve().
 *
 * On failure *exit_status is set to the EXIT_* code of the step that failed; for some steps *error_message
 * additionally receives an allocated string describing the problem (it may stay NULL, see the comments
 * about strdup()/asprintf() failing below).
 *
 * NOTE(review): part of the parameter list, most error checks ("if (r < 0)") and every return statement
 * are not visible in this extract, so the exact return values cannot be stated from here. */
2326 static int exec_child(
2328 ExecCommand
*command
,
2329 const ExecContext
*context
,
2330 const ExecParameters
*params
,
2331 ExecRuntime
*runtime
,
2332 DynamicCreds
*dcreds
,
2337 unsigned n_storage_fds
,
2338 unsigned n_socket_fds
,
2342 char **error_message
) {
2344 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **accum_env
= NULL
, **final_argv
= NULL
;
2345 _cleanup_free_
char *mac_selinux_context_net
= NULL
, *home_buffer
= NULL
;
2346 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
2347 const char *username
= NULL
, *groupname
= NULL
;
2348 const char *home
= NULL
, *shell
= NULL
;
2349 dev_t journal_stream_dev
= 0;
2350 ino_t journal_stream_ino
= 0;
2351 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda) */
2352 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
2353 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
2354 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
2356 bool use_selinux
= false;
2359 bool use_smack
= false;
2361 #ifdef HAVE_APPARMOR
2362 bool use_apparmor
= false;
2364 uid_t uid
= UID_INVALID
;
2365 gid_t gid
= GID_INVALID
;
2366 int i
, r
, ngids
= 0;
2368 ExecDirectoryType dt
;
2375 assert(exit_status
);
2376 assert(error_message
);
2377 /* We don't always set error_message, hence it must be initialized */
2378 assert(*error_message
== NULL
);
2380 rename_process_from_path(command
->path
);
2382 /* We reset exactly these signals, since they are the
2383 * only ones we set to SIG_IGN in the main daemon. All
2384 * others we leave untouched because we set them to
2385 * SIG_DFL or a valid handler initially, both of which
2386 * will be demoted to SIG_DFL. */
2387 (void) default_signals(SIGNALS_CRASH_HANDLER
,
2388 SIGNALS_IGNORE
, -1);
2390 if (context
->ignore_sigpipe
)
2391 (void) ignore_signals(SIGPIPE
, -1);
2393 r
= reset_signal_mask();
2395 *exit_status
= EXIT_SIGNAL_MASK
;
2396 *error_message
= strdup("Failed to reset signal mask");
2397 /* If strdup fails, here and below, we will just print the generic error message. */
2401 if (params
->idle_pipe
)
2402 do_idle_pipe_dance(params
->idle_pipe
);
2404 /* Close sockets very early to make sure we don't
2405 * block init reexecution because it cannot bind its
2410 n_fds
= n_storage_fds
+ n_socket_fds
;
2411 r
= close_remaining_fds(params
, runtime
, dcreds
, user_lookup_fd
, socket_fd
, fds
, n_fds
);
2413 *exit_status
= EXIT_FDS
;
2414 *error_message
= strdup("Failed to close remaining fds");
2418 if (!context
->same_pgrp
)
2420 *exit_status
= EXIT_SETSID
;
2424 exec_context_tty_reset(context
, params
);
2426 if (unit_shall_confirm_spawn(unit
)) {
2427 const char *vc
= params
->confirm_spawn
;
2428 _cleanup_free_
char *cmdline
= NULL
;
2430 cmdline
= exec_command_line(argv
);
2432 *exit_status
= EXIT_CONFIRM
;
2436 r
= ask_for_confirmation(vc
, unit
, cmdline
);
2437 if (r
!= CONFIRM_EXECUTE
) {
2438 if (r
== CONFIRM_PRETEND_SUCCESS
) {
2439 *exit_status
= EXIT_SUCCESS
;
2442 *exit_status
= EXIT_CONFIRM
;
2443 *error_message
= strdup("Execution cancelled");
2448 if (context
->dynamic_user
&& dcreds
) {
2450 /* Make sure we bypass our own NSS module for any NSS checks */
2451 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2452 *exit_status
= EXIT_USER
;
2453 *error_message
= strdup("Failed to update environment");
2457 r
= dynamic_creds_realize(dcreds
, &uid
, &gid
);
2459 *exit_status
= EXIT_USER
;
2460 *error_message
= strdup("Failed to update dynamic user credentials");
2464 if (!uid_is_valid(uid
)) {
2465 *exit_status
= EXIT_USER
;
2466 (void) asprintf(error_message
, "UID validation failed for \""UID_FMT
"\"", uid
);
2467 /* If asprintf fails, here and below, we will just print the generic error message. */
2471 if (!gid_is_valid(gid
)) {
2472 *exit_status
= EXIT_USER
;
2473 (void) asprintf(error_message
, "GID validation failed for \""GID_FMT
"\"", gid
);
2478 username
= dcreds
->user
->name
;
2481 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
2483 *exit_status
= EXIT_USER
;
2484 *error_message
= strdup("Failed to determine user credentials");
2488 r
= get_fixed_group(context
, &groupname
, &gid
);
2490 *exit_status
= EXIT_GROUP
;
2491 *error_message
= strdup("Failed to determine group credentials");
2496 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2497 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
2498 &supplementary_gids
, &ngids
);
2500 *exit_status
= EXIT_GROUP
;
2501 *error_message
= strdup("Failed to determine supplementary groups");
2505 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
2507 *exit_status
= EXIT_USER
;
2508 *error_message
= strdup("Failed to send user credentials to PID1");
2512 user_lookup_fd
= safe_close(user_lookup_fd
);
2514 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
2516 *exit_status
= EXIT_CHDIR
;
2517 *error_message
= strdup("Failed to determine $HOME for user");
2521 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2522 * must be sure to drop O_NONBLOCK */
2524 (void) fd_nonblock(socket_fd
, false);
2526 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
2528 *exit_status
= EXIT_STDIN
;
2529 *error_message
= strdup("Failed to set up stdin");
2533 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2535 *exit_status
= EXIT_STDOUT
;
2536 *error_message
= strdup("Failed to set up stdout");
2540 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2542 *exit_status
= EXIT_STDERR
;
2543 *error_message
= strdup("Failed to set up stderr");
2547 if (params
->cgroup_path
) {
2548 r
= cg_attach_everywhere(params
->cgroup_supported
, params
->cgroup_path
, 0, NULL
, NULL
);
2550 *exit_status
= EXIT_CGROUP
;
2551 (void) asprintf(error_message
, "Failed to attach to cgroup %s", params
->cgroup_path
);
2556 if (context
->oom_score_adjust_set
) {
2557 char t
[DECIMAL_STR_MAX(context
->oom_score_adjust
)];
2559 /* When we can't make this change due to EPERM, then
2560 * let's silently skip over it. User namespaces
2561 * prohibit write access to this file, and we
2562 * shouldn't trip up over that. */
2564 sprintf(t
, "%i", context
->oom_score_adjust
);
2565 r
= write_string_file("/proc/self/oom_score_adj", t
, 0);
2566 if (r
== -EPERM
|| r
== -EACCES
) {
2568 log_unit_debug_errno(unit
, r
, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2571 *exit_status
= EXIT_OOM_ADJUST
;
2572 *error_message
= strdup("Failed to write /proc/self/oom_score_adj");
2577 if (context
->nice_set
)
2578 if (setpriority(PRIO_PROCESS
, 0, context
->nice
) < 0) {
2579 *exit_status
= EXIT_NICE
;
2583 if (context
->cpu_sched_set
) {
2584 struct sched_param param
= {
2585 .sched_priority
= context
->cpu_sched_priority
,
2588 r
= sched_setscheduler(0,
2589 context
->cpu_sched_policy
|
2590 (context
->cpu_sched_reset_on_fork
?
2591 SCHED_RESET_ON_FORK
: 0),
2594 *exit_status
= EXIT_SETSCHEDULER
;
2599 if (context
->cpuset
)
2600 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context
->cpuset_ncpus
), context
->cpuset
) < 0) {
2601 *exit_status
= EXIT_CPUAFFINITY
;
2605 if (context
->ioprio_set
)
2606 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
2607 *exit_status
= EXIT_IOPRIO
;
2611 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
2612 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
2613 *exit_status
= EXIT_TIMERSLACK
;
2617 if (context
->personality
!= PERSONALITY_INVALID
) {
2618 r
= safe_personality(context
->personality
);
2620 *exit_status
= EXIT_PERSONALITY
;
2625 if (context
->utmp_id
)
2626 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
2628 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
2629 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
2633 if (context
->user
) {
2634 r
= chown_terminal(STDIN_FILENO
, uid
);
2636 *exit_status
= EXIT_STDIN
;
2641 /* If delegation is enabled we'll pass ownership of the cgroup
2642 * (but only in systemd's own controller hierarchy!) to the
2643 * user of the new process. */
2644 if (params
->cgroup_path
&& context
->user
&& (params
->flags
& EXEC_CGROUP_DELEGATE
)) {
2645 r
= cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, 0644, uid
, gid
);
2647 *exit_status
= EXIT_CGROUP
;
2652 r
= cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, 0755, uid
, gid
);
2654 *exit_status
= EXIT_CGROUP
;
/* Create the configured exec directories, one directory type at a time */
2659 for (dt
= 0; dt
< _EXEC_DIRECTORY_MAX
; dt
++) {
2660 r
= setup_exec_directory(context
, params
, uid
, gid
, dt
, exit_status
);
2665 r
= build_environment(
2677 *exit_status
= EXIT_MEMORY
;
2681 r
= build_pass_environment(context
, &pass_env
);
2683 *exit_status
= EXIT_MEMORY
;
2687 accum_env
= strv_env_merge(5,
2688 params
->environment
,
2691 context
->environment
,
2695 *exit_status
= EXIT_MEMORY
;
2698 accum_env
= strv_env_clean(accum_env
);
2700 (void) umask(context
->umask
);
2702 r
= setup_keyring(unit
, params
, uid
, gid
);
2704 *exit_status
= EXIT_KEYRING
;
2708 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
2709 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2711 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
2712 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
2714 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
2715 if (needs_ambient_hack
)
2716 needs_setuid
= false;
2718 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
2720 if (needs_sandboxing
) {
2721 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
2722 * present. The actual MAC context application will happen later, as late as possible, to avoid
2723 * impacting our own code paths. */
2726 use_selinux
= mac_selinux_use();
2729 use_smack
= mac_smack_use();
2731 #ifdef HAVE_APPARMOR
2732 use_apparmor
= mac_apparmor_use();
2737 if (context
->pam_name
&& username
) {
2738 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
2740 *exit_status
= EXIT_PAM
;
2746 if (context
->private_network
&& runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
2747 r
= setup_netns(runtime
->netns_storage_socket
);
2749 *exit_status
= EXIT_NETWORK
;
2754 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
2755 if (needs_mount_namespace
) {
2756 r
= apply_mount_namespace(unit
, command
, context
, params
, runtime
);
2758 *exit_status
= EXIT_NAMESPACE
;
2763 /* Apply just after mount namespace setup */
2764 r
= apply_working_directory(context
, params
, home
, needs_mount_namespace
, exit_status
);
2768 /* Drop groups as early as possible */
2770 r
= enforce_groups(context
, gid
, supplementary_gids
, ngids
);
2772 *exit_status
= EXIT_GROUP
;
2777 if (needs_sandboxing
) {
2779 if (use_selinux
&& params
->selinux_context_net
&& socket_fd
>= 0) {
2780 r
= mac_selinux_get_child_mls_label(socket_fd
, command
->path
, context
->selinux_context
, &mac_selinux_context_net
);
2782 *exit_status
= EXIT_SELINUX_CONTEXT
;
2788 if (context
->private_users
) {
2789 r
= setup_private_users(uid
, gid
);
2791 *exit_status
= EXIT_USER
;
2797 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
2798 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
2799 * was needed to upload the policy and can now be closed as well. */
2800 r
= close_all_fds(fds
, n_fds
);
2802 r
= shift_fds(fds
, n_fds
);
2804 r
= flags_fds(fds
, n_storage_fds
, n_socket_fds
, context
->non_blocking
);
2806 *exit_status
= EXIT_FDS
;
2810 secure_bits
= context
->secure_bits
;
2812 if (needs_sandboxing
) {
2815 for (i
= 0; i
< _RLIMIT_MAX
; i
++) {
2817 if (!context
->rlimit
[i
])
2820 r
= setrlimit_closest(i
, context
->rlimit
[i
]);
2822 *exit_status
= EXIT_LIMITS
;
2827 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
2828 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
2829 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
2830 *exit_status
= EXIT_LIMITS
;
2835 bset
= context
->capability_bounding_set
;
2836 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
2837 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
2838 * instead of us doing that */
2839 if (needs_ambient_hack
)
2840 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
2841 (UINT64_C(1) << CAP_SETUID
) |
2842 (UINT64_C(1) << CAP_SETGID
);
2844 if (!cap_test_all(bset
)) {
2845 r
= capability_bounding_set_drop(bset
, false);
2847 *exit_status
= EXIT_CAPABILITIES
;
2848 *error_message
= strdup("Failed to drop capabilities");
2853 /* This is done before enforce_user, but ambient set
2854 * does not survive over setresuid() if keep_caps is not set. */
2855 if (!needs_ambient_hack
&&
2856 context
->capability_ambient_set
!= 0) {
2857 r
= capability_ambient_set_apply(context
->capability_ambient_set
, true);
2859 *exit_status
= EXIT_CAPABILITIES
;
2860 *error_message
= strdup("Failed to apply ambient capabilities (before UID change)");
2867 if (context
->user
) {
2868 r
= enforce_user(context
, uid
);
2870 *exit_status
= EXIT_USER
;
2871 (void) asprintf(error_message
, "Failed to change UID to "UID_FMT
, uid
);
2875 if (!needs_ambient_hack
&&
2876 context
->capability_ambient_set
!= 0) {
2878 /* Fix the ambient capabilities after user change. */
2879 r
= capability_ambient_set_apply(context
->capability_ambient_set
, false);
2881 *exit_status
= EXIT_CAPABILITIES
;
2882 *error_message
= strdup("Failed to apply ambient capabilities (after UID change)");
2886 /* If we were asked to change user and ambient capabilities
2887 * were requested, we had to add keep-caps to the securebits
2888 * so that we would maintain the inherited capability set
2889 * through the setresuid(). Make sure that the bit is added
2890 * also to the context secure_bits so that we don't try to
2891 * drop the bit away next. */
2893 secure_bits
|= 1<<SECURE_KEEP_CAPS
;
2898 if (needs_sandboxing
) {
2899 /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
2900 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
2901 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
2902 * are restricted. */
2906 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
2909 r
= setexeccon(exec_context
);
2911 *exit_status
= EXIT_SELINUX_CONTEXT
;
2912 (void) asprintf(error_message
, "Failed to set SELinux context to %s", exec_context
);
2921 r
= setup_smack(context
, command
);
2923 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
2924 *error_message
= strdup("Failed to set SMACK process label");
2930 #ifdef HAVE_APPARMOR
2931 if (use_apparmor
&& context
->apparmor_profile
) {
2932 r
= aa_change_onexec(context
->apparmor_profile
);
2933 if (r
< 0 && !context
->apparmor_profile_ignore
) {
2934 *exit_status
= EXIT_APPARMOR_PROFILE
;
2935 (void) asprintf(error_message
,
2936 "Failed to prepare AppArmor profile change to %s",
2937 context
->apparmor_profile
);
2943 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
2944 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
2945 if (prctl(PR_GET_SECUREBITS
) != secure_bits
)
2946 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
2947 *exit_status
= EXIT_SECUREBITS
;
2948 *error_message
= strdup("Failed to set secure bits");
2952 if (context_has_no_new_privileges(context
))
2953 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
2954 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
2955 *error_message
= strdup("Failed to disable new privileges");
2960 r
= apply_address_families(unit
, context
);
2962 *exit_status
= EXIT_ADDRESS_FAMILIES
;
2963 *error_message
= strdup("Failed to restrict address families");
2967 r
= apply_memory_deny_write_execute(unit
, context
);
2969 *exit_status
= EXIT_SECCOMP
;
2970 *error_message
= strdup("Failed to disable writing to executable memory");
2974 r
= apply_restrict_realtime(unit
, context
);
2976 *exit_status
= EXIT_SECCOMP
;
2977 *error_message
= strdup("Failed to apply realtime restrictions");
2981 r
= apply_restrict_namespaces(unit
, context
);
2983 *exit_status
= EXIT_SECCOMP
;
2984 *error_message
= strdup("Failed to apply namespace restrictions");
2988 r
= apply_protect_sysctl(unit
, context
);
2990 *exit_status
= EXIT_SECCOMP
;
2991 *error_message
= strdup("Failed to apply sysctl restrictions");
2995 r
= apply_protect_kernel_modules(unit
, context
);
2997 *exit_status
= EXIT_SECCOMP
;
2998 *error_message
= strdup("Failed to apply module loading restrictions");
3002 r
= apply_private_devices(unit
, context
);
3004 *exit_status
= EXIT_SECCOMP
;
3005 *error_message
= strdup("Failed to set up private devices");
3009 r
= apply_syscall_archs(unit
, context
);
3011 *exit_status
= EXIT_SECCOMP
;
3012 *error_message
= strdup("Failed to apply syscall architecture restrictions");
3016 r
= apply_lock_personality(unit
, context
);
3018 *exit_status
= EXIT_SECCOMP
;
3019 *error_message
= strdup("Failed to lock personalities");
3023 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3024 * by the filter as little as possible. */
3025 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
3027 *exit_status
= EXIT_SECCOMP
;
3028 *error_message
= strdup("Failed to apply syscall filters");
3034 final_argv
= replace_env_argv(argv
, accum_env
);
3036 *exit_status
= EXIT_MEMORY
;
3037 *error_message
= strdup("Failed to prepare process arguments");
3041 if (_unlikely_(log_get_max_level() >= LOG_DEBUG
)) {
3042 _cleanup_free_
char *line
;
3044 line
= exec_command_line(final_argv
);
3047 log_struct(LOG_DEBUG
,
3048 "EXECUTABLE=%s", command
->path
,
3049 LOG_UNIT_MESSAGE(unit
, "Executing: %s", line
),
/* execve() only returns on failure, hence the exit status is set right after the call */
3056 execve(command
->path
, final_argv
, accum_env
);
3057 *exit_status
= EXIT_EXEC
;
/* Spawns `command` on behalf of `unit`. As far as this block shows, it: checks the socket
 * configuration, resolves named stdio fds and environment files, logs the command line,
 * reports a fork failure, calls exec_child() and logs its failure in one of three ways,
 * and finally logs the new PID, attaches it to params->cgroup_path (if set) and records
 * it with exec_status_start(). On the visible failure paths the result of
 * log_unit_error_errno() is returned. */
3061 int exec_spawn(Unit
*unit
,
3062 ExecCommand
*command
,
3063 const ExecContext
*context
,
3064 const ExecParameters
*params
,
3065 ExecRuntime
*runtime
,
3066 DynamicCreds
*dcreds
,
3069 _cleanup_strv_free_
char **files_env
= NULL
;
3071 unsigned n_storage_fds
= 0, n_socket_fds
= 0;
3072 _cleanup_free_
char *line
= NULL
;
3074 int named_iofds
[3] = { -1, -1, -1 };
3083 assert(params
->fds
|| (params
->n_storage_fds
+ params
->n_socket_fds
<= 0));
/* When any stdio stream is socket-backed, exactly one socket fd is expected: more than one
 * or none is logged as an error, and the socket is taken from params->fds[0]. */
3085 if (context
->std_input
== EXEC_INPUT_SOCKET
||
3086 context
->std_output
== EXEC_OUTPUT_SOCKET
||
3087 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
3089 if (params
->n_socket_fds
> 1) {
3090 log_unit_error(unit
, "Got more than one socket.");
3094 if (params
->n_socket_fds
== 0) {
3095 log_unit_error(unit
, "Got no socket.");
3099 socket_fd
= params
->fds
[0];
3103 n_storage_fds
= params
->n_storage_fds
;
3104 n_socket_fds
= params
->n_socket_fds
;
3107 r
= exec_context_named_iofds(unit
, context
, params
, named_iofds
);
3109 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
3111 r
= exec_context_load_environment(unit
, context
, &files_env
);
3113 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
/* An argv supplied through params takes precedence over the command's own argv. */
3115 argv
= params
->argv
?: command
->argv
;
3116 line
= exec_command_line(argv
);
3120 log_struct(LOG_DEBUG
,
3121 LOG_UNIT_MESSAGE(unit
, "About to execute: %s", line
),
3122 "EXECUTABLE=%s", command
->path
,
3127 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
3131 _cleanup_free_
char *error_message
= NULL
;
3133 r
= exec_child(unit
,
3146 unit
->manager
->user_lookup_fds
[1],
/* Three log variants for an exec_child() result: one using a "%s: %m" format at LOG_ERR,
 * a LOG_INFO "Skipped spawning" one for -ENOENT when the command carries
 * EXEC_COMMAND_IGNORE_FAILURE, and a LOG_ERR one naming the failed step. */
3152 log_struct_errno(LOG_ERR
, r
,
3153 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3155 LOG_UNIT_MESSAGE(unit
, "%s: %m",
3157 "EXECUTABLE=%s", command
->path
,
3159 else if (r
== -ENOENT
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
))
3160 log_struct_errno(LOG_INFO
, r
,
3161 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3163 LOG_UNIT_MESSAGE(unit
, "Skipped spawning %s: %m",
3165 "EXECUTABLE=%s", command
->path
,
3168 log_struct_errno(LOG_ERR
, r
,
3169 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3171 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
3172 exit_status_to_string(exit_status
, EXIT_STATUS_SYSTEMD
),
3174 "EXECUTABLE=%s", command
->path
,
3181 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
3183 /* We add the new process to the cgroup both in the child (so
3184 * that we can be sure that no user code is ever executed
3185 * outside of the cgroup) and in the parent (so that we can be
3186 * sure that when we kill the cgroup the process will be
3188 if (params
->cgroup_path
)
3189 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, pid
);
3191 exec_status_start(&command
->exec_status
, pid
);
/* Fills an ExecContext with its default settings. The assignments visible here are:
 * best-effort I/O priority class at level 0, SCHED_OTHER, syslog priority LOG_DAEMON|LOG_INFO
 * with level prefix enabled, SIGPIPE ignored, no timer slack override (NSEC_INFINITY),
 * no personality (PERSONALITY_INVALID), mode 0755 for every directory type, a full
 * capability bounding set (CAP_ALL) and all namespace flags in restrict_namespaces. */
3197 void exec_context_init(ExecContext
*c
) {
3198 ExecDirectoryType i
;
3203 c
->ioprio
= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 0);
3204 c
->cpu_sched_policy
= SCHED_OTHER
;
3205 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
3206 c
->syslog_level_prefix
= true;
3207 c
->ignore_sigpipe
= true;
3208 c
->timer_slack_nsec
= NSEC_INFINITY
;
3209 c
->personality
= PERSONALITY_INVALID
;
/* Every directory type gets the same default mode. */
3210 for (i
= 0; i
< _EXEC_DIRECTORY_MAX
; i
++)
3211 c
->directories
[i
].mode
= 0755;
3212 c
->capability_bounding_set
= CAP_ALL
;
3213 c
->restrict_namespaces
= NAMESPACE_FLAGS_ALL
;
/* Releases the resources held by an ExecContext: string vectors via strv_free(), single
 * allocations via mfree(), sets via set_free(), bind mounts via bind_mount_free_many() and
 * the CPU set via CPU_FREE(). Except for the last two, the helper's return value is stored
 * back into the member that was just released. */
3216 void exec_context_done(ExecContext
*c
) {
3218 ExecDirectoryType i
;
3222 c
->environment
= strv_free(c
->environment
);
3223 c
->environment_files
= strv_free(c
->environment_files
);
3224 c
->pass_environment
= strv_free(c
->pass_environment
);
3226 for (l
= 0; l
< ELEMENTSOF(c
->rlimit
); l
++)
3227 c
->rlimit
[l
] = mfree(c
->rlimit
[l
]);
/* stdio_fdname has one slot per standard stream; elsewhere in this file it is indexed with
 * STDIN_FILENO, STDOUT_FILENO and STDERR_FILENO. */
3229 for (l
= 0; l
< 3; l
++)
3230 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
3232 c
->working_directory
= mfree(c
->working_directory
);
3233 c
->root_directory
= mfree(c
->root_directory
);
3234 c
->root_image
= mfree(c
->root_image
);
3235 c
->tty_path
= mfree(c
->tty_path
);
3236 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
3237 c
->user
= mfree(c
->user
);
3238 c
->group
= mfree(c
->group
);
3240 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
3242 c
->pam_name
= mfree(c
->pam_name
);
3244 c
->read_only_paths
= strv_free(c
->read_only_paths
);
3245 c
->read_write_paths
= strv_free(c
->read_write_paths
);
3246 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
3248 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
3251 CPU_FREE(c
->cpuset
);
3253 c
->utmp_id
= mfree(c
->utmp_id
);
3254 c
->selinux_context
= mfree(c
->selinux_context
);
3255 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
3256 c
->smack_process_label
= mfree(c
->smack_process_label
);
3258 c
->syscall_filter
= set_free(c
->syscall_filter
);
3259 c
->syscall_archs
= set_free(c
->syscall_archs
);
3260 c
->address_families
= set_free(c
->address_families
);
/* Drop the configured path list of every directory type. */
3262 for (i
= 0; i
< _EXEC_DIRECTORY_MAX
; i
++)
3263 c
->directories
[i
].paths
= strv_free(c
->directories
[i
].paths
);
/* Removes the runtime directories of a context: for each entry in
 * directories[EXEC_DIRECTORY_RUNTIME].paths the path runtime_prefix + "/" + entry is built
 * with strjoin() and deleted with rm_rf(..., REMOVE_ROOT); rm_rf()'s result is explicitly
 * discarded. A NULL runtime_prefix is checked for up front.
 * NOTE(review): what happens for a NULL prefix and the return values are not visible in
 * this block; confirm against the full function. */
3266 int exec_context_destroy_runtime_directory(ExecContext
*c
, const char *runtime_prefix
) {
3271 if (!runtime_prefix
)
3274 STRV_FOREACH(i
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) {
3275 _cleanup_free_
char *p
;
3277 p
= strjoin(runtime_prefix
, "/", *i
);
3281 /* We execute this synchronously, since we need to be
3282 * sure this is gone when we start the service
3284 (void) rm_rf(p
, REMOVE_ROOT
);
/* Releases the members of a single ExecCommand: `path` through mfree() and `argv` through
 * strv_free(), storing each helper's return value back into the member. The ExecCommand
 * object itself is not freed by the visible code. */
3290 void exec_command_done(ExecCommand
*c
) {
3293 c
->path
= mfree(c
->path
);
3295 c
->argv
= strv_free(c
->argv
);
/* Calls exec_command_done() on each of the `n` ExecCommand objects stored contiguously
 * starting at `c`. */
3298 void exec_command_done_array(ExecCommand
*c
, unsigned n
) {
3301 for (i
= 0; i
< n
; i
++)
3302 exec_command_done(c
+i
);
/* Tears down a linked list of ExecCommand objects headed by `c`: entries are unlinked with
 * LIST_REMOVE() and passed to exec_command_done().
 * NOTE(review): the loop construct, the freeing of the entry itself and the returned
 * pointer are not visible in this block; exec_command_free_array() stores the return value
 * back into its slot, so it is presumably NULL. Confirm. */
3305 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
3309 LIST_REMOVE(command
, c
, i
);
3310 exec_command_done(i
);
/* Treats `c` as an array of `n` list heads and runs exec_command_free_list() on each,
 * overwriting the slot with that function's return value. */
3317 void exec_command_free_array(ExecCommand
**c
, unsigned n
) {
3320 for (i
= 0; i
< n
; i
++)
3321 c
[i
] = exec_command_free_list(c
[i
]);
/* Context handed to invalid_env() through its userdata pointer. invalid_env() reads the
 * `unit` and `path` members; the member declarations are not visible in this block. */
3324 typedef struct InvalidEnvInfo
{
/* Callback reporting a rejected environment assignment. `p` is the offending assignment and
 * `userdata` points to an InvalidEnvInfo; the message is logged against info->unit and
 * includes info->path. It is passed to strv_env_clean_with_callback() by
 * exec_context_load_environment(). */
3329 static void invalid_env(const char *p
, void *userdata
) {
3330 InvalidEnvInfo
*info
= userdata
;
3332 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
/* Returns the file descriptor name associated with a standard stream of the context. For
 * each stream the configured c->stdio_fdname[] entry is used when set, otherwise the
 * literal "stdin", "stdout" or "stderr". Each stream is first tested for being in
 * named-fd mode (EXEC_INPUT_NAMED_FD / EXEC_OUTPUT_NAMED_FD).
 * NOTE(review): the dispatch on fd_index and the result for a stream that is not in
 * named-fd mode are not visible in this block; exec_context_named_iofds() checks the
 * returned pointer for NULL, so NULL is presumably possible. Confirm. */
3335 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
3340 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
3342 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
3344 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
3346 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
3348 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
3350 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
/* Resolves the standard streams configured as named file descriptors to actual fds.
 *
 * `targets` starts as the number of streams (stdin, stdout, stderr) in named-fd mode. The
 * fds passed in `p` (storage fds plus socket fds) are scanned; for a stream that is in
 * named-fd mode and still unresolved (named_iofds[] slot < 0), an fd whose p->fd_names[]
 * entry equals the stream's name is written to named_iofds[]. Because the three checks are
 * chained with else-if, one fd satisfies at most one stream per iteration.
 *
 * Returns 0 when `targets` is 0 at the end, -ENOENT otherwise.
 * NOTE(review): the statement that lowers `targets` after a match is not visible in this
 * block; it presumably sits in each branch. Confirm. */
3356 int exec_context_named_iofds(Unit
*unit
, const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[3]) {
3357 unsigned i
, targets
;
3358 const char* stdio_fdname
[3];
/* Each comparison contributes 0 or 1, so this counts the streams in named-fd mode. */
3364 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
3365 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
3366 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
3368 for (i
= 0; i
< 3; i
++)
3369 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
3371 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
/* Stop early once no stream is left to resolve. */
3373 for (i
= 0; i
< n_fds
&& targets
> 0; i
++)
3374 if (named_iofds
[STDIN_FILENO
] < 0 &&
3375 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
3376 stdio_fdname
[STDIN_FILENO
] &&
3377 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
3379 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
3382 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
3383 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
3384 stdio_fdname
[STDOUT_FILENO
] &&
3385 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
3387 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
3390 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
3391 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
3392 stdio_fdname
[STDERR_FILENO
] &&
3393 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
3395 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
3399 return targets
== 0 ? 0 : -ENOENT
;
/* Loads the environment files listed in c->environment_files. Each entry is checked with
 * path_is_absolute(), expanded with safe_glob(), and every matching file is read with
 * load_env_file(). The loaded assignments are filtered through
 * strv_env_clean_with_callback(), with invalid_env() logging rejected entries together with
 * the file name, and then merged into the accumulated list with strv_env_merge().
 * NOTE(review): error handling, the role of the `ignore` flag and how the result reaches
 * `*l` are not visible in this block. */
3402 int exec_context_load_environment(Unit
*unit
, const ExecContext
*c
, char ***l
) {
3403 char **i
, **r
= NULL
;
3408 STRV_FOREACH(i
, c
->environment_files
) {
3412 bool ignore
= false;
3414 _cleanup_globfree_ glob_t pglob
= {};
3423 if (!path_is_absolute(fn
)) {
3431 /* Filename supports globbing, take all matching files */
3432 k
= safe_glob(fn
, 0, &pglob
);
3441 /* When we don't match anything, -ENOENT should be returned */
3442 assert(pglob
.gl_pathc
> 0);
3444 for (n
= 0; n
< pglob
.gl_pathc
; n
++) {
3445 k
= load_env_file(NULL
, pglob
.gl_pathv
[n
], NULL
, &p
);
3453 /* Log invalid environment variables with filename */
3455 InvalidEnvInfo info
= {
3457 .path
= pglob
.gl_pathv
[n
]
3460 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
3468 m
= strv_env_merge(2, r
, p
);
/* Decides whether `tty` might be the device behind /dev/console. The name is first passed
 * through skip_dev_prefix(); the literal "console" is handled as a trivial case. Otherwise
 * the console is resolved with resolve_dev_console() and the result is true when the names
 * are equal, or when the console resolves to "tty0" and `tty` is a virtual console
 * according to tty_is_vc(). */
3484 static bool tty_may_match_dev_console(const char *tty
) {
3485 _cleanup_free_
char *active
= NULL
;
3491 tty
= skip_dev_prefix(tty
);
3493 /* trivial identity? */
3494 if (streq(tty
, "console"))
3497 console
= resolve_dev_console(&active
);
3498 /* if we could not resolve, assume it may */
3502 /* "tty0" means the active VC, so it may be the same sometimes */
3503 return streq(console
, tty
) || (streq(console
, "tty0") && tty_is_vc(tty
));
/* Returns true when the context both uses a TTY in some way and that TTY may be the
 * console. The TTY-use conditions visible here are tty_reset, tty_vt_disallocate, and
 * terminal-type standard input, output or error; the console test is
 * tty_may_match_dev_console() applied to exec_context_tty_path(). */
3506 bool exec_context_may_touch_console(ExecContext
*ec
) {
3508 return (ec
->tty_reset
||
3510 ec
->tty_vt_disallocate
||
3511 is_terminal_input(ec
->std_input
) ||
3512 is_terminal_output(ec
->std_output
) ||
3513 is_terminal_output(ec
->std_error
)) &&
3514 tty_may_match_dev_console(exec_context_tty_path(ec
));
/* Writes string-vector entries to `f`, each preceded by a single space (format " %s").
 * No newline is emitted by the visible code. */
3517 static void strv_fprintf(FILE *f
, char **l
) {
3523 fprintf(f
, " %s", *g
);
/* Writes a human-readable dump of an ExecContext to `f`. Output is a sequence of
 * "<prefix><Setting>: <value>" lines; `prefix` is passed through strempty() before use.
 * Settings with an "unset" state (for example personality, timer slack, capability sets,
 * security labels, syscall filters) are only printed when their guarding condition holds.
 * NOTE(review): the AppArmorProfile line is emitted by two separate, identical statements
 * in this function (once after SELinuxContext and once at the very end), so a context with
 * an AppArmor profile gets that line twice. */
3526 void exec_context_dump(ExecContext
*c
, FILE* f
, const char *prefix
) {
3529 ExecDirectoryType dt
;
3535 prefix
= strempty(prefix
);
3539 "%sWorkingDirectory: %s\n"
3540 "%sRootDirectory: %s\n"
3541 "%sNonBlocking: %s\n"
3542 "%sPrivateTmp: %s\n"
3543 "%sPrivateDevices: %s\n"
3544 "%sProtectKernelTunables: %s\n"
3545 "%sProtectKernelModules: %s\n"
3546 "%sProtectControlGroups: %s\n"
3547 "%sPrivateNetwork: %s\n"
3548 "%sPrivateUsers: %s\n"
3549 "%sProtectHome: %s\n"
3550 "%sProtectSystem: %s\n"
3551 "%sMountAPIVFS: %s\n"
3552 "%sIgnoreSIGPIPE: %s\n"
3553 "%sMemoryDenyWriteExecute: %s\n"
3554 "%sRestrictRealtime: %s\n",
/* Unset working and root directories are shown as "/". */
3556 prefix
, c
->working_directory
? c
->working_directory
: "/",
3557 prefix
, c
->root_directory
? c
->root_directory
: "/",
3558 prefix
, yes_no(c
->non_blocking
),
3559 prefix
, yes_no(c
->private_tmp
),
3560 prefix
, yes_no(c
->private_devices
),
3561 prefix
, yes_no(c
->protect_kernel_tunables
),
3562 prefix
, yes_no(c
->protect_kernel_modules
),
3563 prefix
, yes_no(c
->protect_control_groups
),
3564 prefix
, yes_no(c
->private_network
),
3565 prefix
, yes_no(c
->private_users
),
3566 prefix
, protect_home_to_string(c
->protect_home
),
3567 prefix
, protect_system_to_string(c
->protect_system
),
3568 prefix
, yes_no(c
->mount_apivfs
),
3569 prefix
, yes_no(c
->ignore_sigpipe
),
3570 prefix
, yes_no(c
->memory_deny_write_execute
),
3571 prefix
, yes_no(c
->restrict_realtime
));
3574 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
3576 STRV_FOREACH(e
, c
->environment
)
3577 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
3579 STRV_FOREACH(e
, c
->environment_files
)
3580 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
3582 STRV_FOREACH(e
, c
->pass_environment
)
3583 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
3585 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
/* Per directory type: one "<Type>Mode" line, then one "<Type>" line per configured path. */
3587 for (dt
= 0; dt
< _EXEC_DIRECTORY_MAX
; dt
++) {
3588 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
3590 STRV_FOREACH(d
, c
->directories
[dt
].paths
)
3591 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), *d
);
3599 if (c
->oom_score_adjust_set
)
3601 "%sOOMScoreAdjust: %i\n",
3602 prefix
, c
->oom_score_adjust
);
/* Resource limits: the hard limit under the plain name, the soft limit with a "Soft" suffix. */
3604 for (i
= 0; i
< RLIM_NLIMITS
; i
++)
3606 fprintf(f
, "%s%s: " RLIM_FMT
"\n",
3607 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
3608 fprintf(f
, "%s%sSoft: " RLIM_FMT
"\n",
3609 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
3612 if (c
->ioprio_set
) {
3613 _cleanup_free_
char *class_str
= NULL
;
3615 r
= ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c
->ioprio
), &class_str
);
3617 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
3619 fprintf(f
, "%sIOPriority: %lu\n", prefix
, IOPRIO_PRIO_DATA(c
->ioprio
));
3622 if (c
->cpu_sched_set
) {
3623 _cleanup_free_
char *policy_str
= NULL
;
3625 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
3627 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
3630 "%sCPUSchedulingPriority: %i\n"
3631 "%sCPUSchedulingResetOnFork: %s\n",
3632 prefix
, c
->cpu_sched_priority
,
3633 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
/* CPU affinity is printed as a space-separated list of the CPU indices set in c->cpuset. */
3637 fprintf(f
, "%sCPUAffinity:", prefix
);
3638 for (i
= 0; i
< c
->cpuset_ncpus
; i
++)
3639 if (CPU_ISSET_S(i
, CPU_ALLOC_SIZE(c
->cpuset_ncpus
), c
->cpuset
))
3640 fprintf(f
, " %u", i
);
3644 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
3645 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
3648 "%sStandardInput: %s\n"
3649 "%sStandardOutput: %s\n"
3650 "%sStandardError: %s\n",
3651 prefix
, exec_input_to_string(c
->std_input
),
3652 prefix
, exec_output_to_string(c
->std_output
),
3653 prefix
, exec_output_to_string(c
->std_error
));
3659 "%sTTYVHangup: %s\n"
3660 "%sTTYVTDisallocate: %s\n",
3661 prefix
, c
->tty_path
,
3662 prefix
, yes_no(c
->tty_reset
),
3663 prefix
, yes_no(c
->tty_vhangup
),
3664 prefix
, yes_no(c
->tty_vt_disallocate
));
/* Syslog facility and level are only dumped when stdout or stderr uses one of the listed
 * logging output modes. */
3666 if (IN_SET(c
->std_output
,
3669 EXEC_OUTPUT_JOURNAL
,
3670 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
3671 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
3672 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
3673 IN_SET(c
->std_error
,
3676 EXEC_OUTPUT_JOURNAL
,
3677 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
3678 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
3679 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
3681 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
3683 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
3685 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
3687 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
3689 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
3692 if (c
->secure_bits
) {
3693 _cleanup_free_
char *str
= NULL
;
3695 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
3697 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
3700 if (c
->capability_bounding_set
!= CAP_ALL
) {
3701 _cleanup_free_
char *str
= NULL
;
3703 r
= capability_set_to_string_alloc(c
->capability_bounding_set
, &str
);
3705 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
3708 if (c
->capability_ambient_set
!= 0) {
3709 _cleanup_free_
char *str
= NULL
;
3711 r
= capability_set_to_string_alloc(c
->capability_ambient_set
, &str
);
3713 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
3717 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
3719 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
3721 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
3723 if (strv_length(c
->supplementary_groups
) > 0) {
3724 fprintf(f
, "%sSupplementaryGroups:", prefix
);
3725 strv_fprintf(f
, c
->supplementary_groups
);
3730 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
3732 if (strv_length(c
->read_write_paths
) > 0) {
3733 fprintf(f
, "%sReadWritePaths:", prefix
);
3734 strv_fprintf(f
, c
->read_write_paths
);
3738 if (strv_length(c
->read_only_paths
) > 0) {
3739 fprintf(f
, "%sReadOnlyPaths:", prefix
);
3740 strv_fprintf(f
, c
->read_only_paths
);
3744 if (strv_length(c
->inaccessible_paths
) > 0) {
3745 fprintf(f
, "%sInaccessiblePaths:", prefix
);
3746 strv_fprintf(f
, c
->inaccessible_paths
);
/* Bind mounts are printed as "source:destination:rbind|norbind" under BindPaths or
 * BindReadOnlyPaths depending on the read_only flag. */
3750 if (c
->n_bind_mounts
> 0)
3751 for (i
= 0; i
< c
->n_bind_mounts
; i
++) {
3752 fprintf(f
, "%s%s: %s:%s:%s\n", prefix
,
3753 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
3754 c
->bind_mounts
[i
].source
,
3755 c
->bind_mounts
[i
].destination
,
3756 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
3761 "%sUtmpIdentifier: %s\n",
3762 prefix
, c
->utmp_id
);
/* For the three security labels below, a leading "-" is printed when the corresponding
 * *_ignore flag is set. */
3764 if (c
->selinux_context
)
3766 "%sSELinuxContext: %s%s\n",
3767 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
3769 if (c
->apparmor_profile
)
3771 "%sAppArmorProfile: %s%s\n",
3772 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
3774 if (c
->smack_process_label
)
3776 "%sSmackProcessLabel: %s%s\n",
3777 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
3779 if (c
->personality
!= PERSONALITY_INVALID
)
3781 "%sPersonality: %s\n",
3782 prefix
, strna(personality_to_string(c
->personality
)));
3785 "%sLockPersonality: %s\n",
3786 prefix
, yes_no(c
->lock_personality
));
3788 if (c
->syscall_filter
) {
3796 "%sSystemCallFilter: ",
3799 if (!c
->syscall_whitelist
)
3803 SET_FOREACH(id
, c
->syscall_filter
, j
) {
3804 _cleanup_free_
char *name
= NULL
;
/* Set entries hold the syscall number plus one, hence the "- 1" when resolving the name. */
3811 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
3812 fputs(strna(name
), f
);
3819 if (c
->syscall_archs
) {
3826 "%sSystemCallArchitectures:",
3830 SET_FOREACH(id
, c
->syscall_archs
, j
)
3831 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
3836 if (exec_context_restrict_namespaces_set(c
)) {
3837 _cleanup_free_
char *s
= NULL
;
3839 r
= namespace_flag_to_string_many(c
->restrict_namespaces
, &s
);
3841 fprintf(f
, "%sRestrictNamespaces: %s\n",
3845 if (c
->syscall_errno
> 0)
3847 "%sSystemCallErrorNumber: %s\n",
3848 prefix
, strna(errno_to_name(c
->syscall_errno
)));
/* NOTE(review): duplicate of the AppArmorProfile output above; the same line is printed a
 * second time here. */
3850 if (c
->apparmor_profile
)
3852 "%sAppArmorProfile: %s%s\n",
3853 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
/* Privilege check on the configured user: c->user is compared against the names "root" and
 * "0". NOTE(review): the handling of an unset c->user and the actual return statements
 * are not visible in this block. */
3856 bool exec_context_maintains_privileges(ExecContext
*c
) {
3859 /* Returns true if the process forked off would run under
3860 * an unchanged UID or as root. */
3865 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
/* Determines the I/O priority that applies to the context. The visible code queries
 * ioprio_get(IOPRIO_WHO_PROCESS, 0) and has a fixed result of
 * IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4).
 * NOTE(review): presumably an explicitly configured c->ioprio wins and the fixed value is
 * the fallback when the query fails; those conditions are not visible here. Confirm. */
3871 int exec_context_get_effective_ioprio(ExecContext
*c
) {
3879 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
3881 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 4);
/* Marks the start of a process in an ExecStatus: the start timestamp is taken with
 * dual_timestamp_get(). NOTE(review): how `pid` is stored is not visible in this block. */
3886 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
3891 dual_timestamp_get(&s
->start_timestamp
);
/* Records the termination of process `pid` in an ExecStatus. The visible code special-cases
 * a status that already refers to a different, non-zero PID, stamps the exit time, reports
 * the dead process to utmp when the context has a utmp_id, and resets the TTY via
 * exec_context_tty_reset(context, NULL).
 * NOTE(review): storing `code`/`status` and any NULL check on `context` are not visible. */
3894 void exec_status_exit(ExecStatus
*s
, ExecContext
*context
, pid_t pid
, int code
, int status
) {
3897 if (s
->pid
&& s
->pid
!= pid
)
3901 dual_timestamp_get(&s
->exit_timestamp
);
3907 if (context
->utmp_id
)
3908 utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
3910 exec_context_tty_reset(context
, NULL
);
/* Writes an ExecStatus to `f` in "<prefix><Field>: <value>" form: the PID, the start
 * timestamp when set, and - when the exit timestamp is set - the exit timestamp, the
 * sigchld code as a string and the exit status. Timestamps use the realtime clock value and
 * are formatted into a local buffer of FORMAT_TIMESTAMP_MAX bytes. */
3914 void exec_status_dump(ExecStatus
*s
, FILE *f
, const char *prefix
) {
3915 char buf
[FORMAT_TIMESTAMP_MAX
];
3923 prefix
= strempty(prefix
);
3926 "%sPID: "PID_FMT
"\n",
3929 if (dual_timestamp_is_set(&s
->start_timestamp
))
3931 "%sStart Timestamp: %s\n",
3932 prefix
, format_timestamp(buf
, sizeof(buf
), s
->start_timestamp
.realtime
));
3934 if (dual_timestamp_is_set(&s
->exit_timestamp
))
3936 "%sExit Timestamp: %s\n"
3938 "%sExit Status: %i\n",
3939 prefix
, format_timestamp(buf
, sizeof(buf
), s
->exit_timestamp
.realtime
),
3940 prefix
, sigchld_code_to_string(s
->code
),
/* Builds a single display string from an argument vector. The vector is walked twice, and
 * arguments containing any WHITESPACE character take a separate path - presumably quoting,
 * going by the FIXME below. Callers in this file free the result and tolerate NULL
 * (exec_command_dump() substitutes strerror(ENOMEM)), so NULL presumably signals an
 * allocation failure - confirm. */
3944 char *exec_command_line(char **argv
) {
3952 STRV_FOREACH(a
, argv
)
3960 STRV_FOREACH(a
, argv
) {
3967 if (strpbrk(*a
, WHITESPACE
)) {
3978 /* FIXME: this doesn't really handle arguments that have
3979 * spaces and ticks in them */
/* Dumps one ExecCommand to `f`: a "Command Line" entry built with exec_command_line(),
 * followed by the command's ExecStatus, which is dumped with one extra tab of indentation
 * (prefix2 = prefix + "\t"). If the command line cannot be built, the ENOMEM error text is
 * printed in its place. */
3984 void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
3985 _cleanup_free_
char *cmd
= NULL
;
3986 const char *prefix2
;
3991 prefix
= strempty(prefix
);
3992 prefix2
= strjoina(prefix
, "\t");
3994 cmd
= exec_command_line(c
->argv
);
3996 "%sCommand Line: %s\n",
3997 prefix
, cmd
? cmd
: strerror(ENOMEM
));
3999 exec_status_dump(&c
->exec_status
, f
, prefix2
);
/* Dumps every ExecCommand of the list starting at `c` by calling exec_command_dump() on
 * each entry. The parameter `c` doubles as the iteration cursor. */
4002 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4005 prefix
= strempty(prefix
);
4007 LIST_FOREACH(command
, c
, c
)
4008 exec_command_dump(c
, f
, prefix
);
/* Appends `e` to the end of the command list `*l`: the current tail is located with
 * LIST_FIND_TAIL() and `e` is inserted after it, so existing ordering is preserved. */
4011 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
4018 /* It's kind of important, that we keep the order here */
4019 LIST_FIND_TAIL(command
, *l
, end
);
4020 LIST_INSERT_AFTER(command
, *l
, end
, e
);
/* Variadic setter for an ExecCommand: an argument vector is created from `path` and the
 * remaining arguments with strv_new_ap().
 * NOTE(review): how the vector and path are stored in `c`, argument-list termination and
 * the return values are not visible in this block. */
4025 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
4033 l
= strv_new_ap(path
, ap
);
/* Variadic helper that extends the argv of an ExecCommand: a temporary vector is built from
 * `path` and the remaining arguments with strv_new_ap() and appended to c->argv via
 * strv_extend_strv(..., false). The temporary vector is released automatically
 * (_cleanup_strv_free_). NOTE(review): return values are not visible in this block. */
4054 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
4055 _cleanup_strv_free_
char **l
= NULL
;
4063 l
= strv_new_ap(path
, ap
);
4069 r
= strv_extend_strv(&c
->argv
, l
, false);
/* Allocates a zero-initialised ExecRuntime into `*rt` (new0()) and marks both ends of its
 * netns storage socket pair as unset (-1).
 * NOTE(review): any guard for an already allocated `*rt`, the initial reference count and
 * the return values are not visible in this block. */
4077 static int exec_runtime_allocate(ExecRuntime
**rt
) {
4082 *rt
= new0(ExecRuntime
, 1);
4087 (*rt
)->netns_storage_socket
[0] = (*rt
)->netns_storage_socket
[1] = -1;
/* Prepares the ExecRuntime needed by a context. A context with neither private_network nor
 * private_tmp is special-cased first. Otherwise the runtime is allocated, and:
 *  - for private_network, a SOCK_DGRAM|SOCK_CLOEXEC AF_UNIX socket pair is created for
 *    netns storage unless one already exists (slot 0 >= 0);
 *  - for private_tmp, setup_tmp_dirs() fills tmp_dir and var_tmp_dir unless tmp_dir is
 *    already set.
 * NOTE(review): return values and error paths are not visible in this block. */
4092 int exec_runtime_make(ExecRuntime
**rt
, ExecContext
*c
, const char *id
) {
4102 if (!c
->private_network
&& !c
->private_tmp
)
4105 r
= exec_runtime_allocate(rt
);
4109 if (c
->private_network
&& (*rt
)->netns_storage_socket
[0] < 0) {
4110 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, (*rt
)->netns_storage_socket
) < 0)
4114 if (c
->private_tmp
&& !(*rt
)->tmp_dir
) {
4115 r
= setup_tmp_dirs(id
, &(*rt
)->tmp_dir
, &(*rt
)->var_tmp_dir
);
/* Reference-count helper for ExecRuntime; asserts that the object is currently referenced
 * (n_ref > 0). NOTE(review): the increment and the returned pointer are not visible in
 * this block. */
4123 ExecRuntime
*exec_runtime_ref(ExecRuntime
*r
) {
4125 assert(r
->n_ref
> 0);
/* Reference-count release for ExecRuntime; asserts n_ref > 0. The visible teardown frees
 * var_tmp_dir and closes the netns storage socket pair.
 * NOTE(review): the decrement, the condition under which teardown runs, the freeing of
 * other members and the returned pointer are not visible in this block. */
4131 ExecRuntime
*exec_runtime_unref(ExecRuntime
*r
) {
4136 assert(r
->n_ref
> 0);
4143 free(r
->var_tmp_dir
);
4144 safe_close_pair(r
->netns_storage_socket
);
/* Serialises an ExecRuntime into `f` as unit items. Keys written: "tmp-dir",
 * "var-tmp-dir" (only if set), and "netns-socket-0" / "netns-socket-1" for each open end of
 * the netns storage socket pair. Each socket is duplicated into `fds` with fdset_put_dup()
 * and the number of the duplicate is what gets written. These keys are the ones parsed by
 * exec_runtime_deserialize_item(). */
4148 int exec_runtime_serialize(Unit
*u
, ExecRuntime
*rt
, FILE *f
, FDSet
*fds
) {
4157 unit_serialize_item(u
, f
, "tmp-dir", rt
->tmp_dir
);
4159 if (rt
->var_tmp_dir
)
4160 unit_serialize_item(u
, f
, "var-tmp-dir", rt
->var_tmp_dir
);
4162 if (rt
->netns_storage_socket
[0] >= 0) {
4165 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
4169 unit_serialize_item_format(u
, f
, "netns-socket-0", "%i", copy
);
4172 if (rt
->netns_storage_socket
[1] >= 0) {
4175 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
4179 unit_serialize_item_format(u
, f
, "netns-socket-1", "%i", copy
);
/* Applies one serialised key/value pair to an ExecRuntime, allocating it on demand via
 * exec_runtime_allocate(). Recognised keys:
 *  - "tmp-dir" / "var-tmp-dir": the value is duplicated and replaces the previous string;
 *  - "netns-socket-0" / "netns-socket-1": the value is parsed as an fd number that must be
 *    contained in `fds`; a parse or lookup failure is logged at debug level. The previous
 *    socket is closed and the fd is taken out of `fds` with fdset_remove().
 * NOTE(review): return values and the handling of unknown keys are not visible here. */
4185 int exec_runtime_deserialize_item(Unit
*u
, ExecRuntime
**rt
, const char *key
, const char *value
, FDSet
*fds
) {
4192 if (streq(key
, "tmp-dir")) {
4195 r
= exec_runtime_allocate(rt
);
4199 copy
= strdup(value
);
4203 free((*rt
)->tmp_dir
);
4204 (*rt
)->tmp_dir
= copy
;
4206 } else if (streq(key
, "var-tmp-dir")) {
4209 r
= exec_runtime_allocate(rt
);
4213 copy
= strdup(value
);
4217 free((*rt
)->var_tmp_dir
);
4218 (*rt
)->var_tmp_dir
= copy
;
4220 } else if (streq(key
, "netns-socket-0")) {
4223 r
= exec_runtime_allocate(rt
);
4227 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
))
4228 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
4230 safe_close((*rt
)->netns_storage_socket
[0]);
4231 (*rt
)->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
4233 } else if (streq(key
, "netns-socket-1")) {
4236 r
= exec_runtime_allocate(rt
);
4240 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
))
4241 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
4243 safe_close((*rt
)->netns_storage_socket
[1]);
4244 (*rt
)->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
/* Job function handed to asynchronous_job() by exec_runtime_destroy(). `p` is a
 * heap-allocated path whose ownership passes to this function (it is released through
 * _cleanup_free_); the path is removed with rm_rf(REMOVE_ROOT|REMOVE_PHYSICAL) and the
 * result is ignored. */
4252 static void *remove_tmpdir_thread(void *p
) {
4253 _cleanup_free_
char *path
= p
;
4255 (void) rm_rf(path
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
/* Destroys the on-disk and socket state of an ExecRuntime. tmp_dir and var_tmp_dir are each
 * handed to an asynchronous remove_tmpdir_thread() job; a failure to start the job is
 * logged as a warning. For var_tmp_dir the visible code frees the string next to that
 * warning and then clears the pointer, consistent with the job owning the string once
 * started. Finally the netns storage socket pair is closed.
 * NOTE(review): the early-exit conditions and the matching free/reset for tmp_dir are not
 * visible in this block. */
4259 void exec_runtime_destroy(ExecRuntime
*rt
) {
4265 /* If there are multiple users of this, let's leave the stuff around */
4270 log_debug("Spawning thread to nuke %s", rt
->tmp_dir
);
4272 r
= asynchronous_job(remove_tmpdir_thread
, rt
->tmp_dir
);
4274 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->tmp_dir
);
4281 if (rt
->var_tmp_dir
) {
4282 log_debug("Spawning thread to nuke %s", rt
->var_tmp_dir
);
4284 r
= asynchronous_job(remove_tmpdir_thread
, rt
->var_tmp_dir
);
4286 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->var_tmp_dir
);
4287 free(rt
->var_tmp_dir
);
4290 rt
->var_tmp_dir
= NULL
;
4293 safe_close_pair(rt
->netns_storage_socket
);
/* String names for the ExecInput values, indexed by enum value. The
 * DEFINE_STRING_TABLE_LOOKUP() invocation below generates the lookup helpers for this
 * table, e.g. exec_input_to_string() as used by exec_context_dump(). */
4296 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
4297 [EXEC_INPUT_NULL
] = "null",
4298 [EXEC_INPUT_TTY
] = "tty",
4299 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
4300 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
4301 [EXEC_INPUT_SOCKET
] = "socket",
4302 [EXEC_INPUT_NAMED_FD
] = "fd",
4305 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
/* String names for the ExecOutput values, indexed by enum value. The "+console" variants
 * name the *_AND_CONSOLE modes. DEFINE_STRING_TABLE_LOOKUP() generates the lookup helpers,
 * e.g. exec_output_to_string() as used by exec_context_dump(). */
4307 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
4308 [EXEC_OUTPUT_INHERIT
] = "inherit",
4309 [EXEC_OUTPUT_NULL
] = "null",
4310 [EXEC_OUTPUT_TTY
] = "tty",
4311 [EXEC_OUTPUT_SYSLOG
] = "syslog",
4312 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE
] = "syslog+console",
4313 [EXEC_OUTPUT_KMSG
] = "kmsg",
4314 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
4315 [EXEC_OUTPUT_JOURNAL
] = "journal",
4316 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
4317 [EXEC_OUTPUT_SOCKET
] = "socket",
4318 [EXEC_OUTPUT_NAMED_FD
] = "fd",
4321 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
/* String names for the ExecUtmpMode values, indexed by enum value, with lookup helpers
 * generated by DEFINE_STRING_TABLE_LOOKUP(). */
4323 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
4324 [EXEC_UTMP_INIT
] = "init",
4325 [EXEC_UTMP_LOGIN
] = "login",
4326 [EXEC_UTMP_USER
] = "user",
4329 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
/* String names for the ExecPreserveMode values, indexed by enum value. The lookup helpers
 * come from the _WITH_BOOLEAN macro variant, which is given EXEC_PRESERVE_YES as its extra
 * argument - presumably the value that a boolean "true" string maps to; confirm against
 * the macro definition. */
4331 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
4332 [EXEC_PRESERVE_NO
] = "no",
4333 [EXEC_PRESERVE_YES
] = "yes",
4334 [EXEC_PRESERVE_RESTART
] = "restart",
4337 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
/* Names for the ExecDirectoryType values, indexed by enum value. exec_context_dump() uses
 * them both as-is and with a "Mode" suffix when printing per-directory settings. Lookup
 * helpers are generated by DEFINE_STRING_TABLE_LOOKUP(). */
4339 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_MAX
] = {
4340 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
4341 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
4342 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
4343 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
4344 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
4347 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);