2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/capability.h>
28 #include <sys/eventfd.h>
30 #include <sys/personality.h>
31 #include <sys/prctl.h>
33 #include <sys/socket.h>
35 #include <sys/types.h>
41 #include <security/pam_appl.h>
45 #include <selinux/selinux.h>
53 #include <sys/apparmor.h>
56 #include "sd-messages.h"
59 #include "alloc-util.h"
61 #include "apparmor-util.h"
66 #include "capability-util.h"
67 #include "chown-recursive.h"
70 #include "errno-list.h"
72 #include "exit-status.h"
75 #include "format-util.h"
77 #include "glob-util.h"
85 #include "namespace.h"
86 #include "parse-util.h"
87 #include "path-util.h"
88 #include "process-util.h"
89 #include "rlimit-util.h"
92 #include "seccomp-util.h"
94 #include "securebits.h"
95 #include "securebits-util.h"
96 #include "selinux-util.h"
97 #include "signal-util.h"
98 #include "smack-util.h"
100 #include "string-table.h"
101 #include "string-util.h"
103 #include "syslog-util.h"
104 #include "terminal-util.h"
106 #include "user-util.h"
108 #include "utmp-wtmp.h"
110 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
111 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
113 /* This assumes there is a 'tty' group */
114 #define TTY_MODE 0620
116 #define SNDBUF_SIZE (8*1024*1024)
118 static int shift_fds(int fds
[], unsigned n_fds
) {
119 int start
, restart_from
;
124 /* Modifies the fds array! (sorts it) */
134 for (i
= start
; i
< (int) n_fds
; i
++) {
137 /* Already at right index? */
141 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
148 /* Hmm, the fd we wanted isn't free? Then
149 * let's remember that and try again from here */
150 if (nfd
!= i
+3 && restart_from
< 0)
154 if (restart_from
< 0)
157 start
= restart_from
;
163 static int flags_fds(const int fds
[], unsigned n_storage_fds
, unsigned n_socket_fds
, bool nonblock
) {
167 n_fds
= n_storage_fds
+ n_socket_fds
;
173 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
174 * O_NONBLOCK only applies to socket activation though. */
176 for (i
= 0; i
< n_fds
; i
++) {
178 if (i
< n_socket_fds
) {
179 r
= fd_nonblock(fds
[i
], nonblock
);
184 /* We unconditionally drop FD_CLOEXEC from the fds,
185 * since after all we want to pass these fds to our
188 r
= fd_cloexec(fds
[i
], false);
196 static const char *exec_context_tty_path(const ExecContext
*context
) {
199 if (context
->stdio_as_fds
)
202 if (context
->tty_path
)
203 return context
->tty_path
;
205 return "/dev/console";
208 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
213 path
= exec_context_tty_path(context
);
215 if (context
->tty_vhangup
) {
216 if (p
&& p
->stdin_fd
>= 0)
217 (void) terminal_vhangup_fd(p
->stdin_fd
);
219 (void) terminal_vhangup(path
);
222 if (context
->tty_reset
) {
223 if (p
&& p
->stdin_fd
>= 0)
224 (void) reset_terminal_fd(p
->stdin_fd
, true);
226 (void) reset_terminal(path
);
229 if (context
->tty_vt_disallocate
&& path
)
230 (void) vt_disallocate(path
);
233 static bool is_terminal_input(ExecInput i
) {
236 EXEC_INPUT_TTY_FORCE
,
237 EXEC_INPUT_TTY_FAIL
);
240 static bool is_terminal_output(ExecOutput o
) {
243 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
244 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
245 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
248 static bool is_syslog_output(ExecOutput o
) {
251 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
);
254 static bool is_kmsg_output(ExecOutput o
) {
257 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
260 static bool exec_context_needs_term(const ExecContext
*c
) {
263 /* Return true if the execution context suggests we should set $TERM to something useful. */
265 if (is_terminal_input(c
->std_input
))
268 if (is_terminal_output(c
->std_output
))
271 if (is_terminal_output(c
->std_error
))
274 return !!c
->tty_path
;
277 static int open_null_as(int flags
, int nfd
) {
282 fd
= open("/dev/null", flags
|O_NOCTTY
);
287 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
295 static int connect_journal_socket(int fd
, uid_t uid
, gid_t gid
) {
296 static const union sockaddr_union sa
= {
297 .un
.sun_family
= AF_UNIX
,
298 .un
.sun_path
= "/run/systemd/journal/stdout",
300 uid_t olduid
= UID_INVALID
;
301 gid_t oldgid
= GID_INVALID
;
304 if (gid_is_valid(gid
)) {
307 if (setegid(gid
) < 0)
311 if (uid_is_valid(uid
)) {
314 if (seteuid(uid
) < 0) {
320 r
= connect(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
)) < 0 ? -errno
: 0;
322 /* If we fail to restore the uid or gid, things will likely
323 fail later on. This should only happen if an LSM interferes. */
325 if (uid_is_valid(uid
))
326 (void) seteuid(olduid
);
329 if (gid_is_valid(gid
))
330 (void) setegid(oldgid
);
335 static int connect_logger_as(
337 const ExecContext
*context
,
338 const ExecParameters
*params
,
349 assert(output
< _EXEC_OUTPUT_MAX
);
353 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
357 r
= connect_journal_socket(fd
, uid
, gid
);
361 if (shutdown(fd
, SHUT_RD
) < 0) {
366 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
376 context
->syslog_identifier
?: ident
,
377 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
378 context
->syslog_priority
,
379 !!context
->syslog_level_prefix
,
380 is_syslog_output(output
),
381 is_kmsg_output(output
),
382 is_terminal_output(output
));
387 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
392 static int open_terminal_as(const char *path
, mode_t mode
, int nfd
) {
398 fd
= open_terminal(path
, mode
| O_NOCTTY
);
403 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
411 static int fixup_input(ExecInput std_input
, int socket_fd
, bool apply_tty_stdin
) {
413 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
414 return EXEC_INPUT_NULL
;
416 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
417 return EXEC_INPUT_NULL
;
422 static int fixup_output(ExecOutput std_output
, int socket_fd
) {
424 if (std_output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
425 return EXEC_OUTPUT_INHERIT
;
430 static int setup_input(
431 const ExecContext
*context
,
432 const ExecParameters
*params
,
434 int named_iofds
[3]) {
441 if (params
->stdin_fd
>= 0) {
442 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
445 /* Try to make this the controlling tty, if it is a tty, and reset it */
446 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
447 (void) reset_terminal_fd(STDIN_FILENO
, true);
452 i
= fixup_input(context
->std_input
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
456 case EXEC_INPUT_NULL
:
457 return open_null_as(O_RDONLY
, STDIN_FILENO
);
460 case EXEC_INPUT_TTY_FORCE
:
461 case EXEC_INPUT_TTY_FAIL
: {
464 fd
= acquire_terminal(exec_context_tty_path(context
),
465 i
== EXEC_INPUT_TTY_FAIL
,
466 i
== EXEC_INPUT_TTY_FORCE
,
472 if (fd
!= STDIN_FILENO
) {
473 r
= dup2(fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
481 case EXEC_INPUT_SOCKET
:
482 return dup2(socket_fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
484 case EXEC_INPUT_NAMED_FD
:
485 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
486 return dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
489 assert_not_reached("Unknown input type");
493 static int setup_output(
495 const ExecContext
*context
,
496 const ExecParameters
*params
,
503 dev_t
*journal_stream_dev
,
504 ino_t
*journal_stream_ino
) {
514 assert(journal_stream_dev
);
515 assert(journal_stream_ino
);
517 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
519 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
522 return STDOUT_FILENO
;
525 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
526 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
529 return STDERR_FILENO
;
532 i
= fixup_input(context
->std_input
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
533 o
= fixup_output(context
->std_output
, socket_fd
);
535 if (fileno
== STDERR_FILENO
) {
537 e
= fixup_output(context
->std_error
, socket_fd
);
539 /* This expects the input and output are already set up */
541 /* Don't change the stderr file descriptor if we inherit all
542 * the way and are not on a tty */
543 if (e
== EXEC_OUTPUT_INHERIT
&&
544 o
== EXEC_OUTPUT_INHERIT
&&
545 i
== EXEC_INPUT_NULL
&&
546 !is_terminal_input(context
->std_input
) &&
550 /* Duplicate from stdout if possible */
551 if ((e
== o
&& e
!= EXEC_OUTPUT_NAMED_FD
) || e
== EXEC_OUTPUT_INHERIT
)
552 return dup2(STDOUT_FILENO
, fileno
) < 0 ? -errno
: fileno
;
556 } else if (o
== EXEC_OUTPUT_INHERIT
) {
557 /* If input got downgraded, inherit the original value */
558 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
559 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
561 /* If the input is connected to anything that's not a /dev/null, inherit that... */
562 if (i
!= EXEC_INPUT_NULL
)
563 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
565 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
569 /* We need to open /dev/null here anew, to get the right access mode. */
570 return open_null_as(O_WRONLY
, fileno
);
575 case EXEC_OUTPUT_NULL
:
576 return open_null_as(O_WRONLY
, fileno
);
578 case EXEC_OUTPUT_TTY
:
579 if (is_terminal_input(i
))
580 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
582 /* We don't reset the terminal if this is just about output */
583 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
585 case EXEC_OUTPUT_SYSLOG
:
586 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE
:
587 case EXEC_OUTPUT_KMSG
:
588 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
589 case EXEC_OUTPUT_JOURNAL
:
590 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
591 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
593 log_unit_warning_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m", fileno
== STDOUT_FILENO
? "stdout" : "stderr");
594 r
= open_null_as(O_WRONLY
, fileno
);
598 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
599 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
600 * services to detect whether they are connected to the journal or not.
602 * If both stdout and stderr are connected to a stream then let's make sure to store the data
603 * about STDERR as that's usually the best way to do logging. */
605 if (fstat(fileno
, &st
) >= 0 &&
606 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
607 *journal_stream_dev
= st
.st_dev
;
608 *journal_stream_ino
= st
.st_ino
;
613 case EXEC_OUTPUT_SOCKET
:
614 assert(socket_fd
>= 0);
615 return dup2(socket_fd
, fileno
) < 0 ? -errno
: fileno
;
617 case EXEC_OUTPUT_NAMED_FD
:
618 (void) fd_nonblock(named_iofds
[fileno
], false);
619 return dup2(named_iofds
[fileno
], fileno
) < 0 ? -errno
: fileno
;
622 assert_not_reached("Unknown error type");
626 static int chown_terminal(int fd
, uid_t uid
) {
631 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
635 /* This might fail. What matters are the results. */
636 (void) fchown(fd
, uid
, -1);
637 (void) fchmod(fd
, TTY_MODE
);
639 if (fstat(fd
, &st
) < 0)
642 if (st
.st_uid
!= uid
|| (st
.st_mode
& 0777) != TTY_MODE
)
648 static int setup_confirm_stdio(const char *vc
, int *_saved_stdin
, int *_saved_stdout
) {
649 _cleanup_close_
int fd
= -1, saved_stdin
= -1, saved_stdout
= -1;
652 assert(_saved_stdin
);
653 assert(_saved_stdout
);
655 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
659 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
660 if (saved_stdout
< 0)
663 fd
= acquire_terminal(vc
, false, false, false, DEFAULT_CONFIRM_USEC
);
667 r
= chown_terminal(fd
, getuid());
671 r
= reset_terminal_fd(fd
, true);
675 if (dup2(fd
, STDIN_FILENO
) < 0)
678 if (dup2(fd
, STDOUT_FILENO
) < 0)
685 *_saved_stdin
= saved_stdin
;
686 *_saved_stdout
= saved_stdout
;
688 saved_stdin
= saved_stdout
= -1;
693 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
696 if (err
== -ETIMEDOUT
)
697 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
700 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
704 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
705 _cleanup_close_
int fd
= -1;
709 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
713 write_confirm_error_fd(err
, fd
, u
);
716 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
720 assert(saved_stdout
);
724 if (*saved_stdin
>= 0)
725 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
728 if (*saved_stdout
>= 0)
729 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
732 *saved_stdin
= safe_close(*saved_stdin
);
733 *saved_stdout
= safe_close(*saved_stdout
);
739 CONFIRM_PRETEND_FAILURE
= -1,
740 CONFIRM_PRETEND_SUCCESS
= 0,
744 static int ask_for_confirmation(const char *vc
, Unit
*u
, const char *cmdline
) {
745 int saved_stdout
= -1, saved_stdin
= -1, r
;
746 _cleanup_free_
char *e
= NULL
;
749 /* For any internal errors, assume a positive response. */
750 r
= setup_confirm_stdio(vc
, &saved_stdin
, &saved_stdout
);
752 write_confirm_error(r
, vc
, u
);
753 return CONFIRM_EXECUTE
;
756 /* confirm_spawn might have been disabled while we were sleeping. */
757 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
762 e
= ellipsize(cmdline
, 60, 100);
770 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
772 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
779 printf("Resuming normal execution.\n");
780 manager_disable_confirm_spawn();
784 unit_dump(u
, stdout
, " ");
785 continue; /* ask again */
787 printf("Failing execution.\n");
788 r
= CONFIRM_PRETEND_FAILURE
;
791 printf(" c - continue, proceed without asking anymore\n"
792 " D - dump, show the state of the unit\n"
793 " f - fail, don't execute the command and pretend it failed\n"
795 " i - info, show a short summary of the unit\n"
796 " j - jobs, show jobs that are in progress\n"
797 " s - skip, don't execute the command and pretend it succeeded\n"
798 " y - yes, execute the command\n");
799 continue; /* ask again */
801 printf(" Description: %s\n"
804 u
->id
, u
->description
, cmdline
);
805 continue; /* ask again */
807 manager_dump_jobs(u
->manager
, stdout
, " ");
808 continue; /* ask again */
810 /* 'n' was removed in favor of 'f'. */
811 printf("Didn't understand 'n', did you mean 'f'?\n");
812 continue; /* ask again */
814 printf("Skipping execution.\n");
815 r
= CONFIRM_PRETEND_SUCCESS
;
821 assert_not_reached("Unhandled choice");
827 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
831 static int get_fixed_user(const ExecContext
*c
, const char **user
,
832 uid_t
*uid
, gid_t
*gid
,
833 const char **home
, const char **shell
) {
842 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
843 * (i.e. are "/" or "/bin/nologin"). */
846 r
= get_user_creds_clean(&name
, uid
, gid
, home
, shell
);
854 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
864 r
= get_group_creds(&name
, gid
);
872 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
873 const char *group
, gid_t gid
,
874 gid_t
**supplementary_gids
, int *ngids
) {
878 bool keep_groups
= false;
879 gid_t
*groups
= NULL
;
880 _cleanup_free_ gid_t
*l_gids
= NULL
;
885 * If user is given, then lookup GID and supplementary groups list.
886 * We avoid NSS lookups for gid=0. Also we have to initialize groups
887 * here and as early as possible so we keep the list of supplementary
888 * groups of the caller.
890 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
891 /* First step, initialize groups from /etc/group */
892 if (initgroups(user
, gid
) < 0)
898 if (strv_isempty(c
->supplementary_groups
))
902 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
903 * be positive, otherwise fail.
906 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
907 if (ngroups_max
<= 0) {
911 return -EOPNOTSUPP
; /* For all other values */
914 l_gids
= new(gid_t
, ngroups_max
);
920 * Lookup the list of groups that the user belongs to, we
921 * avoid NSS lookups here too for gid=0.
924 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
929 STRV_FOREACH(i
, c
->supplementary_groups
) {
932 if (k
>= ngroups_max
)
936 r
= get_group_creds(&g
, l_gids
+k
);
944 * Sets ngids to zero to drop all supplementary groups, happens
945 * when we are under root and SupplementaryGroups= is empty.
952 /* Otherwise get the final list of supplementary groups */
953 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
957 *supplementary_gids
= groups
;
965 static int enforce_groups(const ExecContext
*context
, gid_t gid
,
966 gid_t
*supplementary_gids
, int ngids
) {
971 /* Handle SupplementaryGroups= even if it is empty */
972 if (!strv_isempty(context
->supplementary_groups
)) {
973 r
= maybe_setgroups(ngids
, supplementary_gids
);
978 if (gid_is_valid(gid
)) {
979 /* Then set our gids */
980 if (setresgid(gid
, gid
, gid
) < 0)
987 static int enforce_user(const ExecContext
*context
, uid_t uid
) {
990 if (!uid_is_valid(uid
))
993 /* Sets (but doesn't look up) the uid and makes sure we keep the
994 * capabilities while doing so. */
996 if (context
->capability_ambient_set
!= 0) {
998 /* First step: If we need to keep capabilities but
999 * drop privileges we need to make sure we keep our
1000 * caps, while we drop privileges. */
1002 int sb
= context
->secure_bits
| 1<<SECURE_KEEP_CAPS
;
1004 if (prctl(PR_GET_SECUREBITS
) != sb
)
1005 if (prctl(PR_SET_SECUREBITS
, sb
) < 0)
1010 /* Second step: actually set the uids */
1011 if (setresuid(uid
, uid
, uid
) < 0)
1014 /* At this point we should have all necessary capabilities but
1015 are otherwise a normal user. However, the caps might have gotten
1016 corrupted due to the setresuid(), so we need to clean them up
1017 later. This is done outside of this call. */
1024 static int null_conv(
1026 const struct pam_message
**msg
,
1027 struct pam_response
**resp
,
1028 void *appdata_ptr
) {
1030 /* We don't support conversations */
1032 return PAM_CONV_ERR
;
1037 static int setup_pam(
1044 int fds
[], unsigned n_fds
) {
1048 static const struct pam_conv conv
= {
1053 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1054 pam_handle_t
*handle
= NULL
;
1056 int pam_code
= PAM_SUCCESS
, r
;
1057 char **nv
, **e
= NULL
;
1058 bool close_session
= false;
1059 pid_t pam_pid
= 0, parent_pid
;
1066 /* We set up PAM in the parent process, then fork. The child
1067 * will then stay around until killed via PR_SET_PDEATHSIG or
1068 * systemd via the cgroup logic. It will then remove the PAM
1069 * session again. The parent process will exec() the actual
1070 * daemon. We do things this way to ensure that the main PID
1071 * of the daemon is the one we initially fork()ed. */
1073 r
= barrier_create(&barrier
);
1077 if (log_get_max_level() < LOG_DEBUG
)
1078 flags
|= PAM_SILENT
;
1080 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1081 if (pam_code
!= PAM_SUCCESS
) {
1087 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1088 if (pam_code
!= PAM_SUCCESS
)
1092 STRV_FOREACH(nv
, *env
) {
1093 pam_code
= pam_putenv(handle
, *nv
);
1094 if (pam_code
!= PAM_SUCCESS
)
1098 pam_code
= pam_acct_mgmt(handle
, flags
);
1099 if (pam_code
!= PAM_SUCCESS
)
1102 pam_code
= pam_open_session(handle
, flags
);
1103 if (pam_code
!= PAM_SUCCESS
)
1106 close_session
= true;
1108 e
= pam_getenvlist(handle
);
1110 pam_code
= PAM_BUF_ERR
;
1114 /* Block SIGTERM, so that we know that it won't get lost in
1117 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1119 parent_pid
= getpid_cached();
1128 int sig
, ret
= EXIT_PAM
;
1130 /* The child's job is to reset the PAM session on
1132 barrier_set_role(&barrier
, BARRIER_CHILD
);
1134 /* This string must fit in 10 chars (i.e. the length
1135 * of "/sbin/init"), to look pretty in /bin/ps */
1136 rename_process("(sd-pam)");
1138 /* Make sure we don't keep open the passed fds in this
1139 child. We assume that otherwise only those fds are
1140 open here that have been opened by PAM. */
1141 close_many(fds
, n_fds
);
1143 /* Drop privileges - we don't need any to pam_close_session
1144 * and this will make PR_SET_PDEATHSIG work in most cases.
1145 * If this fails, ignore the error - but expect sd-pam threads
1146 * to fail to exit normally */
1148 r
= maybe_setgroups(0, NULL
);
1150 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1151 if (setresgid(gid
, gid
, gid
) < 0)
1152 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1153 if (setresuid(uid
, uid
, uid
) < 0)
1154 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1156 (void) ignore_signals(SIGPIPE
, -1);
1158 /* Wait until our parent died. This will only work if
1159 * the above setresuid() succeeds, otherwise the kernel
1160 * will not allow unprivileged parents to kill their privileged
1161 * children this way. We rely on the control groups kill logic
1162 * to do the rest for us. */
1163 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1166 /* Tell the parent that our setup is done. This is especially
1167 * important regarding dropping privileges. Otherwise, unit
1168 * setup might race against our setresuid(2) call.
1170 * If the parent aborted, we'll detect this below, hence ignore
1171 * return failure here. */
1172 (void) barrier_place(&barrier
);
1174 /* Check if our parent process might already have died? */
1175 if (getppid() == parent_pid
) {
1178 assert_se(sigemptyset(&ss
) >= 0);
1179 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1182 if (sigwait(&ss
, &sig
) < 0) {
1189 assert(sig
== SIGTERM
);
1194 /* If our parent died we'll end the session */
1195 if (getppid() != parent_pid
) {
1196 pam_code
= pam_close_session(handle
, flags
);
1197 if (pam_code
!= PAM_SUCCESS
)
1204 pam_end(handle
, pam_code
| flags
);
1208 barrier_set_role(&barrier
, BARRIER_PARENT
);
1210 /* If the child was forked off successfully it will do all the
1211 * cleanups, so forget about the handle here. */
1214 /* Unblock SIGTERM again in the parent */
1215 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1217 /* We close the log explicitly here, since the PAM modules
1218 * might have opened it, but we don't want this fd around. */
1221 /* Synchronously wait for the child to initialize. We don't care for
1222 * errors as we cannot recover. However, warn loudly if it happens. */
1223 if (!barrier_place_and_sync(&barrier
))
1224 log_error("PAM initialization failed");
1232 if (pam_code
!= PAM_SUCCESS
) {
1233 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1234 r
= -EPERM
; /* PAM errors do not map to errno */
1236 log_error_errno(r
, "PAM failed: %m");
1240 pam_code
= pam_close_session(handle
, flags
);
1242 pam_end(handle
, pam_code
| flags
);
1254 static void rename_process_from_path(const char *path
) {
1255 char process_name
[11];
1259 /* This resulting string must fit in 10 chars (i.e. the length
1260 * of "/sbin/init") to look pretty in /bin/ps */
1264 rename_process("(...)");
1270 /* The end of the process name is usually more
1271 * interesting, since the first bit might just be
1277 process_name
[0] = '(';
1278 memcpy(process_name
+1, p
, l
);
1279 process_name
[1+l
] = ')';
1280 process_name
[1+l
+1] = 0;
1282 rename_process(process_name
);
1285 static bool context_has_address_families(const ExecContext
*c
) {
1288 return c
->address_families_whitelist
||
1289 !set_isempty(c
->address_families
);
1292 static bool context_has_syscall_filters(const ExecContext
*c
) {
1295 return c
->syscall_whitelist
||
1296 !set_isempty(c
->syscall_filter
);
1299 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1302 if (c
->no_new_privileges
)
1305 if (have_effective_cap(CAP_SYS_ADMIN
)) /* if we are privileged, we don't need NNP */
1308 /* We need NNP if we have any form of seccomp and are unprivileged */
1309 return context_has_address_families(c
) ||
1310 c
->memory_deny_write_execute
||
1311 c
->restrict_realtime
||
1312 exec_context_restrict_namespaces_set(c
) ||
1313 c
->protect_kernel_tunables
||
1314 c
->protect_kernel_modules
||
1315 c
->private_devices
||
1316 context_has_syscall_filters(c
) ||
1317 !set_isempty(c
->syscall_archs
) ||
1318 c
->lock_personality
;
1323 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1325 if (is_seccomp_available())
1328 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1332 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1333 uint32_t negative_action
, default_action
, action
;
1339 if (!context_has_syscall_filters(c
))
1342 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1345 negative_action
= c
->syscall_errno
== 0 ? SCMP_ACT_KILL
: SCMP_ACT_ERRNO(c
->syscall_errno
);
1347 if (c
->syscall_whitelist
) {
1348 default_action
= negative_action
;
1349 action
= SCMP_ACT_ALLOW
;
1351 default_action
= SCMP_ACT_ALLOW
;
1352 action
= negative_action
;
1355 if (needs_ambient_hack
) {
1356 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_whitelist
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1361 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
);
1364 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1368 if (set_isempty(c
->syscall_archs
))
1371 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1374 return seccomp_restrict_archs(c
->syscall_archs
);
1377 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1381 if (!context_has_address_families(c
))
1384 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1387 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_whitelist
);
1390 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1394 if (!c
->memory_deny_write_execute
)
1397 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1400 return seccomp_memory_deny_write_execute();
1403 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1407 if (!c
->restrict_realtime
)
1410 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1413 return seccomp_restrict_realtime();
1416 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1420 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1421 * let's protect even those systems where this is left on in the kernel. */
1423 if (!c
->protect_kernel_tunables
)
1426 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1429 return seccomp_protect_sysctl();
1432 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1436 /* Turn off module syscalls on ProtectKernelModules=yes */
1438 if (!c
->protect_kernel_modules
)
1441 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1444 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
));
1447 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1451 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1453 if (!c
->private_devices
)
1456 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1459 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
));
1462 static int apply_restrict_namespaces(Unit
*u
, const ExecContext
*c
) {
1466 if (!exec_context_restrict_namespaces_set(c
))
1469 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1472 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
1475 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1476 unsigned long personality
;
1482 if (!c
->lock_personality
)
1485 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1488 personality
= c
->personality
;
1490 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1491 if (personality
== PERSONALITY_INVALID
) {
1493 r
= opinionated_personality(&personality
);
1498 return seccomp_lock_personality(personality
);
1503 static void do_idle_pipe_dance(int idle_pipe
[4]) {
1506 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1507 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1509 if (idle_pipe
[0] >= 0) {
1512 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1514 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1517 /* Signal systemd that we are bored and want to continue. */
1518 n
= write(idle_pipe
[3], "x", 1);
1520 /* Wait for systemd to react to the signal above. */
1521 fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1524 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1528 idle_pipe
[3] = safe_close(idle_pipe
[3]);
1531 static int build_environment(
1533 const ExecContext
*c
,
1534 const ExecParameters
*p
,
1537 const char *username
,
1539 dev_t journal_stream_dev
,
1540 ino_t journal_stream_ino
,
1543 _cleanup_strv_free_
char **our_env
= NULL
;
1551 our_env
= new0(char*, 14);
1556 _cleanup_free_
char *joined
= NULL
;
1558 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1560 our_env
[n_env
++] = x
;
1562 if (asprintf(&x
, "LISTEN_FDS=%u", n_fds
) < 0)
1564 our_env
[n_env
++] = x
;
1566 joined
= strv_join(p
->fd_names
, ":");
1570 x
= strjoin("LISTEN_FDNAMES=", joined
);
1573 our_env
[n_env
++] = x
;
1576 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1577 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1579 our_env
[n_env
++] = x
;
1581 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1583 our_env
[n_env
++] = x
;
1586 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus to look up dynamic
1587 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1588 * check the database directly. */
1589 if (p
->flags
& EXEC_NSS_BYPASS_BUS
) {
1590 x
= strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1593 our_env
[n_env
++] = x
;
1597 x
= strappend("HOME=", home
);
1600 our_env
[n_env
++] = x
;
1604 x
= strappend("LOGNAME=", username
);
1607 our_env
[n_env
++] = x
;
1609 x
= strappend("USER=", username
);
1612 our_env
[n_env
++] = x
;
1616 x
= strappend("SHELL=", shell
);
1619 our_env
[n_env
++] = x
;
1622 if (!sd_id128_is_null(u
->invocation_id
)) {
1623 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1626 our_env
[n_env
++] = x
;
1629 if (exec_context_needs_term(c
)) {
1630 const char *tty_path
, *term
= NULL
;
1632 tty_path
= exec_context_tty_path(c
);
1634 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1635 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1636 * passes to PID 1 ends up all the way in the console login shown. */
1638 if (path_equal(tty_path
, "/dev/console") && getppid() == 1)
1639 term
= getenv("TERM");
1641 term
= default_term_for_tty(tty_path
);
1643 x
= strappend("TERM=", term
);
1646 our_env
[n_env
++] = x
;
1649 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
1650 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
1653 our_env
[n_env
++] = x
;
1656 our_env
[n_env
++] = NULL
;
1657 assert(n_env
<= 12);
/* Builds a NULL-terminated vector of "NAME=value" strings, one per name listed in c->pass_environment.
 * Each entry is assembled with strjoin() and appended to a vector that is grown with GREEDY_REALLOC()
 * so that there is always room for the trailing NULL.
 * NOTE(review): where the value 'v' comes from, the error returns and the hand-over to *ret are not
 * visible in this excerpt -- confirm against the full function. */
1665 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
1666 _cleanup_strv_free_
char **pass_env
= NULL
;
1667 size_t n_env
= 0, n_bufsize
= 0;
1670 STRV_FOREACH(i
, c
->pass_environment
) {
1671 _cleanup_free_
char *x
= NULL
;
1677 x
= strjoin(*i
, "=", v
);
1681 if (!GREEDY_REALLOC(pass_env
, n_bufsize
, n_env
+ 2))
1684 pass_env
[n_env
++] = x
;
1685 pass_env
[n_env
] = NULL
;
/* Tests the execution context, parameters and runtime object for the settings checked below: a root
 * image; non-empty read_write_paths, read_only_paths or inaccessible_paths; configured bind mounts;
 * non-zero mount_flags; private_tmp together with a tmp_dir or var_tmp_dir in the runtime object;
 * private_devices, protect_system, protect_home, protect_kernel_tunables, protect_kernel_modules or
 * protect_control_groups; mount_apivfs combined with a root image or root directory; and dynamic_user
 * combined with any runtime, state, cache or logs directory.
 * NOTE(review): the value returned for each test is not visible in this excerpt -- presumably true on
 * the first match and false otherwise; confirm. */
1695 static bool exec_needs_mount_namespace(
1696 const ExecContext
*context
,
1697 const ExecParameters
*params
,
1698 ExecRuntime
*runtime
) {
1703 if (context
->root_image
)
1706 if (!strv_isempty(context
->read_write_paths
) ||
1707 !strv_isempty(context
->read_only_paths
) ||
1708 !strv_isempty(context
->inaccessible_paths
))
1711 if (context
->n_bind_mounts
> 0)
1714 if (context
->mount_flags
!= 0)
1717 if (context
->private_tmp
&& runtime
&& (runtime
->tmp_dir
|| runtime
->var_tmp_dir
))
1720 if (context
->private_devices
||
1721 context
->protect_system
!= PROTECT_SYSTEM_NO
||
1722 context
->protect_home
!= PROTECT_HOME_NO
||
1723 context
->protect_kernel_tunables
||
1724 context
->protect_kernel_modules
||
1725 context
->protect_control_groups
)
1728 if (context
->mount_apivfs
&& (context
->root_image
|| context
->root_directory
))
1731 if (context
->dynamic_user
&&
1732 (!strv_isempty(context
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) ||
1733 !strv_isempty(context
->directories
[EXEC_DIRECTORY_STATE
].paths
) ||
1734 !strv_isempty(context
->directories
[EXEC_DIRECTORY_CACHE
].paths
) ||
1735 !strv_isempty(context
->directories
[EXEC_DIRECTORY_LOGS
].paths
)))
/* Moves the calling process into a new user namespace (unshare(CLONE_NEWUSER)) and has a helper child
 * process write the "setgroups", "gid_map" and "uid_map" procfs files of its parent (ppid).
 *
 * The maps always contain "0 0 1"; when uid (resp. gid) is valid and non-zero a second line mapping it to
 * itself is added. Synchronisation uses two channels: an eventfd on which the child waits until the
 * parent has unshared, and a pipe over which the child sends an error code back. After signalling the
 * child, the parent reads from the pipe (a full-sized read means an error code was sent, zero bytes means
 * success), then waits for the child and checks that it exited with status 0.
 * NOTE(review): the creation of the child process and the individual error returns are not visible in
 * this excerpt. */
1741 static int setup_private_users(uid_t uid
, gid_t gid
) {
1742 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
1743 _cleanup_close_pair_
int errno_pipe
[2] = { -1, -1 };
1744 _cleanup_close_
int unshare_ready_fd
= -1;
1745 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
1751 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1752 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1753 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1754 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1755 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1756 * continues execution normally. */
1758 if (uid
!= 0 && uid_is_valid(uid
)) {
1759 r
= asprintf(&uid_map
,
1760 "0 0 1\n" /* Map root → root */
1761 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
1766 uid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1771 if (gid
!= 0 && gid_is_valid(gid
)) {
1772 r
= asprintf(&gid_map
,
1773 "0 0 1\n" /* Map root → root */
1774 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
1779 gid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1784 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1786 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
1787 if (unshare_ready_fd
< 0)
1790 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1792 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
1800 _cleanup_close_
int fd
= -1;
1804 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1805 * here, after the parent opened its own user namespace. */
1808 errno_pipe
[0] = safe_close(errno_pipe
[0]);
1810 /* Wait until the parent unshared the user namespace */
1811 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
1816 /* Disable the setgroups() system call in the child user namespace, for good. */
1817 a
= procfs_file_alloca(ppid
, "setgroups");
1818 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1820 if (errno
!= ENOENT
) {
1825 /* If the file is missing the kernel is too old, let's continue anyway. */
1827 if (write(fd
, "deny\n", 5) < 0) {
1832 fd
= safe_close(fd
);
1835 /* First write the GID map */
1836 a
= procfs_file_alloca(ppid
, "gid_map");
1837 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1842 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
1846 fd
= safe_close(fd
);
1848 /* Then write the UID map */
1849 a
= procfs_file_alloca(ppid
, "uid_map");
1850 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1855 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
1860 _exit(EXIT_SUCCESS
);
1863 (void) write(errno_pipe
[1], &r
, sizeof(r
));
1864 _exit(EXIT_FAILURE
);
1867 errno_pipe
[1] = safe_close(errno_pipe
[1]);
1869 if (unshare(CLONE_NEWUSER
) < 0)
1872 /* Let the child know that the namespace is ready now */
1873 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
1876 /* Try to read an error code from the child */
1877 n
= read(errno_pipe
[0], &r
, sizeof(r
));
1880 if (n
== sizeof(r
)) { /* an error code was sent to us */
1885 if (n
!= 0) /* on success we should have read 0 bytes */
1888 r
= wait_for_terminate(pid
, &si
);
1893 /* If something strange happened with the child, let's consider this fatal, too */
1894 if (si
.si_code
!= CLD_EXITED
|| si
.si_status
!= 0)
/* Creates the directories configured in context->directories[type].paths below params->prefix[type].
 *
 * For every configured path: the parent directories are created with mode 0755; for dynamic-user
 * services (except for EXEC_DIRECTORY_CONFIGURATION) the real directory is created below a root-owned
 * 0700 "private" directory and a relative symlink is placed at the regular location; otherwise the
 * directory is created directly, tolerating -EEXIST. The resulting directory is then chmod()ed to the
 * configured mode and, unless it is a configuration directory, chowned recursively to uid/gid.
 * *exit_status is set from a per-type table of EXIT_*_DIRECTORY codes.
 * NOTE(review): the error paths, the assignment of 'effective' and the circumstances under which
 * *exit_status is written are not visible in this excerpt -- presumably on failure only; confirm. */
1900 static int setup_exec_directory(
1901 const ExecContext
*context
,
1902 const ExecParameters
*params
,
1905 ExecDirectoryType type
,
1908 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
1909 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
1910 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
1911 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
1912 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
1913 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
1920 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
1921 assert(exit_status
);
1923 if (!params
->prefix
[type
])
1926 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
1927 if (!uid_is_valid(uid
))
1929 if (!gid_is_valid(gid
))
1933 STRV_FOREACH(rt
, context
->directories
[type
].paths
) {
1934 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
1935 const char *effective
;
1937 p
= strjoin(params
->prefix
[type
], "/", *rt
);
1943 r
= mkdir_parents_label(p
, 0755);
1947 if (context
->dynamic_user
&& type
!= EXEC_DIRECTORY_CONFIGURATION
) {
1948 _cleanup_free_
char *private_root
= NULL
, *relative
= NULL
, *parent
= NULL
;
1950 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
1951 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
1952 * whose UID is later on reused. To lock this down we use the same trick used by container
1953 * managers to prohibit host users to get access to files of the same UID in containers: we
1954 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
1955 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
1956 * to make this directory permeable for the service itself.
1958 * Specifically: for a service which wants a special directory "foo/" we first create a
1959 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
1960 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
1961 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
1962 * into it. Inside of the namespace of the container "private/" is replaced by a more liberally
1963 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
1964 * disabling the access boundary for the service and making sure it only gets access to the
1965 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
1967 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
1968 * owned by the service itself. */
1970 private_root
= strjoin(params
->prefix
[type
], "/private");
1971 if (!private_root
) {
1976 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
1977 r
= mkdir_safe_label(private_root
, 0700, 0, 0, false);
1981 pp
= strjoin(private_root
, "/", *rt
);
1987 /* Create all directories between the configured directory and this private root, and mark them 0755 */
1988 r
= mkdir_parents_label(pp
, 0755);
1992 /* Finally, create the actual directory for the service */
1993 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
1994 if (r
< 0 && r
!= -EEXIST
)
1997 parent
= dirname_malloc(p
);
2003 r
= path_make_relative(parent
, pp
, &relative
);
2007 /* And link it up from the original place */
2008 r
= symlink_idempotent(relative
, p
);
2015 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2016 if (r
< 0 && r
!= -EEXIST
)
2022 /* First lock down the access mode */
2023 if (chmod(effective
, context
->directories
[type
].mode
) < 0) {
2028 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2029 * a service, and shall not be writable. */
2030 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
2033 /* Then, change the ownership of the whole tree, if necessary */
2034 r
= path_chown_recursive(effective
, uid
, gid
);
2042 *exit_status
= exit_status_table
[type
];
/* Applies a SMACK label to the calling process (PID argument 0 of mac_smack_apply_pid()).
 *
 * If context->smack_process_label is set, that label is applied. In builds that define
 * SMACK_DEFAULT_PROCESS_LABEL there is a second path: the SMACK_ATTR_EXEC label of command->path is read
 * (-ENODATA and -EOPNOTSUPP are tolerated) and applied, falling back to SMACK_DEFAULT_PROCESS_LABEL when
 * the executable carries no such label.
 * NOTE(review): the branch structure between the two paths and the return values are not visible in
 * this excerpt. */
2046 static int setup_smack(
2047 const ExecContext
*context
,
2048 const ExecCommand
*command
) {
2055 if (context
->smack_process_label
) {
2056 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
2060 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2062 _cleanup_free_
char *exec_label
= NULL
;
2064 r
= mac_smack_read(command
->path
, SMACK_ATTR_EXEC
, &exec_label
);
2065 if (r
< 0 && !IN_SET(r
, -ENODATA
, -EOPNOTSUPP
))
2068 r
= mac_smack_apply_pid(0, exec_label
? : SMACK_DEFAULT_PROCESS_LABEL
);
/* Compiles the list of writable paths for a service: a copy of context->read_write_paths, extended by
 * "<prefix>/<path>" for every configured directory of every directory type that has a prefix in params.
 *
 * If read_write_paths is empty and no directory type has any path configured, *ret is set to NULL and
 * nothing is compiled.
 * NOTE(review): the return values, the error handling and the final hand-over of the list to *ret are
 * not visible in this excerpt. */
2077 static int compile_read_write_paths(
2078 const ExecContext
*context
,
2079 const ExecParameters
*params
,
2082 _cleanup_strv_free_
char **l
= NULL
;
2084 ExecDirectoryType i
;
2086 /* Compile the list of writable paths. This is the combination of
2087 * the explicitly configured paths, plus all runtime directories. */
2089 if (strv_isempty(context
->read_write_paths
)) {
2090 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
2091 if (!strv_isempty(context
->directories
[i
].paths
))
2094 if (i
== _EXEC_DIRECTORY_TYPE_MAX
) {
2095 *ret
= NULL
; /* NOP if neither is set */
2100 l
= strv_copy(context
->read_write_paths
);
2104 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++) {
2105 if (!params
->prefix
[i
])
2108 STRV_FOREACH(rt
, context
->directories
[i
].paths
) {
2111 s
= strjoin(params
->prefix
[i
], "/", *rt
);
2115 if (strv_consume(&l
, s
) < 0)
2126 static int compile_bind_mounts(
2127 const ExecContext
*context
,
2128 const ExecParameters
*params
,
2129 BindMount
**ret_bind_mounts
,
2130 unsigned *ret_n_bind_mounts
,
2131 char ***ret_empty_directories
) {
2133 _cleanup_strv_free_
char **empty_directories
= NULL
;
2134 BindMount
*bind_mounts
;
2135 unsigned n
, h
= 0, i
;
2136 ExecDirectoryType t
;
2141 assert(ret_bind_mounts
);
2142 assert(ret_n_bind_mounts
);
2143 assert(ret_empty_directories
);
2145 n
= context
->n_bind_mounts
;
2146 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2147 if (!params
->prefix
[t
])
2150 n
+= strv_length(context
->directories
[t
].paths
);
2154 *ret_bind_mounts
= NULL
;
2155 *ret_n_bind_mounts
= 0;
2156 *ret_empty_directories
= NULL
;
2160 bind_mounts
= new(BindMount
, n
);
2164 for (i
= 0; context
->n_bind_mounts
; i
++) {
2165 BindMount
*item
= context
->bind_mounts
+ i
;
2168 s
= strdup(item
->source
);
2174 d
= strdup(item
->destination
);
2181 bind_mounts
[h
++] = (BindMount
) {
2184 .read_only
= item
->read_only
,
2185 .recursive
= item
->recursive
,
2186 .ignore_enoent
= item
->ignore_enoent
,
2190 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2193 if (!params
->prefix
[t
])
2196 if (strv_isempty(context
->directories
[t
].paths
))
2199 if (context
->dynamic_user
&& t
!= EXEC_DIRECTORY_CONFIGURATION
) {
2202 /* So this is for a dynamic user, and we need to make sure the process can access its own
2203 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2204 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2206 private_root
= strjoin(params
->prefix
[t
], "/private");
2207 if (!private_root
) {
2212 r
= strv_consume(&empty_directories
, private_root
);
2219 STRV_FOREACH(suffix
, context
->directories
[t
].paths
) {
2222 if (context
->dynamic_user
&& t
!= EXEC_DIRECTORY_CONFIGURATION
)
2223 s
= strjoin(params
->prefix
[t
], "/private/", *suffix
);
2225 s
= strjoin(params
->prefix
[t
], "/", *suffix
);
2238 bind_mounts
[h
++] = (BindMount
) {
2243 .ignore_enoent
= false,
2250 *ret_bind_mounts
= bind_mounts
;
2251 *ret_n_bind_mounts
= n
;
2252 *ret_empty_directories
= empty_directories
;
2254 empty_directories
= NULL
;
2259 bind_mount_free_many(bind_mounts
, h
);
/* Sets up the mount namespace for a command by collecting all inputs for setup_namespace() and calling it.
 *
 * Inputs gathered here: the private /tmp and /var/tmp directories (the "tmp" subdirectory of the
 * directories stored in the runtime object), the compiled list of writable paths, the root image or
 * root directory (consulted when EXEC_APPLY_CHROOT is set), and the compiled bind mounts and empty
 * directories. Read-only and inaccessible paths as well as protect_home/protect_system are only passed
 * on when sandboxing applies, i.e. EXEC_APPLY_SANDBOXING is set and the command is not flagged
 * EXEC_COMMAND_FULLY_PRIVILEGED. The bind mount array is freed after the call. A result of -EPERM or
 * -EACCES is logged at debug level as presumably-containerized execution.
 * NOTE(review): the error returns and the final return value are not visible in this excerpt. */
2263 static int apply_mount_namespace(
2265 ExecCommand
*command
,
2266 const ExecContext
*context
,
2267 const ExecParameters
*params
,
2268 ExecRuntime
*runtime
) {
2270 _cleanup_strv_free_
char **rw
= NULL
, **empty_directories
= NULL
;
2271 char *tmp
= NULL
, *var
= NULL
;
2272 const char *root_dir
= NULL
, *root_image
= NULL
;
2273 NameSpaceInfo ns_info
= {
2274 .ignore_protect_paths
= false,
2275 .private_dev
= context
->private_devices
,
2276 .protect_control_groups
= context
->protect_control_groups
,
2277 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
2278 .protect_kernel_modules
= context
->protect_kernel_modules
,
2279 .mount_apivfs
= context
->mount_apivfs
,
2281 bool needs_sandboxing
;
2282 BindMount
*bind_mounts
= NULL
;
2283 unsigned n_bind_mounts
= 0;
2288 /* The runtime struct only contains the parent of the private /tmp,
2289 * which is non-accessible to world users. Inside of it there's a /tmp
2290 * that is sticky, and that's the one we want to use here. */
2292 if (context
->private_tmp
&& runtime
) {
2293 if (runtime
->tmp_dir
)
2294 tmp
= strjoina(runtime
->tmp_dir
, "/tmp");
2295 if (runtime
->var_tmp_dir
)
2296 var
= strjoina(runtime
->var_tmp_dir
, "/tmp");
2299 r
= compile_read_write_paths(context
, params
, &rw
);
2303 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2304 root_image
= context
->root_image
;
2307 root_dir
= context
->root_directory
;
2310 r
= compile_bind_mounts(context
, params
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
2315 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2316 * sandbox info, otherwise enforce it, don't ignore protected paths and
2317 * fail if we are enable to apply the sandbox inside the mount namespace.
2319 if (!context
->dynamic_user
&& root_dir
)
2320 ns_info
.ignore_protect_paths
= true;
2322 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2324 r
= setup_namespace(root_dir
, root_image
,
2326 needs_sandboxing
? context
->read_only_paths
: NULL
,
2327 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
2333 needs_sandboxing
? context
->protect_home
: PROTECT_HOME_NO
,
2334 needs_sandboxing
? context
->protect_system
: PROTECT_SYSTEM_NO
,
2335 context
->mount_flags
,
2336 DISSECT_IMAGE_DISCARD_ON_LOOP
);
2338 bind_mount_free_many(bind_mounts
, n_bind_mounts
);
2340 /* If we couldn't set up the namespace this is probably due to a
2341 * missing capability. In this case, silently proceeed. */
2342 if (IN_SET(r
, -EPERM
, -EACCES
)) {
2343 log_unit_debug_errno(u
, r
, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
/* Changes into the working directory configured for the service, applying chroot() first if necessary.
 *
 * The directory is chosen from working_directory_home or working_directory. When EXEC_APPLY_CHROOT is
 * set, a root directory is configured and no mount namespace is in use (needs_mount_ns is false),
 * chroot() is called on the root directory; failure sets *exit_status to EXIT_CHROOT. The path handed
 * to chdir() is built with prefix_roota() from the root directory and the chosen directory. A failing
 * chdir() sets *exit_status to EXIT_CHDIR unless working_directory_missing_ok is set.
 * NOTE(review): the handling of the home-directory case, the fallback directory and the return values
 * are not visible in this excerpt. */
2350 static int apply_working_directory(
2351 const ExecContext
*context
,
2352 const ExecParameters
*params
,
2354 const bool needs_mount_ns
,
2360 assert(exit_status
);
2362 if (context
->working_directory_home
) {
2365 *exit_status
= EXIT_CHDIR
;
2371 } else if (context
->working_directory
)
2372 wd
= context
->working_directory
;
2376 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2377 if (!needs_mount_ns
&& context
->root_directory
)
2378 if (chroot(context
->root_directory
) < 0) {
2379 *exit_status
= EXIT_CHROOT
;
2385 d
= prefix_roota(context
->root_directory
, wd
);
2387 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
2388 *exit_status
= EXIT_CHDIR
;
2395 static int setup_keyring(
2395 static int setup_keyring(
/* (Parameters and body of setup_keyring().)
 *
 * Joins a fresh session keyring via keyctl(KEYCTL_JOIN_SESSION_KEYRING) unless EXEC_NEW_KEYRING is
 * unset or the keyring mode is EXEC_KEYRING_INHERIT. ENOSYS, EACCES, EPERM and EDQUOT from the join are
 * logged at debug level; other errors are logged as errors and returned. If the unit has an invocation
 * ID it is stored as a "user" key named "invocation_id" and its permissions are restricted to
 * view/read/search for possessor and user. The keyring is then chowned to uid/gid if either is valid.
 * For EXEC_KEYRING_SHARED the user keyring is linked into the session keyring while real UID/GID are
 * temporarily switched to uid/gid, and switched back afterwards.
 * NOTE(review): the early-return values are not visible in this excerpt. */
2397 const ExecContext
*context
,
2398 const ExecParameters
*p
,
2399 uid_t uid
, gid_t gid
) {
2401 key_serial_t keyring
;
2408 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2409 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2410 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2411 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2412 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2413 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2415 if (!(p
->flags
& EXEC_NEW_KEYRING
))
2418 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
2421 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
2422 if (keyring
== -1) {
2423 if (errno
== ENOSYS
)
2424 log_unit_debug_errno(u
, errno
, "Kernel keyring not supported, ignoring.");
2425 else if (IN_SET(errno
, EACCES
, EPERM
))
2426 log_unit_debug_errno(u
, errno
, "Kernel keyring access prohibited, ignoring.");
2427 else if (errno
== EDQUOT
)
2428 log_unit_debug_errno(u
, errno
, "Out of kernel keyrings to allocate, ignoring.");
2430 return log_unit_error_errno(u
, errno
, "Setting up kernel keyring failed: %m");
2435 /* Populate the keyring with the invocation ID by default. */
2436 if (!sd_id128_is_null(u
->invocation_id
)) {
2439 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
2441 log_unit_debug_errno(u
, errno
, "Failed to add invocation ID to keyring, ignoring: %m");
2443 if (keyctl(KEYCTL_SETPERM
, key
,
2444 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
2445 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
2446 return log_unit_error_errno(u
, errno
, "Failed to restrict invocation ID permission: %m");
2450 /* And now, make the keyring owned by the service's user */
2451 if (uid_is_valid(uid
) || gid_is_valid(gid
))
2452 if (keyctl(KEYCTL_CHOWN
, keyring
, uid
, gid
, 0) < 0)
2453 return log_unit_error_errno(u
, errno
, "Failed to change ownership of session keyring: %m");
2455 /* When requested link the user keyring into the session keyring. */
2456 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
2460 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2461 * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2462 * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
2464 saved_uid
= getuid();
2465 saved_gid
= getgid();
2467 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2468 if (setregid(gid
, -1) < 0)
2469 return log_unit_error_errno(u
, errno
, "Failed to change GID for user keyring: %m");
2472 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2473 if (setreuid(uid
, -1) < 0) {
2474 (void) setregid(saved_gid
, -1);
2475 return log_unit_error_errno(u
, errno
, "Failed to change UID for user keyring: %m");
2479 if (keyctl(KEYCTL_LINK
,
2480 KEY_SPEC_USER_KEYRING
,
2481 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
2485 (void) setreuid(saved_uid
, -1);
2486 (void) setregid(saved_gid
, -1);
2488 return log_unit_error_errno(u
, r
, "Failed to link user keyring into session keyring: %m");
2491 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2492 if (setreuid(saved_uid
, -1) < 0) {
2493 (void) setregid(saved_gid
, -1);
2494 return log_unit_error_errno(u
, errno
, "Failed to change UID back for user keyring: %m");
2498 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2499 if (setregid(saved_gid
, -1) < 0)
2500 return log_unit_error_errno(u
, errno
, "Failed to change GID back for user keyring: %m");
/* Appends the two file descriptors of a socket pair to 'array', advancing the element counter *n for
 * each one stored. The caller has to make sure the array has room for two more entries.
 * NOTE(review): any guards around the two stores (e.g. skipping unset fds) are not visible in this
 * excerpt. */
2507 static void append_socket_pair(int *array
, unsigned *n
, int pair
[2]) {
2515 array
[(*n
)++] = pair
[0];
2517 array
[(*n
)++] = pair
[1];
/* Closes all file descriptors of the process except for the ones that are still needed.
 *
 * The keep-list is assembled on the stack (n_fds + 12 slots) from: the stdin/stdout/stderr fds set in
 * params, socket_fd, the n_fds entries of 'fds', the network namespace storage socket pair of the
 * runtime object, the storage socket pairs of the dynamic user and group, and user_lookup_fd. The
 * list is then handed to close_all_fds(), whose result is returned.
 * NOTE(review): the conditions guarding socket_fd, runtime and dcreds are not visible in this
 * excerpt. */
2520 static int close_remaining_fds(
2521 const ExecParameters
*params
,
2522 ExecRuntime
*runtime
,
2523 DynamicCreds
*dcreds
,
2526 int *fds
, unsigned n_fds
) {
2528 unsigned n_dont_close
= 0;
2529 int dont_close
[n_fds
+ 12];
2533 if (params
->stdin_fd
>= 0)
2534 dont_close
[n_dont_close
++] = params
->stdin_fd
;
2535 if (params
->stdout_fd
>= 0)
2536 dont_close
[n_dont_close
++] = params
->stdout_fd
;
2537 if (params
->stderr_fd
>= 0)
2538 dont_close
[n_dont_close
++] = params
->stderr_fd
;
2541 dont_close
[n_dont_close
++] = socket_fd
;
2543 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
2544 n_dont_close
+= n_fds
;
2548 append_socket_pair(dont_close
, &n_dont_close
, runtime
->netns_storage_socket
);
2552 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->user
->storage_socket
);
2554 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->group
->storage_socket
);
2557 if (user_lookup_fd
>= 0)
2558 dont_close
[n_dont_close
++] = user_lookup_fd
;
2560 return close_all_fds(dont_close
, n_dont_close
);
/* Reports the resolved UID/GID of a unit over user_lookup_fd. The message is written with a single
 * writev() call made of three parts: the uid, the gid and the unit's id string. A negative
 * user_lookup_fd, or a uid and gid that are both invalid, are tested for before the write.
 * NOTE(review): the values returned for those two cases and on writev() failure are not visible in
 * this excerpt. */
2563 static int send_user_lookup(
2571 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2572 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2575 if (user_lookup_fd
< 0)
2578 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
2581 if (writev(user_lookup_fd
,
2583 IOVEC_INIT(&uid
, sizeof(uid
)),
2584 IOVEC_INIT(&gid
, sizeof(gid
)),
2585 IOVEC_INIT_STRING(unit
->id
) }, 3) < 0)
/* Determines a home directory for use as working directory. Only relevant when
 * c->working_directory_home is set; a directory is then looked up with get_home_dir() into *buf.
 * NOTE(review): the early exits, the special case for UID 0 and the assignment of *home are not
 * visible in this excerpt. */
2591 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
2598 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2603 if (!c
->working_directory_home
)
2607 /* Hardcode /root as home directory for UID 0 */
2612 r
= get_home_dir(buf
);
/* Compiles the list "<prefix>/private/<path>" for the configured directories of a dynamic-user
 * service (c->dynamic_user is asserted). EXEC_DIRECTORY_CONFIGURATION is tested for separately inside
 * the loop over the directory types.
 * NOTE(review): what happens for configuration directories and for types without a prefix, as well as
 * the error handling and the hand-over of the list to *ret, are not visible in this excerpt. */
2620 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
2621 _cleanup_strv_free_
char ** list
= NULL
;
2622 ExecDirectoryType t
;
2629 assert(c
->dynamic_user
);
2631 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2632 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2635 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2638 if (t
== EXEC_DIRECTORY_CONFIGURATION
)
2644 STRV_FOREACH(i
, c
->directories
[t
].paths
) {
2647 e
= strjoin(p
->prefix
[t
], "/private/", *i
);
2651 r
= strv_consume(&list
, e
);
2663 static int exec_child(
2665 ExecCommand
*command
,
2666 const ExecContext
*context
,
2667 const ExecParameters
*params
,
2668 ExecRuntime
*runtime
,
2669 DynamicCreds
*dcreds
,
2674 unsigned n_storage_fds
,
2675 unsigned n_socket_fds
,
2680 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **accum_env
= NULL
, **final_argv
= NULL
;
2681 _cleanup_free_
char *mac_selinux_context_net
= NULL
, *home_buffer
= NULL
;
2682 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
2683 const char *username
= NULL
, *groupname
= NULL
;
2684 const char *home
= NULL
, *shell
= NULL
;
2685 dev_t journal_stream_dev
= 0;
2686 ino_t journal_stream_ino
= 0;
2687 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2688 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
2689 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
2690 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
2692 bool use_selinux
= false;
2695 bool use_smack
= false;
2698 bool use_apparmor
= false;
2700 uid_t uid
= UID_INVALID
;
2701 gid_t gid
= GID_INVALID
;
2702 int i
, r
, ngids
= 0;
2704 ExecDirectoryType dt
;
2711 assert(exit_status
);
2713 rename_process_from_path(command
->path
);
2715 /* We reset exactly these signals, since they are the
2716 * only ones we set to SIG_IGN in the main daemon. All
2717 * others we leave untouched because we set them to
2718 * SIG_DFL or a valid handler initially, both of which
2719 * will be demoted to SIG_DFL. */
2720 (void) default_signals(SIGNALS_CRASH_HANDLER
,
2721 SIGNALS_IGNORE
, -1);
2723 if (context
->ignore_sigpipe
)
2724 (void) ignore_signals(SIGPIPE
, -1);
2726 r
= reset_signal_mask();
2728 *exit_status
= EXIT_SIGNAL_MASK
;
2729 return log_unit_error_errno(unit
, r
, "Failed to set process signal mask: %m");
2732 if (params
->idle_pipe
)
2733 do_idle_pipe_dance(params
->idle_pipe
);
2735 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2736 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2737 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2738 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2741 log_set_open_when_needed(true);
2743 /* In case anything used libc syslog(), close this here, too */
2746 n_fds
= n_storage_fds
+ n_socket_fds
;
2747 r
= close_remaining_fds(params
, runtime
, dcreds
, user_lookup_fd
, socket_fd
, fds
, n_fds
);
2749 *exit_status
= EXIT_FDS
;
2750 return log_unit_error_errno(unit
, r
, "Failed to close unwanted file descriptors: %m");
2753 if (!context
->same_pgrp
)
2755 *exit_status
= EXIT_SETSID
;
2756 return log_unit_error_errno(unit
, errno
, "Failed to create new process session: %m");
2759 exec_context_tty_reset(context
, params
);
2761 if (unit_shall_confirm_spawn(unit
)) {
2762 const char *vc
= params
->confirm_spawn
;
2763 _cleanup_free_
char *cmdline
= NULL
;
2765 cmdline
= exec_command_line(argv
);
2767 *exit_status
= EXIT_MEMORY
;
2771 r
= ask_for_confirmation(vc
, unit
, cmdline
);
2772 if (r
!= CONFIRM_EXECUTE
) {
2773 if (r
== CONFIRM_PRETEND_SUCCESS
) {
2774 *exit_status
= EXIT_SUCCESS
;
2777 *exit_status
= EXIT_CONFIRM
;
2778 log_unit_error(unit
, "Execution cancelled by the user");
2783 if (context
->dynamic_user
&& dcreds
) {
2784 _cleanup_strv_free_
char **suggested_paths
= NULL
;
2786 /* Make sure we bypass our own NSS module for any NSS checks */
2787 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2788 *exit_status
= EXIT_USER
;
2789 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
2792 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
2794 *exit_status
= EXIT_MEMORY
;
2798 r
= dynamic_creds_realize(dcreds
, suggested_paths
, &uid
, &gid
);
2800 *exit_status
= EXIT_USER
;
2801 return log_unit_error_errno(unit
, r
, "Failed to update dynamic user credentials: %m");
2804 if (!uid_is_valid(uid
)) {
2805 *exit_status
= EXIT_USER
;
2806 log_unit_error(unit
, "UID validation failed for \""UID_FMT
"\"", uid
);
2810 if (!gid_is_valid(gid
)) {
2811 *exit_status
= EXIT_USER
;
2812 log_unit_error(unit
, "GID validation failed for \""GID_FMT
"\"", gid
);
2817 username
= dcreds
->user
->name
;
2820 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
2822 *exit_status
= EXIT_USER
;
2823 return log_unit_error_errno(unit
, r
, "Failed to determine user credentials: %m");
2826 r
= get_fixed_group(context
, &groupname
, &gid
);
2828 *exit_status
= EXIT_GROUP
;
2829 return log_unit_error_errno(unit
, r
, "Failed to determine group credentials: %m");
2833 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2834 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
2835 &supplementary_gids
, &ngids
);
2837 *exit_status
= EXIT_GROUP
;
2838 return log_unit_error_errno(unit
, r
, "Failed to determine supplementary groups: %m");
2841 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
2843 *exit_status
= EXIT_USER
;
2844 return log_unit_error_errno(unit
, r
, "Failed to send user credentials to PID1: %m");
2847 user_lookup_fd
= safe_close(user_lookup_fd
);
2849 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
2851 *exit_status
= EXIT_CHDIR
;
2852 return log_unit_error_errno(unit
, r
, "Failed to determine $HOME for user: %m");
2855 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2856 * must be sure to drop O_NONBLOCK */
2858 (void) fd_nonblock(socket_fd
, false);
2860 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
2862 *exit_status
= EXIT_STDIN
;
2863 return log_unit_error_errno(unit
, r
, "Failed to set up standard input: %m");
2866 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2868 *exit_status
= EXIT_STDOUT
;
2869 return log_unit_error_errno(unit
, r
, "Failed to set up standard output: %m");
2872 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2874 *exit_status
= EXIT_STDERR
;
2875 return log_unit_error_errno(unit
, r
, "Failed to set up standard error output: %m");
2878 if (params
->cgroup_path
) {
2879 r
= cg_attach_everywhere(params
->cgroup_supported
, params
->cgroup_path
, 0, NULL
, NULL
);
2881 *exit_status
= EXIT_CGROUP
;
2882 return log_unit_error_errno(unit
, r
, "Failed to attach to cgroup %s: %m", params
->cgroup_path
);
2886 if (context
->oom_score_adjust_set
) {
2887 char t
[DECIMAL_STR_MAX(context
->oom_score_adjust
)];
2889 /* When we can't make this change due to EPERM, then
2890 * let's silently skip over it. User namespaces
2891 * prohibit write access to this file, and we
2892 * shouldn't trip up over that. */
2894 sprintf(t
, "%i", context
->oom_score_adjust
);
2895 r
= write_string_file("/proc/self/oom_score_adj", t
, 0);
2896 if (IN_SET(r
, -EPERM
, -EACCES
))
2897 log_unit_debug_errno(unit
, r
, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2899 *exit_status
= EXIT_OOM_ADJUST
;
2900 return log_unit_error_errno(unit
, r
, "Failed to adjust OOM setting: %m");
2904 if (context
->nice_set
)
2905 if (setpriority(PRIO_PROCESS
, 0, context
->nice
) < 0) {
2906 *exit_status
= EXIT_NICE
;
2907 return log_unit_error_errno(unit
, errno
, "Failed to set up process scheduling priority (nice level): %m");
2910 if (context
->cpu_sched_set
) {
2911 struct sched_param param
= {
2912 .sched_priority
= context
->cpu_sched_priority
,
2915 r
= sched_setscheduler(0,
2916 context
->cpu_sched_policy
|
2917 (context
->cpu_sched_reset_on_fork
?
2918 SCHED_RESET_ON_FORK
: 0),
2921 *exit_status
= EXIT_SETSCHEDULER
;
2922 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU scheduling: %m");
2926 if (context
->cpuset
)
2927 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context
->cpuset_ncpus
), context
->cpuset
) < 0) {
2928 *exit_status
= EXIT_CPUAFFINITY
;
2929 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU affinity: %m");
2932 if (context
->ioprio_set
)
2933 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
2934 *exit_status
= EXIT_IOPRIO
;
2935 return log_unit_error_errno(unit
, errno
, "Failed to set up IO scheduling priority: %m");
2938 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
2939 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
2940 *exit_status
= EXIT_TIMERSLACK
;
2941 return log_unit_error_errno(unit
, errno
, "Failed to set up timer slack: %m");
2944 if (context
->personality
!= PERSONALITY_INVALID
) {
2945 r
= safe_personality(context
->personality
);
2947 *exit_status
= EXIT_PERSONALITY
;
2948 return log_unit_error_errno(unit
, r
, "Failed to set up execution domain (personality): %m");
2952 if (context
->utmp_id
)
2953 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
2955 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
2956 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
2960 if (context
->user
) {
2961 r
= chown_terminal(STDIN_FILENO
, uid
);
2963 *exit_status
= EXIT_STDIN
;
2964 return log_unit_error_errno(unit
, r
, "Failed to change ownership of terminal: %m");
2968 /* If delegation is enabled we'll pass ownership of the cgroup
2969 * (but only in systemd's own controller hierarchy!) to the
2970 * user of the new process. */
2971 if (params
->cgroup_path
&& context
->user
&& (params
->flags
& EXEC_CGROUP_DELEGATE
)) {
2972 r
= cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, 0644, uid
, gid
);
2974 *exit_status
= EXIT_CGROUP
;
2975 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
2978 r
= cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, 0755, uid
, gid
);
2980 *exit_status
= EXIT_CGROUP
;
2981 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
2985 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
2986 r
= setup_exec_directory(context
, params
, uid
, gid
, dt
, exit_status
);
2988 return log_unit_error_errno(unit
, r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
2991 r
= build_environment(
3003 *exit_status
= EXIT_MEMORY
;
3007 r
= build_pass_environment(context
, &pass_env
);
3009 *exit_status
= EXIT_MEMORY
;
3013 accum_env
= strv_env_merge(5,
3014 params
->environment
,
3017 context
->environment
,
3021 *exit_status
= EXIT_MEMORY
;
3024 accum_env
= strv_env_clean(accum_env
);
3026 (void) umask(context
->umask
);
3028 r
= setup_keyring(unit
, context
, params
, uid
, gid
);
3030 *exit_status
= EXIT_KEYRING
;
3031 return log_unit_error_errno(unit
, r
, "Failed to set up kernel keyring: %m");
3034 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3035 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
3037 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3038 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
3040 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3041 if (needs_ambient_hack
)
3042 needs_setuid
= false;
3044 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
3046 if (needs_sandboxing
) {
3047 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3048 * present. The actual MAC context application will happen later, as late as possible, to avoid
3049 * impacting our own code paths. */
3052 use_selinux
= mac_selinux_use();
3055 use_smack
= mac_smack_use();
3058 use_apparmor
= mac_apparmor_use();
3063 if (context
->pam_name
&& username
) {
3064 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
3066 *exit_status
= EXIT_PAM
;
3067 return log_unit_error_errno(unit
, r
, "Failed to set up PAM session: %m");
3072 if (context
->private_network
&& runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
3073 r
= setup_netns(runtime
->netns_storage_socket
);
3075 *exit_status
= EXIT_NETWORK
;
3076 return log_unit_error_errno(unit
, r
, "Failed to set up network namespacing: %m");
3080 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
3081 if (needs_mount_namespace
) {
3082 r
= apply_mount_namespace(unit
, command
, context
, params
, runtime
);
3084 *exit_status
= EXIT_NAMESPACE
;
3085 return log_unit_error_errno(unit
, r
, "Failed to set up mount namespacing: %m");
3089 /* Apply just after mount namespace setup */
3090 r
= apply_working_directory(context
, params
, home
, needs_mount_namespace
, exit_status
);
3092 return log_unit_error_errno(unit
, r
, "Changing to the requested working directory failed: %m");
3094 /* Drop groups as early as possible */
3096 r
= enforce_groups(context
, gid
, supplementary_gids
, ngids
);
3098 *exit_status
= EXIT_GROUP
;
3099 return log_unit_error_errno(unit
, r
, "Changing group credentials failed: %m");
3103 if (needs_sandboxing
) {
3105 if (use_selinux
&& params
->selinux_context_net
&& socket_fd
>= 0) {
3106 r
= mac_selinux_get_child_mls_label(socket_fd
, command
->path
, context
->selinux_context
, &mac_selinux_context_net
);
3108 *exit_status
= EXIT_SELINUX_CONTEXT
;
3109 return log_unit_error_errno(unit
, r
, "Failed to determine SELinux context: %m");
3114 if (context
->private_users
) {
3115 r
= setup_private_users(uid
, gid
);
3117 *exit_status
= EXIT_USER
;
3118 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing: %m");
3123 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3124 * more aggressive this time since we don't need socket_fd and the netns fds anymore. The custom endpoint fd
3125 * was needed to upload the policy and can now be closed as well. */
3126 r
= close_all_fds(fds
, n_fds
);
3128 r
= shift_fds(fds
, n_fds
);
3130 r
= flags_fds(fds
, n_storage_fds
, n_socket_fds
, context
->non_blocking
);
3132 *exit_status
= EXIT_FDS
;
3133 return log_unit_error_errno(unit
, r
, "Failed to adjust passed file descriptors: %m");
3136 secure_bits
= context
->secure_bits
;
3138 if (needs_sandboxing
) {
3141 for (i
= 0; i
< _RLIMIT_MAX
; i
++) {
3143 if (!context
->rlimit
[i
])
3146 r
= setrlimit_closest(i
, context
->rlimit
[i
]);
3148 *exit_status
= EXIT_LIMITS
;
3149 return log_unit_error_errno(unit
, r
, "Failed to adjust resource limit %s: %m", rlimit_to_string(i
));
3153 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3154 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
3155 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
3156 *exit_status
= EXIT_LIMITS
;
3157 return log_unit_error_errno(unit
, errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3161 bset
= context
->capability_bounding_set
;
3162 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3163 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3164 * instead of us doing that */
3165 if (needs_ambient_hack
)
3166 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
3167 (UINT64_C(1) << CAP_SETUID
) |
3168 (UINT64_C(1) << CAP_SETGID
);
3170 if (!cap_test_all(bset
)) {
3171 r
= capability_bounding_set_drop(bset
, false);
3173 *exit_status
= EXIT_CAPABILITIES
;
3174 return log_unit_error_errno(unit
, r
, "Failed to drop capabilities: %m");
3178 /* This is done before enforce_user, but ambient set
3179 * does not survive over setresuid() if keep_caps is not set. */
3180 if (!needs_ambient_hack
&&
3181 context
->capability_ambient_set
!= 0) {
3182 r
= capability_ambient_set_apply(context
->capability_ambient_set
, true);
3184 *exit_status
= EXIT_CAPABILITIES
;
3185 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (before UID change): %m");
3191 if (context
->user
) {
3192 r
= enforce_user(context
, uid
);
3194 *exit_status
= EXIT_USER
;
3195 return log_unit_error_errno(unit
, r
, "Failed to change UID to " UID_FMT
": %m", uid
);
3198 if (!needs_ambient_hack
&&
3199 context
->capability_ambient_set
!= 0) {
3201 /* Fix the ambient capabilities after user change. */
3202 r
= capability_ambient_set_apply(context
->capability_ambient_set
, false);
3204 *exit_status
= EXIT_CAPABILITIES
;
3205 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (after UID change): %m");
3208 /* If we were asked to change user and ambient capabilities
3209 * were requested, we had to add keep-caps to the securebits
3210 * so that we would maintain the inherited capability set
3211 * through the setresuid(). Make sure that the bit is added
3212 * also to the context secure_bits so that we don't try to
3213 * drop the bit away next. */
3215 secure_bits
|= 1<<SECURE_KEEP_CAPS
;
3220 if (needs_sandboxing
) {
3221 /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3222 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3223 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3224 * are restricted. */
3228 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
3231 r
= setexeccon(exec_context
);
3233 *exit_status
= EXIT_SELINUX_CONTEXT
;
3234 return log_unit_error_errno(unit
, r
, "Failed to change SELinux context to %s: %m", exec_context
);
3242 r
= setup_smack(context
, command
);
3244 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
3245 return log_unit_error_errno(unit
, r
, "Failed to set SMACK process label: %m");
3251 if (use_apparmor
&& context
->apparmor_profile
) {
3252 r
= aa_change_onexec(context
->apparmor_profile
);
3253 if (r
< 0 && !context
->apparmor_profile_ignore
) {
3254 *exit_status
= EXIT_APPARMOR_PROFILE
;
3255 return log_unit_error_errno(unit
, errno
, "Failed to prepare AppArmor profile change to %s: %m", context
->apparmor_profile
);
3260 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3261 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3262 if (prctl(PR_GET_SECUREBITS
) != secure_bits
)
3263 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
3264 *exit_status
= EXIT_SECUREBITS
;
3265 return log_unit_error_errno(unit
, errno
, "Failed to set process secure bits: %m");
3268 if (context_has_no_new_privileges(context
))
3269 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
3270 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
3271 return log_unit_error_errno(unit
, errno
, "Failed to disable new privileges: %m");
3275 r
= apply_address_families(unit
, context
);
3277 *exit_status
= EXIT_ADDRESS_FAMILIES
;
3278 return log_unit_error_errno(unit
, r
, "Failed to restrict address families: %m");
3281 r
= apply_memory_deny_write_execute(unit
, context
);
3283 *exit_status
= EXIT_SECCOMP
;
3284 return log_unit_error_errno(unit
, r
, "Failed to disable writing to executable memory: %m");
3287 r
= apply_restrict_realtime(unit
, context
);
3289 *exit_status
= EXIT_SECCOMP
;
3290 return log_unit_error_errno(unit
, r
, "Failed to apply realtime restrictions: %m");
3293 r
= apply_restrict_namespaces(unit
, context
);
3295 *exit_status
= EXIT_SECCOMP
;
3296 return log_unit_error_errno(unit
, r
, "Failed to apply namespace restrictions: %m");
3299 r
= apply_protect_sysctl(unit
, context
);
3301 *exit_status
= EXIT_SECCOMP
;
3302 return log_unit_error_errno(unit
, r
, "Failed to apply sysctl restrictions: %m");
3305 r
= apply_protect_kernel_modules(unit
, context
);
3307 *exit_status
= EXIT_SECCOMP
;
3308 return log_unit_error_errno(unit
, r
, "Failed to apply module loading restrictions: %m");
3311 r
= apply_private_devices(unit
, context
);
3313 *exit_status
= EXIT_SECCOMP
;
3314 return log_unit_error_errno(unit
, r
, "Failed to set up private devices: %m");
3317 r
= apply_syscall_archs(unit
, context
);
3319 *exit_status
= EXIT_SECCOMP
;
3320 return log_unit_error_errno(unit
, r
, "Failed to apply syscall architecture restrictions: %m");
3323 r
= apply_lock_personality(unit
, context
);
3325 *exit_status
= EXIT_SECCOMP
;
3326 return log_unit_error_errno(unit
, r
, "Failed to lock personalities: %m");
3329 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3330 * by the filter as little as possible. */
3331 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
3333 *exit_status
= EXIT_SECCOMP
;
3334 return log_unit_error_errno(unit
, r
, "Failed to apply system call filters: %m");
3339 if (!strv_isempty(context
->unset_environment
)) {
3342 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
3344 *exit_status
= EXIT_MEMORY
;
3348 strv_free(accum_env
);
3352 final_argv
= replace_env_argv(argv
, accum_env
);
3354 *exit_status
= EXIT_MEMORY
;
3358 if (_unlikely_(log_get_max_level() >= LOG_DEBUG
)) {
3359 _cleanup_free_
char *line
;
3361 line
= exec_command_line(final_argv
);
3363 log_struct(LOG_DEBUG
,
3364 "EXECUTABLE=%s", command
->path
,
3365 LOG_UNIT_MESSAGE(unit
, "Executing: %s", line
),
3367 LOG_UNIT_INVOCATION_ID(unit
),
3372 execve(command
->path
, final_argv
, accum_env
);
3374 if (errno
== ENOENT
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
)) {
3376 log_struct_errno(LOG_INFO
, errno
,
3377 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3379 LOG_UNIT_INVOCATION_ID(unit
),
3380 LOG_UNIT_MESSAGE(unit
, "Executable %s missing, skipping: %m",
3382 "EXECUTABLE=%s", command
->path
,
3388 *exit_status
= EXIT_EXEC
;
3389 return log_unit_error_errno(unit
, errno
, "Failed to execute command: %m");
/* Spawns the given command for a unit: validates the socket fd setup, resolves named stdio fds, loads the
 * environment files, logs the command line, forks, runs exec_child() and, after a successful fork, logs the new
 * PID, attaches it to the unit's cgroup (if params->cgroup_path is set) and records the start through
 * exec_status_start().
 *
 * NOTE(review): this listing has gaps (see the embedded numbering): the remaining parameters, the fork() call,
 * the conditions guarding the error returns and the final return are not visible here. */
3392 int exec_spawn(Unit
*unit
,
3393 ExecCommand
*command
,
3394 const ExecContext
*context
,
3395 const ExecParameters
*params
,
3396 ExecRuntime
*runtime
,
3397 DynamicCreds
*dcreds
,
3400 _cleanup_strv_free_
char **files_env
= NULL
;
3402 unsigned n_storage_fds
= 0, n_socket_fds
= 0;
3403 _cleanup_free_
char *line
= NULL
;
/* One slot per standard stream; -1 marks "no named fd resolved for this stream". */
3405 int named_iofds
[3] = { -1, -1, -1 };
/* An fd array must be supplied whenever any storage or socket fds are announced. */
3414 assert(params
->fds
|| (params
->n_storage_fds
+ params
->n_socket_fds
<= 0));
/* If any standard stream is bound to a socket, the socket count is checked: more than one socket and no socket
 * at all are both logged as errors. The socket fd is then taken from params->fds[0]. */
3416 if (context
->std_input
== EXEC_INPUT_SOCKET
||
3417 context
->std_output
== EXEC_OUTPUT_SOCKET
||
3418 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
3420 if (params
->n_socket_fds
> 1) {
3421 log_unit_error(unit
, "Got more than one socket.");
3425 if (params
->n_socket_fds
== 0) {
3426 log_unit_error(unit
, "Got no socket.");
3430 socket_fd
= params
->fds
[0];
3434 n_storage_fds
= params
->n_storage_fds
;
3435 n_socket_fds
= params
->n_socket_fds
;
3438 r
= exec_context_named_iofds(unit
, context
, params
, named_iofds
);
3440 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
3442 r
= exec_context_load_environment(unit
, context
, &files_env
);
3444 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
/* An argv supplied through params takes precedence over the command's own argv. */
3446 argv
= params
->argv
?: command
->argv
;
3447 line
= exec_command_line(argv
);
3451 log_struct(LOG_DEBUG
,
3452 LOG_UNIT_MESSAGE(unit
, "About to execute: %s", line
),
3453 "EXECUTABLE=%s", command
->path
,
3455 LOG_UNIT_INVOCATION_ID(unit
),
3460 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
/* NOTE(review): presumably the child side of the fork (the enclosing condition is not visible in this listing).
 * exit_status names the step reported in the "Failed at step ..." message below. */
3463 int exit_status
= EXIT_SUCCESS
;
3465 r
= exec_child(unit
,
3478 unit
->manager
->user_lookup_fds
[1],
3482 log_struct_errno(LOG_ERR
, r
,
3483 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3485 LOG_UNIT_INVOCATION_ID(unit
),
3486 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
3487 exit_status_to_string(exit_status
, EXIT_STATUS_SYSTEMD
),
3489 "EXECUTABLE=%s", command
->path
,
3496 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
3498 /* We add the new process to the cgroup both in the child (so
3499 * that we can be sure that no user code is ever executed
3500 * outside of the cgroup) and in the parent (so that we can be
3501 * sure that when we kill the cgroup the process will be
3503 if (params
->cgroup_path
)
3504 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, pid
);
3506 exec_status_start(&command
->exec_status
, pid
);
/* Fills an ExecContext with its default settings: I/O priority class IOPRIO_CLASS_BE at level 0, SCHED_OTHER
 * CPU scheduling, syslog priority LOG_DAEMON|LOG_INFO with level prefixes enabled, SIGPIPE ignored, mode 0755
 * for every exec directory type, capability bounding set CAP_ALL and restrict_namespaces NAMESPACE_FLAGS_ALL.
 * timer_slack_nsec and personality get the NSEC_INFINITY / PERSONALITY_INVALID markers, which the spawn code
 * earlier in this file treats as "do not apply".
 *
 * NOTE(review): this listing has gaps between the declaration and the first visible assignment; further
 * defaults may be set there. */
3512 void exec_context_init(ExecContext
*c
) {
3513 ExecDirectoryType i
;
3518 c
->ioprio
= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 0);
3519 c
->cpu_sched_policy
= SCHED_OTHER
;
3520 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
3521 c
->syslog_level_prefix
= true;
3522 c
->ignore_sigpipe
= true;
3523 c
->timer_slack_nsec
= NSEC_INFINITY
;
3524 c
->personality
= PERSONALITY_INVALID
;
3525 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3526 c
->directories
[i
].mode
= 0755;
3527 c
->capability_bounding_set
= CAP_ALL
;
3528 c
->restrict_namespaces
= NAMESPACE_FLAGS_ALL
;
/* Releases the resources owned by an ExecContext: environment-related string vectors, resource limits, stdio fd
 * names, path and identity strings, supplementary groups, path lists, bind mounts, the CPU set, MAC labels,
 * the syscall/arch/address-family sets and the per-type directory path lists. Each freeing helper's return
 * value is stored back into the field it released (presumably NULL, so the context can be freed again safely -
 * TODO confirm against mfree()/strv_free()/set_free()).
 *
 * NOTE(review): this listing has gaps; some release steps may not be visible here. */
3531 void exec_context_done(ExecContext
*c
) {
3533 ExecDirectoryType i
;
3537 c
->environment
= strv_free(c
->environment
);
3538 c
->environment_files
= strv_free(c
->environment_files
);
3539 c
->pass_environment
= strv_free(c
->pass_environment
);
3540 c
->unset_environment
= strv_free(c
->unset_environment
);
3542 for (l
= 0; l
< ELEMENTSOF(c
->rlimit
); l
++)
3543 c
->rlimit
[l
] = mfree(c
->rlimit
[l
]);
/* One fd name per standard stream (stdin, stdout, stderr). */
3545 for (l
= 0; l
< 3; l
++)
3546 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
3548 c
->working_directory
= mfree(c
->working_directory
);
3549 c
->root_directory
= mfree(c
->root_directory
);
3550 c
->root_image
= mfree(c
->root_image
);
3551 c
->tty_path
= mfree(c
->tty_path
);
3552 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
3553 c
->user
= mfree(c
->user
);
3554 c
->group
= mfree(c
->group
);
3556 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
3558 c
->pam_name
= mfree(c
->pam_name
);
3560 c
->read_only_paths
= strv_free(c
->read_only_paths
);
3561 c
->read_write_paths
= strv_free(c
->read_write_paths
);
3562 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
3564 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
3567 CPU_FREE(c
->cpuset
);
3569 c
->utmp_id
= mfree(c
->utmp_id
);
3570 c
->selinux_context
= mfree(c
->selinux_context
);
3571 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
3572 c
->smack_process_label
= mfree(c
->smack_process_label
);
3574 c
->syscall_filter
= set_free(c
->syscall_filter
);
3575 c
->syscall_archs
= set_free(c
->syscall_archs
);
3576 c
->address_families
= set_free(c
->address_families
);
3578 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3579 c
->directories
[i
].paths
= strv_free(c
->directories
[i
].paths
);
/* Removes the runtime directories configured for the context. For every path listed under
 * EXEC_DIRECTORY_RUNTIME, both "<runtime_prefix>/<path>" and "<runtime_prefix>/private/<path>" are deleted with
 * rm_rf(REMOVE_ROOT); rm_rf() failures are deliberately ignored.
 *
 * NOTE(review): this listing has gaps; what is returned for a missing runtime_prefix, how a failed strjoin() is
 * handled and the final return value are not visible here. */
3582 int exec_context_destroy_runtime_directory(ExecContext
*c
, const char *runtime_prefix
) {
3587 if (!runtime_prefix
)
3590 STRV_FOREACH(i
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) {
3591 _cleanup_free_
char *p
;
3593 p
= strjoin(runtime_prefix
, "/", *i
);
3597 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3599 (void) rm_rf(p
, REMOVE_ROOT
);
3601 /* Also destroy any matching subdirectory below /private/. This is done to support DynamicUser=1
3602 * setups. Note that we don't conditionalize here on that though, as the namespace is the same either way, and it
3603 * makes us a bit more robust towards changing unit settings. Or to say this differently: in the worst
3604 * case this is a NOP. */
3607 p
= strjoin(runtime_prefix
, "/private/", *i
);
3611 (void) rm_rf(p
, REMOVE_ROOT
);
/* Releases the path string and the argv vector of a single ExecCommand, storing each freeing helper's return
 * value back into the field. The ExecCommand structure itself is not freed by the visible code. */
3617 void exec_command_done(ExecCommand
*c
) {
3620 c
->path
= mfree(c
->path
);
3622 c
->argv
= strv_free(c
->argv
);
/* Calls exec_command_done() on each of the n consecutive ExecCommand entries starting at c. */
3625 void exec_command_done_array(ExecCommand
*c
, unsigned n
) {
3628 for (i
= 0; i
< n
; i
++)
3629 exec_command_done(c
+i
);
/* Tears down a list of commands headed by c: each visited entry i is unlinked with LIST_REMOVE() and its
 * contents are released with exec_command_done().
 *
 * NOTE(review): this listing has gaps; the loop construct, the release of the entry itself and the returned
 * pointer are not visible here. exec_command_free_array() stores the return value back into the list head. */
3632 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
3636 LIST_REMOVE(command
, c
, i
);
3637 exec_command_done(i
);
/* Runs exec_command_free_list() on each of the n list heads in c and stores the result back into the same
 * slot. */
3644 void exec_command_free_array(ExecCommand
**c
, unsigned n
) {
3647 for (i
= 0; i
< n
; i
++)
3648 c
[i
] = exec_command_free_list(c
[i
]);
/* Context handed to invalid_env() as its userdata; that callback reads the `unit` and `path` members.
 * NOTE(review): the member declarations are not visible in this listing. */
3651 typedef struct InvalidEnvInfo
{
/* Callback for reporting an invalid environment assignment: logs the offending assignment p at error level
 * against info->unit, followed by info->path. userdata must point to an InvalidEnvInfo. */
3656 static void invalid_env(const char *p
, void *userdata
) {
3657 InvalidEnvInfo
*info
= userdata
;
3659 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
/* Returns the fd name to look up for one of the standard streams: the configured stdio_fdname entry for that
 * stream or, when none is configured, the default "stdin", "stdout" or "stderr". Each return is preceded by a
 * check on whether the stream is set to EXEC_INPUT_NAMED_FD / EXEC_OUTPUT_NAMED_FD.
 *
 * NOTE(review): this listing has gaps; the dispatch on fd_index and the value returned when a stream is not
 * configured as a named fd are not visible here (the caller exec_context_named_iofds() tests the result for
 * being non-NULL). */
3662 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
3667 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
3669 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
3671 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
3673 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
3675 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
3677 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
/* Resolves the named file descriptors requested for stdin, stdout and stderr.
 *
 * `targets` counts how many of the three streams are configured as named fds. The fds passed in p (storage
 * fds plus socket fds) are then scanned, and for each stream that is still unresolved (named_iofds[x] < 0) and
 * configured as a named fd, the first fd whose entry in p->fd_names equals the wanted name is stored in
 * named_iofds[x].
 *
 * Returns 0 if `targets` is 0 at the end, -ENOENT otherwise.
 *
 * NOTE(review): this listing has gaps; presumably `targets` is decremented on every match, but that statement
 * is not visible here. */
3683 int exec_context_named_iofds(Unit
*unit
, const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[3]) {
3684 unsigned i
, targets
;
3685 const char* stdio_fdname
[3];
/* Each comparison yields 0 or 1, so the sum is the number of streams that need a named fd. */
3691 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
3692 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
3693 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
3695 for (i
= 0; i
< 3; i
++)
3696 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
3698 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
/* Stop scanning as soon as all fds were looked at or no stream is left to resolve. */
3700 for (i
= 0; i
< n_fds
&& targets
> 0; i
++)
3701 if (named_iofds
[STDIN_FILENO
] < 0 &&
3702 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
3703 stdio_fdname
[STDIN_FILENO
] &&
3704 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
3706 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
3709 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
3710 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
3711 stdio_fdname
[STDOUT_FILENO
] &&
3712 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
3714 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
3717 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
3718 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
3719 stdio_fdname
[STDERR_FILENO
] &&
3720 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
3722 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
3726 return targets
== 0 ? 0 : -ENOENT
;
/* Loads the environment files listed in c->environment_files. Each entry is expanded with safe_glob(); every
 * matching file is parsed with load_env_file(), cleaned with strv_env_clean_with_callback() - invalid
 * assignments are reported through invalid_env() together with the file's path - and merged into the
 * accumulated vector r with strv_env_merge().
 *
 * NOTE(review): this listing has gaps; the derivation of `fn`, the use of the `ignore` flag, the handling of
 * non-absolute paths and of errors, and the hand-over of the result through l are not visible here. */
3729 int exec_context_load_environment(Unit
*unit
, const ExecContext
*c
, char ***l
) {
3730 char **i
, **r
= NULL
;
3735 STRV_FOREACH(i
, c
->environment_files
) {
3739 bool ignore
= false;
3741 _cleanup_globfree_ glob_t pglob
= {};
3750 if (!path_is_absolute(fn
)) {
3758 /* Filename supports globbing, take all matching files */
3759 k
= safe_glob(fn
, 0, &pglob
);
3768 /* When we don't match anything, -ENOENT should be returned */
3769 assert(pglob
.gl_pathc
> 0);
3771 for (n
= 0; n
< pglob
.gl_pathc
; n
++) {
3772 k
= load_env_file(NULL
, pglob
.gl_pathv
[n
], NULL
, &p
);
3780 /* Log invalid environment variables with filename */
3782 InvalidEnvInfo info
= {
3784 .path
= pglob
.gl_pathv
[n
]
3787 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
3795 m
= strv_env_merge(2, r
, p
);
/* Tells whether the given tty may be the same device as the system console. The /dev prefix is stripped from
 * the tty name first; the name is then compared with "console" and with the device returned by
 * resolve_dev_console(). A console that resolves to "tty0" (the active VC) is taken to match any tty for which
 * tty_is_vc() is true.
 *
 * NOTE(review): this listing has gaps; the early returns for the "console" case and for a failed resolution
 * are not visible here (the existing comment says an unresolvable console is assumed to possibly match). */
3811 static bool tty_may_match_dev_console(const char *tty
) {
3812 _cleanup_free_
char *active
= NULL
;
3818 tty
= skip_dev_prefix(tty
);
3820 /* trivial identity? */
3821 if (streq(tty
, "console"))
3824 console
= resolve_dev_console(&active
);
3825 /* if we could not resolve, assume it may */
3829 /* "tty0" means the active VC, so it may be the same sometimes */
3830 return streq(console
, tty
) || (streq(console
, "tty0") && tty_is_vc(tty
));
/* Returns true if the context both uses a terminal in some way (TTY reset, VT disallocation, or a terminal
 * configured as stdin, stdout or stderr) and its tty path may refer to the console, as decided by
 * tty_may_match_dev_console().
 *
 * NOTE(review): this listing has gaps; the condition may contain further terms not visible here. */
3833 bool exec_context_may_touch_console(ExecContext
*ec
) {
3835 return (ec
->tty_reset
||
3837 ec
->tty_vt_disallocate
||
3838 is_terminal_input(ec
->std_input
) ||
3839 is_terminal_output(ec
->std_output
) ||
3840 is_terminal_output(ec
->std_error
)) &&
3841 tty_may_match_dev_console(exec_context_tty_path(ec
));
/* Writes strings from the vector l to f, each preceded by a single space.
 * NOTE(review): the loop over l is not visible in this listing; only the per-element fprintf() is. */
3844 static void strv_fprintf(FILE *f
, char **l
) {
3850 fprintf(f
, " %s", *g
);
3853 void exec_context_dump(ExecContext
*c
, FILE* f
, const char *prefix
) {
3856 ExecDirectoryType dt
;
3862 prefix
= strempty(prefix
);
3866 "%sWorkingDirectory: %s\n"
3867 "%sRootDirectory: %s\n"
3868 "%sNonBlocking: %s\n"
3869 "%sPrivateTmp: %s\n"
3870 "%sPrivateDevices: %s\n"
3871 "%sProtectKernelTunables: %s\n"
3872 "%sProtectKernelModules: %s\n"
3873 "%sProtectControlGroups: %s\n"
3874 "%sPrivateNetwork: %s\n"
3875 "%sPrivateUsers: %s\n"
3876 "%sProtectHome: %s\n"
3877 "%sProtectSystem: %s\n"
3878 "%sMountAPIVFS: %s\n"
3879 "%sIgnoreSIGPIPE: %s\n"
3880 "%sMemoryDenyWriteExecute: %s\n"
3881 "%sRestrictRealtime: %s\n"
3882 "%sKeyringMode: %s\n",
3884 prefix
, c
->working_directory
? c
->working_directory
: "/",
3885 prefix
, c
->root_directory
? c
->root_directory
: "/",
3886 prefix
, yes_no(c
->non_blocking
),
3887 prefix
, yes_no(c
->private_tmp
),
3888 prefix
, yes_no(c
->private_devices
),
3889 prefix
, yes_no(c
->protect_kernel_tunables
),
3890 prefix
, yes_no(c
->protect_kernel_modules
),
3891 prefix
, yes_no(c
->protect_control_groups
),
3892 prefix
, yes_no(c
->private_network
),
3893 prefix
, yes_no(c
->private_users
),
3894 prefix
, protect_home_to_string(c
->protect_home
),
3895 prefix
, protect_system_to_string(c
->protect_system
),
3896 prefix
, yes_no(c
->mount_apivfs
),
3897 prefix
, yes_no(c
->ignore_sigpipe
),
3898 prefix
, yes_no(c
->memory_deny_write_execute
),
3899 prefix
, yes_no(c
->restrict_realtime
),
3900 prefix
, exec_keyring_mode_to_string(c
->keyring_mode
));
3903 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
3905 STRV_FOREACH(e
, c
->environment
)
3906 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
3908 STRV_FOREACH(e
, c
->environment_files
)
3909 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
3911 STRV_FOREACH(e
, c
->pass_environment
)
3912 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
3914 STRV_FOREACH(e
, c
->unset_environment
)
3915 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
3917 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
3919 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
3920 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
3922 STRV_FOREACH(d
, c
->directories
[dt
].paths
)
3923 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), *d
);
3931 if (c
->oom_score_adjust_set
)
3933 "%sOOMScoreAdjust: %i\n",
3934 prefix
, c
->oom_score_adjust
);
3936 for (i
= 0; i
< RLIM_NLIMITS
; i
++)
3938 fprintf(f
, "%s%s: " RLIM_FMT
"\n",
3939 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
3940 fprintf(f
, "%s%sSoft: " RLIM_FMT
"\n",
3941 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
3944 if (c
->ioprio_set
) {
3945 _cleanup_free_
char *class_str
= NULL
;
3947 r
= ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c
->ioprio
), &class_str
);
3949 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
3951 fprintf(f
, "%sIOPriority: %lu\n", prefix
, IOPRIO_PRIO_DATA(c
->ioprio
));
3954 if (c
->cpu_sched_set
) {
3955 _cleanup_free_
char *policy_str
= NULL
;
3957 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
3959 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
3962 "%sCPUSchedulingPriority: %i\n"
3963 "%sCPUSchedulingResetOnFork: %s\n",
3964 prefix
, c
->cpu_sched_priority
,
3965 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
3969 fprintf(f
, "%sCPUAffinity:", prefix
);
3970 for (i
= 0; i
< c
->cpuset_ncpus
; i
++)
3971 if (CPU_ISSET_S(i
, CPU_ALLOC_SIZE(c
->cpuset_ncpus
), c
->cpuset
))
3972 fprintf(f
, " %u", i
);
3976 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
3977 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
3980 "%sStandardInput: %s\n"
3981 "%sStandardOutput: %s\n"
3982 "%sStandardError: %s\n",
3983 prefix
, exec_input_to_string(c
->std_input
),
3984 prefix
, exec_output_to_string(c
->std_output
),
3985 prefix
, exec_output_to_string(c
->std_error
));
3991 "%sTTYVHangup: %s\n"
3992 "%sTTYVTDisallocate: %s\n",
3993 prefix
, c
->tty_path
,
3994 prefix
, yes_no(c
->tty_reset
),
3995 prefix
, yes_no(c
->tty_vhangup
),
3996 prefix
, yes_no(c
->tty_vt_disallocate
));
3998 if (IN_SET(c
->std_output
,
4001 EXEC_OUTPUT_JOURNAL
,
4002 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
4003 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
4004 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
4005 IN_SET(c
->std_error
,
4008 EXEC_OUTPUT_JOURNAL
,
4009 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
4010 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
4011 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
4013 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
4015 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
4017 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
4019 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
4021 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
4024 if (c
->secure_bits
) {
4025 _cleanup_free_
char *str
= NULL
;
4027 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
4029 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
4032 if (c
->capability_bounding_set
!= CAP_ALL
) {
4033 _cleanup_free_
char *str
= NULL
;
4035 r
= capability_set_to_string_alloc(c
->capability_bounding_set
, &str
);
4037 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
4040 if (c
->capability_ambient_set
!= 0) {
4041 _cleanup_free_
char *str
= NULL
;
4043 r
= capability_set_to_string_alloc(c
->capability_ambient_set
, &str
);
4045 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
4049 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
4051 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
4053 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
4055 if (!strv_isempty(c
->supplementary_groups
)) {
4056 fprintf(f
, "%sSupplementaryGroups:", prefix
);
4057 strv_fprintf(f
, c
->supplementary_groups
);
4062 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
4064 if (strv_length(c
->read_write_paths
) > 0) {
4065 fprintf(f
, "%sReadWritePaths:", prefix
);
4066 strv_fprintf(f
, c
->read_write_paths
);
4070 if (strv_length(c
->read_only_paths
) > 0) {
4071 fprintf(f
, "%sReadOnlyPaths:", prefix
);
4072 strv_fprintf(f
, c
->read_only_paths
);
4076 if (strv_length(c
->inaccessible_paths
) > 0) {
4077 fprintf(f
, "%sInaccessiblePaths:", prefix
);
4078 strv_fprintf(f
, c
->inaccessible_paths
);
4082 if (c
->n_bind_mounts
> 0)
4083 for (i
= 0; i
< c
->n_bind_mounts
; i
++) {
4084 fprintf(f
, "%s%s: %s:%s:%s\n", prefix
,
4085 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
4086 c
->bind_mounts
[i
].source
,
4087 c
->bind_mounts
[i
].destination
,
4088 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
4093 "%sUtmpIdentifier: %s\n",
4094 prefix
, c
->utmp_id
);
4096 if (c
->selinux_context
)
4098 "%sSELinuxContext: %s%s\n",
4099 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
4101 if (c
->apparmor_profile
)
4103 "%sAppArmorProfile: %s%s\n",
4104 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
4106 if (c
->smack_process_label
)
4108 "%sSmackProcessLabel: %s%s\n",
4109 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
4111 if (c
->personality
!= PERSONALITY_INVALID
)
4113 "%sPersonality: %s\n",
4114 prefix
, strna(personality_to_string(c
->personality
)));
4117 "%sLockPersonality: %s\n",
4118 prefix
, yes_no(c
->lock_personality
));
4120 if (c
->syscall_filter
) {
4128 "%sSystemCallFilter: ",
4131 if (!c
->syscall_whitelist
)
4135 SET_FOREACH(id
, c
->syscall_filter
, j
) {
4136 _cleanup_free_
char *name
= NULL
;
4143 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
4144 fputs(strna(name
), f
);
4151 if (c
->syscall_archs
) {
4158 "%sSystemCallArchitectures:",
4162 SET_FOREACH(id
, c
->syscall_archs
, j
)
4163 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
4168 if (exec_context_restrict_namespaces_set(c
)) {
4169 _cleanup_free_
char *s
= NULL
;
4171 r
= namespace_flag_to_string_many(c
->restrict_namespaces
, &s
);
4173 fprintf(f
, "%sRestrictNamespaces: %s\n",
4177 if (c
->syscall_errno
> 0)
4179 "%sSystemCallErrorNumber: %s\n",
4180 prefix
, strna(errno_to_name(c
->syscall_errno
)));
4182 if (c
->apparmor_profile
)
4184 "%sAppArmorProfile: %s%s\n",
4185 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
/* Predicate on an ExecContext describing whether a forked-off process keeps
 * its privileges (see the original comment below for the intended contract).
 * NOTE(review): only the comparison of c->user against "root" and "0" is
 * visible in this extract; the other branches and all return statements are
 * missing, so the exact result for an unset user cannot be confirmed here. */
4188 bool exec_context_maintains_privileges(ExecContext
*c
) {
4191 /* Returns true if the process forked off would run under
4192 * an unchanged UID or as root. */
/* The user is matched both by name ("root") and by numeric string ("0"). */
4197 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
/* Returns an I/O priority value for the given context.
 * The visible lines read the calling process' own priority with
 * ioprio_get(IOPRIO_WHO_PROCESS, 0) and, on some path, return the value built
 * by IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4).
 * NOTE(review): the conditions that choose between these (and any use of a
 * priority configured in *c) are not visible in this extract - confirm
 * against the full source. */
4203 int exec_context_get_effective_ioprio(ExecContext
*c
) {
4211 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
4213 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 4);
/* Marks the start of a process in the ExecStatus record *s by taking the
 * current time into s->start_timestamp.
 * NOTE(review): how the pid argument is stored, and whether *s is reset
 * first, is not visible in this extract. */
4218 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
4223 dual_timestamp_get(&s
->start_timestamp
);
/* Records the termination of process pid in the ExecStatus record *s.
 *
 * s       - status record to update; its exit_timestamp is set to now
 * context - execution context of the process; used for the utmp and TTY
 *           handling below
 * pid     - PID of the process that exited
 * code    - passed on to utmp_put_dead_process() together with status
 * status  - see code
 *
 * NOTE(review): several statements (the body of the pid-mismatch check, the
 * stores into *s, any NULL check on context) are missing from this extract. */
4226 void exec_status_exit(ExecStatus
*s
, ExecContext
*context
, pid_t pid
, int code
, int status
) {
/* Detects a record that already carries a different, non-zero PID; the action
 * taken in that case is not visible here. */
4229 if (s
->pid
&& s
->pid
!= pid
)
4233 dual_timestamp_get(&s
->exit_timestamp
);
/* Only when a utmp identifier is configured is the dead process reported to
 * utmp. */
4239 if (context
->utmp_id
)
4240 utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
4242 exec_context_tty_reset(context
, NULL
);
/* Writes a human-readable description of the ExecStatus record *s to f.
 * Every output line is prefixed with prefix (a NULL prefix is turned into ""
 * by strempty()). The visible format strings cover the PID, the start
 * timestamp (only if set) and, if the exit timestamp is set, the exit
 * timestamp, the exit code string and the exit status.
 * NOTE(review): the output calls themselves and some arguments are missing
 * from this extract; only their format strings and argument lists remain. */
4246 void exec_status_dump(ExecStatus
*s
, FILE *f
, const char *prefix
) {
/* Scratch buffer reused for both format_timestamp() calls below. */
4247 char buf
[FORMAT_TIMESTAMP_MAX
];
4255 prefix
= strempty(prefix
);
4258 "%sPID: "PID_FMT
"\n",
4261 if (dual_timestamp_is_set(&s
->start_timestamp
))
4263 "%sStart Timestamp: %s\n",
4264 prefix
, format_timestamp(buf
, sizeof(buf
), s
->start_timestamp
.realtime
));
4266 if (dual_timestamp_is_set(&s
->exit_timestamp
))
4268 "%sExit Timestamp: %s\n"
4270 "%sExit Status: %i\n",
4271 prefix
, format_timestamp(buf
, sizeof(buf
), s
->exit_timestamp
.realtime
),
4272 prefix
, sigchld_code_to_string(s
->code
),
/* Builds a single string from the argument vector argv and returns it as
 * char*. The vector is walked twice with STRV_FOREACH, and arguments
 * containing a WHITESPACE character are special-cased.
 * NOTE(review): presumably the first pass sizes the buffer, the second copies
 * the arguments (quoting those with whitespace), and the caller owns the
 * result - the allocation, copying and return statements are not visible in
 * this extract. exec_command_dump() treats a NULL result as ENOMEM. */
4276 char *exec_command_line(char **argv
) {
4284 STRV_FOREACH(a
, argv
)
4292 STRV_FOREACH(a
, argv
) {
4299 if (strpbrk(*a
, WHITESPACE
)) {
4310 /* FIXME: this doesn't really handle arguments that have
4311 * spaces and ticks in them */
/* Writes a description of a single ExecCommand to f: its command line,
 * followed by its exec_status dumped one tab deeper than prefix.
 * A NULL prefix is treated as "".
 * NOTE(review): the output call that consumes the "Command Line" format
 * string is missing from this extract. */
4316 void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4317 _cleanup_free_
char *cmd
= NULL
;
4318 const char *prefix2
;
4323 prefix
= strempty(prefix
);
/* Nested indentation used for the status dump below. */
4324 prefix2
= strjoina(prefix
, "\t");
4326 cmd
= exec_command_line(c
->argv
);
/* If no command line string could be built, the ENOMEM error text is printed
 * in its place. */
4328 "%sCommand Line: %s\n",
4329 prefix
, cmd
? cmd
: strerror(ENOMEM
));
4331 exec_status_dump(&c
->exec_status
, f
, prefix2
);
/* Dumps every ExecCommand of the list starting at c to f by calling
 * exec_command_dump() on each entry, all with the same prefix (NULL is
 * treated as ""). The parameter c itself serves as the iteration cursor. */
4334 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4337 prefix
= strempty(prefix
);
4339 LIST_FOREACH(command
, c
, c
)
4340 exec_command_dump(c
, f
, prefix
);
/* Appends command e to the end of the command list *l: the current tail is
 * located with LIST_FIND_TAIL and e is inserted after it, so existing order
 * is preserved.
 * NOTE(review): handling of an empty list (*l == NULL) is not visible in this
 * extract. */
4343 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
4350 /* It's kind of important, that we keep the order here */
4351 LIST_FIND_TAIL(command
, *l
, end
);
4352 LIST_INSERT_AFTER(command
, *l
, end
, e
);
/* Variadic setter for an ExecCommand: path plus the following arguments are
 * collected into a new string vector via strv_new_ap().
 * NOTE(review): presumably the vector becomes c->argv and path is duplicated
 * into the command, with an int error code returned - none of these
 * statements (nor the va_start/va_end handling) are visible in this extract. */
4357 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
4365 l
= strv_new_ap(path
, ap
);
/* Variadic helper that extends the argument vector of an existing
 * ExecCommand: path and the following arguments are gathered into a
 * temporary string vector, which is then appended to c->argv with
 * strv_extend_strv() (third argument false).
 * NOTE(review): error checks and the return statements are missing from this
 * extract. */
4386 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
/* Temporary vector, declared with the project's strv cleanup attribute. */
4387 _cleanup_strv_free_
char **l
= NULL
;
4395 l
= strv_new_ap(path
, ap
);
4401 r
= strv_extend_strv(&c
->argv
, l
, false);
/* Allocates a zero-initialized ExecRuntime into *rt and marks both ends of
 * its netns storage socket pair as unset (-1).
 * NOTE(review): callers in this file invoke it repeatedly on the same rt, so
 * there is presumably an early return when *rt is already set; that check,
 * the allocation-failure check and the initial reference count are not
 * visible in this extract. */
4409 static int exec_runtime_allocate(ExecRuntime
**rt
) {
4414 *rt
= new0(ExecRuntime
, 1);
/* new0() zeroes the struct, but 0 is a valid fd, hence the explicit -1. */
4419 (*rt
)->netns_storage_socket
[0] = (*rt
)->netns_storage_socket
[1] = -1;
/* Sets up the ExecRuntime in *rt according to context c.
 *
 * rt - in/out pointer to the runtime object
 * c  - context whose private_network / private_tmp settings decide what is
 *      created
 * id - identifier handed to setup_tmp_dirs()
 *
 * Work is only needed when private_network or private_tmp is set. Each
 * resource is created only if not already present: the socket pair when
 * netns_storage_socket[0] is negative, the temporary directories when
 * tmp_dir is NULL.
 * NOTE(review): return values and error paths are missing from this
 * extract. */
4424 int exec_runtime_make(ExecRuntime
**rt
, ExecContext
*c
, const char *id
) {
/* Neither feature requested: the statement guarded by this check is not
 * visible here. */
4434 if (!c
->private_network
&& !c
->private_tmp
)
4437 r
= exec_runtime_allocate(rt
);
4441 if (c
->private_network
&& (*rt
)->netns_storage_socket
[0] < 0) {
4442 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, (*rt
)->netns_storage_socket
) < 0)
4446 if (c
->private_tmp
&& !(*rt
)->tmp_dir
) {
4447 r
= setup_tmp_dirs(id
, &(*rt
)->tmp_dir
, &(*rt
)->var_tmp_dir
);
/* Reference-count helper for ExecRuntime; asserts that the object is still
 * referenced (n_ref > 0).
 * NOTE(review): presumably increments r->n_ref and returns r - those
 * statements are not visible in this extract. */
4455 ExecRuntime
*exec_runtime_ref(ExecRuntime
*r
) {
4457 assert(r
->n_ref
> 0);
/* Counterpart of exec_runtime_ref(). Asserts that the object is still
 * referenced, and contains teardown code that frees var_tmp_dir and closes
 * both netns storage sockets.
 * NOTE(review): presumably the teardown only runs once the count drops to
 * zero and NULL is returned; the decrement, the guarding condition, the
 * freeing of tmp_dir and of the object itself are not visible in this
 * extract. */
4463 ExecRuntime
*exec_runtime_unref(ExecRuntime
*r
) {
4468 assert(r
->n_ref
> 0);
4475 free(r
->var_tmp_dir
);
4476 safe_close_pair(r
->netns_storage_socket
);
/* Serializes the ExecRuntime rt of unit u to stream f.
 *
 * Written items (keys match exec_runtime_deserialize_item()):
 *   "tmp-dir"         - rt->tmp_dir
 *   "var-tmp-dir"     - rt->var_tmp_dir, only if set
 *   "netns-socket-0"  - number of a duplicate of netns_storage_socket[0]
 *   "netns-socket-1"  - number of a duplicate of netns_storage_socket[1]
 *
 * Socket fds are only written when valid (>= 0); each is duplicated into fds
 * with fdset_put_dup() and the number of the duplicate is what gets recorded.
 * NOTE(review): the guard around "tmp-dir", the error checks on the
 * duplicates and the return statements are missing from this extract. */
4480 int exec_runtime_serialize(Unit
*u
, ExecRuntime
*rt
, FILE *f
, FDSet
*fds
) {
4489 unit_serialize_item(u
, f
, "tmp-dir", rt
->tmp_dir
);
4491 if (rt
->var_tmp_dir
)
4492 unit_serialize_item(u
, f
, "var-tmp-dir", rt
->var_tmp_dir
);
4494 if (rt
->netns_storage_socket
[0] >= 0) {
4497 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
4501 unit_serialize_item_format(u
, f
, "netns-socket-0", "%i", copy
);
4504 if (rt
->netns_storage_socket
[1] >= 0) {
4507 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
4511 unit_serialize_item_format(u
, f
, "netns-socket-1", "%i", copy
);
/* Restores one serialized key/value pair into the ExecRuntime *rt of unit u.
 *
 * u     - unit, used for debug logging
 * rt    - in/out runtime object; exec_runtime_allocate() is called for every
 *         recognized key
 * key   - one of "tmp-dir", "var-tmp-dir", "netns-socket-0",
 *         "netns-socket-1" (the keys written by exec_runtime_serialize())
 * value - serialized value: a path for the directory keys, a decimal fd
 *         number for the socket keys
 * fds   - fd set the socket descriptors are taken out of
 *
 * NOTE(review): error checks after allocation/strdup, the branch structure
 * between the "Failed to parse" log and the fd replacement, the handling of
 * unknown keys and the return values are missing from this extract. */
4517 int exec_runtime_deserialize_item(Unit
*u
, ExecRuntime
**rt
, const char *key
, const char *value
, FDSet
*fds
) {
4524 if (streq(key
, "tmp-dir")) {
4527 r
= exec_runtime_allocate(rt
);
/* The value is duplicated before the old string is freed and replaced. */
4531 copy
= strdup(value
);
4535 free((*rt
)->tmp_dir
);
4536 (*rt
)->tmp_dir
= copy
;
4538 } else if (streq(key
, "var-tmp-dir")) {
4541 r
= exec_runtime_allocate(rt
);
4545 copy
= strdup(value
);
4549 free((*rt
)->var_tmp_dir
);
4550 (*rt
)->var_tmp_dir
= copy
;
4552 } else if (streq(key
, "netns-socket-0")) {
4555 r
= exec_runtime_allocate(rt
);
/* The value must parse as an int AND name an fd that is actually in fds. */
4559 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
))
4560 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
/* Any previously held socket is closed, then the fd is taken out of the
 * set and stored. */
4562 safe_close((*rt
)->netns_storage_socket
[0]);
4563 (*rt
)->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
4565 } else if (streq(key
, "netns-socket-1")) {
4568 r
= exec_runtime_allocate(rt
);
4572 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
))
4573 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
4575 safe_close((*rt
)->netns_storage_socket
[1]);
4576 (*rt
)->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
/* Job function handed to asynchronous_job() by exec_runtime_destroy():
 * recursively removes the directory whose path string is passed in p,
 * including the directory itself (REMOVE_ROOT|REMOVE_PHYSICAL). The rm_rf()
 * result is deliberately ignored.
 * NOTE(review): path is declared _cleanup_free_, so presumably this function
 * takes ownership of the string and releases it on return - consistent with
 * the caller only freeing it when spawning the job fails. The return
 * statement is not visible in this extract. */
4584 static void *remove_tmpdir_thread(void *p
) {
4585 _cleanup_free_
char *path
= p
;
4587 (void) rm_rf(path
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
/* Tears down the resources held by an ExecRuntime: the private temporary
 * directories are removed through remove_tmpdir_thread() jobs started with
 * asynchronous_job(), and both netns storage sockets are closed.
 *
 * On the visible var_tmp_dir path, a failure to start the job is logged as a
 * warning and the path string is freed here; the pointer is reset to NULL
 * afterwards.
 * NOTE(review): the NULL/refcount guards, the "if (r < 0)" checks, and the
 * tmp_dir counterparts of the free()/NULL-reset statements are missing from
 * this extract - presumably the tmp_dir branch mirrors the var_tmp_dir one. */
4591 void exec_runtime_destroy(ExecRuntime
*rt
) {
4597 /* If there are multiple users of this, let's leave the stuff around */
4602 log_debug("Spawning thread to nuke %s", rt
->tmp_dir
);
4604 r
= asynchronous_job(remove_tmpdir_thread
, rt
->tmp_dir
);
4606 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->tmp_dir
);
4613 if (rt
->var_tmp_dir
) {
4614 log_debug("Spawning thread to nuke %s", rt
->var_tmp_dir
);
4616 r
= asynchronous_job(remove_tmpdir_thread
, rt
->var_tmp_dir
);
4618 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->var_tmp_dir
);
4619 free(rt
->var_tmp_dir
);
4622 rt
->var_tmp_dir
= NULL
;
4625 safe_close_pair(rt
->netns_storage_socket
);
/* String names of the ExecInput values, indexed by enum constant.
 * DEFINE_STRING_TABLE_LOOKUP generates the lookup helpers for this table
 * (exec_input_to_string() is used by the dump code in this file; presumably
 * a matching from-string helper is generated too).
 * NOTE(review): the closing "};" of the table is missing from this extract. */
4628 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
4629 [EXEC_INPUT_NULL
] = "null",
4630 [EXEC_INPUT_TTY
] = "tty",
4631 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
4632 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
4633 [EXEC_INPUT_SOCKET
] = "socket",
4634 [EXEC_INPUT_NAMED_FD
] = "fd",
4637 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
/* String names of the ExecOutput values, indexed by enum constant; the
 * "+console" variants are distinct enum values, not a combination syntax.
 * DEFINE_STRING_TABLE_LOOKUP generates the lookup helpers
 * (exec_output_to_string() is used by the dump code in this file).
 * NOTE(review): the closing "};" of the table is missing from this extract. */
4639 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
4640 [EXEC_OUTPUT_INHERIT
] = "inherit",
4641 [EXEC_OUTPUT_NULL
] = "null",
4642 [EXEC_OUTPUT_TTY
] = "tty",
4643 [EXEC_OUTPUT_SYSLOG
] = "syslog",
4644 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE
] = "syslog+console",
4645 [EXEC_OUTPUT_KMSG
] = "kmsg",
4646 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
4647 [EXEC_OUTPUT_JOURNAL
] = "journal",
4648 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
4649 [EXEC_OUTPUT_SOCKET
] = "socket",
4650 [EXEC_OUTPUT_NAMED_FD
] = "fd",
4653 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
/* String names of the ExecUtmpMode values, indexed by enum constant, with
 * lookup helpers generated by DEFINE_STRING_TABLE_LOOKUP (presumably
 * exec_utmp_mode_to_string()/..._from_string(); not used in the visible part
 * of the file).
 * NOTE(review): the closing "};" of the table is missing from this extract. */
4655 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
4656 [EXEC_UTMP_INIT
] = "init",
4657 [EXEC_UTMP_LOGIN
] = "login",
4658 [EXEC_UTMP_USER
] = "user",
4661 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
/* String names of the ExecPreserveMode values, indexed by enum constant.
 * Unlike the neighbouring tables this one uses the _WITH_BOOLEAN lookup
 * variant with EXEC_PRESERVE_YES as extra argument.
 * NOTE(review): presumably this lets generic boolean spellings parse as well,
 * with "true" mapping to EXEC_PRESERVE_YES - confirm against the macro
 * definition in string-table.h. The closing "};" of the table is missing from
 * this extract. */
4663 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
4664 [EXEC_PRESERVE_NO
] = "no",
4665 [EXEC_PRESERVE_YES
] = "yes",
4666 [EXEC_PRESERVE_RESTART
] = "restart",
4669 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
/* String names of the ExecDirectoryType values, indexed by enum constant.
 * The dump code in this file prints these names directly as setting names
 * ("<name>: <path>") and with a "Mode" suffix ("<name>Mode: <octal>"), so
 * changing a string here changes that output.
 * NOTE(review): the closing "};" of the table is missing from this extract. */
4671 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
4672 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
4673 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
4674 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
4675 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
4676 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
4679 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);
/* String names of the ExecKeyringMode values, indexed by enum constant, with
 * lookup helpers generated by DEFINE_STRING_TABLE_LOOKUP
 * (exec_keyring_mode_to_string() is used by the dump code in this file).
 * NOTE(review): the closing "};" of the table is missing from this extract. */
4681 static const char* const exec_keyring_mode_table
[_EXEC_KEYRING_MODE_MAX
] = {
4682 [EXEC_KEYRING_INHERIT
] = "inherit",
4683 [EXEC_KEYRING_PRIVATE
] = "private",
4684 [EXEC_KEYRING_SHARED
] = "shared",
4687 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode
, ExecKeyringMode
);