2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/capability.h>
28 #include <sys/eventfd.h>
30 #include <sys/personality.h>
31 #include <sys/prctl.h>
33 #include <sys/socket.h>
35 #include <sys/types.h>
41 #include <security/pam_appl.h>
45 #include <selinux/selinux.h>
53 #include <sys/apparmor.h>
56 #include "sd-messages.h"
59 #include "alloc-util.h"
61 #include "apparmor-util.h"
66 #include "capability-util.h"
67 #include "chown-recursive.h"
70 #include "errno-list.h"
72 #include "exit-status.h"
75 #include "format-util.h"
77 #include "glob-util.h"
85 #include "namespace.h"
86 #include "parse-util.h"
87 #include "path-util.h"
88 #include "process-util.h"
89 #include "rlimit-util.h"
92 #include "seccomp-util.h"
94 #include "securebits.h"
95 #include "securebits-util.h"
96 #include "selinux-util.h"
97 #include "signal-util.h"
98 #include "smack-util.h"
100 #include "string-table.h"
101 #include "string-util.h"
103 #include "syslog-util.h"
104 #include "terminal-util.h"
106 #include "user-util.h"
108 #include "utmp-wtmp.h"
110 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
111 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
113 /* This assumes there is a 'tty' group */
114 #define TTY_MODE 0620
116 #define SNDBUF_SIZE (8*1024*1024)
118 static int shift_fds(int fds
[], unsigned n_fds
) {
119 int start
, restart_from
;
124 /* Modifies the fds array! (sorts it) */
134 for (i
= start
; i
< (int) n_fds
; i
++) {
137 /* Already at right index? */
141 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
148 /* Hmm, the fd we wanted isn't free? Then
149 * let's remember that and try again from here */
150 if (nfd
!= i
+3 && restart_from
< 0)
154 if (restart_from
< 0)
157 start
= restart_from
;
163 static int flags_fds(const int fds
[], unsigned n_storage_fds
, unsigned n_socket_fds
, bool nonblock
) {
167 n_fds
= n_storage_fds
+ n_socket_fds
;
173 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
174 * O_NONBLOCK only applies to socket activation though. */
176 for (i
= 0; i
< n_fds
; i
++) {
178 if (i
< n_socket_fds
) {
179 r
= fd_nonblock(fds
[i
], nonblock
);
184 /* We unconditionally drop FD_CLOEXEC from the fds,
185 * since after all we want to pass these fds to our
188 r
= fd_cloexec(fds
[i
], false);
196 static const char *exec_context_tty_path(const ExecContext
*context
) {
199 if (context
->stdio_as_fds
)
202 if (context
->tty_path
)
203 return context
->tty_path
;
205 return "/dev/console";
208 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
213 path
= exec_context_tty_path(context
);
215 if (context
->tty_vhangup
) {
216 if (p
&& p
->stdin_fd
>= 0)
217 (void) terminal_vhangup_fd(p
->stdin_fd
);
219 (void) terminal_vhangup(path
);
222 if (context
->tty_reset
) {
223 if (p
&& p
->stdin_fd
>= 0)
224 (void) reset_terminal_fd(p
->stdin_fd
, true);
226 (void) reset_terminal(path
);
229 if (context
->tty_vt_disallocate
&& path
)
230 (void) vt_disallocate(path
);
233 static bool is_terminal_input(ExecInput i
) {
236 EXEC_INPUT_TTY_FORCE
,
237 EXEC_INPUT_TTY_FAIL
);
240 static bool is_terminal_output(ExecOutput o
) {
243 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
244 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
245 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
248 static bool is_syslog_output(ExecOutput o
) {
251 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
);
254 static bool is_kmsg_output(ExecOutput o
) {
257 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
260 static bool exec_context_needs_term(const ExecContext
*c
) {
263 /* Return true if the execution context suggests we should set $TERM to something useful. */
265 if (is_terminal_input(c
->std_input
))
268 if (is_terminal_output(c
->std_output
))
271 if (is_terminal_output(c
->std_error
))
274 return !!c
->tty_path
;
277 static int open_null_as(int flags
, int nfd
) {
282 fd
= open("/dev/null", flags
|O_NOCTTY
);
287 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
295 static int connect_journal_socket(int fd
, uid_t uid
, gid_t gid
) {
296 static const union sockaddr_union sa
= {
297 .un
.sun_family
= AF_UNIX
,
298 .un
.sun_path
= "/run/systemd/journal/stdout",
300 uid_t olduid
= UID_INVALID
;
301 gid_t oldgid
= GID_INVALID
;
304 if (gid_is_valid(gid
)) {
307 if (setegid(gid
) < 0)
311 if (uid_is_valid(uid
)) {
314 if (seteuid(uid
) < 0) {
320 r
= connect(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
)) < 0 ? -errno
: 0;
322 /* If we fail to restore the uid or gid, things will likely
323 fail later on. This should only happen if an LSM interferes. */
325 if (uid_is_valid(uid
))
326 (void) seteuid(olduid
);
329 if (gid_is_valid(gid
))
330 (void) setegid(oldgid
);
335 static int connect_logger_as(
337 const ExecContext
*context
,
338 const ExecParameters
*params
,
349 assert(output
< _EXEC_OUTPUT_MAX
);
353 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
357 r
= connect_journal_socket(fd
, uid
, gid
);
361 if (shutdown(fd
, SHUT_RD
) < 0) {
366 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
376 context
->syslog_identifier
?: ident
,
377 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
378 context
->syslog_priority
,
379 !!context
->syslog_level_prefix
,
380 is_syslog_output(output
),
381 is_kmsg_output(output
),
382 is_terminal_output(output
));
387 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
392 static int open_terminal_as(const char *path
, mode_t mode
, int nfd
) {
398 fd
= open_terminal(path
, mode
| O_NOCTTY
);
403 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
411 static int fixup_input(ExecInput std_input
, int socket_fd
, bool apply_tty_stdin
) {
413 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
414 return EXEC_INPUT_NULL
;
416 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
417 return EXEC_INPUT_NULL
;
422 static int fixup_output(ExecOutput std_output
, int socket_fd
) {
424 if (std_output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
425 return EXEC_OUTPUT_INHERIT
;
430 static int setup_input(
431 const ExecContext
*context
,
432 const ExecParameters
*params
,
434 int named_iofds
[3]) {
441 if (params
->stdin_fd
>= 0) {
442 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
445 /* Try to make this the controlling tty, if it is a tty, and reset it */
446 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
447 (void) reset_terminal_fd(STDIN_FILENO
, true);
452 i
= fixup_input(context
->std_input
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
456 case EXEC_INPUT_NULL
:
457 return open_null_as(O_RDONLY
, STDIN_FILENO
);
460 case EXEC_INPUT_TTY_FORCE
:
461 case EXEC_INPUT_TTY_FAIL
: {
464 fd
= acquire_terminal(exec_context_tty_path(context
),
465 i
== EXEC_INPUT_TTY_FAIL
,
466 i
== EXEC_INPUT_TTY_FORCE
,
472 if (fd
!= STDIN_FILENO
) {
473 r
= dup2(fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
481 case EXEC_INPUT_SOCKET
:
482 return dup2(socket_fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
484 case EXEC_INPUT_NAMED_FD
:
485 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
486 return dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
489 assert_not_reached("Unknown input type");
493 static int setup_output(
495 const ExecContext
*context
,
496 const ExecParameters
*params
,
503 dev_t
*journal_stream_dev
,
504 ino_t
*journal_stream_ino
) {
514 assert(journal_stream_dev
);
515 assert(journal_stream_ino
);
517 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
519 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
522 return STDOUT_FILENO
;
525 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
526 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
529 return STDERR_FILENO
;
532 i
= fixup_input(context
->std_input
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
533 o
= fixup_output(context
->std_output
, socket_fd
);
535 if (fileno
== STDERR_FILENO
) {
537 e
= fixup_output(context
->std_error
, socket_fd
);
539 /* This expects the input and output are already set up */
541 /* Don't change the stderr file descriptor if we inherit all
542 * the way and are not on a tty */
543 if (e
== EXEC_OUTPUT_INHERIT
&&
544 o
== EXEC_OUTPUT_INHERIT
&&
545 i
== EXEC_INPUT_NULL
&&
546 !is_terminal_input(context
->std_input
) &&
550 /* Duplicate from stdout if possible */
551 if ((e
== o
&& e
!= EXEC_OUTPUT_NAMED_FD
) || e
== EXEC_OUTPUT_INHERIT
)
552 return dup2(STDOUT_FILENO
, fileno
) < 0 ? -errno
: fileno
;
556 } else if (o
== EXEC_OUTPUT_INHERIT
) {
557 /* If input got downgraded, inherit the original value */
558 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
559 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
561 /* If the input is connected to anything that's not /dev/null, inherit that... */
562 if (i
!= EXEC_INPUT_NULL
)
563 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
565 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
569 /* We need to open /dev/null here anew, to get the right access mode. */
570 return open_null_as(O_WRONLY
, fileno
);
575 case EXEC_OUTPUT_NULL
:
576 return open_null_as(O_WRONLY
, fileno
);
578 case EXEC_OUTPUT_TTY
:
579 if (is_terminal_input(i
))
580 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
582 /* We don't reset the terminal if this is just about output */
583 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
585 case EXEC_OUTPUT_SYSLOG
:
586 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE
:
587 case EXEC_OUTPUT_KMSG
:
588 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
589 case EXEC_OUTPUT_JOURNAL
:
590 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
591 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
593 log_unit_warning_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m", fileno
== STDOUT_FILENO
? "stdout" : "stderr");
594 r
= open_null_as(O_WRONLY
, fileno
);
598 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
599 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
600 * services to detect whether they are connected to the journal or not.
602 * If both stdout and stderr are connected to a stream then let's make sure to store the data
603 * about STDERR as that's usually the best way to do logging. */
605 if (fstat(fileno
, &st
) >= 0 &&
606 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
607 *journal_stream_dev
= st
.st_dev
;
608 *journal_stream_ino
= st
.st_ino
;
613 case EXEC_OUTPUT_SOCKET
:
614 assert(socket_fd
>= 0);
615 return dup2(socket_fd
, fileno
) < 0 ? -errno
: fileno
;
617 case EXEC_OUTPUT_NAMED_FD
:
618 (void) fd_nonblock(named_iofds
[fileno
], false);
619 return dup2(named_iofds
[fileno
], fileno
) < 0 ? -errno
: fileno
;
622 assert_not_reached("Unknown error type");
626 static int chown_terminal(int fd
, uid_t uid
) {
631 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
635 /* This might fail. What matters are the results. */
636 (void) fchown(fd
, uid
, -1);
637 (void) fchmod(fd
, TTY_MODE
);
639 if (fstat(fd
, &st
) < 0)
642 if (st
.st_uid
!= uid
|| (st
.st_mode
& 0777) != TTY_MODE
)
648 static int setup_confirm_stdio(const char *vc
, int *_saved_stdin
, int *_saved_stdout
) {
649 _cleanup_close_
int fd
= -1, saved_stdin
= -1, saved_stdout
= -1;
652 assert(_saved_stdin
);
653 assert(_saved_stdout
);
655 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
659 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
660 if (saved_stdout
< 0)
663 fd
= acquire_terminal(vc
, false, false, false, DEFAULT_CONFIRM_USEC
);
667 r
= chown_terminal(fd
, getuid());
671 r
= reset_terminal_fd(fd
, true);
675 if (dup2(fd
, STDIN_FILENO
) < 0)
678 if (dup2(fd
, STDOUT_FILENO
) < 0)
685 *_saved_stdin
= saved_stdin
;
686 *_saved_stdout
= saved_stdout
;
688 saved_stdin
= saved_stdout
= -1;
693 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
696 if (err
== -ETIMEDOUT
)
697 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
700 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
704 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
705 _cleanup_close_
int fd
= -1;
709 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
713 write_confirm_error_fd(err
, fd
, u
);
716 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
720 assert(saved_stdout
);
724 if (*saved_stdin
>= 0)
725 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
728 if (*saved_stdout
>= 0)
729 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
732 *saved_stdin
= safe_close(*saved_stdin
);
733 *saved_stdout
= safe_close(*saved_stdout
);
739 CONFIRM_PRETEND_FAILURE
= -1,
740 CONFIRM_PRETEND_SUCCESS
= 0,
744 static int ask_for_confirmation(const char *vc
, Unit
*u
, const char *cmdline
) {
745 int saved_stdout
= -1, saved_stdin
= -1, r
;
746 _cleanup_free_
char *e
= NULL
;
749 /* For any internal errors, assume a positive response. */
750 r
= setup_confirm_stdio(vc
, &saved_stdin
, &saved_stdout
);
752 write_confirm_error(r
, vc
, u
);
753 return CONFIRM_EXECUTE
;
756 /* confirm_spawn might have been disabled while we were sleeping. */
757 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
762 e
= ellipsize(cmdline
, 60, 100);
770 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
772 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
779 printf("Resuming normal execution.\n");
780 manager_disable_confirm_spawn();
784 unit_dump(u
, stdout
, " ");
785 continue; /* ask again */
787 printf("Failing execution.\n");
788 r
= CONFIRM_PRETEND_FAILURE
;
791 printf(" c - continue, proceed without asking anymore\n"
792 " D - dump, show the state of the unit\n"
793 " f - fail, don't execute the command and pretend it failed\n"
795 " i - info, show a short summary of the unit\n"
796 " j - jobs, show jobs that are in progress\n"
797 " s - skip, don't execute the command and pretend it succeeded\n"
798 " y - yes, execute the command\n");
799 continue; /* ask again */
801 printf(" Description: %s\n"
804 u
->id
, u
->description
, cmdline
);
805 continue; /* ask again */
807 manager_dump_jobs(u
->manager
, stdout
, " ");
808 continue; /* ask again */
810 /* 'n' was removed in favor of 'f'. */
811 printf("Didn't understand 'n', did you mean 'f'?\n");
812 continue; /* ask again */
814 printf("Skipping execution.\n");
815 r
= CONFIRM_PRETEND_SUCCESS
;
821 assert_not_reached("Unhandled choice");
827 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
831 static int get_fixed_user(const ExecContext
*c
, const char **user
,
832 uid_t
*uid
, gid_t
*gid
,
833 const char **home
, const char **shell
) {
842 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
843 * (i.e. are "/" or "/bin/nologin"). */
846 r
= get_user_creds_clean(&name
, uid
, gid
, home
, shell
);
854 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
864 r
= get_group_creds(&name
, gid
);
872 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
873 const char *group
, gid_t gid
,
874 gid_t
**supplementary_gids
, int *ngids
) {
878 bool keep_groups
= false;
879 gid_t
*groups
= NULL
;
880 _cleanup_free_ gid_t
*l_gids
= NULL
;
885 * If user is given, then lookup GID and supplementary groups list.
886 * We avoid NSS lookups for gid=0. Also we have to initialize groups
887 * here and as early as possible so we keep the list of supplementary
888 * groups of the caller.
890 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
891 /* First step, initialize groups from /etc/group */
892 if (initgroups(user
, gid
) < 0)
898 if (strv_isempty(c
->supplementary_groups
))
902 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
903 * be positive, otherwise fail.
906 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
907 if (ngroups_max
<= 0) {
911 return -EOPNOTSUPP
; /* For all other values */
914 l_gids
= new(gid_t
, ngroups_max
);
920 * Lookup the list of groups that the user belongs to, we
921 * avoid NSS lookups here too for gid=0.
924 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
929 STRV_FOREACH(i
, c
->supplementary_groups
) {
932 if (k
>= ngroups_max
)
936 r
= get_group_creds(&g
, l_gids
+k
);
944 * Sets ngids to zero to drop all supplementary groups, happens
945 * when we are under root and SupplementaryGroups= is empty.
952 /* Otherwise get the final list of supplementary groups */
953 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
957 *supplementary_gids
= groups
;
965 static int enforce_groups(gid_t gid
, gid_t
*supplementary_gids
, int ngids
) {
968 /* Handle SupplementaryGroups= if it is not empty */
970 r
= maybe_setgroups(ngids
, supplementary_gids
);
975 if (gid_is_valid(gid
)) {
976 /* Then set our gids */
977 if (setresgid(gid
, gid
, gid
) < 0)
984 static int enforce_user(const ExecContext
*context
, uid_t uid
) {
987 if (!uid_is_valid(uid
))
990 /* Sets (but doesn't look up) the uid and makes sure we keep the
991 * capabilities while doing so. */
993 if (context
->capability_ambient_set
!= 0) {
995 /* First step: If we need to keep capabilities but
996 * drop privileges we need to make sure we keep our
997 * caps, while we drop privileges. */
999 int sb
= context
->secure_bits
| 1<<SECURE_KEEP_CAPS
;
1001 if (prctl(PR_GET_SECUREBITS
) != sb
)
1002 if (prctl(PR_SET_SECUREBITS
, sb
) < 0)
1007 /* Second step: actually set the uids */
1008 if (setresuid(uid
, uid
, uid
) < 0)
1011 /* At this point we should have all necessary capabilities but
1012 are otherwise a normal user. However, the caps might have gotten
1013 corrupted due to the setresuid(), so we need to clean them up
1014 later. This is done outside of this call. */
1021 static int null_conv(
1023 const struct pam_message
**msg
,
1024 struct pam_response
**resp
,
1025 void *appdata_ptr
) {
1027 /* We don't support conversations */
1029 return PAM_CONV_ERR
;
1034 static int setup_pam(
1041 int fds
[], unsigned n_fds
) {
1045 static const struct pam_conv conv
= {
1050 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1051 pam_handle_t
*handle
= NULL
;
1053 int pam_code
= PAM_SUCCESS
, r
;
1054 char **nv
, **e
= NULL
;
1055 bool close_session
= false;
1056 pid_t pam_pid
= 0, parent_pid
;
1063 /* We set up PAM in the parent process, then fork. The child
1064 * will then stay around until killed via PR_SET_PDEATHSIG or
1065 * systemd via the cgroup logic. It will then remove the PAM
1066 * session again. The parent process will exec() the actual
1067 * daemon. We do things this way to ensure that the main PID
1068 * of the daemon is the one we initially fork()ed. */
1070 r
= barrier_create(&barrier
);
1074 if (log_get_max_level() < LOG_DEBUG
)
1075 flags
|= PAM_SILENT
;
1077 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1078 if (pam_code
!= PAM_SUCCESS
) {
1084 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1085 if (pam_code
!= PAM_SUCCESS
)
1089 STRV_FOREACH(nv
, *env
) {
1090 pam_code
= pam_putenv(handle
, *nv
);
1091 if (pam_code
!= PAM_SUCCESS
)
1095 pam_code
= pam_acct_mgmt(handle
, flags
);
1096 if (pam_code
!= PAM_SUCCESS
)
1099 pam_code
= pam_open_session(handle
, flags
);
1100 if (pam_code
!= PAM_SUCCESS
)
1103 close_session
= true;
1105 e
= pam_getenvlist(handle
);
1107 pam_code
= PAM_BUF_ERR
;
1111 /* Block SIGTERM, so that we know that it won't get lost in
1114 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1116 parent_pid
= getpid_cached();
1125 int sig
, ret
= EXIT_PAM
;
1127 /* The child's job is to reset the PAM session on
1129 barrier_set_role(&barrier
, BARRIER_CHILD
);
1131 /* This string must fit in 10 chars (i.e. the length
1132 * of "/sbin/init"), to look pretty in /bin/ps */
1133 rename_process("(sd-pam)");
1135 /* Make sure we don't keep open the passed fds in this
1136 child. We assume that otherwise only those fds are
1137 open here that have been opened by PAM. */
1138 close_many(fds
, n_fds
);
1140 /* Drop privileges - we don't need any to call pam_close_session()
1141 * and this will make PR_SET_PDEATHSIG work in most cases.
1142 * If this fails, ignore the error - but expect sd-pam threads
1143 * to fail to exit normally */
1145 r
= maybe_setgroups(0, NULL
);
1147 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1148 if (setresgid(gid
, gid
, gid
) < 0)
1149 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1150 if (setresuid(uid
, uid
, uid
) < 0)
1151 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1153 (void) ignore_signals(SIGPIPE
, -1);
1155 /* Wait until our parent died. This will only work if
1156 * the above setresuid() succeeds, otherwise the kernel
1157 * will not allow unprivileged parents to kill their privileged
1158 * children this way. We rely on the control groups kill logic
1159 * to do the rest for us. */
1160 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1163 /* Tell the parent that our setup is done. This is especially
1164 * important regarding dropping privileges. Otherwise, unit
1165 * setup might race against our setresuid(2) call.
1167 * If the parent aborted, we'll detect this below, hence ignore
1168 * return failure here. */
1169 (void) barrier_place(&barrier
);
1171 /* Check if our parent process might already have died? */
1172 if (getppid() == parent_pid
) {
1175 assert_se(sigemptyset(&ss
) >= 0);
1176 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1179 if (sigwait(&ss
, &sig
) < 0) {
1186 assert(sig
== SIGTERM
);
1191 /* If our parent died we'll end the session */
1192 if (getppid() != parent_pid
) {
1193 pam_code
= pam_close_session(handle
, flags
);
1194 if (pam_code
!= PAM_SUCCESS
)
1201 pam_end(handle
, pam_code
| flags
);
1205 barrier_set_role(&barrier
, BARRIER_PARENT
);
1207 /* If the child was forked off successfully it will do all the
1208 * cleanups, so forget about the handle here. */
1211 /* Unblock SIGTERM again in the parent */
1212 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1214 /* We close the log explicitly here, since the PAM modules
1215 * might have opened it, but we don't want this fd around. */
1218 /* Synchronously wait for the child to initialize. We don't care for
1219 * errors as we cannot recover. However, warn loudly if it happens. */
1220 if (!barrier_place_and_sync(&barrier
))
1221 log_error("PAM initialization failed");
1229 if (pam_code
!= PAM_SUCCESS
) {
1230 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1231 r
= -EPERM
; /* PAM errors do not map to errno */
1233 log_error_errno(r
, "PAM failed: %m");
1237 pam_code
= pam_close_session(handle
, flags
);
1239 pam_end(handle
, pam_code
| flags
);
1251 static void rename_process_from_path(const char *path
) {
1252 char process_name
[11];
1256 /* This resulting string must fit in 10 chars (i.e. the length
1257 * of "/sbin/init") to look pretty in /bin/ps */
1261 rename_process("(...)");
1267 /* The end of the process name is usually more
1268 * interesting, since the first bit might just be
1274 process_name
[0] = '(';
1275 memcpy(process_name
+1, p
, l
);
1276 process_name
[1+l
] = ')';
1277 process_name
[1+l
+1] = 0;
1279 rename_process(process_name
);
1282 static bool context_has_address_families(const ExecContext
*c
) {
1285 return c
->address_families_whitelist
||
1286 !set_isempty(c
->address_families
);
1289 static bool context_has_syscall_filters(const ExecContext
*c
) {
1292 return c
->syscall_whitelist
||
1293 !hashmap_isempty(c
->syscall_filter
);
1296 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1299 if (c
->no_new_privileges
)
1302 if (have_effective_cap(CAP_SYS_ADMIN
)) /* if we are privileged, we don't need NNP */
1305 /* We need NNP if we have any form of seccomp and are unprivileged */
1306 return context_has_address_families(c
) ||
1307 c
->memory_deny_write_execute
||
1308 c
->restrict_realtime
||
1309 exec_context_restrict_namespaces_set(c
) ||
1310 c
->protect_kernel_tunables
||
1311 c
->protect_kernel_modules
||
1312 c
->private_devices
||
1313 context_has_syscall_filters(c
) ||
1314 !set_isempty(c
->syscall_archs
) ||
1315 c
->lock_personality
;
1320 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1322 if (is_seccomp_available())
1325 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1329 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1330 uint32_t negative_action
, default_action
, action
;
1336 if (!context_has_syscall_filters(c
))
1339 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1342 negative_action
= c
->syscall_errno
== 0 ? SCMP_ACT_KILL
: SCMP_ACT_ERRNO(c
->syscall_errno
);
1344 if (c
->syscall_whitelist
) {
1345 default_action
= negative_action
;
1346 action
= SCMP_ACT_ALLOW
;
1348 default_action
= SCMP_ACT_ALLOW
;
1349 action
= negative_action
;
1352 if (needs_ambient_hack
) {
1353 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_whitelist
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1358 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
);
1361 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1365 if (set_isempty(c
->syscall_archs
))
1368 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1371 return seccomp_restrict_archs(c
->syscall_archs
);
1374 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1378 if (!context_has_address_families(c
))
1381 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1384 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_whitelist
);
1387 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1391 if (!c
->memory_deny_write_execute
)
1394 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1397 return seccomp_memory_deny_write_execute();
1400 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1404 if (!c
->restrict_realtime
)
1407 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1410 return seccomp_restrict_realtime();
1413 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1417 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1418 * let's protect even those systems where this is left on in the kernel. */
1420 if (!c
->protect_kernel_tunables
)
1423 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1426 return seccomp_protect_sysctl();
1429 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1433 /* Turn off module syscalls on ProtectKernelModules=yes */
1435 if (!c
->protect_kernel_modules
)
1438 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1441 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
));
1444 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1448 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1450 if (!c
->private_devices
)
1453 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1456 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
));
1459 static int apply_restrict_namespaces(Unit
*u
, const ExecContext
*c
) {
1463 if (!exec_context_restrict_namespaces_set(c
))
1466 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1469 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
1472 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1473 unsigned long personality
;
1479 if (!c
->lock_personality
)
1482 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1485 personality
= c
->personality
;
1487 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1488 if (personality
== PERSONALITY_INVALID
) {
1490 r
= opinionated_personality(&personality
);
1495 return seccomp_lock_personality(personality
);
1500 static void do_idle_pipe_dance(int idle_pipe
[4]) {
1503 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1504 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1506 if (idle_pipe
[0] >= 0) {
1509 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1511 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1514 /* Signal systemd that we are bored and want to continue. */
1515 n
= write(idle_pipe
[3], "x", 1);
1517 /* Wait for systemd to react to the signal above. */
1518 fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1521 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1525 idle_pipe
[3] = safe_close(idle_pipe
[3]);
1528 static int build_environment(
1530 const ExecContext
*c
,
1531 const ExecParameters
*p
,
1534 const char *username
,
1536 dev_t journal_stream_dev
,
1537 ino_t journal_stream_ino
,
1540 _cleanup_strv_free_
char **our_env
= NULL
;
1548 our_env
= new0(char*, 14);
1553 _cleanup_free_
char *joined
= NULL
;
1555 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1557 our_env
[n_env
++] = x
;
1559 if (asprintf(&x
, "LISTEN_FDS=%u", n_fds
) < 0)
1561 our_env
[n_env
++] = x
;
1563 joined
= strv_join(p
->fd_names
, ":");
1567 x
= strjoin("LISTEN_FDNAMES=", joined
);
1570 our_env
[n_env
++] = x
;
1573 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1574 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1576 our_env
[n_env
++] = x
;
1578 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1580 our_env
[n_env
++] = x
;
1583 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus to look up dynamic
1584 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1585 * check the database directly. */
1586 if (p
->flags
& EXEC_NSS_BYPASS_BUS
) {
1587 x
= strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1590 our_env
[n_env
++] = x
;
1594 x
= strappend("HOME=", home
);
1597 our_env
[n_env
++] = x
;
1601 x
= strappend("LOGNAME=", username
);
1604 our_env
[n_env
++] = x
;
1606 x
= strappend("USER=", username
);
1609 our_env
[n_env
++] = x
;
1613 x
= strappend("SHELL=", shell
);
1616 our_env
[n_env
++] = x
;
1619 if (!sd_id128_is_null(u
->invocation_id
)) {
1620 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1623 our_env
[n_env
++] = x
;
1626 if (exec_context_needs_term(c
)) {
1627 const char *tty_path
, *term
= NULL
;
1629 tty_path
= exec_context_tty_path(c
);
1631 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1632 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1633 * passes to PID 1 ends up all the way in the console login shown. */
1635 if (path_equal(tty_path
, "/dev/console") && getppid() == 1)
1636 term
= getenv("TERM");
1638 term
= default_term_for_tty(tty_path
);
1640 x
= strappend("TERM=", term
);
1643 our_env
[n_env
++] = x
;
1646 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
1647 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
1650 our_env
[n_env
++] = x
;
1653 our_env
[n_env
++] = NULL
;
1654 assert(n_env
<= 12);
/* Builds a string vector holding one "NAME=value" entry for every name listed in c->pass_environment.
 * Each entry is joined from the name (*i), "=" and the value v; the vector is grown with GREEDY_REALLOC() and
 * is NULL-terminated again after every append.
 * NOTE(review): the statements that obtain v, react to allocation failures and hand the vector back through
 * ret are not visible in this excerpt; confirm them against the full file. */
1662 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
1663 _cleanup_strv_free_
char **pass_env
= NULL
;
1664 size_t n_env
= 0, n_bufsize
= 0;
1667 STRV_FOREACH(i
, c
->pass_environment
) {
1668 _cleanup_free_
char *x
= NULL
;
1674 x
= strjoin(*i
, "=", v
);
/* Ask for room for the new entry plus the terminating NULL, hence n_env + 2. */
1678 if (!GREEDY_REALLOC(pass_env
, n_bufsize
, n_env
+ 2))
/* Store the entry, then re-terminate the vector right behind it. */
1681 pass_env
[n_env
++] = x
;
1682 pass_env
[n_env
] = NULL
;
/* Inspects the execution context (and, for the private /tmp check, the runtime object) for settings that
 * relate to mount namespacing: a root image, path access lists, bind mounts and managed directories, mount
 * flags, private /tmp, the Private*= and Protect*= switches, and API VFS mounting together with a root
 * image or root directory.
 * NOTE(review): the statement following each condition is not visible in this excerpt; presumably each one
 * returns true and the function falls through to false. Confirm against the full file. */
1692 static bool exec_needs_mount_namespace(
1693 const ExecContext
*context
,
1694 const ExecParameters
*params
,
1695 ExecRuntime
*runtime
) {
/* A root image is configured. */
1700 if (context
->root_image
)
/* Any of the three path access lists is non-empty. */
1703 if (!strv_isempty(context
->read_write_paths
) ||
1704 !strv_isempty(context
->read_only_paths
) ||
1705 !strv_isempty(context
->inaccessible_paths
))
/* Explicit bind mounts, or paths configured for any of the five managed directory types. */
1708 if (context
->n_bind_mounts
> 0 ||
1709 !strv_isempty(context
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) ||
1710 !strv_isempty(context
->directories
[EXEC_DIRECTORY_STATE
].paths
) ||
1711 !strv_isempty(context
->directories
[EXEC_DIRECTORY_CACHE
].paths
) ||
1712 !strv_isempty(context
->directories
[EXEC_DIRECTORY_LOGS
].paths
) ||
1713 !strv_isempty(context
->directories
[EXEC_DIRECTORY_CONFIGURATION
].paths
))
/* Non-zero mount flags are set. */
1716 if (context
->mount_flags
!= 0)
/* Private /tmp only counts when a runtime object actually carries one of the two directories. */
1719 if (context
->private_tmp
&& runtime
&& (runtime
->tmp_dir
|| runtime
->var_tmp_dir
))
/* Any of the device/system/home/kernel/cgroup protection switches is enabled. */
1722 if (context
->private_devices
||
1723 context
->protect_system
!= PROTECT_SYSTEM_NO
||
1724 context
->protect_home
!= PROTECT_HOME_NO
||
1725 context
->protect_kernel_tunables
||
1726 context
->protect_kernel_modules
||
1727 context
->protect_control_groups
)
/* API VFS mounting is only considered in combination with a root image or root directory. */
1730 if (context
->mount_apivfs
&& (context
->root_image
|| context
->root_directory
))
/* Moves the calling process into a new user namespace (unshare(CLONE_NEWUSER)) and has a helper process
 * write the setgroups, gid_map and uid_map files of the caller below /proc.
 *
 * uid, gid: identities that get a mapping line of their own in addition to the "0 0 1" root line; when the
 *           value is 0 or invalid only the root line is written.
 *
 * Synchronisation uses two channels: an eventfd (unshare_ready_fd) the helper reads from before touching the
 * map files, and a pipe (errno_pipe) over which the helper reports an error code r. The caller treats a read
 * of sizeof(r) bytes as a reported error and expects 0 bytes on success, then reaps the helper with
 * wait_for_terminate() and checks that it exited with status 0.
 * NOTE(review): the fork() call, most error returns and several declarations are not visible in this
 * excerpt; confirm the exact error paths against the full file. */
1736 static int setup_private_users(uid_t uid
, gid_t gid
) {
1737 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
1738 _cleanup_close_pair_
int errno_pipe
[2] = { -1, -1 };
1739 _cleanup_close_
int unshare_ready_fd
= -1;
1740 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
1746 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1747 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1748 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1749 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1750 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1751 * continues execution normally. */
1753 if (uid
!= 0 && uid_is_valid(uid
)) {
1754 r
= asprintf(&uid_map
,
1755 "0 0 1\n" /* Map root → root */
1756 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
1761 uid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1766 if (gid
!= 0 && gid_is_valid(gid
)) {
1767 r
= asprintf(&gid_map
,
1768 "0 0 1\n" /* Map root → root */
1769 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
1774 gid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
/* NOTE(review): the next two explanatory comments are truncated in this excerpt; their closing lines are
 * not visible. */
1779 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1781 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
1782 if (unshare_ready_fd
< 0)
1785 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1787 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
1795 _cleanup_close_
int fd
= -1;
1799 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1800 * here, after the parent opened its own user namespace. */
1803 errno_pipe
[0] = safe_close(errno_pipe
[0]);
1805 /* Wait until the parent unshared the user namespace */
1806 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
1811 /* Disable the setgroups() system call in the child user namespace, for good. */
1812 a
= procfs_file_alloca(ppid
, "setgroups");
1813 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1815 if (errno
!= ENOENT
) {
1820 /* If the file is missing the kernel is too old, let's continue anyway. */
1822 if (write(fd
, "deny\n", 5) < 0) {
1827 fd
= safe_close(fd
);
1830 /* First write the GID map */
1831 a
= procfs_file_alloca(ppid
, "gid_map");
1832 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1837 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
1841 fd
= safe_close(fd
);
1843 /* Then write the UID map */
1844 a
= procfs_file_alloca(ppid
, "uid_map");
1845 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1850 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
1855 _exit(EXIT_SUCCESS
);
/* Failure exit of the helper: report r over the pipe (best effort, result ignored) and exit non-zero. */
1858 (void) write(errno_pipe
[1], &r
, sizeof(r
));
1859 _exit(EXIT_FAILURE
);
/* Drop our copy of the write end of the error pipe. */
1862 errno_pipe
[1] = safe_close(errno_pipe
[1]);
1864 if (unshare(CLONE_NEWUSER
) < 0)
1867 /* Let the child know that the namespace is ready now */
1868 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
1871 /* Try to read an error code from the child */
1872 n
= read(errno_pipe
[0], &r
, sizeof(r
));
1875 if (n
== sizeof(r
)) { /* an error code was sent to us */
1880 if (n
!= 0) /* on success we should have read 0 bytes */
1883 r
= wait_for_terminate(pid
, &si
);
1888 /* If something strange happened with the child, let's consider this fatal, too */
1889 if (si
.si_code
!= CLD_EXITED
|| si
.si_status
!= 0)
/* Creates the directories configured in context->directories[type].paths below params->prefix[type], sets
 * their access mode and, except for configuration directories, changes ownership of the tree to uid/gid.
 *
 * type:        directory class to process; asserted to be within [0, _EXEC_DIRECTORY_TYPE_MAX).
 * exit_status: asserted non-NULL; receives the EXIT_*_DIRECTORY code matching type from exit_status_table
 *              (presumably only on the failure path, whose label is not visible here).
 *
 * For units with dynamic_user set, every type other than RUNTIME and CONFIGURATION is created below a
 * "private" directory (mode 0700, owner 0:0) and symlinked from its regular location, as explained in the
 * long comment inside the loop.
 * NOTE(review): the uid/gid/exit_status parameter lines, the error returns and the assignments to
 * "effective" are not visible in this excerpt; confirm against the full file. */
1895 static int setup_exec_directory(
1896 const ExecContext
*context
,
1897 const ExecParameters
*params
,
1900 ExecDirectoryType type
,
1903 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
1904 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
1905 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
1906 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
1907 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
1908 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
1915 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
1916 assert(exit_status
);
/* No prefix configured for this directory type. */
1918 if (!params
->prefix
[type
])
/* With EXEC_CHOWN_DIRECTORIES set, uid and gid are checked for validity (the replacement values assigned
 * when they are invalid are not visible in this excerpt). */
1921 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
1922 if (!uid_is_valid(uid
))
1924 if (!gid_is_valid(gid
))
1928 STRV_FOREACH(rt
, context
->directories
[type
].paths
) {
1929 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
1930 const char *effective
;
/* p is the regular location of the directory: <prefix>/<configured path>. */
1932 p
= strjoin(params
->prefix
[type
], "/", *rt
);
1938 r
= mkdir_parents_label(p
, 0755);
1942 if (context
->dynamic_user
&&
1943 !IN_SET(type
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
)) {
1944 _cleanup_free_
char *private_root
= NULL
, *relative
= NULL
, *parent
= NULL
;
1946 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
1947 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
1948 * whose UID is later on reused. To lock this down we use the same trick used by container
1949 * managers to prohibit host users to get access to files of the same UID in containers: we
1950 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
1951 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
1952 * to make this directory permeable for the service itself.
1954 * Specifically: for a service which wants a special directory "foo/" we first create a
1955 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
1956 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
1957 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
1958 * into it. Inside of the namespace of the container "private/" is replaced by a more liberally
1959 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
1960 * disabling the access boundary for the service and making sure it only gets access to the
1961 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
1963 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
1964 * owned by the service itself.
1965 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
1966 * files or sockets with other services. */
1968 private_root
= strjoin(params
->prefix
[type
], "/private");
1969 if (!private_root
) {
1974 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
1975 r
= mkdir_safe_label(private_root
, 0700, 0, 0, false);
1979 pp
= strjoin(private_root
, "/", *rt
);
1985 /* Create all directories between the configured directory and this private root, and mark them 0755 */
1986 r
= mkdir_parents_label(pp
, 0755);
1990 /* Finally, create the actual directory for the service */
1991 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
1992 if (r
< 0 && r
!= -EEXIST
)
/* The symlink target is made relative to the directory that contains p. */
1995 parent
= dirname_malloc(p
);
2001 r
= path_make_relative(parent
, pp
, &relative
);
2005 /* And link it up from the original place */
2006 r
= symlink_idempotent(relative
, p
);
/* Regular case: create the directory at p; an already existing directory (-EEXIST) is tolerated. */
2013 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2014 if (r
< 0 && r
!= -EEXIST
)
2020 /* First lock down the access mode */
2021 if (chmod(effective
, context
->directories
[type
].mode
) < 0) {
2026 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2027 * a service, and shall not be writable. */
2028 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
2031 /* Then, change the ownership of the whole tree, if necessary */
2032 r
= path_chown_recursive(effective
, uid
, gid
);
/* Report the exit status code that belongs to this directory type. */
2040 *exit_status
= exit_status_table
[type
];
/* Applies a SMACK process label via mac_smack_apply_pid(0, ...).
 * If context->smack_process_label is set, that label is applied. In builds that define
 * SMACK_DEFAULT_PROCESS_LABEL the SMACK_ATTR_EXEC label of command->path is read instead; read errors other
 * than -ENODATA and -EOPNOTSUPP are acted upon, and the label read, or the compiled-in default if none was
 * read, is applied.
 * NOTE(review): PID 0 presumably designates the calling process; the else branch, the error returns and the
 * matching #endif are not visible in this excerpt. Confirm against the full file. */
2044 static int setup_smack(
2045 const ExecContext
*context
,
2046 const ExecCommand
*command
) {
2053 if (context
->smack_process_label
) {
2054 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
2058 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2060 _cleanup_free_
char *exec_label
= NULL
;
2062 r
= mac_smack_read(command
->path
, SMACK_ATTR_EXEC
, &exec_label
);
/* A missing attribute or missing SMACK support is not treated as an error here. */
2063 if (r
< 0 && !IN_SET(r
, -ENODATA
, -EOPNOTSUPP
))
/* "? :" is the GNU shorthand: use exec_label if non-NULL, otherwise the compiled-in default. */
2066 r
= mac_smack_apply_pid(0, exec_label
? : SMACK_DEFAULT_PROCESS_LABEL
);
/* Compiles the full list of bind mounts for a service: the explicitly configured context->bind_mounts plus
 * one entry per managed directory (context->directories[t].paths) for every type that has a prefix.
 *
 * ret_bind_mounts:       receives the newly allocated BindMount array.
 * ret_n_bind_mounts:     receives the number of entries (n).
 * ret_empty_directories: receives a string vector of "<prefix>/private" directories collected for
 *                        dynamic-user units; ownership moves to the caller (the local is reset to NULL).
 *
 * All three output pointers are asserted non-NULL.
 * NOTE(review): error handling, the condition guarding the "reset outputs" statements, and the source and
 * destination fields of the BindMount initialisers are not visible in this excerpt; confirm against the
 * full file. */
2075 static int compile_bind_mounts(
2076 const ExecContext
*context
,
2077 const ExecParameters
*params
,
2078 BindMount
**ret_bind_mounts
,
2079 unsigned *ret_n_bind_mounts
,
2080 char ***ret_empty_directories
) {
2082 _cleanup_strv_free_
char **empty_directories
= NULL
;
2083 BindMount
*bind_mounts
;
2084 unsigned n
, h
= 0, i
;
2085 ExecDirectoryType t
;
2090 assert(ret_bind_mounts
);
2091 assert(ret_n_bind_mounts
);
2092 assert(ret_empty_directories
);
/* First pass: count the entries. Start with the explicit bind mounts, then add the number of configured
 * paths per directory type. */
2094 n
= context
->n_bind_mounts
;
2095 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2096 if (!params
->prefix
[t
])
2099 n
+= strv_length(context
->directories
[t
].paths
);
/* Outputs are reset to "nothing" here (presumably the n == 0 shortcut; the guard is not visible). */
2103 *ret_bind_mounts
= NULL
;
2104 *ret_n_bind_mounts
= 0;
2105 *ret_empty_directories
= NULL
;
2109 bind_mounts
= new(BindMount
, n
);
/* Second pass, part one: copy the configured bind mounts, duplicating their source and destination
 * strings. h counts the entries filled in so far. */
2113 for (i
= 0; i
< context
->n_bind_mounts
; i
++) {
2114 BindMount
*item
= context
->bind_mounts
+ i
;
2117 s
= strdup(item
->source
);
2123 d
= strdup(item
->destination
);
2130 bind_mounts
[h
++] = (BindMount
) {
2133 .read_only
= item
->read_only
,
2134 .recursive
= item
->recursive
,
2135 .ignore_enoent
= item
->ignore_enoent
,
/* Second pass, part two: add entries for the managed directories of each type. */
2139 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2142 if (!params
->prefix
[t
])
2145 if (strv_isempty(context
->directories
[t
].paths
))
2148 if (context
->dynamic_user
&&
2149 !IN_SET(t
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
)) {
2152 /* So this is for a dynamic user, and we need to make sure the process can access its own
2153 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2154 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2156 private_root
= strjoin(params
->prefix
[t
], "/private");
2157 if (!private_root
) {
2162 r
= strv_consume(&empty_directories
, private_root
);
2169 STRV_FOREACH(suffix
, context
->directories
[t
].paths
) {
/* Same dynamic-user condition as above: the source path lives below "private/" in that case. */
2172 if (context
->dynamic_user
&&
2173 !IN_SET(t
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
))
2174 s
= strjoin(params
->prefix
[t
], "/private/", *suffix
);
2176 s
= strjoin(params
->prefix
[t
], "/", *suffix
);
2189 bind_mounts
[h
++] = (BindMount
) {
2194 .ignore_enoent
= false,
/* Hand the results to the caller. */
2201 *ret_bind_mounts
= bind_mounts
;
2202 *ret_n_bind_mounts
= n
;
2203 *ret_empty_directories
= empty_directories
;
/* Ownership moved to the caller; clear the local so the cleanup attribute does not free it. */
2205 empty_directories
= NULL
;
/* Frees the h entries filled in so far (presumably the failure path; its label is not visible). */
2210 bind_mount_free_many(bind_mounts
, h
);
/* Sets up the mount namespace for a command by collecting all inputs for setup_namespace() and calling it.
 *
 * Inputs gathered here:
 *  - ns_info, filled from the private_devices, protect_* and mount_apivfs settings of the context;
 *  - tmp/var, the "/tmp" subdirectories of runtime->tmp_dir and runtime->var_tmp_dir when private_tmp is set;
 *  - root_image and root_dir, taken from the context when EXEC_APPLY_CHROOT is set;
 *  - the bind mount list and empty directories from compile_bind_mounts().
 *
 * The read-only and inaccessible path lists and the protect_home/protect_system settings are only passed on
 * when needs_sandboxing is true, i.e. EXEC_APPLY_SANDBOXING is set and the command is not marked
 * EXEC_COMMAND_FULLY_PRIVILEGED. A result of -EPERM or -EACCES from setup_namespace() is logged at debug
 * level with an "ignoring" message.
 * NOTE(review): the unit parameter line, several setup_namespace() arguments and the return statements are
 * not visible in this excerpt; confirm against the full file. */
2214 static int apply_mount_namespace(
2216 ExecCommand
*command
,
2217 const ExecContext
*context
,
2218 const ExecParameters
*params
,
2219 ExecRuntime
*runtime
) {
2221 _cleanup_strv_free_
char **empty_directories
= NULL
;
2222 char *tmp
= NULL
, *var
= NULL
;
2223 const char *root_dir
= NULL
, *root_image
= NULL
;
2224 NamespaceInfo ns_info
= {
2225 .ignore_protect_paths
= false,
2226 .private_dev
= context
->private_devices
,
2227 .protect_control_groups
= context
->protect_control_groups
,
2228 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
2229 .protect_kernel_modules
= context
->protect_kernel_modules
,
2230 .mount_apivfs
= context
->mount_apivfs
,
2232 bool needs_sandboxing
;
2233 BindMount
*bind_mounts
= NULL
;
2234 unsigned n_bind_mounts
= 0;
2239 /* The runtime struct only contains the parent of the private /tmp,
2240 * which is non-accessible to world users. Inside of it there's a /tmp
2241 * that is sticky, and that's the one we want to use here. */
2243 if (context
->private_tmp
&& runtime
) {
2244 if (runtime
->tmp_dir
)
2245 tmp
= strjoina(runtime
->tmp_dir
, "/tmp");
2246 if (runtime
->var_tmp_dir
)
2247 var
= strjoina(runtime
->var_tmp_dir
, "/tmp");
/* Root image and root directory are only taken from the context when EXEC_APPLY_CHROOT is set. */
2250 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2251 root_image
= context
->root_image
;
2254 root_dir
= context
->root_directory
;
2257 r
= compile_bind_mounts(context
, params
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
/* NOTE(review): the opening and closing lines of the following explanatory comment are missing in this
 * excerpt; the three lines below are its body. */
2262 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2263 * sandbox info, otherwise enforce it, don't ignore protected paths and
2264 * fail if we are enable to apply the sandbox inside the mount namespace.
2266 if (!context
->dynamic_user
&& root_dir
)
2267 ns_info
.ignore_protect_paths
= true;
2269 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2271 r
= setup_namespace(root_dir
, root_image
,
2272 &ns_info
, context
->read_write_paths
,
2273 needs_sandboxing
? context
->read_only_paths
: NULL
,
2274 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
2280 needs_sandboxing
? context
->protect_home
: PROTECT_HOME_NO
,
2281 needs_sandboxing
? context
->protect_system
: PROTECT_SYSTEM_NO
,
2282 context
->mount_flags
,
2283 DISSECT_IMAGE_DISCARD_ON_LOOP
);
/* The compiled bind mount list is released right after the setup_namespace() call. */
2285 bind_mount_free_many(bind_mounts
, n_bind_mounts
);
2287 /* If we couldn't set up the namespace this is probably due to a
2288 * missing capability. In this case, silently proceed. */
2289 if (IN_SET(r
, -EPERM
, -EACCES
)) {
2290 log_unit_debug_errno(u
, r
, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
/* Changes into the working directory of the service and, where needed, chroot()s first.
 *
 * needs_mount_ns: when false and a root directory is configured (and EXEC_APPLY_CHROOT is set), chroot() is
 *                 called here.
 * exit_status:    asserted non-NULL; set to EXIT_CHROOT when chroot() fails and to EXIT_CHDIR on the
 *                 working-directory failures visible below.
 *
 * The directory wd comes from context->working_directory unless working_directory_home is set. A failing
 * chdir() is only treated as an error when context->working_directory_missing_ok is false.
 * NOTE(review): the home parameter, the handling of the working_directory_home case, the default for wd and
 * the branch selecting d without the root prefix are not visible in this excerpt; confirm against the full
 * file. */
2297 static int apply_working_directory(
2298 const ExecContext
*context
,
2299 const ExecParameters
*params
,
2301 const bool needs_mount_ns
,
2307 assert(exit_status
);
2309 if (context
->working_directory_home
) {
2312 *exit_status
= EXIT_CHDIR
;
2318 } else if (context
->working_directory
)
2319 wd
= context
->working_directory
;
2323 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2324 if (!needs_mount_ns
&& context
->root_directory
)
2325 if (chroot(context
->root_directory
) < 0) {
2326 *exit_status
= EXIT_CHROOT
;
/* d is wd prefixed with the configured root directory. */
2332 d
= prefix_roota(context
->root_directory
, wd
);
2334 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
2335 *exit_status
= EXIT_CHDIR
;
/* Sets up a per-service session keyring.
 *
 * Steps visible here: join a session keyring with KEYCTL_JOIN_SESSION_KEYRING; add the unit's invocation
 * ID as a "user" key named "invocation_id" and restrict its permissions; chown the keyring to uid/gid when
 * either is valid; and, for EXEC_KEYRING_SHARED, link the user keyring into the session keyring while
 * temporarily switching the real GID/UID to gid/uid.
 *
 * Nothing is set up unless EXEC_NEW_KEYRING is in p->flags, or when the keyring mode is
 * EXEC_KEYRING_INHERIT. ENOSYS, EACCES/EPERM and EDQUOT from the join call are only logged at debug level
 * ("ignoring"); other failures return the result of log_unit_error_errno().
 * NOTE(review): the unit parameter line, the early returns and the assignment of r before the
 * "Failed to link user keyring" message are not visible in this excerpt; confirm against the full file. */
2342 static int setup_keyring(
2344 const ExecContext
*context
,
2345 const ExecParameters
*p
,
2346 uid_t uid
, gid_t gid
) {
2348 key_serial_t keyring
;
2355 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2356 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2357 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2358 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2359 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2360 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2362 if (!(p
->flags
& EXEC_NEW_KEYRING
))
2365 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
/* NOTE(review): a 0 name argument presumably requests a fresh anonymous session keyring; confirm against
 * keyctl(2). */
2368 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
2369 if (keyring
== -1) {
2370 if (errno
== ENOSYS
)
2371 log_unit_debug_errno(u
, errno
, "Kernel keyring not supported, ignoring.");
2372 else if (IN_SET(errno
, EACCES
, EPERM
))
2373 log_unit_debug_errno(u
, errno
, "Kernel keyring access prohibited, ignoring.");
2374 else if (errno
== EDQUOT
)
2375 log_unit_debug_errno(u
, errno
, "Out of kernel keyrings to allocate, ignoring.");
2377 return log_unit_error_errno(u
, errno
, "Setting up kernel keyring failed: %m");
2382 /* Populate the keyring with the invocation ID by default. */
2383 if (!sd_id128_is_null(u
->invocation_id
)) {
2386 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
2388 log_unit_debug_errno(u
, errno
, "Failed to add invocation ID to keyring, ignoring: %m");
/* Limit the key to view/read/search for possessor and user. */
2390 if (keyctl(KEYCTL_SETPERM
, key
,
2391 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
2392 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
2393 return log_unit_error_errno(u
, errno
, "Failed to restrict invocation ID permission: %m");
2397 /* And now, make the keyring owned by the service's user */
2398 if (uid_is_valid(uid
) || gid_is_valid(gid
))
2399 if (keyctl(KEYCTL_CHOWN
, keyring
, uid
, gid
, 0) < 0)
2400 return log_unit_error_errno(u
, errno
, "Failed to change ownership of session keyring: %m");
2402 /* When requested link the user keyring into the session keyring. */
2403 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
2407 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2408 * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2409 * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
2411 saved_uid
= getuid();
2412 saved_gid
= getgid();
/* Switch the real GID first, then the real UID; every failure path below restores what was already
 * changed before returning. */
2414 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2415 if (setregid(gid
, -1) < 0)
2416 return log_unit_error_errno(u
, errno
, "Failed to change GID for user keyring: %m");
2419 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2420 if (setreuid(uid
, -1) < 0) {
2421 (void) setregid(saved_gid
, -1);
2422 return log_unit_error_errno(u
, errno
, "Failed to change UID for user keyring: %m");
2426 if (keyctl(KEYCTL_LINK
,
2427 KEY_SPEC_USER_KEYRING
,
2428 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
2432 (void) setreuid(saved_uid
, -1);
2433 (void) setregid(saved_gid
, -1);
2435 return log_unit_error_errno(u
, r
, "Failed to link user keyring into session keyring: %m");
/* Switch back in reverse order: UID first, then GID. */
2438 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2439 if (setreuid(saved_uid
, -1) < 0) {
2440 (void) setregid(saved_gid
, -1);
2441 return log_unit_error_errno(u
, errno
, "Failed to change UID back for user keyring: %m");
2445 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2446 if (setregid(saved_gid
, -1) < 0)
2447 return log_unit_error_errno(u
, errno
, "Failed to change GID back for user keyring: %m");
/* Appends the two descriptors of a socket pair to an fd array.
 *
 * array: destination array; must have room for two more entries.
 * n:     in/out element count of array; advanced once per stored descriptor.
 * pair:  the two descriptors to append.
 * NOTE(review): the statements between the signature and the two stores (presumably argument checks and
 * guards that skip unset descriptors) are not visible in this excerpt; confirm against the full file. */
2454 static void append_socket_pair(int *array
, unsigned *n
, int pair
[2]) {
2462 array
[(*n
)++] = pair
[0];
2464 array
[(*n
)++] = pair
[1];
/* Closes all file descriptors except those the service still needs.
 *
 * The keep-list (dont_close) is assembled from: the stdin/stdout/stderr fds in params (when >= 0),
 * socket_fd, the n_fds entries of fds, the network namespace storage socket pair of runtime, the user and
 * group storage socket pairs of dcreds, and user_lookup_fd (when >= 0). It is then handed to
 * close_all_fds(), whose result is returned.
 * NOTE(review): the user_lookup_fd/socket_fd parameter lines and the NULL/validity guards around several
 * of the appends are not visible in this excerpt; confirm against the full file. */
2467 static int close_remaining_fds(
2468 const ExecParameters
*params
,
2469 ExecRuntime
*runtime
,
2470 DynamicCreds
*dcreds
,
2473 int *fds
, unsigned n_fds
) {
2475 unsigned n_dont_close
= 0;
/* Variable-length array: n_fds slots for the passed fds plus fixed headroom of 12 for the extra entries
 * appended below (the appends visible here add at most 11). */
2476 int dont_close
[n_fds
+ 12];
2480 if (params
->stdin_fd
>= 0)
2481 dont_close
[n_dont_close
++] = params
->stdin_fd
;
2482 if (params
->stdout_fd
>= 0)
2483 dont_close
[n_dont_close
++] = params
->stdout_fd
;
2484 if (params
->stderr_fd
>= 0)
2485 dont_close
[n_dont_close
++] = params
->stderr_fd
;
2488 dont_close
[n_dont_close
++] = socket_fd
;
/* Copy the caller-supplied descriptors in one go. */
2490 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
2491 n_dont_close
+= n_fds
;
2495 append_socket_pair(dont_close
, &n_dont_close
, runtime
->netns_storage_socket
);
2499 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->user
->storage_socket
);
2501 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->group
->storage_socket
);
2504 if (user_lookup_fd
>= 0)
2505 dont_close
[n_dont_close
++] = user_lookup_fd
;
2507 return close_all_fds(dont_close
, n_dont_close
);
/* Reports the resolved UID/GID of a unit over user_lookup_fd.
 * The message is written with a single writev() of three parts: the raw uid, the raw gid and the unit's id
 * string. Two conditions are checked first: user_lookup_fd < 0, and both uid and gid being invalid
 * (presumably both lead to an early return; the statements are not visible).
 * NOTE(review): the parameter list, the return statements and the end of the explanatory comment inside the
 * body are not visible in this excerpt; confirm against the full file. */
2510 static int send_user_lookup(
2518 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2519 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2522 if (user_lookup_fd
< 0)
2525 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
2528 if (writev(user_lookup_fd
,
2530 IOVEC_INIT(&uid
, sizeof(uid
)),
2531 IOVEC_INIT(&gid
, sizeof(gid
)),
2532 IOVEC_INIT_STRING(unit
->id
) }, 3) < 0)
2538 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
2545 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
/* Parameters: c is the execution context whose working_directory_home flag is checked; uid is the user
 * the home directory is looked up for; home is an in/out pointer to the home directory string; buf
 * receives the string obtained from get_home_dir().
 * NOTE(review): the early returns, the UID 0 special case and the final assignment to *home are not
 * visible in this excerpt; confirm against the full file. */
2550 if (!c
->working_directory_home
)
2554 /* Hardcode /root as home directory for UID 0 */
2559 r
= get_home_dir(buf
);
/* Builds a string vector of directory paths for a dynamic-user unit (c->dynamic_user is asserted).
 * For every directory type except EXEC_DIRECTORY_CONFIGURATION, each configured path is turned into
 * "<prefix>/<path>" for the runtime type and "<prefix>/private/<path>" for all other types, and appended
 * to the list with strv_consume().
 * NOTE(review): the prefix check, the error handling, the hand-over of the list through ret and the end of
 * the explanatory comment inside the body are not visible in this excerpt; confirm against the full
 * file. */
2567 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
2568 _cleanup_strv_free_
char ** list
= NULL
;
2569 ExecDirectoryType t
;
2576 assert(c
->dynamic_user
);
2578 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2579 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2582 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2585 if (t
== EXEC_DIRECTORY_CONFIGURATION
)
2591 STRV_FOREACH(i
, c
->directories
[t
].paths
) {
2594 if (t
== EXEC_DIRECTORY_RUNTIME
)
2595 e
= strjoin(p
->prefix
[t
], "/", *i
);
2597 e
= strjoin(p
->prefix
[t
], "/private/", *i
);
2601 r
= strv_consume(&list
, e
);
2613 static int exec_child(
2615 ExecCommand
*command
,
2616 const ExecContext
*context
,
2617 const ExecParameters
*params
,
2618 ExecRuntime
*runtime
,
2619 DynamicCreds
*dcreds
,
2624 unsigned n_storage_fds
,
2625 unsigned n_socket_fds
,
2630 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **accum_env
= NULL
, **final_argv
= NULL
;
2631 _cleanup_free_
char *mac_selinux_context_net
= NULL
, *home_buffer
= NULL
;
2632 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
2633 const char *username
= NULL
, *groupname
= NULL
;
2634 const char *home
= NULL
, *shell
= NULL
;
2635 dev_t journal_stream_dev
= 0;
2636 ino_t journal_stream_ino
= 0;
2637 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2638 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
2639 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
2640 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
2642 bool use_selinux
= false;
2645 bool use_smack
= false;
2648 bool use_apparmor
= false;
2650 uid_t uid
= UID_INVALID
;
2651 gid_t gid
= GID_INVALID
;
2652 int i
, r
, ngids
= 0;
2654 ExecDirectoryType dt
;
2661 assert(exit_status
);
2663 rename_process_from_path(command
->path
);
2665 /* We reset exactly these signals, since they are the
2666 * only ones we set to SIG_IGN in the main daemon. All
2667 * others we leave untouched because we set them to
2668 * SIG_DFL or a valid handler initially, both of which
2669 * will be demoted to SIG_DFL. */
2670 (void) default_signals(SIGNALS_CRASH_HANDLER
,
2671 SIGNALS_IGNORE
, -1);
2673 if (context
->ignore_sigpipe
)
2674 (void) ignore_signals(SIGPIPE
, -1);
2676 r
= reset_signal_mask();
2678 *exit_status
= EXIT_SIGNAL_MASK
;
2679 return log_unit_error_errno(unit
, r
, "Failed to set process signal mask: %m");
2682 if (params
->idle_pipe
)
2683 do_idle_pipe_dance(params
->idle_pipe
);
2685 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2686 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2687 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2688 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2691 log_set_open_when_needed(true);
2693 /* In case anything used libc syslog(), close this here, too */
2696 n_fds
= n_storage_fds
+ n_socket_fds
;
2697 r
= close_remaining_fds(params
, runtime
, dcreds
, user_lookup_fd
, socket_fd
, fds
, n_fds
);
2699 *exit_status
= EXIT_FDS
;
2700 return log_unit_error_errno(unit
, r
, "Failed to close unwanted file descriptors: %m");
2703 if (!context
->same_pgrp
)
2705 *exit_status
= EXIT_SETSID
;
2706 return log_unit_error_errno(unit
, errno
, "Failed to create new process session: %m");
2709 exec_context_tty_reset(context
, params
);
2711 if (unit_shall_confirm_spawn(unit
)) {
2712 const char *vc
= params
->confirm_spawn
;
2713 _cleanup_free_
char *cmdline
= NULL
;
2715 cmdline
= exec_command_line(argv
);
2717 *exit_status
= EXIT_MEMORY
;
2721 r
= ask_for_confirmation(vc
, unit
, cmdline
);
2722 if (r
!= CONFIRM_EXECUTE
) {
2723 if (r
== CONFIRM_PRETEND_SUCCESS
) {
2724 *exit_status
= EXIT_SUCCESS
;
2727 *exit_status
= EXIT_CONFIRM
;
2728 log_unit_error(unit
, "Execution cancelled by the user");
2733 if (context
->dynamic_user
&& dcreds
) {
2734 _cleanup_strv_free_
char **suggested_paths
= NULL
;
2736 /* Make sure we bypass our own NSS module for any NSS checks */
2737 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2738 *exit_status
= EXIT_USER
;
2739 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
2742 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
2744 *exit_status
= EXIT_MEMORY
;
2748 r
= dynamic_creds_realize(dcreds
, suggested_paths
, &uid
, &gid
);
2750 *exit_status
= EXIT_USER
;
2752 log_unit_error(unit
, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2755 return log_unit_error_errno(unit
, r
, "Failed to update dynamic user credentials: %m");
2758 if (!uid_is_valid(uid
)) {
2759 *exit_status
= EXIT_USER
;
2760 log_unit_error(unit
, "UID validation failed for \""UID_FMT
"\"", uid
);
2764 if (!gid_is_valid(gid
)) {
2765 *exit_status
= EXIT_USER
;
2766 log_unit_error(unit
, "GID validation failed for \""GID_FMT
"\"", gid
);
2771 username
= dcreds
->user
->name
;
2774 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
2776 *exit_status
= EXIT_USER
;
2777 return log_unit_error_errno(unit
, r
, "Failed to determine user credentials: %m");
2780 r
= get_fixed_group(context
, &groupname
, &gid
);
2782 *exit_status
= EXIT_GROUP
;
2783 return log_unit_error_errno(unit
, r
, "Failed to determine group credentials: %m");
2787 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2788 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
2789 &supplementary_gids
, &ngids
);
2791 *exit_status
= EXIT_GROUP
;
2792 return log_unit_error_errno(unit
, r
, "Failed to determine supplementary groups: %m");
2795 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
2797 *exit_status
= EXIT_USER
;
2798 return log_unit_error_errno(unit
, r
, "Failed to send user credentials to PID1: %m");
2801 user_lookup_fd
= safe_close(user_lookup_fd
);
2803 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
2805 *exit_status
= EXIT_CHDIR
;
2806 return log_unit_error_errno(unit
, r
, "Failed to determine $HOME for user: %m");
2809 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2810 * must be sure to drop O_NONBLOCK */
2812 (void) fd_nonblock(socket_fd
, false);
2814 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
2816 *exit_status
= EXIT_STDIN
;
2817 return log_unit_error_errno(unit
, r
, "Failed to set up standard input: %m");
2820 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2822 *exit_status
= EXIT_STDOUT
;
2823 return log_unit_error_errno(unit
, r
, "Failed to set up standard output: %m");
2826 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2828 *exit_status
= EXIT_STDERR
;
2829 return log_unit_error_errno(unit
, r
, "Failed to set up standard error output: %m");
2832 if (params
->cgroup_path
) {
2833 r
= cg_attach_everywhere(params
->cgroup_supported
, params
->cgroup_path
, 0, NULL
, NULL
);
2835 *exit_status
= EXIT_CGROUP
;
2836 return log_unit_error_errno(unit
, r
, "Failed to attach to cgroup %s: %m", params
->cgroup_path
);
2840 if (context
->oom_score_adjust_set
) {
2841 char t
[DECIMAL_STR_MAX(context
->oom_score_adjust
)];
2843 /* When we can't make this change due to EPERM, then
2844 * let's silently skip over it. User namespaces
2845 * prohibit write access to this file, and we
2846 * shouldn't trip up over that. */
2848 sprintf(t
, "%i", context
->oom_score_adjust
);
2849 r
= write_string_file("/proc/self/oom_score_adj", t
, 0);
2850 if (IN_SET(r
, -EPERM
, -EACCES
))
2851 log_unit_debug_errno(unit
, r
, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2853 *exit_status
= EXIT_OOM_ADJUST
;
2854 return log_unit_error_errno(unit
, r
, "Failed to adjust OOM setting: %m");
2858 if (context
->nice_set
)
2859 if (setpriority(PRIO_PROCESS
, 0, context
->nice
) < 0) {
2860 *exit_status
= EXIT_NICE
;
2861 return log_unit_error_errno(unit
, errno
, "Failed to set up process scheduling priority (nice level): %m");
2864 if (context
->cpu_sched_set
) {
2865 struct sched_param param
= {
2866 .sched_priority
= context
->cpu_sched_priority
,
2869 r
= sched_setscheduler(0,
2870 context
->cpu_sched_policy
|
2871 (context
->cpu_sched_reset_on_fork
?
2872 SCHED_RESET_ON_FORK
: 0),
2875 *exit_status
= EXIT_SETSCHEDULER
;
2876 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU scheduling: %m");
2880 if (context
->cpuset
)
2881 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context
->cpuset_ncpus
), context
->cpuset
) < 0) {
2882 *exit_status
= EXIT_CPUAFFINITY
;
2883 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU affinity: %m");
2886 if (context
->ioprio_set
)
2887 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
2888 *exit_status
= EXIT_IOPRIO
;
2889 return log_unit_error_errno(unit
, errno
, "Failed to set up IO scheduling priority: %m");
2892 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
2893 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
2894 *exit_status
= EXIT_TIMERSLACK
;
2895 return log_unit_error_errno(unit
, errno
, "Failed to set up timer slack: %m");
2898 if (context
->personality
!= PERSONALITY_INVALID
) {
2899 r
= safe_personality(context
->personality
);
2901 *exit_status
= EXIT_PERSONALITY
;
2902 return log_unit_error_errno(unit
, r
, "Failed to set up execution domain (personality): %m");
2906 if (context
->utmp_id
)
2907 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
2909 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
2910 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
2914 if (context
->user
) {
2915 r
= chown_terminal(STDIN_FILENO
, uid
);
2917 *exit_status
= EXIT_STDIN
;
2918 return log_unit_error_errno(unit
, r
, "Failed to change ownership of terminal: %m");
2922 /* If delegation is enabled we'll pass ownership of the cgroup
2923 * (but only in systemd's own controller hierarchy!) to the
2924 * user of the new process. */
2925 if (params
->cgroup_path
&& context
->user
&& (params
->flags
& EXEC_CGROUP_DELEGATE
)) {
2926 r
= cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, 0644, uid
, gid
);
2928 *exit_status
= EXIT_CGROUP
;
2929 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
2932 r
= cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, 0755, uid
, gid
);
2934 *exit_status
= EXIT_CGROUP
;
2935 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
2939 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
2940 r
= setup_exec_directory(context
, params
, uid
, gid
, dt
, exit_status
);
2942 return log_unit_error_errno(unit
, r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
2945 r
= build_environment(
2957 *exit_status
= EXIT_MEMORY
;
2961 r
= build_pass_environment(context
, &pass_env
);
2963 *exit_status
= EXIT_MEMORY
;
2967 accum_env
= strv_env_merge(5,
2968 params
->environment
,
2971 context
->environment
,
2975 *exit_status
= EXIT_MEMORY
;
2978 accum_env
= strv_env_clean(accum_env
);
2980 (void) umask(context
->umask
);
2982 r
= setup_keyring(unit
, context
, params
, uid
, gid
);
2984 *exit_status
= EXIT_KEYRING
;
2985 return log_unit_error_errno(unit
, r
, "Failed to set up kernel keyring: %m");
2988 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
2989 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2991 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
2992 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
2994 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
2995 if (needs_ambient_hack
)
2996 needs_setuid
= false;
2998 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
3000 if (needs_sandboxing
) {
3001 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3002 * present. The actual MAC context application will happen later, as late as possible, to avoid
3003 * impacting our own code paths. */
3006 use_selinux
= mac_selinux_use();
3009 use_smack
= mac_smack_use();
3012 use_apparmor
= mac_apparmor_use();
3017 if (context
->pam_name
&& username
) {
3018 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
3020 *exit_status
= EXIT_PAM
;
3021 return log_unit_error_errno(unit
, r
, "Failed to set up PAM session: %m");
3026 if (context
->private_network
&& runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
3027 if (ns_type_supported(NAMESPACE_NET
)) {
3028 r
= setup_netns(runtime
->netns_storage_socket
);
3030 *exit_status
= EXIT_NETWORK
;
3031 return log_unit_error_errno(unit
, r
, "Failed to set up network namespacing: %m");
3034 log_unit_warning(unit
, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3037 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
3038 if (needs_mount_namespace
) {
3039 r
= apply_mount_namespace(unit
, command
, context
, params
, runtime
);
3041 *exit_status
= EXIT_NAMESPACE
;
3042 return log_unit_error_errno(unit
, r
, "Failed to set up mount namespacing: %m");
3046 /* Apply just after mount namespace setup */
3047 r
= apply_working_directory(context
, params
, home
, needs_mount_namespace
, exit_status
);
3049 return log_unit_error_errno(unit
, r
, "Changing to the requested working directory failed: %m");
3051 /* Drop groups as early as possible */
3053 r
= enforce_groups(gid
, supplementary_gids
, ngids
);
3055 *exit_status
= EXIT_GROUP
;
3056 return log_unit_error_errno(unit
, r
, "Changing group credentials failed: %m");
3060 if (needs_sandboxing
) {
3062 if (use_selinux
&& params
->selinux_context_net
&& socket_fd
>= 0) {
3063 r
= mac_selinux_get_child_mls_label(socket_fd
, command
->path
, context
->selinux_context
, &mac_selinux_context_net
);
3065 *exit_status
= EXIT_SELINUX_CONTEXT
;
3066 return log_unit_error_errno(unit
, r
, "Failed to determine SELinux context: %m");
3071 if (context
->private_users
) {
3072 r
= setup_private_users(uid
, gid
);
3074 *exit_status
= EXIT_USER
;
3075 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing: %m");
3080 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3081 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3082 * was needed to upload the policy and can now be closed as well. */
3083 r
= close_all_fds(fds
, n_fds
);
3085 r
= shift_fds(fds
, n_fds
);
3087 r
= flags_fds(fds
, n_storage_fds
, n_socket_fds
, context
->non_blocking
);
3089 *exit_status
= EXIT_FDS
;
3090 return log_unit_error_errno(unit
, r
, "Failed to adjust passed file descriptors: %m");
3093 secure_bits
= context
->secure_bits
;
3095 if (needs_sandboxing
) {
3098 for (i
= 0; i
< _RLIMIT_MAX
; i
++) {
3100 if (!context
->rlimit
[i
])
3103 r
= setrlimit_closest(i
, context
->rlimit
[i
]);
3105 *exit_status
= EXIT_LIMITS
;
3106 return log_unit_error_errno(unit
, r
, "Failed to adjust resource limit %s: %m", rlimit_to_string(i
));
3110 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3111 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
3112 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
3113 *exit_status
= EXIT_LIMITS
;
3114 return log_unit_error_errno(unit
, errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3118 bset
= context
->capability_bounding_set
;
3119 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3120 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3121 * instead of us doing that */
3122 if (needs_ambient_hack
)
3123 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
3124 (UINT64_C(1) << CAP_SETUID
) |
3125 (UINT64_C(1) << CAP_SETGID
);
3127 if (!cap_test_all(bset
)) {
3128 r
= capability_bounding_set_drop(bset
, false);
3130 *exit_status
= EXIT_CAPABILITIES
;
3131 return log_unit_error_errno(unit
, r
, "Failed to drop capabilities: %m");
3135 /* This is done before enforce_user, but ambient set
3136 * does not survive over setresuid() if keep_caps is not set. */
3137 if (!needs_ambient_hack
&&
3138 context
->capability_ambient_set
!= 0) {
3139 r
= capability_ambient_set_apply(context
->capability_ambient_set
, true);
3141 *exit_status
= EXIT_CAPABILITIES
;
3142 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (before UID change): %m");
3148 if (context
->user
) {
3149 r
= enforce_user(context
, uid
);
3151 *exit_status
= EXIT_USER
;
3152 return log_unit_error_errno(unit
, r
, "Failed to change UID to " UID_FMT
": %m", uid
);
3155 if (!needs_ambient_hack
&&
3156 context
->capability_ambient_set
!= 0) {
3158 /* Fix the ambient capabilities after user change. */
3159 r
= capability_ambient_set_apply(context
->capability_ambient_set
, false);
3161 *exit_status
= EXIT_CAPABILITIES
;
3162 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (after UID change): %m");
3165 /* If we were asked to change user and ambient capabilities
3166 * were requested, we had to add keep-caps to the securebits
3167 * so that we would maintain the inherited capability set
3168 * through the setresuid(). Make sure that the bit is added
3169 * also to the context secure_bits so that we don't try to
3170 * drop the bit away next. */
3172 secure_bits
|= 1<<SECURE_KEEP_CAPS
;
3177 if (needs_sandboxing
) {
3178 /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3179 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3180 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3181 * are restricted. */
3185 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
3188 r
= setexeccon(exec_context
);
3190 *exit_status
= EXIT_SELINUX_CONTEXT
;
3191 return log_unit_error_errno(unit
, r
, "Failed to change SELinux context to %s: %m", exec_context
);
3199 r
= setup_smack(context
, command
);
3201 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
3202 return log_unit_error_errno(unit
, r
, "Failed to set SMACK process label: %m");
3208 if (use_apparmor
&& context
->apparmor_profile
) {
3209 r
= aa_change_onexec(context
->apparmor_profile
);
3210 if (r
< 0 && !context
->apparmor_profile_ignore
) {
3211 *exit_status
= EXIT_APPARMOR_PROFILE
;
3212 return log_unit_error_errno(unit
, errno
, "Failed to prepare AppArmor profile change to %s: %m", context
->apparmor_profile
);
3217 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3218 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3219 if (prctl(PR_GET_SECUREBITS
) != secure_bits
)
3220 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
3221 *exit_status
= EXIT_SECUREBITS
;
3222 return log_unit_error_errno(unit
, errno
, "Failed to set process secure bits: %m");
3225 if (context_has_no_new_privileges(context
))
3226 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
3227 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
3228 return log_unit_error_errno(unit
, errno
, "Failed to disable new privileges: %m");
3232 r
= apply_address_families(unit
, context
);
3234 *exit_status
= EXIT_ADDRESS_FAMILIES
;
3235 return log_unit_error_errno(unit
, r
, "Failed to restrict address families: %m");
3238 r
= apply_memory_deny_write_execute(unit
, context
);
3240 *exit_status
= EXIT_SECCOMP
;
3241 return log_unit_error_errno(unit
, r
, "Failed to disable writing to executable memory: %m");
3244 r
= apply_restrict_realtime(unit
, context
);
3246 *exit_status
= EXIT_SECCOMP
;
3247 return log_unit_error_errno(unit
, r
, "Failed to apply realtime restrictions: %m");
3250 r
= apply_restrict_namespaces(unit
, context
);
3252 *exit_status
= EXIT_SECCOMP
;
3253 return log_unit_error_errno(unit
, r
, "Failed to apply namespace restrictions: %m");
3256 r
= apply_protect_sysctl(unit
, context
);
3258 *exit_status
= EXIT_SECCOMP
;
3259 return log_unit_error_errno(unit
, r
, "Failed to apply sysctl restrictions: %m");
3262 r
= apply_protect_kernel_modules(unit
, context
);
3264 *exit_status
= EXIT_SECCOMP
;
3265 return log_unit_error_errno(unit
, r
, "Failed to apply module loading restrictions: %m");
3268 r
= apply_private_devices(unit
, context
);
3270 *exit_status
= EXIT_SECCOMP
;
3271 return log_unit_error_errno(unit
, r
, "Failed to set up private devices: %m");
3274 r
= apply_syscall_archs(unit
, context
);
3276 *exit_status
= EXIT_SECCOMP
;
3277 return log_unit_error_errno(unit
, r
, "Failed to apply syscall architecture restrictions: %m");
3280 r
= apply_lock_personality(unit
, context
);
3282 *exit_status
= EXIT_SECCOMP
;
3283 return log_unit_error_errno(unit
, r
, "Failed to lock personalities: %m");
3286 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3287 * by the filter as little as possible. */
3288 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
3290 *exit_status
= EXIT_SECCOMP
;
3291 return log_unit_error_errno(unit
, r
, "Failed to apply system call filters: %m");
3296 if (!strv_isempty(context
->unset_environment
)) {
3299 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
3301 *exit_status
= EXIT_MEMORY
;
3305 strv_free(accum_env
);
3309 final_argv
= replace_env_argv(argv
, accum_env
);
3311 *exit_status
= EXIT_MEMORY
;
3315 if (_unlikely_(log_get_max_level() >= LOG_DEBUG
)) {
3316 _cleanup_free_
char *line
;
3318 line
= exec_command_line(final_argv
);
3320 log_struct(LOG_DEBUG
,
3321 "EXECUTABLE=%s", command
->path
,
3322 LOG_UNIT_MESSAGE(unit
, "Executing: %s", line
),
3324 LOG_UNIT_INVOCATION_ID(unit
),
3329 execve(command
->path
, final_argv
, accum_env
);
3331 if (errno
== ENOENT
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
)) {
3333 log_struct_errno(LOG_INFO
, errno
,
3334 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3336 LOG_UNIT_INVOCATION_ID(unit
),
3337 LOG_UNIT_MESSAGE(unit
, "Executable %s missing, skipping: %m",
3339 "EXECUTABLE=%s", command
->path
,
3345 *exit_status
= EXIT_EXEC
;
3346 return log_unit_error_errno(unit
, errno
, "Failed to execute command: %m");
/* Entry point for starting one command of a unit. As far as visible here it: sanity-checks the fds passed in
 * params, resolves named stdio fds through exec_context_named_iofds(), loads the environment files through
 * exec_context_load_environment(), logs the command line, and calls exec_child() for the actual process
 * setup. The failures visible here are logged with log_unit_error_errno() and that call's result is
 * returned. Afterwards the new PID is attached to params->cgroup_path (if set) and recorded with
 * exec_status_start().
 *
 * NOTE(review): the fork call itself and the final return statement are not visible in this listing; the
 * "Failed to fork" / "Forked %s as" messages indicate where the fork sits — confirm against the full file. */
3349 int exec_spawn(Unit
*unit
,
3350 ExecCommand
*command
,
3351 const ExecContext
*context
,
3352 const ExecParameters
*params
,
3353 ExecRuntime
*runtime
,
3354 DynamicCreds
*dcreds
,
3357 _cleanup_strv_free_
char **files_env
= NULL
;
3359 unsigned n_storage_fds
= 0, n_socket_fds
= 0;
3360 _cleanup_free_
char *line
= NULL
;
/* -1 marks "no named fd resolved yet" for stdin/stdout/stderr; exec_context_named_iofds() only fills
 * slots that are still negative. */
3362 int named_iofds
[3] = { -1, -1, -1 };
/* A fd array must be supplied whenever the fd counts add up to something non-zero. */
3371 assert(params
->fds
|| (params
->n_storage_fds
+ params
->n_socket_fds
<= 0));
/* If any standard stream is configured as a socket, exactly one socket fd is expected: more than one or
 * none is logged as an error, otherwise params->fds[0] becomes socket_fd. */
3373 if (context
->std_input
== EXEC_INPUT_SOCKET
||
3374 context
->std_output
== EXEC_OUTPUT_SOCKET
||
3375 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
3377 if (params
->n_socket_fds
> 1) {
3378 log_unit_error(unit
, "Got more than one socket.");
3382 if (params
->n_socket_fds
== 0) {
3383 log_unit_error(unit
, "Got no socket.");
3387 socket_fd
= params
->fds
[0];
3391 n_storage_fds
= params
->n_storage_fds
;
3392 n_socket_fds
= params
->n_socket_fds
;
3395 r
= exec_context_named_iofds(unit
, context
, params
, named_iofds
);
3397 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
3399 r
= exec_context_load_environment(unit
, context
, &files_env
);
3401 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
/* An argv override in params takes precedence over the command's own argv. */
3403 argv
= params
->argv
?: command
->argv
;
3404 line
= exec_command_line(argv
);
3408 log_struct(LOG_DEBUG
,
3409 LOG_UNIT_MESSAGE(unit
, "About to execute: %s", line
),
3410 "EXECUTABLE=%s", command
->path
,
3412 LOG_UNIT_INVOCATION_ID(unit
),
3417 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
/* exit_status starts out as EXIT_SUCCESS and is passed to exit_status_to_string() for the
 * "Failed at step %s" message when exec_child() reports a failure. */
3420 int exit_status
= EXIT_SUCCESS
;
3422 r
= exec_child(unit
,
3435 unit
->manager
->user_lookup_fds
[1],
3439 log_struct_errno(LOG_ERR
, r
,
3440 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3442 LOG_UNIT_INVOCATION_ID(unit
),
3443 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
3444 exit_status_to_string(exit_status
, EXIT_STATUS_SYSTEMD
),
3446 "EXECUTABLE=%s", command
->path
,
3453 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
3455 /* We add the new process to the cgroup both in the child (so
3456 * that we can be sure that no user code is ever executed
3457 * outside of the cgroup) and in the parent (so that we can be
3458 * sure that when we kill the cgroup the process will be
3460 if (params
->cgroup_path
)
3461 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, pid
);
3463 exec_status_start(&command
->exec_status
, pid
);
/* Sets the members of an ExecContext visible below to their defaults:
 *  - I/O priority: class IOPRIO_CLASS_BE with data 0; CPU policy: SCHED_OTHER
 *  - syslog priority LOG_DAEMON|LOG_INFO, with the level prefix enabled
 *  - SIGPIPE ignored
 *  - timer slack NSEC_INFINITY and personality PERSONALITY_INVALID (exec_child() only applies either
 *    setting when it differs from these values)
 *  - mode 0755 for every exec directory type
 *  - capability bounding set CAP_ALL and restrict_namespaces NAMESPACE_FLAGS_ALL
 *  - log_level_max -1 (exec_context_dump() only prints it when >= 0) */
3469 void exec_context_init(ExecContext
*c
) {
3470 ExecDirectoryType i
;
3475 c
->ioprio
= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 0);
3476 c
->cpu_sched_policy
= SCHED_OTHER
;
3477 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
3478 c
->syslog_level_prefix
= true;
3479 c
->ignore_sigpipe
= true;
3480 c
->timer_slack_nsec
= NSEC_INFINITY
;
3481 c
->personality
= PERSONALITY_INVALID
;
3482 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3483 c
->directories
[i
].mode
= 0755;
3484 c
->capability_bounding_set
= CAP_ALL
;
3485 c
->restrict_namespaces
= NAMESPACE_FLAGS_ALL
;
3486 c
->log_level_max
= -1;
/* Releases what an ExecContext owns. Each freed member is overwritten with the return value of its free
 * helper (strv_free(), mfree(), hashmap_free(), set_free()), so no stale pointer is left behind
 * (presumably those helpers return NULL — confirm). log_level_max is put back to -1 and the extra log
 * fields are dropped via exec_context_free_log_extra_fields(). */
3489 void exec_context_done(ExecContext
*c
) {
3490 ExecDirectoryType i
;
/* Environment-related string vectors. */
3495 c
->environment
= strv_free(c
->environment
);
3496 c
->environment_files
= strv_free(c
->environment_files
);
3497 c
->pass_environment
= strv_free(c
->pass_environment
);
3498 c
->unset_environment
= strv_free(c
->unset_environment
);
/* Every slot of the rlimit array, then the three stdio fd names. */
3500 for (l
= 0; l
< ELEMENTSOF(c
->rlimit
); l
++)
3501 c
->rlimit
[l
] = mfree(c
->rlimit
[l
]);
3503 for (l
= 0; l
< 3; l
++)
3504 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
/* Plain string members. */
3506 c
->working_directory
= mfree(c
->working_directory
);
3507 c
->root_directory
= mfree(c
->root_directory
);
3508 c
->root_image
= mfree(c
->root_image
);
3509 c
->tty_path
= mfree(c
->tty_path
);
3510 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
3511 c
->user
= mfree(c
->user
);
3512 c
->group
= mfree(c
->group
);
3514 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
3516 c
->pam_name
= mfree(c
->pam_name
);
/* Path lists and bind mounts. */
3518 c
->read_only_paths
= strv_free(c
->read_only_paths
);
3519 c
->read_write_paths
= strv_free(c
->read_write_paths
);
3520 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
3522 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
/* NOTE(review): no visible line resets c->cpuset or the bind-mount members after freeing — check the
 * full file for whether that happens on a line not shown here. */
3525 CPU_FREE(c
->cpuset
);
3527 c
->utmp_id
= mfree(c
->utmp_id
);
3528 c
->selinux_context
= mfree(c
->selinux_context
);
3529 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
3530 c
->smack_process_label
= mfree(c
->smack_process_label
);
/* Syscall filter / architecture / address family containers. */
3532 c
->syscall_filter
= hashmap_free(c
->syscall_filter
);
3533 c
->syscall_archs
= set_free(c
->syscall_archs
);
3534 c
->address_families
= set_free(c
->address_families
);
/* Path lists of every exec directory type. */
3536 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3537 c
->directories
[i
].paths
= strv_free(c
->directories
[i
].paths
);
3539 c
->log_level_max
= -1;
3541 exec_context_free_log_extra_fields(c
);
/* Deletes the runtime directories configured in the context: for every entry of
 * c->directories[EXEC_DIRECTORY_RUNTIME].paths the path "<runtime_prefix>/<entry>" is assembled with
 * strjoin() and removed with rm_rf(..., REMOVE_ROOT); the result of rm_rf() is deliberately ignored.
 * A NULL runtime_prefix is special-cased before the loop.
 *
 * NOTE(review): the statement executed for a NULL prefix, the handling of a failed strjoin() and the
 * function's return values are not visible in this listing — confirm against the full file. */
3544 int exec_context_destroy_runtime_directory(ExecContext
*c
, const char *runtime_prefix
) {
3549 if (!runtime_prefix
)
3552 STRV_FOREACH(i
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) {
3553 _cleanup_free_
char *p
;
3555 p
= strjoin(runtime_prefix
, "/", *i
);
3559 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3561 (void) rm_rf(p
, REMOVE_ROOT
);
/* Releases the contents of one ExecCommand: path is freed with mfree() and argv with strv_free(), and
 * each member is overwritten with the helper's return value. The ExecCommand object itself is not freed
 * by any line visible here. */
3567 void exec_command_done(ExecCommand
*c
) {
3570 c
->path
= mfree(c
->path
);
3572 c
->argv
= strv_free(c
->argv
);
/* Calls exec_command_done() on each of the n ExecCommand objects stored contiguously starting at c
 * (elements c+0 .. c+n-1). */
3575 void exec_command_done_array(ExecCommand
*c
, unsigned n
) {
3578 for (i
= 0; i
< n
; i
++)
3579 exec_command_done(c
+i
);
/* Tears down a linked list of commands headed by c: each element i is unlinked with
 * LIST_REMOVE(command, c, i) and its contents released with exec_command_done().
 *
 * NOTE(review): the loop construct, the freeing of the element itself and the return statement are not
 * visible in this listing. exec_command_free_array() stores the return value back into the list head,
 * so it is presumably NULL — confirm. */
3582 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
3586 LIST_REMOVE(command
, c
, i
);
3587 exec_command_done(i
);
/* Frees n command lists: every slot c[i] is passed to exec_command_free_list() and then replaced by
 * that function's return value. */
3594 void exec_command_free_array(ExecCommand
**c
, unsigned n
) {
3597 for (i
= 0; i
< n
; i
++)
3598 c
[i
] = exec_command_free_list(c
[i
]);
/* Context passed as userdata to invalid_env(), which reads its `unit` and `path` members when logging a
 * rejected environment assignment. (The member declarations are not visible in this listing.) */
3601 typedef struct InvalidEnvInfo
{
/* Callback handed to strv_env_clean_with_callback() by exec_context_load_environment(): logs the
 * rejected assignment p, together with info->path, as an error against info->unit. userdata must point
 * to an InvalidEnvInfo. */
3606 static void invalid_env(const char *p
, void *userdata
) {
3607 InvalidEnvInfo
*info
= userdata
;
3609 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
/* Returns the fd name configured for one of the standard streams. For stdin, stdout and stderr the
 * matching setting (std_input, std_output, std_error) is compared against the *_NAMED_FD mode; if the
 * stream is in that mode the result is c->stdio_fdname[...] or, when that is unset, the literal
 * "stdin" / "stdout" / "stderr".
 *
 * NOTE(review): the dispatch on fd_index and the value returned when a stream is not in named-fd mode
 * are not visible in this listing. exec_context_named_iofds() calls this with indexes 0..2 and tests
 * the result for NULL before comparing, so the fallback is presumably NULL — confirm. */
3612 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
3617 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
3619 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
3621 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
3623 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
3625 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
3627 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
/* Resolves the named file descriptors requested for stdin/stdout/stderr.
 *
 * The fds passed in p (p->n_storage_fds + p->n_socket_fds entries, with names in p->fd_names) are
 * scanned in order. For each one, the first standard stream that is in *_NAMED_FD mode, has no fd
 * assigned yet (named_iofds[] slot < 0) and whose expected name equals the fd's name gets that fd.
 * The caller is expected to pre-fill named_iofds[] with negative values (exec_spawn() uses -1).
 *
 * Returns 0 if targets is 0 at the end, -ENOENT otherwise.
 *
 * NOTE(review): the lines that decrement targets after a match are not visible in this listing —
 * presumably one decrement per resolved stream; confirm. */
3633 int exec_context_named_iofds(Unit
*unit
, const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[3]) {
3634 unsigned i
, targets
;
3635 const char* stdio_fdname
[3];
/* Number of standard streams that are configured in named-fd mode (each comparison adds 0 or 1). */
3641 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
3642 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
3643 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
/* Look up the expected fd name of each stream once, up front. */
3645 for (i
= 0; i
< 3; i
++)
3646 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
3648 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
/* Stop scanning as soon as nothing is left to resolve. The else-if chain lets one passed fd satisfy
 * at most one stream. */
3650 for (i
= 0; i
< n_fds
&& targets
> 0; i
++)
3651 if (named_iofds
[STDIN_FILENO
] < 0 &&
3652 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
3653 stdio_fdname
[STDIN_FILENO
] &&
3654 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
3656 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
3659 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
3660 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
3661 stdio_fdname
[STDOUT_FILENO
] &&
3662 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
3664 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
3667 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
3668 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
3669 stdio_fdname
[STDERR_FILENO
] &&
3670 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
3672 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
3676 return targets
== 0 ? 0 : -ENOENT
;
/* Loads the environment files listed in c->environment_files. Each entry is checked with
 * path_is_absolute() and expanded with safe_glob(); every matching file is read with load_env_file(),
 * cleaned with strv_env_clean_with_callback() (rejected assignments are reported through
 * invalid_env() together with the file name) and merged into the accumulated vector r with
 * strv_env_merge().
 *
 * NOTE(review): how fn is derived from the list entry, what the `ignore` flag controls, the error
 * paths and the hand-over of the result through l are not visible in this listing — confirm against
 * the full file before relying on any of them. */
3679 int exec_context_load_environment(Unit
*unit
, const ExecContext
*c
, char ***l
) {
3680 char **i
, **r
= NULL
;
3685 STRV_FOREACH(i
, c
->environment_files
) {
3689 bool ignore
= false;
/* pglob is declared with a cleanup attribute and starts out zero-initialized. */
3691 _cleanup_globfree_ glob_t pglob
= {};
3700 if (!path_is_absolute(fn
)) {
3708 /* Filename supports globbing, take all matching files */
3709 k
= safe_glob(fn
, 0, &pglob
);
3718 /* When we don't match anything, -ENOENT should be returned */
3719 assert(pglob
.gl_pathc
> 0);
3721 for (n
= 0; n
< pglob
.gl_pathc
; n
++) {
3722 k
= load_env_file(NULL
, pglob
.gl_pathv
[n
], NULL
, &p
);
3730 /* Log invalid environment variables with filename */
3732 InvalidEnvInfo info
= {
3734 .path
= pglob
.gl_pathv
[n
]
3737 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
/* Combine what was accumulated so far (r) with this file's assignments (p). */
3745 m
= strv_env_merge(2, r
, p
);
/* Conservative check whether the given tty may be the same device as the console. The name is first
 * passed through skip_dev_prefix(); the literal "console" is treated as a trivial match; otherwise the
 * console is resolved with resolve_dev_console() and compared: equal names match, and if the console
 * resolves to "tty0" any tty for which tty_is_vc() holds may match as well. Per the comment below, a
 * failed resolution is treated as "may match".
 *
 * NOTE(review): the return statements of the early-exit branches are not visible in this listing. */
3761 static bool tty_may_match_dev_console(const char *tty
) {
3762 _cleanup_free_
char *active
= NULL
;
3768 tty
= skip_dev_prefix(tty
);
3770 /* trivial identity? */
3771 if (streq(tty
, "console"))
3774 console
= resolve_dev_console(&active
);
3775 /* if we could not resolve, assume it may */
3779 /* "tty0" means the active VC, so it may be the same sometimes */
3780 return streq(console
, tty
) || (streq(console
, "tty0") && tty_is_vc(tty
));
/* Tells whether running something with this context may touch the console. Two conditions must both
 * hold: at least one terminal-affecting setting is active (visible here: tty_reset,
 * tty_vt_disallocate, or a terminal-backed std_input / std_output / std_error), and the tty path
 * reported by exec_context_tty_path() may match the console per tty_may_match_dev_console().
 *
 * NOTE(review): the embedded numbering skips a line inside the || chain, so the full file may test one
 * more setting than shown here. */
3783 bool exec_context_may_touch_console(ExecContext
*ec
) {
3785 return (ec
->tty_reset
||
3787 ec
->tty_vt_disallocate
||
3788 is_terminal_input(ec
->std_input
) ||
3789 is_terminal_output(ec
->std_output
) ||
3790 is_terminal_output(ec
->std_error
)) &&
3791 tty_may_match_dev_console(exec_context_tty_path(ec
));
/* Writes strings to f, each one preceded by a single space (format " %s") and without a trailing
 * newline.
 *
 * NOTE(review): the loop that yields g is not visible in this listing — presumably it iterates over
 * the string vector l; confirm. */
3794 static void strv_fprintf(FILE *f
, char **l
) {
3800 fprintf(f
, " %s", *g
);
3803 void exec_context_dump(ExecContext
*c
, FILE* f
, const char *prefix
) {
3804 ExecDirectoryType dt
;
3812 prefix
= strempty(prefix
);
3816 "%sWorkingDirectory: %s\n"
3817 "%sRootDirectory: %s\n"
3818 "%sNonBlocking: %s\n"
3819 "%sPrivateTmp: %s\n"
3820 "%sPrivateDevices: %s\n"
3821 "%sProtectKernelTunables: %s\n"
3822 "%sProtectKernelModules: %s\n"
3823 "%sProtectControlGroups: %s\n"
3824 "%sPrivateNetwork: %s\n"
3825 "%sPrivateUsers: %s\n"
3826 "%sProtectHome: %s\n"
3827 "%sProtectSystem: %s\n"
3828 "%sMountAPIVFS: %s\n"
3829 "%sIgnoreSIGPIPE: %s\n"
3830 "%sMemoryDenyWriteExecute: %s\n"
3831 "%sRestrictRealtime: %s\n"
3832 "%sKeyringMode: %s\n",
3834 prefix
, c
->working_directory
? c
->working_directory
: "/",
3835 prefix
, c
->root_directory
? c
->root_directory
: "/",
3836 prefix
, yes_no(c
->non_blocking
),
3837 prefix
, yes_no(c
->private_tmp
),
3838 prefix
, yes_no(c
->private_devices
),
3839 prefix
, yes_no(c
->protect_kernel_tunables
),
3840 prefix
, yes_no(c
->protect_kernel_modules
),
3841 prefix
, yes_no(c
->protect_control_groups
),
3842 prefix
, yes_no(c
->private_network
),
3843 prefix
, yes_no(c
->private_users
),
3844 prefix
, protect_home_to_string(c
->protect_home
),
3845 prefix
, protect_system_to_string(c
->protect_system
),
3846 prefix
, yes_no(c
->mount_apivfs
),
3847 prefix
, yes_no(c
->ignore_sigpipe
),
3848 prefix
, yes_no(c
->memory_deny_write_execute
),
3849 prefix
, yes_no(c
->restrict_realtime
),
3850 prefix
, exec_keyring_mode_to_string(c
->keyring_mode
));
3853 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
3855 STRV_FOREACH(e
, c
->environment
)
3856 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
3858 STRV_FOREACH(e
, c
->environment_files
)
3859 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
3861 STRV_FOREACH(e
, c
->pass_environment
)
3862 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
3864 STRV_FOREACH(e
, c
->unset_environment
)
3865 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
3867 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
3869 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
3870 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
3872 STRV_FOREACH(d
, c
->directories
[dt
].paths
)
3873 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), *d
);
3881 if (c
->oom_score_adjust_set
)
3883 "%sOOMScoreAdjust: %i\n",
3884 prefix
, c
->oom_score_adjust
);
3886 for (i
= 0; i
< RLIM_NLIMITS
; i
++)
3888 fprintf(f
, "%s%s: " RLIM_FMT
"\n",
3889 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
3890 fprintf(f
, "%s%sSoft: " RLIM_FMT
"\n",
3891 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
3894 if (c
->ioprio_set
) {
3895 _cleanup_free_
char *class_str
= NULL
;
3897 r
= ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c
->ioprio
), &class_str
);
3899 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
3901 fprintf(f
, "%sIOPriority: %lu\n", prefix
, IOPRIO_PRIO_DATA(c
->ioprio
));
3904 if (c
->cpu_sched_set
) {
3905 _cleanup_free_
char *policy_str
= NULL
;
3907 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
3909 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
3912 "%sCPUSchedulingPriority: %i\n"
3913 "%sCPUSchedulingResetOnFork: %s\n",
3914 prefix
, c
->cpu_sched_priority
,
3915 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
3919 fprintf(f
, "%sCPUAffinity:", prefix
);
3920 for (i
= 0; i
< c
->cpuset_ncpus
; i
++)
3921 if (CPU_ISSET_S(i
, CPU_ALLOC_SIZE(c
->cpuset_ncpus
), c
->cpuset
))
3922 fprintf(f
, " %u", i
);
3926 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
3927 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
3930 "%sStandardInput: %s\n"
3931 "%sStandardOutput: %s\n"
3932 "%sStandardError: %s\n",
3933 prefix
, exec_input_to_string(c
->std_input
),
3934 prefix
, exec_output_to_string(c
->std_output
),
3935 prefix
, exec_output_to_string(c
->std_error
));
3941 "%sTTYVHangup: %s\n"
3942 "%sTTYVTDisallocate: %s\n",
3943 prefix
, c
->tty_path
,
3944 prefix
, yes_no(c
->tty_reset
),
3945 prefix
, yes_no(c
->tty_vhangup
),
3946 prefix
, yes_no(c
->tty_vt_disallocate
));
3948 if (IN_SET(c
->std_output
,
3951 EXEC_OUTPUT_JOURNAL
,
3952 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
3953 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
3954 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
3955 IN_SET(c
->std_error
,
3958 EXEC_OUTPUT_JOURNAL
,
3959 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
3960 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
3961 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
3963 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
3965 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
3967 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
3969 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
3971 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
3974 if (c
->log_level_max
>= 0) {
3975 _cleanup_free_
char *t
= NULL
;
3977 (void) log_level_to_string_alloc(c
->log_level_max
, &t
);
3979 fprintf(f
, "%sLogLevelMax: %s\n", prefix
, strna(t
));
3982 if (c
->n_log_extra_fields
> 0) {
3985 for (j
= 0; j
< c
->n_log_extra_fields
; j
++) {
3986 fprintf(f
, "%sLogExtraFields: ", prefix
);
3987 fwrite(c
->log_extra_fields
[j
].iov_base
,
3988 1, c
->log_extra_fields
[j
].iov_len
,
3994 if (c
->secure_bits
) {
3995 _cleanup_free_
char *str
= NULL
;
3997 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
3999 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
4002 if (c
->capability_bounding_set
!= CAP_ALL
) {
4003 _cleanup_free_
char *str
= NULL
;
4005 r
= capability_set_to_string_alloc(c
->capability_bounding_set
, &str
);
4007 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
4010 if (c
->capability_ambient_set
!= 0) {
4011 _cleanup_free_
char *str
= NULL
;
4013 r
= capability_set_to_string_alloc(c
->capability_ambient_set
, &str
);
4015 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
4019 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
4021 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
4023 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
4025 if (!strv_isempty(c
->supplementary_groups
)) {
4026 fprintf(f
, "%sSupplementaryGroups:", prefix
);
4027 strv_fprintf(f
, c
->supplementary_groups
);
4032 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
4034 if (strv_length(c
->read_write_paths
) > 0) {
4035 fprintf(f
, "%sReadWritePaths:", prefix
);
4036 strv_fprintf(f
, c
->read_write_paths
);
4040 if (strv_length(c
->read_only_paths
) > 0) {
4041 fprintf(f
, "%sReadOnlyPaths:", prefix
);
4042 strv_fprintf(f
, c
->read_only_paths
);
4046 if (strv_length(c
->inaccessible_paths
) > 0) {
4047 fprintf(f
, "%sInaccessiblePaths:", prefix
);
4048 strv_fprintf(f
, c
->inaccessible_paths
);
4052 if (c
->n_bind_mounts
> 0)
4053 for (i
= 0; i
< c
->n_bind_mounts
; i
++) {
4054 fprintf(f
, "%s%s: %s:%s:%s\n", prefix
,
4055 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
4056 c
->bind_mounts
[i
].source
,
4057 c
->bind_mounts
[i
].destination
,
4058 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
4063 "%sUtmpIdentifier: %s\n",
4064 prefix
, c
->utmp_id
);
4066 if (c
->selinux_context
)
4068 "%sSELinuxContext: %s%s\n",
4069 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
4071 if (c
->apparmor_profile
)
4073 "%sAppArmorProfile: %s%s\n",
4074 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
4076 if (c
->smack_process_label
)
4078 "%sSmackProcessLabel: %s%s\n",
4079 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
4081 if (c
->personality
!= PERSONALITY_INVALID
)
4083 "%sPersonality: %s\n",
4084 prefix
, strna(personality_to_string(c
->personality
)));
4087 "%sLockPersonality: %s\n",
4088 prefix
, yes_no(c
->lock_personality
));
4090 if (c
->syscall_filter
) {
4098 "%sSystemCallFilter: ",
4101 if (!c
->syscall_whitelist
)
4105 HASHMAP_FOREACH_KEY(val
, id
, c
->syscall_filter
, j
) {
4106 _cleanup_free_
char *name
= NULL
;
4107 const char *errno_name
= NULL
;
4108 int num
= PTR_TO_INT(val
);
4115 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
4116 fputs(strna(name
), f
);
4119 errno_name
= errno_to_name(num
);
4121 fprintf(f
, ":%s", errno_name
);
4123 fprintf(f
, ":%d", num
);
4131 if (c
->syscall_archs
) {
4138 "%sSystemCallArchitectures:",
4142 SET_FOREACH(id
, c
->syscall_archs
, j
)
4143 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
4148 if (exec_context_restrict_namespaces_set(c
)) {
4149 _cleanup_free_
char *s
= NULL
;
4151 r
= namespace_flag_to_string_many(c
->restrict_namespaces
, &s
);
4153 fprintf(f
, "%sRestrictNamespaces: %s\n",
4157 if (c
->syscall_errno
> 0) {
4158 const char *errno_name
;
4160 fprintf(f
, "%sSystemCallErrorNumber: ", prefix
);
4162 errno_name
= errno_to_name(c
->syscall_errno
);
4164 fprintf(f
, "%s\n", errno_name
);
4166 fprintf(f
, "%d\n", c
->syscall_errno
);
4169 if (c
->apparmor_profile
)
4171 "%sAppArmorProfile: %s%s\n",
4172 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
/* Tells the caller whether a process started from this context keeps its
 * privileges. Only part of the logic is visible in this excerpt: the
 * configured user string is compared against the two spellings of root. */
4175 bool exec_context_maintains_privileges(ExecContext
*c
) {
4178 /* Returns true if the process forked off would run under
4179 * an unchanged UID or as root. */
/* Root is matched both by name ("root") and by its numeric string ("0"). */
4184 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
/* Computes the I/O priority value that applies to this context. The visible
 * code queries ioprio_get(IOPRIO_WHO_PROCESS, 0) and has a return path that
 * yields best-effort class, level 4 -- presumably the fallback when the query
 * fails (the guarding condition is not visible in this excerpt; confirm). */
4190 int exec_context_get_effective_ioprio(ExecContext
*c
) {
4198 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
4200 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 4);
/* Releases all extra log fields stored in the context: frees every entry's
 * iov_base buffer, then the array itself, and resets the element count. */
4205 void exec_context_free_log_extra_fields(ExecContext
*c
) {
/* Free the payload of each of the n_log_extra_fields entries first. */
4210 for (l
= 0; l
< c
->n_log_extra_fields
; l
++)
4211 free(c
->log_extra_fields
[l
].iov_base
);
/* Free the array and store mfree()'s return value back into the pointer so
 * that no stale pointer is left behind. */
4212 c
->log_extra_fields
= mfree(c
->log_extra_fields
);
4213 c
->n_log_extra_fields
= 0;
/* Records the start of a process in an ExecStatus. The visible part fills
 * s->start_timestamp via dual_timestamp_get(); how pid is stored is not
 * visible in this excerpt. */
4216 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
4221 dual_timestamp_get(&s
->start_timestamp
);
/* Records the exit of process pid (with SIGCHLD-style code/status) in an
 * ExecStatus and performs the exit-time housekeeping visible below: utmp
 * bookkeeping and a TTY reset for the given context. */
4224 void exec_status_exit(ExecStatus
*s
, ExecContext
*context
, pid_t pid
, int code
, int status
) {
/* The status already refers to a different, non-zero PID; the statement
 * guarded by this check is not visible in this excerpt. */
4227 if (s
->pid
&& s
->pid
!= pid
)
4231 dual_timestamp_get(&s
->exit_timestamp
);
/* Only contexts with a utmp identifier get a dead-process utmp record. */
4237 if (context
->utmp_id
)
4238 utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
4240 exec_context_tty_reset(context
, NULL
);
/* Writes a human-readable description of an ExecStatus to f, each line
 * starting with prefix. Judging by the visible format strings it prints the
 * PID, the start timestamp (if set) and, if an exit timestamp is set, the
 * exit timestamp together with exit code and status. */
4244 void exec_status_dump(ExecStatus
*s
, FILE *f
, const char *prefix
) {
/* Scratch buffer handed to format_timestamp() below. */
4245 char buf
[FORMAT_TIMESTAMP_MAX
];
/* prefix is passed through strempty() before use in the format strings. */
4253 prefix
= strempty(prefix
);
4256 "%sPID: "PID_FMT
"\n",
4259 if (dual_timestamp_is_set(&s
->start_timestamp
))
4261 "%sStart Timestamp: %s\n",
4262 prefix
, format_timestamp(buf
, sizeof(buf
), s
->start_timestamp
.realtime
));
4264 if (dual_timestamp_is_set(&s
->exit_timestamp
))
4266 "%sExit Timestamp: %s\n"
4268 "%sExit Status: %i\n",
4269 prefix
, format_timestamp(buf
, sizeof(buf
), s
->exit_timestamp
.realtime
),
4270 prefix
, sigchld_code_to_string(s
->code
),
/* Builds a single command line string from an argv vector and returns it.
 * The vector is walked twice; in the second pass arguments containing
 * whitespace get special treatment (presumably quoting -- the branch body is
 * not visible in this excerpt). exec_command_dump() treats a NULL return as
 * an out-of-memory condition. */
4274 char *exec_command_line(char **argv
) {
/* First pass over all arguments. */
4282 STRV_FOREACH(a
, argv
)
/* Second pass over all arguments. */
4290 STRV_FOREACH(a
, argv
) {
/* Argument contains at least one WHITESPACE character. */
4297 if (strpbrk(*a
, WHITESPACE
)) {
4308 /* FIXME: this doesn't really handle arguments that have
4309 * spaces and ticks in them */
/* Dumps one ExecCommand to f: its command line on a "Command Line:" row
 * starting with prefix, followed by its exec status dumped with a prefix
 * that is one tab deeper. */
4314 void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4315 _cleanup_free_
char *cmd
= NULL
;
4316 const char *prefix2
;
4321 prefix
= strempty(prefix
);
/* Nested prefix for the status lines: the caller's prefix plus a tab. */
4322 prefix2
= strjoina(prefix
, "\t");
4324 cmd
= exec_command_line(c
->argv
);
4326 "%sCommand Line: %s\n",
/* If no command line string could be built, print the ENOMEM error text
 * in its place. */
4327 prefix
, cmd
? cmd
: strerror(ENOMEM
));
4329 exec_status_dump(&c
->exec_status
, f
, prefix2
);
/* Dumps every ExecCommand of the list starting at c to f by calling
 * exec_command_dump() on each element with the same prefix. */
4332 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4335 prefix
= strempty(prefix
);
/* The parameter c doubles as the iteration cursor over the "command" list. */
4337 LIST_FOREACH(command
, c
, c
)
4338 exec_command_dump(c
, f
, prefix
);
/* Appends command e to the command list *l. The visible code locates the
 * current tail of the list and inserts e after it. */
4341 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
4348 /* It's kind of important that we keep the order here */
4349 LIST_FIND_TAIL(command
, *l
, end
);
4350 LIST_INSERT_AFTER(command
, *l
, end
, e
);
/* Sets up command c from path plus a variadic list of further strings. The
 * visible part builds a string vector from path and the va_list with
 * strv_new_ap(); how the vector is stored in c is not visible here. */
4355 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
4363 l
= strv_new_ap(path
, ap
);
/* Appends path plus a variadic list of further strings to the argument
 * vector of command c. */
4384 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
/* Temporary vector; the _cleanup_strv_free_ attribute takes care of
 * releasing it when the function is left. */
4385 _cleanup_strv_free_
char **l
= NULL
;
4393 l
= strv_new_ap(path
, ap
);
/* Extend c->argv by the entries of l; the last argument is false
 * (NOTE(review): presumably "do not filter duplicates" -- confirm against
 * strv_extend_strv()). */
4399 r
= strv_extend_strv(&c
->argv
, l
, false);
/* Allocates an ExecRuntime object with new0() and stores it in *rt. Both
 * slots of netns_storage_socket are set to -1, i.e. "no file descriptor". */
4407 static int exec_runtime_allocate(ExecRuntime
**rt
) {
4412 *rt
= new0(ExecRuntime
, 1);
4417 (*rt
)->netns_storage_socket
[0] = (*rt
)->netns_storage_socket
[1] = -1;
/* Prepares the ExecRuntime for context c in *rt. Depending on the context's
 * private_network and private_tmp settings it sets up the network namespace
 * storage socket pair and/or the private temporary directories named after
 * id. */
4422 int exec_runtime_make(ExecRuntime
**rt
, ExecContext
*c
, const char *id
) {
/* Neither feature is requested; the guarded statement is not visible in
 * this excerpt (presumably an early return). */
4432 if (!c
->private_network
&& !c
->private_tmp
)
4435 r
= exec_runtime_allocate(rt
);
/* Create the socket pair only if slot 0 does not hold a valid fd yet. */
4439 if (c
->private_network
&& (*rt
)->netns_storage_socket
[0] < 0) {
/* Close-on-exec AF_UNIX datagram pair, written into netns_storage_socket. */
4440 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, (*rt
)->netns_storage_socket
) < 0)
/* Set up the temporary directories only if tmp_dir is not set yet. */
4444 if (c
->private_tmp
&& !(*rt
)->tmp_dir
) {
4445 r
= setup_tmp_dirs(id
, &(*rt
)->tmp_dir
, &(*rt
)->var_tmp_dir
);
/* Takes a reference on an ExecRuntime and returns a pointer to it. Only the
 * sanity check is visible in this excerpt: the object must already have a
 * positive reference count. */
4453 ExecRuntime
*exec_runtime_ref(ExecRuntime
*r
) {
4455 assert(r
->n_ref
> 0);
/* Drops a reference on an ExecRuntime. The visible teardown code frees
 * var_tmp_dir and closes both netns storage sockets; the condition under
 * which teardown happens (presumably the count reaching zero) and the return
 * value are not visible in this excerpt. */
4461 ExecRuntime
*exec_runtime_unref(ExecRuntime
*r
) {
4466 assert(r
->n_ref
> 0);
4473 free(r
->var_tmp_dir
);
4474 safe_close_pair(r
->netns_storage_socket
);
/* Serializes the ExecRuntime rt of unit u to f. It writes the items
 * "tmp-dir", "var-tmp-dir", "netns-socket-0" and "netns-socket-1"; these
 * are the same keys exec_runtime_deserialize_item() reads. The file
 * descriptors themselves are duplicated into the fds set, and only the
 * number of the duplicate is written. */
4478 int exec_runtime_serialize(Unit
*u
, ExecRuntime
*rt
, FILE *f
, FDSet
*fds
) {
4487 unit_serialize_item(u
, f
, "tmp-dir", rt
->tmp_dir
);
4489 if (rt
->var_tmp_dir
)
4490 unit_serialize_item(u
, f
, "var-tmp-dir", rt
->var_tmp_dir
);
/* Socket slot 0 holds a valid fd. */
4492 if (rt
->netns_storage_socket
[0] >= 0) {
/* Put a duplicate of the fd into the set and serialize its number. */
4495 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
4499 unit_serialize_item_format(u
, f
, "netns-socket-0", "%i", copy
);
/* Same for socket slot 1. */
4502 if (rt
->netns_storage_socket
[1] >= 0) {
4505 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
4509 unit_serialize_item_format(u
, f
, "netns-socket-1", "%i", copy
);
/* Restores one serialized key/value pair into the ExecRuntime *rt of unit u.
 * Recognized keys are "tmp-dir", "var-tmp-dir", "netns-socket-0" and
 * "netns-socket-1". Every branch calls exec_runtime_allocate(rt) first
 * (NOTE(review): presumably skipped when *rt already exists -- the guard is
 * not visible in this excerpt). File descriptors are taken out of fds. */
4515 int exec_runtime_deserialize_item(Unit
*u
, ExecRuntime
**rt
, const char *key
, const char *value
, FDSet
*fds
) {
4522 if (streq(key
, "tmp-dir")) {
4525 r
= exec_runtime_allocate(rt
);
/* Duplicate the value, then replace any previously stored path with it. */
4529 copy
= strdup(value
);
4533 free((*rt
)->tmp_dir
);
4534 (*rt
)->tmp_dir
= copy
;
4536 } else if (streq(key
, "var-tmp-dir")) {
4539 r
= exec_runtime_allocate(rt
);
4543 copy
= strdup(value
);
4547 free((*rt
)->var_tmp_dir
);
4548 (*rt
)->var_tmp_dir
= copy
;
4550 } else if (streq(key
, "netns-socket-0")) {
4553 r
= exec_runtime_allocate(rt
);
/* The value must parse as an integer and name an fd contained in fds;
 * otherwise only a debug message is logged. */
4557 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
))
4558 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
/* Close whatever was stored before and take the fd out of the set. */
4560 safe_close((*rt
)->netns_storage_socket
[0]);
4561 (*rt
)->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
4563 } else if (streq(key
, "netns-socket-1")) {
4566 r
= exec_runtime_allocate(rt
);
/* Same handling as for "netns-socket-0", but for slot 1. */
4570 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
))
4571 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
4573 safe_close((*rt
)->netns_storage_socket
[1]);
4574 (*rt
)->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
/* Job function passed to asynchronous_job() by exec_runtime_destroy(): p is
 * a path string that is removed with rm_rf(). */
4582 static void *remove_tmpdir_thread(void *p
) {
/* Binding p to a _cleanup_free_ variable makes this function responsible
 * for freeing the path string. */
4583 _cleanup_free_
char *path
= p
;
/* The result of rm_rf() is deliberately ignored. */
4585 (void) rm_rf(path
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
/* Destroys the resources held by an ExecRuntime: each temporary directory is
 * handed to remove_tmpdir_thread() via asynchronous_job(), and the netns
 * storage sockets are closed. remove_tmpdir_thread() frees the path it
 * receives; the visible free() of var_tmp_dir follows the failure warning
 * and presumably covers the case where the job could not be started (the
 * surrounding condition is not visible in this excerpt). */
4589 void exec_runtime_destroy(ExecRuntime
*rt
) {
4595 /* If there are multiple users of this, let's leave the stuff around */
4600 log_debug("Spawning thread to nuke %s", rt
->tmp_dir
);
4602 r
= asynchronous_job(remove_tmpdir_thread
, rt
->tmp_dir
);
4604 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->tmp_dir
);
4611 if (rt
->var_tmp_dir
) {
4612 log_debug("Spawning thread to nuke %s", rt
->var_tmp_dir
);
4614 r
= asynchronous_job(remove_tmpdir_thread
, rt
->var_tmp_dir
);
4616 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->var_tmp_dir
);
4617 free(rt
->var_tmp_dir
);
/* The pointer is cleared so the runtime no longer refers to the path. */
4620 rt
->var_tmp_dir
= NULL
;
4623 safe_close_pair(rt
->netns_storage_socket
);
/* String names of the ExecInput values, indexed by enum constant. The
 * DEFINE_STRING_TABLE_LOOKUP() invocation below presumably generates the
 * lookup helpers (exec_input_to_string() is called elsewhere in this file). */
4626 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
4627 [EXEC_INPUT_NULL
] = "null",
4628 [EXEC_INPUT_TTY
] = "tty",
4629 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
4630 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
4631 [EXEC_INPUT_SOCKET
] = "socket",
4632 [EXEC_INPUT_NAMED_FD
] = "fd",
4635 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
/* String names of the ExecOutput values, indexed by enum constant. The
 * DEFINE_STRING_TABLE_LOOKUP() invocation below presumably generates the
 * lookup helpers (exec_output_to_string() is called elsewhere in this file). */
4637 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
4638 [EXEC_OUTPUT_INHERIT
] = "inherit",
4639 [EXEC_OUTPUT_NULL
] = "null",
4640 [EXEC_OUTPUT_TTY
] = "tty",
4641 [EXEC_OUTPUT_SYSLOG
] = "syslog",
4642 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE
] = "syslog+console",
4643 [EXEC_OUTPUT_KMSG
] = "kmsg",
4644 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
4645 [EXEC_OUTPUT_JOURNAL
] = "journal",
4646 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
4647 [EXEC_OUTPUT_SOCKET
] = "socket",
4648 [EXEC_OUTPUT_NAMED_FD
] = "fd",
4651 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
/* String names of the ExecUtmpMode values, indexed by enum constant; the
 * DEFINE_STRING_TABLE_LOOKUP() invocation below presumably generates the
 * matching string lookup helpers. */
4653 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
4654 [EXEC_UTMP_INIT
] = "init",
4655 [EXEC_UTMP_LOGIN
] = "login",
4656 [EXEC_UTMP_USER
] = "user",
4659 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
/* String names of the ExecPreserveMode values, indexed by enum constant.
 * The lookup helpers come from the _WITH_BOOLEAN macro variant below
 * (exec_preserve_mode_to_string() is called elsewhere in this file).
 * NOTE(review): the extra EXEC_PRESERVE_YES argument presumably names the
 * value a plain boolean "true" maps to -- confirm against string-table.h. */
4661 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
4662 [EXEC_PRESERVE_NO
] = "no",
4663 [EXEC_PRESERVE_YES
] = "yes",
4664 [EXEC_PRESERVE_RESTART
] = "restart",
4667 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
/* String names of the ExecDirectoryType values, indexed by enum constant.
 * exec_directory_type_to_string() is used elsewhere in this file both as a
 * label by itself and with a "Mode" suffix appended. */
4669 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
4670 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
4671 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
4672 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
4673 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
4674 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
4677 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);
/* String names of the ExecKeyringMode values, indexed by enum constant. The
 * DEFINE_STRING_TABLE_LOOKUP() invocation below presumably generates the
 * lookup helpers (exec_keyring_mode_to_string() is called elsewhere in this
 * file). */
4679 static const char* const exec_keyring_mode_table
[_EXEC_KEYRING_MODE_MAX
] = {
4680 [EXEC_KEYRING_INHERIT
] = "inherit",
4681 [EXEC_KEYRING_PRIVATE
] = "private",
4682 [EXEC_KEYRING_SHARED
] = "shared",
4685 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode
, ExecKeyringMode
);