1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2010 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
28 #include <sys/capability.h>
29 #include <sys/eventfd.h>
31 #include <sys/personality.h>
32 #include <sys/prctl.h>
34 #include <sys/socket.h>
36 #include <sys/types.h>
42 #include <security/pam_appl.h>
46 #include <selinux/selinux.h>
54 #include <sys/apparmor.h>
57 #include "sd-messages.h"
60 #include "alloc-util.h"
62 #include "apparmor-util.h"
67 #include "capability-util.h"
68 #include "chown-recursive.h"
71 #include "errno-list.h"
73 #include "exit-status.h"
76 #include "format-util.h"
78 #include "glob-util.h"
86 #include "namespace.h"
87 #include "parse-util.h"
88 #include "path-util.h"
89 #include "process-util.h"
90 #include "rlimit-util.h"
93 #include "seccomp-util.h"
95 #include "securebits.h"
96 #include "securebits-util.h"
97 #include "selinux-util.h"
98 #include "signal-util.h"
99 #include "smack-util.h"
101 #include "string-table.h"
102 #include "string-util.h"
104 #include "syslog-util.h"
105 #include "terminal-util.h"
107 #include "user-util.h"
109 #include "utmp-wtmp.h"
111 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
112 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
114 /* This assumes there is a 'tty' group */
115 #define TTY_MODE 0620
117 #define SNDBUF_SIZE (8*1024*1024)
119 static int shift_fds(int fds
[], unsigned n_fds
) {
120 int start
, restart_from
;
125 /* Modifies the fds array! (sorts it) */
135 for (i
= start
; i
< (int) n_fds
; i
++) {
138 /* Already at right index? */
142 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
149 /* Hmm, the fd we wanted isn't free? Then
150 * let's remember that and try again from here */
151 if (nfd
!= i
+3 && restart_from
< 0)
155 if (restart_from
< 0)
158 start
= restart_from
;
164 static int flags_fds(const int fds
[], unsigned n_storage_fds
, unsigned n_socket_fds
, bool nonblock
) {
168 n_fds
= n_storage_fds
+ n_socket_fds
;
174 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
175 * O_NONBLOCK only applies to socket activation though. */
177 for (i
= 0; i
< n_fds
; i
++) {
179 if (i
< n_socket_fds
) {
180 r
= fd_nonblock(fds
[i
], nonblock
);
185 /* We unconditionally drop FD_CLOEXEC from the fds,
186 * since after all we want to pass these fds to our
189 r
= fd_cloexec(fds
[i
], false);
197 static const char *exec_context_tty_path(const ExecContext
*context
) {
200 if (context
->stdio_as_fds
)
203 if (context
->tty_path
)
204 return context
->tty_path
;
206 return "/dev/console";
209 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
214 path
= exec_context_tty_path(context
);
216 if (context
->tty_vhangup
) {
217 if (p
&& p
->stdin_fd
>= 0)
218 (void) terminal_vhangup_fd(p
->stdin_fd
);
220 (void) terminal_vhangup(path
);
223 if (context
->tty_reset
) {
224 if (p
&& p
->stdin_fd
>= 0)
225 (void) reset_terminal_fd(p
->stdin_fd
, true);
227 (void) reset_terminal(path
);
230 if (context
->tty_vt_disallocate
&& path
)
231 (void) vt_disallocate(path
);
234 static bool is_terminal_input(ExecInput i
) {
237 EXEC_INPUT_TTY_FORCE
,
238 EXEC_INPUT_TTY_FAIL
);
241 static bool is_terminal_output(ExecOutput o
) {
244 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
245 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
246 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
249 static bool is_syslog_output(ExecOutput o
) {
252 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
);
255 static bool is_kmsg_output(ExecOutput o
) {
258 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
261 static bool exec_context_needs_term(const ExecContext
*c
) {
264 /* Return true if the execution context suggests we should set $TERM to something useful. */
266 if (is_terminal_input(c
->std_input
))
269 if (is_terminal_output(c
->std_output
))
272 if (is_terminal_output(c
->std_error
))
275 return !!c
->tty_path
;
278 static int open_null_as(int flags
, int nfd
) {
283 fd
= open("/dev/null", flags
|O_NOCTTY
);
288 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
296 static int connect_journal_socket(int fd
, uid_t uid
, gid_t gid
) {
297 static const union sockaddr_union sa
= {
298 .un
.sun_family
= AF_UNIX
,
299 .un
.sun_path
= "/run/systemd/journal/stdout",
301 uid_t olduid
= UID_INVALID
;
302 gid_t oldgid
= GID_INVALID
;
305 if (gid_is_valid(gid
)) {
308 if (setegid(gid
) < 0)
312 if (uid_is_valid(uid
)) {
315 if (seteuid(uid
) < 0) {
321 r
= connect(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
)) < 0 ? -errno
: 0;
323 /* If we fail to restore the uid or gid, things will likely
324 fail later on. This should only happen if an LSM interferes. */
326 if (uid_is_valid(uid
))
327 (void) seteuid(olduid
);
330 if (gid_is_valid(gid
))
331 (void) setegid(oldgid
);
336 static int connect_logger_as(
338 const ExecContext
*context
,
339 const ExecParameters
*params
,
350 assert(output
< _EXEC_OUTPUT_MAX
);
354 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
358 r
= connect_journal_socket(fd
, uid
, gid
);
362 if (shutdown(fd
, SHUT_RD
) < 0) {
367 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
377 context
->syslog_identifier
?: ident
,
378 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
379 context
->syslog_priority
,
380 !!context
->syslog_level_prefix
,
381 is_syslog_output(output
),
382 is_kmsg_output(output
),
383 is_terminal_output(output
));
388 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
393 static int open_terminal_as(const char *path
, mode_t mode
, int nfd
) {
399 fd
= open_terminal(path
, mode
| O_NOCTTY
);
404 r
= dup2(fd
, nfd
) < 0 ? -errno
: nfd
;
412 static int fixup_input(ExecInput std_input
, int socket_fd
, bool apply_tty_stdin
) {
414 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
415 return EXEC_INPUT_NULL
;
417 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
418 return EXEC_INPUT_NULL
;
423 static int fixup_output(ExecOutput std_output
, int socket_fd
) {
425 if (std_output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
426 return EXEC_OUTPUT_INHERIT
;
431 static int setup_input(
432 const ExecContext
*context
,
433 const ExecParameters
*params
,
435 int named_iofds
[3]) {
442 if (params
->stdin_fd
>= 0) {
443 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
446 /* Try to make this the controlling tty, if it is a tty, and reset it */
447 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
448 (void) reset_terminal_fd(STDIN_FILENO
, true);
453 i
= fixup_input(context
->std_input
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
457 case EXEC_INPUT_NULL
:
458 return open_null_as(O_RDONLY
, STDIN_FILENO
);
461 case EXEC_INPUT_TTY_FORCE
:
462 case EXEC_INPUT_TTY_FAIL
: {
465 fd
= acquire_terminal(exec_context_tty_path(context
),
466 i
== EXEC_INPUT_TTY_FAIL
,
467 i
== EXEC_INPUT_TTY_FORCE
,
473 if (fd
!= STDIN_FILENO
) {
474 r
= dup2(fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
482 case EXEC_INPUT_SOCKET
:
483 return dup2(socket_fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
485 case EXEC_INPUT_NAMED_FD
:
486 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
487 return dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
490 assert_not_reached("Unknown input type");
494 static int setup_output(
496 const ExecContext
*context
,
497 const ExecParameters
*params
,
504 dev_t
*journal_stream_dev
,
505 ino_t
*journal_stream_ino
) {
515 assert(journal_stream_dev
);
516 assert(journal_stream_ino
);
518 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
520 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
523 return STDOUT_FILENO
;
526 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
527 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
530 return STDERR_FILENO
;
533 i
= fixup_input(context
->std_input
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
534 o
= fixup_output(context
->std_output
, socket_fd
);
536 if (fileno
== STDERR_FILENO
) {
538 e
= fixup_output(context
->std_error
, socket_fd
);
540 /* This expects the input and output are already set up */
542 /* Don't change the stderr file descriptor if we inherit all
543 * the way and are not on a tty */
544 if (e
== EXEC_OUTPUT_INHERIT
&&
545 o
== EXEC_OUTPUT_INHERIT
&&
546 i
== EXEC_INPUT_NULL
&&
547 !is_terminal_input(context
->std_input
) &&
551 /* Duplicate from stdout if possible */
552 if ((e
== o
&& e
!= EXEC_OUTPUT_NAMED_FD
) || e
== EXEC_OUTPUT_INHERIT
)
553 return dup2(STDOUT_FILENO
, fileno
) < 0 ? -errno
: fileno
;
557 } else if (o
== EXEC_OUTPUT_INHERIT
) {
558 /* If input got downgraded, inherit the original value */
559 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
560 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
562 /* If the input is connected to anything that's not a /dev/null, inherit that... */
563 if (i
!= EXEC_INPUT_NULL
)
564 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
566 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
570 /* We need to open /dev/null here anew, to get the right access mode. */
571 return open_null_as(O_WRONLY
, fileno
);
576 case EXEC_OUTPUT_NULL
:
577 return open_null_as(O_WRONLY
, fileno
);
579 case EXEC_OUTPUT_TTY
:
580 if (is_terminal_input(i
))
581 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
583 /* We don't reset the terminal if this is just about output */
584 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
586 case EXEC_OUTPUT_SYSLOG
:
587 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE
:
588 case EXEC_OUTPUT_KMSG
:
589 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
590 case EXEC_OUTPUT_JOURNAL
:
591 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
592 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
594 log_unit_warning_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m", fileno
== STDOUT_FILENO
? "stdout" : "stderr");
595 r
= open_null_as(O_WRONLY
, fileno
);
599 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
600 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
601 * services to detect whether they are connected to the journal or not.
603 * If both stdout and stderr are connected to a stream then let's make sure to store the data
604 * about STDERR as that's usually the best way to do logging. */
606 if (fstat(fileno
, &st
) >= 0 &&
607 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
608 *journal_stream_dev
= st
.st_dev
;
609 *journal_stream_ino
= st
.st_ino
;
614 case EXEC_OUTPUT_SOCKET
:
615 assert(socket_fd
>= 0);
616 return dup2(socket_fd
, fileno
) < 0 ? -errno
: fileno
;
618 case EXEC_OUTPUT_NAMED_FD
:
619 (void) fd_nonblock(named_iofds
[fileno
], false);
620 return dup2(named_iofds
[fileno
], fileno
) < 0 ? -errno
: fileno
;
623 assert_not_reached("Unknown error type");
627 static int chown_terminal(int fd
, uid_t uid
) {
632 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
636 /* This might fail. What matters are the results. */
637 (void) fchown(fd
, uid
, -1);
638 (void) fchmod(fd
, TTY_MODE
);
640 if (fstat(fd
, &st
) < 0)
643 if (st
.st_uid
!= uid
|| (st
.st_mode
& 0777) != TTY_MODE
)
649 static int setup_confirm_stdio(const char *vc
, int *_saved_stdin
, int *_saved_stdout
) {
650 _cleanup_close_
int fd
= -1, saved_stdin
= -1, saved_stdout
= -1;
653 assert(_saved_stdin
);
654 assert(_saved_stdout
);
656 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
660 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
661 if (saved_stdout
< 0)
664 fd
= acquire_terminal(vc
, false, false, false, DEFAULT_CONFIRM_USEC
);
668 r
= chown_terminal(fd
, getuid());
672 r
= reset_terminal_fd(fd
, true);
676 if (dup2(fd
, STDIN_FILENO
) < 0)
679 if (dup2(fd
, STDOUT_FILENO
) < 0)
686 *_saved_stdin
= saved_stdin
;
687 *_saved_stdout
= saved_stdout
;
689 saved_stdin
= saved_stdout
= -1;
694 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
697 if (err
== -ETIMEDOUT
)
698 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
701 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
705 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
706 _cleanup_close_
int fd
= -1;
710 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
714 write_confirm_error_fd(err
, fd
, u
);
717 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
721 assert(saved_stdout
);
725 if (*saved_stdin
>= 0)
726 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
729 if (*saved_stdout
>= 0)
730 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
733 *saved_stdin
= safe_close(*saved_stdin
);
734 *saved_stdout
= safe_close(*saved_stdout
);
740 CONFIRM_PRETEND_FAILURE
= -1,
741 CONFIRM_PRETEND_SUCCESS
= 0,
745 static int ask_for_confirmation(const char *vc
, Unit
*u
, const char *cmdline
) {
746 int saved_stdout
= -1, saved_stdin
= -1, r
;
747 _cleanup_free_
char *e
= NULL
;
750 /* For any internal errors, assume a positive response. */
751 r
= setup_confirm_stdio(vc
, &saved_stdin
, &saved_stdout
);
753 write_confirm_error(r
, vc
, u
);
754 return CONFIRM_EXECUTE
;
757 /* confirm_spawn might have been disabled while we were sleeping. */
758 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
763 e
= ellipsize(cmdline
, 60, 100);
771 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
773 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
780 printf("Resuming normal execution.\n");
781 manager_disable_confirm_spawn();
785 unit_dump(u
, stdout
, " ");
786 continue; /* ask again */
788 printf("Failing execution.\n");
789 r
= CONFIRM_PRETEND_FAILURE
;
792 printf(" c - continue, proceed without asking anymore\n"
793 " D - dump, show the state of the unit\n"
794 " f - fail, don't execute the command and pretend it failed\n"
796 " i - info, show a short summary of the unit\n"
797 " j - jobs, show jobs that are in progress\n"
798 " s - skip, don't execute the command and pretend it succeeded\n"
799 " y - yes, execute the command\n");
800 continue; /* ask again */
802 printf(" Description: %s\n"
805 u
->id
, u
->description
, cmdline
);
806 continue; /* ask again */
808 manager_dump_jobs(u
->manager
, stdout
, " ");
809 continue; /* ask again */
811 /* 'n' was removed in favor of 'f'. */
812 printf("Didn't understand 'n', did you mean 'f'?\n");
813 continue; /* ask again */
815 printf("Skipping execution.\n");
816 r
= CONFIRM_PRETEND_SUCCESS
;
822 assert_not_reached("Unhandled choice");
828 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
832 static int get_fixed_user(const ExecContext
*c
, const char **user
,
833 uid_t
*uid
, gid_t
*gid
,
834 const char **home
, const char **shell
) {
843 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
844 * (i.e. are "/" or "/bin/nologin"). */
847 r
= get_user_creds_clean(&name
, uid
, gid
, home
, shell
);
855 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
865 r
= get_group_creds(&name
, gid
);
873 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
874 const char *group
, gid_t gid
,
875 gid_t
**supplementary_gids
, int *ngids
) {
879 bool keep_groups
= false;
880 gid_t
*groups
= NULL
;
881 _cleanup_free_ gid_t
*l_gids
= NULL
;
886 * If user is given, then lookup GID and supplementary groups list.
887 * We avoid NSS lookups for gid=0. Also we have to initialize groups
888 * here and as early as possible so we keep the list of supplementary
889 * groups of the caller.
891 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
892 /* First step, initialize groups from /etc/groups */
893 if (initgroups(user
, gid
) < 0)
899 if (strv_isempty(c
->supplementary_groups
))
903 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
904 * be positive, otherwise fail.
907 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
908 if (ngroups_max
<= 0) {
912 return -EOPNOTSUPP
; /* For all other values */
915 l_gids
= new(gid_t
, ngroups_max
);
921 * Lookup the list of groups that the user belongs to, we
922 * avoid NSS lookups here too for gid=0.
925 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
930 STRV_FOREACH(i
, c
->supplementary_groups
) {
933 if (k
>= ngroups_max
)
937 r
= get_group_creds(&g
, l_gids
+k
);
945 * Sets ngids to zero to drop all supplementary groups, happens
946 * when we are under root and SupplementaryGroups= is empty.
953 /* Otherwise get the final list of supplementary groups */
954 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
958 *supplementary_gids
= groups
;
966 static int enforce_groups(gid_t gid
, gid_t
*supplementary_gids
, int ngids
) {
969 /* Handle SupplementaryGroups= if it is not empty */
971 r
= maybe_setgroups(ngids
, supplementary_gids
);
976 if (gid_is_valid(gid
)) {
977 /* Then set our gids */
978 if (setresgid(gid
, gid
, gid
) < 0)
985 static int enforce_user(const ExecContext
*context
, uid_t uid
) {
988 if (!uid_is_valid(uid
))
991 /* Sets (but doesn't look up) the uid and make sure we keep the
992 * capabilities while doing so. */
994 if (context
->capability_ambient_set
!= 0) {
996 /* First step: If we need to keep capabilities but
997 * drop privileges we need to make sure we keep our
998 * caps, while we drop privileges. */
1000 int sb
= context
->secure_bits
| 1<<SECURE_KEEP_CAPS
;
1002 if (prctl(PR_GET_SECUREBITS
) != sb
)
1003 if (prctl(PR_SET_SECUREBITS
, sb
) < 0)
1008 /* Second step: actually set the uids */
1009 if (setresuid(uid
, uid
, uid
) < 0)
1012 /* At this point we should have all necessary capabilities but
1013 are otherwise a normal user. However, the caps might got
1014 corrupted due to the setresuid() so we need clean them up
1015 later. This is done outside of this call. */
1022 static int null_conv(
1024 const struct pam_message
**msg
,
1025 struct pam_response
**resp
,
1026 void *appdata_ptr
) {
1028 /* We don't support conversations */
1030 return PAM_CONV_ERR
;
1035 static int setup_pam(
1042 int fds
[], unsigned n_fds
) {
1046 static const struct pam_conv conv
= {
1051 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1052 pam_handle_t
*handle
= NULL
;
1054 int pam_code
= PAM_SUCCESS
, r
;
1055 char **nv
, **e
= NULL
;
1056 bool close_session
= false;
1057 pid_t pam_pid
= 0, parent_pid
;
1064 /* We set up PAM in the parent process, then fork. The child
1065 * will then stay around until killed via PR_GET_PDEATHSIG or
1066 * systemd via the cgroup logic. It will then remove the PAM
1067 * session again. The parent process will exec() the actual
1068 * daemon. We do things this way to ensure that the main PID
1069 * of the daemon is the one we initially fork()ed. */
1071 r
= barrier_create(&barrier
);
1075 if (log_get_max_level() < LOG_DEBUG
)
1076 flags
|= PAM_SILENT
;
1078 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1079 if (pam_code
!= PAM_SUCCESS
) {
1085 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1086 if (pam_code
!= PAM_SUCCESS
)
1090 STRV_FOREACH(nv
, *env
) {
1091 pam_code
= pam_putenv(handle
, *nv
);
1092 if (pam_code
!= PAM_SUCCESS
)
1096 pam_code
= pam_acct_mgmt(handle
, flags
);
1097 if (pam_code
!= PAM_SUCCESS
)
1100 pam_code
= pam_open_session(handle
, flags
);
1101 if (pam_code
!= PAM_SUCCESS
)
1104 close_session
= true;
1106 e
= pam_getenvlist(handle
);
1108 pam_code
= PAM_BUF_ERR
;
1112 /* Block SIGTERM, so that we know that it won't get lost in
1115 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1117 parent_pid
= getpid_cached();
1126 int sig
, ret
= EXIT_PAM
;
1128 /* The child's job is to reset the PAM session on
1130 barrier_set_role(&barrier
, BARRIER_CHILD
);
1132 /* This string must fit in 10 chars (i.e. the length
1133 * of "/sbin/init"), to look pretty in /bin/ps */
1134 rename_process("(sd-pam)");
1136 /* Make sure we don't keep open the passed fds in this
1137 child. We assume that otherwise only those fds are
1138 open here that have been opened by PAM. */
1139 close_many(fds
, n_fds
);
1141 /* Drop privileges - we don't need any to pam_close_session
1142 * and this will make PR_SET_PDEATHSIG work in most cases.
1143 * If this fails, ignore the error - but expect sd-pam threads
1144 * to fail to exit normally */
1146 r
= maybe_setgroups(0, NULL
);
1148 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1149 if (setresgid(gid
, gid
, gid
) < 0)
1150 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1151 if (setresuid(uid
, uid
, uid
) < 0)
1152 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1154 (void) ignore_signals(SIGPIPE
, -1);
1156 /* Wait until our parent died. This will only work if
1157 * the above setresuid() succeeds, otherwise the kernel
1158 * will not allow unprivileged parents kill their privileged
1159 * children this way. We rely on the control groups kill logic
1160 * to do the rest for us. */
1161 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1164 /* Tell the parent that our setup is done. This is especially
1165 * important regarding dropping privileges. Otherwise, unit
1166 * setup might race against our setresuid(2) call.
1168 * If the parent aborted, we'll detect this below, hence ignore
1169 * return failure here. */
1170 (void) barrier_place(&barrier
);
1172 /* Check if our parent process might already have died? */
1173 if (getppid() == parent_pid
) {
1176 assert_se(sigemptyset(&ss
) >= 0);
1177 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1180 if (sigwait(&ss
, &sig
) < 0) {
1187 assert(sig
== SIGTERM
);
1192 /* If our parent died we'll end the session */
1193 if (getppid() != parent_pid
) {
1194 pam_code
= pam_close_session(handle
, flags
);
1195 if (pam_code
!= PAM_SUCCESS
)
1202 pam_end(handle
, pam_code
| flags
);
1206 barrier_set_role(&barrier
, BARRIER_PARENT
);
1208 /* If the child was forked off successfully it will do all the
1209 * cleanups, so forget about the handle here. */
1212 /* Unblock SIGTERM again in the parent */
1213 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1215 /* We close the log explicitly here, since the PAM modules
1216 * might have opened it, but we don't want this fd around. */
1219 /* Synchronously wait for the child to initialize. We don't care for
1220 * errors as we cannot recover. However, warn loudly if it happens. */
1221 if (!barrier_place_and_sync(&barrier
))
1222 log_error("PAM initialization failed");
1230 if (pam_code
!= PAM_SUCCESS
) {
1231 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1232 r
= -EPERM
; /* PAM errors do not map to errno */
1234 log_error_errno(r
, "PAM failed: %m");
1238 pam_code
= pam_close_session(handle
, flags
);
1240 pam_end(handle
, pam_code
| flags
);
1252 static void rename_process_from_path(const char *path
) {
1253 char process_name
[11];
1257 /* This resulting string must fit in 10 chars (i.e. the length
1258 * of "/sbin/init") to look pretty in /bin/ps */
1262 rename_process("(...)");
1268 /* The end of the process name is usually more
1269 * interesting, since the first bit might just be
1275 process_name
[0] = '(';
1276 memcpy(process_name
+1, p
, l
);
1277 process_name
[1+l
] = ')';
1278 process_name
[1+l
+1] = 0;
1280 rename_process(process_name
);
1283 static bool context_has_address_families(const ExecContext
*c
) {
1286 return c
->address_families_whitelist
||
1287 !set_isempty(c
->address_families
);
1290 static bool context_has_syscall_filters(const ExecContext
*c
) {
1293 return c
->syscall_whitelist
||
1294 !hashmap_isempty(c
->syscall_filter
);
1297 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1300 if (c
->no_new_privileges
)
1303 if (have_effective_cap(CAP_SYS_ADMIN
)) /* if we are privileged, we don't need NNP */
1306 /* We need NNP if we have any form of seccomp and are unprivileged */
1307 return context_has_address_families(c
) ||
1308 c
->memory_deny_write_execute
||
1309 c
->restrict_realtime
||
1310 exec_context_restrict_namespaces_set(c
) ||
1311 c
->protect_kernel_tunables
||
1312 c
->protect_kernel_modules
||
1313 c
->private_devices
||
1314 context_has_syscall_filters(c
) ||
1315 !set_isempty(c
->syscall_archs
) ||
1316 c
->lock_personality
;
1321 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1323 if (is_seccomp_available())
1326 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1330 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1331 uint32_t negative_action
, default_action
, action
;
1337 if (!context_has_syscall_filters(c
))
1340 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1343 negative_action
= c
->syscall_errno
== 0 ? SCMP_ACT_KILL
: SCMP_ACT_ERRNO(c
->syscall_errno
);
1345 if (c
->syscall_whitelist
) {
1346 default_action
= negative_action
;
1347 action
= SCMP_ACT_ALLOW
;
1349 default_action
= SCMP_ACT_ALLOW
;
1350 action
= negative_action
;
1353 if (needs_ambient_hack
) {
1354 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_whitelist
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1359 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
);
1362 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1366 if (set_isempty(c
->syscall_archs
))
1369 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1372 return seccomp_restrict_archs(c
->syscall_archs
);
1375 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1379 if (!context_has_address_families(c
))
1382 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1385 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_whitelist
);
1388 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1392 if (!c
->memory_deny_write_execute
)
1395 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1398 return seccomp_memory_deny_write_execute();
1401 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1405 if (!c
->restrict_realtime
)
1408 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1411 return seccomp_restrict_realtime();
1414 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1418 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1419 * let's protect even those systems where this is left on in the kernel. */
1421 if (!c
->protect_kernel_tunables
)
1424 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1427 return seccomp_protect_sysctl();
1430 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1434 /* Turn off module syscalls on ProtectKernelModules=yes */
1436 if (!c
->protect_kernel_modules
)
1439 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1442 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
));
1445 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1449 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1451 if (!c
->private_devices
)
1454 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1457 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
));
1460 static int apply_restrict_namespaces(Unit
*u
, const ExecContext
*c
) {
1464 if (!exec_context_restrict_namespaces_set(c
))
1467 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1470 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
1473 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1474 unsigned long personality
;
1480 if (!c
->lock_personality
)
1483 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1486 personality
= c
->personality
;
1488 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1489 if (personality
== PERSONALITY_INVALID
) {
1491 r
= opinionated_personality(&personality
);
1496 return seccomp_lock_personality(personality
);
1501 static void do_idle_pipe_dance(int idle_pipe
[4]) {
1504 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1505 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1507 if (idle_pipe
[0] >= 0) {
1510 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1512 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1515 /* Signal systemd that we are bored and want to continue. */
1516 n
= write(idle_pipe
[3], "x", 1);
1518 /* Wait for systemd to react to the signal above. */
1519 fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1522 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1526 idle_pipe
[3] = safe_close(idle_pipe
[3]);
1529 static int build_environment(
1531 const ExecContext
*c
,
1532 const ExecParameters
*p
,
1535 const char *username
,
1537 dev_t journal_stream_dev
,
1538 ino_t journal_stream_ino
,
1541 _cleanup_strv_free_
char **our_env
= NULL
;
1549 our_env
= new0(char*, 14);
1554 _cleanup_free_
char *joined
= NULL
;
1556 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1558 our_env
[n_env
++] = x
;
1560 if (asprintf(&x
, "LISTEN_FDS=%u", n_fds
) < 0)
1562 our_env
[n_env
++] = x
;
1564 joined
= strv_join(p
->fd_names
, ":");
1568 x
= strjoin("LISTEN_FDNAMES=", joined
);
1571 our_env
[n_env
++] = x
;
1574 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1575 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1577 our_env
[n_env
++] = x
;
1579 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1581 our_env
[n_env
++] = x
;
1584 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1585 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1586 * check the database directly. */
1587 if (p
->flags
& EXEC_NSS_BYPASS_BUS
) {
1588 x
= strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1591 our_env
[n_env
++] = x
;
1595 x
= strappend("HOME=", home
);
1598 our_env
[n_env
++] = x
;
1602 x
= strappend("LOGNAME=", username
);
1605 our_env
[n_env
++] = x
;
1607 x
= strappend("USER=", username
);
1610 our_env
[n_env
++] = x
;
1614 x
= strappend("SHELL=", shell
);
1617 our_env
[n_env
++] = x
;
1620 if (!sd_id128_is_null(u
->invocation_id
)) {
1621 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1624 our_env
[n_env
++] = x
;
1627 if (exec_context_needs_term(c
)) {
1628 const char *tty_path
, *term
= NULL
;
1630 tty_path
= exec_context_tty_path(c
);
1632 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1633 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1634 * passes to PID 1 ends up all the way in the console login shown. */
1636 if (path_equal(tty_path
, "/dev/console") && getppid() == 1)
1637 term
= getenv("TERM");
1639 term
= default_term_for_tty(tty_path
);
1641 x
= strappend("TERM=", term
);
1644 our_env
[n_env
++] = x
;
1647 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
1648 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
1651 our_env
[n_env
++] = x
;
1654 our_env
[n_env
++] = NULL
;
1655 assert(n_env
<= 12);
/* Builds a NULL-terminated vector of NAME=value strings for the variable names listed in
 * c->pass_environment. Each entry is the name, an equals sign and the value v joined by strjoin().
 * NOTE(review): the assignment of v and the hand-over of the vector to *ret are outside the visible
 * lines; v is presumably the value of the variable in the current environment, confirm. */
1663 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
1664 _cleanup_strv_free_
char **pass_env
= NULL
;
1665 size_t n_env
= 0, n_bufsize
= 0;
1668 STRV_FOREACH(i
, c
->pass_environment
) {
1669 _cleanup_free_
char *x
= NULL
;
1675 x
= strjoin(*i
, "=", v
);
/* Ask for n_env + 2 slots: one for the new entry and one for the NULL terminator written below. */
1679 if (!GREEDY_REALLOC(pass_env
, n_bufsize
, n_env
+ 2))
1682 pass_env
[n_env
++] = x
;
1683 pass_env
[n_env
] = NULL
;
/* Predicate deciding whether a mount namespace is required for this execution. The visible checks are:
 * a root image is configured; any read-write, read-only or inaccessible path list is non-empty; bind
 * mounts or any of the five exec directory path lists are configured; mount_flags is non-zero;
 * private_tmp is set and runtime carries a tmp_dir or var_tmp_dir; any of private_devices,
 * protect_system, protect_home, protect_kernel_tunables, protect_kernel_modules or
 * protect_control_groups is enabled; or mount_apivfs is set together with a root image or root
 * directory. NOTE(review): the return statements are outside the visible lines; presumably each
 * matching check returns true and the fall-through returns false, confirm. */
1693 static bool exec_needs_mount_namespace(
1694 const ExecContext
*context
,
1695 const ExecParameters
*params
,
1696 ExecRuntime
*runtime
) {
1701 if (context
->root_image
)
1704 if (!strv_isempty(context
->read_write_paths
) ||
1705 !strv_isempty(context
->read_only_paths
) ||
1706 !strv_isempty(context
->inaccessible_paths
))
1709 if (context
->n_bind_mounts
> 0 ||
1710 !strv_isempty(context
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) ||
1711 !strv_isempty(context
->directories
[EXEC_DIRECTORY_STATE
].paths
) ||
1712 !strv_isempty(context
->directories
[EXEC_DIRECTORY_CACHE
].paths
) ||
1713 !strv_isempty(context
->directories
[EXEC_DIRECTORY_LOGS
].paths
) ||
1714 !strv_isempty(context
->directories
[EXEC_DIRECTORY_CONFIGURATION
].paths
))
1717 if (context
->mount_flags
!= 0)
/* private_tmp only counts when the runtime object actually carries a private directory. */
1720 if (context
->private_tmp
&& runtime
&& (runtime
->tmp_dir
|| runtime
->var_tmp_dir
))
1723 if (context
->private_devices
||
1724 context
->protect_system
!= PROTECT_SYSTEM_NO
||
1725 context
->protect_home
!= PROTECT_HOME_NO
||
1726 context
->protect_kernel_tunables
||
1727 context
->protect_kernel_modules
||
1728 context
->protect_control_groups
)
1731 if (context
->mount_apivfs
&& (context
->root_image
|| context
->root_directory
))
/* Moves the calling process into a new user namespace (unshare(CLONE_NEWUSER)) and has a helper child
 * write the setgroups, gid_map and uid_map files of the parent below procfs. The maps always contain
 * the root-to-root line; when uid (or gid) is non-zero and valid a second line mapping it to itself is
 * added. The child reports a failure code through errno_pipe, and the parent additionally checks how
 * the child terminated. */
1737 static int setup_private_users(uid_t uid
, gid_t gid
) {
1738 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
1739 _cleanup_close_pair_
int errno_pipe
[2] = { -1, -1 };
1740 _cleanup_close_
int unshare_ready_fd
= -1;
1741 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
1747 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1748 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1749 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1750 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1751 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1752 * continues execution normally. */
1754 if (uid
!= 0 && uid_is_valid(uid
)) {
1755 r
= asprintf(&uid_map
,
1756 "0 0 1\n" /* Map root → root */
1757 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
1762 uid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1767 if (gid
!= 0 && gid_is_valid(gid
)) {
1768 r
= asprintf(&gid_map
,
1769 "0 0 1\n" /* Map root → root */
1770 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
1775 gid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1780 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1782 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
1783 if (unshare_ready_fd
< 0)
1786 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1788 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
1796 _cleanup_close_
int fd
= -1;
1800 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1801 * here, after the parent opened its own user namespace. */
1804 errno_pipe
[0] = safe_close(errno_pipe
[0]);
1806 /* Wait until the parent unshared the user namespace */
1807 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
1812 /* Disable the setgroups() system call in the child user namespace, for good. */
1813 a
= procfs_file_alloca(ppid
, "setgroups");
1814 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1816 if (errno
!= ENOENT
) {
1821 /* If the file is missing the kernel is too old, let's continue anyway. */
1823 if (write(fd
, "deny\n", 5) < 0) {
1828 fd
= safe_close(fd
);
1831 /* First write the GID map */
1832 a
= procfs_file_alloca(ppid
, "gid_map");
1833 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1838 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
1842 fd
= safe_close(fd
);
1844 /* Then write the UID map */
1845 a
= procfs_file_alloca(ppid
, "uid_map");
1846 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1851 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
1856 _exit(EXIT_SUCCESS
);
/* Failure exit of the child: pass the error code in r to the parent, best effort. */
1859 (void) write(errno_pipe
[1], &r
, sizeof(r
));
1860 _exit(EXIT_FAILURE
);
1863 errno_pipe
[1] = safe_close(errno_pipe
[1]);
1865 if (unshare(CLONE_NEWUSER
) < 0)
1868 /* Let the child know that the namespace is ready now */
1869 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
1872 /* Try to read an error code from the child */
1873 n
= read(errno_pipe
[0], &r
, sizeof(r
));
1876 if (n
== sizeof(r
)) { /* an error code was sent to us */
1881 if (n
!= 0) /* on success we should have read 0 bytes */
1884 r
= wait_for_terminate(pid
, &si
);
1889 /* If something strange happened with the child, let's consider this fatal, too */
1890 if (si
.si_code
!= CLD_EXITED
|| si
.si_status
!= 0)
/* Creates the directories listed in context->directories[type].paths below params->prefix[type].
 * For each path the parents are created with mode 0755. With context->dynamic_user set and a type other
 * than runtime or configuration, the directory is created below the private/ subdirectory of the prefix
 * and a relative symlink is placed at the public location; otherwise the directory is created in place.
 * Afterwards the configured mode is applied with chmod() and, except for configuration directories,
 * ownership is changed recursively to uid and gid. *exit_status is set from exit_status_table[type].
 * NOTE(review): the assignment of effective and the error-path layout are outside the visible lines;
 * presumably *exit_status is only written on failure, confirm. */
1896 static int setup_exec_directory(
1897 const ExecContext
*context
,
1898 const ExecParameters
*params
,
1901 ExecDirectoryType type
,
1904 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
1905 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
1906 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
1907 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
1908 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
1909 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
1916 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
1917 assert(exit_status
);
1919 if (!params
->prefix
[type
])
/* With EXEC_CHOWN_DIRECTORIES set, the uid and gid are checked for validity up front. */
1922 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
1923 if (!uid_is_valid(uid
))
1925 if (!gid_is_valid(gid
))
1929 STRV_FOREACH(rt
, context
->directories
[type
].paths
) {
1930 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
1931 const char *effective
;
1933 p
= strjoin(params
->prefix
[type
], "/", *rt
);
1939 r
= mkdir_parents_label(p
, 0755);
1943 if (context
->dynamic_user
&&
1944 !IN_SET(type
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
)) {
1945 _cleanup_free_
char *private_root
= NULL
, *relative
= NULL
, *parent
= NULL
;
1947 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
1948 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
1949 * whose UID is later on reused. To lock this down we use the same trick used by container
1950 * managers to prohibit host users to get access to files of the same UID in containers: we
1951 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
1952 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
1953 * to make this directory permeable for the service itself.
1955 * Specifically: for a service which wants a special directory "foo/" we first create a
1956 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
1957 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
1958 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
1959 * into it. Inside of the namespace of the container "private/" is replaced by a more liberally
1960 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
1961 * disabling the access boundary for the service and making sure it only gets access to the
1962 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
1964 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
1965 * owned by the service itself.
1966 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
1967 * files or sockets with other services. */
1969 private_root
= strjoin(params
->prefix
[type
], "/private");
1970 if (!private_root
) {
1975 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
1976 r
= mkdir_safe_label(private_root
, 0700, 0, 0, false);
1980 pp
= strjoin(private_root
, "/", *rt
);
1986 /* Create all directories between the configured directory and this private root, and mark them 0755 */
1987 r
= mkdir_parents_label(pp
, 0755);
1991 /* Finally, create the actual directory for the service */
1992 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
1993 if (r
< 0 && r
!= -EEXIST
)
/* The symlink target is made relative to the directory containing the public path p. */
1996 parent
= dirname_malloc(p
);
2002 r
= path_make_relative(parent
, pp
, &relative
);
2006 /* And link it up from the original place */
2007 r
= symlink_idempotent(relative
, p
);
2014 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2015 if (r
< 0 && r
!= -EEXIST
)
2021 /* First lock down the access mode */
2022 if (chmod(effective
, context
->directories
[type
].mode
) < 0) {
2027 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2028 * a service, and shall not be writable. */
2029 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
2032 /* Then, change the ownership of the whole tree, if necessary */
2033 r
= path_chown_recursive(effective
, uid
, gid
);
2041 *exit_status
= exit_status_table
[type
];
/* Applies the SMACK label for the process about to be executed. If context->smack_process_label is set
 * it is applied with mac_smack_apply_pid() for PID 0. When the build defines
 * SMACK_DEFAULT_PROCESS_LABEL, the SMACK_ATTR_EXEC label of command->path is read instead and applied,
 * falling back to the build-time default when the binary carries none; a read failure of -ENODATA or
 * -EOPNOTSUPP is tolerated. NOTE(review): PID 0 presumably addresses the calling process, confirm
 * against mac_smack_apply_pid(). */
2045 static int setup_smack(
2046 const ExecContext
*context
,
2047 const ExecCommand
*command
) {
2054 if (context
->smack_process_label
) {
2055 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
2059 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2061 _cleanup_free_
char *exec_label
= NULL
;
2063 r
= mac_smack_read(command
->path
, SMACK_ATTR_EXEC
, &exec_label
);
2064 if (r
< 0 && !IN_SET(r
, -ENODATA
, -EOPNOTSUPP
))
2067 r
= mac_smack_apply_pid(0, exec_label
? : SMACK_DEFAULT_PROCESS_LABEL
);
/* Compiles the bind mount table for a service. The table holds a copy of every entry of
 * context->bind_mounts (source and destination duplicated with strdup()) followed by one entry per
 * configured exec directory path whose params->prefix[t] is set. With context->dynamic_user set, the
 * directories other than runtime and configuration are taken from below the private/ subdirectory of
 * the prefix, and that subdirectory is added to the empty_directories list. The table, its size n and
 * the list are handed back through ret_bind_mounts, ret_n_bind_mounts and ret_empty_directories. */
2076 static int compile_bind_mounts(
2077 const ExecContext
*context
,
2078 const ExecParameters
*params
,
2079 BindMount
**ret_bind_mounts
,
2080 unsigned *ret_n_bind_mounts
,
2081 char ***ret_empty_directories
) {
2083 _cleanup_strv_free_
char **empty_directories
= NULL
;
2084 BindMount
*bind_mounts
;
2085 unsigned n
, h
= 0, i
;
2086 ExecDirectoryType t
;
2091 assert(ret_bind_mounts
);
2092 assert(ret_n_bind_mounts
);
2093 assert(ret_empty_directories
);
/* First pass: count the entries so that the table can be allocated in one go. */
2095 n
= context
->n_bind_mounts
;
2096 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2097 if (!params
->prefix
[t
])
2100 n
+= strv_length(context
->directories
[t
].paths
);
/* NOTE(review): the condition guarding the following empty result is outside the visible lines;
 * presumably it is taken when n is zero, confirm. */
2104 *ret_bind_mounts
= NULL
;
2105 *ret_n_bind_mounts
= 0;
2106 *ret_empty_directories
= NULL
;
2110 bind_mounts
= new(BindMount
, n
);
2114 for (i
= 0; i
< context
->n_bind_mounts
; i
++) {
2115 BindMount
*item
= context
->bind_mounts
+ i
;
2118 s
= strdup(item
->source
);
2124 d
= strdup(item
->destination
);
2131 bind_mounts
[h
++] = (BindMount
) {
2134 .read_only
= item
->read_only
,
2135 .recursive
= item
->recursive
,
2136 .ignore_enoent
= item
->ignore_enoent
,
2140 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2143 if (!params
->prefix
[t
])
2146 if (strv_isempty(context
->directories
[t
].paths
))
2149 if (context
->dynamic_user
&&
2150 !IN_SET(t
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
)) {
2153 /* So this is for a dynamic user, and we need to make sure the process can access its own
2154 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2155 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2157 private_root
= strjoin(params
->prefix
[t
], "/private");
2158 if (!private_root
) {
2163 r
= strv_consume(&empty_directories
, private_root
);
2170 STRV_FOREACH(suffix
, context
->directories
[t
].paths
) {
2173 if (context
->dynamic_user
&&
2174 !IN_SET(t
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
))
2175 s
= strjoin(params
->prefix
[t
], "/private/", *suffix
);
2177 s
= strjoin(params
->prefix
[t
], "/", *suffix
);
2190 bind_mounts
[h
++] = (BindMount
) {
2195 .ignore_enoent
= false,
2202 *ret_bind_mounts
= bind_mounts
;
2203 *ret_n_bind_mounts
= n
;
2204 *ret_empty_directories
= empty_directories
;
/* Ownership of the list moved to the caller above; clear the local so the cleanup handler skips it. */
2206 empty_directories
= NULL
;
/* Only the h entries filled in so far are released here; presumably this is the failure path. */
2211 bind_mount_free_many(bind_mounts
, h
);
/* Sets up the mount namespace for a command through setup_namespace(). The NamespaceInfo flags are
 * copied from context, the bind mount table comes from compile_bind_mounts() and is released again
 * with bind_mount_free_many() after the call. root_image and root_dir are taken from context when
 * EXEC_APPLY_CHROOT is set in params->flags. The read-only and inaccessible path lists as well as
 * context->protect_home and context->protect_system are only passed on when needs_sandboxing is true,
 * i.e. EXEC_APPLY_SANDBOXING is set and the command is not flagged EXEC_COMMAND_FULLY_PRIVILEGED.
 * A result of -EPERM or -EACCES is logged at debug level as ignored. NOTE(review): several arguments
 * of the setup_namespace() call, including the use of tmp and var, are outside the visible lines. */
2215 static int apply_mount_namespace(
2217 ExecCommand
*command
,
2218 const ExecContext
*context
,
2219 const ExecParameters
*params
,
2220 ExecRuntime
*runtime
) {
2222 _cleanup_strv_free_
char **empty_directories
= NULL
;
2223 char *tmp
= NULL
, *var
= NULL
;
2224 const char *root_dir
= NULL
, *root_image
= NULL
;
2225 NamespaceInfo ns_info
= {
2226 .ignore_protect_paths
= false,
2227 .private_dev
= context
->private_devices
,
2228 .protect_control_groups
= context
->protect_control_groups
,
2229 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
2230 .protect_kernel_modules
= context
->protect_kernel_modules
,
2231 .mount_apivfs
= context
->mount_apivfs
,
2233 bool needs_sandboxing
;
2234 BindMount
*bind_mounts
= NULL
;
2235 unsigned n_bind_mounts
= 0;
2240 /* The runtime struct only contains the parent of the private /tmp,
2241 * which is non-accessible to world users. Inside of it there's a /tmp
2242 * that is sticky, and that's the one we want to use here. */
2244 if (context
->private_tmp
&& runtime
) {
2245 if (runtime
->tmp_dir
)
2246 tmp
= strjoina(runtime
->tmp_dir
, "/tmp");
2247 if (runtime
->var_tmp_dir
)
2248 var
= strjoina(runtime
->var_tmp_dir
, "/tmp");
2251 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2252 root_image
= context
->root_image
;
2255 root_dir
= context
->root_directory
;
2258 r
= compile_bind_mounts(context
, params
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
2263 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2264 * sandbox info, otherwise enforce it, don't ignore protected paths and
2265 * fail if we are enable to apply the sandbox inside the mount namespace.
2267 if (!context
->dynamic_user
&& root_dir
)
2268 ns_info
.ignore_protect_paths
= true;
2270 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2272 r
= setup_namespace(root_dir
, root_image
,
2273 &ns_info
, context
->read_write_paths
,
2274 needs_sandboxing
? context
->read_only_paths
: NULL
,
2275 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
2281 needs_sandboxing
? context
->protect_home
: PROTECT_HOME_NO
,
2282 needs_sandboxing
? context
->protect_system
: PROTECT_SYSTEM_NO
,
2283 context
->mount_flags
,
2284 DISSECT_IMAGE_DISCARD_ON_LOOP
);
2286 bind_mount_free_many(bind_mounts
, n_bind_mounts
);
2288 /* If we couldn't set up the namespace this is probably due to a
2289 * missing capability. In this case, silently proceeed. */
2290 if (IN_SET(r
, -EPERM
, -EACCES
)) {
2291 log_unit_debug_errno(u
, r
, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
/* Switches the process into its working directory. The directory wd is context->working_directory when
 * that is set; the context->working_directory_home case is handled first and can set *exit_status to
 * EXIT_CHDIR. With EXEC_APPLY_CHROOT set in params->flags, and when no mount namespace is in use but a
 * root directory is configured, chroot() is called on it and a failure is reported as EXIT_CHROOT.
 * A failing chdir() is reported as EXIT_CHDIR unless context->working_directory_missing_ok is set.
 * NOTE(review): the home lookup, the default for wd and the branch that uses prefix_roota() are
 * partly outside the visible lines; presumably the prefixed path d is used when no chroot() was
 * done, confirm. */
2298 static int apply_working_directory(
2299 const ExecContext
*context
,
2300 const ExecParameters
*params
,
2302 const bool needs_mount_ns
,
2308 assert(exit_status
);
2310 if (context
->working_directory_home
) {
2313 *exit_status
= EXIT_CHDIR
;
2319 } else if (context
->working_directory
)
2320 wd
= context
->working_directory
;
2324 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2325 if (!needs_mount_ns
&& context
->root_directory
)
2326 if (chroot(context
->root_directory
) < 0) {
2327 *exit_status
= EXIT_CHROOT
;
2333 d
= prefix_roota(context
->root_directory
, wd
);
2335 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
2336 *exit_status
= EXIT_CHDIR
;
/* Sets up the kernel session keyring for a service. Nothing is done unless EXEC_NEW_KEYRING is set in
 * p->flags and context->keyring_mode differs from EXEC_KEYRING_INHERIT. Otherwise a new session
 * keyring is joined, the invocation ID of the unit is stored in it as a user key with restricted
 * permissions, the keyring is chown()ed to uid and gid when either is valid, and for
 * EXEC_KEYRING_SHARED the user keyring is linked into the session keyring while temporarily running
 * under the target UID and GID. ENOSYS, EACCES, EPERM and EDQUOT from the join are logged at debug
 * level as ignored; other failures are logged as errors and returned. */
2343 static int setup_keyring(
2345 const ExecContext
*context
,
2346 const ExecParameters
*p
,
2347 uid_t uid
, gid_t gid
) {
2349 key_serial_t keyring
;
2356 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2357 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2358 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2359 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2360 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2361 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2363 if (!(p
->flags
& EXEC_NEW_KEYRING
))
2366 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
2369 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
2370 if (keyring
== -1) {
2371 if (errno
== ENOSYS
)
2372 log_unit_debug_errno(u
, errno
, "Kernel keyring not supported, ignoring.");
2373 else if (IN_SET(errno
, EACCES
, EPERM
))
2374 log_unit_debug_errno(u
, errno
, "Kernel keyring access prohibited, ignoring.");
2375 else if (errno
== EDQUOT
)
2376 log_unit_debug_errno(u
, errno
, "Out of kernel keyrings to allocate, ignoring.");
2378 return log_unit_error_errno(u
, errno
, "Setting up kernel keyring failed: %m");
2383 /* Populate the keyring with the invocation ID by default. */
2384 if (!sd_id128_is_null(u
->invocation_id
)) {
2387 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
2389 log_unit_debug_errno(u
, errno
, "Failed to add invocation ID to keyring, ignoring: %m");
/* Restrict the key to view, read and search for possessor and owner; no write or setattr bits. */
2391 if (keyctl(KEYCTL_SETPERM
, key
,
2392 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
2393 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
2394 return log_unit_error_errno(u
, errno
, "Failed to restrict invocation ID permission: %m");
2398 /* And now, make the keyring owned by the service's user */
2399 if (uid_is_valid(uid
) || gid_is_valid(gid
))
2400 if (keyctl(KEYCTL_CHOWN
, keyring
, uid
, gid
, 0) < 0)
2401 return log_unit_error_errno(u
, errno
, "Failed to change ownership of session keyring: %m");
2403 /* When requested link the user keyring into the session keyring. */
2404 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
2408 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2409 * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2410 * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
2412 saved_uid
= getuid();
2413 saved_gid
= getgid();
2415 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2416 if (setregid(gid
, -1) < 0)
2417 return log_unit_error_errno(u
, errno
, "Failed to change GID for user keyring: %m");
/* NOTE(review): in the failure branch below the GID is restored with setregid() before errno is handed
 * to log_unit_error_errno(). setregid() may overwrite errno, so the reported error can be the wrong one.
 * Suggested fix: save errno into a local before the restore call and log that. The KEYCTL_LINK failure
 * path further down logs r instead of errno, presumably such a saved copy. */
2420 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2421 if (setreuid(uid
, -1) < 0) {
2422 (void) setregid(saved_gid
, -1);
2423 return log_unit_error_errno(u
, errno
, "Failed to change UID for user keyring: %m");
2427 if (keyctl(KEYCTL_LINK
,
2428 KEY_SPEC_USER_KEYRING
,
2429 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
2433 (void) setreuid(saved_uid
, -1);
2434 (void) setregid(saved_gid
, -1);
2436 return log_unit_error_errno(u
, r
, "Failed to link user keyring into session keyring: %m");
/* NOTE(review): same errno hazard as above, setregid() runs between the failing setreuid() and the
 * read of errno. */
2439 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2440 if (setreuid(saved_uid
, -1) < 0) {
2441 (void) setregid(saved_gid
, -1);
2442 return log_unit_error_errno(u
, errno
, "Failed to change UID back for user keyring: %m");
2446 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2447 if (setregid(saved_gid
, -1) < 0)
2448 return log_unit_error_errno(u
, errno
, "Failed to change GID back for user keyring: %m");
/* Appends the two descriptors of pair to array, using *n as the write index and advancing it once per
 * stored descriptor. The caller must provide room for two more elements. NOTE(review): any checks on
 * pair or on the individual descriptors are outside the visible lines. */
2455 static void append_socket_pair(int *array
, unsigned *n
, int pair
[2]) {
2463 array
[(*n
)++] = pair
[0];
2465 array
[(*n
)++] = pair
[1];
/* Closes every file descriptor of the process except those that must survive into the executed
 * command, by collecting the keepers in dont_close and passing the list to close_all_fds(). Kept are
 * the stdin, stdout and stderr descriptors from params when they are non-negative, socket_fd, the n_fds
 * entries of fds, the netns storage socket pair of runtime, the user and group storage socket pairs of
 * dcreds, and user_lookup_fd when non-negative. The visible code stores at most 11 descriptors in
 * addition to fds, which fits the n_fds + 12 array size. NOTE(review): the declarations of socket_fd
 * and user_lookup_fd and the guards around the runtime and dcreds accesses are outside the visible
 * lines. */
2468 static int close_remaining_fds(
2469 const ExecParameters
*params
,
2470 ExecRuntime
*runtime
,
2471 DynamicCreds
*dcreds
,
2474 int *fds
, unsigned n_fds
) {
2476 unsigned n_dont_close
= 0;
2477 int dont_close
[n_fds
+ 12];
2481 if (params
->stdin_fd
>= 0)
2482 dont_close
[n_dont_close
++] = params
->stdin_fd
;
2483 if (params
->stdout_fd
>= 0)
2484 dont_close
[n_dont_close
++] = params
->stdout_fd
;
2485 if (params
->stderr_fd
>= 0)
2486 dont_close
[n_dont_close
++] = params
->stderr_fd
;
2489 dont_close
[n_dont_close
++] = socket_fd
;
2491 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
2492 n_dont_close
+= n_fds
;
2496 append_socket_pair(dont_close
, &n_dont_close
, runtime
->netns_storage_socket
);
2500 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->user
->storage_socket
);
2502 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->group
->storage_socket
);
2505 if (user_lookup_fd
>= 0)
2506 dont_close
[n_dont_close
++] = user_lookup_fd
;
2508 return close_all_fds(dont_close
, n_dont_close
);
/* Reports the resolved uid and gid together with the unit id over user_lookup_fd, as one writev() of
 * three segments: the raw uid, the raw gid and the unit->id string. Nothing is sent when
 * user_lookup_fd is negative or when neither uid nor gid is valid. NOTE(review): the parameter list
 * and the return statements are outside the visible lines. */
2511 static int send_user_lookup(
2519 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2520 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2523 if (user_lookup_fd
< 0)
2526 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
2529 if (writev(user_lookup_fd
,
2531 IOVEC_INIT(&uid
, sizeof(uid
)),
2532 IOVEC_INIT(&gid
, sizeof(gid
)),
2533 IOVEC_INIT_STRING(unit
->id
) }, 3) < 0)
2539 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
2546 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. The result is handed back
 * through home; buf receives the string obtained from get_home_dir() below. NOTE(review): the
 * early exits and the assignments to *home are outside the visible lines; presumably nothing is
 * looked up when c->working_directory_home is unset, confirm. */
2551 if (!c
->working_directory_home
)
2555 /* Hardcode /root as home directory for UID 0. NOTE(review): presumably the get_home_dir() call that
 * follows is the lookup path for all other cases, confirm. */
2560 r
= get_home_dir(buf
);
/* Builds a string vector of directory paths for a DynamicUser= service (c->dynamic_user is asserted).
 * For every exec directory type except configuration, each entry of c->directories[t].paths is
 * joined onto p->prefix[t]: directly for the runtime directory, and below the private/ subdirectory
 * for the other types. Each path is appended to list with strv_consume(). NOTE(review): the check of
 * p->prefix[t] and the hand-over of list to *ret are outside the visible lines. */
2568 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
2569 _cleanup_strv_free_
char ** list
= NULL
;
2570 ExecDirectoryType t
;
2577 assert(c
->dynamic_user
);
2579 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2580 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2583 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2586 if (t
== EXEC_DIRECTORY_CONFIGURATION
)
2592 STRV_FOREACH(i
, c
->directories
[t
].paths
) {
2595 if (t
== EXEC_DIRECTORY_RUNTIME
)
2596 e
= strjoin(p
->prefix
[t
], "/", *i
);
2598 e
= strjoin(p
->prefix
[t
], "/private/", *i
);
2602 r
= strv_consume(&list
, e
);
2614 static int exec_child(
2616 ExecCommand
*command
,
2617 const ExecContext
*context
,
2618 const ExecParameters
*params
,
2619 ExecRuntime
*runtime
,
2620 DynamicCreds
*dcreds
,
2625 unsigned n_storage_fds
,
2626 unsigned n_socket_fds
,
2631 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **accum_env
= NULL
, **final_argv
= NULL
;
2632 _cleanup_free_
char *mac_selinux_context_net
= NULL
, *home_buffer
= NULL
;
2633 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
2634 const char *username
= NULL
, *groupname
= NULL
;
2635 const char *home
= NULL
, *shell
= NULL
;
2636 dev_t journal_stream_dev
= 0;
2637 ino_t journal_stream_ino
= 0;
2638 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2639 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
2640 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
2641 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
2643 bool use_selinux
= false;
2646 bool use_smack
= false;
2649 bool use_apparmor
= false;
2651 uid_t uid
= UID_INVALID
;
2652 gid_t gid
= GID_INVALID
;
2653 int i
, r
, ngids
= 0;
2655 ExecDirectoryType dt
;
2662 assert(exit_status
);
2664 rename_process_from_path(command
->path
);
2666 /* We reset exactly these signals, since they are the
2667 * only ones we set to SIG_IGN in the main daemon. All
2668 * others we leave untouched because we set them to
2669 * SIG_DFL or a valid handler initially, both of which
2670 * will be demoted to SIG_DFL. */
2671 (void) default_signals(SIGNALS_CRASH_HANDLER
,
2672 SIGNALS_IGNORE
, -1);
2674 if (context
->ignore_sigpipe
)
2675 (void) ignore_signals(SIGPIPE
, -1);
2677 r
= reset_signal_mask();
2679 *exit_status
= EXIT_SIGNAL_MASK
;
2680 return log_unit_error_errno(unit
, r
, "Failed to set process signal mask: %m");
2683 if (params
->idle_pipe
)
2684 do_idle_pipe_dance(params
->idle_pipe
);
2686 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2687 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2688 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2689 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2692 log_set_open_when_needed(true);
2694 /* In case anything used libc syslog(), close this here, too */
2697 n_fds
= n_storage_fds
+ n_socket_fds
;
2698 r
= close_remaining_fds(params
, runtime
, dcreds
, user_lookup_fd
, socket_fd
, fds
, n_fds
);
2700 *exit_status
= EXIT_FDS
;
2701 return log_unit_error_errno(unit
, r
, "Failed to close unwanted file descriptors: %m");
2704 if (!context
->same_pgrp
)
2706 *exit_status
= EXIT_SETSID
;
2707 return log_unit_error_errno(unit
, errno
, "Failed to create new process session: %m");
2710 exec_context_tty_reset(context
, params
);
2712 if (unit_shall_confirm_spawn(unit
)) {
2713 const char *vc
= params
->confirm_spawn
;
2714 _cleanup_free_
char *cmdline
= NULL
;
2716 cmdline
= exec_command_line(argv
);
2718 *exit_status
= EXIT_MEMORY
;
2722 r
= ask_for_confirmation(vc
, unit
, cmdline
);
2723 if (r
!= CONFIRM_EXECUTE
) {
2724 if (r
== CONFIRM_PRETEND_SUCCESS
) {
2725 *exit_status
= EXIT_SUCCESS
;
2728 *exit_status
= EXIT_CONFIRM
;
2729 log_unit_error(unit
, "Execution cancelled by the user");
2734 if (context
->dynamic_user
&& dcreds
) {
2735 _cleanup_strv_free_
char **suggested_paths
= NULL
;
2737 /* Make sure we bypass our own NSS module for any NSS checks */
2738 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2739 *exit_status
= EXIT_USER
;
2740 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
2743 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
2745 *exit_status
= EXIT_MEMORY
;
2749 r
= dynamic_creds_realize(dcreds
, suggested_paths
, &uid
, &gid
);
2751 *exit_status
= EXIT_USER
;
2753 log_unit_error(unit
, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2756 return log_unit_error_errno(unit
, r
, "Failed to update dynamic user credentials: %m");
2759 if (!uid_is_valid(uid
)) {
2760 *exit_status
= EXIT_USER
;
2761 log_unit_error(unit
, "UID validation failed for \""UID_FMT
"\"", uid
);
2765 if (!gid_is_valid(gid
)) {
2766 *exit_status
= EXIT_USER
;
2767 log_unit_error(unit
, "GID validation failed for \""GID_FMT
"\"", gid
);
2772 username
= dcreds
->user
->name
;
2775 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
2777 *exit_status
= EXIT_USER
;
2778 return log_unit_error_errno(unit
, r
, "Failed to determine user credentials: %m");
2781 r
= get_fixed_group(context
, &groupname
, &gid
);
2783 *exit_status
= EXIT_GROUP
;
2784 return log_unit_error_errno(unit
, r
, "Failed to determine group credentials: %m");
2788 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2789 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
2790 &supplementary_gids
, &ngids
);
2792 *exit_status
= EXIT_GROUP
;
2793 return log_unit_error_errno(unit
, r
, "Failed to determine supplementary groups: %m");
2796 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
2798 *exit_status
= EXIT_USER
;
2799 return log_unit_error_errno(unit
, r
, "Failed to send user credentials to PID1: %m");
2802 user_lookup_fd
= safe_close(user_lookup_fd
);
2804 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
2806 *exit_status
= EXIT_CHDIR
;
2807 return log_unit_error_errno(unit
, r
, "Failed to determine $HOME for user: %m");
2810 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2811 * must sure to drop O_NONBLOCK */
2813 (void) fd_nonblock(socket_fd
, false);
2815 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
2817 *exit_status
= EXIT_STDIN
;
2818 return log_unit_error_errno(unit
, r
, "Failed to set up standard input: %m");
2821 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2823 *exit_status
= EXIT_STDOUT
;
2824 return log_unit_error_errno(unit
, r
, "Failed to set up standard output: %m");
2827 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2829 *exit_status
= EXIT_STDERR
;
2830 return log_unit_error_errno(unit
, r
, "Failed to set up standard error output: %m");
2833 if (params
->cgroup_path
) {
2834 r
= cg_attach_everywhere(params
->cgroup_supported
, params
->cgroup_path
, 0, NULL
, NULL
);
2836 *exit_status
= EXIT_CGROUP
;
2837 return log_unit_error_errno(unit
, r
, "Failed to attach to cgroup %s: %m", params
->cgroup_path
);
2841 if (context
->oom_score_adjust_set
) {
2842 char t
[DECIMAL_STR_MAX(context
->oom_score_adjust
)];
2844 /* When we can't make this change due to EPERM, then
2845 * let's silently skip over it. User namespaces
2846 * prohibit write access to this file, and we
2847 * shouldn't trip up over that. */
2849 sprintf(t
, "%i", context
->oom_score_adjust
);
2850 r
= write_string_file("/proc/self/oom_score_adj", t
, 0);
2851 if (IN_SET(r
, -EPERM
, -EACCES
))
2852 log_unit_debug_errno(unit
, r
, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2854 *exit_status
= EXIT_OOM_ADJUST
;
2855 return log_unit_error_errno(unit
, r
, "Failed to adjust OOM setting: %m");
2859 if (context
->nice_set
)
2860 if (setpriority(PRIO_PROCESS
, 0, context
->nice
) < 0) {
2861 *exit_status
= EXIT_NICE
;
2862 return log_unit_error_errno(unit
, errno
, "Failed to set up process scheduling priority (nice level): %m");
2865 if (context
->cpu_sched_set
) {
2866 struct sched_param param
= {
2867 .sched_priority
= context
->cpu_sched_priority
,
2870 r
= sched_setscheduler(0,
2871 context
->cpu_sched_policy
|
2872 (context
->cpu_sched_reset_on_fork
?
2873 SCHED_RESET_ON_FORK
: 0),
2876 *exit_status
= EXIT_SETSCHEDULER
;
2877 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU scheduling: %m");
2881 if (context
->cpuset
)
2882 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context
->cpuset_ncpus
), context
->cpuset
) < 0) {
2883 *exit_status
= EXIT_CPUAFFINITY
;
2884 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU affinity: %m");
2887 if (context
->ioprio_set
)
2888 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
2889 *exit_status
= EXIT_IOPRIO
;
2890 return log_unit_error_errno(unit
, errno
, "Failed to set up IO scheduling priority: %m");
2893 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
2894 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
2895 *exit_status
= EXIT_TIMERSLACK
;
2896 return log_unit_error_errno(unit
, errno
, "Failed to set up timer slack: %m");
2899 if (context
->personality
!= PERSONALITY_INVALID
) {
2900 r
= safe_personality(context
->personality
);
2902 *exit_status
= EXIT_PERSONALITY
;
2903 return log_unit_error_errno(unit
, r
, "Failed to set up execution domain (personality): %m");
2907 if (context
->utmp_id
)
2908 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
2910 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
2911 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
2915 if (context
->user
) {
2916 r
= chown_terminal(STDIN_FILENO
, uid
);
2918 *exit_status
= EXIT_STDIN
;
2919 return log_unit_error_errno(unit
, r
, "Failed to change ownership of terminal: %m");
2923 /* If delegation is enabled we'll pass ownership of the cgroup
2924 * (but only in systemd's own controller hierarchy!) to the
2925 * user of the new process. */
2926 if (params
->cgroup_path
&& context
->user
&& (params
->flags
& EXEC_CGROUP_DELEGATE
)) {
2927 r
= cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, 0644, uid
, gid
);
2929 *exit_status
= EXIT_CGROUP
;
2930 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
2933 r
= cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, 0755, uid
, gid
);
2935 *exit_status
= EXIT_CGROUP
;
2936 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
2940 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
2941 r
= setup_exec_directory(context
, params
, uid
, gid
, dt
, exit_status
);
2943 return log_unit_error_errno(unit
, r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
2946 r
= build_environment(
2958 *exit_status
= EXIT_MEMORY
;
2962 r
= build_pass_environment(context
, &pass_env
);
2964 *exit_status
= EXIT_MEMORY
;
2968 accum_env
= strv_env_merge(5,
2969 params
->environment
,
2972 context
->environment
,
2976 *exit_status
= EXIT_MEMORY
;
2979 accum_env
= strv_env_clean(accum_env
);
2981 (void) umask(context
->umask
);
2983 r
= setup_keyring(unit
, context
, params
, uid
, gid
);
2985 *exit_status
= EXIT_KEYRING
;
2986 return log_unit_error_errno(unit
, r
, "Failed to set up kernel keyring: %m");
2989 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
2990 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2992 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
2993 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
2995 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
2996 if (needs_ambient_hack
)
2997 needs_setuid
= false;
2999 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
3001 if (needs_sandboxing
) {
3002 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3003 * present. The actual MAC context application will happen later, as late as possible, to avoid
3004 * impacting our own code paths. */
3007 use_selinux
= mac_selinux_use();
3010 use_smack
= mac_smack_use();
3013 use_apparmor
= mac_apparmor_use();
3018 if (context
->pam_name
&& username
) {
3019 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
3021 *exit_status
= EXIT_PAM
;
3022 return log_unit_error_errno(unit
, r
, "Failed to set up PAM session: %m");
3027 if (context
->private_network
&& runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
3028 if (ns_type_supported(NAMESPACE_NET
)) {
3029 r
= setup_netns(runtime
->netns_storage_socket
);
3031 *exit_status
= EXIT_NETWORK
;
3032 return log_unit_error_errno(unit
, r
, "Failed to set up network namespacing: %m");
3035 log_unit_warning(unit
, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3038 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
3039 if (needs_mount_namespace
) {
3040 r
= apply_mount_namespace(unit
, command
, context
, params
, runtime
);
3042 *exit_status
= EXIT_NAMESPACE
;
3043 return log_unit_error_errno(unit
, r
, "Failed to set up mount namespacing: %m");
3047 /* Apply just after mount namespace setup */
3048 r
= apply_working_directory(context
, params
, home
, needs_mount_namespace
, exit_status
);
3050 return log_unit_error_errno(unit
, r
, "Changing to the requested working directory failed: %m");
3052 /* Drop groups as early as possbile */
3054 r
= enforce_groups(gid
, supplementary_gids
, ngids
);
3056 *exit_status
= EXIT_GROUP
;
3057 return log_unit_error_errno(unit
, r
, "Changing group credentials failed: %m");
3061 if (needs_sandboxing
) {
3063 if (use_selinux
&& params
->selinux_context_net
&& socket_fd
>= 0) {
3064 r
= mac_selinux_get_child_mls_label(socket_fd
, command
->path
, context
->selinux_context
, &mac_selinux_context_net
);
3066 *exit_status
= EXIT_SELINUX_CONTEXT
;
3067 return log_unit_error_errno(unit
, r
, "Failed to determine SELinux context: %m");
3072 if (context
->private_users
) {
3073 r
= setup_private_users(uid
, gid
);
3075 *exit_status
= EXIT_USER
;
3076 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing: %m");
3081 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3082 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3083 * was needed to upload the policy and can now be closed as well. */
3084 r
= close_all_fds(fds
, n_fds
);
3086 r
= shift_fds(fds
, n_fds
);
3088 r
= flags_fds(fds
, n_storage_fds
, n_socket_fds
, context
->non_blocking
);
3090 *exit_status
= EXIT_FDS
;
3091 return log_unit_error_errno(unit
, r
, "Failed to adjust passed file descriptors: %m");
3094 secure_bits
= context
->secure_bits
;
3096 if (needs_sandboxing
) {
3099 for (i
= 0; i
< _RLIMIT_MAX
; i
++) {
3101 if (!context
->rlimit
[i
])
3104 r
= setrlimit_closest(i
, context
->rlimit
[i
]);
3106 *exit_status
= EXIT_LIMITS
;
3107 return log_unit_error_errno(unit
, r
, "Failed to adjust resource limit %s: %m", rlimit_to_string(i
));
3111 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3112 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
3113 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
3114 *exit_status
= EXIT_LIMITS
;
3115 return log_unit_error_errno(unit
, errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3119 bset
= context
->capability_bounding_set
;
3120 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3121 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3122 * instead of us doing that */
3123 if (needs_ambient_hack
)
3124 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
3125 (UINT64_C(1) << CAP_SETUID
) |
3126 (UINT64_C(1) << CAP_SETGID
);
3128 if (!cap_test_all(bset
)) {
3129 r
= capability_bounding_set_drop(bset
, false);
3131 *exit_status
= EXIT_CAPABILITIES
;
3132 return log_unit_error_errno(unit
, r
, "Failed to drop capabilities: %m");
3136 /* This is done before enforce_user, but ambient set
3137 * does not survive over setresuid() if keep_caps is not set. */
3138 if (!needs_ambient_hack
&&
3139 context
->capability_ambient_set
!= 0) {
3140 r
= capability_ambient_set_apply(context
->capability_ambient_set
, true);
3142 *exit_status
= EXIT_CAPABILITIES
;
3143 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (before UID change): %m");
3149 if (context
->user
) {
3150 r
= enforce_user(context
, uid
);
3152 *exit_status
= EXIT_USER
;
3153 return log_unit_error_errno(unit
, r
, "Failed to change UID to " UID_FMT
": %m", uid
);
3156 if (!needs_ambient_hack
&&
3157 context
->capability_ambient_set
!= 0) {
3159 /* Fix the ambient capabilities after user change. */
3160 r
= capability_ambient_set_apply(context
->capability_ambient_set
, false);
3162 *exit_status
= EXIT_CAPABILITIES
;
3163 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (after UID change): %m");
3166 /* If we were asked to change user and ambient capabilities
3167 * were requested, we had to add keep-caps to the securebits
3168 * so that we would maintain the inherited capability set
3169 * through the setresuid(). Make sure that the bit is added
3170 * also to the context secure_bits so that we don't try to
3171 * drop the bit away next. */
3173 secure_bits
|= 1<<SECURE_KEEP_CAPS
;
3178 if (needs_sandboxing
) {
3179 /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3180 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3181 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3182 * are restricted. */
3186 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
3189 r
= setexeccon(exec_context
);
3191 *exit_status
= EXIT_SELINUX_CONTEXT
;
3192 return log_unit_error_errno(unit
, r
, "Failed to change SELinux context to %s: %m", exec_context
);
3200 r
= setup_smack(context
, command
);
3202 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
3203 return log_unit_error_errno(unit
, r
, "Failed to set SMACK process label: %m");
3209 if (use_apparmor
&& context
->apparmor_profile
) {
3210 r
= aa_change_onexec(context
->apparmor_profile
);
3211 if (r
< 0 && !context
->apparmor_profile_ignore
) {
3212 *exit_status
= EXIT_APPARMOR_PROFILE
;
3213 return log_unit_error_errno(unit
, errno
, "Failed to prepare AppArmor profile change to %s: %m", context
->apparmor_profile
);
3218 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3219 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3220 if (prctl(PR_GET_SECUREBITS
) != secure_bits
)
3221 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
3222 *exit_status
= EXIT_SECUREBITS
;
3223 return log_unit_error_errno(unit
, errno
, "Failed to set process secure bits: %m");
3226 if (context_has_no_new_privileges(context
))
3227 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
3228 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
3229 return log_unit_error_errno(unit
, errno
, "Failed to disable new privileges: %m");
3233 r
= apply_address_families(unit
, context
);
3235 *exit_status
= EXIT_ADDRESS_FAMILIES
;
3236 return log_unit_error_errno(unit
, r
, "Failed to restrict address families: %m");
3239 r
= apply_memory_deny_write_execute(unit
, context
);
3241 *exit_status
= EXIT_SECCOMP
;
3242 return log_unit_error_errno(unit
, r
, "Failed to disable writing to executable memory: %m");
3245 r
= apply_restrict_realtime(unit
, context
);
3247 *exit_status
= EXIT_SECCOMP
;
3248 return log_unit_error_errno(unit
, r
, "Failed to apply realtime restrictions: %m");
3251 r
= apply_restrict_namespaces(unit
, context
);
3253 *exit_status
= EXIT_SECCOMP
;
3254 return log_unit_error_errno(unit
, r
, "Failed to apply namespace restrictions: %m");
3257 r
= apply_protect_sysctl(unit
, context
);
3259 *exit_status
= EXIT_SECCOMP
;
3260 return log_unit_error_errno(unit
, r
, "Failed to apply sysctl restrictions: %m");
3263 r
= apply_protect_kernel_modules(unit
, context
);
3265 *exit_status
= EXIT_SECCOMP
;
3266 return log_unit_error_errno(unit
, r
, "Failed to apply module loading restrictions: %m");
3269 r
= apply_private_devices(unit
, context
);
3271 *exit_status
= EXIT_SECCOMP
;
3272 return log_unit_error_errno(unit
, r
, "Failed to set up private devices: %m");
3275 r
= apply_syscall_archs(unit
, context
);
3277 *exit_status
= EXIT_SECCOMP
;
3278 return log_unit_error_errno(unit
, r
, "Failed to apply syscall architecture restrictions: %m");
3281 r
= apply_lock_personality(unit
, context
);
3283 *exit_status
= EXIT_SECCOMP
;
3284 return log_unit_error_errno(unit
, r
, "Failed to lock personalities: %m");
3287 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3288 * by the filter as little as possible. */
3289 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
3291 *exit_status
= EXIT_SECCOMP
;
3292 return log_unit_error_errno(unit
, r
, "Failed to apply system call filters: %m");
3297 if (!strv_isempty(context
->unset_environment
)) {
3300 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
3302 *exit_status
= EXIT_MEMORY
;
3306 strv_free(accum_env
);
3310 final_argv
= replace_env_argv(argv
, accum_env
);
3312 *exit_status
= EXIT_MEMORY
;
3316 if (_unlikely_(log_get_max_level() >= LOG_DEBUG
)) {
3317 _cleanup_free_
char *line
;
3319 line
= exec_command_line(final_argv
);
3321 log_struct(LOG_DEBUG
,
3322 "EXECUTABLE=%s", command
->path
,
3323 LOG_UNIT_MESSAGE(unit
, "Executing: %s", line
),
3325 LOG_UNIT_INVOCATION_ID(unit
),
3330 execve(command
->path
, final_argv
, accum_env
);
3332 if (errno
== ENOENT
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
)) {
3334 log_struct_errno(LOG_INFO
, errno
,
3335 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3337 LOG_UNIT_INVOCATION_ID(unit
),
3338 LOG_UNIT_MESSAGE(unit
, "Executable %s missing, skipping: %m",
3340 "EXECUTABLE=%s", command
->path
,
3346 *exit_status
= EXIT_EXEC
;
3347 return log_unit_error_errno(unit
, errno
, "Failed to execute command: %m");
/* Spawns the given command on behalf of a unit.
 *
 * Before the process is created this resolves the named stdio file descriptors via
 * exec_context_named_iofds() and loads the context's environment files via
 * exec_context_load_environment(); a failure of either is reported through log_unit_error_errno() and
 * that call's result is returned. The command line is then logged at debug level ("About to execute").
 *
 * exec_child() is called (presumably only in the forked child -- confirm) and is passed, among other
 * arguments, the manager's user_lookup_fds[1]. If it fails, the failing step (derived from exit_status)
 * and the executable path are logged under SD_MESSAGE_SPAWN_FAILED_STR.
 *
 * Afterwards the new PID is logged, attached to params->cgroup_path when one is set (the result of
 * cg_attach() is deliberately ignored) and recorded with exec_status_start().
 *
 * A fork failure is reported from errno ("Failed to fork"). */
3350 int exec_spawn(Unit
*unit
,
3351 ExecCommand
*command
,
3352 const ExecContext
*context
,
3353 const ExecParameters
*params
,
3354 ExecRuntime
*runtime
,
3355 DynamicCreds
*dcreds
,
3358 _cleanup_strv_free_
char **files_env
= NULL
;
3360 unsigned n_storage_fds
= 0, n_socket_fds
= 0;
3361 _cleanup_free_
char *line
= NULL
;
3363 int named_iofds
[3] = { -1, -1, -1 };
3372 assert(params
->fds
|| (params
->n_storage_fds
+ params
->n_socket_fds
<= 0));
/* If any of stdin/stdout/stderr is configured as a socket, the socket count is checked (more than one
 * and zero are both logged as errors) and the socket fd is taken from params->fds[0]. */
3374 if (context
->std_input
== EXEC_INPUT_SOCKET
||
3375 context
->std_output
== EXEC_OUTPUT_SOCKET
||
3376 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
3378 if (params
->n_socket_fds
> 1) {
3379 log_unit_error(unit
, "Got more than one socket.");
3383 if (params
->n_socket_fds
== 0) {
3384 log_unit_error(unit
, "Got no socket.");
3388 socket_fd
= params
->fds
[0];
3392 n_storage_fds
= params
->n_storage_fds
;
3393 n_socket_fds
= params
->n_socket_fds
;
3396 r
= exec_context_named_iofds(unit
, context
, params
, named_iofds
);
3398 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
3400 r
= exec_context_load_environment(unit
, context
, &files_env
);
3402 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
/* An argv supplied through params takes precedence over the command's own argv. */
3404 argv
= params
->argv
?: command
->argv
;
3405 line
= exec_command_line(argv
);
3409 log_struct(LOG_DEBUG
,
3410 LOG_UNIT_MESSAGE(unit
, "About to execute: %s", line
),
3411 "EXECUTABLE=%s", command
->path
,
3413 LOG_UNIT_INVOCATION_ID(unit
),
3418 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
3421 int exit_status
= EXIT_SUCCESS
;
3423 r
= exec_child(unit
,
3436 unit
->manager
->user_lookup_fds
[1],
3440 log_struct_errno(LOG_ERR
, r
,
3441 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3443 LOG_UNIT_INVOCATION_ID(unit
),
3444 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
3445 exit_status_to_string(exit_status
, EXIT_STATUS_SYSTEMD
),
3447 "EXECUTABLE=%s", command
->path
,
3454 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
3456 /* We add the new process to the cgroup both in the child (so
3457 * that we can be sure that no user code is ever executed
3458 * outside of the cgroup) and in the parent (so that we can be
3459 * sure that when we kill the cgroup the process will be
3461 if (params
->cgroup_path
)
3462 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, pid
);
3464 exec_status_start(&command
->exec_status
, pid
);
/* Fills an ExecContext with its default settings:
 *
 *  - I/O priority: class IOPRIO_CLASS_BE with data value 0
 *  - CPU scheduling policy: SCHED_OTHER
 *  - syslog priority LOG_DAEMON|LOG_INFO, with the level prefix enabled
 *  - SIGPIPE ignored
 *  - timer_slack_nsec = NSEC_INFINITY and personality = PERSONALITY_INVALID; these are the values the
 *    spawning code compares against before applying either setting, i.e. they mean "not configured"
 *  - mode 0755 for every exec directory type
 *  - capability bounding set CAP_ALL, restrict_namespaces NAMESPACE_FLAGS_ALL
 *  - log_level_max = -1 (a negative value is treated as "unset" when the context is dumped) */
3470 void exec_context_init(ExecContext
*c
) {
3471 ExecDirectoryType i
;
3476 c
->ioprio
= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 0);
3477 c
->cpu_sched_policy
= SCHED_OTHER
;
3478 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
3479 c
->syslog_level_prefix
= true;
3480 c
->ignore_sigpipe
= true;
3481 c
->timer_slack_nsec
= NSEC_INFINITY
;
3482 c
->personality
= PERSONALITY_INVALID
;
3483 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3484 c
->directories
[i
].mode
= 0755;
3485 c
->capability_bounding_set
= CAP_ALL
;
3486 c
->restrict_namespaces
= NAMESPACE_FLAGS_ALL
;
3487 c
->log_level_max
= -1;
/* Releases the resources owned by an ExecContext.
 *
 * Every owned member is passed to its matching release helper (strv_free() for string vectors, mfree()
 * for single allocations, hashmap_free()/set_free() for containers) and the helper's return value is
 * stored back into the member, so no member keeps pointing at released memory. Bind mounts, the CPU set
 * and the extra log fields are released through their dedicated helpers. log_level_max is reset to -1,
 * the same value exec_context_init() assigns.
 *
 * NOTE(review): nothing here releases the ExecContext object itself -- presumably the caller owns it;
 * confirm. */
3490 void exec_context_done(ExecContext
*c
) {
3491 ExecDirectoryType i
;
/* Environment-related string vectors. */
3496 c
->environment
= strv_free(c
->environment
);
3497 c
->environment_files
= strv_free(c
->environment_files
);
3498 c
->pass_environment
= strv_free(c
->pass_environment
);
3499 c
->unset_environment
= strv_free(c
->unset_environment
);
/* Per-resource limit entries; each slot is an individually allocated object. */
3501 for (l
= 0; l
< ELEMENTSOF(c
->rlimit
); l
++)
3502 c
->rlimit
[l
] = mfree(c
->rlimit
[l
]);
/* One fd name per standard stream (stdin, stdout, stderr). */
3504 for (l
= 0; l
< 3; l
++)
3505 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
3507 c
->working_directory
= mfree(c
->working_directory
);
3508 c
->root_directory
= mfree(c
->root_directory
);
3509 c
->root_image
= mfree(c
->root_image
);
3510 c
->tty_path
= mfree(c
->tty_path
);
3511 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
3512 c
->user
= mfree(c
->user
);
3513 c
->group
= mfree(c
->group
);
3515 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
3517 c
->pam_name
= mfree(c
->pam_name
);
3519 c
->read_only_paths
= strv_free(c
->read_only_paths
);
3520 c
->read_write_paths
= strv_free(c
->read_write_paths
);
3521 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
3523 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
3526 CPU_FREE(c
->cpuset
);
3528 c
->utmp_id
= mfree(c
->utmp_id
);
3529 c
->selinux_context
= mfree(c
->selinux_context
);
3530 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
3531 c
->smack_process_label
= mfree(c
->smack_process_label
);
3533 c
->syscall_filter
= hashmap_free(c
->syscall_filter
);
3534 c
->syscall_archs
= set_free(c
->syscall_archs
);
3535 c
->address_families
= set_free(c
->address_families
);
/* Path lists of every exec directory type. */
3537 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3538 c
->directories
[i
].paths
= strv_free(c
->directories
[i
].paths
);
3540 c
->log_level_max
= -1;
3542 exec_context_free_log_extra_fields(c
);
/* Removes the runtime directories configured for the context.
 *
 * For each entry of c->directories[EXEC_DIRECTORY_RUNTIME].paths the full path is built as
 * runtime_prefix + "/" + entry and handed to rm_rf() with REMOVE_ROOT. The result of rm_rf() is
 * deliberately ignored, so removal is best-effort. A NULL runtime_prefix is checked for before the
 * loop.
 *
 * NOTE(review): the return values (for the NULL-prefix case, for a failed strjoin() and on success)
 * should be confirmed against the complete function body. */
3545 int exec_context_destroy_runtime_directory(ExecContext
*c
, const char *runtime_prefix
) {
3550 if (!runtime_prefix
)
3553 STRV_FOREACH(i
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) {
3554 _cleanup_free_
char *p
;
3556 p
= strjoin(runtime_prefix
, "/", *i
);
3560 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3562 (void) rm_rf(p
, REMOVE_ROOT
);
/* Releases the members owned by a single ExecCommand: the executable path (mfree()) and the argument
 * vector (strv_free()). Each helper's return value is stored back into the member. The ExecCommand
 * object itself is not released here. */
3568 void exec_command_done(ExecCommand
*c
) {
3571 c
->path
= mfree(c
->path
);
3573 c
->argv
= strv_free(c
->argv
);
/* Calls exec_command_done() on each of the first n elements of the contiguous ExecCommand array c
 * (elements are addressed as c+i). */
3576 void exec_command_done_array(ExecCommand
*c
, unsigned n
) {
3579 for (i
= 0; i
< n
; i
++)
3580 exec_command_done(c
+i
);
/* Tears down a linked list of ExecCommand objects headed by c: an entry i is unlinked from the list
 * with LIST_REMOVE() and its members are released with exec_command_done().
 *
 * NOTE(review): exec_command_free_array() stores this function's return value back into the list head,
 * so it presumably returns NULL after releasing every entry (including the entry objects themselves)
 * -- confirm against the complete function body. */
3583 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
3587 LIST_REMOVE(command
, c
, i
);
3588 exec_command_done(i
);
/* Treats c as an array of n ExecCommand list heads: runs exec_command_free_list() on each head and
 * writes the returned value back into the same slot. */
3595 void exec_command_free_array(ExecCommand
**c
, unsigned n
) {
3598 for (i
= 0; i
< n
; i
++)
3599 c
[i
] = exec_command_free_list(c
[i
]);
/* Context handed to the invalid_env() callback as its userdata argument. invalid_env() reads the
 * members `unit` (the unit to log against) and `path` (reported alongside the rejected assignment). */
3602 typedef struct InvalidEnvInfo
{
/* Callback passed to strv_env_clean_with_callback(): logs, at error level and against info->unit, that
 * the environment assignment p is being ignored, followed by info->path.
 *
 * p:        the rejected assignment string
 * userdata: must point to an InvalidEnvInfo */
3607 static void invalid_env(const char *p
, void *userdata
) {
3608 InvalidEnvInfo
*info
= userdata
;
3610 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
/* Returns the fd name associated with one of the three standard streams of the context.
 *
 * For a stream that is configured as a named fd (EXEC_INPUT_NAMED_FD / EXEC_OUTPUT_NAMED_FD) the result
 * is the matching c->stdio_fdname[] entry, or the literal "stdin", "stdout" or "stderr" when that entry
 * is unset.
 *
 * NOTE(review): fd_index presumably selects the stream (STDIN_FILENO, STDOUT_FILENO or STDERR_FILENO),
 * and the result for a stream that is not a named fd is presumably NULL, since
 * exec_context_named_iofds() null-checks the returned names -- confirm both. */
3613 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
3618 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
3620 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
3622 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
3624 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
3626 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
3628 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
/* Resolves the file descriptors to use for standard streams that are configured as named fds.
 *
 * `targets` starts as the number of streams (stdin, stdout, stderr) configured as a named fd. The fds
 * passed in p (p->n_storage_fds + p->n_socket_fds entries) are then scanned in order; for each one, its
 * name p->fd_names[i] is compared against the name expected for each still-unresolved stream
 * (named_iofds[x] < 0), and on a match p->fds[i] is stored in named_iofds[x]. Each passed fd is
 * assigned to at most one stream per iteration because of the if / else-if chain, with stdin checked
 * first, then stdout, then stderr. Scanning stops early once no targets remain.
 *
 * named_iofds: in/out array indexed by STDIN_FILENO/STDOUT_FILENO/STDERR_FILENO; entries < 0 are
 *              treated as unresolved.
 *
 * Returns 0 when `targets` reached zero, -ENOENT otherwise.
 *
 * NOTE(review): `targets` is presumably decremented on every successful match -- confirm. */
3634 int exec_context_named_iofds(Unit
*unit
, const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[3]) {
3635 unsigned i
, targets
;
3636 const char* stdio_fdname
[3];
3642 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
3643 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
3644 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
/* Look up the expected fd name for each of the three streams once, up front. */
3646 for (i
= 0; i
< 3; i
++)
3647 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
3649 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
3651 for (i
= 0; i
< n_fds
&& targets
> 0; i
++)
3652 if (named_iofds
[STDIN_FILENO
] < 0 &&
3653 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
3654 stdio_fdname
[STDIN_FILENO
] &&
3655 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
3657 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
3660 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
3661 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
3662 stdio_fdname
[STDOUT_FILENO
] &&
3663 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
3665 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
3668 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
3669 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
3670 stdio_fdname
[STDERR_FILENO
] &&
3671 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
3673 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
3677 return targets
== 0 ? 0 : -ENOENT
;
/* Loads the environment files listed in c->environment_files.
 *
 * Each entry is checked for being an absolute path and then glob-expanded with safe_glob(). Every
 * matching file is read with load_env_file(), the resulting assignments are cleaned with
 * strv_env_clean_with_callback() (rejected entries are logged through invalid_env(), together with the
 * file they came from), and the cleaned list is merged into the accumulated result r with
 * strv_env_merge().
 *
 * NOTE(review): the accumulated vector is presumably handed to the caller through *l, and the `ignore`
 * flag presumably lets individual entries fail without aborting the whole load -- confirm both, along
 * with the exact error codes returned. */
3680 int exec_context_load_environment(Unit
*unit
, const ExecContext
*c
, char ***l
) {
3681 char **i
, **r
= NULL
;
3686 STRV_FOREACH(i
, c
->environment_files
) {
3690 bool ignore
= false;
3692 _cleanup_globfree_ glob_t pglob
= {};
3701 if (!path_is_absolute(fn
)) {
3709 /* Filename supports globbing, take all matching files */
3710 k
= safe_glob(fn
, 0, &pglob
);
3719 /* When we don't match anything, -ENOENT should be returned */
3720 assert(pglob
.gl_pathc
> 0);
3722 for (n
= 0; n
< pglob
.gl_pathc
; n
++) {
3723 k
= load_env_file(NULL
, pglob
.gl_pathv
[n
], NULL
, &p
);
3731 /* Log invalid environment variables with filename */
3733 InvalidEnvInfo info
= {
3735 .path
= pglob
.gl_pathv
[n
]
3738 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
3746 m
= strv_env_merge(2, r
, p
);
/* Tells whether the given tty may be the device behind the console.
 *
 * The device prefix is stripped from tty first (skip_dev_prefix()). The literal name "console" is
 * checked for directly. Otherwise the console is resolved with resolve_dev_console() and compared with
 * tty by name; a console that resolves to "tty0" is additionally considered a possible match for any
 * tty for which tty_is_vc() is true. As the inline comment states, a failed resolution is treated as
 * "may match". */
3762 static bool tty_may_match_dev_console(const char *tty
) {
3763 _cleanup_free_
char *active
= NULL
;
3769 tty
= skip_dev_prefix(tty
);
3771 /* trivial identity? */
3772 if (streq(tty
, "console"))
3775 console
= resolve_dev_console(&active
);
3776 /* if we could not resolve, assume it may */
3780 /* "tty0" means the active VC, so it may be the same sometimes */
3781 return streq(console
, tty
) || (streq(console
, "tty0") && tty_is_vc(tty
));
/* Returns true when both of the following hold:
 *
 *  1. the context uses a terminal in some way: any of the tty-related flags checked below (such as
 *     tty_reset or tty_vt_disallocate) is set, or stdin/stdout/stderr is of a terminal kind according
 *     to is_terminal_input()/is_terminal_output();
 *  2. the context's tty path (exec_context_tty_path()) may refer to the console, as judged by
 *     tty_may_match_dev_console(). */
3784 bool exec_context_may_touch_console(ExecContext
*ec
) {
3786 return (ec
->tty_reset
||
3788 ec
->tty_vt_disallocate
||
3789 is_terminal_input(ec
->std_input
) ||
3790 is_terminal_output(ec
->std_output
) ||
3791 is_terminal_output(ec
->std_error
)) &&
3792 tty_may_match_dev_console(exec_context_tty_path(ec
));
/* Writes strings to the stream f, each one preceded by a single space and with no trailing newline.
 *
 * NOTE(review): g presumably iterates over every entry of the string vector l -- confirm. */
3795 static void strv_fprintf(FILE *f
, char **l
) {
3801 fprintf(f
, " %s", *g
);
3804 void exec_context_dump(ExecContext
*c
, FILE* f
, const char *prefix
) {
3805 ExecDirectoryType dt
;
3813 prefix
= strempty(prefix
);
3817 "%sWorkingDirectory: %s\n"
3818 "%sRootDirectory: %s\n"
3819 "%sNonBlocking: %s\n"
3820 "%sPrivateTmp: %s\n"
3821 "%sPrivateDevices: %s\n"
3822 "%sProtectKernelTunables: %s\n"
3823 "%sProtectKernelModules: %s\n"
3824 "%sProtectControlGroups: %s\n"
3825 "%sPrivateNetwork: %s\n"
3826 "%sPrivateUsers: %s\n"
3827 "%sProtectHome: %s\n"
3828 "%sProtectSystem: %s\n"
3829 "%sMountAPIVFS: %s\n"
3830 "%sIgnoreSIGPIPE: %s\n"
3831 "%sMemoryDenyWriteExecute: %s\n"
3832 "%sRestrictRealtime: %s\n"
3833 "%sKeyringMode: %s\n",
3835 prefix
, c
->working_directory
? c
->working_directory
: "/",
3836 prefix
, c
->root_directory
? c
->root_directory
: "/",
3837 prefix
, yes_no(c
->non_blocking
),
3838 prefix
, yes_no(c
->private_tmp
),
3839 prefix
, yes_no(c
->private_devices
),
3840 prefix
, yes_no(c
->protect_kernel_tunables
),
3841 prefix
, yes_no(c
->protect_kernel_modules
),
3842 prefix
, yes_no(c
->protect_control_groups
),
3843 prefix
, yes_no(c
->private_network
),
3844 prefix
, yes_no(c
->private_users
),
3845 prefix
, protect_home_to_string(c
->protect_home
),
3846 prefix
, protect_system_to_string(c
->protect_system
),
3847 prefix
, yes_no(c
->mount_apivfs
),
3848 prefix
, yes_no(c
->ignore_sigpipe
),
3849 prefix
, yes_no(c
->memory_deny_write_execute
),
3850 prefix
, yes_no(c
->restrict_realtime
),
3851 prefix
, exec_keyring_mode_to_string(c
->keyring_mode
));
3854 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
3856 STRV_FOREACH(e
, c
->environment
)
3857 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
3859 STRV_FOREACH(e
, c
->environment_files
)
3860 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
3862 STRV_FOREACH(e
, c
->pass_environment
)
3863 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
3865 STRV_FOREACH(e
, c
->unset_environment
)
3866 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
3868 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
3870 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
3871 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
3873 STRV_FOREACH(d
, c
->directories
[dt
].paths
)
3874 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), *d
);
3882 if (c
->oom_score_adjust_set
)
3884 "%sOOMScoreAdjust: %i\n",
3885 prefix
, c
->oom_score_adjust
);
3887 for (i
= 0; i
< RLIM_NLIMITS
; i
++)
3889 fprintf(f
, "%s%s: " RLIM_FMT
"\n",
3890 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
3891 fprintf(f
, "%s%sSoft: " RLIM_FMT
"\n",
3892 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
3895 if (c
->ioprio_set
) {
3896 _cleanup_free_
char *class_str
= NULL
;
3898 r
= ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c
->ioprio
), &class_str
);
3900 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
3902 fprintf(f
, "%sIOPriority: %lu\n", prefix
, IOPRIO_PRIO_DATA(c
->ioprio
));
3905 if (c
->cpu_sched_set
) {
3906 _cleanup_free_
char *policy_str
= NULL
;
3908 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
3910 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
3913 "%sCPUSchedulingPriority: %i\n"
3914 "%sCPUSchedulingResetOnFork: %s\n",
3915 prefix
, c
->cpu_sched_priority
,
3916 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
3920 fprintf(f
, "%sCPUAffinity:", prefix
);
3921 for (i
= 0; i
< c
->cpuset_ncpus
; i
++)
3922 if (CPU_ISSET_S(i
, CPU_ALLOC_SIZE(c
->cpuset_ncpus
), c
->cpuset
))
3923 fprintf(f
, " %u", i
);
3927 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
3928 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
3931 "%sStandardInput: %s\n"
3932 "%sStandardOutput: %s\n"
3933 "%sStandardError: %s\n",
3934 prefix
, exec_input_to_string(c
->std_input
),
3935 prefix
, exec_output_to_string(c
->std_output
),
3936 prefix
, exec_output_to_string(c
->std_error
));
3942 "%sTTYVHangup: %s\n"
3943 "%sTTYVTDisallocate: %s\n",
3944 prefix
, c
->tty_path
,
3945 prefix
, yes_no(c
->tty_reset
),
3946 prefix
, yes_no(c
->tty_vhangup
),
3947 prefix
, yes_no(c
->tty_vt_disallocate
));
3949 if (IN_SET(c
->std_output
,
3952 EXEC_OUTPUT_JOURNAL
,
3953 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
3954 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
3955 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
3956 IN_SET(c
->std_error
,
3959 EXEC_OUTPUT_JOURNAL
,
3960 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
3961 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
3962 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
3964 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
3966 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
3968 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
3970 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
3972 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
3975 if (c
->log_level_max
>= 0) {
3976 _cleanup_free_
char *t
= NULL
;
3978 (void) log_level_to_string_alloc(c
->log_level_max
, &t
);
3980 fprintf(f
, "%sLogLevelMax: %s\n", prefix
, strna(t
));
3983 if (c
->n_log_extra_fields
> 0) {
3986 for (j
= 0; j
< c
->n_log_extra_fields
; j
++) {
3987 fprintf(f
, "%sLogExtraFields: ", prefix
);
3988 fwrite(c
->log_extra_fields
[j
].iov_base
,
3989 1, c
->log_extra_fields
[j
].iov_len
,
3995 if (c
->secure_bits
) {
3996 _cleanup_free_
char *str
= NULL
;
3998 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
4000 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
4003 if (c
->capability_bounding_set
!= CAP_ALL
) {
4004 _cleanup_free_
char *str
= NULL
;
4006 r
= capability_set_to_string_alloc(c
->capability_bounding_set
, &str
);
4008 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
4011 if (c
->capability_ambient_set
!= 0) {
4012 _cleanup_free_
char *str
= NULL
;
4014 r
= capability_set_to_string_alloc(c
->capability_ambient_set
, &str
);
4016 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
4020 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
4022 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
4024 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
4026 if (!strv_isempty(c
->supplementary_groups
)) {
4027 fprintf(f
, "%sSupplementaryGroups:", prefix
);
4028 strv_fprintf(f
, c
->supplementary_groups
);
4033 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
4035 if (strv_length(c
->read_write_paths
) > 0) {
4036 fprintf(f
, "%sReadWritePaths:", prefix
);
4037 strv_fprintf(f
, c
->read_write_paths
);
4041 if (strv_length(c
->read_only_paths
) > 0) {
4042 fprintf(f
, "%sReadOnlyPaths:", prefix
);
4043 strv_fprintf(f
, c
->read_only_paths
);
4047 if (strv_length(c
->inaccessible_paths
) > 0) {
4048 fprintf(f
, "%sInaccessiblePaths:", prefix
);
4049 strv_fprintf(f
, c
->inaccessible_paths
);
4053 if (c
->n_bind_mounts
> 0)
4054 for (i
= 0; i
< c
->n_bind_mounts
; i
++) {
4055 fprintf(f
, "%s%s: %s:%s:%s\n", prefix
,
4056 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
4057 c
->bind_mounts
[i
].source
,
4058 c
->bind_mounts
[i
].destination
,
4059 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
4064 "%sUtmpIdentifier: %s\n",
4065 prefix
, c
->utmp_id
);
4067 if (c
->selinux_context
)
4069 "%sSELinuxContext: %s%s\n",
4070 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
4072 if (c
->apparmor_profile
)
4074 "%sAppArmorProfile: %s%s\n",
4075 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
4077 if (c
->smack_process_label
)
4079 "%sSmackProcessLabel: %s%s\n",
4080 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
4082 if (c
->personality
!= PERSONALITY_INVALID
)
4084 "%sPersonality: %s\n",
4085 prefix
, strna(personality_to_string(c
->personality
)));
4088 "%sLockPersonality: %s\n",
4089 prefix
, yes_no(c
->lock_personality
));
4091 if (c
->syscall_filter
) {
4099 "%sSystemCallFilter: ",
4102 if (!c
->syscall_whitelist
)
4106 HASHMAP_FOREACH_KEY(val
, id
, c
->syscall_filter
, j
) {
4107 _cleanup_free_
char *name
= NULL
;
4108 const char *errno_name
= NULL
;
4109 int num
= PTR_TO_INT(val
);
4116 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
4117 fputs(strna(name
), f
);
4120 errno_name
= errno_to_name(num
);
4122 fprintf(f
, ":%s", errno_name
);
4124 fprintf(f
, ":%d", num
);
4132 if (c
->syscall_archs
) {
4139 "%sSystemCallArchitectures:",
4143 SET_FOREACH(id
, c
->syscall_archs
, j
)
4144 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
4149 if (exec_context_restrict_namespaces_set(c
)) {
4150 _cleanup_free_
char *s
= NULL
;
4152 r
= namespace_flag_to_string_many(c
->restrict_namespaces
, &s
);
4154 fprintf(f
, "%sRestrictNamespaces: %s\n",
4158 if (c
->syscall_errno
> 0) {
4159 const char *errno_name
;
4161 fprintf(f
, "%sSystemCallErrorNumber: ", prefix
);
4163 errno_name
= errno_to_name(c
->syscall_errno
);
4165 fprintf(f
, "%s\n", errno_name
);
4167 fprintf(f
, "%d\n", c
->syscall_errno
);
4170 if (c
->apparmor_profile
)
4172 "%sAppArmorProfile: %s%s\n",
4173 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
/* Tells the caller whether a process started from this ExecContext keeps its privileges.
 * The only test visible here compares the configured user name c->user against "root" and "0".
 * NOTE(review): the rest of the body (including every return statement and any handling of an
 * unset c->user) is absent from this listing -- confirm against the full source. */
4176 bool exec_context_maintains_privileges(ExecContext
*c
) {
4179 /* Returns true if the process forked off would run under
4180 * an unchanged UID or as root. */
4185 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
/* Returns an I/O priority value for this ExecContext.
 * Two sources are visible: the priority of the calling process, queried with
 * ioprio_get(IOPRIO_WHO_PROCESS, 0), and a constant result built from the best-effort class
 * with priority data 4.
 * NOTE(review): the conditions that choose between these (and any use of a value stored in
 * the context itself) are not visible in this listing -- confirm. */
4191 int exec_context_get_effective_ioprio(ExecContext
*c
) {
4199 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
4201 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 4);
/* Releases all extra log fields stored in the context.
 * Frees the iov_base buffer of each of the c->n_log_extra_fields entries, then releases the
 * array itself through mfree() and stores mfree()'s result back into c->log_extra_fields
 * (presumably NULL -- confirm), and finally resets the element count to 0. */
4206 void exec_context_free_log_extra_fields(ExecContext
*c
) {
4211 for (l
= 0; l
< c
->n_log_extra_fields
; l
++)
4212 free(c
->log_extra_fields
[l
].iov_base
);
4213 c
->log_extra_fields
= mfree(c
->log_extra_fields
);
4214 c
->n_log_extra_fields
= 0;
/* Marks the start of a process in the ExecStatus record *s.
 * The visible statement fills s->start_timestamp through dual_timestamp_get().
 * NOTE(review): how the pid argument is stored is not visible in this listing -- confirm. */
4217 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
4222 dual_timestamp_get(&s
->start_timestamp
);
/* Records the termination of a process in the ExecStatus record *s.
 * Visible steps, in order:
 *   - a guard that triggers when s->pid is non-zero and differs from the reported pid;
 *   - s->exit_timestamp is filled through dual_timestamp_get();
 *   - if the context carries a utmp_id, the dead process is reported with
 *     utmp_put_dead_process(utmp_id, pid, code, status);
 *   - the context's TTY is reset with exec_context_tty_reset(context, NULL).
 * NOTE(review): what the pid guard does, where code/status are stored in *s, and whether
 * context may be NULL are not visible in this listing -- confirm. */
4225 void exec_status_exit(ExecStatus
*s
, ExecContext
*context
, pid_t pid
, int code
, int status
) {
4228 if (s
->pid
&& s
->pid
!= pid
)
4232 dual_timestamp_get(&s
->exit_timestamp
);
4238 if (context
->utmp_id
)
4239 utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
4241 exec_context_tty_reset(context
, NULL
);
/* Writes a human-readable description of the ExecStatus record *s to f.
 * prefix is normalised with strempty() and put in front of every emitted line.
 * Visible output: a "PID:" line; a "Start Timestamp:" line guarded by
 * dual_timestamp_is_set(&s->start_timestamp); and, guarded by
 * dual_timestamp_is_set(&s->exit_timestamp), "Exit Timestamp:" and "Exit Status:" lines.
 * buf is scratch space handed to format_timestamp(); only the realtime part of each
 * timestamp is formatted.
 * NOTE(review): sigchld_code_to_string(s->code) is passed as an argument, but the format line
 * that consumes it is missing from this listing, as are the fprintf() calls themselves. */
4245 void exec_status_dump(ExecStatus
*s
, FILE *f
, const char *prefix
) {
4246 char buf
[FORMAT_TIMESTAMP_MAX
];
4254 prefix
= strempty(prefix
);
4257 "%sPID: "PID_FMT
"\n",
4260 if (dual_timestamp_is_set(&s
->start_timestamp
))
4262 "%sStart Timestamp: %s\n",
4263 prefix
, format_timestamp(buf
, sizeof(buf
), s
->start_timestamp
.realtime
));
4265 if (dual_timestamp_is_set(&s
->exit_timestamp
))
4267 "%sExit Timestamp: %s\n"
4269 "%sExit Status: %i\n",
4270 prefix
, format_timestamp(buf
, sizeof(buf
), s
->exit_timestamp
.realtime
),
4271 prefix
, sigchld_code_to_string(s
->code
),
/* Produces a single string from the argument vector argv.
 * Two passes over argv are visible (presumably one to size the result and one to fill it --
 * confirm), and arguments containing a WHITESPACE character are detected with strpbrk() for
 * special treatment (presumably quoting -- confirm).
 * The caller exec_command_dump() stores the result in a _cleanup_free_ variable and treats a
 * NULL result as an out-of-memory condition.
 * NOTE(review): allocation, copying and the return statements are missing from this listing. */
4275 char *exec_command_line(char **argv
) {
4283 STRV_FOREACH(a
, argv
)
4291 STRV_FOREACH(a
, argv
) {
4298 if (strpbrk(*a
, WHITESPACE
)) {
4309 /* FIXME: this doesn't really handle arguments that have
4310 * spaces and ticks in them */
/* Writes a description of a single ExecCommand to f.
 * prefix is normalised with strempty(); prefix2 is prefix followed by a tab and is used for
 * the nested status dump.
 * Emits a "Command Line:" line holding the string built by exec_command_line(c->argv); if that
 * returned NULL, the strerror(ENOMEM) text is printed instead. Afterwards the command's
 * exec_status is dumped through exec_status_dump() using prefix2.
 * cmd is declared _cleanup_free_, so it is not freed explicitly in the visible code. */
4315 void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4316 _cleanup_free_
char *cmd
= NULL
;
4317 const char *prefix2
;
4322 prefix
= strempty(prefix
);
4323 prefix2
= strjoina(prefix
, "\t");
4325 cmd
= exec_command_line(c
->argv
);
4327 "%sCommand Line: %s\n",
4328 prefix
, cmd
? cmd
: strerror(ENOMEM
));
4330 exec_status_dump(&c
->exec_status
, f
, prefix2
);
/* Dumps every ExecCommand of the list that starts at c to f.
 * prefix is normalised with strempty() and handed unchanged to exec_command_dump() for each
 * element. The parameter c itself is reused as the LIST_FOREACH iteration cursor over the
 * "command" list links. */
4333 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4336 prefix
= strempty(prefix
);
4338 LIST_FOREACH(command
, c
, c
)
4339 exec_command_dump(c
, f
, prefix
);
/* Adds the command e to the end of the list whose head is *l.
 * The tail of the list is located with LIST_FIND_TAIL and e is inserted after it with
 * LIST_INSERT_AFTER, so existing elements keep their order and e becomes the last one.
 * NOTE(review): the declaration of `end` and the handling of an empty list are not visible in
 * this listing. */
4342 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
4349 /* It's kind of important, that we keep the order here */
4350 LIST_FIND_TAIL(command
, *l
, end
);
4351 LIST_INSERT_AFTER(command
, *l
, end
, e
);
/* Variadic helper that configures the ExecCommand c from path and the arguments that follow.
 * The only visible step builds a string vector from path plus the variadic arguments with
 * strv_new_ap().
 * NOTE(review): va_start/va_end, error handling, how the vector and path are stored into *c,
 * the return values, and the expected terminator of the argument list (presumably NULL) are
 * not visible in this listing -- confirm. */
4356 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
4364 l
= strv_new_ap(path
, ap
);
/* Variadic helper that appends further arguments to the argv of the ExecCommand c.
 * A temporary string vector l is built from path plus the variadic arguments with
 * strv_new_ap() and then appended to c->argv with strv_extend_strv(). l is declared
 * _cleanup_strv_free_, so it is not freed explicitly in the visible code.
 * NOTE(review): the meaning of the `false` passed to strv_extend_strv() (presumably "do not
 * filter duplicates"), the error checks and the return values are not visible here. */
4385 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
4386 _cleanup_strv_free_
char **l
= NULL
;
4394 l
= strv_new_ap(path
, ap
);
4400 r
= strv_extend_strv(&c
->argv
, l
, false);
/* Allocates a new ExecRuntime object and stores it in *rt.
 * The object is created with new0(ExecRuntime, 1) and both entries of its
 * netns_storage_socket pair are set to -1, i.e. marked as "no file descriptor".
 * NOTE(review): whether an already-set *rt is left untouched, the allocation failure path and
 * the initial reference count are not visible in this listing -- confirm. */
4408 static int exec_runtime_allocate(ExecRuntime
**rt
) {
4413 *rt
= new0(ExecRuntime
, 1);
4418 (*rt
)->netns_storage_socket
[0] = (*rt
)->netns_storage_socket
[1] = -1;
/* Prepares the ExecRuntime *rt with the resources the ExecContext c asks for.
 * Visible logic:
 *   - a check for the case where neither c->private_network nor c->private_tmp is set
 *     (presumably an early return, since nothing is needed then -- confirm);
 *   - the runtime object is obtained through exec_runtime_allocate();
 *   - with private_network set and no socket stored yet (netns_storage_socket[0] < 0), an
 *     AF_UNIX datagram socket pair with SOCK_CLOEXEC is created into netns_storage_socket;
 *   - with private_tmp set and no tmp_dir stored yet, setup_tmp_dirs() is called with id to
 *     fill tmp_dir and var_tmp_dir.
 * NOTE(review): error propagation and return values are not visible in this listing. */
4423 int exec_runtime_make(ExecRuntime
**rt
, ExecContext
*c
, const char *id
) {
4433 if (!c
->private_network
&& !c
->private_tmp
)
4436 r
= exec_runtime_allocate(rt
);
4440 if (c
->private_network
&& (*rt
)->netns_storage_socket
[0] < 0) {
4441 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, (*rt
)->netns_storage_socket
) < 0)
4445 if (c
->private_tmp
&& !(*rt
)->tmp_dir
) {
4446 r
= setup_tmp_dirs(id
, &(*rt
)->tmp_dir
, &(*rt
)->var_tmp_dir
);
/* Takes a reference on the ExecRuntime r and returns an ExecRuntime pointer.
 * The visible statement asserts that the object is still referenced (r->n_ref > 0).
 * NOTE(review): the counter update and the return statement are missing from this listing;
 * presumably n_ref is incremented and r is returned -- confirm. */
4454 ExecRuntime
*exec_runtime_ref(ExecRuntime
*r
) {
4456 assert(r
->n_ref
> 0);
/* Drops a reference on the ExecRuntime r and returns an ExecRuntime pointer.
 * Visible statements: an assertion that r->n_ref > 0, followed by teardown that frees
 * r->var_tmp_dir and closes both netns storage sockets with safe_close_pair().
 * NOTE(review): the counter update, the condition under which the teardown runs (presumably
 * only when the last reference is gone), the freeing of the remaining members and the return
 * value are missing from this listing -- confirm. */
4462 ExecRuntime
*exec_runtime_unref(ExecRuntime
*r
) {
4467 assert(r
->n_ref
> 0);
4474 free(r
->var_tmp_dir
);
4475 safe_close_pair(r
->netns_storage_socket
);
/* Writes the state of the ExecRuntime rt of unit u to the serialization stream f.
 * Items written in the visible code:
 *   - "tmp-dir"        : rt->tmp_dir;
 *   - "var-tmp-dir"    : rt->var_tmp_dir, only if set;
 *   - "netns-socket-0" : for a valid netns_storage_socket[0], the descriptor is duplicated
 *                        into fds with fdset_put_dup() and the number of the duplicate is
 *                        written;
 *   - "netns-socket-1" : same for netns_storage_socket[1].
 * These keys are the ones exec_runtime_deserialize_item() matches on.
 * NOTE(review): the guard around the "tmp-dir" item, the handling of a failing
 * fdset_put_dup() and the return values are not visible in this listing. */
4479 int exec_runtime_serialize(Unit
*u
, ExecRuntime
*rt
, FILE *f
, FDSet
*fds
) {
4488 unit_serialize_item(u
, f
, "tmp-dir", rt
->tmp_dir
);
4490 if (rt
->var_tmp_dir
)
4491 unit_serialize_item(u
, f
, "var-tmp-dir", rt
->var_tmp_dir
);
4493 if (rt
->netns_storage_socket
[0] >= 0) {
4496 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
4500 unit_serialize_item_format(u
, f
, "netns-socket-0", "%i", copy
);
4503 if (rt
->netns_storage_socket
[1] >= 0) {
4506 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
4510 unit_serialize_item_format(u
, f
, "netns-socket-1", "%i", copy
);
/* Restores one serialized key/value pair into the ExecRuntime *rt of unit u.
 * Every recognised key first goes through exec_runtime_allocate(rt). Then:
 *   - "tmp-dir" / "var-tmp-dir": value is duplicated with strdup(), the previously stored
 *     string is freed and replaced by the copy;
 *   - "netns-socket-0" / "netns-socket-1": value is parsed as an integer fd with safe_atoi()
 *     and must be contained in fds; on failure a debug message is logged. The previously
 *     stored socket is closed with safe_close() and replaced by the descriptor taken out of
 *     fds with fdset_remove().
 * NOTE(review): the error checks after allocation/strdup, whether the socket replacement sits
 * in an else branch of the parse check, the handling of unknown keys and the return values
 * are not visible in this listing -- confirm. */
4516 int exec_runtime_deserialize_item(Unit
*u
, ExecRuntime
**rt
, const char *key
, const char *value
, FDSet
*fds
) {
4523 if (streq(key
, "tmp-dir")) {
4526 r
= exec_runtime_allocate(rt
);
4530 copy
= strdup(value
);
4534 free((*rt
)->tmp_dir
);
4535 (*rt
)->tmp_dir
= copy
;
4537 } else if (streq(key
, "var-tmp-dir")) {
4540 r
= exec_runtime_allocate(rt
);
4544 copy
= strdup(value
);
4548 free((*rt
)->var_tmp_dir
);
4549 (*rt
)->var_tmp_dir
= copy
;
4551 } else if (streq(key
, "netns-socket-0")) {
4554 r
= exec_runtime_allocate(rt
);
4558 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
))
4559 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
4561 safe_close((*rt
)->netns_storage_socket
[0]);
4562 (*rt
)->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
4564 } else if (streq(key
, "netns-socket-1")) {
4567 r
= exec_runtime_allocate(rt
);
4571 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
))
4572 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
4574 safe_close((*rt
)->netns_storage_socket
[1]);
4575 (*rt
)->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
/* Job function that exec_runtime_destroy() hands to asynchronous_job().
 * p is the path to remove. It is stored in a _cleanup_free_ variable, so this function
 * takes over ownership of the string. The tree is removed with rm_rf() using
 * REMOVE_ROOT|REMOVE_PHYSICAL; the result of rm_rf() is deliberately ignored.
 * NOTE(review): the return statement is missing from this listing. */
4583 static void *remove_tmpdir_thread(void *p
) {
4584 _cleanup_free_
char *path
= p
;
4586 (void) rm_rf(path
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
/* Destroys the resources held by the ExecRuntime rt.
 * For tmp_dir and var_tmp_dir the removal is delegated to remove_tmpdir_thread() through
 * asynchronous_job(), with the path string itself passed as the job argument; a warning is
 * logged if the job could not be started. In the visible var_tmp_dir branch the string is then
 * freed and the member is set to NULL. Finally both netns storage sockets are closed with
 * safe_close_pair().
 * NOTE(review): the reference-count check announced by the comment below, the `if (r < 0)`
 * guards around the warning/free, and the matching free/reset for tmp_dir are missing from
 * this listing. Presumably the free happens only on failure, because on success
 * remove_tmpdir_thread() owns and frees the string -- confirm against the full source. */
4590 void exec_runtime_destroy(ExecRuntime
*rt
) {
4596 /* If there are multiple users of this, let's leave the stuff around */
4601 log_debug("Spawning thread to nuke %s", rt
->tmp_dir
);
4603 r
= asynchronous_job(remove_tmpdir_thread
, rt
->tmp_dir
);
4605 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->tmp_dir
);
4612 if (rt
->var_tmp_dir
) {
4613 log_debug("Spawning thread to nuke %s", rt
->var_tmp_dir
);
4615 r
= asynchronous_job(remove_tmpdir_thread
, rt
->var_tmp_dir
);
4617 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->var_tmp_dir
);
4618 free(rt
->var_tmp_dir
);
4621 rt
->var_tmp_dir
= NULL
;
4624 safe_close_pair(rt
->netns_storage_socket
);
/* String names for the ExecInput enum values, indexed by enum value and sized by
 * _EXEC_INPUT_MAX. DEFINE_STRING_TABLE_LOOKUP turns the table into the lookup helpers for
 * this type; exec_input_to_string() is the one used by the dump code in this file for the
 * "StandardInput:" line. */
4627 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
4628 [EXEC_INPUT_NULL
] = "null",
4629 [EXEC_INPUT_TTY
] = "tty",
4630 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
4631 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
4632 [EXEC_INPUT_SOCKET
] = "socket",
4633 [EXEC_INPUT_NAMED_FD
] = "fd",
4636 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
/* String names for the ExecOutput enum values, indexed by enum value and sized by
 * _EXEC_OUTPUT_MAX. The "+console" names belong to the *_AND_CONSOLE variants.
 * DEFINE_STRING_TABLE_LOOKUP turns the table into the lookup helpers for this type;
 * exec_output_to_string() is the one used by the dump code in this file for the
 * "StandardOutput:" and "StandardError:" lines. */
4638 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
4639 [EXEC_OUTPUT_INHERIT
] = "inherit",
4640 [EXEC_OUTPUT_NULL
] = "null",
4641 [EXEC_OUTPUT_TTY
] = "tty",
4642 [EXEC_OUTPUT_SYSLOG
] = "syslog",
4643 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE
] = "syslog+console",
4644 [EXEC_OUTPUT_KMSG
] = "kmsg",
4645 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
4646 [EXEC_OUTPUT_JOURNAL
] = "journal",
4647 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
4648 [EXEC_OUTPUT_SOCKET
] = "socket",
4649 [EXEC_OUTPUT_NAMED_FD
] = "fd",
4652 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
/* String names for the ExecUtmpMode enum values, indexed by enum value and sized by
 * _EXEC_UTMP_MODE_MAX. DEFINE_STRING_TABLE_LOOKUP turns the table into the lookup helpers for
 * this type (presumably exec_utmp_mode_to_string()/exec_utmp_mode_from_string(), following
 * the naming seen for the other tables in this file -- confirm against string-table.h). */
4654 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
4655 [EXEC_UTMP_INIT
] = "init",
4656 [EXEC_UTMP_LOGIN
] = "login",
4657 [EXEC_UTMP_USER
] = "user",
4660 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
/* String names for the ExecPreserveMode enum values, indexed by enum value and sized by
 * _EXEC_PRESERVE_MODE_MAX. exec_preserve_mode_to_string() is used by the dump code in this
 * file for the "RuntimeDirectoryPreserve:" line.
 * Unlike the other tables here, the lookup is generated with the _WITH_BOOLEAN variant and
 * EXEC_PRESERVE_YES as extra argument (presumably so that generic boolean "true" spellings
 * parse to EXEC_PRESERVE_YES -- confirm against string-table.h). */
4662 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
4663 [EXEC_PRESERVE_NO
] = "no",
4664 [EXEC_PRESERVE_YES
] = "yes",
4665 [EXEC_PRESERVE_RESTART
] = "restart",
4668 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
/* String names for the ExecDirectoryType enum values, indexed by enum value and sized by
 * _EXEC_DIRECTORY_TYPE_MAX. exec_directory_type_to_string() is used by the dump code in this
 * file both as the label of each directory line and, with "Mode" appended by the format
 * string, as the label of the corresponding mode line. */
4670 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
4671 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
4672 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
4673 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
4674 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
4675 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
4678 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);
/* String names for the ExecKeyringMode enum values, indexed by enum value and sized by
 * _EXEC_KEYRING_MODE_MAX. DEFINE_STRING_TABLE_LOOKUP turns the table into the lookup helpers
 * for this type; exec_keyring_mode_to_string() is the one used by the dump code in this file
 * for c->keyring_mode. */
4680 static const char* const exec_keyring_mode_table
[_EXEC_KEYRING_MODE_MAX
] = {
4681 [EXEC_KEYRING_INHERIT
] = "inherit",
4682 [EXEC_KEYRING_PRIVATE
] = "private",
4683 [EXEC_KEYRING_SHARED
] = "shared",
4686 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode
, ExecKeyringMode
);