2 This file is part of systemd.
4 Copyright 2010 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/capability.h>
28 #include <sys/eventfd.h>
30 #include <sys/personality.h>
31 #include <sys/prctl.h>
33 #include <sys/socket.h>
35 #include <sys/types.h>
41 #include <security/pam_appl.h>
45 #include <selinux/selinux.h>
53 #include <sys/apparmor.h>
56 #include "sd-messages.h"
59 #include "alloc-util.h"
61 #include "apparmor-util.h"
66 #include "capability-util.h"
67 #include "chown-recursive.h"
70 #include "errno-list.h"
72 #include "exit-status.h"
75 #include "format-util.h"
77 #include "glob-util.h"
85 #include "namespace.h"
86 #include "parse-util.h"
87 #include "path-util.h"
88 #include "process-util.h"
89 #include "rlimit-util.h"
92 #include "seccomp-util.h"
94 #include "securebits.h"
95 #include "securebits-util.h"
96 #include "selinux-util.h"
97 #include "signal-util.h"
98 #include "smack-util.h"
100 #include "string-table.h"
101 #include "string-util.h"
103 #include "syslog-util.h"
104 #include "terminal-util.h"
106 #include "user-util.h"
108 #include "utmp-wtmp.h"
110 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
111 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
113 /* This assumes there is a 'tty' group */
114 #define TTY_MODE 0620
116 #define SNDBUF_SIZE (8*1024*1024)
118 static int shift_fds(int fds
[], unsigned n_fds
) {
119 int start
, restart_from
;
124 /* Modifies the fds array! (sorts it) */
134 for (i
= start
; i
< (int) n_fds
; i
++) {
137 /* Already at right index? */
141 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
148 /* Hmm, the fd we wanted isn't free? Then
149 * let's remember that and try again from here */
150 if (nfd
!= i
+3 && restart_from
< 0)
154 if (restart_from
< 0)
157 start
= restart_from
;
163 static int flags_fds(const int fds
[], unsigned n_storage_fds
, unsigned n_socket_fds
, bool nonblock
) {
167 n_fds
= n_storage_fds
+ n_socket_fds
;
173 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
174 * O_NONBLOCK only applies to socket activation though. */
176 for (i
= 0; i
< n_fds
; i
++) {
178 if (i
< n_socket_fds
) {
179 r
= fd_nonblock(fds
[i
], nonblock
);
184 /* We unconditionally drop FD_CLOEXEC from the fds,
185 * since after all we want to pass these fds to our
188 r
= fd_cloexec(fds
[i
], false);
196 static const char *exec_context_tty_path(const ExecContext
*context
) {
199 if (context
->stdio_as_fds
)
202 if (context
->tty_path
)
203 return context
->tty_path
;
205 return "/dev/console";
208 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
213 path
= exec_context_tty_path(context
);
215 if (context
->tty_vhangup
) {
216 if (p
&& p
->stdin_fd
>= 0)
217 (void) terminal_vhangup_fd(p
->stdin_fd
);
219 (void) terminal_vhangup(path
);
222 if (context
->tty_reset
) {
223 if (p
&& p
->stdin_fd
>= 0)
224 (void) reset_terminal_fd(p
->stdin_fd
, true);
226 (void) reset_terminal(path
);
229 if (context
->tty_vt_disallocate
&& path
)
230 (void) vt_disallocate(path
);
233 static bool is_terminal_input(ExecInput i
) {
236 EXEC_INPUT_TTY_FORCE
,
237 EXEC_INPUT_TTY_FAIL
);
240 static bool is_terminal_output(ExecOutput o
) {
243 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
244 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
245 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
248 static bool is_syslog_output(ExecOutput o
) {
251 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
);
254 static bool is_kmsg_output(ExecOutput o
) {
257 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
260 static bool exec_context_needs_term(const ExecContext
*c
) {
263 /* Return true if the execution context suggests we should set $TERM to something useful. */
265 if (is_terminal_input(c
->std_input
))
268 if (is_terminal_output(c
->std_output
))
271 if (is_terminal_output(c
->std_error
))
274 return !!c
->tty_path
;
277 static int open_null_as(int flags
, int nfd
) {
282 fd
= open("/dev/null", flags
|O_NOCTTY
);
286 return move_fd(fd
, nfd
, false);
289 static int connect_journal_socket(int fd
, uid_t uid
, gid_t gid
) {
290 static const union sockaddr_union sa
= {
291 .un
.sun_family
= AF_UNIX
,
292 .un
.sun_path
= "/run/systemd/journal/stdout",
294 uid_t olduid
= UID_INVALID
;
295 gid_t oldgid
= GID_INVALID
;
298 if (gid_is_valid(gid
)) {
301 if (setegid(gid
) < 0)
305 if (uid_is_valid(uid
)) {
308 if (seteuid(uid
) < 0) {
314 r
= connect(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
)) < 0 ? -errno
: 0;
316 /* If we fail to restore the uid or gid, things will likely
317 fail later on. This should only happen if an LSM interferes. */
319 if (uid_is_valid(uid
))
320 (void) seteuid(olduid
);
323 if (gid_is_valid(gid
))
324 (void) setegid(oldgid
);
329 static int connect_logger_as(
331 const ExecContext
*context
,
332 const ExecParameters
*params
,
343 assert(output
< _EXEC_OUTPUT_MAX
);
347 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
351 r
= connect_journal_socket(fd
, uid
, gid
);
355 if (shutdown(fd
, SHUT_RD
) < 0) {
360 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
370 context
->syslog_identifier
?: ident
,
371 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
372 context
->syslog_priority
,
373 !!context
->syslog_level_prefix
,
374 is_syslog_output(output
),
375 is_kmsg_output(output
),
376 is_terminal_output(output
));
378 return move_fd(fd
, nfd
, false);
380 static int open_terminal_as(const char *path
, int flags
, int nfd
) {
386 fd
= open_terminal(path
, flags
| O_NOCTTY
);
390 return move_fd(fd
, nfd
, false);
393 static int fixup_input(
394 const ExecContext
*context
,
396 bool apply_tty_stdin
) {
402 std_input
= context
->std_input
;
404 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
405 return EXEC_INPUT_NULL
;
407 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
408 return EXEC_INPUT_NULL
;
410 if (std_input
== EXEC_INPUT_DATA
&& context
->stdin_data_size
== 0)
411 return EXEC_INPUT_NULL
;
416 static int fixup_output(ExecOutput std_output
, int socket_fd
) {
418 if (std_output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
419 return EXEC_OUTPUT_INHERIT
;
424 static int setup_input(
425 const ExecContext
*context
,
426 const ExecParameters
*params
,
428 int named_iofds
[3]) {
435 if (params
->stdin_fd
>= 0) {
436 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
439 /* Try to make this the controlling tty, if it is a tty, and reset it */
440 if (isatty(STDIN_FILENO
)) {
441 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
442 (void) reset_terminal_fd(STDIN_FILENO
, true);
448 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
452 case EXEC_INPUT_NULL
:
453 return open_null_as(O_RDONLY
, STDIN_FILENO
);
456 case EXEC_INPUT_TTY_FORCE
:
457 case EXEC_INPUT_TTY_FAIL
: {
460 fd
= acquire_terminal(exec_context_tty_path(context
),
461 i
== EXEC_INPUT_TTY_FAIL
,
462 i
== EXEC_INPUT_TTY_FORCE
,
468 return move_fd(fd
, STDIN_FILENO
, false);
471 case EXEC_INPUT_SOCKET
:
472 assert(socket_fd
>= 0);
474 return dup2(socket_fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
476 case EXEC_INPUT_NAMED_FD
:
477 assert(named_iofds
[STDIN_FILENO
] >= 0);
479 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
480 return dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
482 case EXEC_INPUT_DATA
: {
485 fd
= acquire_data_fd(context
->stdin_data
, context
->stdin_data_size
, 0);
489 return move_fd(fd
, STDIN_FILENO
, false);
493 assert_not_reached("Unknown input type");
497 static int setup_output(
499 const ExecContext
*context
,
500 const ExecParameters
*params
,
507 dev_t
*journal_stream_dev
,
508 ino_t
*journal_stream_ino
) {
518 assert(journal_stream_dev
);
519 assert(journal_stream_ino
);
521 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
523 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
526 return STDOUT_FILENO
;
529 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
530 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
533 return STDERR_FILENO
;
536 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
537 o
= fixup_output(context
->std_output
, socket_fd
);
539 if (fileno
== STDERR_FILENO
) {
541 e
= fixup_output(context
->std_error
, socket_fd
);
543 /* This expects the input and output are already set up */
545 /* Don't change the stderr file descriptor if we inherit all
546 * the way and are not on a tty */
547 if (e
== EXEC_OUTPUT_INHERIT
&&
548 o
== EXEC_OUTPUT_INHERIT
&&
549 i
== EXEC_INPUT_NULL
&&
550 !is_terminal_input(context
->std_input
) &&
554 /* Duplicate from stdout if possible */
555 if ((e
== o
&& e
!= EXEC_OUTPUT_NAMED_FD
) || e
== EXEC_OUTPUT_INHERIT
)
556 return dup2(STDOUT_FILENO
, fileno
) < 0 ? -errno
: fileno
;
560 } else if (o
== EXEC_OUTPUT_INHERIT
) {
561 /* If input got downgraded, inherit the original value */
562 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
563 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
565 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
566 if (!IN_SET(i
, EXEC_INPUT_NULL
, EXEC_INPUT_DATA
))
567 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
569 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
573 /* We need to open /dev/null here anew, to get the right access mode. */
574 return open_null_as(O_WRONLY
, fileno
);
579 case EXEC_OUTPUT_NULL
:
580 return open_null_as(O_WRONLY
, fileno
);
582 case EXEC_OUTPUT_TTY
:
583 if (is_terminal_input(i
))
584 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
586 /* We don't reset the terminal if this is just about output */
587 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
589 case EXEC_OUTPUT_SYSLOG
:
590 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE
:
591 case EXEC_OUTPUT_KMSG
:
592 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
593 case EXEC_OUTPUT_JOURNAL
:
594 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
595 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
597 log_unit_warning_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m", fileno
== STDOUT_FILENO
? "stdout" : "stderr");
598 r
= open_null_as(O_WRONLY
, fileno
);
602 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
603 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
604 * services to detect whether they are connected to the journal or not.
606 * If both stdout and stderr are connected to a stream then let's make sure to store the data
607 * about STDERR as that's usually the best way to do logging. */
609 if (fstat(fileno
, &st
) >= 0 &&
610 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
611 *journal_stream_dev
= st
.st_dev
;
612 *journal_stream_ino
= st
.st_ino
;
617 case EXEC_OUTPUT_SOCKET
:
618 assert(socket_fd
>= 0);
620 return dup2(socket_fd
, fileno
) < 0 ? -errno
: fileno
;
622 case EXEC_OUTPUT_NAMED_FD
:
623 assert(named_iofds
[fileno
] >= 0);
625 (void) fd_nonblock(named_iofds
[fileno
], false);
626 return dup2(named_iofds
[fileno
], fileno
) < 0 ? -errno
: fileno
;
629 assert_not_reached("Unknown error type");
633 static int chown_terminal(int fd
, uid_t uid
) {
638 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
642 /* This might fail. What matters are the results. */
643 (void) fchown(fd
, uid
, -1);
644 (void) fchmod(fd
, TTY_MODE
);
646 if (fstat(fd
, &st
) < 0)
649 if (st
.st_uid
!= uid
|| (st
.st_mode
& 0777) != TTY_MODE
)
655 static int setup_confirm_stdio(const char *vc
, int *_saved_stdin
, int *_saved_stdout
) {
656 _cleanup_close_
int fd
= -1, saved_stdin
= -1, saved_stdout
= -1;
659 assert(_saved_stdin
);
660 assert(_saved_stdout
);
662 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
666 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
667 if (saved_stdout
< 0)
670 fd
= acquire_terminal(vc
, false, false, false, DEFAULT_CONFIRM_USEC
);
674 r
= chown_terminal(fd
, getuid());
678 r
= reset_terminal_fd(fd
, true);
682 if (dup2(fd
, STDIN_FILENO
) < 0)
685 if (dup2(fd
, STDOUT_FILENO
) < 0)
692 *_saved_stdin
= saved_stdin
;
693 *_saved_stdout
= saved_stdout
;
695 saved_stdin
= saved_stdout
= -1;
700 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
703 if (err
== -ETIMEDOUT
)
704 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
707 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
711 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
712 _cleanup_close_
int fd
= -1;
716 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
720 write_confirm_error_fd(err
, fd
, u
);
723 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
727 assert(saved_stdout
);
731 if (*saved_stdin
>= 0)
732 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
735 if (*saved_stdout
>= 0)
736 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
739 *saved_stdin
= safe_close(*saved_stdin
);
740 *saved_stdout
= safe_close(*saved_stdout
);
746 CONFIRM_PRETEND_FAILURE
= -1,
747 CONFIRM_PRETEND_SUCCESS
= 0,
751 static int ask_for_confirmation(const char *vc
, Unit
*u
, const char *cmdline
) {
752 int saved_stdout
= -1, saved_stdin
= -1, r
;
753 _cleanup_free_
char *e
= NULL
;
756 /* For any internal errors, assume a positive response. */
757 r
= setup_confirm_stdio(vc
, &saved_stdin
, &saved_stdout
);
759 write_confirm_error(r
, vc
, u
);
760 return CONFIRM_EXECUTE
;
763 /* confirm_spawn might have been disabled while we were sleeping. */
764 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
769 e
= ellipsize(cmdline
, 60, 100);
777 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
779 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
786 printf("Resuming normal execution.\n");
787 manager_disable_confirm_spawn();
791 unit_dump(u
, stdout
, " ");
792 continue; /* ask again */
794 printf("Failing execution.\n");
795 r
= CONFIRM_PRETEND_FAILURE
;
798 printf(" c - continue, proceed without asking anymore\n"
799 " D - dump, show the state of the unit\n"
800 " f - fail, don't execute the command and pretend it failed\n"
802 " i - info, show a short summary of the unit\n"
803 " j - jobs, show jobs that are in progress\n"
804 " s - skip, don't execute the command and pretend it succeeded\n"
805 " y - yes, execute the command\n");
806 continue; /* ask again */
808 printf(" Description: %s\n"
811 u
->id
, u
->description
, cmdline
);
812 continue; /* ask again */
814 manager_dump_jobs(u
->manager
, stdout
, " ");
815 continue; /* ask again */
817 /* 'n' was removed in favor of 'f'. */
818 printf("Didn't understand 'n', did you mean 'f'?\n");
819 continue; /* ask again */
821 printf("Skipping execution.\n");
822 r
= CONFIRM_PRETEND_SUCCESS
;
828 assert_not_reached("Unhandled choice");
834 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
838 static int get_fixed_user(const ExecContext
*c
, const char **user
,
839 uid_t
*uid
, gid_t
*gid
,
840 const char **home
, const char **shell
) {
849 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
850 * (i.e. are "/" or "/bin/nologin"). */
853 r
= get_user_creds_clean(&name
, uid
, gid
, home
, shell
);
861 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
871 r
= get_group_creds(&name
, gid
);
879 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
880 const char *group
, gid_t gid
,
881 gid_t
**supplementary_gids
, int *ngids
) {
885 bool keep_groups
= false;
886 gid_t
*groups
= NULL
;
887 _cleanup_free_ gid_t
*l_gids
= NULL
;
892 * If user is given, then lookup GID and supplementary groups list.
893 * We avoid NSS lookups for gid=0. Also we have to initialize groups
894 * here and as early as possible so we keep the list of supplementary
895 * groups of the caller.
897 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
898 /* First step, initialize groups from /etc/groups */
899 if (initgroups(user
, gid
) < 0)
905 if (strv_isempty(c
->supplementary_groups
))
909 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
910 * be positive, otherwise fail.
913 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
914 if (ngroups_max
<= 0) {
918 return -EOPNOTSUPP
; /* For all other values */
921 l_gids
= new(gid_t
, ngroups_max
);
927 * Lookup the list of groups that the user belongs to, we
928 * avoid NSS lookups here too for gid=0.
931 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
936 STRV_FOREACH(i
, c
->supplementary_groups
) {
939 if (k
>= ngroups_max
)
943 r
= get_group_creds(&g
, l_gids
+k
);
951 * Sets ngids to zero to drop all supplementary groups, happens
952 * when we are under root and SupplementaryGroups= is empty.
959 /* Otherwise get the final list of supplementary groups */
960 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
964 *supplementary_gids
= groups
;
972 static int enforce_groups(gid_t gid
, gid_t
*supplementary_gids
, int ngids
) {
975 /* Handle SupplementaryGroups= if it is not empty */
977 r
= maybe_setgroups(ngids
, supplementary_gids
);
982 if (gid_is_valid(gid
)) {
983 /* Then set our gids */
984 if (setresgid(gid
, gid
, gid
) < 0)
991 static int enforce_user(const ExecContext
*context
, uid_t uid
) {
994 if (!uid_is_valid(uid
))
997 /* Sets (but doesn't look up) the uid and make sure we keep the
998 * capabilities while doing so. */
1000 if (context
->capability_ambient_set
!= 0) {
1002 /* First step: If we need to keep capabilities but
1003 * drop privileges we need to make sure we keep our
1004 * caps, while we drop privileges. */
1006 int sb
= context
->secure_bits
| 1<<SECURE_KEEP_CAPS
;
1008 if (prctl(PR_GET_SECUREBITS
) != sb
)
1009 if (prctl(PR_SET_SECUREBITS
, sb
) < 0)
1014 /* Second step: actually set the uids */
1015 if (setresuid(uid
, uid
, uid
) < 0)
1018 /* At this point we should have all necessary capabilities but
1019 are otherwise a normal user. However, the caps might got
1020 corrupted due to the setresuid() so we need clean them up
1021 later. This is done outside of this call. */
1028 static int null_conv(
1030 const struct pam_message
**msg
,
1031 struct pam_response
**resp
,
1032 void *appdata_ptr
) {
1034 /* We don't support conversations */
1036 return PAM_CONV_ERR
;
1041 static int setup_pam(
1048 int fds
[], unsigned n_fds
) {
1052 static const struct pam_conv conv
= {
1057 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1058 pam_handle_t
*handle
= NULL
;
1060 int pam_code
= PAM_SUCCESS
, r
;
1061 char **nv
, **e
= NULL
;
1062 bool close_session
= false;
1063 pid_t pam_pid
= 0, parent_pid
;
1070 /* We set up PAM in the parent process, then fork. The child
1071 * will then stay around until killed via PR_GET_PDEATHSIG or
1072 * systemd via the cgroup logic. It will then remove the PAM
1073 * session again. The parent process will exec() the actual
1074 * daemon. We do things this way to ensure that the main PID
1075 * of the daemon is the one we initially fork()ed. */
1077 r
= barrier_create(&barrier
);
1081 if (log_get_max_level() < LOG_DEBUG
)
1082 flags
|= PAM_SILENT
;
1084 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1085 if (pam_code
!= PAM_SUCCESS
) {
1091 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1092 if (pam_code
!= PAM_SUCCESS
)
1096 STRV_FOREACH(nv
, *env
) {
1097 pam_code
= pam_putenv(handle
, *nv
);
1098 if (pam_code
!= PAM_SUCCESS
)
1102 pam_code
= pam_acct_mgmt(handle
, flags
);
1103 if (pam_code
!= PAM_SUCCESS
)
1106 pam_code
= pam_open_session(handle
, flags
);
1107 if (pam_code
!= PAM_SUCCESS
)
1110 close_session
= true;
1112 e
= pam_getenvlist(handle
);
1114 pam_code
= PAM_BUF_ERR
;
1118 /* Block SIGTERM, so that we know that it won't get lost in
1121 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1123 parent_pid
= getpid_cached();
1132 int sig
, ret
= EXIT_PAM
;
1134 /* The child's job is to reset the PAM session on
1136 barrier_set_role(&barrier
, BARRIER_CHILD
);
1138 /* This string must fit in 10 chars (i.e. the length
1139 * of "/sbin/init"), to look pretty in /bin/ps */
1140 rename_process("(sd-pam)");
1142 /* Make sure we don't keep open the passed fds in this
1143 child. We assume that otherwise only those fds are
1144 open here that have been opened by PAM. */
1145 close_many(fds
, n_fds
);
1147 /* Drop privileges - we don't need any to pam_close_session
1148 * and this will make PR_SET_PDEATHSIG work in most cases.
1149 * If this fails, ignore the error - but expect sd-pam threads
1150 * to fail to exit normally */
1152 r
= maybe_setgroups(0, NULL
);
1154 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1155 if (setresgid(gid
, gid
, gid
) < 0)
1156 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1157 if (setresuid(uid
, uid
, uid
) < 0)
1158 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1160 (void) ignore_signals(SIGPIPE
, -1);
1162 /* Wait until our parent died. This will only work if
1163 * the above setresuid() succeeds, otherwise the kernel
1164 * will not allow unprivileged parents kill their privileged
1165 * children this way. We rely on the control groups kill logic
1166 * to do the rest for us. */
1167 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1170 /* Tell the parent that our setup is done. This is especially
1171 * important regarding dropping privileges. Otherwise, unit
1172 * setup might race against our setresuid(2) call.
1174 * If the parent aborted, we'll detect this below, hence ignore
1175 * return failure here. */
1176 (void) barrier_place(&barrier
);
1178 /* Check if our parent process might already have died? */
1179 if (getppid() == parent_pid
) {
1182 assert_se(sigemptyset(&ss
) >= 0);
1183 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1186 if (sigwait(&ss
, &sig
) < 0) {
1193 assert(sig
== SIGTERM
);
1198 /* If our parent died we'll end the session */
1199 if (getppid() != parent_pid
) {
1200 pam_code
= pam_close_session(handle
, flags
);
1201 if (pam_code
!= PAM_SUCCESS
)
1208 pam_end(handle
, pam_code
| flags
);
1212 barrier_set_role(&barrier
, BARRIER_PARENT
);
1214 /* If the child was forked off successfully it will do all the
1215 * cleanups, so forget about the handle here. */
1218 /* Unblock SIGTERM again in the parent */
1219 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1221 /* We close the log explicitly here, since the PAM modules
1222 * might have opened it, but we don't want this fd around. */
1225 /* Synchronously wait for the child to initialize. We don't care for
1226 * errors as we cannot recover. However, warn loudly if it happens. */
1227 if (!barrier_place_and_sync(&barrier
))
1228 log_error("PAM initialization failed");
1236 if (pam_code
!= PAM_SUCCESS
) {
1237 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1238 r
= -EPERM
; /* PAM errors do not map to errno */
1240 log_error_errno(r
, "PAM failed: %m");
1244 pam_code
= pam_close_session(handle
, flags
);
1246 pam_end(handle
, pam_code
| flags
);
1258 static void rename_process_from_path(const char *path
) {
1259 char process_name
[11];
1263 /* This resulting string must fit in 10 chars (i.e. the length
1264 * of "/sbin/init") to look pretty in /bin/ps */
1268 rename_process("(...)");
1274 /* The end of the process name is usually more
1275 * interesting, since the first bit might just be
1281 process_name
[0] = '(';
1282 memcpy(process_name
+1, p
, l
);
1283 process_name
[1+l
] = ')';
1284 process_name
[1+l
+1] = 0;
1286 rename_process(process_name
);
1289 static bool context_has_address_families(const ExecContext
*c
) {
1292 return c
->address_families_whitelist
||
1293 !set_isempty(c
->address_families
);
1296 static bool context_has_syscall_filters(const ExecContext
*c
) {
1299 return c
->syscall_whitelist
||
1300 !hashmap_isempty(c
->syscall_filter
);
1303 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1306 if (c
->no_new_privileges
)
1309 if (have_effective_cap(CAP_SYS_ADMIN
)) /* if we are privileged, we don't need NNP */
1312 /* We need NNP if we have any form of seccomp and are unprivileged */
1313 return context_has_address_families(c
) ||
1314 c
->memory_deny_write_execute
||
1315 c
->restrict_realtime
||
1316 exec_context_restrict_namespaces_set(c
) ||
1317 c
->protect_kernel_tunables
||
1318 c
->protect_kernel_modules
||
1319 c
->private_devices
||
1320 context_has_syscall_filters(c
) ||
1321 !set_isempty(c
->syscall_archs
) ||
1322 c
->lock_personality
;
1327 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1329 if (is_seccomp_available())
1332 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1336 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1337 uint32_t negative_action
, default_action
, action
;
1343 if (!context_has_syscall_filters(c
))
1346 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1349 negative_action
= c
->syscall_errno
== 0 ? SCMP_ACT_KILL
: SCMP_ACT_ERRNO(c
->syscall_errno
);
1351 if (c
->syscall_whitelist
) {
1352 default_action
= negative_action
;
1353 action
= SCMP_ACT_ALLOW
;
1355 default_action
= SCMP_ACT_ALLOW
;
1356 action
= negative_action
;
1359 if (needs_ambient_hack
) {
1360 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_whitelist
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1365 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
);
1368 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1372 if (set_isempty(c
->syscall_archs
))
1375 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1378 return seccomp_restrict_archs(c
->syscall_archs
);
1381 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1385 if (!context_has_address_families(c
))
1388 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1391 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_whitelist
);
1394 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1398 if (!c
->memory_deny_write_execute
)
1401 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1404 return seccomp_memory_deny_write_execute();
1407 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1411 if (!c
->restrict_realtime
)
1414 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1417 return seccomp_restrict_realtime();
1420 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1424 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1425 * let's protect even those systems where this is left on in the kernel. */
1427 if (!c
->protect_kernel_tunables
)
1430 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1433 return seccomp_protect_sysctl();
1436 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1440 /* Turn off module syscalls on ProtectKernelModules=yes */
1442 if (!c
->protect_kernel_modules
)
1445 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1448 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
));
1451 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1455 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1457 if (!c
->private_devices
)
1460 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1463 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
));
1466 static int apply_restrict_namespaces(Unit
*u
, const ExecContext
*c
) {
1470 if (!exec_context_restrict_namespaces_set(c
))
1473 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1476 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
1479 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1480 unsigned long personality
;
1486 if (!c
->lock_personality
)
1489 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1492 personality
= c
->personality
;
1494 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1495 if (personality
== PERSONALITY_INVALID
) {
1497 r
= opinionated_personality(&personality
);
1502 return seccomp_lock_personality(personality
);
1507 static void do_idle_pipe_dance(int idle_pipe
[4]) {
1510 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1511 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1513 if (idle_pipe
[0] >= 0) {
1516 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1518 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1521 /* Signal systemd that we are bored and want to continue. */
1522 n
= write(idle_pipe
[3], "x", 1);
1524 /* Wait for systemd to react to the signal above. */
1525 fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1528 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1532 idle_pipe
[3] = safe_close(idle_pipe
[3]);
1535 static int build_environment(
1537 const ExecContext
*c
,
1538 const ExecParameters
*p
,
1541 const char *username
,
1543 dev_t journal_stream_dev
,
1544 ino_t journal_stream_ino
,
1547 _cleanup_strv_free_
char **our_env
= NULL
;
1555 our_env
= new0(char*, 14);
1560 _cleanup_free_
char *joined
= NULL
;
1562 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1564 our_env
[n_env
++] = x
;
1566 if (asprintf(&x
, "LISTEN_FDS=%u", n_fds
) < 0)
1568 our_env
[n_env
++] = x
;
1570 joined
= strv_join(p
->fd_names
, ":");
1574 x
= strjoin("LISTEN_FDNAMES=", joined
);
1577 our_env
[n_env
++] = x
;
1580 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1581 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1583 our_env
[n_env
++] = x
;
1585 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1587 our_env
[n_env
++] = x
;
1590 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1591 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1592 * check the database directly. */
1593 if (p
->flags
& EXEC_NSS_BYPASS_BUS
) {
1594 x
= strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1597 our_env
[n_env
++] = x
;
1601 x
= strappend("HOME=", home
);
1604 our_env
[n_env
++] = x
;
1608 x
= strappend("LOGNAME=", username
);
1611 our_env
[n_env
++] = x
;
1613 x
= strappend("USER=", username
);
1616 our_env
[n_env
++] = x
;
1620 x
= strappend("SHELL=", shell
);
1623 our_env
[n_env
++] = x
;
1626 if (!sd_id128_is_null(u
->invocation_id
)) {
1627 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1630 our_env
[n_env
++] = x
;
1633 if (exec_context_needs_term(c
)) {
1634 const char *tty_path
, *term
= NULL
;
1636 tty_path
= exec_context_tty_path(c
);
1638 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1639 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1640 * passes to PID 1 ends up all the way in the console login shown. */
1642 if (path_equal(tty_path
, "/dev/console") && getppid() == 1)
1643 term
= getenv("TERM");
1645 term
= default_term_for_tty(tty_path
);
1647 x
= strappend("TERM=", term
);
1650 our_env
[n_env
++] = x
;
1653 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
1654 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
1657 our_env
[n_env
++] = x
;
1660 our_env
[n_env
++] = NULL
;
1661 assert(n_env
<= 12);
1669 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
1670 _cleanup_strv_free_
char **pass_env
= NULL
;
1671 size_t n_env
= 0, n_bufsize
= 0;
1674 STRV_FOREACH(i
, c
->pass_environment
) {
1675 _cleanup_free_
char *x
= NULL
;
1681 x
= strjoin(*i
, "=", v
);
1685 if (!GREEDY_REALLOC(pass_env
, n_bufsize
, n_env
+ 2))
1688 pass_env
[n_env
++] = x
;
1689 pass_env
[n_env
] = NULL
;
1699 static bool exec_needs_mount_namespace(
1700 const ExecContext
*context
,
1701 const ExecParameters
*params
,
1702 ExecRuntime
*runtime
) {
1707 if (context
->root_image
)
1710 if (!strv_isempty(context
->read_write_paths
) ||
1711 !strv_isempty(context
->read_only_paths
) ||
1712 !strv_isempty(context
->inaccessible_paths
))
1715 if (context
->n_bind_mounts
> 0 ||
1716 !strv_isempty(context
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) ||
1717 !strv_isempty(context
->directories
[EXEC_DIRECTORY_STATE
].paths
) ||
1718 !strv_isempty(context
->directories
[EXEC_DIRECTORY_CACHE
].paths
) ||
1719 !strv_isempty(context
->directories
[EXEC_DIRECTORY_LOGS
].paths
) ||
1720 !strv_isempty(context
->directories
[EXEC_DIRECTORY_CONFIGURATION
].paths
))
1723 if (context
->mount_flags
!= 0)
1726 if (context
->private_tmp
&& runtime
&& (runtime
->tmp_dir
|| runtime
->var_tmp_dir
))
1729 if (context
->private_devices
||
1730 context
->protect_system
!= PROTECT_SYSTEM_NO
||
1731 context
->protect_home
!= PROTECT_HOME_NO
||
1732 context
->protect_kernel_tunables
||
1733 context
->protect_kernel_modules
||
1734 context
->protect_control_groups
)
1737 if (context
->mount_apivfs
&& (context
->root_image
|| context
->root_directory
))
1743 static int setup_private_users(uid_t uid
, gid_t gid
) {
1744 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
1745 _cleanup_close_pair_
int errno_pipe
[2] = { -1, -1 };
1746 _cleanup_close_
int unshare_ready_fd
= -1;
1747 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
1753 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1754 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1755 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1756 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1757 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1758 * continues execution normally. */
1760 if (uid
!= 0 && uid_is_valid(uid
)) {
1761 r
= asprintf(&uid_map
,
1762 "0 0 1\n" /* Map root → root */
1763 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
1768 uid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1773 if (gid
!= 0 && gid_is_valid(gid
)) {
1774 r
= asprintf(&gid_map
,
1775 "0 0 1\n" /* Map root → root */
1776 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
1781 gid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1786 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1788 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
1789 if (unshare_ready_fd
< 0)
1792 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1794 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
1802 _cleanup_close_
int fd
= -1;
1806 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1807 * here, after the parent opened its own user namespace. */
1810 errno_pipe
[0] = safe_close(errno_pipe
[0]);
1812 /* Wait until the parent unshared the user namespace */
1813 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
1818 /* Disable the setgroups() system call in the child user namespace, for good. */
1819 a
= procfs_file_alloca(ppid
, "setgroups");
1820 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1822 if (errno
!= ENOENT
) {
1827 /* If the file is missing the kernel is too old, let's continue anyway. */
1829 if (write(fd
, "deny\n", 5) < 0) {
1834 fd
= safe_close(fd
);
1837 /* First write the GID map */
1838 a
= procfs_file_alloca(ppid
, "gid_map");
1839 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1844 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
1848 fd
= safe_close(fd
);
1850 /* The write the UID map */
1851 a
= procfs_file_alloca(ppid
, "uid_map");
1852 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1857 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
1862 _exit(EXIT_SUCCESS
);
1865 (void) write(errno_pipe
[1], &r
, sizeof(r
));
1866 _exit(EXIT_FAILURE
);
1869 errno_pipe
[1] = safe_close(errno_pipe
[1]);
1871 if (unshare(CLONE_NEWUSER
) < 0)
1874 /* Let the child know that the namespace is ready now */
1875 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
1878 /* Try to read an error code from the child */
1879 n
= read(errno_pipe
[0], &r
, sizeof(r
));
1882 if (n
== sizeof(r
)) { /* an error code was sent to us */
1887 if (n
!= 0) /* on success we should have read 0 bytes */
1890 r
= wait_for_terminate(pid
, &si
);
1895 /* If something strange happened with the child, let's consider this fatal, too */
1896 if (si
.si_code
!= CLD_EXITED
|| si
.si_status
!= 0)
1902 static int setup_exec_directory(
1903 const ExecContext
*context
,
1904 const ExecParameters
*params
,
1907 ExecDirectoryType type
,
1910 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
1911 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
1912 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
1913 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
1914 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
1915 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
1922 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
1923 assert(exit_status
);
1925 if (!params
->prefix
[type
])
1928 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
1929 if (!uid_is_valid(uid
))
1931 if (!gid_is_valid(gid
))
1935 STRV_FOREACH(rt
, context
->directories
[type
].paths
) {
1936 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
1937 const char *effective
;
1939 p
= strjoin(params
->prefix
[type
], "/", *rt
);
1945 r
= mkdir_parents_label(p
, 0755);
1949 if (context
->dynamic_user
&&
1950 !IN_SET(type
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
)) {
1951 _cleanup_free_
char *private_root
= NULL
, *relative
= NULL
, *parent
= NULL
;
1953 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
1954 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
1955 * whose UID is later on reused. To lock this down we use the same trick used by container
1956 * managers to prohibit host users to get access to files of the same UID in containers: we
1957 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
1958 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
1959 * to make this directory permeable for the service itself.
1961 * Specifically: for a service which wants a special directory "foo/" we first create a
1962 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
1963 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
1964 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
1965 * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
1966 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
1967 * disabling the access boundary for the service and making sure it only gets access to the
1968 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
1970 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
1971 * owned by the service itself.
1972 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
1973 * files or sockets with other services. */
1975 private_root
= strjoin(params
->prefix
[type
], "/private");
1976 if (!private_root
) {
1981 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
1982 r
= mkdir_safe_label(private_root
, 0700, 0, 0, false);
1986 pp
= strjoin(private_root
, "/", *rt
);
1992 /* Create all directories between the configured directory and this private root, and mark them 0755 */
1993 r
= mkdir_parents_label(pp
, 0755);
1997 /* Finally, create the actual directory for the service */
1998 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
1999 if (r
< 0 && r
!= -EEXIST
)
2002 parent
= dirname_malloc(p
);
2008 r
= path_make_relative(parent
, pp
, &relative
);
2012 /* And link it up from the original place */
2013 r
= symlink_idempotent(relative
, p
);
2020 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2021 if (r
< 0 && r
!= -EEXIST
)
2027 /* First lock down the access mode */
2028 if (chmod(effective
, context
->directories
[type
].mode
) < 0) {
2033 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2034 * a service, and shall not be writable. */
2035 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
2038 /* Then, change the ownership of the whole tree, if necessary */
2039 r
= path_chown_recursive(effective
, uid
, gid
);
2047 *exit_status
= exit_status_table
[type
];
2051 static int setup_smack(
2052 const ExecContext
*context
,
2053 const ExecCommand
*command
) {
2060 if (context
->smack_process_label
) {
2061 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
2065 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2067 _cleanup_free_
char *exec_label
= NULL
;
2069 r
= mac_smack_read(command
->path
, SMACK_ATTR_EXEC
, &exec_label
);
2070 if (r
< 0 && !IN_SET(r
, -ENODATA
, -EOPNOTSUPP
))
2073 r
= mac_smack_apply_pid(0, exec_label
? : SMACK_DEFAULT_PROCESS_LABEL
);
2082 static int compile_bind_mounts(
2083 const ExecContext
*context
,
2084 const ExecParameters
*params
,
2085 BindMount
**ret_bind_mounts
,
2086 unsigned *ret_n_bind_mounts
,
2087 char ***ret_empty_directories
) {
2089 _cleanup_strv_free_
char **empty_directories
= NULL
;
2090 BindMount
*bind_mounts
;
2091 unsigned n
, h
= 0, i
;
2092 ExecDirectoryType t
;
2097 assert(ret_bind_mounts
);
2098 assert(ret_n_bind_mounts
);
2099 assert(ret_empty_directories
);
2101 n
= context
->n_bind_mounts
;
2102 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2103 if (!params
->prefix
[t
])
2106 n
+= strv_length(context
->directories
[t
].paths
);
2110 *ret_bind_mounts
= NULL
;
2111 *ret_n_bind_mounts
= 0;
2112 *ret_empty_directories
= NULL
;
2116 bind_mounts
= new(BindMount
, n
);
2120 for (i
= 0; i
< context
->n_bind_mounts
; i
++) {
2121 BindMount
*item
= context
->bind_mounts
+ i
;
2124 s
= strdup(item
->source
);
2130 d
= strdup(item
->destination
);
2137 bind_mounts
[h
++] = (BindMount
) {
2140 .read_only
= item
->read_only
,
2141 .recursive
= item
->recursive
,
2142 .ignore_enoent
= item
->ignore_enoent
,
2146 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2149 if (!params
->prefix
[t
])
2152 if (strv_isempty(context
->directories
[t
].paths
))
2155 if (context
->dynamic_user
&&
2156 !IN_SET(t
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
)) {
2159 /* So this is for a dynamic user, and we need to make sure the process can access its own
2160 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2161 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2163 private_root
= strjoin(params
->prefix
[t
], "/private");
2164 if (!private_root
) {
2169 r
= strv_consume(&empty_directories
, private_root
);
2176 STRV_FOREACH(suffix
, context
->directories
[t
].paths
) {
2179 if (context
->dynamic_user
&&
2180 !IN_SET(t
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
))
2181 s
= strjoin(params
->prefix
[t
], "/private/", *suffix
);
2183 s
= strjoin(params
->prefix
[t
], "/", *suffix
);
2196 bind_mounts
[h
++] = (BindMount
) {
2201 .ignore_enoent
= false,
2208 *ret_bind_mounts
= bind_mounts
;
2209 *ret_n_bind_mounts
= n
;
2210 *ret_empty_directories
= empty_directories
;
2212 empty_directories
= NULL
;
2217 bind_mount_free_many(bind_mounts
, h
);
2221 static int apply_mount_namespace(
2223 ExecCommand
*command
,
2224 const ExecContext
*context
,
2225 const ExecParameters
*params
,
2226 ExecRuntime
*runtime
) {
2228 _cleanup_strv_free_
char **empty_directories
= NULL
;
2229 char *tmp
= NULL
, *var
= NULL
;
2230 const char *root_dir
= NULL
, *root_image
= NULL
;
2231 NamespaceInfo ns_info
= {
2232 .ignore_protect_paths
= false,
2233 .private_dev
= context
->private_devices
,
2234 .protect_control_groups
= context
->protect_control_groups
,
2235 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
2236 .protect_kernel_modules
= context
->protect_kernel_modules
,
2237 .mount_apivfs
= context
->mount_apivfs
,
2239 bool needs_sandboxing
;
2240 BindMount
*bind_mounts
= NULL
;
2241 unsigned n_bind_mounts
= 0;
2246 /* The runtime struct only contains the parent of the private /tmp,
2247 * which is non-accessible to world users. Inside of it there's a /tmp
2248 * that is sticky, and that's the one we want to use here. */
2250 if (context
->private_tmp
&& runtime
) {
2251 if (runtime
->tmp_dir
)
2252 tmp
= strjoina(runtime
->tmp_dir
, "/tmp");
2253 if (runtime
->var_tmp_dir
)
2254 var
= strjoina(runtime
->var_tmp_dir
, "/tmp");
2257 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2258 root_image
= context
->root_image
;
2261 root_dir
= context
->root_directory
;
2264 r
= compile_bind_mounts(context
, params
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
2269 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2270 * sandbox info, otherwise enforce it, don't ignore protected paths and
2271 * fail if we are enable to apply the sandbox inside the mount namespace.
2273 if (!context
->dynamic_user
&& root_dir
)
2274 ns_info
.ignore_protect_paths
= true;
2276 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2278 r
= setup_namespace(root_dir
, root_image
,
2279 &ns_info
, context
->read_write_paths
,
2280 needs_sandboxing
? context
->read_only_paths
: NULL
,
2281 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
2287 needs_sandboxing
? context
->protect_home
: PROTECT_HOME_NO
,
2288 needs_sandboxing
? context
->protect_system
: PROTECT_SYSTEM_NO
,
2289 context
->mount_flags
,
2290 DISSECT_IMAGE_DISCARD_ON_LOOP
);
2292 bind_mount_free_many(bind_mounts
, n_bind_mounts
);
2294 /* If we couldn't set up the namespace this is probably due to a
2295 * missing capability. In this case, silently proceeed. */
2296 if (IN_SET(r
, -EPERM
, -EACCES
)) {
2297 log_unit_debug_errno(u
, r
, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2304 static int apply_working_directory(
2305 const ExecContext
*context
,
2306 const ExecParameters
*params
,
2308 const bool needs_mount_ns
,
2314 assert(exit_status
);
2316 if (context
->working_directory_home
) {
2319 *exit_status
= EXIT_CHDIR
;
2325 } else if (context
->working_directory
)
2326 wd
= context
->working_directory
;
2330 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2331 if (!needs_mount_ns
&& context
->root_directory
)
2332 if (chroot(context
->root_directory
) < 0) {
2333 *exit_status
= EXIT_CHROOT
;
2339 d
= prefix_roota(context
->root_directory
, wd
);
2341 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
2342 *exit_status
= EXIT_CHDIR
;
2349 static int setup_keyring(
2351 const ExecContext
*context
,
2352 const ExecParameters
*p
,
2353 uid_t uid
, gid_t gid
) {
2355 key_serial_t keyring
;
2362 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2363 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2364 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2365 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2366 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2367 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2369 if (!(p
->flags
& EXEC_NEW_KEYRING
))
2372 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
2375 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
2376 if (keyring
== -1) {
2377 if (errno
== ENOSYS
)
2378 log_unit_debug_errno(u
, errno
, "Kernel keyring not supported, ignoring.");
2379 else if (IN_SET(errno
, EACCES
, EPERM
))
2380 log_unit_debug_errno(u
, errno
, "Kernel keyring access prohibited, ignoring.");
2381 else if (errno
== EDQUOT
)
2382 log_unit_debug_errno(u
, errno
, "Out of kernel keyrings to allocate, ignoring.");
2384 return log_unit_error_errno(u
, errno
, "Setting up kernel keyring failed: %m");
2389 /* Populate they keyring with the invocation ID by default. */
2390 if (!sd_id128_is_null(u
->invocation_id
)) {
2393 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
2395 log_unit_debug_errno(u
, errno
, "Failed to add invocation ID to keyring, ignoring: %m");
2397 if (keyctl(KEYCTL_SETPERM
, key
,
2398 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
2399 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
2400 return log_unit_error_errno(u
, errno
, "Failed to restrict invocation ID permission: %m");
2404 /* And now, make the keyring owned by the service's user */
2405 if (uid_is_valid(uid
) || gid_is_valid(gid
))
2406 if (keyctl(KEYCTL_CHOWN
, keyring
, uid
, gid
, 0) < 0)
2407 return log_unit_error_errno(u
, errno
, "Failed to change ownership of session keyring: %m");
2409 /* When requested link the user keyring into the session keyring. */
2410 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
2414 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2415 * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2416 * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
2418 saved_uid
= getuid();
2419 saved_gid
= getgid();
2421 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2422 if (setregid(gid
, -1) < 0)
2423 return log_unit_error_errno(u
, errno
, "Failed to change GID for user keyring: %m");
2426 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2427 if (setreuid(uid
, -1) < 0) {
2428 (void) setregid(saved_gid
, -1);
2429 return log_unit_error_errno(u
, errno
, "Failed to change UID for user keyring: %m");
2433 if (keyctl(KEYCTL_LINK
,
2434 KEY_SPEC_USER_KEYRING
,
2435 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
2439 (void) setreuid(saved_uid
, -1);
2440 (void) setregid(saved_gid
, -1);
2442 return log_unit_error_errno(u
, r
, "Failed to link user keyring into session keyring: %m");
2445 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2446 if (setreuid(saved_uid
, -1) < 0) {
2447 (void) setregid(saved_gid
, -1);
2448 return log_unit_error_errno(u
, errno
, "Failed to change UID back for user keyring: %m");
2452 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2453 if (setregid(saved_gid
, -1) < 0)
2454 return log_unit_error_errno(u
, errno
, "Failed to change GID back for user keyring: %m");
/* Appends the valid (i.e. non-negative) file descriptors of a socket pair to 'array' and advances the fill
 * counter *n accordingly. 'pair' may be NULL, in which case nothing happens. The caller has to make sure the
 * array has room for up to two more entries. */
static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
        unsigned k;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2474 static int close_remaining_fds(
2475 const ExecParameters
*params
,
2476 ExecRuntime
*runtime
,
2477 DynamicCreds
*dcreds
,
2480 int *fds
, unsigned n_fds
) {
2482 unsigned n_dont_close
= 0;
2483 int dont_close
[n_fds
+ 12];
2487 if (params
->stdin_fd
>= 0)
2488 dont_close
[n_dont_close
++] = params
->stdin_fd
;
2489 if (params
->stdout_fd
>= 0)
2490 dont_close
[n_dont_close
++] = params
->stdout_fd
;
2491 if (params
->stderr_fd
>= 0)
2492 dont_close
[n_dont_close
++] = params
->stderr_fd
;
2495 dont_close
[n_dont_close
++] = socket_fd
;
2497 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
2498 n_dont_close
+= n_fds
;
2502 append_socket_pair(dont_close
, &n_dont_close
, runtime
->netns_storage_socket
);
2506 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->user
->storage_socket
);
2508 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->group
->storage_socket
);
2511 if (user_lookup_fd
>= 0)
2512 dont_close
[n_dont_close
++] = user_lookup_fd
;
2514 return close_all_fds(dont_close
, n_dont_close
);
2517 static int send_user_lookup(
2525 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2526 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2529 if (user_lookup_fd
< 0)
2532 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
2535 if (writev(user_lookup_fd
,
2537 IOVEC_INIT(&uid
, sizeof(uid
)),
2538 IOVEC_INIT(&gid
, sizeof(gid
)),
2539 IOVEC_INIT_STRING(unit
->id
) }, 3) < 0)
2545 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
2552 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2557 if (!c
->working_directory_home
)
2561 /* Hardcode /root as home directory for UID 0 */
2566 r
= get_home_dir(buf
);
2574 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
2575 _cleanup_strv_free_
char ** list
= NULL
;
2576 ExecDirectoryType t
;
2583 assert(c
->dynamic_user
);
2585 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2586 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2589 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2592 if (t
== EXEC_DIRECTORY_CONFIGURATION
)
2598 STRV_FOREACH(i
, c
->directories
[t
].paths
) {
2601 if (t
== EXEC_DIRECTORY_RUNTIME
)
2602 e
= strjoin(p
->prefix
[t
], "/", *i
);
2604 e
= strjoin(p
->prefix
[t
], "/private/", *i
);
2608 r
= strv_consume(&list
, e
);
2620 static int exec_child(
2622 ExecCommand
*command
,
2623 const ExecContext
*context
,
2624 const ExecParameters
*params
,
2625 ExecRuntime
*runtime
,
2626 DynamicCreds
*dcreds
,
2631 unsigned n_storage_fds
,
2632 unsigned n_socket_fds
,
2637 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **accum_env
= NULL
, **final_argv
= NULL
;
2638 _cleanup_free_
char *mac_selinux_context_net
= NULL
, *home_buffer
= NULL
;
2639 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
2640 const char *username
= NULL
, *groupname
= NULL
;
2641 const char *home
= NULL
, *shell
= NULL
;
2642 dev_t journal_stream_dev
= 0;
2643 ino_t journal_stream_ino
= 0;
2644 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2645 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
2646 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
2647 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
2649 bool use_selinux
= false;
2652 bool use_smack
= false;
2655 bool use_apparmor
= false;
2657 uid_t uid
= UID_INVALID
;
2658 gid_t gid
= GID_INVALID
;
2659 int i
, r
, ngids
= 0;
2661 ExecDirectoryType dt
;
2668 assert(exit_status
);
2670 rename_process_from_path(command
->path
);
2672 /* We reset exactly these signals, since they are the
2673 * only ones we set to SIG_IGN in the main daemon. All
2674 * others we leave untouched because we set them to
2675 * SIG_DFL or a valid handler initially, both of which
2676 * will be demoted to SIG_DFL. */
2677 (void) default_signals(SIGNALS_CRASH_HANDLER
,
2678 SIGNALS_IGNORE
, -1);
2680 if (context
->ignore_sigpipe
)
2681 (void) ignore_signals(SIGPIPE
, -1);
2683 r
= reset_signal_mask();
2685 *exit_status
= EXIT_SIGNAL_MASK
;
2686 return log_unit_error_errno(unit
, r
, "Failed to set process signal mask: %m");
2689 if (params
->idle_pipe
)
2690 do_idle_pipe_dance(params
->idle_pipe
);
2692 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2693 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2694 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2695 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2698 log_set_open_when_needed(true);
2700 /* In case anything used libc syslog(), close this here, too */
2703 n_fds
= n_storage_fds
+ n_socket_fds
;
2704 r
= close_remaining_fds(params
, runtime
, dcreds
, user_lookup_fd
, socket_fd
, fds
, n_fds
);
2706 *exit_status
= EXIT_FDS
;
2707 return log_unit_error_errno(unit
, r
, "Failed to close unwanted file descriptors: %m");
2710 if (!context
->same_pgrp
)
2712 *exit_status
= EXIT_SETSID
;
2713 return log_unit_error_errno(unit
, errno
, "Failed to create new process session: %m");
2716 exec_context_tty_reset(context
, params
);
2718 if (unit_shall_confirm_spawn(unit
)) {
2719 const char *vc
= params
->confirm_spawn
;
2720 _cleanup_free_
char *cmdline
= NULL
;
2722 cmdline
= exec_command_line(argv
);
2724 *exit_status
= EXIT_MEMORY
;
2728 r
= ask_for_confirmation(vc
, unit
, cmdline
);
2729 if (r
!= CONFIRM_EXECUTE
) {
2730 if (r
== CONFIRM_PRETEND_SUCCESS
) {
2731 *exit_status
= EXIT_SUCCESS
;
2734 *exit_status
= EXIT_CONFIRM
;
2735 log_unit_error(unit
, "Execution cancelled by the user");
2740 if (context
->dynamic_user
&& dcreds
) {
2741 _cleanup_strv_free_
char **suggested_paths
= NULL
;
2743 /* Make sure we bypass our own NSS module for any NSS checks */
2744 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2745 *exit_status
= EXIT_USER
;
2746 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
2749 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
2751 *exit_status
= EXIT_MEMORY
;
2755 r
= dynamic_creds_realize(dcreds
, suggested_paths
, &uid
, &gid
);
2757 *exit_status
= EXIT_USER
;
2759 log_unit_error(unit
, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2762 return log_unit_error_errno(unit
, r
, "Failed to update dynamic user credentials: %m");
2765 if (!uid_is_valid(uid
)) {
2766 *exit_status
= EXIT_USER
;
2767 log_unit_error(unit
, "UID validation failed for \""UID_FMT
"\"", uid
);
2771 if (!gid_is_valid(gid
)) {
2772 *exit_status
= EXIT_USER
;
2773 log_unit_error(unit
, "GID validation failed for \""GID_FMT
"\"", gid
);
2778 username
= dcreds
->user
->name
;
2781 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
2783 *exit_status
= EXIT_USER
;
2784 return log_unit_error_errno(unit
, r
, "Failed to determine user credentials: %m");
2787 r
= get_fixed_group(context
, &groupname
, &gid
);
2789 *exit_status
= EXIT_GROUP
;
2790 return log_unit_error_errno(unit
, r
, "Failed to determine group credentials: %m");
2794 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2795 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
2796 &supplementary_gids
, &ngids
);
2798 *exit_status
= EXIT_GROUP
;
2799 return log_unit_error_errno(unit
, r
, "Failed to determine supplementary groups: %m");
2802 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
2804 *exit_status
= EXIT_USER
;
2805 return log_unit_error_errno(unit
, r
, "Failed to send user credentials to PID1: %m");
2808 user_lookup_fd
= safe_close(user_lookup_fd
);
2810 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
2812 *exit_status
= EXIT_CHDIR
;
2813 return log_unit_error_errno(unit
, r
, "Failed to determine $HOME for user: %m");
2816 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2817 * must make sure to drop O_NONBLOCK */
2819 (void) fd_nonblock(socket_fd
, false);
2821 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
2823 *exit_status
= EXIT_STDIN
;
2824 return log_unit_error_errno(unit
, r
, "Failed to set up standard input: %m");
2827 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2829 *exit_status
= EXIT_STDOUT
;
2830 return log_unit_error_errno(unit
, r
, "Failed to set up standard output: %m");
2833 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2835 *exit_status
= EXIT_STDERR
;
2836 return log_unit_error_errno(unit
, r
, "Failed to set up standard error output: %m");
2839 if (params
->cgroup_path
) {
2840 r
= cg_attach_everywhere(params
->cgroup_supported
, params
->cgroup_path
, 0, NULL
, NULL
);
2842 *exit_status
= EXIT_CGROUP
;
2843 return log_unit_error_errno(unit
, r
, "Failed to attach to cgroup %s: %m", params
->cgroup_path
);
2847 if (context
->oom_score_adjust_set
) {
2848 char t
[DECIMAL_STR_MAX(context
->oom_score_adjust
)];
2850 /* When we can't make this change due to EPERM, then
2851 * let's silently skip over it. User namespaces
2852 * prohibit write access to this file, and we
2853 * shouldn't trip up over that. */
2855 sprintf(t
, "%i", context
->oom_score_adjust
);
2856 r
= write_string_file("/proc/self/oom_score_adj", t
, 0);
2857 if (IN_SET(r
, -EPERM
, -EACCES
))
2858 log_unit_debug_errno(unit
, r
, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2860 *exit_status
= EXIT_OOM_ADJUST
;
2861 return log_unit_error_errno(unit
, r
, "Failed to adjust OOM setting: %m");
2865 if (context
->nice_set
)
2866 if (setpriority(PRIO_PROCESS
, 0, context
->nice
) < 0) {
2867 *exit_status
= EXIT_NICE
;
2868 return log_unit_error_errno(unit
, errno
, "Failed to set up process scheduling priority (nice level): %m");
2871 if (context
->cpu_sched_set
) {
2872 struct sched_param param
= {
2873 .sched_priority
= context
->cpu_sched_priority
,
2876 r
= sched_setscheduler(0,
2877 context
->cpu_sched_policy
|
2878 (context
->cpu_sched_reset_on_fork
?
2879 SCHED_RESET_ON_FORK
: 0),
2882 *exit_status
= EXIT_SETSCHEDULER
;
2883 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU scheduling: %m");
2887 if (context
->cpuset
)
2888 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context
->cpuset_ncpus
), context
->cpuset
) < 0) {
2889 *exit_status
= EXIT_CPUAFFINITY
;
2890 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU affinity: %m");
2893 if (context
->ioprio_set
)
2894 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
2895 *exit_status
= EXIT_IOPRIO
;
2896 return log_unit_error_errno(unit
, errno
, "Failed to set up IO scheduling priority: %m");
2899 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
2900 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
2901 *exit_status
= EXIT_TIMERSLACK
;
2902 return log_unit_error_errno(unit
, errno
, "Failed to set up timer slack: %m");
2905 if (context
->personality
!= PERSONALITY_INVALID
) {
2906 r
= safe_personality(context
->personality
);
2908 *exit_status
= EXIT_PERSONALITY
;
2909 return log_unit_error_errno(unit
, r
, "Failed to set up execution domain (personality): %m");
2913 if (context
->utmp_id
)
2914 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
2916 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
2917 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
2921 if (context
->user
) {
2922 r
= chown_terminal(STDIN_FILENO
, uid
);
2924 *exit_status
= EXIT_STDIN
;
2925 return log_unit_error_errno(unit
, r
, "Failed to change ownership of terminal: %m");
2929 /* If delegation is enabled we'll pass ownership of the cgroup
2930 * (but only in systemd's own controller hierarchy!) to the
2931 * user of the new process. */
2932 if (params
->cgroup_path
&& context
->user
&& (params
->flags
& EXEC_CGROUP_DELEGATE
)) {
2933 r
= cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, 0644, uid
, gid
);
2935 *exit_status
= EXIT_CGROUP
;
2936 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
2939 r
= cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, 0755, uid
, gid
);
2941 *exit_status
= EXIT_CGROUP
;
2942 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
2946 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
2947 r
= setup_exec_directory(context
, params
, uid
, gid
, dt
, exit_status
);
2949 return log_unit_error_errno(unit
, r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
2952 r
= build_environment(
2964 *exit_status
= EXIT_MEMORY
;
2968 r
= build_pass_environment(context
, &pass_env
);
2970 *exit_status
= EXIT_MEMORY
;
2974 accum_env
= strv_env_merge(5,
2975 params
->environment
,
2978 context
->environment
,
2982 *exit_status
= EXIT_MEMORY
;
2985 accum_env
= strv_env_clean(accum_env
);
2987 (void) umask(context
->umask
);
2989 r
= setup_keyring(unit
, context
, params
, uid
, gid
);
2991 *exit_status
= EXIT_KEYRING
;
2992 return log_unit_error_errno(unit
, r
, "Failed to set up kernel keyring: %m");
2995 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
2996 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2998 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
2999 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
3001 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3002 if (needs_ambient_hack
)
3003 needs_setuid
= false;
3005 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
3007 if (needs_sandboxing
) {
3008 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3009 * present. The actual MAC context application will happen later, as late as possible, to avoid
3010 * impacting our own code paths. */
3013 use_selinux
= mac_selinux_use();
3016 use_smack
= mac_smack_use();
3019 use_apparmor
= mac_apparmor_use();
3024 if (context
->pam_name
&& username
) {
3025 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
3027 *exit_status
= EXIT_PAM
;
3028 return log_unit_error_errno(unit
, r
, "Failed to set up PAM session: %m");
3033 if (context
->private_network
&& runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
3034 if (ns_type_supported(NAMESPACE_NET
)) {
3035 r
= setup_netns(runtime
->netns_storage_socket
);
3037 *exit_status
= EXIT_NETWORK
;
3038 return log_unit_error_errno(unit
, r
, "Failed to set up network namespacing: %m");
3041 log_unit_warning(unit
, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3044 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
3045 if (needs_mount_namespace
) {
3046 r
= apply_mount_namespace(unit
, command
, context
, params
, runtime
);
3048 *exit_status
= EXIT_NAMESPACE
;
3049 return log_unit_error_errno(unit
, r
, "Failed to set up mount namespacing: %m");
3053 /* Apply just after mount namespace setup */
3054 r
= apply_working_directory(context
, params
, home
, needs_mount_namespace
, exit_status
);
3056 return log_unit_error_errno(unit
, r
, "Changing to the requested working directory failed: %m");
3058 /* Drop groups as early as possible */
3060 r
= enforce_groups(gid
, supplementary_gids
, ngids
);
3062 *exit_status
= EXIT_GROUP
;
3063 return log_unit_error_errno(unit
, r
, "Changing group credentials failed: %m");
3067 if (needs_sandboxing
) {
3069 if (use_selinux
&& params
->selinux_context_net
&& socket_fd
>= 0) {
3070 r
= mac_selinux_get_child_mls_label(socket_fd
, command
->path
, context
->selinux_context
, &mac_selinux_context_net
);
3072 *exit_status
= EXIT_SELINUX_CONTEXT
;
3073 return log_unit_error_errno(unit
, r
, "Failed to determine SELinux context: %m");
3078 if (context
->private_users
) {
3079 r
= setup_private_users(uid
, gid
);
3081 *exit_status
= EXIT_USER
;
3082 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing: %m");
3087 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3088 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3089 * was needed to upload the policy and can now be closed as well. */
3090 r
= close_all_fds(fds
, n_fds
);
3092 r
= shift_fds(fds
, n_fds
);
3094 r
= flags_fds(fds
, n_storage_fds
, n_socket_fds
, context
->non_blocking
);
3096 *exit_status
= EXIT_FDS
;
3097 return log_unit_error_errno(unit
, r
, "Failed to adjust passed file descriptors: %m");
3100 secure_bits
= context
->secure_bits
;
3102 if (needs_sandboxing
) {
3105 for (i
= 0; i
< _RLIMIT_MAX
; i
++) {
3107 if (!context
->rlimit
[i
])
3110 r
= setrlimit_closest(i
, context
->rlimit
[i
]);
3112 *exit_status
= EXIT_LIMITS
;
3113 return log_unit_error_errno(unit
, r
, "Failed to adjust resource limit %s: %m", rlimit_to_string(i
));
3117 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3118 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
3119 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
3120 *exit_status
= EXIT_LIMITS
;
3121 return log_unit_error_errno(unit
, errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3125 bset
= context
->capability_bounding_set
;
3126 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3127 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3128 * instead of us doing that */
3129 if (needs_ambient_hack
)
3130 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
3131 (UINT64_C(1) << CAP_SETUID
) |
3132 (UINT64_C(1) << CAP_SETGID
);
3134 if (!cap_test_all(bset
)) {
3135 r
= capability_bounding_set_drop(bset
, false);
3137 *exit_status
= EXIT_CAPABILITIES
;
3138 return log_unit_error_errno(unit
, r
, "Failed to drop capabilities: %m");
3142 /* This is done before enforce_user, but ambient set
3143 * does not survive over setresuid() if keep_caps is not set. */
3144 if (!needs_ambient_hack
&&
3145 context
->capability_ambient_set
!= 0) {
3146 r
= capability_ambient_set_apply(context
->capability_ambient_set
, true);
3148 *exit_status
= EXIT_CAPABILITIES
;
3149 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (before UID change): %m");
3155 if (context
->user
) {
3156 r
= enforce_user(context
, uid
);
3158 *exit_status
= EXIT_USER
;
3159 return log_unit_error_errno(unit
, r
, "Failed to change UID to " UID_FMT
": %m", uid
);
3162 if (!needs_ambient_hack
&&
3163 context
->capability_ambient_set
!= 0) {
3165 /* Fix the ambient capabilities after user change. */
3166 r
= capability_ambient_set_apply(context
->capability_ambient_set
, false);
3168 *exit_status
= EXIT_CAPABILITIES
;
3169 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (after UID change): %m");
3172 /* If we were asked to change user and ambient capabilities
3173 * were requested, we had to add keep-caps to the securebits
3174 * so that we would maintain the inherited capability set
3175 * through the setresuid(). Make sure that the bit is added
3176 * also to the context secure_bits so that we don't try to
3177 * drop the bit away next. */
3179 secure_bits
|= 1<<SECURE_KEEP_CAPS
;
3184 if (needs_sandboxing
) {
3185 /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3186 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3187 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3188 * are restricted. */
3192 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
3195 r
= setexeccon(exec_context
);
3197 *exit_status
= EXIT_SELINUX_CONTEXT
;
3198 return log_unit_error_errno(unit
, r
, "Failed to change SELinux context to %s: %m", exec_context
);
3206 r
= setup_smack(context
, command
);
3208 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
3209 return log_unit_error_errno(unit
, r
, "Failed to set SMACK process label: %m");
3215 if (use_apparmor
&& context
->apparmor_profile
) {
3216 r
= aa_change_onexec(context
->apparmor_profile
);
3217 if (r
< 0 && !context
->apparmor_profile_ignore
) {
3218 *exit_status
= EXIT_APPARMOR_PROFILE
;
3219 return log_unit_error_errno(unit
, errno
, "Failed to prepare AppArmor profile change to %s: %m", context
->apparmor_profile
);
3224 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3225 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3226 if (prctl(PR_GET_SECUREBITS
) != secure_bits
)
3227 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
3228 *exit_status
= EXIT_SECUREBITS
;
3229 return log_unit_error_errno(unit
, errno
, "Failed to set process secure bits: %m");
3232 if (context_has_no_new_privileges(context
))
3233 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
3234 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
3235 return log_unit_error_errno(unit
, errno
, "Failed to disable new privileges: %m");
3239 r
= apply_address_families(unit
, context
);
3241 *exit_status
= EXIT_ADDRESS_FAMILIES
;
3242 return log_unit_error_errno(unit
, r
, "Failed to restrict address families: %m");
3245 r
= apply_memory_deny_write_execute(unit
, context
);
3247 *exit_status
= EXIT_SECCOMP
;
3248 return log_unit_error_errno(unit
, r
, "Failed to disable writing to executable memory: %m");
3251 r
= apply_restrict_realtime(unit
, context
);
3253 *exit_status
= EXIT_SECCOMP
;
3254 return log_unit_error_errno(unit
, r
, "Failed to apply realtime restrictions: %m");
3257 r
= apply_restrict_namespaces(unit
, context
);
3259 *exit_status
= EXIT_SECCOMP
;
3260 return log_unit_error_errno(unit
, r
, "Failed to apply namespace restrictions: %m");
3263 r
= apply_protect_sysctl(unit
, context
);
3265 *exit_status
= EXIT_SECCOMP
;
3266 return log_unit_error_errno(unit
, r
, "Failed to apply sysctl restrictions: %m");
3269 r
= apply_protect_kernel_modules(unit
, context
);
3271 *exit_status
= EXIT_SECCOMP
;
3272 return log_unit_error_errno(unit
, r
, "Failed to apply module loading restrictions: %m");
3275 r
= apply_private_devices(unit
, context
);
3277 *exit_status
= EXIT_SECCOMP
;
3278 return log_unit_error_errno(unit
, r
, "Failed to set up private devices: %m");
3281 r
= apply_syscall_archs(unit
, context
);
3283 *exit_status
= EXIT_SECCOMP
;
3284 return log_unit_error_errno(unit
, r
, "Failed to apply syscall architecture restrictions: %m");
3287 r
= apply_lock_personality(unit
, context
);
3289 *exit_status
= EXIT_SECCOMP
;
3290 return log_unit_error_errno(unit
, r
, "Failed to lock personalities: %m");
3293 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3294 * by the filter as little as possible. */
3295 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
3297 *exit_status
= EXIT_SECCOMP
;
3298 return log_unit_error_errno(unit
, r
, "Failed to apply system call filters: %m");
3303 if (!strv_isempty(context
->unset_environment
)) {
3306 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
3308 *exit_status
= EXIT_MEMORY
;
3312 strv_free(accum_env
);
3316 final_argv
= replace_env_argv(argv
, accum_env
);
3318 *exit_status
= EXIT_MEMORY
;
3322 if (_unlikely_(log_get_max_level() >= LOG_DEBUG
)) {
3323 _cleanup_free_
char *line
;
3325 line
= exec_command_line(final_argv
);
3327 log_struct(LOG_DEBUG
,
3328 "EXECUTABLE=%s", command
->path
,
3329 LOG_UNIT_MESSAGE(unit
, "Executing: %s", line
),
3331 LOG_UNIT_INVOCATION_ID(unit
),
3336 execve(command
->path
, final_argv
, accum_env
);
3338 if (errno
== ENOENT
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
)) {
3340 log_struct_errno(LOG_INFO
, errno
,
3341 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3343 LOG_UNIT_INVOCATION_ID(unit
),
3344 LOG_UNIT_MESSAGE(unit
, "Executable %s missing, skipping: %m",
3346 "EXECUTABLE=%s", command
->path
,
3352 *exit_status
= EXIT_EXEC
;
3353 return log_unit_error_errno(unit
, errno
, "Failed to execute command: %m");
/* Starts one command of a unit.
 *
 * As far as visible here, the function: checks the socket fd setup when any of stdin/stdout/stderr is
 * configured as a socket, resolves the named stdio fds, loads the environment files, logs the command
 * line, runs exec_child() for the new process and logs the step that failed if it comes back, and
 * finally logs the new PID, attaches it to the unit's cgroup and records the start in
 * command->exec_status.
 *
 * Failures of the preparation steps are logged against the unit and returned as negative errno-style
 * values through log_unit_error_errno().
 *
 * NOTE(review): the fork() call itself, the remaining parameters and the final return are not part of
 * the visible lines; only the "Failed to fork" error path is. Confirm against the complete file. */
3356 int exec_spawn(Unit
*unit
,
3357 ExecCommand
*command
,
3358 const ExecContext
*context
,
3359 const ExecParameters
*params
,
3360 ExecRuntime
*runtime
,
3361 DynamicCreds
*dcreds
,
3364 _cleanup_strv_free_
char **files_env
= NULL
;
3366 unsigned n_storage_fds
= 0, n_socket_fds
= 0;
3367 _cleanup_free_
char *line
= NULL
;
3369 int named_iofds
[3] = { -1, -1, -1 };
/* A non-zero number of passed fds requires the fds array to be there. */
3378 assert(params
->fds
|| (params
->n_storage_fds
+ params
->n_socket_fds
<= 0));
/* Socket-based stdio: having more than one socket, or none at all, is logged as an error; otherwise
 * the first passed fd is used as the socket. */
3380 if (context
->std_input
== EXEC_INPUT_SOCKET
||
3381 context
->std_output
== EXEC_OUTPUT_SOCKET
||
3382 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
3384 if (params
->n_socket_fds
> 1) {
3385 log_unit_error(unit
, "Got more than one socket.");
3389 if (params
->n_socket_fds
== 0) {
3390 log_unit_error(unit
, "Got no socket.");
3394 socket_fd
= params
->fds
[0];
3398 n_storage_fds
= params
->n_storage_fds
;
3399 n_socket_fds
= params
->n_socket_fds
;
/* Map the stdio streams that are configured as named fds onto the passed fds; the result is stored
 * in named_iofds[]. */
3402 r
= exec_context_named_iofds(unit
, context
, params
, named_iofds
);
3404 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
3406 r
= exec_context_load_environment(unit
, context
, &files_env
);
3408 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
/* An explicitly passed params->argv takes precedence over the command's own argv. */
3410 argv
= params
->argv
?: command
->argv
;
3411 line
= exec_command_line(argv
);
3415 log_struct(LOG_DEBUG
,
3416 LOG_UNIT_MESSAGE(unit
, "About to execute: %s", line
),
3417 "EXECUTABLE=%s", command
->path
,
3419 LOG_UNIT_INVOCATION_ID(unit
),
3424 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
/* exec_child() reports the step that failed through exit_status; it is turned into a string for the
 * "Failed at step ..." message below. */
3427 int exit_status
= EXIT_SUCCESS
;
3429 r
= exec_child(unit
,
3442 unit
->manager
->user_lookup_fds
[1],
3446 log_struct_errno(LOG_ERR
, r
,
3447 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3449 LOG_UNIT_INVOCATION_ID(unit
),
3450 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
3451 exit_status_to_string(exit_status
, EXIT_STATUS_SYSTEMD
),
3453 "EXECUTABLE=%s", command
->path
,
3460 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
3462 /* We add the new process to the cgroup both in the child (so
3463 * that we can be sure that no user code is ever executed
3464 * outside of the cgroup) and in the parent (so that we can be
3465 * sure that when we kill the cgroup the process will be
3467 if (params
->cgroup_path
)
3468 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, pid
);
3470 exec_status_start(&command
->exec_status
, pid
);
/* Fills an ExecContext with its default settings.
 *
 * Several defaults are sentinels that other code in this file checks before acting:
 *   - timer_slack_nsec == NSEC_INFINITY: exec_child() only calls prctl(PR_SET_TIMERSLACK) for other values
 *   - personality == PERSONALITY_INVALID: exec_child() only applies a personality for other values
 *   - log_level_max == -1: exec_context_dump() only prints LogLevelMax when the value is >= 0 */
3476 void exec_context_init(ExecContext
*c
) {
3477 ExecDirectoryType i
;
/* IO priority: best-effort class with priority data 0; CPU scheduling: SCHED_OTHER. */
3482 c
->ioprio
= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 0);
3483 c
->cpu_sched_policy
= SCHED_OTHER
;
/* Log with facility LOG_DAEMON at level LOG_INFO. */
3484 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
3485 c
->syslog_level_prefix
= true;
3486 c
->ignore_sigpipe
= true;
3487 c
->timer_slack_nsec
= NSEC_INFINITY
;
3488 c
->personality
= PERSONALITY_INVALID
;
/* Every exec directory type starts out with mode 0755. */
3489 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3490 c
->directories
[i
].mode
= 0755;
/* Start with the CAP_ALL bounding set and the NAMESPACE_FLAGS_ALL namespace mask. */
3491 c
->capability_bounding_set
= CAP_ALL
;
3492 c
->restrict_namespaces
= NAMESPACE_FLAGS_ALL
;
3493 c
->log_level_max
= -1;
/* Releases everything the ExecContext owns. The structure itself is not freed here.
 *
 * Each field is assigned the return value of its free function, so no stale pointer is left behind
 * (presumably these helpers return NULL -- confirm against their definitions). */
3496 void exec_context_done(ExecContext
*c
) {
3497 ExecDirectoryType i
;
/* The four environment-related string lists */
3502 c
->environment
= strv_free(c
->environment
);
3503 c
->environment_files
= strv_free(c
->environment_files
);
3504 c
->pass_environment
= strv_free(c
->pass_environment
);
3505 c
->unset_environment
= strv_free(c
->unset_environment
);
/* One optional, separately allocated entry per resource limit */
3507 for (l
= 0; l
< ELEMENTSOF(c
->rlimit
); l
++)
3508 c
->rlimit
[l
] = mfree(c
->rlimit
[l
]);
/* One fd name per stdio stream (stdin, stdout, stderr) */
3510 for (l
= 0; l
< 3; l
++)
3511 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
/* Plain string settings */
3513 c
->working_directory
= mfree(c
->working_directory
);
3514 c
->root_directory
= mfree(c
->root_directory
);
3515 c
->root_image
= mfree(c
->root_image
);
3516 c
->tty_path
= mfree(c
->tty_path
);
3517 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
3518 c
->user
= mfree(c
->user
);
3519 c
->group
= mfree(c
->group
);
3521 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
3523 c
->pam_name
= mfree(c
->pam_name
);
/* Path lists and bind mounts */
3525 c
->read_only_paths
= strv_free(c
->read_only_paths
);
3526 c
->read_write_paths
= strv_free(c
->read_write_paths
);
3527 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
3529 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
/* The CPU set is released with CPU_FREE() rather than mfree() */
3532 CPU_FREE(c
->cpuset
);
3534 c
->utmp_id
= mfree(c
->utmp_id
);
/* MAC (SELinux, AppArmor, SMACK) labels */
3535 c
->selinux_context
= mfree(c
->selinux_context
);
3536 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
3537 c
->smack_process_label
= mfree(c
->smack_process_label
);
/* Container types: a hashmap and two sets */
3539 c
->syscall_filter
= hashmap_free(c
->syscall_filter
);
3540 c
->syscall_archs
= set_free(c
->syscall_archs
);
3541 c
->address_families
= set_free(c
->address_families
);
/* Path lists of every exec directory type */
3543 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3544 c
->directories
[i
].paths
= strv_free(c
->directories
[i
].paths
);
/* Back to the same value exec_context_init() assigns */
3546 c
->log_level_max
= -1;
3548 exec_context_free_log_extra_fields(c
);
/* Drop the stdin data buffer and reset its recorded size along with it */
3550 c
->stdin_data
= mfree(c
->stdin_data
);
3551 c
->stdin_data_size
= 0;
/* Removes the runtime directories configured in the context.
 *
 * For every entry of c->directories[EXEC_DIRECTORY_RUNTIME].paths the path
 * "<runtime_prefix>/<entry>" is built and removed with rm_rf(), including the directory itself
 * (REMOVE_ROOT). The result of rm_rf() is deliberately discarded (cast to void).
 *
 * NOTE(review): what happens for a NULL runtime_prefix, the handling of a failed strjoin() and the
 * return value are not part of the visible lines -- confirm against the complete file. */
3554 int exec_context_destroy_runtime_directory(ExecContext
*c
, const char *runtime_prefix
) {
3559 if (!runtime_prefix
)
3562 STRV_FOREACH(i
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) {
/* p is released automatically at the end of each iteration (_cleanup_free_) */
3563 _cleanup_free_
char *p
;
3565 p
= strjoin(runtime_prefix
, "/", *i
);
3569 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3571 (void) rm_rf(p
, REMOVE_ROOT
);
/* Releases the path string and the argument vector of a single ExecCommand. The ExecCommand
 * structure itself is not freed here. */
3577 void exec_command_done(ExecCommand
*c
) {
3580 c
->path
= mfree(c
->path
);
3582 c
->argv
= strv_free(c
->argv
);
/* Calls exec_command_done() on each of the n ExecCommand entries of the array starting at c. */
3585 void exec_command_done_array(ExecCommand
*c
, unsigned n
) {
3588 for (i
= 0; i
< n
; i
++)
3589 exec_command_done(c
+i
);
/* Tears down a list of ExecCommand entries: an entry i is unlinked from the list headed by c (linked
 * through the "command" list field) and its contents are released with exec_command_done().
 *
 * NOTE(review): the loop around these two steps, the freeing of the entry itself and the return
 * value are not part of the visible lines; exec_command_free_array() stores the return value back
 * into the list head, so presumably NULL is returned -- confirm against the complete file. */
3592 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
3596 LIST_REMOVE(command
, c
, i
);
3597 exec_command_done(i
);
/* Treats c as an array of n list heads: each list is released with exec_command_free_list() and the
 * slot is overwritten with that function's return value. */
3604 void exec_command_free_array(ExecCommand
**c
, unsigned n
) {
3607 for (i
= 0; i
< n
; i
++)
3608 c
[i
] = exec_command_free_list(c
[i
]);
/* Context handed to the invalid_env() callback as its userdata pointer. invalid_env() reads the
 * "unit" and "path" members; exec_context_load_environment() sets "path" to the environment file
 * being processed. */
3611 typedef struct InvalidEnvInfo
{
/* Callback passed to strv_env_clean_with_callback() by exec_context_load_environment().
 *
 * p is the offending environment assignment; userdata points to an InvalidEnvInfo. The assignment is
 * logged against info->unit together with info->path. */
3616 static void invalid_env(const char *p
, void *userdata
) {
3617 InvalidEnvInfo
*info
= userdata
;
3619 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
/* Returns the fd name to look up for one of the three stdio streams.
 *
 * For a stream that is configured as a named fd, the configured name c->stdio_fdname[...] is returned,
 * falling back to "stdin", "stdout" or "stderr" when no name was configured.
 *
 * NOTE(review): the dispatch on fd_index and what is returned for a stream that is not a named fd
 * are not part of the visible lines; exec_context_named_iofds() checks the result for NULL, so
 * presumably NULL is returned in that case -- confirm against the complete file. */
3622 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
/* Standard input */
3628 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
3631 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
/* Standard output */
3634 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
3637 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
/* Standard error */
3640 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
3643 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
/* Resolves the stdio streams that are configured as named fds to actual file descriptors.
 *
 * The passed fds (p->fds, with names in p->fd_names, p->n_storage_fds + p->n_socket_fds entries) are
 * searched by name. For each match the fd is stored in named_iofds[] at the index of the stream
 * (STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO). Only slots that are still negative are filled, so
 * the first matching fd wins.
 *
 * Returns 0 when no stream is left unresolved, -ENOENT otherwise.
 *
 * NOTE(review): the statements that decrement "targets" after a match are not part of the visible
 * lines -- confirm against the complete file. */
3650 int exec_context_named_iofds(Unit
*unit
, const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[3]) {
3651 unsigned i
, targets
;
3652 const char* stdio_fdname
[3];
/* Number of stdio streams that are configured as named fds (each comparison contributes 0 or 1) */
3658 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
3659 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
3660 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
/* The name to look for, per stream */
3662 for (i
= 0; i
< 3; i
++)
3663 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
3665 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
/* Walk the passed fds and stop early once nothing is left to resolve. Because of the else-if
 * chain a single passed fd is assigned to at most one stream. */
3667 for (i
= 0; i
< n_fds
&& targets
> 0; i
++)
3668 if (named_iofds
[STDIN_FILENO
] < 0 &&
3669 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
3670 stdio_fdname
[STDIN_FILENO
] &&
3671 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
3673 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
3676 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
3677 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
3678 stdio_fdname
[STDOUT_FILENO
] &&
3679 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
3681 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
3684 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
3685 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
3686 stdio_fdname
[STDERR_FILENO
] &&
3687 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
3689 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
/* Anything still unresolved means a configured fd name was not among the passed fds */
3693 return targets
== 0 ? 0 : -ENOENT
;
/* Loads the environment files configured in c->environment_files.
 *
 * As far as visible here: each entry is checked for being an absolute path, expanded with
 * safe_glob(), every matching file is read with load_env_file(), invalid assignments are dropped
 * and logged through invalid_env(), and the result is merged into the accumulated list r with
 * strv_env_merge().
 *
 * NOTE(review): how "fn" is derived from the entry, the purpose of the "ignore" flag, all error
 * handling, and how the result is handed back through l are not part of the visible lines --
 * confirm against the complete file. */
3696 int exec_context_load_environment(Unit
*unit
, const ExecContext
*c
, char ***l
) {
3697 char **i
, **r
= NULL
;
3702 STRV_FOREACH(i
, c
->environment_files
) {
3706 bool ignore
= false;
/* The glob result is released automatically when it goes out of scope (_cleanup_globfree_) */
3708 _cleanup_globfree_ glob_t pglob
= {};
3717 if (!path_is_absolute(fn
)) {
3725 /* Filename supports globbing, take all matching files */
3726 k
= safe_glob(fn
, 0, &pglob
);
3735 /* When we don't match anything, -ENOENT should be returned */
3736 assert(pglob
.gl_pathc
> 0);
/* Read every file the pattern expanded to */
3738 for (n
= 0; n
< pglob
.gl_pathc
; n
++) {
3739 k
= load_env_file(NULL
, pglob
.gl_pathv
[n
], NULL
, &p
);
3747 /* Log invalid environment variables with filename */
3749 InvalidEnvInfo info
= {
3751 .path
= pglob
.gl_pathv
[n
]
3754 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
/* Combine what was accumulated so far (r) with the assignments of this file (p) */
3762 m
= strv_env_merge(2, r
, p
);
/* Checks whether the given TTY may be the device behind /dev/console.
 *
 * The TTY name is compared without its /dev/ prefix. The result is true when the resolved console
 * has the same name, or when the console resolves to "tty0" and the TTY is a virtual console.
 *
 * NOTE(review): the results for the "console" identity case and for a failed resolution are not
 * part of the visible lines; the existing comments suggest both yield true -- confirm against the
 * complete file. */
3778 static bool tty_may_match_dev_console(const char *tty
) {
3779 _cleanup_free_
char *active
= NULL
;
3785 tty
= skip_dev_prefix(tty
);
3787 /* trivial identity? */
3788 if (streq(tty
, "console"))
3791 console
= resolve_dev_console(&active
);
3792 /* if we could not resolve, assume it may */
3796 /* "tty0" means the active VC, so it may be the same sometimes */
3797 return streq(console
, tty
) || (streq(console
, "tty0") && tty_is_vc(tty
));
/* Tells whether executing with this context may touch the console.
 *
 * True only when both hold:
 *   - the context does something with a terminal: among the visible conditions, a TTY reset, a VT
 *     disallocation, or a terminal used for standard input, output or error
 *   - the context's TTY path may refer to /dev/console, per tty_may_match_dev_console() */
3800 bool exec_context_may_touch_console(ExecContext
*ec
) {
3802 return (ec
->tty_reset
||
3804 ec
->tty_vt_disallocate
||
3805 is_terminal_input(ec
->std_input
) ||
3806 is_terminal_output(ec
->std_output
) ||
3807 is_terminal_output(ec
->std_error
)) &&
3808 tty_may_match_dev_console(exec_context_tty_path(ec
));
/* Writes a string to f, preceded by a single space.
 *
 * NOTE(review): the declaration of g and the loop over l are not part of the visible lines;
 * presumably this is done once per entry of the string list l -- confirm against the complete
 * file. */
3811 static void strv_fprintf(FILE *f
, char **l
) {
3817 fprintf(f
, " %s", *g
);
3820 void exec_context_dump(ExecContext
*c
, FILE* f
, const char *prefix
) {
3821 ExecDirectoryType dt
;
3829 prefix
= strempty(prefix
);
3833 "%sWorkingDirectory: %s\n"
3834 "%sRootDirectory: %s\n"
3835 "%sNonBlocking: %s\n"
3836 "%sPrivateTmp: %s\n"
3837 "%sPrivateDevices: %s\n"
3838 "%sProtectKernelTunables: %s\n"
3839 "%sProtectKernelModules: %s\n"
3840 "%sProtectControlGroups: %s\n"
3841 "%sPrivateNetwork: %s\n"
3842 "%sPrivateUsers: %s\n"
3843 "%sProtectHome: %s\n"
3844 "%sProtectSystem: %s\n"
3845 "%sMountAPIVFS: %s\n"
3846 "%sIgnoreSIGPIPE: %s\n"
3847 "%sMemoryDenyWriteExecute: %s\n"
3848 "%sRestrictRealtime: %s\n"
3849 "%sKeyringMode: %s\n",
3851 prefix
, c
->working_directory
? c
->working_directory
: "/",
3852 prefix
, c
->root_directory
? c
->root_directory
: "/",
3853 prefix
, yes_no(c
->non_blocking
),
3854 prefix
, yes_no(c
->private_tmp
),
3855 prefix
, yes_no(c
->private_devices
),
3856 prefix
, yes_no(c
->protect_kernel_tunables
),
3857 prefix
, yes_no(c
->protect_kernel_modules
),
3858 prefix
, yes_no(c
->protect_control_groups
),
3859 prefix
, yes_no(c
->private_network
),
3860 prefix
, yes_no(c
->private_users
),
3861 prefix
, protect_home_to_string(c
->protect_home
),
3862 prefix
, protect_system_to_string(c
->protect_system
),
3863 prefix
, yes_no(c
->mount_apivfs
),
3864 prefix
, yes_no(c
->ignore_sigpipe
),
3865 prefix
, yes_no(c
->memory_deny_write_execute
),
3866 prefix
, yes_no(c
->restrict_realtime
),
3867 prefix
, exec_keyring_mode_to_string(c
->keyring_mode
));
3870 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
3872 STRV_FOREACH(e
, c
->environment
)
3873 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
3875 STRV_FOREACH(e
, c
->environment_files
)
3876 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
3878 STRV_FOREACH(e
, c
->pass_environment
)
3879 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
3881 STRV_FOREACH(e
, c
->unset_environment
)
3882 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
3884 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
3886 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
3887 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
3889 STRV_FOREACH(d
, c
->directories
[dt
].paths
)
3890 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), *d
);
3898 if (c
->oom_score_adjust_set
)
3900 "%sOOMScoreAdjust: %i\n",
3901 prefix
, c
->oom_score_adjust
);
3903 for (i
= 0; i
< RLIM_NLIMITS
; i
++)
3905 fprintf(f
, "%s%s: " RLIM_FMT
"\n",
3906 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
3907 fprintf(f
, "%s%sSoft: " RLIM_FMT
"\n",
3908 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
3911 if (c
->ioprio_set
) {
3912 _cleanup_free_
char *class_str
= NULL
;
3914 r
= ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c
->ioprio
), &class_str
);
3916 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
3918 fprintf(f
, "%sIOPriority: %lu\n", prefix
, IOPRIO_PRIO_DATA(c
->ioprio
));
3921 if (c
->cpu_sched_set
) {
3922 _cleanup_free_
char *policy_str
= NULL
;
3924 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
3926 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
3929 "%sCPUSchedulingPriority: %i\n"
3930 "%sCPUSchedulingResetOnFork: %s\n",
3931 prefix
, c
->cpu_sched_priority
,
3932 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
3936 fprintf(f
, "%sCPUAffinity:", prefix
);
3937 for (i
= 0; i
< c
->cpuset_ncpus
; i
++)
3938 if (CPU_ISSET_S(i
, CPU_ALLOC_SIZE(c
->cpuset_ncpus
), c
->cpuset
))
3939 fprintf(f
, " %u", i
);
3943 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
3944 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
3947 "%sStandardInput: %s\n"
3948 "%sStandardOutput: %s\n"
3949 "%sStandardError: %s\n",
3950 prefix
, exec_input_to_string(c
->std_input
),
3951 prefix
, exec_output_to_string(c
->std_output
),
3952 prefix
, exec_output_to_string(c
->std_error
));
3958 "%sTTYVHangup: %s\n"
3959 "%sTTYVTDisallocate: %s\n",
3960 prefix
, c
->tty_path
,
3961 prefix
, yes_no(c
->tty_reset
),
3962 prefix
, yes_no(c
->tty_vhangup
),
3963 prefix
, yes_no(c
->tty_vt_disallocate
));
3965 if (IN_SET(c
->std_output
,
3968 EXEC_OUTPUT_JOURNAL
,
3969 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
3970 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
3971 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
3972 IN_SET(c
->std_error
,
3975 EXEC_OUTPUT_JOURNAL
,
3976 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
3977 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
3978 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
3980 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
3982 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
3984 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
3986 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
3988 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
3991 if (c
->log_level_max
>= 0) {
3992 _cleanup_free_
char *t
= NULL
;
3994 (void) log_level_to_string_alloc(c
->log_level_max
, &t
);
3996 fprintf(f
, "%sLogLevelMax: %s\n", prefix
, strna(t
));
3999 if (c
->n_log_extra_fields
> 0) {
4002 for (j
= 0; j
< c
->n_log_extra_fields
; j
++) {
4003 fprintf(f
, "%sLogExtraFields: ", prefix
);
4004 fwrite(c
->log_extra_fields
[j
].iov_base
,
4005 1, c
->log_extra_fields
[j
].iov_len
,
4011 if (c
->secure_bits
) {
4012 _cleanup_free_
char *str
= NULL
;
4014 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
4016 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
4019 if (c
->capability_bounding_set
!= CAP_ALL
) {
4020 _cleanup_free_
char *str
= NULL
;
4022 r
= capability_set_to_string_alloc(c
->capability_bounding_set
, &str
);
4024 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
4027 if (c
->capability_ambient_set
!= 0) {
4028 _cleanup_free_
char *str
= NULL
;
4030 r
= capability_set_to_string_alloc(c
->capability_ambient_set
, &str
);
4032 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
4036 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
4038 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
4040 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
4042 if (!strv_isempty(c
->supplementary_groups
)) {
4043 fprintf(f
, "%sSupplementaryGroups:", prefix
);
4044 strv_fprintf(f
, c
->supplementary_groups
);
4049 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
4051 if (strv_length(c
->read_write_paths
) > 0) {
4052 fprintf(f
, "%sReadWritePaths:", prefix
);
4053 strv_fprintf(f
, c
->read_write_paths
);
4057 if (strv_length(c
->read_only_paths
) > 0) {
4058 fprintf(f
, "%sReadOnlyPaths:", prefix
);
4059 strv_fprintf(f
, c
->read_only_paths
);
4063 if (strv_length(c
->inaccessible_paths
) > 0) {
4064 fprintf(f
, "%sInaccessiblePaths:", prefix
);
4065 strv_fprintf(f
, c
->inaccessible_paths
);
4069 if (c
->n_bind_mounts
> 0)
4070 for (i
= 0; i
< c
->n_bind_mounts
; i
++) {
4071 fprintf(f
, "%s%s: %s:%s:%s\n", prefix
,
4072 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
4073 c
->bind_mounts
[i
].source
,
4074 c
->bind_mounts
[i
].destination
,
4075 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
4080 "%sUtmpIdentifier: %s\n",
4081 prefix
, c
->utmp_id
);
4083 if (c
->selinux_context
)
4085 "%sSELinuxContext: %s%s\n",
4086 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
4088 if (c
->apparmor_profile
)
4090 "%sAppArmorProfile: %s%s\n",
4091 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
4093 if (c
->smack_process_label
)
4095 "%sSmackProcessLabel: %s%s\n",
4096 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
4098 if (c
->personality
!= PERSONALITY_INVALID
)
4100 "%sPersonality: %s\n",
4101 prefix
, strna(personality_to_string(c
->personality
)));
4104 "%sLockPersonality: %s\n",
4105 prefix
, yes_no(c
->lock_personality
));
4107 if (c
->syscall_filter
) {
4115 "%sSystemCallFilter: ",
4118 if (!c
->syscall_whitelist
)
4122 HASHMAP_FOREACH_KEY(val
, id
, c
->syscall_filter
, j
) {
4123 _cleanup_free_
char *name
= NULL
;
4124 const char *errno_name
= NULL
;
4125 int num
= PTR_TO_INT(val
);
4132 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
4133 fputs(strna(name
), f
);
4136 errno_name
= errno_to_name(num
);
4138 fprintf(f
, ":%s", errno_name
);
4140 fprintf(f
, ":%d", num
);
4148 if (c
->syscall_archs
) {
4155 "%sSystemCallArchitectures:",
4159 SET_FOREACH(id
, c
->syscall_archs
, j
)
4160 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
4165 if (exec_context_restrict_namespaces_set(c
)) {
4166 _cleanup_free_
char *s
= NULL
;
4168 r
= namespace_flag_to_string_many(c
->restrict_namespaces
, &s
);
4170 fprintf(f
, "%sRestrictNamespaces: %s\n",
4174 if (c
->syscall_errno
> 0) {
4175 const char *errno_name
;
4177 fprintf(f
, "%sSystemCallErrorNumber: ", prefix
);
4179 errno_name
= errno_to_name(c
->syscall_errno
);
4181 fprintf(f
, "%s\n", errno_name
);
4183 fprintf(f
, "%d\n", c
->syscall_errno
);
4186 if (c
->apparmor_profile
)
4188 "%sAppArmorProfile: %s%s\n",
4189 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
4192 bool exec_context_maintains_privileges(ExecContext
*c
) {
4195 /* Returns true if the process forked off would run under
4196 * an unchanged UID or as root. */
4201 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
4207 int exec_context_get_effective_ioprio(ExecContext
*c
) {
4215 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
4217 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 4);
4222 void exec_context_free_log_extra_fields(ExecContext
*c
) {
4227 for (l
= 0; l
< c
->n_log_extra_fields
; l
++)
4228 free(c
->log_extra_fields
[l
].iov_base
);
4229 c
->log_extra_fields
= mfree(c
->log_extra_fields
);
4230 c
->n_log_extra_fields
= 0;
4233 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
4238 dual_timestamp_get(&s
->start_timestamp
);
4241 void exec_status_exit(ExecStatus
*s
, ExecContext
*context
, pid_t pid
, int code
, int status
) {
4244 if (s
->pid
&& s
->pid
!= pid
)
4248 dual_timestamp_get(&s
->exit_timestamp
);
4254 if (context
->utmp_id
)
4255 utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
4257 exec_context_tty_reset(context
, NULL
);
4261 void exec_status_dump(ExecStatus
*s
, FILE *f
, const char *prefix
) {
4262 char buf
[FORMAT_TIMESTAMP_MAX
];
4270 prefix
= strempty(prefix
);
4273 "%sPID: "PID_FMT
"\n",
4276 if (dual_timestamp_is_set(&s
->start_timestamp
))
4278 "%sStart Timestamp: %s\n",
4279 prefix
, format_timestamp(buf
, sizeof(buf
), s
->start_timestamp
.realtime
));
4281 if (dual_timestamp_is_set(&s
->exit_timestamp
))
4283 "%sExit Timestamp: %s\n"
4285 "%sExit Status: %i\n",
4286 prefix
, format_timestamp(buf
, sizeof(buf
), s
->exit_timestamp
.realtime
),
4287 prefix
, sigchld_code_to_string(s
->code
),
/* Builds a command line string from the argument vector argv; exec_command_dump() prints the
 * result after "Command Line:" and treats a NULL return as out-of-memory.
 *
 * NOTE(review): only fragments of the body are visible here: two STRV_FOREACH passes over argv and
 * a strpbrk() test of each argument against WHITESPACE (presumably to decide whether the argument
 * gets quoted — confirm against the complete function). The FIXME below documents a known
 * limitation of the result. */
4291 char *exec_command_line(char **argv
) {
4299 STRV_FOREACH(a
, argv
)
4307 STRV_FOREACH(a
, argv
) {
4314 if (strpbrk(*a
, WHITESPACE
)) {
4325 /* FIXME: this doesn't really handle arguments that have
4326 * spaces and ticks in them */
4331 void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4332 _cleanup_free_
char *cmd
= NULL
;
4333 const char *prefix2
;
4338 prefix
= strempty(prefix
);
4339 prefix2
= strjoina(prefix
, "\t");
4341 cmd
= exec_command_line(c
->argv
);
4343 "%sCommand Line: %s\n",
4344 prefix
, cmd
? cmd
: strerror(ENOMEM
));
4346 exec_status_dump(&c
->exec_status
, f
, prefix2
);
4349 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4352 prefix
= strempty(prefix
);
4354 LIST_FOREACH(command
, c
, c
)
4355 exec_command_dump(c
, f
, prefix
);
4358 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
4365 /* It's kind of important, that we keep the order here */
4366 LIST_FIND_TAIL(command
, *l
, end
);
4367 LIST_INSERT_AFTER(command
, *l
, end
, e
);
/* Variadic setter for an ExecCommand: path is followed by further arguments. The visible part of
 * the body collects path and the variadic arguments into a string vector via strv_new_ap().
 *
 * NOTE(review): the rest of the body is not visible here; how c is updated, who owns the
 * resulting vector and which error codes are returned must be confirmed against the complete
 * function. */
4372 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
4380 l
= strv_new_ap(path
, ap
);
4401 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
4402 _cleanup_strv_free_
char **l
= NULL
;
4410 l
= strv_new_ap(path
, ap
);
4416 r
= strv_extend_strv(&c
->argv
, l
, false);
4424 static int exec_runtime_allocate(ExecRuntime
**rt
) {
4429 *rt
= new0(ExecRuntime
, 1);
4434 (*rt
)->netns_storage_socket
[0] = (*rt
)->netns_storage_socket
[1] = -1;
4439 int exec_runtime_make(ExecRuntime
**rt
, ExecContext
*c
, const char *id
) {
4449 if (!c
->private_network
&& !c
->private_tmp
)
4452 r
= exec_runtime_allocate(rt
);
4456 if (c
->private_network
&& (*rt
)->netns_storage_socket
[0] < 0) {
4457 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, (*rt
)->netns_storage_socket
) < 0)
4461 if (c
->private_tmp
&& !(*rt
)->tmp_dir
) {
4462 r
= setup_tmp_dirs(id
, &(*rt
)->tmp_dir
, &(*rt
)->var_tmp_dir
);
4470 ExecRuntime
*exec_runtime_ref(ExecRuntime
*r
) {
4472 assert(r
->n_ref
> 0);
4478 ExecRuntime
*exec_runtime_unref(ExecRuntime
*r
) {
4483 assert(r
->n_ref
> 0);
4490 free(r
->var_tmp_dir
);
4491 safe_close_pair(r
->netns_storage_socket
);
4495 int exec_runtime_serialize(Unit
*u
, ExecRuntime
*rt
, FILE *f
, FDSet
*fds
) {
4504 unit_serialize_item(u
, f
, "tmp-dir", rt
->tmp_dir
);
4506 if (rt
->var_tmp_dir
)
4507 unit_serialize_item(u
, f
, "var-tmp-dir", rt
->var_tmp_dir
);
4509 if (rt
->netns_storage_socket
[0] >= 0) {
4512 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
4516 unit_serialize_item_format(u
, f
, "netns-socket-0", "%i", copy
);
4519 if (rt
->netns_storage_socket
[1] >= 0) {
4522 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
4526 unit_serialize_item_format(u
, f
, "netns-socket-1", "%i", copy
);
4532 int exec_runtime_deserialize_item(Unit
*u
, ExecRuntime
**rt
, const char *key
, const char *value
, FDSet
*fds
) {
4539 if (streq(key
, "tmp-dir")) {
4542 r
= exec_runtime_allocate(rt
);
4546 copy
= strdup(value
);
4550 free((*rt
)->tmp_dir
);
4551 (*rt
)->tmp_dir
= copy
;
4553 } else if (streq(key
, "var-tmp-dir")) {
4556 r
= exec_runtime_allocate(rt
);
4560 copy
= strdup(value
);
4564 free((*rt
)->var_tmp_dir
);
4565 (*rt
)->var_tmp_dir
= copy
;
4567 } else if (streq(key
, "netns-socket-0")) {
4570 r
= exec_runtime_allocate(rt
);
4574 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
))
4575 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
4577 safe_close((*rt
)->netns_storage_socket
[0]);
4578 (*rt
)->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
4580 } else if (streq(key
, "netns-socket-1")) {
4583 r
= exec_runtime_allocate(rt
);
4587 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
))
4588 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
4590 safe_close((*rt
)->netns_storage_socket
[1]);
4591 (*rt
)->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
4599 static void *remove_tmpdir_thread(void *p
) {
4600 _cleanup_free_
char *path
= p
;
4602 (void) rm_rf(path
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
4606 void exec_runtime_destroy(ExecRuntime
*rt
) {
4612 /* If there are multiple users of this, let's leave the stuff around */
4617 log_debug("Spawning thread to nuke %s", rt
->tmp_dir
);
4619 r
= asynchronous_job(remove_tmpdir_thread
, rt
->tmp_dir
);
4621 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->tmp_dir
);
4628 if (rt
->var_tmp_dir
) {
4629 log_debug("Spawning thread to nuke %s", rt
->var_tmp_dir
);
4631 r
= asynchronous_job(remove_tmpdir_thread
, rt
->var_tmp_dir
);
4633 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->var_tmp_dir
);
4634 free(rt
->var_tmp_dir
);
4637 rt
->var_tmp_dir
= NULL
;
4640 safe_close_pair(rt
->netns_storage_socket
);
4643 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
4644 [EXEC_INPUT_NULL
] = "null",
4645 [EXEC_INPUT_TTY
] = "tty",
4646 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
4647 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
4648 [EXEC_INPUT_SOCKET
] = "socket",
4649 [EXEC_INPUT_NAMED_FD
] = "fd",
4650 [EXEC_INPUT_DATA
] = "data",
4653 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
4655 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
4656 [EXEC_OUTPUT_INHERIT
] = "inherit",
4657 [EXEC_OUTPUT_NULL
] = "null",
4658 [EXEC_OUTPUT_TTY
] = "tty",
4659 [EXEC_OUTPUT_SYSLOG
] = "syslog",
4660 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE
] = "syslog+console",
4661 [EXEC_OUTPUT_KMSG
] = "kmsg",
4662 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
4663 [EXEC_OUTPUT_JOURNAL
] = "journal",
4664 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
4665 [EXEC_OUTPUT_SOCKET
] = "socket",
4666 [EXEC_OUTPUT_NAMED_FD
] = "fd",
4669 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
4671 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
4672 [EXEC_UTMP_INIT
] = "init",
4673 [EXEC_UTMP_LOGIN
] = "login",
4674 [EXEC_UTMP_USER
] = "user",
4677 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
4679 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
4680 [EXEC_PRESERVE_NO
] = "no",
4681 [EXEC_PRESERVE_YES
] = "yes",
4682 [EXEC_PRESERVE_RESTART
] = "restart",
4685 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
4687 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
4688 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
4689 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
4690 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
4691 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
4692 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
4695 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);
4697 static const char* const exec_keyring_mode_table
[_EXEC_KEYRING_MODE_MAX
] = {
4698 [EXEC_KEYRING_INHERIT
] = "inherit",
4699 [EXEC_KEYRING_PRIVATE
] = "private",
4700 [EXEC_KEYRING_SHARED
] = "shared",
4703 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode
, ExecKeyringMode
);