1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2010 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
28 #include <sys/capability.h>
29 #include <sys/eventfd.h>
31 #include <sys/personality.h>
32 #include <sys/prctl.h>
34 #include <sys/socket.h>
36 #include <sys/types.h>
42 #include <security/pam_appl.h>
46 #include <selinux/selinux.h>
54 #include <sys/apparmor.h>
57 #include "sd-messages.h"
60 #include "alloc-util.h"
62 #include "apparmor-util.h"
67 #include "capability-util.h"
68 #include "chown-recursive.h"
71 #include "errno-list.h"
73 #include "exit-status.h"
76 #include "format-util.h"
78 #include "glob-util.h"
86 #include "namespace.h"
87 #include "parse-util.h"
88 #include "path-util.h"
89 #include "process-util.h"
90 #include "rlimit-util.h"
93 #include "seccomp-util.h"
95 #include "securebits.h"
96 #include "securebits-util.h"
97 #include "selinux-util.h"
98 #include "signal-util.h"
99 #include "smack-util.h"
101 #include "string-table.h"
102 #include "string-util.h"
104 #include "syslog-util.h"
105 #include "terminal-util.h"
107 #include "user-util.h"
109 #include "utmp-wtmp.h"
111 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC) /* Timeout of the first wait for POLLHUP on the idle pipe in do_idle_pipe_dance() */
112 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC) /* Timeout of the second wait in do_idle_pipe_dance(), after we signalled that we are bored */
114 /* Mode set with fchmod() in chown_terminal(): owner read/write, group write-only. This assumes there is a 'tty' group */
115 #define TTY_MODE 0620
117 #define SNDBUF_SIZE (8*1024*1024) /* Passed to fd_inc_sndbuf() for the journal stream socket in connect_logger_as(); presumably bytes, i.e. 8 MiB -- TODO confirm */
119 static int shift_fds(int fds
[], unsigned n_fds
) {
120 int start
, restart_from
;
125 /* Modifies the fds array! (sorts it) */
135 for (i
= start
; i
< (int) n_fds
; i
++) {
138 /* Already at right index? */
142 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
149 /* Hmm, the fd we wanted isn't free? Then
150 * let's remember that and try again from here */
151 if (nfd
!= i
+3 && restart_from
< 0)
155 if (restart_from
< 0)
158 start
= restart_from
;
164 static int flags_fds(const int fds
[], unsigned n_storage_fds
, unsigned n_socket_fds
, bool nonblock
) {
168 n_fds
= n_storage_fds
+ n_socket_fds
;
174 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
175 * O_NONBLOCK only applies to socket activation though. */
177 for (i
= 0; i
< n_fds
; i
++) {
179 if (i
< n_socket_fds
) {
180 r
= fd_nonblock(fds
[i
], nonblock
);
185 /* We unconditionally drop FD_CLOEXEC from the fds,
186 * since after all we want to pass these fds to our
189 r
= fd_cloexec(fds
[i
], false);
197 static const char *exec_context_tty_path(const ExecContext
*context
) {
200 if (context
->stdio_as_fds
)
203 if (context
->tty_path
)
204 return context
->tty_path
;
206 return "/dev/console";
209 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
214 path
= exec_context_tty_path(context
);
216 if (context
->tty_vhangup
) {
217 if (p
&& p
->stdin_fd
>= 0)
218 (void) terminal_vhangup_fd(p
->stdin_fd
);
220 (void) terminal_vhangup(path
);
223 if (context
->tty_reset
) {
224 if (p
&& p
->stdin_fd
>= 0)
225 (void) reset_terminal_fd(p
->stdin_fd
, true);
227 (void) reset_terminal(path
);
230 if (context
->tty_vt_disallocate
&& path
)
231 (void) vt_disallocate(path
);
234 static bool is_terminal_input(ExecInput i
) {
237 EXEC_INPUT_TTY_FORCE
,
238 EXEC_INPUT_TTY_FAIL
);
241 static bool is_terminal_output(ExecOutput o
) {
244 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
245 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
246 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
249 static bool is_syslog_output(ExecOutput o
) {
252 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
);
255 static bool is_kmsg_output(ExecOutput o
) {
258 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
261 static bool exec_context_needs_term(const ExecContext
*c
) {
264 /* Return true if the execution context suggests we should set $TERM to something useful. */
266 if (is_terminal_input(c
->std_input
))
269 if (is_terminal_output(c
->std_output
))
272 if (is_terminal_output(c
->std_error
))
275 return !!c
->tty_path
;
278 static int open_null_as(int flags
, int nfd
) {
283 fd
= open("/dev/null", flags
|O_NOCTTY
);
287 return move_fd(fd
, nfd
, false);
290 static int connect_journal_socket(int fd
, uid_t uid
, gid_t gid
) {
291 static const union sockaddr_union sa
= {
292 .un
.sun_family
= AF_UNIX
,
293 .un
.sun_path
= "/run/systemd/journal/stdout",
295 uid_t olduid
= UID_INVALID
;
296 gid_t oldgid
= GID_INVALID
;
299 if (gid_is_valid(gid
)) {
302 if (setegid(gid
) < 0)
306 if (uid_is_valid(uid
)) {
309 if (seteuid(uid
) < 0) {
315 r
= connect(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
)) < 0 ? -errno
: 0;
317 /* If we fail to restore the uid or gid, things will likely
318 fail later on. This should only happen if an LSM interferes. */
320 if (uid_is_valid(uid
))
321 (void) seteuid(olduid
);
324 if (gid_is_valid(gid
))
325 (void) setegid(oldgid
);
330 static int connect_logger_as(
332 const ExecContext
*context
,
333 const ExecParameters
*params
,
344 assert(output
< _EXEC_OUTPUT_MAX
);
348 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
352 r
= connect_journal_socket(fd
, uid
, gid
);
356 if (shutdown(fd
, SHUT_RD
) < 0) {
361 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
371 context
->syslog_identifier
?: ident
,
372 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
373 context
->syslog_priority
,
374 !!context
->syslog_level_prefix
,
375 is_syslog_output(output
),
376 is_kmsg_output(output
),
377 is_terminal_output(output
));
379 return move_fd(fd
, nfd
, false);
381 static int open_terminal_as(const char *path
, int flags
, int nfd
) {
387 fd
= open_terminal(path
, flags
| O_NOCTTY
);
391 return move_fd(fd
, nfd
, false);
394 static int acquire_path(const char *path
, int flags
, mode_t mode
) {
395 union sockaddr_union sa
= {
396 .sa
.sa_family
= AF_UNIX
,
402 if (IN_SET(flags
& O_ACCMODE
, O_WRONLY
, O_RDWR
))
405 fd
= open(path
, flags
|O_NOCTTY
, mode
);
409 if (errno
!= ENXIO
) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
411 if (strlen(path
) > sizeof(sa
.un
.sun_path
)) /* Too long, can't be a UNIX socket */
414 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
416 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
420 strncpy(sa
.un
.sun_path
, path
, sizeof(sa
.un
.sun_path
));
421 if (connect(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
)) < 0) {
423 return errno
== EINVAL
? -ENXIO
: -errno
; /* Propagate initial error if we get EINVAL, i.e. we have
424 * indication that this wasn't an AF_UNIX socket after all */
427 if ((flags
& O_ACCMODE
) == O_RDONLY
)
428 r
= shutdown(fd
, SHUT_WR
);
429 else if ((flags
& O_ACCMODE
) == O_WRONLY
)
430 r
= shutdown(fd
, SHUT_RD
);
441 static int fixup_input(
442 const ExecContext
*context
,
444 bool apply_tty_stdin
) {
450 std_input
= context
->std_input
;
452 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
453 return EXEC_INPUT_NULL
;
455 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
456 return EXEC_INPUT_NULL
;
458 if (std_input
== EXEC_INPUT_DATA
&& context
->stdin_data_size
== 0)
459 return EXEC_INPUT_NULL
;
464 static int fixup_output(ExecOutput std_output
, int socket_fd
) {
466 if (std_output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
467 return EXEC_OUTPUT_INHERIT
;
472 static int setup_input(
473 const ExecContext
*context
,
474 const ExecParameters
*params
,
476 int named_iofds
[3]) {
483 if (params
->stdin_fd
>= 0) {
484 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
487 /* Try to make this the controlling tty, if it is a tty, and reset it */
488 if (isatty(STDIN_FILENO
)) {
489 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
490 (void) reset_terminal_fd(STDIN_FILENO
, true);
496 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
500 case EXEC_INPUT_NULL
:
501 return open_null_as(O_RDONLY
, STDIN_FILENO
);
504 case EXEC_INPUT_TTY_FORCE
:
505 case EXEC_INPUT_TTY_FAIL
: {
508 fd
= acquire_terminal(exec_context_tty_path(context
),
509 i
== EXEC_INPUT_TTY_FAIL
,
510 i
== EXEC_INPUT_TTY_FORCE
,
516 return move_fd(fd
, STDIN_FILENO
, false);
519 case EXEC_INPUT_SOCKET
:
520 assert(socket_fd
>= 0);
522 return dup2(socket_fd
, STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
524 case EXEC_INPUT_NAMED_FD
:
525 assert(named_iofds
[STDIN_FILENO
] >= 0);
527 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
528 return dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
) < 0 ? -errno
: STDIN_FILENO
;
530 case EXEC_INPUT_DATA
: {
533 fd
= acquire_data_fd(context
->stdin_data
, context
->stdin_data_size
, 0);
537 return move_fd(fd
, STDIN_FILENO
, false);
540 case EXEC_INPUT_FILE
: {
544 assert(context
->stdio_file
[STDIN_FILENO
]);
546 rw
= (context
->std_output
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDOUT_FILENO
])) ||
547 (context
->std_error
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDERR_FILENO
]));
549 fd
= acquire_path(context
->stdio_file
[STDIN_FILENO
], rw
? O_RDWR
: O_RDONLY
, 0666 & ~context
->umask
);
553 return move_fd(fd
, STDIN_FILENO
, false);
557 assert_not_reached("Unknown input type");
561 static int setup_output(
563 const ExecContext
*context
,
564 const ExecParameters
*params
,
571 dev_t
*journal_stream_dev
,
572 ino_t
*journal_stream_ino
) {
582 assert(journal_stream_dev
);
583 assert(journal_stream_ino
);
585 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
587 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
590 return STDOUT_FILENO
;
593 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
594 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
597 return STDERR_FILENO
;
600 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
601 o
= fixup_output(context
->std_output
, socket_fd
);
603 if (fileno
== STDERR_FILENO
) {
605 e
= fixup_output(context
->std_error
, socket_fd
);
607 /* This expects that the input and output are already set up */
609 /* Don't change the stderr file descriptor if we inherit all
610 * the way and are not on a tty */
611 if (e
== EXEC_OUTPUT_INHERIT
&&
612 o
== EXEC_OUTPUT_INHERIT
&&
613 i
== EXEC_INPUT_NULL
&&
614 !is_terminal_input(context
->std_input
) &&
618 /* Duplicate from stdout if possible */
619 if ((e
== o
&& e
!= EXEC_OUTPUT_NAMED_FD
) || e
== EXEC_OUTPUT_INHERIT
)
620 return dup2(STDOUT_FILENO
, fileno
) < 0 ? -errno
: fileno
;
624 } else if (o
== EXEC_OUTPUT_INHERIT
) {
625 /* If input got downgraded, inherit the original value */
626 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
627 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
629 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
630 if (!IN_SET(i
, EXEC_INPUT_NULL
, EXEC_INPUT_DATA
))
631 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
633 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
637 /* We need to open /dev/null here anew, to get the right access mode. */
638 return open_null_as(O_WRONLY
, fileno
);
643 case EXEC_OUTPUT_NULL
:
644 return open_null_as(O_WRONLY
, fileno
);
646 case EXEC_OUTPUT_TTY
:
647 if (is_terminal_input(i
))
648 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
650 /* We don't reset the terminal if this is just about output */
651 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
653 case EXEC_OUTPUT_SYSLOG
:
654 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE
:
655 case EXEC_OUTPUT_KMSG
:
656 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
657 case EXEC_OUTPUT_JOURNAL
:
658 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
659 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
661 log_unit_warning_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m", fileno
== STDOUT_FILENO
? "stdout" : "stderr");
662 r
= open_null_as(O_WRONLY
, fileno
);
666 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
667 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
668 * services to detect whether they are connected to the journal or not.
670 * If both stdout and stderr are connected to a stream then let's make sure to store the data
671 * about STDERR as that's usually the best way to do logging. */
673 if (fstat(fileno
, &st
) >= 0 &&
674 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
675 *journal_stream_dev
= st
.st_dev
;
676 *journal_stream_ino
= st
.st_ino
;
681 case EXEC_OUTPUT_SOCKET
:
682 assert(socket_fd
>= 0);
684 return dup2(socket_fd
, fileno
) < 0 ? -errno
: fileno
;
686 case EXEC_OUTPUT_NAMED_FD
:
687 assert(named_iofds
[fileno
] >= 0);
689 (void) fd_nonblock(named_iofds
[fileno
], false);
690 return dup2(named_iofds
[fileno
], fileno
) < 0 ? -errno
: fileno
;
692 case EXEC_OUTPUT_FILE
: {
696 assert(context
->stdio_file
[fileno
]);
698 rw
= context
->std_input
== EXEC_INPUT_FILE
&&
699 streq_ptr(context
->stdio_file
[fileno
], context
->stdio_file
[STDIN_FILENO
]);
702 return dup2(STDIN_FILENO
, fileno
) < 0 ? -errno
: fileno
;
704 fd
= acquire_path(context
->stdio_file
[fileno
], O_WRONLY
, 0666 & ~context
->umask
);
708 return move_fd(fd
, fileno
, false);
712 assert_not_reached("Unknown error type");
716 static int chown_terminal(int fd
, uid_t uid
) {
721 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
725 /* This might fail. What matters are the results. */
726 (void) fchown(fd
, uid
, -1);
727 (void) fchmod(fd
, TTY_MODE
);
729 if (fstat(fd
, &st
) < 0)
732 if (st
.st_uid
!= uid
|| (st
.st_mode
& 0777) != TTY_MODE
)
738 static int setup_confirm_stdio(const char *vc
, int *_saved_stdin
, int *_saved_stdout
) {
739 _cleanup_close_
int fd
= -1, saved_stdin
= -1, saved_stdout
= -1;
742 assert(_saved_stdin
);
743 assert(_saved_stdout
);
745 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
749 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
750 if (saved_stdout
< 0)
753 fd
= acquire_terminal(vc
, false, false, false, DEFAULT_CONFIRM_USEC
);
757 r
= chown_terminal(fd
, getuid());
761 r
= reset_terminal_fd(fd
, true);
765 if (dup2(fd
, STDIN_FILENO
) < 0)
768 if (dup2(fd
, STDOUT_FILENO
) < 0)
775 *_saved_stdin
= saved_stdin
;
776 *_saved_stdout
= saved_stdout
;
778 saved_stdin
= saved_stdout
= -1;
783 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
786 if (err
== -ETIMEDOUT
)
787 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
790 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
794 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
795 _cleanup_close_
int fd
= -1;
799 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
803 write_confirm_error_fd(err
, fd
, u
);
806 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
810 assert(saved_stdout
);
814 if (*saved_stdin
>= 0)
815 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
818 if (*saved_stdout
>= 0)
819 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
822 *saved_stdin
= safe_close(*saved_stdin
);
823 *saved_stdout
= safe_close(*saved_stdout
);
829 CONFIRM_PRETEND_FAILURE
= -1,
830 CONFIRM_PRETEND_SUCCESS
= 0,
834 static int ask_for_confirmation(const char *vc
, Unit
*u
, const char *cmdline
) {
835 int saved_stdout
= -1, saved_stdin
= -1, r
;
836 _cleanup_free_
char *e
= NULL
;
839 /* For any internal errors, assume a positive response. */
840 r
= setup_confirm_stdio(vc
, &saved_stdin
, &saved_stdout
);
842 write_confirm_error(r
, vc
, u
);
843 return CONFIRM_EXECUTE
;
846 /* confirm_spawn might have been disabled while we were sleeping. */
847 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
852 e
= ellipsize(cmdline
, 60, 100);
860 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
862 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
869 printf("Resuming normal execution.\n");
870 manager_disable_confirm_spawn();
874 unit_dump(u
, stdout
, " ");
875 continue; /* ask again */
877 printf("Failing execution.\n");
878 r
= CONFIRM_PRETEND_FAILURE
;
881 printf(" c - continue, proceed without asking anymore\n"
882 " D - dump, show the state of the unit\n"
883 " f - fail, don't execute the command and pretend it failed\n"
885 " i - info, show a short summary of the unit\n"
886 " j - jobs, show jobs that are in progress\n"
887 " s - skip, don't execute the command and pretend it succeeded\n"
888 " y - yes, execute the command\n");
889 continue; /* ask again */
891 printf(" Description: %s\n"
894 u
->id
, u
->description
, cmdline
);
895 continue; /* ask again */
897 manager_dump_jobs(u
->manager
, stdout
, " ");
898 continue; /* ask again */
900 /* 'n' was removed in favor of 'f'. */
901 printf("Didn't understand 'n', did you mean 'f'?\n");
902 continue; /* ask again */
904 printf("Skipping execution.\n");
905 r
= CONFIRM_PRETEND_SUCCESS
;
911 assert_not_reached("Unhandled choice");
917 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
921 static int get_fixed_user(const ExecContext
*c
, const char **user
,
922 uid_t
*uid
, gid_t
*gid
,
923 const char **home
, const char **shell
) {
932 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
933 * (i.e. are "/" or "/bin/nologin"). */
936 r
= get_user_creds_clean(&name
, uid
, gid
, home
, shell
);
944 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
954 r
= get_group_creds(&name
, gid
);
962 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
963 const char *group
, gid_t gid
,
964 gid_t
**supplementary_gids
, int *ngids
) {
968 bool keep_groups
= false;
969 gid_t
*groups
= NULL
;
970 _cleanup_free_ gid_t
*l_gids
= NULL
;
975 * If user is given, then lookup GID and supplementary groups list.
976 * We avoid NSS lookups for gid=0. Also we have to initialize groups
977 * here and as early as possible so we keep the list of supplementary
978 * groups of the caller.
980 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
981 /* First step, initialize groups from /etc/group */
982 if (initgroups(user
, gid
) < 0)
988 if (strv_isempty(c
->supplementary_groups
))
992 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
993 * be positive, otherwise fail.
996 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
997 if (ngroups_max
<= 0) {
1001 return -EOPNOTSUPP
; /* For all other values */
1004 l_gids
= new(gid_t
, ngroups_max
);
1010 * Lookup the list of groups that the user belongs to, we
1011 * avoid NSS lookups here too for gid=0.
1014 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
1019 STRV_FOREACH(i
, c
->supplementary_groups
) {
1022 if (k
>= ngroups_max
)
1026 r
= get_group_creds(&g
, l_gids
+k
);
1034 * Sets ngids to zero to drop all supplementary groups, happens
1035 * when we are under root and SupplementaryGroups= is empty.
1042 /* Otherwise get the final list of supplementary groups */
1043 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
1047 *supplementary_gids
= groups
;
1055 static int enforce_groups(gid_t gid
, gid_t
*supplementary_gids
, int ngids
) {
1058 /* Handle SupplementaryGroups= if it is not empty */
1060 r
= maybe_setgroups(ngids
, supplementary_gids
);
1065 if (gid_is_valid(gid
)) {
1066 /* Then set our gids */
1067 if (setresgid(gid
, gid
, gid
) < 0)
1074 static int enforce_user(const ExecContext
*context
, uid_t uid
) {
1077 if (!uid_is_valid(uid
))
1080 /* Sets (but doesn't look up) the uid and makes sure we keep the
1081 * capabilities while doing so. */
1083 if (context
->capability_ambient_set
!= 0) {
1085 /* First step: If we need to keep capabilities but
1086 * drop privileges we need to make sure we keep our
1087 * caps, while we drop privileges. */
1089 int sb
= context
->secure_bits
| 1<<SECURE_KEEP_CAPS
;
1091 if (prctl(PR_GET_SECUREBITS
) != sb
)
1092 if (prctl(PR_SET_SECUREBITS
, sb
) < 0)
1097 /* Second step: actually set the uids */
1098 if (setresuid(uid
, uid
, uid
) < 0)
1101 /* At this point we should have all necessary capabilities but
1102 are otherwise a normal user. However, the caps might have got
1103 corrupted due to the setresuid(), so we need to clean them up
1104 later. This is done outside of this call. */
1111 static int null_conv(
1113 const struct pam_message
**msg
,
1114 struct pam_response
**resp
,
1115 void *appdata_ptr
) {
1117 /* We don't support conversations */
1119 return PAM_CONV_ERR
;
1124 static int setup_pam(
1131 int fds
[], unsigned n_fds
) {
1135 static const struct pam_conv conv
= {
1140 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1141 pam_handle_t
*handle
= NULL
;
1143 int pam_code
= PAM_SUCCESS
, r
;
1144 char **nv
, **e
= NULL
;
1145 bool close_session
= false;
1146 pid_t pam_pid
= 0, parent_pid
;
1153 /* We set up PAM in the parent process, then fork. The child
1154 * will then stay around until killed via PR_SET_PDEATHSIG or
1155 * systemd via the cgroup logic. It will then remove the PAM
1156 * session again. The parent process will exec() the actual
1157 * daemon. We do things this way to ensure that the main PID
1158 * of the daemon is the one we initially fork()ed. */
1160 r
= barrier_create(&barrier
);
1164 if (log_get_max_level() < LOG_DEBUG
)
1165 flags
|= PAM_SILENT
;
1167 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1168 if (pam_code
!= PAM_SUCCESS
) {
1174 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1175 if (pam_code
!= PAM_SUCCESS
)
1179 STRV_FOREACH(nv
, *env
) {
1180 pam_code
= pam_putenv(handle
, *nv
);
1181 if (pam_code
!= PAM_SUCCESS
)
1185 pam_code
= pam_acct_mgmt(handle
, flags
);
1186 if (pam_code
!= PAM_SUCCESS
)
1189 pam_code
= pam_open_session(handle
, flags
);
1190 if (pam_code
!= PAM_SUCCESS
)
1193 close_session
= true;
1195 e
= pam_getenvlist(handle
);
1197 pam_code
= PAM_BUF_ERR
;
1201 /* Block SIGTERM, so that we know that it won't get lost in
1204 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1206 parent_pid
= getpid_cached();
1215 int sig
, ret
= EXIT_PAM
;
1217 /* The child's job is to reset the PAM session on
1219 barrier_set_role(&barrier
, BARRIER_CHILD
);
1221 /* This string must fit in 10 chars (i.e. the length
1222 * of "/sbin/init"), to look pretty in /bin/ps */
1223 rename_process("(sd-pam)");
1225 /* Make sure we don't keep open the passed fds in this
1226 child. We assume that otherwise only those fds are
1227 open here that have been opened by PAM. */
1228 close_many(fds
, n_fds
);
1230 /* Drop privileges - we don't need any to call pam_close_session()
1231 * and this will make PR_SET_PDEATHSIG work in most cases.
1232 * If this fails, ignore the error - but expect sd-pam threads
1233 * to fail to exit normally */
1235 r
= maybe_setgroups(0, NULL
);
1237 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1238 if (setresgid(gid
, gid
, gid
) < 0)
1239 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1240 if (setresuid(uid
, uid
, uid
) < 0)
1241 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1243 (void) ignore_signals(SIGPIPE
, -1);
1245 /* Wait until our parent died. This will only work if
1246 * the above setresuid() succeeds, otherwise the kernel
1247 * will not allow unprivileged parents to kill their privileged
1248 * children this way. We rely on the control groups kill logic
1249 * to do the rest for us. */
1250 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1253 /* Tell the parent that our setup is done. This is especially
1254 * important regarding dropping privileges. Otherwise, unit
1255 * setup might race against our setresuid(2) call.
1257 * If the parent aborted, we'll detect this below, hence ignore
1258 * return failure here. */
1259 (void) barrier_place(&barrier
);
1261 /* Check if our parent process might already have died? */
1262 if (getppid() == parent_pid
) {
1265 assert_se(sigemptyset(&ss
) >= 0);
1266 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1269 if (sigwait(&ss
, &sig
) < 0) {
1276 assert(sig
== SIGTERM
);
1281 /* If our parent died we'll end the session */
1282 if (getppid() != parent_pid
) {
1283 pam_code
= pam_close_session(handle
, flags
);
1284 if (pam_code
!= PAM_SUCCESS
)
1291 pam_end(handle
, pam_code
| flags
);
1295 barrier_set_role(&barrier
, BARRIER_PARENT
);
1297 /* If the child was forked off successfully it will do all the
1298 * cleanups, so forget about the handle here. */
1301 /* Unblock SIGTERM again in the parent */
1302 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1304 /* We close the log explicitly here, since the PAM modules
1305 * might have opened it, but we don't want this fd around. */
1308 /* Synchronously wait for the child to initialize. We don't care for
1309 * errors as we cannot recover. However, warn loudly if it happens. */
1310 if (!barrier_place_and_sync(&barrier
))
1311 log_error("PAM initialization failed");
1319 if (pam_code
!= PAM_SUCCESS
) {
1320 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1321 r
= -EPERM
; /* PAM errors do not map to errno */
1323 log_error_errno(r
, "PAM failed: %m");
1327 pam_code
= pam_close_session(handle
, flags
);
1329 pam_end(handle
, pam_code
| flags
);
1341 static void rename_process_from_path(const char *path
) {
1342 char process_name
[11];
1346 /* This resulting string must fit in 10 chars (i.e. the length
1347 * of "/sbin/init") to look pretty in /bin/ps */
1351 rename_process("(...)");
1357 /* The end of the process name is usually more
1358 * interesting, since the first bit might just be
1364 process_name
[0] = '(';
1365 memcpy(process_name
+1, p
, l
);
1366 process_name
[1+l
] = ')';
1367 process_name
[1+l
+1] = 0;
1369 rename_process(process_name
);
1372 static bool context_has_address_families(const ExecContext
*c
) {
1375 return c
->address_families_whitelist
||
1376 !set_isempty(c
->address_families
);
1379 static bool context_has_syscall_filters(const ExecContext
*c
) {
1382 return c
->syscall_whitelist
||
1383 !hashmap_isempty(c
->syscall_filter
);
1386 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1389 if (c
->no_new_privileges
)
1392 if (have_effective_cap(CAP_SYS_ADMIN
)) /* if we are privileged, we don't need NNP */
1395 /* We need NNP if we have any form of seccomp and are unprivileged */
1396 return context_has_address_families(c
) ||
1397 c
->memory_deny_write_execute
||
1398 c
->restrict_realtime
||
1399 exec_context_restrict_namespaces_set(c
) ||
1400 c
->protect_kernel_tunables
||
1401 c
->protect_kernel_modules
||
1402 c
->private_devices
||
1403 context_has_syscall_filters(c
) ||
1404 !set_isempty(c
->syscall_archs
) ||
1405 c
->lock_personality
;
1410 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1412 if (is_seccomp_available())
1415 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1419 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1420 uint32_t negative_action
, default_action
, action
;
1426 if (!context_has_syscall_filters(c
))
1429 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1432 negative_action
= c
->syscall_errno
== 0 ? SCMP_ACT_KILL
: SCMP_ACT_ERRNO(c
->syscall_errno
);
1434 if (c
->syscall_whitelist
) {
1435 default_action
= negative_action
;
1436 action
= SCMP_ACT_ALLOW
;
1438 default_action
= SCMP_ACT_ALLOW
;
1439 action
= negative_action
;
1442 if (needs_ambient_hack
) {
1443 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_whitelist
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1448 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
);
1451 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1455 if (set_isempty(c
->syscall_archs
))
1458 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1461 return seccomp_restrict_archs(c
->syscall_archs
);
1464 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1468 if (!context_has_address_families(c
))
1471 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1474 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_whitelist
);
1477 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1481 if (!c
->memory_deny_write_execute
)
1484 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1487 return seccomp_memory_deny_write_execute();
1490 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1494 if (!c
->restrict_realtime
)
1497 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1500 return seccomp_restrict_realtime();
1503 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1507 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1508 * let's protect even those systems where this is left on in the kernel. */
1510 if (!c
->protect_kernel_tunables
)
1513 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1516 return seccomp_protect_sysctl();
1519 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1523 /* Turn off module syscalls on ProtectKernelModules=yes */
1525 if (!c
->protect_kernel_modules
)
1528 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1531 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
));
1534 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1538 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1540 if (!c
->private_devices
)
1543 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1546 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
));
1549 static int apply_restrict_namespaces(Unit
*u
, const ExecContext
*c
) {
1553 if (!exec_context_restrict_namespaces_set(c
))
1556 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1559 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
1562 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1563 unsigned long personality
;
1569 if (!c
->lock_personality
)
1572 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1575 personality
= c
->personality
;
1577 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1578 if (personality
== PERSONALITY_INVALID
) {
1580 r
= opinionated_personality(&personality
);
1585 return seccomp_lock_personality(personality
);
1590 static void do_idle_pipe_dance(int idle_pipe
[4]) {
1593 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1594 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1596 if (idle_pipe
[0] >= 0) {
1599 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1601 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1604 /* Signal systemd that we are bored and want to continue. */
1605 n
= write(idle_pipe
[3], "x", 1);
1607 /* Wait for systemd to react to the signal above. */
1608 fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1611 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1615 idle_pipe
[3] = safe_close(idle_pipe
[3]);
/* Builds the environment variables that the manager itself provides to a spawned process.
 *
 * Strings are collected in our_env (freed automatically through _cleanup_strv_free_ while it is
 * still owned here). The variables that can be added are, in this order: LISTEN_PID, LISTEN_FDS,
 * LISTEN_FDNAMES, WATCHDOG_PID, WATCHDOG_USEC, SYSTEMD_NSS_BYPASS_BUS, HOME, LOGNAME, USER, SHELL,
 * INVOCATION_ID, TERM and JOURNAL_STREAM, followed by a terminating NULL.
 *
 * NOTE(review): several parameters, the conditions guarding some of the variables and the error
 * and return paths are not part of this listing; confirm them against the full source. */
1618 static int build_environment(
1620 const ExecContext
*c
,
1621 const ExecParameters
*p
,
1624 const char *username
,
1626 dev_t journal_stream_dev
,
1627 ino_t journal_stream_ino
,
1630 _cleanup_strv_free_
char **our_env
= NULL
;
/* 14 slots: the 13 variables that can be stored below plus the terminating NULL. */
1638 our_env
= new0(char*, 14);
1643 _cleanup_free_
char *joined
= NULL
;
1645 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1647 our_env
[n_env
++] = x
;
1649 if (asprintf(&x
, "LISTEN_FDS=%u", n_fds
) < 0)
1651 our_env
[n_env
++] = x
;
/* The fd names are passed as one colon separated list. */
1653 joined
= strv_join(p
->fd_names
, ":");
1657 x
= strjoin("LISTEN_FDNAMES=", joined
);
1660 our_env
[n_env
++] = x
;
/* Watchdog variables are only set when requested by the flags and a non-zero timeout is configured. */
1663 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1664 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1666 our_env
[n_env
++] = x
;
1668 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1670 our_env
[n_env
++] = x
;
1673 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1674 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1675 * check the database directly. */
1676 if (p
->flags
& EXEC_NSS_BYPASS_BUS
) {
1677 x
= strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1680 our_env
[n_env
++] = x
;
1684 x
= strappend("HOME=", home
);
1687 our_env
[n_env
++] = x
;
/* LOGNAME and USER both carry the same user name. */
1691 x
= strappend("LOGNAME=", username
);
1694 our_env
[n_env
++] = x
;
1696 x
= strappend("USER=", username
);
1699 our_env
[n_env
++] = x
;
1703 x
= strappend("SHELL=", shell
);
1706 our_env
[n_env
++] = x
;
1709 if (!sd_id128_is_null(u
->invocation_id
)) {
1710 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1713 our_env
[n_env
++] = x
;
1716 if (exec_context_needs_term(c
)) {
1717 const char *tty_path
, *term
= NULL
;
1719 tty_path
= exec_context_tty_path(c
);
1721 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1722 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1723 * passes to PID 1 ends up all the way in the console login shown. */
1725 if (path_equal(tty_path
, "/dev/console") && getppid() == 1)
1726 term
= getenv("TERM");
1728 term
= default_term_for_tty(tty_path
);
1730 x
= strappend("TERM=", term
);
1733 our_env
[n_env
++] = x
;
/* JOURNAL_STREAM is only exported when both the device and the inode number are non-zero. */
1736 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
1737 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
1740 our_env
[n_env
++] = x
;
1743 our_env
[n_env
++] = NULL
;
/* NOTE(review): up to 13 variables can be stored above, so after the terminating NULL n_env can
 * reach 14. That matches the 14 slots allocated, but it exceeds the bound of 12 asserted below,
 * so the assertion looks too tight. Confirm and consider raising the bound to 14. */
1744 assert(n_env
<= 12);
/* Builds the list of environment variables selected by c->pass_environment.
 *
 * For every name in that list a NAME=value string is assembled with strjoin() and appended to
 * pass_env, which is grown with GREEDY_REALLOC() and kept NULL terminated after each append.
 *
 * NOTE(review): where the value v comes from, the error paths and the hand-over to *ret are not
 * part of this listing; presumably v is looked up in the current environment. Confirm. */
1752 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
1753 _cleanup_strv_free_
char **pass_env
= NULL
;
1754 size_t n_env
= 0, n_bufsize
= 0;
1757 STRV_FOREACH(i
, c
->pass_environment
) {
1758 _cleanup_free_
char *x
= NULL
;
1764 x
= strjoin(*i
, "=", v
);
/* Reserve room for the new entry plus the terminating NULL. */
1768 if (!GREEDY_REALLOC(pass_env
, n_bufsize
, n_env
+ 2))
1771 pass_env
[n_env
++] = x
;
1772 pass_env
[n_env
] = NULL
;
/* Predicate over the mount related settings of an execution context.
 *
 * The function tests, one after the other: a root image; read-write, read-only or inaccessible
 * path lists; bind mounts or any of the per-type directory lists; mount flags; a private tmp
 * combined with a runtime object that carries a tmp or var-tmp directory; the private devices and
 * protect settings; and an API VFS mount combined with a root image or root directory.
 *
 * NOTE(review): the statement executed for each test is not part of this listing; presumably
 * each one returns true and the function returns false at the end. Confirm. */
1782 static bool exec_needs_mount_namespace(
1783 const ExecContext
*context
,
1784 const ExecParameters
*params
,
1785 ExecRuntime
*runtime
) {
1790 if (context
->root_image
)
1793 if (!strv_isempty(context
->read_write_paths
) ||
1794 !strv_isempty(context
->read_only_paths
) ||
1795 !strv_isempty(context
->inaccessible_paths
))
1798 if (context
->n_bind_mounts
> 0 ||
1799 !strv_isempty(context
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) ||
1800 !strv_isempty(context
->directories
[EXEC_DIRECTORY_STATE
].paths
) ||
1801 !strv_isempty(context
->directories
[EXEC_DIRECTORY_CACHE
].paths
) ||
1802 !strv_isempty(context
->directories
[EXEC_DIRECTORY_LOGS
].paths
) ||
1803 !strv_isempty(context
->directories
[EXEC_DIRECTORY_CONFIGURATION
].paths
))
1806 if (context
->mount_flags
!= 0)
/* runtime may be NULL, hence the explicit check before its members are read. */
1809 if (context
->private_tmp
&& runtime
&& (runtime
->tmp_dir
|| runtime
->var_tmp_dir
))
1812 if (context
->private_devices
||
1813 context
->protect_system
!= PROTECT_SYSTEM_NO
||
1814 context
->protect_home
!= PROTECT_HOME_NO
||
1815 context
->protect_kernel_tunables
||
1816 context
->protect_kernel_modules
||
1817 context
->protect_control_groups
)
1820 if (context
->mount_apivfs
&& (context
->root_image
|| context
->root_directory
))
/* Moves the calling process into a new user namespace and installs its UID and GID maps.
 *
 * uid and gid name the identity that is mapped onto itself in addition to root. The maps are
 * prepared as strings first. A helper child then writes them to the setgroups, gid_map and
 * uid_map files of the parent, while the parent calls unshare(CLONE_NEWUSER). Parent and child
 * synchronise through an eventfd (unshare_ready_fd). The child reports a failure code through
 * errno_pipe. The parent finally waits for the child and inspects its exit information.
 *
 * NOTE(review): the fork itself, the error branches and the return statements are not part of
 * this listing; confirm them against the full source. */
1826 static int setup_private_users(uid_t uid
, gid_t gid
) {
1827 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
1828 _cleanup_close_pair_
int errno_pipe
[2] = { -1, -1 };
1829 _cleanup_close_
int unshare_ready_fd
= -1;
1830 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
1836 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1837 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1838 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1839 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1840 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1841 * continues execution normally. */
1843 if (uid
!= 0 && uid_is_valid(uid
)) {
1844 r
= asprintf(&uid_map
,
1845 "0 0 1\n" /* Map root → root */
1846 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
1851 uid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1856 if (gid
!= 0 && gid_is_valid(gid
)) {
1857 r
= asprintf(&gid_map
,
1858 "0 0 1\n" /* Map root → root */
1859 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
1864 gid_map
= strdup("0 0 1\n"); /* The case where the above is the same */
1869 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1871 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
1872 if (unshare_ready_fd
< 0)
1875 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1877 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
1885 _cleanup_close_
int fd
= -1;
1889 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1890 * here, after the parent opened its own user namespace. */
1893 errno_pipe
[0] = safe_close(errno_pipe
[0]);
1895 /* Wait until the parent unshared the user namespace */
1896 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
1901 /* Disable the setgroups() system call in the child user namespace, for good. */
1902 a
= procfs_file_alloca(ppid
, "setgroups");
1903 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1905 if (errno
!= ENOENT
) {
1910 /* If the file is missing the kernel is too old, let's continue anyway. */
1912 if (write(fd
, "deny\n", 5) < 0) {
1917 fd
= safe_close(fd
);
1920 /* First write the GID map */
1921 a
= procfs_file_alloca(ppid
, "gid_map");
1922 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1927 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
1931 fd
= safe_close(fd
);
1933 /* Then write the UID map */
1934 a
= procfs_file_alloca(ppid
, "uid_map");
1935 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
1940 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
1945 _exit(EXIT_SUCCESS
);
/* Child failure path: hand the error code in r to the parent on a best-effort basis, then exit. */
1948 (void) write(errno_pipe
[1], &r
, sizeof(r
));
1949 _exit(EXIT_FAILURE
);
/* Parent side: drop the write end so that the read below sees end-of-file once the child is done. */
1952 errno_pipe
[1] = safe_close(errno_pipe
[1]);
1954 if (unshare(CLONE_NEWUSER
) < 0)
1957 /* Let the child know that the namespace is ready now */
1958 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
1961 /* Try to read an error code from the child */
1962 n
= read(errno_pipe
[0], &r
, sizeof(r
));
1965 if (n
== sizeof(r
)) { /* an error code was sent to us */
1970 if (n
!= 0) /* on success we should have read 0 bytes */
1973 r
= wait_for_terminate(pid
, &si
);
1978 /* If something strange happened with the child, let's consider this fatal, too */
1979 if (si
.si_code
!= CLD_EXITED
|| si
.si_status
!= 0)
/* Creates the directories of one ExecDirectoryType for a unit.
 *
 * Every entry of context->directories[type].paths is created below params->prefix[type]. For
 * dynamic users (except for the runtime and configuration types) the real directory is created
 * below a private root and the configured path becomes a relative symlink to it, as explained in
 * the long comment in the loop body. The access mode is then set with chmod(). The tree is
 * chowned recursively to uid and gid, except for the configuration type.
 *
 * *exit_status is assigned from exit_status_table[type], which maps each directory type to its
 * dedicated exit code.
 *
 * NOTE(review): some parameters, the error branches, the assignment of the variable effective
 * and the return statements are not part of this listing; confirm them against the full source. */
1985 static int setup_exec_directory(
1986 const ExecContext
*context
,
1987 const ExecParameters
*params
,
1990 ExecDirectoryType type
,
1993 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
1994 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
1995 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
1996 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
1997 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
1998 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
2005 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
2006 assert(exit_status
);
/* Without a prefix configured for this directory type there is no base path to create below. */
2008 if (!params
->prefix
[type
])
2011 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
2012 if (!uid_is_valid(uid
))
2014 if (!gid_is_valid(gid
))
2018 STRV_FOREACH(rt
, context
->directories
[type
].paths
) {
2019 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
2020 const char *effective
;
2022 p
= strjoin(params
->prefix
[type
], "/", *rt
);
2028 r
= mkdir_parents_label(p
, 0755);
2032 if (context
->dynamic_user
&&
2033 !IN_SET(type
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
)) {
2034 _cleanup_free_
char *private_root
= NULL
, *relative
= NULL
, *parent
= NULL
;
2036 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2037 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2038 * whose UID is later on reused. To lock this down we use the same trick used by container
2039 * managers to prohibit host users to get access to files of the same UID in containers: we
2040 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2041 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2042 * to make this directory permeable for the service itself.
2044 * Specifically: for a service which wants a special directory "foo/" we first create a
2045 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2046 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2047 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2048 * into it. Inside of the namespace of the container "private/" is replaced by a more liberally
2049 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2050 * disabling the access boundary for the service and making sure it only gets access to the
2051 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2053 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2054 * owned by the service itself.
2055 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2056 * files or sockets with other services. */
2058 private_root
= strjoin(params
->prefix
[type
], "/private");
2059 if (!private_root
) {
2064 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2065 r
= mkdir_safe_label(private_root
, 0700, 0, 0, false);
2069 pp
= strjoin(private_root
, "/", *rt
);
2075 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2076 r
= mkdir_parents_label(pp
, 0755);
2080 /* Finally, create the actual directory for the service */
2081 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
/* An already existing directory is not treated like the other mkdir failures. */
2082 if (r
< 0 && r
!= -EEXIST
)
2085 parent
= dirname_malloc(p
);
/* The symlink target is expressed relative to the directory that holds the link. */
2091 r
= path_make_relative(parent
, pp
, &relative
);
2095 /* And link it up from the original place */
2096 r
= symlink_idempotent(relative
, p
);
2103 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2104 if (r
< 0 && r
!= -EEXIST
)
2110 /* First lock down the access mode */
2111 if (chmod(effective
, context
->directories
[type
].mode
) < 0) {
2116 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2117 * a service, and shall not be writable. */
2118 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
2121 /* Then, change the ownership of the whole tree, if necessary */
2122 r
= path_chown_recursive(effective
, uid
, gid
);
2130 *exit_status
= exit_status_table
[type
];
/* Applies the SMACK label for the process about to be executed.
 *
 * When context->smack_process_label is set, it is applied to the calling process through
 * mac_smack_apply_pid(0, ...). The section that follows the SMACK_DEFAULT_PROCESS_LABEL
 * preprocessor check reads the SMACK_ATTR_EXEC label of command->path and applies it, falling
 * back to SMACK_DEFAULT_PROCESS_LABEL when no label was read. Read failures other than -ENODATA
 * and -EOPNOTSUPP are singled out by the check after mac_smack_read().
 *
 * NOTE(review): the branch structure between the two parts, the error handling and the return
 * statements are not part of this listing; confirm them against the full source. */
2134 static int setup_smack(
2135 const ExecContext
*context
,
2136 const ExecCommand
*command
) {
2143 if (context
->smack_process_label
) {
2144 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
2148 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2150 _cleanup_free_
char *exec_label
= NULL
;
2152 r
= mac_smack_read(command
->path
, SMACK_ATTR_EXEC
, &exec_label
);
2153 if (r
< 0 && !IN_SET(r
, -ENODATA
, -EOPNOTSUPP
))
2156 r
= mac_smack_apply_pid(0, exec_label
? : SMACK_DEFAULT_PROCESS_LABEL
);
/* Compiles the complete list of bind mounts for a unit.
 *
 * The result combines the explicitly configured context->bind_mounts (source and destination
 * strings are duplicated) with one entry per configured directory of every ExecDirectoryType that
 * has a prefix. For dynamic users, directories of types other than runtime and configuration are
 * taken from below the private subdirectory of the prefix, and that private directory is added
 * to the list of empty directories.
 *
 * On success the three output parameters receive the array, its element count and the list of
 * empty directories. The local empty_directories pointer is then reset to NULL so that the
 * automatic cleanup does not free what was handed out. The bind_mount_free_many() call at the
 * end releases the h entries built so far.
 *
 * NOTE(review): the early-exit condition, the error branches, the labels and the return
 * statements are not part of this listing; confirm them against the full source. */
2165 static int compile_bind_mounts(
2166 const ExecContext
*context
,
2167 const ExecParameters
*params
,
2168 BindMount
**ret_bind_mounts
,
2169 unsigned *ret_n_bind_mounts
,
2170 char ***ret_empty_directories
) {
2172 _cleanup_strv_free_
char **empty_directories
= NULL
;
2173 BindMount
*bind_mounts
;
2174 unsigned n
, h
= 0, i
;
2175 ExecDirectoryType t
;
2180 assert(ret_bind_mounts
);
2181 assert(ret_n_bind_mounts
);
2182 assert(ret_empty_directories
);
/* First pass: count how many entries the result needs. */
2184 n
= context
->n_bind_mounts
;
2185 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2186 if (!params
->prefix
[t
])
2189 n
+= strv_length(context
->directories
[t
].paths
);
2193 *ret_bind_mounts
= NULL
;
2194 *ret_n_bind_mounts
= 0;
2195 *ret_empty_directories
= NULL
;
2199 bind_mounts
= new(BindMount
, n
);
/* Second pass, part one: copy the explicitly configured bind mounts. */
2203 for (i
= 0; i
< context
->n_bind_mounts
; i
++) {
2204 BindMount
*item
= context
->bind_mounts
+ i
;
2207 s
= strdup(item
->source
);
2213 d
= strdup(item
->destination
);
2220 bind_mounts
[h
++] = (BindMount
) {
2223 .read_only
= item
->read_only
,
2224 .recursive
= item
->recursive
,
2225 .ignore_enoent
= item
->ignore_enoent
,
/* Second pass, part two: add one bind mount per configured directory. */
2229 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2232 if (!params
->prefix
[t
])
2235 if (strv_isempty(context
->directories
[t
].paths
))
2238 if (context
->dynamic_user
&&
2239 !IN_SET(t
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
)) {
2242 /* So this is for a dynamic user, and we need to make sure the process can access its own
2243 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2244 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2246 private_root
= strjoin(params
->prefix
[t
], "/private");
2247 if (!private_root
) {
2252 r
= strv_consume(&empty_directories
, private_root
);
2259 STRV_FOREACH(suffix
, context
->directories
[t
].paths
) {
2262 if (context
->dynamic_user
&&
2263 !IN_SET(t
, EXEC_DIRECTORY_RUNTIME
, EXEC_DIRECTORY_CONFIGURATION
))
2264 s
= strjoin(params
->prefix
[t
], "/private/", *suffix
);
2266 s
= strjoin(params
->prefix
[t
], "/", *suffix
);
2279 bind_mounts
[h
++] = (BindMount
) {
2284 .ignore_enoent
= false,
2291 *ret_bind_mounts
= bind_mounts
;
2292 *ret_n_bind_mounts
= n
;
2293 *ret_empty_directories
= empty_directories
;
2295 empty_directories
= NULL
;
2300 bind_mount_free_many(bind_mounts
, h
);
/* Sets up the mount namespace for a command of a unit.
 *
 * The function derives the private tmp directories from runtime, picks the root image and root
 * directory when EXEC_APPLY_CHROOT is set, compiles the bind mounts with compile_bind_mounts() and
 * then calls setup_namespace(). The read-only and inaccessible path lists and the protect_home and
 * protect_system settings are only passed on when sandboxing applies, that is when
 * EXEC_APPLY_SANDBOXING is set in params->flags and the command is not flagged
 * EXEC_COMMAND_FULLY_PRIVILEGED. The compiled bind mounts are freed again after the call. A result
 * of -EPERM or -EACCES is logged at debug level as ignored.
 *
 * NOTE(review): some parameters and arguments, the error handling and the return statements are
 * not part of this listing; confirm them against the full source. */
2304 static int apply_mount_namespace(
2306 ExecCommand
*command
,
2307 const ExecContext
*context
,
2308 const ExecParameters
*params
,
2309 ExecRuntime
*runtime
) {
2311 _cleanup_strv_free_
char **empty_directories
= NULL
;
2312 char *tmp
= NULL
, *var
= NULL
;
2313 const char *root_dir
= NULL
, *root_image
= NULL
;
2314 NamespaceInfo ns_info
= {
2315 .ignore_protect_paths
= false,
2316 .private_dev
= context
->private_devices
,
2317 .protect_control_groups
= context
->protect_control_groups
,
2318 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
2319 .protect_kernel_modules
= context
->protect_kernel_modules
,
2320 .mount_apivfs
= context
->mount_apivfs
,
2322 bool needs_sandboxing
;
2323 BindMount
*bind_mounts
= NULL
;
2324 unsigned n_bind_mounts
= 0;
2329 /* The runtime struct only contains the parent of the private /tmp,
2330 * which is non-accessible to world users. Inside of it there's a /tmp
2331 * that is sticky, and that's the one we want to use here. */
2333 if (context
->private_tmp
&& runtime
) {
2334 if (runtime
->tmp_dir
)
2335 tmp
= strjoina(runtime
->tmp_dir
, "/tmp");
2336 if (runtime
->var_tmp_dir
)
2337 var
= strjoina(runtime
->var_tmp_dir
, "/tmp");
2340 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2341 root_image
= context
->root_image
;
2344 root_dir
= context
->root_directory
;
2347 r
= compile_bind_mounts(context
, params
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
2352 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2353 * sandbox info, otherwise enforce it, don't ignore protected paths and
2354 * fail if we are enable to apply the sandbox inside the mount namespace.
2356 if (!context
->dynamic_user
&& root_dir
)
2357 ns_info
.ignore_protect_paths
= true;
2359 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
2361 r
= setup_namespace(root_dir
, root_image
,
2362 &ns_info
, context
->read_write_paths
,
2363 needs_sandboxing
? context
->read_only_paths
: NULL
,
2364 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
2370 needs_sandboxing
? context
->protect_home
: PROTECT_HOME_NO
,
2371 needs_sandboxing
? context
->protect_system
: PROTECT_SYSTEM_NO
,
2372 context
->mount_flags
,
2373 DISSECT_IMAGE_DISCARD_ON_LOOP
);
2375 bind_mount_free_many(bind_mounts
, n_bind_mounts
);
2377 /* If we couldn't set up the namespace this is probably due to a
2378 * missing capability. In this case, silently proceed. */
2379 if (IN_SET(r
, -EPERM
, -EACCES
)) {
2380 log_unit_debug_errno(u
, r
, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
/* Applies the root directory and the working directory of an execution context.
 *
 * The working directory wd comes from context->working_directory unless the home directory
 * variant (context->working_directory_home) is selected. With EXEC_APPLY_CHROOT set and no mount
 * namespace in use, chroot() into context->root_directory is performed here. Otherwise the root
 * directory is prefixed to wd with prefix_roota(). A failing chdir() is only treated as an error
 * when context->working_directory_missing_ok is not set.
 *
 * *exit_status is set to EXIT_CHROOT or EXIT_CHDIR next to the respective failure checks.
 *
 * NOTE(review): some parameters, the home directory branch, the default for wd and the return
 * statements are not part of this listing; confirm them against the full source. */
2387 static int apply_working_directory(
2388 const ExecContext
*context
,
2389 const ExecParameters
*params
,
2391 const bool needs_mount_ns
,
2397 assert(exit_status
);
2399 if (context
->working_directory_home
) {
2402 *exit_status
= EXIT_CHDIR
;
2408 } else if (context
->working_directory
)
2409 wd
= context
->working_directory
;
2413 if (params
->flags
& EXEC_APPLY_CHROOT
) {
2414 if (!needs_mount_ns
&& context
->root_directory
)
2415 if (chroot(context
->root_directory
) < 0) {
2416 *exit_status
= EXIT_CHROOT
;
2422 d
= prefix_roota(context
->root_directory
, wd
);
2424 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
2425 *exit_status
= EXIT_CHDIR
;
/* Sets up the kernel session keyring for a service process.
 *
 * After the guards on EXEC_NEW_KEYRING and EXEC_KEYRING_INHERIT, the function joins a new session
 * keyring. If that fails, ENOSYS, EACCES, EPERM and EDQUOT are logged at debug level as ignored,
 * and any other errno is returned as an error. It then stores the invocation ID of the unit as a
 * user key named invocation_id and restricts that key to view, read and search permission for
 * possessor and user. The keyring is chowned to uid and gid if either is valid. For
 * EXEC_KEYRING_SHARED the user keyring is linked into the session keyring while the real UID and
 * GID are temporarily switched to uid and gid. The previous IDs are restored on both the failure
 * and the success path.
 *
 * NOTE(review): one parameter, several branch bodies and the final return are not part of this
 * listing; confirm them against the full source. */
2432 static int setup_keyring(
2434 const ExecContext
*context
,
2435 const ExecParameters
*p
,
2436 uid_t uid
, gid_t gid
) {
2438 key_serial_t keyring
;
2445 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2446 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2447 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2448 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2449 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2450 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2452 if (!(p
->flags
& EXEC_NEW_KEYRING
))
2455 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
2458 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
2459 if (keyring
== -1) {
2460 if (errno
== ENOSYS
)
2461 log_unit_debug_errno(u
, errno
, "Kernel keyring not supported, ignoring.");
2462 else if (IN_SET(errno
, EACCES
, EPERM
))
2463 log_unit_debug_errno(u
, errno
, "Kernel keyring access prohibited, ignoring.");
2464 else if (errno
== EDQUOT
)
2465 log_unit_debug_errno(u
, errno
, "Out of kernel keyrings to allocate, ignoring.");
2467 return log_unit_error_errno(u
, errno
, "Setting up kernel keyring failed: %m");
2472 /* Populate the keyring with the invocation ID by default. */
2473 if (!sd_id128_is_null(u
->invocation_id
)) {
2476 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
2478 log_unit_debug_errno(u
, errno
, "Failed to add invocation ID to keyring, ignoring: %m");
2480 if (keyctl(KEYCTL_SETPERM
, key
,
2481 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
2482 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
2483 return log_unit_error_errno(u
, errno
, "Failed to restrict invocation ID permission: %m");
2487 /* And now, make the keyring owned by the service's user */
2488 if (uid_is_valid(uid
) || gid_is_valid(gid
))
2489 if (keyctl(KEYCTL_CHOWN
, keyring
, uid
, gid
, 0) < 0)
2490 return log_unit_error_errno(u
, errno
, "Failed to change ownership of session keyring: %m");
2492 /* When requested link the user keyring into the session keyring. */
2493 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
2497 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2498 * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2499 * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
2501 saved_uid
= getuid();
2502 saved_gid
= getgid();
/* The GID is switched before the UID, and only when it differs from the current one. */
2504 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2505 if (setregid(gid
, -1) < 0)
2506 return log_unit_error_errno(u
, errno
, "Failed to change GID for user keyring: %m");
2509 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2510 if (setreuid(uid
, -1) < 0) {
2511 (void) setregid(saved_gid
, -1);
2512 return log_unit_error_errno(u
, errno
, "Failed to change UID for user keyring: %m");
2516 if (keyctl(KEYCTL_LINK
,
2517 KEY_SPEC_USER_KEYRING
,
2518 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
/* Restore the original identity on a best-effort basis before reporting the failure. */
2522 (void) setreuid(saved_uid
, -1);
2523 (void) setregid(saved_gid
, -1);
2525 return log_unit_error_errno(u
, r
, "Failed to link user keyring into session keyring: %m");
/* Success path: switch back in reverse order, UID first, then GID. */
2528 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
2529 if (setreuid(saved_uid
, -1) < 0) {
2530 (void) setregid(saved_gid
, -1);
2531 return log_unit_error_errno(u
, errno
, "Failed to change UID back for user keyring: %m");
2535 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
2536 if (setregid(saved_gid
, -1) < 0)
2537 return log_unit_error_errno(u
, errno
, "Failed to change GID back for user keyring: %m");
/* Appends the two descriptors of pair to array, advancing the element count *n for each store.
 * The caller must provide room in array for the added entries.
 *
 * NOTE(review): any checks guarding the two stores are not part of this listing; presumably
 * each descriptor is only appended when it is valid. Confirm. */
2544 static void append_socket_pair(int *array
, unsigned *n
, int pair
[2]) {
2552 array
[(*n
)++] = pair
[0];
2554 array
[(*n
)++] = pair
[1];
/* Closes all file descriptors except the ones the spawned process still needs.
 *
 * The descriptors to keep are collected in dont_close: the stdin, stdout and stderr descriptors
 * from params when set, socket_fd, the n_fds entries of fds, the network namespace storage socket
 * pair of runtime, the storage socket pairs of the dynamic user and group in dcreds, and
 * user_lookup_fd when set. Returns the result of close_all_fds().
 *
 * NOTE(review): some parameters and the conditions guarding socket_fd, runtime and dcreds are not
 * part of this listing; confirm them against the full source. */
2557 static int close_remaining_fds(
2558 const ExecParameters
*params
,
2559 ExecRuntime
*runtime
,
2560 DynamicCreds
*dcreds
,
2563 int *fds
, unsigned n_fds
) {
2565 unsigned n_dont_close
= 0;
/* Room for the passed fds plus 12 extra slots; the fixed entries added below number at most 11. */
2566 int dont_close
[n_fds
+ 12];
2570 if (params
->stdin_fd
>= 0)
2571 dont_close
[n_dont_close
++] = params
->stdin_fd
;
2572 if (params
->stdout_fd
>= 0)
2573 dont_close
[n_dont_close
++] = params
->stdout_fd
;
2574 if (params
->stderr_fd
>= 0)
2575 dont_close
[n_dont_close
++] = params
->stderr_fd
;
2578 dont_close
[n_dont_close
++] = socket_fd
;
2580 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
2581 n_dont_close
+= n_fds
;
2585 append_socket_pair(dont_close
, &n_dont_close
, runtime
->netns_storage_socket
);
2589 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->user
->storage_socket
);
2591 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->group
->storage_socket
);
2594 if (user_lookup_fd
>= 0)
2595 dont_close
[n_dont_close
++] = user_lookup_fd
;
2597 return close_all_fds(dont_close
, n_dont_close
);
/* Reports the resolved UID and GID of a unit over user_lookup_fd.
 *
 * A single writev() call sends three pieces in one message: the uid, the gid and the unit id
 * string. Guards on a negative user_lookup_fd and on uid and gid both being invalid precede
 * the write.
 *
 * NOTE(review): the parameter list, the bodies of the guards and the return statements are not
 * part of this listing; confirm them against the full source. */
2600 static int send_user_lookup(
2608 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2609 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2612 if (user_lookup_fd
< 0)
2615 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
2618 if (writev(user_lookup_fd
,
2620 IOVEC_INIT(&uid
, sizeof(uid
)),
2621 IOVEC_INIT(&gid
, sizeof(gid
)),
2622 IOVEC_INIT_STRING(unit
->id
) }, 3) < 0)
/* Determines a home directory for use as working directory.
 *
 * The function only acts when c->working_directory_home is set. One path hardcodes the home
 * directory of UID 0; another calls get_home_dir(buf), which stores its result in *buf.
 *
 * NOTE(review): how *home is assigned, the conditions selecting between the paths and the return
 * values are not part of this listing; confirm them against the full source. */
2628 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
2635 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2640 if (!c
->working_directory_home
)
2644 /* Hardcode /root as home directory for UID 0 */
2649 r
= get_home_dir(buf
);
/* Compiles a list of directory paths for a unit that uses a dynamic user.
 *
 * The function asserts c->dynamic_user. It walks all ExecDirectoryType values, treats the
 * configuration type specially, and for every configured directory adds one path to list: the
 * plain prefix plus name for the runtime type, and the path below the private subdirectory of
 * the prefix for the other types.
 *
 * NOTE(review): the statement guarded by the configuration type check (presumably a skip), the
 * error branches and the hand-over of list to *ret are not part of this listing. Confirm. */
2657 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
2658 _cleanup_strv_free_
char ** list
= NULL
;
2659 ExecDirectoryType t
;
2666 assert(c
->dynamic_user
);
2668 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2669 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2672 for (t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2675 if (t
== EXEC_DIRECTORY_CONFIGURATION
)
2681 STRV_FOREACH(i
, c
->directories
[t
].paths
) {
2684 if (t
== EXEC_DIRECTORY_RUNTIME
)
2685 e
= strjoin(p
->prefix
[t
], "/", *i
);
2687 e
= strjoin(p
->prefix
[t
], "/private/", *i
);
2691 r
= strv_consume(&list
, e
);
2703 static int exec_child(
2705 ExecCommand
*command
,
2706 const ExecContext
*context
,
2707 const ExecParameters
*params
,
2708 ExecRuntime
*runtime
,
2709 DynamicCreds
*dcreds
,
2714 unsigned n_storage_fds
,
2715 unsigned n_socket_fds
,
2720 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **accum_env
= NULL
, **final_argv
= NULL
;
2721 _cleanup_free_
char *mac_selinux_context_net
= NULL
, *home_buffer
= NULL
;
2722 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
2723 const char *username
= NULL
, *groupname
= NULL
;
2724 const char *home
= NULL
, *shell
= NULL
;
2725 dev_t journal_stream_dev
= 0;
2726 ino_t journal_stream_ino
= 0;
2727 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2728 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
2729 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
2730 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
2732 bool use_selinux
= false;
2735 bool use_smack
= false;
2738 bool use_apparmor
= false;
2740 uid_t uid
= UID_INVALID
;
2741 gid_t gid
= GID_INVALID
;
2742 int i
, r
, ngids
= 0;
2744 ExecDirectoryType dt
;
2751 assert(exit_status
);
2753 rename_process_from_path(command
->path
);
2755 /* We reset exactly these signals, since they are the
2756 * only ones we set to SIG_IGN in the main daemon. All
2757 * others we leave untouched because we set them to
2758 * SIG_DFL or a valid handler initially, both of which
2759 * will be demoted to SIG_DFL. */
2760 (void) default_signals(SIGNALS_CRASH_HANDLER
,
2761 SIGNALS_IGNORE
, -1);
2763 if (context
->ignore_sigpipe
)
2764 (void) ignore_signals(SIGPIPE
, -1);
2766 r
= reset_signal_mask();
2768 *exit_status
= EXIT_SIGNAL_MASK
;
2769 return log_unit_error_errno(unit
, r
, "Failed to set process signal mask: %m");
2772 if (params
->idle_pipe
)
2773 do_idle_pipe_dance(params
->idle_pipe
);
2775 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2776 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2777 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2778 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2781 log_set_open_when_needed(true);
2783 /* In case anything used libc syslog(), close this here, too */
2786 n_fds
= n_storage_fds
+ n_socket_fds
;
2787 r
= close_remaining_fds(params
, runtime
, dcreds
, user_lookup_fd
, socket_fd
, fds
, n_fds
);
2789 *exit_status
= EXIT_FDS
;
2790 return log_unit_error_errno(unit
, r
, "Failed to close unwanted file descriptors: %m");
2793 if (!context
->same_pgrp
)
2795 *exit_status
= EXIT_SETSID
;
2796 return log_unit_error_errno(unit
, errno
, "Failed to create new process session: %m");
2799 exec_context_tty_reset(context
, params
);
2801 if (unit_shall_confirm_spawn(unit
)) {
2802 const char *vc
= params
->confirm_spawn
;
2803 _cleanup_free_
char *cmdline
= NULL
;
2805 cmdline
= exec_command_line(argv
);
2807 *exit_status
= EXIT_MEMORY
;
2811 r
= ask_for_confirmation(vc
, unit
, cmdline
);
2812 if (r
!= CONFIRM_EXECUTE
) {
2813 if (r
== CONFIRM_PRETEND_SUCCESS
) {
2814 *exit_status
= EXIT_SUCCESS
;
2817 *exit_status
= EXIT_CONFIRM
;
2818 log_unit_error(unit
, "Execution cancelled by the user");
2823 if (context
->dynamic_user
&& dcreds
) {
2824 _cleanup_strv_free_
char **suggested_paths
= NULL
;
2826 /* Make sure we bypass our own NSS module for any NSS checks */
2827 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2828 *exit_status
= EXIT_USER
;
2829 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
2832 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
2834 *exit_status
= EXIT_MEMORY
;
2838 r
= dynamic_creds_realize(dcreds
, suggested_paths
, &uid
, &gid
);
2840 *exit_status
= EXIT_USER
;
2842 log_unit_error(unit
, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2845 return log_unit_error_errno(unit
, r
, "Failed to update dynamic user credentials: %m");
2848 if (!uid_is_valid(uid
)) {
2849 *exit_status
= EXIT_USER
;
2850 log_unit_error(unit
, "UID validation failed for \""UID_FMT
"\"", uid
);
2854 if (!gid_is_valid(gid
)) {
2855 *exit_status
= EXIT_USER
;
2856 log_unit_error(unit
, "GID validation failed for \""GID_FMT
"\"", gid
);
2861 username
= dcreds
->user
->name
;
2864 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
2866 *exit_status
= EXIT_USER
;
2867 return log_unit_error_errno(unit
, r
, "Failed to determine user credentials: %m");
2870 r
= get_fixed_group(context
, &groupname
, &gid
);
2872 *exit_status
= EXIT_GROUP
;
2873 return log_unit_error_errno(unit
, r
, "Failed to determine group credentials: %m");
2877 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2878 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
2879 &supplementary_gids
, &ngids
);
2881 *exit_status
= EXIT_GROUP
;
2882 return log_unit_error_errno(unit
, r
, "Failed to determine supplementary groups: %m");
2885 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
2887 *exit_status
= EXIT_USER
;
2888 return log_unit_error_errno(unit
, r
, "Failed to send user credentials to PID1: %m");
2891 user_lookup_fd
= safe_close(user_lookup_fd
);
2893 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
2895 *exit_status
= EXIT_CHDIR
;
2896 return log_unit_error_errno(unit
, r
, "Failed to determine $HOME for user: %m");
2899 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2900 * must be sure to drop O_NONBLOCK */
2902 (void) fd_nonblock(socket_fd
, false);
2904 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
2906 *exit_status
= EXIT_STDIN
;
2907 return log_unit_error_errno(unit
, r
, "Failed to set up standard input: %m");
2910 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2912 *exit_status
= EXIT_STDOUT
;
2913 return log_unit_error_errno(unit
, r
, "Failed to set up standard output: %m");
2916 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
2918 *exit_status
= EXIT_STDERR
;
2919 return log_unit_error_errno(unit
, r
, "Failed to set up standard error output: %m");
2922 if (params
->cgroup_path
) {
2923 r
= cg_attach_everywhere(params
->cgroup_supported
, params
->cgroup_path
, 0, NULL
, NULL
);
2925 *exit_status
= EXIT_CGROUP
;
2926 return log_unit_error_errno(unit
, r
, "Failed to attach to cgroup %s: %m", params
->cgroup_path
);
2930 if (context
->oom_score_adjust_set
) {
2931 char t
[DECIMAL_STR_MAX(context
->oom_score_adjust
)];
2933 /* When we can't make this change due to EPERM, then
2934 * let's silently skip over it. User namespaces
2935 * prohibit write access to this file, and we
2936 * shouldn't trip up over that. */
2938 sprintf(t
, "%i", context
->oom_score_adjust
);
2939 r
= write_string_file("/proc/self/oom_score_adj", t
, 0);
2940 if (IN_SET(r
, -EPERM
, -EACCES
))
2941 log_unit_debug_errno(unit
, r
, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2943 *exit_status
= EXIT_OOM_ADJUST
;
2944 return log_unit_error_errno(unit
, r
, "Failed to adjust OOM setting: %m");
2948 if (context
->nice_set
)
2949 if (setpriority(PRIO_PROCESS
, 0, context
->nice
) < 0) {
2950 *exit_status
= EXIT_NICE
;
2951 return log_unit_error_errno(unit
, errno
, "Failed to set up process scheduling priority (nice level): %m");
2954 if (context
->cpu_sched_set
) {
2955 struct sched_param param
= {
2956 .sched_priority
= context
->cpu_sched_priority
,
2959 r
= sched_setscheduler(0,
2960 context
->cpu_sched_policy
|
2961 (context
->cpu_sched_reset_on_fork
?
2962 SCHED_RESET_ON_FORK
: 0),
2965 *exit_status
= EXIT_SETSCHEDULER
;
2966 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU scheduling: %m");
2970 if (context
->cpuset
)
2971 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context
->cpuset_ncpus
), context
->cpuset
) < 0) {
2972 *exit_status
= EXIT_CPUAFFINITY
;
2973 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU affinity: %m");
2976 if (context
->ioprio_set
)
2977 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
2978 *exit_status
= EXIT_IOPRIO
;
2979 return log_unit_error_errno(unit
, errno
, "Failed to set up IO scheduling priority: %m");
2982 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
2983 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
2984 *exit_status
= EXIT_TIMERSLACK
;
2985 return log_unit_error_errno(unit
, errno
, "Failed to set up timer slack: %m");
2988 if (context
->personality
!= PERSONALITY_INVALID
) {
2989 r
= safe_personality(context
->personality
);
2991 *exit_status
= EXIT_PERSONALITY
;
2992 return log_unit_error_errno(unit
, r
, "Failed to set up execution domain (personality): %m");
2996 if (context
->utmp_id
)
2997 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
2999 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
3000 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
3004 if (context
->user
) {
3005 r
= chown_terminal(STDIN_FILENO
, uid
);
3007 *exit_status
= EXIT_STDIN
;
3008 return log_unit_error_errno(unit
, r
, "Failed to change ownership of terminal: %m");
3012 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3013 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3014 * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3015 * touch a single hierarchy too. */
3016 if (params
->cgroup_path
&& context
->user
&& (params
->flags
& EXEC_CGROUP_DELEGATE
)) {
3017 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, uid
, gid
);
3019 *exit_status
= EXIT_CGROUP
;
3020 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
3024 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
3025 r
= setup_exec_directory(context
, params
, uid
, gid
, dt
, exit_status
);
3027 return log_unit_error_errno(unit
, r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
3030 r
= build_environment(
3042 *exit_status
= EXIT_MEMORY
;
3046 r
= build_pass_environment(context
, &pass_env
);
3048 *exit_status
= EXIT_MEMORY
;
3052 accum_env
= strv_env_merge(5,
3053 params
->environment
,
3056 context
->environment
,
3060 *exit_status
= EXIT_MEMORY
;
3063 accum_env
= strv_env_clean(accum_env
);
3065 (void) umask(context
->umask
);
3067 r
= setup_keyring(unit
, context
, params
, uid
, gid
);
3069 *exit_status
= EXIT_KEYRING
;
3070 return log_unit_error_errno(unit
, r
, "Failed to set up kernel keyring: %m");
3073 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3074 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
3076 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3077 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
3079 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3080 if (needs_ambient_hack
)
3081 needs_setuid
= false;
3083 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
3085 if (needs_sandboxing
) {
3086 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3087 * present. The actual MAC context application will happen later, as late as possible, to avoid
3088 * impacting our own code paths. */
3091 use_selinux
= mac_selinux_use();
3094 use_smack
= mac_smack_use();
3097 use_apparmor
= mac_apparmor_use();
3102 if (context
->pam_name
&& username
) {
3103 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
3105 *exit_status
= EXIT_PAM
;
3106 return log_unit_error_errno(unit
, r
, "Failed to set up PAM session: %m");
3111 if (context
->private_network
&& runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
3112 if (ns_type_supported(NAMESPACE_NET
)) {
3113 r
= setup_netns(runtime
->netns_storage_socket
);
3115 *exit_status
= EXIT_NETWORK
;
3116 return log_unit_error_errno(unit
, r
, "Failed to set up network namespacing: %m");
3119 log_unit_warning(unit
, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3122 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
3123 if (needs_mount_namespace
) {
3124 r
= apply_mount_namespace(unit
, command
, context
, params
, runtime
);
3126 *exit_status
= EXIT_NAMESPACE
;
3127 return log_unit_error_errno(unit
, r
, "Failed to set up mount namespacing: %m");
3131 /* Apply just after mount namespace setup */
3132 r
= apply_working_directory(context
, params
, home
, needs_mount_namespace
, exit_status
);
3134 return log_unit_error_errno(unit
, r
, "Changing to the requested working directory failed: %m");
3136 /* Drop groups as early as possible */
3138 r
= enforce_groups(gid
, supplementary_gids
, ngids
);
3140 *exit_status
= EXIT_GROUP
;
3141 return log_unit_error_errno(unit
, r
, "Changing group credentials failed: %m");
3145 if (needs_sandboxing
) {
3147 if (use_selinux
&& params
->selinux_context_net
&& socket_fd
>= 0) {
3148 r
= mac_selinux_get_child_mls_label(socket_fd
, command
->path
, context
->selinux_context
, &mac_selinux_context_net
);
3150 *exit_status
= EXIT_SELINUX_CONTEXT
;
3151 return log_unit_error_errno(unit
, r
, "Failed to determine SELinux context: %m");
3156 if (context
->private_users
) {
3157 r
= setup_private_users(uid
, gid
);
3159 *exit_status
= EXIT_USER
;
3160 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing: %m");
3165 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3166 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3167 * was needed to upload the policy and can now be closed as well. */
3168 r
= close_all_fds(fds
, n_fds
);
3170 r
= shift_fds(fds
, n_fds
);
3172 r
= flags_fds(fds
, n_storage_fds
, n_socket_fds
, context
->non_blocking
);
3174 *exit_status
= EXIT_FDS
;
3175 return log_unit_error_errno(unit
, r
, "Failed to adjust passed file descriptors: %m");
3178 secure_bits
= context
->secure_bits
;
3180 if (needs_sandboxing
) {
3183 for (i
= 0; i
< _RLIMIT_MAX
; i
++) {
3185 if (!context
->rlimit
[i
])
3188 r
= setrlimit_closest(i
, context
->rlimit
[i
]);
3190 *exit_status
= EXIT_LIMITS
;
3191 return log_unit_error_errno(unit
, r
, "Failed to adjust resource limit %s: %m", rlimit_to_string(i
));
3195 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3196 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
3197 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
3198 *exit_status
= EXIT_LIMITS
;
3199 return log_unit_error_errno(unit
, errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3204 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3205 * process. This is the latest place before dropping capabilities. Other MAC contexts are set later. */
3207 r
= setup_smack(context
, command
);
3209 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
3210 return log_unit_error_errno(unit
, r
, "Failed to set SMACK process label: %m");
3215 bset
= context
->capability_bounding_set
;
3216 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3217 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3218 * instead of us doing that */
3219 if (needs_ambient_hack
)
3220 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
3221 (UINT64_C(1) << CAP_SETUID
) |
3222 (UINT64_C(1) << CAP_SETGID
);
3224 if (!cap_test_all(bset
)) {
3225 r
= capability_bounding_set_drop(bset
, false);
3227 *exit_status
= EXIT_CAPABILITIES
;
3228 return log_unit_error_errno(unit
, r
, "Failed to drop capabilities: %m");
3232 /* This is done before enforce_user, but ambient set
3233 * does not survive over setresuid() if keep_caps is not set. */
3234 if (!needs_ambient_hack
&&
3235 context
->capability_ambient_set
!= 0) {
3236 r
= capability_ambient_set_apply(context
->capability_ambient_set
, true);
3238 *exit_status
= EXIT_CAPABILITIES
;
3239 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (before UID change): %m");
3245 if (context
->user
) {
3246 r
= enforce_user(context
, uid
);
3248 *exit_status
= EXIT_USER
;
3249 return log_unit_error_errno(unit
, r
, "Failed to change UID to " UID_FMT
": %m", uid
);
3252 if (!needs_ambient_hack
&&
3253 context
->capability_ambient_set
!= 0) {
3255 /* Fix the ambient capabilities after user change. */
3256 r
= capability_ambient_set_apply(context
->capability_ambient_set
, false);
3258 *exit_status
= EXIT_CAPABILITIES
;
3259 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (after UID change): %m");
3262 /* If we were asked to change user and ambient capabilities
3263 * were requested, we had to add keep-caps to the securebits
3264 * so that we would maintain the inherited capability set
3265 * through the setresuid(). Make sure that the bit is added
3266 * also to the context secure_bits so that we don't try to
3267 * drop the bit away next. */
3269 secure_bits
|= 1<<SECURE_KEEP_CAPS
;
3274 if (needs_sandboxing
) {
3275 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3276 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3277 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3278 * are restricted. */
3282 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
3285 r
= setexeccon(exec_context
);
3287 *exit_status
= EXIT_SELINUX_CONTEXT
;
3288 return log_unit_error_errno(unit
, r
, "Failed to change SELinux context to %s: %m", exec_context
);
3295 if (use_apparmor
&& context
->apparmor_profile
) {
3296 r
= aa_change_onexec(context
->apparmor_profile
);
3297 if (r
< 0 && !context
->apparmor_profile_ignore
) {
3298 *exit_status
= EXIT_APPARMOR_PROFILE
;
3299 return log_unit_error_errno(unit
, errno
, "Failed to prepare AppArmor profile change to %s: %m", context
->apparmor_profile
);
3304 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3305 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3306 if (prctl(PR_GET_SECUREBITS
) != secure_bits
)
3307 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
3308 *exit_status
= EXIT_SECUREBITS
;
3309 return log_unit_error_errno(unit
, errno
, "Failed to set process secure bits: %m");
3312 if (context_has_no_new_privileges(context
))
3313 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
3314 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
3315 return log_unit_error_errno(unit
, errno
, "Failed to disable new privileges: %m");
3319 r
= apply_address_families(unit
, context
);
3321 *exit_status
= EXIT_ADDRESS_FAMILIES
;
3322 return log_unit_error_errno(unit
, r
, "Failed to restrict address families: %m");
3325 r
= apply_memory_deny_write_execute(unit
, context
);
3327 *exit_status
= EXIT_SECCOMP
;
3328 return log_unit_error_errno(unit
, r
, "Failed to disable writing to executable memory: %m");
3331 r
= apply_restrict_realtime(unit
, context
);
3333 *exit_status
= EXIT_SECCOMP
;
3334 return log_unit_error_errno(unit
, r
, "Failed to apply realtime restrictions: %m");
3337 r
= apply_restrict_namespaces(unit
, context
);
3339 *exit_status
= EXIT_SECCOMP
;
3340 return log_unit_error_errno(unit
, r
, "Failed to apply namespace restrictions: %m");
3343 r
= apply_protect_sysctl(unit
, context
);
3345 *exit_status
= EXIT_SECCOMP
;
3346 return log_unit_error_errno(unit
, r
, "Failed to apply sysctl restrictions: %m");
3349 r
= apply_protect_kernel_modules(unit
, context
);
3351 *exit_status
= EXIT_SECCOMP
;
3352 return log_unit_error_errno(unit
, r
, "Failed to apply module loading restrictions: %m");
3355 r
= apply_private_devices(unit
, context
);
3357 *exit_status
= EXIT_SECCOMP
;
3358 return log_unit_error_errno(unit
, r
, "Failed to set up private devices: %m");
3361 r
= apply_syscall_archs(unit
, context
);
3363 *exit_status
= EXIT_SECCOMP
;
3364 return log_unit_error_errno(unit
, r
, "Failed to apply syscall architecture restrictions: %m");
3367 r
= apply_lock_personality(unit
, context
);
3369 *exit_status
= EXIT_SECCOMP
;
3370 return log_unit_error_errno(unit
, r
, "Failed to lock personalities: %m");
3373 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3374 * by the filter as little as possible. */
3375 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
3377 *exit_status
= EXIT_SECCOMP
;
3378 return log_unit_error_errno(unit
, r
, "Failed to apply system call filters: %m");
3383 if (!strv_isempty(context
->unset_environment
)) {
3386 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
3388 *exit_status
= EXIT_MEMORY
;
3392 strv_free(accum_env
);
3396 final_argv
= replace_env_argv(argv
, accum_env
);
3398 *exit_status
= EXIT_MEMORY
;
3402 if (_unlikely_(log_get_max_level() >= LOG_DEBUG
)) {
3403 _cleanup_free_
char *line
;
3405 line
= exec_command_line(final_argv
);
3407 log_struct(LOG_DEBUG
,
3408 "EXECUTABLE=%s", command
->path
,
3409 LOG_UNIT_MESSAGE(unit
, "Executing: %s", line
),
3411 LOG_UNIT_INVOCATION_ID(unit
),
3416 execve(command
->path
, final_argv
, accum_env
);
3418 if (errno
== ENOENT
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
)) {
3420 log_struct_errno(LOG_INFO
, errno
,
3421 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3423 LOG_UNIT_INVOCATION_ID(unit
),
3424 LOG_UNIT_MESSAGE(unit
, "Executable %s missing, skipping: %m",
3426 "EXECUTABLE=%s", command
->path
,
3432 *exit_status
= EXIT_EXEC
;
3433 return log_unit_error_errno(unit
, errno
, "Failed to execute command: %m");
/* Spawns `command` on behalf of `unit`.
 *
 * Visible steps: validate the socket fd set-up, resolve named stdio fds (exec_context_named_iofds()), load
 * the environment files (exec_context_load_environment()), log the command line that is about to be run,
 * fork, run exec_child() and log the failing step if it returns an error, and finally attach the new PID to
 * params->cgroup_path (result ignored) and record the start in command->exec_status.
 *
 * The error paths visible here return the value of log_unit_error_errno().
 *
 * NOTE(review): this listing omits several lines of the function (remaining parameters, the fork() call,
 * the exit paths), so the description above covers only the statements that are visible. */
3436 int exec_spawn(Unit
*unit
,
3437 ExecCommand
*command
,
3438 const ExecContext
*context
,
3439 const ExecParameters
*params
,
3440 ExecRuntime
*runtime
,
3441 DynamicCreds
*dcreds
,
3444 _cleanup_strv_free_
char **files_env
= NULL
;
3446 unsigned n_storage_fds
= 0, n_socket_fds
= 0;
3447 _cleanup_free_
char *line
= NULL
;
3449 int named_iofds
[3] = { -1, -1, -1 };
3458 assert(params
->fds
|| (params
->n_storage_fds
+ params
->n_socket_fds
<= 0));
/* If any of stdin/stdout/stderr is connected to a socket, exactly one socket fd must have been passed;
 * it is then taken from params->fds[0]. */
3460 if (context
->std_input
== EXEC_INPUT_SOCKET
||
3461 context
->std_output
== EXEC_OUTPUT_SOCKET
||
3462 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
3464 if (params
->n_socket_fds
> 1) {
3465 log_unit_error(unit
, "Got more than one socket.");
3469 if (params
->n_socket_fds
== 0) {
3470 log_unit_error(unit
, "Got no socket.");
3474 socket_fd
= params
->fds
[0];
3478 n_storage_fds
= params
->n_storage_fds
;
3479 n_socket_fds
= params
->n_socket_fds
;
3482 r
= exec_context_named_iofds(unit
, context
, params
, named_iofds
);
3484 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
3486 r
= exec_context_load_environment(unit
, context
, &files_env
);
3488 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
/* An argv override supplied in params takes precedence over the command's own argv. */
3490 argv
= params
->argv
?: command
->argv
;
3491 line
= exec_command_line(argv
);
3495 log_struct(LOG_DEBUG
,
3496 LOG_UNIT_MESSAGE(unit
, "About to execute: %s", line
),
3497 "EXECUTABLE=%s", command
->path
,
3499 LOG_UNIT_INVOCATION_ID(unit
),
3504 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
3507 int exit_status
= EXIT_SUCCESS
;
3509 r
= exec_child(unit
,
3522 unit
->manager
->user_lookup_fds
[1],
3526 log_struct_errno(LOG_ERR
, r
,
3527 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
3529 LOG_UNIT_INVOCATION_ID(unit
),
3530 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
3531 exit_status_to_string(exit_status
, EXIT_STATUS_SYSTEMD
),
3533 "EXECUTABLE=%s", command
->path
,
3540 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
3542 /* We add the new process to the cgroup both in the child (so
3543 * that we can be sure that no user code is ever executed
3544 * outside of the cgroup) and in the parent (so that we can be
3545 * sure that when we kill the cgroup the process will be
3547 if (params
->cgroup_path
)
3548 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, pid
);
3550 exec_status_start(&command
->exec_status
, pid
);
/* Fills in the defaults of an ExecContext that are not simply zero.
 *
 * Visible defaults: I/O priority IOPRIO_CLASS_BE at level 0, CPU scheduling policy SCHED_OTHER, syslog
 * priority LOG_DAEMON|LOG_INFO with level prefixes enabled, SIGPIPE ignored, timer slack NSEC_INFINITY and
 * personality PERSONALITY_INVALID (both are the values exec_child() compares against before applying the
 * setting), mode 0755 for every exec directory type, capability bounding set CAP_ALL, and
 * restrict_namespaces NAMESPACE_FLAGS_ALL.
 *
 * log_level_max is set to -1 (presumably meaning "no limit configured" -- TODO confirm).
 *
 * NOTE(review): some lines of this function are missing from this listing. */
3556 void exec_context_init(ExecContext
*c
) {
3557 ExecDirectoryType i
;
3562 c
->ioprio
= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 0);
3563 c
->cpu_sched_policy
= SCHED_OTHER
;
3564 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
3565 c
->syslog_level_prefix
= true;
3566 c
->ignore_sigpipe
= true;
3567 c
->timer_slack_nsec
= NSEC_INFINITY
;
3568 c
->personality
= PERSONALITY_INVALID
;
/* Every exec directory type gets the same default access mode. */
3569 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3570 c
->directories
[i
].mode
= 0755;
3571 c
->capability_bounding_set
= CAP_ALL
;
3572 c
->restrict_namespaces
= NAMESPACE_FLAGS_ALL
;
3573 c
->log_level_max
= -1;
/* Releases the resources held by an ExecContext without freeing the ExecContext itself (no free of `c` is
 * visible here).
 *
 * Each owned string, string vector, hashmap and set is passed to its free helper and the field is
 * reassigned from the helper's return value (presumably NULL -- TODO confirm), so the context does not
 * keep pointing at the freed memory. In addition the bind mounts and the CPU set are freed, log_level_max
 * goes back to -1, the extra log fields are released and stdin_data_size is reset to 0.
 *
 * NOTE(review): some lines of this function are missing from this listing. */
3576 void exec_context_done(ExecContext
*c
) {
3577 ExecDirectoryType i
;
3582 c
->environment
= strv_free(c
->environment
);
3583 c
->environment_files
= strv_free(c
->environment_files
);
3584 c
->pass_environment
= strv_free(c
->pass_environment
);
3585 c
->unset_environment
= strv_free(c
->unset_environment
);
/* One entry per slot of the rlimit array. */
3587 for (l
= 0; l
< ELEMENTSOF(c
->rlimit
); l
++)
3588 c
->rlimit
[l
] = mfree(c
->rlimit
[l
]);
/* One fd name and one file path per stdio slot (indexed by STDIN_FILENO..STDERR_FILENO elsewhere). */
3590 for (l
= 0; l
< 3; l
++) {
3591 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
3592 c
->stdio_file
[l
] = mfree(c
->stdio_file
[l
]);
3595 c
->working_directory
= mfree(c
->working_directory
);
3596 c
->root_directory
= mfree(c
->root_directory
);
3597 c
->root_image
= mfree(c
->root_image
);
3598 c
->tty_path
= mfree(c
->tty_path
);
3599 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
3600 c
->user
= mfree(c
->user
);
3601 c
->group
= mfree(c
->group
);
3603 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
3605 c
->pam_name
= mfree(c
->pam_name
);
3607 c
->read_only_paths
= strv_free(c
->read_only_paths
);
3608 c
->read_write_paths
= strv_free(c
->read_write_paths
);
3609 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
3611 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
3614 CPU_FREE(c
->cpuset
);
3616 c
->utmp_id
= mfree(c
->utmp_id
);
3617 c
->selinux_context
= mfree(c
->selinux_context
);
3618 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
3619 c
->smack_process_label
= mfree(c
->smack_process_label
);
3621 c
->syscall_filter
= hashmap_free(c
->syscall_filter
);
3622 c
->syscall_archs
= set_free(c
->syscall_archs
);
3623 c
->address_families
= set_free(c
->address_families
);
/* Only the path lists of the exec directories are released; the mode field is left untouched here. */
3625 for (i
= 0; i
< _EXEC_DIRECTORY_TYPE_MAX
; i
++)
3626 c
->directories
[i
].paths
= strv_free(c
->directories
[i
].paths
);
3628 c
->log_level_max
= -1;
3630 exec_context_free_log_extra_fields(c
);
3632 c
->stdin_data
= mfree(c
->stdin_data
);
3633 c
->stdin_data_size
= 0;
/* Removes the runtime directories of a context from disk.
 *
 * For every entry of c->directories[EXEC_DIRECTORY_RUNTIME].paths the path "<runtime_prefix>/<entry>" is
 * built with strjoin() and removed with rm_rf(..., REMOVE_ROOT); the result of rm_rf() is explicitly
 * ignored. A NULL runtime_prefix is checked for up front.
 *
 * NOTE(review): the return statements and the handling of a failed strjoin() are not visible in this
 * listing, so the exact return values are not documented here. */
3636 int exec_context_destroy_runtime_directory(ExecContext
*c
, const char *runtime_prefix
) {
3641 if (!runtime_prefix
)
3644 STRV_FOREACH(i
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].paths
) {
3645 _cleanup_free_
char *p
;
3647 p
= strjoin(runtime_prefix
, "/", *i
);
3651 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3653 (void) rm_rf(p
, REMOVE_ROOT
);
/* Releases the members owned by an ExecCommand: the path string and the argv string vector. Both fields
 * are reassigned from the free helpers' return values. The visible code does not free the ExecCommand
 * structure itself. */
3659 void exec_command_done(ExecCommand
*c
) {
3662 c
->path
= mfree(c
->path
);
3664 c
->argv
= strv_free(c
->argv
);
/* Calls exec_command_done() on each of the n ExecCommand elements of the array starting at c
 * (elements are addressed as c+i, i.e. the commands are stored by value, not as a linked list). */
3667 void exec_command_done_array(ExecCommand
*c
, unsigned n
) {
3670 for (i
= 0; i
< n
; i
++)
3671 exec_command_done(c
+i
);
/* Tears down a linked list of commands headed by c: each entry i is unlinked with
 * LIST_REMOVE(command, c, i) and its members are released with exec_command_done().
 *
 * Returns an ExecCommand pointer that callers store back into the variable that held the list (see
 * exec_command_free_array()).
 *
 * NOTE(review): the loop header, the release of the entry itself and the return statement are not visible
 * in this listing; presumably the entry is freed and NULL is returned -- TODO confirm. */
3674 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
3678 LIST_REMOVE(command
, c
, i
);
3679 exec_command_done(i
);
/* Frees n command lists: each slot c[i] holds the head of a list, which is handed to
 * exec_command_free_list(); the slot is then overwritten with that function's return value. */
3686 void exec_command_free_array(ExecCommand
**c
, unsigned n
) {
3689 for (i
= 0; i
< n
; i
++)
3690 c
[i
] = exec_command_free_list(c
[i
]);
/* Context record handed as userdata to invalid_env(). That callback reads a `unit` member (used as the
 * logging target) and a `path` member (the environment file being processed).
 * NOTE(review): the member declarations themselves are not visible in this listing. */
3693 typedef struct InvalidEnvInfo
{
/* Callback passed to strv_env_clean_with_callback() by exec_context_load_environment().
 *
 * p is the rejected environment assignment; userdata must point to an InvalidEnvInfo. Logs an error on
 * info->unit that names both the assignment and the file (info->path) it was read from. */
3698 static void invalid_env(const char *p
, void *userdata
) {
3699 InvalidEnvInfo
*info
= userdata
;
3701 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
/* Returns the file descriptor name to look up for one of the three stdio slots.
 *
 * fd_index selects the slot (the visible code indexes stdio_fdname[] with STDIN_FILENO, STDOUT_FILENO and
 * STDERR_FILENO). When the slot is configured as a named fd, the result is the configured
 * c->stdio_fdname[] entry, falling back to the literal "stdin", "stdout" or "stderr" when no name is set.
 *
 * NOTE(review): the dispatch on fd_index and the value returned when the slot is not a named fd are not
 * visible in this listing; exec_context_named_iofds() tests the result for NULL, so presumably NULL is
 * returned in that case -- TODO confirm. */
3704 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
3710 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
3713 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
3716 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
3719 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
3722 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
3725 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
/* Resolves the stdio slots that are configured as named fds to actual file descriptors.
 *
 * The fds passed in p (p->n_storage_fds + p->n_socket_fds entries of p->fds, named by p->fd_names) are
 * scanned in order. For each stdio slot configured as EXEC_INPUT_NAMED_FD / EXEC_OUTPUT_NAMED_FD whose
 * wanted name (from exec_context_fdname()) equals p->fd_names[i], p->fds[i] is stored in
 * named_iofds[slot]. A slot is only filled while it is still < 0, so the first matching fd wins. Because
 * the three tests form an if / else-if chain, one passed fd satisfies at most one slot.
 *
 * Returns 0 when `targets` has reached 0, -ENOENT otherwise.
 *
 * NOTE(review): the statements that decrement `targets` on a match are not visible in this listing;
 * presumably each successful match decrements it -- TODO confirm. */
3732 int exec_context_named_iofds(Unit
*unit
, const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[3]) {
3733 unsigned i
, targets
;
3734 const char* stdio_fdname
[3];
/* Number of stdio slots that are configured as named fds (each comparison contributes 0 or 1). */
3740 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
3741 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
3742 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
3744 for (i
= 0; i
< 3; i
++)
3745 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
3747 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
/* Stop scanning as soon as nothing is left to resolve. */
3749 for (i
= 0; i
< n_fds
&& targets
> 0; i
++)
3750 if (named_iofds
[STDIN_FILENO
] < 0 &&
3751 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
3752 stdio_fdname
[STDIN_FILENO
] &&
3753 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
3755 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
3758 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
3759 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
3760 stdio_fdname
[STDOUT_FILENO
] &&
3761 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
3763 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
3766 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
3767 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
3768 stdio_fdname
[STDERR_FILENO
] &&
3769 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
3771 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
3775 return targets
== 0 ? 0 : -ENOENT
;
/* Loads the environment files configured in c->environment_files and accumulates their assignments.
 *
 * Visible processing per entry: the file name is checked to be an absolute path, expanded with
 * safe_glob(), and every match is read with load_env_file(). The resulting assignments are cleaned with
 * strv_env_clean_with_callback(), which reports rejected entries through invalid_env() together with the
 * file name, and then merged into the accumulated list with strv_env_merge(2, r, p).
 *
 * NOTE(review): error handling, the role of the `ignore` flag and the hand-over of the result through `l`
 * are not visible in this listing; presumably *l receives the merged vector on success -- TODO confirm. */
3778 int exec_context_load_environment(Unit
*unit
, const ExecContext
*c
, char ***l
) {
3779 char **i
, **r
= NULL
;
3784 STRV_FOREACH(i
, c
->environment_files
) {
3788 bool ignore
= false;
3790 _cleanup_globfree_ glob_t pglob
= {};
3799 if (!path_is_absolute(fn
)) {
3807 /* Filename supports globbing, take all matching files */
3808 k
= safe_glob(fn
, 0, &pglob
);
3817 /* When we don't match anything, -ENOENT should be returned */
3818 assert(pglob
.gl_pathc
> 0);
3820 for (n
= 0; n
< pglob
.gl_pathc
; n
++) {
3821 k
= load_env_file(NULL
, pglob
.gl_pathv
[n
], NULL
, &p
);
3829 /* Log invalid environment variables with filename */
3831 InvalidEnvInfo info
= {
3833 .path
= pglob
.gl_pathv
[n
]
3836 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
3844 m
= strv_env_merge(2, r
, p
);
/* Tells whether the given tty may be the device behind /dev/console.
 *
 * The tty name is first passed through skip_dev_prefix(). The answer is then yes if the name is literally
 * "console", if it equals the name obtained from resolve_dev_console(), or if the console resolves to
 * "tty0" and the tty is a virtual console according to tty_is_vc(). Per the comments in the body, a
 * console that cannot be resolved is treated as a possible match.
 *
 * NOTE(review): the return statements of the early exits are not visible in this listing. */
3860 static bool tty_may_match_dev_console(const char *tty
) {
3861 _cleanup_free_
char *active
= NULL
;
3867 tty
= skip_dev_prefix(tty
);
3869 /* trivial identity? */
3870 if (streq(tty
, "console"))
3873 console
= resolve_dev_console(&active
);
3874 /* if we could not resolve, assume it may */
3878 /* "tty0" means the active VC, so it may be the same sometimes */
3879 return streq(console
, tty
) || (streq(console
, "tty0") && tty_is_vc(tty
));
/* Returns true when the context both uses a terminal in some way and that terminal may be /dev/console.
 *
 * Visible "uses a terminal" conditions: tty_reset, tty_vt_disallocate, or a terminal kind of standard
 * input, output or error (is_terminal_input() / is_terminal_output()). The terminal is the one reported
 * by exec_context_tty_path(), and it is checked with tty_may_match_dev_console().
 *
 * NOTE(review): at least one line of the condition is missing from this listing. */
3882 bool exec_context_may_touch_console(ExecContext
*ec
) {
3884 return (ec
->tty_reset
||
3886 ec
->tty_vt_disallocate
||
3887 is_terminal_input(ec
->std_input
) ||
3888 is_terminal_output(ec
->std_output
) ||
3889 is_terminal_output(ec
->std_error
)) &&
3890 tty_may_match_dev_console(exec_context_tty_path(ec
));
/* Writes strings to f, each one preceded by a single space and with no trailing newline.
 * NOTE(review): the loop that walks l is not visible in this listing; presumably every entry of l is
 * printed this way -- TODO confirm. */
3893 static void strv_fprintf(FILE *f
, char **l
) {
3899 fprintf(f
, " %s", *g
);
3902 void exec_context_dump(ExecContext
*c
, FILE* f
, const char *prefix
) {
3903 ExecDirectoryType dt
;
3911 prefix
= strempty(prefix
);
3915 "%sWorkingDirectory: %s\n"
3916 "%sRootDirectory: %s\n"
3917 "%sNonBlocking: %s\n"
3918 "%sPrivateTmp: %s\n"
3919 "%sPrivateDevices: %s\n"
3920 "%sProtectKernelTunables: %s\n"
3921 "%sProtectKernelModules: %s\n"
3922 "%sProtectControlGroups: %s\n"
3923 "%sPrivateNetwork: %s\n"
3924 "%sPrivateUsers: %s\n"
3925 "%sProtectHome: %s\n"
3926 "%sProtectSystem: %s\n"
3927 "%sMountAPIVFS: %s\n"
3928 "%sIgnoreSIGPIPE: %s\n"
3929 "%sMemoryDenyWriteExecute: %s\n"
3930 "%sRestrictRealtime: %s\n"
3931 "%sKeyringMode: %s\n",
3933 prefix
, c
->working_directory
? c
->working_directory
: "/",
3934 prefix
, c
->root_directory
? c
->root_directory
: "/",
3935 prefix
, yes_no(c
->non_blocking
),
3936 prefix
, yes_no(c
->private_tmp
),
3937 prefix
, yes_no(c
->private_devices
),
3938 prefix
, yes_no(c
->protect_kernel_tunables
),
3939 prefix
, yes_no(c
->protect_kernel_modules
),
3940 prefix
, yes_no(c
->protect_control_groups
),
3941 prefix
, yes_no(c
->private_network
),
3942 prefix
, yes_no(c
->private_users
),
3943 prefix
, protect_home_to_string(c
->protect_home
),
3944 prefix
, protect_system_to_string(c
->protect_system
),
3945 prefix
, yes_no(c
->mount_apivfs
),
3946 prefix
, yes_no(c
->ignore_sigpipe
),
3947 prefix
, yes_no(c
->memory_deny_write_execute
),
3948 prefix
, yes_no(c
->restrict_realtime
),
3949 prefix
, exec_keyring_mode_to_string(c
->keyring_mode
));
3952 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
3954 STRV_FOREACH(e
, c
->environment
)
3955 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
3957 STRV_FOREACH(e
, c
->environment_files
)
3958 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
3960 STRV_FOREACH(e
, c
->pass_environment
)
3961 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
3963 STRV_FOREACH(e
, c
->unset_environment
)
3964 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
3966 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
3968 for (dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
3969 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
3971 STRV_FOREACH(d
, c
->directories
[dt
].paths
)
3972 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), *d
);
3980 if (c
->oom_score_adjust_set
)
3982 "%sOOMScoreAdjust: %i\n",
3983 prefix
, c
->oom_score_adjust
);
3985 for (i
= 0; i
< RLIM_NLIMITS
; i
++)
3987 fprintf(f
, "%s%s: " RLIM_FMT
"\n",
3988 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
3989 fprintf(f
, "%s%sSoft: " RLIM_FMT
"\n",
3990 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
3993 if (c
->ioprio_set
) {
3994 _cleanup_free_
char *class_str
= NULL
;
3996 r
= ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c
->ioprio
), &class_str
);
3998 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
4000 fprintf(f
, "%sIOPriority: %lu\n", prefix
, IOPRIO_PRIO_DATA(c
->ioprio
));
4003 if (c
->cpu_sched_set
) {
4004 _cleanup_free_
char *policy_str
= NULL
;
4006 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
4008 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
4011 "%sCPUSchedulingPriority: %i\n"
4012 "%sCPUSchedulingResetOnFork: %s\n",
4013 prefix
, c
->cpu_sched_priority
,
4014 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
4018 fprintf(f
, "%sCPUAffinity:", prefix
);
4019 for (i
= 0; i
< c
->cpuset_ncpus
; i
++)
4020 if (CPU_ISSET_S(i
, CPU_ALLOC_SIZE(c
->cpuset_ncpus
), c
->cpuset
))
4021 fprintf(f
, " %u", i
);
4025 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
4026 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
4029 "%sStandardInput: %s\n"
4030 "%sStandardOutput: %s\n"
4031 "%sStandardError: %s\n",
4032 prefix
, exec_input_to_string(c
->std_input
),
4033 prefix
, exec_output_to_string(c
->std_output
),
4034 prefix
, exec_output_to_string(c
->std_error
));
4036 if (c
->std_input
== EXEC_INPUT_NAMED_FD
)
4037 fprintf(f
, "%sStandardInputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDIN_FILENO
]);
4038 if (c
->std_output
== EXEC_OUTPUT_NAMED_FD
)
4039 fprintf(f
, "%sStandardOutputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDOUT_FILENO
]);
4040 if (c
->std_error
== EXEC_OUTPUT_NAMED_FD
)
4041 fprintf(f
, "%sStandardErrorFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDERR_FILENO
]);
4043 if (c
->std_input
== EXEC_INPUT_FILE
)
4044 fprintf(f
, "%sStandardInputFile: %s\n", prefix
, c
->stdio_file
[STDIN_FILENO
]);
4045 if (c
->std_output
== EXEC_OUTPUT_FILE
)
4046 fprintf(f
, "%sStandardOutputFile: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
4047 if (c
->std_error
== EXEC_OUTPUT_FILE
)
4048 fprintf(f
, "%sStandardErrorFile: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
4054 "%sTTYVHangup: %s\n"
4055 "%sTTYVTDisallocate: %s\n",
4056 prefix
, c
->tty_path
,
4057 prefix
, yes_no(c
->tty_reset
),
4058 prefix
, yes_no(c
->tty_vhangup
),
4059 prefix
, yes_no(c
->tty_vt_disallocate
));
4061 if (IN_SET(c
->std_output
,
4064 EXEC_OUTPUT_JOURNAL
,
4065 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
4066 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
4067 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
4068 IN_SET(c
->std_error
,
4071 EXEC_OUTPUT_JOURNAL
,
4072 EXEC_OUTPUT_SYSLOG_AND_CONSOLE
,
4073 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
4074 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
4076 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
4078 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
4080 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
4082 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
4084 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
4087 if (c
->log_level_max
>= 0) {
4088 _cleanup_free_
char *t
= NULL
;
4090 (void) log_level_to_string_alloc(c
->log_level_max
, &t
);
4092 fprintf(f
, "%sLogLevelMax: %s\n", prefix
, strna(t
));
4095 if (c
->n_log_extra_fields
> 0) {
4098 for (j
= 0; j
< c
->n_log_extra_fields
; j
++) {
4099 fprintf(f
, "%sLogExtraFields: ", prefix
);
4100 fwrite(c
->log_extra_fields
[j
].iov_base
,
4101 1, c
->log_extra_fields
[j
].iov_len
,
4107 if (c
->secure_bits
) {
4108 _cleanup_free_
char *str
= NULL
;
4110 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
4112 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
4115 if (c
->capability_bounding_set
!= CAP_ALL
) {
4116 _cleanup_free_
char *str
= NULL
;
4118 r
= capability_set_to_string_alloc(c
->capability_bounding_set
, &str
);
4120 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
4123 if (c
->capability_ambient_set
!= 0) {
4124 _cleanup_free_
char *str
= NULL
;
4126 r
= capability_set_to_string_alloc(c
->capability_ambient_set
, &str
);
4128 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
4132 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
4134 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
4136 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
4138 if (!strv_isempty(c
->supplementary_groups
)) {
4139 fprintf(f
, "%sSupplementaryGroups:", prefix
);
4140 strv_fprintf(f
, c
->supplementary_groups
);
4145 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
4147 if (strv_length(c
->read_write_paths
) > 0) {
4148 fprintf(f
, "%sReadWritePaths:", prefix
);
4149 strv_fprintf(f
, c
->read_write_paths
);
4153 if (strv_length(c
->read_only_paths
) > 0) {
4154 fprintf(f
, "%sReadOnlyPaths:", prefix
);
4155 strv_fprintf(f
, c
->read_only_paths
);
4159 if (strv_length(c
->inaccessible_paths
) > 0) {
4160 fprintf(f
, "%sInaccessiblePaths:", prefix
);
4161 strv_fprintf(f
, c
->inaccessible_paths
);
4165 if (c
->n_bind_mounts
> 0)
4166 for (i
= 0; i
< c
->n_bind_mounts
; i
++) {
4167 fprintf(f
, "%s%s: %s:%s:%s\n", prefix
,
4168 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
4169 c
->bind_mounts
[i
].source
,
4170 c
->bind_mounts
[i
].destination
,
4171 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
4176 "%sUtmpIdentifier: %s\n",
4177 prefix
, c
->utmp_id
);
4179 if (c
->selinux_context
)
4181 "%sSELinuxContext: %s%s\n",
4182 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
4184 if (c
->apparmor_profile
)
4186 "%sAppArmorProfile: %s%s\n",
4187 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
4189 if (c
->smack_process_label
)
4191 "%sSmackProcessLabel: %s%s\n",
4192 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
4194 if (c
->personality
!= PERSONALITY_INVALID
)
4196 "%sPersonality: %s\n",
4197 prefix
, strna(personality_to_string(c
->personality
)));
4200 "%sLockPersonality: %s\n",
4201 prefix
, yes_no(c
->lock_personality
));
4203 if (c
->syscall_filter
) {
4211 "%sSystemCallFilter: ",
4214 if (!c
->syscall_whitelist
)
4218 HASHMAP_FOREACH_KEY(val
, id
, c
->syscall_filter
, j
) {
4219 _cleanup_free_
char *name
= NULL
;
4220 const char *errno_name
= NULL
;
4221 int num
= PTR_TO_INT(val
);
4228 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
4229 fputs(strna(name
), f
);
4232 errno_name
= errno_to_name(num
);
4234 fprintf(f
, ":%s", errno_name
);
4236 fprintf(f
, ":%d", num
);
4244 if (c
->syscall_archs
) {
4251 "%sSystemCallArchitectures:",
4255 SET_FOREACH(id
, c
->syscall_archs
, j
)
4256 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
4261 if (exec_context_restrict_namespaces_set(c
)) {
4262 _cleanup_free_
char *s
= NULL
;
4264 r
= namespace_flag_to_string_many(c
->restrict_namespaces
, &s
);
4266 fprintf(f
, "%sRestrictNamespaces: %s\n",
4270 if (c
->syscall_errno
> 0) {
4271 const char *errno_name
;
4273 fprintf(f
, "%sSystemCallErrorNumber: ", prefix
);
4275 errno_name
= errno_to_name(c
->syscall_errno
);
4277 fprintf(f
, "%s\n", errno_name
);
4279 fprintf(f
, "%d\n", c
->syscall_errno
);
4282 if (c
->apparmor_profile
)
4284 "%sAppArmorProfile: %s%s\n",
4285 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
4288 bool exec_context_maintains_privileges(ExecContext
*c
) {
4291 /* Returns true if the process forked off would run under
4292 * an unchanged UID or as root. */
4297 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
4303 int exec_context_get_effective_ioprio(ExecContext
*c
) {
4311 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
4313 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE
, 4);
4318 void exec_context_free_log_extra_fields(ExecContext
*c
) {
4323 for (l
= 0; l
< c
->n_log_extra_fields
; l
++)
4324 free(c
->log_extra_fields
[l
].iov_base
);
4325 c
->log_extra_fields
= mfree(c
->log_extra_fields
);
4326 c
->n_log_extra_fields
= 0;
4329 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
4334 dual_timestamp_get(&s
->start_timestamp
);
4337 void exec_status_exit(ExecStatus
*s
, ExecContext
*context
, pid_t pid
, int code
, int status
) {
4340 if (s
->pid
&& s
->pid
!= pid
)
4344 dual_timestamp_get(&s
->exit_timestamp
);
4350 if (context
->utmp_id
)
4351 utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
4353 exec_context_tty_reset(context
, NULL
);
4357 void exec_status_dump(ExecStatus
*s
, FILE *f
, const char *prefix
) {
4358 char buf
[FORMAT_TIMESTAMP_MAX
];
4366 prefix
= strempty(prefix
);
4369 "%sPID: "PID_FMT
"\n",
4372 if (dual_timestamp_is_set(&s
->start_timestamp
))
4374 "%sStart Timestamp: %s\n",
4375 prefix
, format_timestamp(buf
, sizeof(buf
), s
->start_timestamp
.realtime
));
4377 if (dual_timestamp_is_set(&s
->exit_timestamp
))
4379 "%sExit Timestamp: %s\n"
4381 "%sExit Status: %i\n",
4382 prefix
, format_timestamp(buf
, sizeof(buf
), s
->exit_timestamp
.realtime
),
4383 prefix
, sigchld_code_to_string(s
->code
),
4387 char *exec_command_line(char **argv
) {
4395 STRV_FOREACH(a
, argv
)
4403 STRV_FOREACH(a
, argv
) {
4410 if (strpbrk(*a
, WHITESPACE
)) {
4421 /* FIXME: this doesn't really handle arguments that have
4422 * spaces and ticks in them */
4427 void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4428 _cleanup_free_
char *cmd
= NULL
;
4429 const char *prefix2
;
4434 prefix
= strempty(prefix
);
4435 prefix2
= strjoina(prefix
, "\t");
4437 cmd
= exec_command_line(c
->argv
);
4439 "%sCommand Line: %s\n",
4440 prefix
, cmd
? cmd
: strerror(ENOMEM
));
4442 exec_status_dump(&c
->exec_status
, f
, prefix2
);
4445 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
4448 prefix
= strempty(prefix
);
4450 LIST_FOREACH(command
, c
, c
)
4451 exec_command_dump(c
, f
, prefix
);
4454 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
4461 /* It's kind of important, that we keep the order here */
4462 LIST_FIND_TAIL(command
, *l
, end
);
4463 LIST_INSERT_AFTER(command
, *l
, end
, e
);
4468 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
4476 l
= strv_new_ap(path
, ap
);
4497 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
4498 _cleanup_strv_free_
char **l
= NULL
;
4506 l
= strv_new_ap(path
, ap
);
4512 r
= strv_extend_strv(&c
->argv
, l
, false);
4520 static int exec_runtime_allocate(ExecRuntime
**rt
) {
4525 *rt
= new0(ExecRuntime
, 1);
4530 (*rt
)->netns_storage_socket
[0] = (*rt
)->netns_storage_socket
[1] = -1;
4535 int exec_runtime_make(ExecRuntime
**rt
, ExecContext
*c
, const char *id
) {
4545 if (!c
->private_network
&& !c
->private_tmp
)
4548 r
= exec_runtime_allocate(rt
);
4552 if (c
->private_network
&& (*rt
)->netns_storage_socket
[0] < 0) {
4553 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, (*rt
)->netns_storage_socket
) < 0)
4557 if (c
->private_tmp
&& !(*rt
)->tmp_dir
) {
4558 r
= setup_tmp_dirs(id
, &(*rt
)->tmp_dir
, &(*rt
)->var_tmp_dir
);
4566 ExecRuntime
*exec_runtime_ref(ExecRuntime
*r
) {
4568 assert(r
->n_ref
> 0);
4574 ExecRuntime
*exec_runtime_unref(ExecRuntime
*r
) {
4579 assert(r
->n_ref
> 0);
4586 free(r
->var_tmp_dir
);
4587 safe_close_pair(r
->netns_storage_socket
);
4591 int exec_runtime_serialize(Unit
*u
, ExecRuntime
*rt
, FILE *f
, FDSet
*fds
) {
4600 unit_serialize_item(u
, f
, "tmp-dir", rt
->tmp_dir
);
4602 if (rt
->var_tmp_dir
)
4603 unit_serialize_item(u
, f
, "var-tmp-dir", rt
->var_tmp_dir
);
4605 if (rt
->netns_storage_socket
[0] >= 0) {
4608 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
4612 unit_serialize_item_format(u
, f
, "netns-socket-0", "%i", copy
);
4615 if (rt
->netns_storage_socket
[1] >= 0) {
4618 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
4622 unit_serialize_item_format(u
, f
, "netns-socket-1", "%i", copy
);
4628 int exec_runtime_deserialize_item(Unit
*u
, ExecRuntime
**rt
, const char *key
, const char *value
, FDSet
*fds
) {
4635 if (streq(key
, "tmp-dir")) {
4638 r
= exec_runtime_allocate(rt
);
4642 copy
= strdup(value
);
4646 free((*rt
)->tmp_dir
);
4647 (*rt
)->tmp_dir
= copy
;
4649 } else if (streq(key
, "var-tmp-dir")) {
4652 r
= exec_runtime_allocate(rt
);
4656 copy
= strdup(value
);
4660 free((*rt
)->var_tmp_dir
);
4661 (*rt
)->var_tmp_dir
= copy
;
4663 } else if (streq(key
, "netns-socket-0")) {
4666 r
= exec_runtime_allocate(rt
);
4670 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
))
4671 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
4673 safe_close((*rt
)->netns_storage_socket
[0]);
4674 (*rt
)->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
4676 } else if (streq(key
, "netns-socket-1")) {
4679 r
= exec_runtime_allocate(rt
);
4683 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
))
4684 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
4686 safe_close((*rt
)->netns_storage_socket
[1]);
4687 (*rt
)->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
4695 static void *remove_tmpdir_thread(void *p
) {
4696 _cleanup_free_
char *path
= p
;
4698 (void) rm_rf(path
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
4702 void exec_runtime_destroy(ExecRuntime
*rt
) {
4708 /* If there are multiple users of this, let's leave the stuff around */
4713 log_debug("Spawning thread to nuke %s", rt
->tmp_dir
);
4715 r
= asynchronous_job(remove_tmpdir_thread
, rt
->tmp_dir
);
4717 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->tmp_dir
);
4724 if (rt
->var_tmp_dir
) {
4725 log_debug("Spawning thread to nuke %s", rt
->var_tmp_dir
);
4727 r
= asynchronous_job(remove_tmpdir_thread
, rt
->var_tmp_dir
);
4729 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->var_tmp_dir
);
4730 free(rt
->var_tmp_dir
);
4733 rt
->var_tmp_dir
= NULL
;
4736 safe_close_pair(rt
->netns_storage_socket
);
4739 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
4740 [EXEC_INPUT_NULL
] = "null",
4741 [EXEC_INPUT_TTY
] = "tty",
4742 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
4743 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
4744 [EXEC_INPUT_SOCKET
] = "socket",
4745 [EXEC_INPUT_NAMED_FD
] = "fd",
4746 [EXEC_INPUT_DATA
] = "data",
4747 [EXEC_INPUT_FILE
] = "file",
4750 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
4752 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
4753 [EXEC_OUTPUT_INHERIT
] = "inherit",
4754 [EXEC_OUTPUT_NULL
] = "null",
4755 [EXEC_OUTPUT_TTY
] = "tty",
4756 [EXEC_OUTPUT_SYSLOG
] = "syslog",
4757 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE
] = "syslog+console",
4758 [EXEC_OUTPUT_KMSG
] = "kmsg",
4759 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
4760 [EXEC_OUTPUT_JOURNAL
] = "journal",
4761 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
4762 [EXEC_OUTPUT_SOCKET
] = "socket",
4763 [EXEC_OUTPUT_NAMED_FD
] = "fd",
4764 [EXEC_OUTPUT_FILE
] = "file",
4767 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
4769 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
4770 [EXEC_UTMP_INIT
] = "init",
4771 [EXEC_UTMP_LOGIN
] = "login",
4772 [EXEC_UTMP_USER
] = "user",
4775 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
4777 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
4778 [EXEC_PRESERVE_NO
] = "no",
4779 [EXEC_PRESERVE_YES
] = "yes",
4780 [EXEC_PRESERVE_RESTART
] = "restart",
4783 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
4785 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
4786 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
4787 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
4788 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
4789 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
4790 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
4793 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);
4795 static const char* const exec_keyring_mode_table
[_EXEC_KEYRING_MODE_MAX
] = {
4796 [EXEC_KEYRING_INHERIT
] = "inherit",
4797 [EXEC_KEYRING_PRIVATE
] = "private",
4798 [EXEC_KEYRING_SHARED
] = "shared",
4801 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode
, ExecKeyringMode
);