1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
6 #include <sys/eventfd.h>
10 #include <sys/personality.h>
11 #include <sys/prctl.h>
13 #include <sys/types.h>
19 #include <security/pam_appl.h>
23 #include <selinux/selinux.h>
31 #include <sys/apparmor.h>
34 #include "sd-messages.h"
38 #include "alloc-util.h"
40 #include "apparmor-util.h"
42 #include "argv-util.h"
47 #include "capability-util.h"
48 #include "cgroup-setup.h"
49 #include "chase-symlinks.h"
50 #include "chown-recursive.h"
51 #include "constants.h"
52 #include "cpu-set-util.h"
53 #include "creds-util.h"
54 #include "data-fd-util.h"
57 #include "errno-list.h"
60 #include "exit-status.h"
63 #include "format-util.h"
64 #include "glob-util.h"
65 #include "hexdecoct.h"
67 #include "ioprio-util.h"
72 #include "manager-dump.h"
73 #include "memory-util.h"
74 #include "missing_fs.h"
75 #include "missing_ioprio.h"
76 #include "mkdir-label.h"
77 #include "mount-util.h"
78 #include "mountpoint-util.h"
79 #include "namespace.h"
80 #include "parse-util.h"
81 #include "path-util.h"
82 #include "process-util.h"
83 #include "random-util.h"
84 #include "recurse-dir.h"
85 #include "rlimit-util.h"
88 #include "seccomp-util.h"
90 #include "securebits-util.h"
91 #include "selinux-util.h"
92 #include "signal-util.h"
93 #include "smack-util.h"
94 #include "socket-util.h"
95 #include "sort-util.h"
97 #include "stat-util.h"
98 #include "string-table.h"
99 #include "string-util.h"
101 #include "syslog-util.h"
102 #include "terminal-util.h"
103 #include "tmpfile-util.h"
104 #include "umask-util.h"
105 #include "unit-serialize.h"
106 #include "user-util.h"
107 #include "utmp-wtmp.h"
109 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
110 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
112 #define SNDBUF_SIZE (8*1024*1024)
114 static int shift_fds(int fds
[], size_t n_fds
) {
118 /* Modifies the fds array! (sorts it) */
122 for (int start
= 0;;) {
123 int restart_from
= -1;
125 for (int i
= start
; i
< (int) n_fds
; i
++) {
128 /* Already at right index? */
132 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
139 /* Hmm, the fd we wanted isn't free? Then
140 * let's remember that and try again from here */
141 if (nfd
!= i
+3 && restart_from
< 0)
145 if (restart_from
< 0)
148 start
= restart_from
;
154 static int flags_fds(
167 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
168 * O_NONBLOCK only applies to socket activation though. */
170 for (size_t i
= 0; i
< n_fds
; i
++) {
172 if (i
< n_socket_fds
) {
173 r
= fd_nonblock(fds
[i
], nonblock
);
178 /* We unconditionally drop FD_CLOEXEC from the fds,
179 * since after all we want to pass these fds to our
182 r
= fd_cloexec(fds
[i
], false);
190 static const char *exec_context_tty_path(const ExecContext
*context
) {
193 if (context
->stdio_as_fds
)
196 if (context
->tty_path
)
197 return context
->tty_path
;
199 return "/dev/console";
202 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
207 path
= exec_context_tty_path(context
);
209 if (context
->tty_vhangup
) {
210 if (p
&& p
->stdin_fd
>= 0)
211 (void) terminal_vhangup_fd(p
->stdin_fd
);
213 (void) terminal_vhangup(path
);
216 if (context
->tty_reset
) {
217 if (p
&& p
->stdin_fd
>= 0)
218 (void) reset_terminal_fd(p
->stdin_fd
, true);
220 (void) reset_terminal(path
);
223 if (p
&& p
->stdin_fd
>= 0)
224 (void) terminal_set_size_fd(p
->stdin_fd
, path
, context
->tty_rows
, context
->tty_cols
);
226 if (context
->tty_vt_disallocate
&& path
)
227 (void) vt_disallocate(path
);
230 static bool is_terminal_input(ExecInput i
) {
233 EXEC_INPUT_TTY_FORCE
,
234 EXEC_INPUT_TTY_FAIL
);
237 static bool is_terminal_output(ExecOutput o
) {
240 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
241 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
244 static bool is_kmsg_output(ExecOutput o
) {
247 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
250 static bool exec_context_needs_term(const ExecContext
*c
) {
253 /* Return true if the execution context suggests we should set $TERM to something useful. */
255 if (is_terminal_input(c
->std_input
))
258 if (is_terminal_output(c
->std_output
))
261 if (is_terminal_output(c
->std_error
))
264 return !!c
->tty_path
;
267 static int open_null_as(int flags
, int nfd
) {
272 fd
= open("/dev/null", flags
|O_NOCTTY
);
276 return move_fd(fd
, nfd
, false);
279 static int connect_journal_socket(
281 const char *log_namespace
,
285 uid_t olduid
= UID_INVALID
;
286 gid_t oldgid
= GID_INVALID
;
291 strjoina("/run/systemd/journal.", log_namespace
, "/stdout") :
292 "/run/systemd/journal/stdout";
294 if (gid_is_valid(gid
)) {
297 if (setegid(gid
) < 0)
301 if (uid_is_valid(uid
)) {
304 if (seteuid(uid
) < 0) {
310 r
= connect_unix_path(fd
, AT_FDCWD
, j
);
312 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
313 an LSM interferes. */
315 if (uid_is_valid(uid
))
316 (void) seteuid(olduid
);
319 if (gid_is_valid(gid
))
320 (void) setegid(oldgid
);
325 static int connect_logger_as(
327 const ExecContext
*context
,
328 const ExecParameters
*params
,
335 _cleanup_close_
int fd
= -EBADF
;
340 assert(output
< _EXEC_OUTPUT_MAX
);
344 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
348 r
= connect_journal_socket(fd
, context
->log_namespace
, uid
, gid
);
352 if (shutdown(fd
, SHUT_RD
) < 0)
355 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
365 context
->syslog_identifier
?: ident
,
366 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
367 context
->syslog_priority
,
368 !!context
->syslog_level_prefix
,
370 is_kmsg_output(output
),
371 is_terminal_output(output
)) < 0)
374 return move_fd(TAKE_FD(fd
), nfd
, false);
377 static int open_terminal_as(const char *path
, int flags
, int nfd
) {
383 fd
= open_terminal(path
, flags
| O_NOCTTY
);
387 return move_fd(fd
, nfd
, false);
390 static int acquire_path(const char *path
, int flags
, mode_t mode
) {
391 _cleanup_close_
int fd
= -EBADF
;
396 if (IN_SET(flags
& O_ACCMODE
, O_WRONLY
, O_RDWR
))
399 fd
= open(path
, flags
|O_NOCTTY
, mode
);
403 if (errno
!= ENXIO
) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
406 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
408 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
412 r
= connect_unix_path(fd
, AT_FDCWD
, path
);
413 if (IN_SET(r
, -ENOTSOCK
, -EINVAL
))
414 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
415 * wasn't an AF_UNIX socket after all */
420 if ((flags
& O_ACCMODE
) == O_RDONLY
)
421 r
= shutdown(fd
, SHUT_WR
);
422 else if ((flags
& O_ACCMODE
) == O_WRONLY
)
423 r
= shutdown(fd
, SHUT_RD
);
432 static int fixup_input(
433 const ExecContext
*context
,
435 bool apply_tty_stdin
) {
441 std_input
= context
->std_input
;
443 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
444 return EXEC_INPUT_NULL
;
446 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
447 return EXEC_INPUT_NULL
;
449 if (std_input
== EXEC_INPUT_DATA
&& context
->stdin_data_size
== 0)
450 return EXEC_INPUT_NULL
;
455 static int fixup_output(ExecOutput output
, int socket_fd
) {
457 if (output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
458 return EXEC_OUTPUT_INHERIT
;
463 static int setup_input(
464 const ExecContext
*context
,
465 const ExecParameters
*params
,
467 const int named_iofds
[static 3]) {
476 if (params
->stdin_fd
>= 0) {
477 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
480 /* Try to make this the controlling tty, if it is a tty, and reset it */
481 if (isatty(STDIN_FILENO
)) {
482 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
483 (void) reset_terminal_fd(STDIN_FILENO
, true);
484 (void) terminal_set_size_fd(STDIN_FILENO
, NULL
, context
->tty_rows
, context
->tty_cols
);
490 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
494 case EXEC_INPUT_NULL
:
495 return open_null_as(O_RDONLY
, STDIN_FILENO
);
498 case EXEC_INPUT_TTY_FORCE
:
499 case EXEC_INPUT_TTY_FAIL
: {
502 fd
= acquire_terminal(exec_context_tty_path(context
),
503 i
== EXEC_INPUT_TTY_FAIL
? ACQUIRE_TERMINAL_TRY
:
504 i
== EXEC_INPUT_TTY_FORCE
? ACQUIRE_TERMINAL_FORCE
:
505 ACQUIRE_TERMINAL_WAIT
,
510 r
= terminal_set_size_fd(fd
, exec_context_tty_path(context
), context
->tty_rows
, context
->tty_cols
);
514 return move_fd(fd
, STDIN_FILENO
, false);
517 case EXEC_INPUT_SOCKET
:
518 assert(socket_fd
>= 0);
520 return RET_NERRNO(dup2(socket_fd
, STDIN_FILENO
));
522 case EXEC_INPUT_NAMED_FD
:
523 assert(named_iofds
[STDIN_FILENO
] >= 0);
525 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
526 return RET_NERRNO(dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
));
528 case EXEC_INPUT_DATA
: {
531 fd
= acquire_data_fd(context
->stdin_data
, context
->stdin_data_size
, 0);
535 return move_fd(fd
, STDIN_FILENO
, false);
538 case EXEC_INPUT_FILE
: {
542 assert(context
->stdio_file
[STDIN_FILENO
]);
544 rw
= (context
->std_output
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDOUT_FILENO
])) ||
545 (context
->std_error
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDERR_FILENO
]));
547 fd
= acquire_path(context
->stdio_file
[STDIN_FILENO
], rw
? O_RDWR
: O_RDONLY
, 0666 & ~context
->umask
);
551 return move_fd(fd
, STDIN_FILENO
, false);
555 assert_not_reached();
559 static bool can_inherit_stderr_from_stdout(
560 const ExecContext
*context
,
566 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
569 if (e
== EXEC_OUTPUT_INHERIT
)
574 if (e
== EXEC_OUTPUT_NAMED_FD
)
575 return streq_ptr(context
->stdio_fdname
[STDOUT_FILENO
], context
->stdio_fdname
[STDERR_FILENO
]);
577 if (IN_SET(e
, EXEC_OUTPUT_FILE
, EXEC_OUTPUT_FILE_APPEND
, EXEC_OUTPUT_FILE_TRUNCATE
))
578 return streq_ptr(context
->stdio_file
[STDOUT_FILENO
], context
->stdio_file
[STDERR_FILENO
]);
583 static int setup_output(
585 const ExecContext
*context
,
586 const ExecParameters
*params
,
589 const int named_iofds
[static 3],
593 dev_t
*journal_stream_dev
,
594 ino_t
*journal_stream_ino
) {
604 assert(journal_stream_dev
);
605 assert(journal_stream_ino
);
607 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
609 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
612 return STDOUT_FILENO
;
615 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
616 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
619 return STDERR_FILENO
;
622 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
623 o
= fixup_output(context
->std_output
, socket_fd
);
625 if (fileno
== STDERR_FILENO
) {
627 e
= fixup_output(context
->std_error
, socket_fd
);
629 /* This expects the input and output are already set up */
631 /* Don't change the stderr file descriptor if we inherit all
632 * the way and are not on a tty */
633 if (e
== EXEC_OUTPUT_INHERIT
&&
634 o
== EXEC_OUTPUT_INHERIT
&&
635 i
== EXEC_INPUT_NULL
&&
636 !is_terminal_input(context
->std_input
) &&
640 /* Duplicate from stdout if possible */
641 if (can_inherit_stderr_from_stdout(context
, o
, e
))
642 return RET_NERRNO(dup2(STDOUT_FILENO
, fileno
));
646 } else if (o
== EXEC_OUTPUT_INHERIT
) {
647 /* If input got downgraded, inherit the original value */
648 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
649 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
651 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
652 if (!IN_SET(i
, EXEC_INPUT_NULL
, EXEC_INPUT_DATA
))
653 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
655 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
659 /* We need to open /dev/null here anew, to get the right access mode. */
660 return open_null_as(O_WRONLY
, fileno
);
665 case EXEC_OUTPUT_NULL
:
666 return open_null_as(O_WRONLY
, fileno
);
668 case EXEC_OUTPUT_TTY
:
669 if (is_terminal_input(i
))
670 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
672 /* We don't reset the terminal if this is just about output */
673 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
675 case EXEC_OUTPUT_KMSG
:
676 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
677 case EXEC_OUTPUT_JOURNAL
:
678 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
679 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
681 log_unit_warning_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m",
682 fileno
== STDOUT_FILENO
? "stdout" : "stderr");
683 r
= open_null_as(O_WRONLY
, fileno
);
687 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
688 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
689 * services to detect whether they are connected to the journal or not.
691 * If both stdout and stderr are connected to a stream then let's make sure to store the data
692 * about STDERR as that's usually the best way to do logging. */
694 if (fstat(fileno
, &st
) >= 0 &&
695 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
696 *journal_stream_dev
= st
.st_dev
;
697 *journal_stream_ino
= st
.st_ino
;
702 case EXEC_OUTPUT_SOCKET
:
703 assert(socket_fd
>= 0);
705 return RET_NERRNO(dup2(socket_fd
, fileno
));
707 case EXEC_OUTPUT_NAMED_FD
:
708 assert(named_iofds
[fileno
] >= 0);
710 (void) fd_nonblock(named_iofds
[fileno
], false);
711 return RET_NERRNO(dup2(named_iofds
[fileno
], fileno
));
713 case EXEC_OUTPUT_FILE
:
714 case EXEC_OUTPUT_FILE_APPEND
:
715 case EXEC_OUTPUT_FILE_TRUNCATE
: {
719 assert(context
->stdio_file
[fileno
]);
721 rw
= context
->std_input
== EXEC_INPUT_FILE
&&
722 streq_ptr(context
->stdio_file
[fileno
], context
->stdio_file
[STDIN_FILENO
]);
725 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
728 if (o
== EXEC_OUTPUT_FILE_APPEND
)
730 else if (o
== EXEC_OUTPUT_FILE_TRUNCATE
)
733 fd
= acquire_path(context
->stdio_file
[fileno
], flags
, 0666 & ~context
->umask
);
737 return move_fd(fd
, fileno
, 0);
741 assert_not_reached();
745 static int chown_terminal(int fd
, uid_t uid
) {
750 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
751 if (isatty(fd
) < 1) {
752 if (IN_SET(errno
, EINVAL
, ENOTTY
))
753 return 0; /* not a tty */
758 /* This might fail. What matters are the results. */
759 r
= fchmod_and_chown(fd
, TTY_MODE
, uid
, GID_INVALID
);
766 static int setup_confirm_stdio(
767 const ExecContext
*context
,
769 int *ret_saved_stdin
,
770 int *ret_saved_stdout
) {
772 _cleanup_close_
int fd
= -EBADF
, saved_stdin
= -EBADF
, saved_stdout
= -EBADF
;
775 assert(ret_saved_stdin
);
776 assert(ret_saved_stdout
);
778 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
782 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
783 if (saved_stdout
< 0)
786 fd
= acquire_terminal(vc
, ACQUIRE_TERMINAL_WAIT
, DEFAULT_CONFIRM_USEC
);
790 r
= chown_terminal(fd
, getuid());
794 r
= reset_terminal_fd(fd
, true);
798 r
= terminal_set_size_fd(fd
, vc
, context
->tty_rows
, context
->tty_cols
);
802 r
= rearrange_stdio(fd
, fd
, STDERR_FILENO
); /* Invalidates 'fd' also on failure */
807 *ret_saved_stdin
= TAKE_FD(saved_stdin
);
808 *ret_saved_stdout
= TAKE_FD(saved_stdout
);
812 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
815 if (err
== -ETIMEDOUT
)
816 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
819 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
823 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
824 _cleanup_close_
int fd
= -EBADF
;
828 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
832 write_confirm_error_fd(err
, fd
, u
);
835 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
839 assert(saved_stdout
);
843 if (*saved_stdin
>= 0)
844 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
847 if (*saved_stdout
>= 0)
848 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
851 *saved_stdin
= safe_close(*saved_stdin
);
852 *saved_stdout
= safe_close(*saved_stdout
);
858 CONFIRM_PRETEND_FAILURE
= -1,
859 CONFIRM_PRETEND_SUCCESS
= 0,
863 static int ask_for_confirmation(const ExecContext
*context
, const char *vc
, Unit
*u
, const char *cmdline
) {
864 int saved_stdout
= -1, saved_stdin
= -1, r
;
865 _cleanup_free_
char *e
= NULL
;
868 /* For any internal errors, assume a positive response. */
869 r
= setup_confirm_stdio(context
, vc
, &saved_stdin
, &saved_stdout
);
871 write_confirm_error(r
, vc
, u
);
872 return CONFIRM_EXECUTE
;
875 /* confirm_spawn might have been disabled while we were sleeping. */
876 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
881 e
= ellipsize(cmdline
, 60, 100);
889 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
891 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
898 printf("Resuming normal execution.\n");
899 manager_disable_confirm_spawn();
903 unit_dump(u
, stdout
, " ");
904 continue; /* ask again */
906 printf("Failing execution.\n");
907 r
= CONFIRM_PRETEND_FAILURE
;
910 printf(" c - continue, proceed without asking anymore\n"
911 " D - dump, show the state of the unit\n"
912 " f - fail, don't execute the command and pretend it failed\n"
914 " i - info, show a short summary of the unit\n"
915 " j - jobs, show jobs that are in progress\n"
916 " s - skip, don't execute the command and pretend it succeeded\n"
917 " y - yes, execute the command\n");
918 continue; /* ask again */
920 printf(" Description: %s\n"
923 u
->id
, u
->description
, cmdline
);
924 continue; /* ask again */
926 manager_dump_jobs(u
->manager
, stdout
, /* patterns= */ NULL
, " ");
927 continue; /* ask again */
929 /* 'n' was removed in favor of 'f'. */
930 printf("Didn't understand 'n', did you mean 'f'?\n");
931 continue; /* ask again */
933 printf("Skipping execution.\n");
934 r
= CONFIRM_PRETEND_SUCCESS
;
940 assert_not_reached();
946 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
950 static int get_fixed_user(const ExecContext
*c
, const char **user
,
951 uid_t
*uid
, gid_t
*gid
,
952 const char **home
, const char **shell
) {
961 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
962 * (i.e. are "/" or "/bin/nologin"). */
965 r
= get_user_creds(&name
, uid
, gid
, home
, shell
, USER_CREDS_CLEAN
);
973 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
983 r
= get_group_creds(&name
, gid
, 0);
991 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
992 const char *group
, gid_t gid
,
993 gid_t
**supplementary_gids
, int *ngids
) {
996 bool keep_groups
= false;
997 gid_t
*groups
= NULL
;
998 _cleanup_free_ gid_t
*l_gids
= NULL
;
1003 * If user is given, then look up GID and supplementary groups list.
1004 * We avoid NSS lookups for gid=0. Also we have to initialize groups
1005 * here and as early as possible so we keep the list of supplementary
1006 * groups of the caller.
1008 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
1009 /* First step, initialize groups from /etc/group */
1010 if (initgroups(user
, gid
) < 0)
1016 if (strv_isempty(c
->supplementary_groups
))
1020 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1021 * be positive, otherwise fail.
1024 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
1025 if (ngroups_max
<= 0)
1026 return errno_or_else(EOPNOTSUPP
);
1028 l_gids
= new(gid_t
, ngroups_max
);
1034 * Look up the list of groups that the user belongs to, we
1035 * avoid NSS lookups here too for gid=0.
1038 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
1043 STRV_FOREACH(i
, c
->supplementary_groups
) {
1046 if (k
>= ngroups_max
)
1050 r
= get_group_creds(&g
, l_gids
+k
, 0);
1058 * Sets ngids to zero to drop all supplementary groups, happens
1059 * when we are under root and SupplementaryGroups= is empty.
1066 /* Otherwise get the final list of supplementary groups */
1067 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
1071 *supplementary_gids
= groups
;
1079 static int enforce_groups(gid_t gid
, const gid_t
*supplementary_gids
, int ngids
) {
1082 /* Handle SupplementaryGroups= if it is not empty */
1084 r
= maybe_setgroups(ngids
, supplementary_gids
);
1089 if (gid_is_valid(gid
)) {
1090 /* Then set our gids */
1091 if (setresgid(gid
, gid
, gid
) < 0)
1098 static int set_securebits(int bits
, int mask
) {
1099 int current
, applied
;
1100 current
= prctl(PR_GET_SECUREBITS
);
1103 /* Clear all securebits defined in mask and set bits */
1104 applied
= (current
& ~mask
) | bits
;
1105 if (current
== applied
)
1107 if (prctl(PR_SET_SECUREBITS
, applied
) < 0)
1112 static int enforce_user(const ExecContext
*context
, uid_t uid
) {
1116 if (!uid_is_valid(uid
))
1119 /* Sets (but doesn't look up) the uid and makes sure we keep the
1120 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1121 * required, so we also need keep-caps in this case.
1124 if (context
->capability_ambient_set
!= 0 || context
->secure_bits
!= 0) {
1126 /* First step: If we need to keep capabilities but
1127 * drop privileges we need to make sure we keep our
1128 * caps, while we drop privileges. */
1130 /* Add KEEP_CAPS to the securebits */
1131 r
= set_securebits(1<<SECURE_KEEP_CAPS
, 0);
1137 /* Second step: actually set the uids */
1138 if (setresuid(uid
, uid
, uid
) < 0)
1141 /* At this point we should have all necessary capabilities but
1142 are otherwise a normal user. However, the caps might have been
1143 corrupted due to the setresuid() so we need to clean them up
1144 later. This is done outside of this call. */
1151 static int null_conv(
1153 const struct pam_message
**msg
,
1154 struct pam_response
**resp
,
1155 void *appdata_ptr
) {
1157 /* We don't support conversations */
1159 return PAM_CONV_ERR
;
1164 static int setup_pam(
1170 char ***env
, /* updated on success */
1171 const int fds
[], size_t n_fds
) {
1175 static const struct pam_conv conv
= {
1180 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1181 _cleanup_strv_free_
char **e
= NULL
;
1182 pam_handle_t
*handle
= NULL
;
1184 int pam_code
= PAM_SUCCESS
, r
;
1185 bool close_session
= false;
1186 pid_t pam_pid
= 0, parent_pid
;
1193 /* We set up PAM in the parent process, then fork. The child
1194 * will then stay around until killed via PR_SET_PDEATHSIG or
1195 * systemd via the cgroup logic. It will then remove the PAM
1196 * session again. The parent process will exec() the actual
1197 * daemon. We do things this way to ensure that the main PID
1198 * of the daemon is the one we initially fork()ed. */
1200 r
= barrier_create(&barrier
);
1204 if (log_get_max_level() < LOG_DEBUG
)
1205 flags
|= PAM_SILENT
;
1207 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1208 if (pam_code
!= PAM_SUCCESS
) {
1214 _cleanup_free_
char *q
= NULL
;
1216 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1217 * out if that's the case, and read the TTY off it. */
1219 if (getttyname_malloc(STDIN_FILENO
, &q
) >= 0)
1220 tty
= strjoina("/dev/", q
);
1224 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1225 if (pam_code
!= PAM_SUCCESS
)
1229 STRV_FOREACH(nv
, *env
) {
1230 pam_code
= pam_putenv(handle
, *nv
);
1231 if (pam_code
!= PAM_SUCCESS
)
1235 pam_code
= pam_acct_mgmt(handle
, flags
);
1236 if (pam_code
!= PAM_SUCCESS
)
1239 pam_code
= pam_setcred(handle
, PAM_ESTABLISH_CRED
| flags
);
1240 if (pam_code
!= PAM_SUCCESS
)
1241 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle
, pam_code
));
1243 pam_code
= pam_open_session(handle
, flags
);
1244 if (pam_code
!= PAM_SUCCESS
)
1247 close_session
= true;
1249 e
= pam_getenvlist(handle
);
1251 pam_code
= PAM_BUF_ERR
;
1255 /* Block SIGTERM, so that we know that it won't get lost in the child */
1257 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1259 parent_pid
= getpid_cached();
1261 r
= safe_fork("(sd-pam)", 0, &pam_pid
);
1265 int sig
, ret
= EXIT_PAM
;
1267 /* The child's job is to reset the PAM session on termination */
1268 barrier_set_role(&barrier
, BARRIER_CHILD
);
1270 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1271 * those fds are open here that have been opened by PAM. */
1272 (void) close_many(fds
, n_fds
);
1274 /* Drop privileges - we don't need any to pam_close_session and this will make
1275 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1276 * threads to fail to exit normally */
1278 r
= maybe_setgroups(0, NULL
);
1280 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1281 if (setresgid(gid
, gid
, gid
) < 0)
1282 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1283 if (setresuid(uid
, uid
, uid
) < 0)
1284 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1286 (void) ignore_signals(SIGPIPE
);
1288 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1289 * otherwise the kernel will not allow unprivileged parents to kill their privileged children
1290 * this way. We rely on the control groups kill logic to do the rest for us. */
1291 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1294 /* Tell the parent that our setup is done. This is especially important regarding dropping
1295 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1297 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1298 (void) barrier_place(&barrier
);
1300 /* Check if our parent process might already have died? */
1301 if (getppid() == parent_pid
) {
1304 assert_se(sigemptyset(&ss
) >= 0);
1305 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1308 if (sigwait(&ss
, &sig
) < 0) {
1315 assert(sig
== SIGTERM
);
1320 pam_code
= pam_setcred(handle
, PAM_DELETE_CRED
| flags
);
1321 if (pam_code
!= PAM_SUCCESS
)
1324 /* If our parent died we'll end the session */
1325 if (getppid() != parent_pid
) {
1326 pam_code
= pam_close_session(handle
, flags
);
1327 if (pam_code
!= PAM_SUCCESS
)
1334 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1335 * know about this. See pam_end(3) */
1336 (void) pam_end(handle
, pam_code
| flags
| PAM_DATA_SILENT
);
1340 barrier_set_role(&barrier
, BARRIER_PARENT
);
1342 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1346 /* Unblock SIGTERM again in the parent */
1347 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1349 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1350 * this fd around. */
1353 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1354 * recover. However, warn loudly if it happens. */
1355 if (!barrier_place_and_sync(&barrier
))
1356 log_error("PAM initialization failed");
1358 return strv_free_and_replace(*env
, e
);
1361 if (pam_code
!= PAM_SUCCESS
) {
1362 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1363 r
= -EPERM
; /* PAM errors do not map to errno */
1365 log_error_errno(r
, "PAM failed: %m");
1369 pam_code
= pam_close_session(handle
, flags
);
1371 (void) pam_end(handle
, pam_code
| flags
);
1381 static void rename_process_from_path(const char *path
) {
1382 _cleanup_free_
char *buf
= NULL
;
1387 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1390 if (path_extract_filename(path
, &buf
) < 0) {
1391 rename_process("(...)");
1395 size_t l
= strlen(buf
);
1397 /* The end of the process name is usually more interesting, since the first bit might just be
1404 char process_name
[11];
1405 process_name
[0] = '(';
1406 memcpy(process_name
+1, p
, l
);
1407 process_name
[1+l
] = ')';
1408 process_name
[1+l
+1] = 0;
1410 rename_process(process_name
);
1413 static bool context_has_address_families(const ExecContext
*c
) {
1416 return c
->address_families_allow_list
||
1417 !set_isempty(c
->address_families
);
1420 static bool context_has_syscall_filters(const ExecContext
*c
) {
1423 return c
->syscall_allow_list
||
1424 !hashmap_isempty(c
->syscall_filter
);
1427 static bool context_has_syscall_logs(const ExecContext
*c
) {
1430 return c
->syscall_log_allow_list
||
1431 !hashmap_isempty(c
->syscall_log
);
1434 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1437 if (c
->no_new_privileges
)
1440 if (have_effective_cap(CAP_SYS_ADMIN
) > 0) /* if we are privileged, we don't need NNP */
1443 /* We need NNP if we have any form of seccomp and are unprivileged */
1444 return c
->lock_personality
||
1445 c
->memory_deny_write_execute
||
1446 c
->private_devices
||
1448 c
->protect_hostname
||
1449 c
->protect_kernel_tunables
||
1450 c
->protect_kernel_modules
||
1451 c
->protect_kernel_logs
||
1452 context_has_address_families(c
) ||
1453 exec_context_restrict_namespaces_set(c
) ||
1454 c
->restrict_realtime
||
1455 c
->restrict_suid_sgid
||
1456 !set_isempty(c
->syscall_archs
) ||
1457 context_has_syscall_filters(c
) ||
1458 context_has_syscall_logs(c
);
1461 static bool exec_context_has_credentials(const ExecContext
*context
) {
1465 return !hashmap_isempty(context
->set_credentials
) ||
1466 !hashmap_isempty(context
->load_credentials
);
1471 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1473 if (is_seccomp_available())
1476 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1480 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1481 uint32_t negative_action
, default_action
, action
;
1487 if (!context_has_syscall_filters(c
))
1490 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1493 negative_action
= c
->syscall_errno
== SECCOMP_ERROR_NUMBER_KILL
? scmp_act_kill_process() : SCMP_ACT_ERRNO(c
->syscall_errno
);
1495 if (c
->syscall_allow_list
) {
1496 default_action
= negative_action
;
1497 action
= SCMP_ACT_ALLOW
;
1499 default_action
= SCMP_ACT_ALLOW
;
1500 action
= negative_action
;
1503 if (needs_ambient_hack
) {
1504 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_allow_list
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1509 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
, false);
1512 static int apply_syscall_log(const Unit
* u
, const ExecContext
*c
) {
1514 uint32_t default_action
, action
;
1520 if (!context_has_syscall_logs(c
))
1524 if (skip_seccomp_unavailable(u
, "SystemCallLog="))
1527 if (c
->syscall_log_allow_list
) {
1528 /* Log nothing but the ones listed */
1529 default_action
= SCMP_ACT_ALLOW
;
1530 action
= SCMP_ACT_LOG
;
1532 /* Log everything but the ones listed */
1533 default_action
= SCMP_ACT_LOG
;
1534 action
= SCMP_ACT_ALLOW
;
1537 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_log
, action
, false);
1539 /* old libseccomp */
1540 log_unit_debug(u
, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1545 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1549 if (set_isempty(c
->syscall_archs
))
1552 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1555 return seccomp_restrict_archs(c
->syscall_archs
);
1558 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1562 if (!context_has_address_families(c
))
1565 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1568 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_allow_list
);
1571 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1575 if (!c
->memory_deny_write_execute
)
1578 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1581 return seccomp_memory_deny_write_execute();
/* Applies RestrictRealtime=: tests c->restrict_realtime and seccomp availability, then returns the
 * result of seccomp_restrict_realtime().
 *
 * NOTE(review): the statements guarded by the two checks are not visible in this listing (presumably
 * early returns) -- confirm against the complete file. */
1584 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1588 if (!c
->restrict_realtime
)
1591 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1594 return seccomp_restrict_realtime();
/* Applies RestrictSUIDSGID=: tests c->restrict_suid_sgid and seccomp availability, then returns the
 * result of seccomp_restrict_suid_sgid().
 *
 * NOTE(review): the statements guarded by the two checks are not visible in this listing (presumably
 * early returns) -- confirm against the complete file. */
1597 static int apply_restrict_suid_sgid(const Unit
* u
, const ExecContext
*c
) {
1601 if (!c
->restrict_suid_sgid
)
1604 if (skip_seccomp_unavailable(u
, "RestrictSUIDSGID="))
1607 return seccomp_restrict_suid_sgid();
/* Seccomp half of ProtectKernelTunables=: tests c->protect_kernel_tunables and seccomp availability,
 * then returns the result of seccomp_protect_sysctl().
 *
 * NOTE(review): the statements guarded by the two checks are not visible in this listing (presumably
 * early returns) -- confirm against the complete file. */
1610 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1614 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1615 * let's protect even those systems where this is left on in the kernel. */
1617 if (!c
->protect_kernel_tunables
)
1620 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1623 return seccomp_protect_sysctl();
/* Seccomp half of ProtectKernelModules=: loads the SYSCALL_FILTER_SET_MODULE filter set with default
 * action SCMP_ACT_ALLOW and SCMP_ACT_ERRNO(EPERM) for the calls in the set.
 *
 * NOTE(review): the statements guarded by the c->protect_kernel_modules and skip_seccomp_unavailable()
 * checks are not visible in this listing (presumably early returns) -- confirm against the complete file. */
1626 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1630 /* Turn off module syscalls on ProtectKernelModules=yes */
1632 if (!c
->protect_kernel_modules
)
1635 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1638 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
), false);
/* Seccomp half of ProtectKernelLogs=: tests c->protect_kernel_logs and seccomp availability, then
 * returns the result of seccomp_protect_syslog().
 *
 * NOTE(review): the statements guarded by the two checks are not visible in this listing (presumably
 * early returns) -- confirm against the complete file. */
1641 static int apply_protect_kernel_logs(const Unit
*u
, const ExecContext
*c
) {
1645 if (!c
->protect_kernel_logs
)
1648 if (skip_seccomp_unavailable(u
, "ProtectKernelLogs="))
1651 return seccomp_protect_syslog();
/* Seccomp half of ProtectClock=: loads the SYSCALL_FILTER_SET_CLOCK filter set with default action
 * SCMP_ACT_ALLOW and SCMP_ACT_ERRNO(EPERM) for the calls in the set.
 *
 * NOTE(review): the statements guarded by the c->protect_clock and skip_seccomp_unavailable() checks are
 * not visible in this listing (presumably early returns) -- confirm against the complete file. */
1654 static int apply_protect_clock(const Unit
*u
, const ExecContext
*c
) {
1658 if (!c
->protect_clock
)
1661 if (skip_seccomp_unavailable(u
, "ProtectClock="))
1664 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_CLOCK
, SCMP_ACT_ERRNO(EPERM
), false);
/* Seccomp half of PrivateDevices=: loads the SYSCALL_FILTER_SET_RAW_IO filter set with default action
 * SCMP_ACT_ALLOW and SCMP_ACT_ERRNO(EPERM) for the calls in the set.
 *
 * NOTE(review): the statements guarded by the c->private_devices and skip_seccomp_unavailable() checks
 * are not visible in this listing (presumably early returns) -- confirm against the complete file. */
1667 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1671 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1673 if (!c
->private_devices
)
1676 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1679 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
), false);
/* Applies RestrictNamespaces=: returns the result of seccomp_restrict_namespaces() called with
 * c->restrict_namespaces.
 *
 * NOTE(review): the statements guarded by the exec_context_restrict_namespaces_set() and
 * skip_seccomp_unavailable() checks are not visible in this listing (presumably early returns) -- confirm
 * against the complete file. */
1682 static int apply_restrict_namespaces(const Unit
*u
, const ExecContext
*c
) {
1686 if (!exec_context_restrict_namespaces_set(c
))
1689 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1692 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
/* Applies LockPersonality=: starts from c->personality, replaces PERSONALITY_INVALID with the value
 * chosen by opinionated_personality(), and returns the result of seccomp_lock_personality() for that
 * value.
 *
 * NOTE(review): the statements guarded by the c->lock_personality and skip_seccomp_unavailable() checks,
 * and the handling of the opinionated_personality() result, are not visible in this listing -- confirm
 * against the complete file. */
1695 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1696 unsigned long personality
;
1702 if (!c
->lock_personality
)
1705 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1708 personality
= c
->personality
;
1710 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1711 if (personality
== PERSONALITY_INVALID
) {
1713 r
= opinionated_personality(&personality
);
1718 return seccomp_lock_personality(personality
);
/* Applies RestrictFileSystems=: returns the result of lsm_bpf_unit_restrict_filesystems() called with
 * c->restrict_filesystems and the c->restrict_filesystems_allow_list flag. If u->manager->restrict_fs is
 * not set, a debug message is logged instead.
 *
 * NOTE(review): the statements following the exec_context_restrict_filesystems_set() check and the debug
 * message are not visible in this listing (presumably early returns) -- confirm against the complete
 * file. */
1724 static int apply_restrict_filesystems(Unit
*u
, const ExecContext
*c
) {
1728 if (!exec_context_restrict_filesystems_set(c
))
1731 if (!u
->manager
->restrict_fs
) {
1732 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1733 log_unit_debug(u
, "LSM BPF not supported, skipping RestrictFileSystems=");
1737 return lsm_bpf_unit_restrict_filesystems(u
, c
->restrict_filesystems
, c
->restrict_filesystems_allow_list
);
/* Applies ProtectHostname= in two steps.
 *
 * 1. If ns_type_supported(NAMESPACE_UTS), calls unshare(CLONE_NEWUTS). A failure whose errno is neither
 *    "not supported" nor a privilege error sets *ret_exit_status to EXIT_NAMESPACE and returns the logged
 *    error; the two log_unit_warning() calls cover the cases where namespace setup is ignored.
 * 2. Calls seccomp_protect_hostname(); the error path sets *ret_exit_status to EXIT_SECCOMP and returns
 *    the logged error.
 *
 * NOTE(review): several lines (else branches, closing braces, the check on r, early returns and the final
 * return) are not visible in this listing -- confirm the exact control flow against the complete file. */
1741 static int apply_protect_hostname(const Unit
*u
, const ExecContext
*c
, int *ret_exit_status
) {
1745 if (!c
->protect_hostname
)
1748 if (ns_type_supported(NAMESPACE_UTS
)) {
1749 if (unshare(CLONE_NEWUTS
) < 0) {
1750 if (!ERRNO_IS_NOT_SUPPORTED(errno
) && !ERRNO_IS_PRIVILEGE(errno
)) {
1751 *ret_exit_status
= EXIT_NAMESPACE
;
1752 return log_unit_error_errno(u
, errno
, "Failed to set up UTS namespacing: %m");
1755 log_unit_warning(u
, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1758 log_unit_warning(u
, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1763 if (skip_seccomp_unavailable(u
, "ProtectHostname="))
1766 r
= seccomp_protect_hostname();
1768 *ret_exit_status
= EXIT_SECCOMP
;
1769 return log_unit_error_errno(u
, r
, "Failed to apply hostname restrictions: %m");
/* Waits on the four-element idle pipe array before continuing.
 *
 * idle_pipe[1] and idle_pipe[2] are closed right away. If idle_pipe[0] is open, the function waits for
 * POLLHUP on it for up to IDLE_TIMEOUT_USEC. If that wait timed out (r == 0) and idle_pipe[3] is open, it
 * writes a single byte "x" to idle_pipe[3] and waits on idle_pipe[0] again for up to IDLE_TIMEOUT2_USEC.
 * Finally idle_pipe[0] and idle_pipe[3] are closed. safe_close()'s return value is stored back into each
 * slot.
 *
 * NOTE(review): local declarations and closing braces are not visible in this listing. */
1776 static void do_idle_pipe_dance(int idle_pipe
[static 4]) {
1779 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1780 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1782 if (idle_pipe
[0] >= 0) {
1785 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1787 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1790 /* Signal systemd that we are bored and want to continue. */
1791 n
= write(idle_pipe
[3], "x", 1);
1793 /* Wait for systemd to react to the signal above. */
1794 (void) fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1797 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1801 idle_pipe
[3] = safe_close(idle_pipe
[3]);
/* Forward declaration: maps an ExecDirectoryType to a string. build_environment() below uses the result
 * as the environment variable name for that directory type. The definition is not part of this excerpt. */
1804 static const char *exec_directory_env_name_to_string(ExecDirectoryType t
);
/* Builds the set of environment variables that the service manager itself adds for an executed process
 * and returns it, NULL-terminated, in *ret (ownership is handed over with TAKE_PTR()).
 *
 * Variables assembled here, in order: LISTEN_PID, LISTEN_FDS, LISTEN_FDNAMES, WATCHDOG_PID,
 * WATCHDOG_USEC, SYSTEMD_NSS_DYNAMIC_BYPASS, HOME, LOGNAME, USER, SHELL, INVOCATION_ID, TERM,
 * JOURNAL_STREAM, LOG_NAMESPACE, one variable per directory type that has configured items (name from
 * exec_directory_env_name_to_string(), value a ":"-joined list of prefixed paths), CREDENTIALS_DIRECTORY
 * and SYSTEMD_EXEC_PID.
 *
 * The array is sized N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX up front and filled without reallocation; the
 * assert() at the end checks that bound.
 *
 * NOTE(review): part of the parameter list, several local declarations, the conditions guarding some of
 * the variables, and the out-of-memory returns are not visible in this listing -- confirm against the
 * complete file. */
1806 static int build_environment(
1808 const ExecContext
*c
,
1809 const ExecParameters
*p
,
1813 const char *username
,
1815 dev_t journal_stream_dev
,
1816 ino_t journal_stream_ino
,
1819 _cleanup_strv_free_
char **our_env
= NULL
;
/* Upper bound for the fixed entries added below; the terminating NULL is counted too, since n_env is
 * incremented for it before the final assert(). */
1828 #define N_ENV_VARS 17
1829 our_env
= new0(char*, N_ENV_VARS
+ _EXEC_DIRECTORY_TYPE_MAX
);
1834 _cleanup_free_
char *joined
= NULL
;
1836 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1838 our_env
[n_env
++] = x
;
1840 if (asprintf(&x
, "LISTEN_FDS=%zu", n_fds
) < 0)
1842 our_env
[n_env
++] = x
;
1844 joined
= strv_join(fdnames
, ":");
1848 x
= strjoin("LISTEN_FDNAMES=", joined
);
1851 our_env
[n_env
++] = x
;
1854 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1855 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1857 our_env
[n_env
++] = x
;
1859 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1861 our_env
[n_env
++] = x
;
1864 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1865 * Varlink calls back to us to look up dynamic users in PID 1. Break the deadlock between D-Bus and
1866 * PID 1 by disabling use of PID 1's NSS interface for looking up dynamic users. */
1867 if (p
->flags
& EXEC_NSS_DYNAMIC_BYPASS
) {
1868 x
= strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1871 our_env
[n_env
++] = x
;
1875 x
= strjoin("HOME=", home
);
/* x + 5 skips the five characters of the HOME= prefix, so only the path part is simplified. */
1879 path_simplify(x
+ 5);
1880 our_env
[n_env
++] = x
;
1884 x
= strjoin("LOGNAME=", username
);
1887 our_env
[n_env
++] = x
;
1889 x
= strjoin("USER=", username
);
1892 our_env
[n_env
++] = x
;
1896 x
= strjoin("SHELL=", shell
);
/* x + 6 skips the six characters of the SHELL= prefix, so only the path part is simplified. */
1900 path_simplify(x
+ 6);
1901 our_env
[n_env
++] = x
;
1904 if (!sd_id128_is_null(u
->invocation_id
)) {
1905 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1908 our_env
[n_env
++] = x
;
1911 if (exec_context_needs_term(c
)) {
1912 const char *tty_path
, *term
= NULL
;
1914 tty_path
= exec_context_tty_path(c
);
1916 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1917 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1918 * container manager passes to PID 1 ends up all the way in the console login shown. */
1920 if (path_equal_ptr(tty_path
, "/dev/console") && getppid() == 1)
1921 term
= getenv("TERM");
1924 term
= default_term_for_tty(tty_path
);
1926 x
= strjoin("TERM=", term
);
1929 our_env
[n_env
++] = x
;
1932 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
1933 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
1936 our_env
[n_env
++] = x
;
1939 if (c
->log_namespace
) {
1940 x
= strjoin("LOG_NAMESPACE=", c
->log_namespace
);
1944 our_env
[n_env
++] = x
;
1947 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
1948 _cleanup_free_
char *joined
= NULL
;
1954 if (c
->directories
[t
].n_items
== 0)
1957 n
= exec_directory_env_name_to_string(t
);
1961 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
1962 _cleanup_free_
char *prefixed
= NULL
;
1964 prefixed
= path_join(p
->prefix
[t
], c
->directories
[t
].items
[i
].path
);
1968 if (!strextend_with_separator(&joined
, ":", prefixed
))
1972 x
= strjoin(n
, "=", joined
);
1976 our_env
[n_env
++] = x
;
1979 if (exec_context_has_credentials(c
) && p
->prefix
[EXEC_DIRECTORY_RUNTIME
]) {
1980 x
= strjoin("CREDENTIALS_DIRECTORY=", p
->prefix
[EXEC_DIRECTORY_RUNTIME
], "/credentials/", u
->id
);
1984 our_env
[n_env
++] = x
;
1987 if (asprintf(&x
, "SYSTEMD_EXEC_PID=" PID_FMT
, getpid_cached()) < 0)
1990 our_env
[n_env
++] = x
;
1992 our_env
[n_env
++] = NULL
;
1993 assert(n_env
<= N_ENV_VARS
+ _EXEC_DIRECTORY_TYPE_MAX
);
1996 *ret
= TAKE_PTR(our_env
);
/* Builds a NULL-terminated environment array from the variable names listed in c->pass_environment and
 * returns it in *ret (ownership is handed over with TAKE_PTR()).
 *
 * For each name an entry of the form name=v is created with strjoin() and appended; the array is grown
 * with GREEDY_REALLOC() to hold the new entry plus the terminating NULL, which is rewritten after every
 * append so the array is always well-formed.
 *
 * NOTE(review): how v is obtained, what happens when it is unset, and the error returns are not visible
 * in this listing -- confirm against the complete file. */
2001 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
2002 _cleanup_strv_free_
char **pass_env
= NULL
;
2005 STRV_FOREACH(i
, c
->pass_environment
) {
2006 _cleanup_free_
char *x
= NULL
;
2012 x
= strjoin(*i
, "=", v
);
2016 if (!GREEDY_REALLOC(pass_env
, n_env
+ 2))
2019 pass_env
[n_env
++] = TAKE_PTR(x
);
2020 pass_env
[n_env
] = NULL
;
2023 *ret
= TAKE_PTR(pass_env
);
/* Decides, from the execution context (and optionally the parameters and runtime object), whether the
 * process needs its own mount namespace.
 *
 * The function is a sequence of independent checks: a root image; any of the read-write, read-only,
 * inaccessible, exec or no-exec path lists; bind mounts; temporary file systems; mount images; extension
 * images or directories; mount flags other than 0 or MS_SHARED; PrivateTmp= with a runtime that carries a
 * tmp or var-tmp directory; the private and protect settings listed below; a root directory combined with
 * either effective API VFS mounting or configured directories; dynamic users with state, cache or logs
 * directories; and a log namespace.
 *
 * Both params and runtime are tested for NULL before being dereferenced, so either may be NULL.
 *
 * NOTE(review): the statement executed when a check matches and the final return are not visible in this
 * listing (presumably "return true" per check and "return false" at the end) -- confirm against the
 * complete file. */
2028 bool exec_needs_mount_namespace(
2029 const ExecContext
*context
,
2030 const ExecParameters
*params
,
2031 const ExecRuntime
*runtime
) {
2035 if (context
->root_image
)
2038 if (!strv_isempty(context
->read_write_paths
) ||
2039 !strv_isempty(context
->read_only_paths
) ||
2040 !strv_isempty(context
->inaccessible_paths
) ||
2041 !strv_isempty(context
->exec_paths
) ||
2042 !strv_isempty(context
->no_exec_paths
))
2045 if (context
->n_bind_mounts
> 0)
2048 if (context
->n_temporary_filesystems
> 0)
2051 if (context
->n_mount_images
> 0)
2054 if (context
->n_extension_images
> 0)
2057 if (!strv_isempty(context
->extension_directories
))
2060 if (!IN_SET(context
->mount_flags
, 0, MS_SHARED
))
2063 if (context
->private_tmp
&& runtime
&& (runtime
->tmp_dir
|| runtime
->var_tmp_dir
))
2066 if (context
->private_devices
||
2067 context
->private_mounts
||
2068 context
->protect_system
!= PROTECT_SYSTEM_NO
||
2069 context
->protect_home
!= PROTECT_HOME_NO
||
2070 context
->protect_kernel_tunables
||
2071 context
->protect_kernel_modules
||
2072 context
->protect_kernel_logs
||
2073 context
->protect_control_groups
||
2074 context
->protect_proc
!= PROTECT_PROC_DEFAULT
||
2075 context
->proc_subset
!= PROC_SUBSET_ALL
||
2076 context
->private_ipc
||
2077 context
->ipc_namespace_path
)
2080 if (context
->root_directory
) {
2081 if (exec_context_get_effective_mount_apivfs(context
))
2084 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2085 if (params
&& !params
->prefix
[t
])
2088 if (context
->directories
[t
].n_items
> 0)
2093 if (context
->dynamic_user
&&
2094 (context
->directories
[EXEC_DIRECTORY_STATE
].n_items
> 0 ||
2095 context
->directories
[EXEC_DIRECTORY_CACHE
].n_items
> 0 ||
2096 context
->directories
[EXEC_DIRECTORY_LOGS
].n_items
> 0))
2099 if (context
->log_namespace
)
/* Moves the calling process into a new user namespace (unshare(CLONE_NEWUSER)) and has a forked helper
 * process, "(sd-userns)", write the ID mappings for it.
 *
 * Parameters: ouid/ogid are the original IDs and uid/gid the selected ones (see the comment below).
 * With the matching capability and a selected ID that is valid and differs from the original, the map
 * has two lines (original and selected ID, each mapped to itself); otherwise it has a single line.
 *
 * Synchronisation: unshare_ready_fd is an eventfd on which the child blocks until the parent has
 * unshared, and errno_pipe carries an error code from the child back to the parent. The child writes
 * "deny" to the parent's setgroups file, then the gid_map, then the uid_map, under the procfs directory
 * of ppid. The parent reads errno_pipe -- a read of sizeof(r) bytes means an error code was sent, zero
 * bytes means success -- and then reaps the child with wait_for_terminate_and_check().
 *
 * NOTE(review): many lines (local declarations, else branches, error assignments, returns and closing
 * braces) are not visible in this listing, and two comment blocks below have lost their closing line --
 * confirm all details against the complete file. */
2105 static int setup_private_users(uid_t ouid
, gid_t ogid
, uid_t uid
, gid_t gid
) {
2106 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
2107 _cleanup_close_pair_
int errno_pipe
[2] = PIPE_EBADF
;
2108 _cleanup_close_
int unshare_ready_fd
= -EBADF
;
2109 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
2114 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2115 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2116 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2117 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2118 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2119 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2120 * continues execution normally.
2121 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2122 * does not need CAP_SETUID to write the single line mapping to itself. */
2124 /* Can only set up multiple mappings with CAP_SETUID. */
2125 if (have_effective_cap(CAP_SETUID
) > 0 && uid
!= ouid
&& uid_is_valid(uid
))
2126 r
= asprintf(&uid_map
,
2127 UID_FMT
" " UID_FMT
" 1\n" /* Map $OUID → $OUID */
2128 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
2129 ouid
, ouid
, uid
, uid
);
2131 r
= asprintf(&uid_map
,
2132 UID_FMT
" " UID_FMT
" 1\n", /* Map $OUID → $OUID */
2138 /* Can only set up multiple mappings with CAP_SETGID. */
2139 if (have_effective_cap(CAP_SETGID
) > 0 && gid
!= ogid
&& gid_is_valid(gid
))
2140 r
= asprintf(&gid_map
,
2141 GID_FMT
" " GID_FMT
" 1\n" /* Map $OGID → $OGID */
2142 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
2143 ogid
, ogid
, gid
, gid
);
2145 r
= asprintf(&gid_map
,
2146 GID_FMT
" " GID_FMT
" 1\n", /* Map $OGID -> $OGID */
2152 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2154 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
2155 if (unshare_ready_fd
< 0)
2158 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2160 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
2163 r
= safe_fork("(sd-userns)", FORK_RESET_SIGNALS
|FORK_DEATHSIG
, &pid
);
2167 _cleanup_close_
int fd
= -EBADF
;
2171 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2172 * here, after the parent opened its own user namespace. */
2175 errno_pipe
[0] = safe_close(errno_pipe
[0]);
2177 /* Wait until the parent unshared the user namespace */
2178 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
2183 /* Disable the setgroups() system call in the child user namespace, for good. */
2184 a
= procfs_file_alloca(ppid
, "setgroups");
2185 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2187 if (errno
!= ENOENT
) {
2192 /* If the file is missing the kernel is too old, let's continue anyway. */
2194 if (write(fd
, "deny\n", 5) < 0) {
2199 fd
= safe_close(fd
);
2202 /* First write the GID map */
2203 a
= procfs_file_alloca(ppid
, "gid_map");
2204 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2209 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
2213 fd
= safe_close(fd
);
2215 /* Then write the UID map */
2216 a
= procfs_file_alloca(ppid
, "uid_map");
2217 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2222 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
2227 _exit(EXIT_SUCCESS
);
2230 (void) write(errno_pipe
[1], &r
, sizeof(r
));
2231 _exit(EXIT_FAILURE
);
2234 errno_pipe
[1] = safe_close(errno_pipe
[1]);
2236 if (unshare(CLONE_NEWUSER
) < 0)
2239 /* Let the child know that the namespace is ready now */
2240 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
2243 /* Try to read an error code from the child */
2244 n
= read(errno_pipe
[0], &r
, sizeof(r
));
2247 if (n
== sizeof(r
)) { /* an error code was sent to us */
2252 if (n
!= 0) /* on success we should have read 0 bytes */
2255 r
= wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid
), 0);
2258 if (r
!= EXIT_SUCCESS
) /* If something strange happened with the child, let's consider this fatal, too */
/* Tells whether directories of the given type are placed below the "private/" sub-directory for this
 * context (see the long comment in setup_exec_directory()).
 *
 * Three conditions are tested in order: the context does not use dynamic users; the type is
 * EXEC_DIRECTORY_CONFIGURATION; the type is EXEC_DIRECTORY_RUNTIME while
 * runtime_directory_preserve_mode is EXEC_PRESERVE_NO.
 *
 * NOTE(review): the return statements are not visible in this listing (presumably "return false" for
 * each of the three conditions and "return true" otherwise) -- confirm against the complete file. */
2264 static bool exec_directory_is_private(const ExecContext
*context
, ExecDirectoryType type
) {
2265 if (!context
->dynamic_user
)
2268 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
2271 if (type
== EXEC_DIRECTORY_RUNTIME
&& context
->runtime_directory_preserve_mode
== EXEC_PRESERVE_NO
)
/* Creates one symlink per entry of "symlinks", each pointing at the same source.
 *
 * Both the source and every entry of "symlinks" are joined onto "root" with path_join(). For each
 * destination the parent directories are created with mode 0755 (mkdir_parents_label()), then the link
 * is created with symlink_idempotent(src_abs, dst_abs, true).
 *
 * NOTE(review): the checks on allocation results and on r, and the final return, are not visible in this
 * listing -- confirm against the complete file. */
2277 static int create_many_symlinks(const char *root
, const char *source
, char **symlinks
) {
2278 _cleanup_free_
char *src_abs
= NULL
;
2283 src_abs
= path_join(root
, source
);
2287 STRV_FOREACH(dst
, symlinks
) {
2288 _cleanup_free_
char *dst_abs
= NULL
;
2290 dst_abs
= path_join(root
, *dst
);
2294 r
= mkdir_parents_label(dst_abs
, 0755);
2298 r
= symlink_idempotent(src_abs
, dst_abs
, true);
/* Creates and fixes up the directories configured for one directory type of the context.
 *
 * For every item in context->directories[type] the path p is params->prefix[type] joined with the item's
 * path, and its parent directories are created with mode 0755. Two cases follow.
 *
 * - Private directories (exec_directory_is_private()): the "private" directory below the prefix is
 *   created with mode 0700 and uid/gid 0, and pp becomes the item's path below it. An existing public
 *   directory at p with no pp yet is migrated with rename(); otherwise pp is created with the configured
 *   mode. Unless the item is marked only_create, p is then made a symlink to pp.
 * - Other directories: if p is a symlink that resolves to the matching location below "private", the
 *   symlink is removed and the private directory is renamed back to p. p is then created with the
 *   configured mode. For EXEC_DIRECTORY_CONFIGURATION only a warning is logged when the existing mode
 *   differs.
 *
 * Afterwards mode and ownership are applied to pp if it is set, else to p (chmod_and_chown(), then
 * path_chown_recursive() with mask 01777 for dynamic users and 07777 otherwise). If no mount namespace
 * will be used, the per-item symlinks are created via create_many_symlinks(). *exit_status is assigned
 * from exit_status_table[type], which maps each directory type to its exit code.
 *
 * NOTE(review): part of the parameter list, local declarations, error checks, goto/return statements and
 * closing braces are not visible in this listing; in particular it cannot be seen here on which paths
 * *exit_status is written -- confirm against the complete file. */
2306 static int setup_exec_directory(
2307 const ExecContext
*context
,
2308 const ExecParameters
*params
,
2311 ExecDirectoryType type
,
2312 bool needs_mount_namespace
,
2315 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
2316 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
2317 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
2318 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
2319 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
2320 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
2326 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
2327 assert(exit_status
);
2329 if (!params
->prefix
[type
])
2332 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
2333 if (!uid_is_valid(uid
))
2335 if (!gid_is_valid(gid
))
2339 for (size_t i
= 0; i
< context
->directories
[type
].n_items
; i
++) {
2340 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
2342 p
= path_join(params
->prefix
[type
], context
->directories
[type
].items
[i
].path
);
2348 r
= mkdir_parents_label(p
, 0755);
2352 if (exec_directory_is_private(context
, type
)) {
2353 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2354 * case we want to avoid leaving a directory around fully accessible that is owned by
2355 * a dynamic user whose UID is later on reused. To lock this down we use the same
2356 * trick used by container managers to prohibit host users to get access to files of
2357 * the same UID in containers: we place everything inside a directory that has an
2358 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2359 * for unprivileged host code. We then use fs namespacing to make this directory
2360 * permeable for the service itself.
2362 * Specifically: for a service which wants a special directory "foo/" we first create
2363 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2364 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2365 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2366 * unprivileged host users can't look into it. Inside of the namespace of the unit
2367 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2368 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2369 * for the service and making sure it only gets access to the dirs it needs but no
2370 * others. Tricky? Yes, absolutely, but it works!
2372 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2373 * to be owned by the service itself.
2375 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2376 * for sharing files or sockets with other services. */
2378 pp
= path_join(params
->prefix
[type
], "private");
2384 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2385 r
= mkdir_safe_label(pp
, 0700, 0, 0, MKDIR_WARN_MODE
);
2389 if (!path_extend(&pp
, context
->directories
[type
].items
[i
].path
)) {
2394 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2395 r
= mkdir_parents_label(pp
, 0755);
2399 if (is_dir(p
, false) > 0 &&
2400 (laccess(pp
, F_OK
) < 0 && errno
== ENOENT
)) {
2402 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2403 * it over. Most likely the service has been upgraded from one that didn't use
2404 * DynamicUser=1, to one that does. */
2406 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2407 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2408 exec_directory_type_to_string(type
), p
, pp
);
2410 if (rename(p
, pp
) < 0) {
2415 /* Otherwise, create the actual directory for the service */
2417 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
2418 if (r
< 0 && r
!= -EEXIST
)
2422 if (!context
->directories
[type
].items
[i
].only_create
) {
2423 /* And link it up from the original place.
2425 * 1) If a mount namespace is going to be used, then this symlink remains on
2426 * the host, and a new one for the child namespace will be created later.
2427 * 2) It is not necessary to create this symlink when one of its parent
2428 * directories is specified and already created. E.g.
2429 * StateDirectory=foo foo/bar
2430 * In that case, the inode points to pp and p for "foo/bar" are the same:
2431 * pp = "/var/lib/private/foo/bar"
2432 * p = "/var/lib/foo/bar"
2433 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2434 * we do not need to create the symlink, but we cannot create the symlink.
2435 * See issue #24783. */
2436 r
= symlink_idempotent(pp
, p
, true);
2442 _cleanup_free_
char *target
= NULL
;
2444 if (type
!= EXEC_DIRECTORY_CONFIGURATION
&&
2445 readlink_and_make_absolute(p
, &target
) >= 0) {
2446 _cleanup_free_
char *q
= NULL
, *q_resolved
= NULL
, *target_resolved
= NULL
;
2448 /* This already exists and is a symlink? Interesting. Maybe it's one created
2449 * by DynamicUser=1 (see above)?
2451 * We do this for all directory types except for ConfigurationDirectory=,
2452 * since they all support the private/ symlink logic at least in some
2453 * configurations, see above. */
2455 r
= chase_symlinks(target
, NULL
, 0, &target_resolved
, NULL
);
2459 q
= path_join(params
->prefix
[type
], "private", context
->directories
[type
].items
[i
].path
);
2465 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2466 r
= chase_symlinks(q
, NULL
, CHASE_NONEXISTENT
, &q_resolved
, NULL
);
2470 if (path_equal(q_resolved
, target_resolved
)) {
2472 /* Hmm, apparently DynamicUser= was once turned on for this service,
2473 * but is no longer. Let's move the directory back up. */
2475 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2476 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2477 exec_directory_type_to_string(type
), q
, p
);
2479 if (unlink(p
) < 0) {
2484 if (rename(q
, p
) < 0) {
2491 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2496 if (type
== EXEC_DIRECTORY_CONFIGURATION
) {
2499 /* Don't change the owner/access mode of the configuration directory,
2500 * as in the common case it is not written to by a service, and shall
2501 * not be writable. */
2503 if (stat(p
, &st
) < 0) {
2508 /* Still complain if the access mode doesn't match */
2509 if (((st
.st_mode
^ context
->directories
[type
].mode
) & 07777) != 0)
2510 log_warning("%s \'%s\' already exists but the mode is different. "
2511 "(File system: %o %sMode: %o)",
2512 exec_directory_type_to_string(type
), context
->directories
[type
].items
[i
].path
,
2513 st
.st_mode
& 07777, exec_directory_type_to_string(type
), context
->directories
[type
].mode
& 07777);
2520 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2521 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2522 * current UID/GID ownership.) */
2523 r
= chmod_and_chown(pp
?: p
, context
->directories
[type
].mode
, UID_INVALID
, GID_INVALID
);
2527 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2528 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2529 * assignments to exist. */
2530 r
= path_chown_recursive(pp
?: p
, uid
, gid
, context
->dynamic_user
? 01777 : 07777);
2535 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2536 * they are set up later, to allow configuring empty var/run/etc. */
2537 if (!needs_mount_namespace
)
2538 for (size_t i
= 0; i
< context
->directories
[type
].n_items
; i
++) {
2539 r
= create_many_symlinks(params
->prefix
[type
],
2540 context
->directories
[type
].items
[i
].path
,
2541 context
->directories
[type
].items
[i
].symlinks
);
2549 *exit_status
= exit_status_table
[type
];
/* Writes one credential into the directory referred to by dfd, under the name id.
 *
 * The data is first written to a randomly named temporary file (tempfn_random_child() with the "cred"
 * tag), opened with O_CREAT|O_EXCL|O_NOFOLLOW and mode 0600. After the write the mode is reduced to 0400.
 * If uid is valid and differs from getuid(), read access is granted to it through an ACL entry; the
 * fchown() call further down is the fallback discussed in the comment next to the ownership_ok check.
 * Finally the temporary file is renamed to id within the same directory. The temporary name is held in a
 * variable with the unlink_and_freep cleanup handler.
 *
 * NOTE(review): most of the parameter list, the error checks and returns, and the statement that follows
 * the successful rename are not visible in this listing -- confirm against the complete file. */
2553 static int write_credential(
2559 bool ownership_ok
) {
2561 _cleanup_(unlink_and_freep
) char *tmp
= NULL
;
2562 _cleanup_close_
int fd
= -EBADF
;
2565 r
= tempfn_random_child("", "cred", &tmp
);
2569 fd
= openat(dfd
, tmp
, O_CREAT
|O_RDWR
|O_CLOEXEC
|O_EXCL
|O_NOFOLLOW
|O_NOCTTY
, 0600);
2575 r
= loop_write(fd
, data
, size
, /* do_poll = */ false);
2579 if (fchmod(fd
, 0400) < 0) /* Take away "w" bit */
2582 if (uid_is_valid(uid
) && uid
!= getuid()) {
2583 r
= fd_add_uid_acl_permission(fd
, uid
, ACL_READ
);
2585 if (!ERRNO_IS_NOT_SUPPORTED(r
) && !ERRNO_IS_PRIVILEGE(r
))
2588 if (!ownership_ok
) /* Ideally we use ACLs, since we can neatly express what we want
2589 * to express: that the user gets read access and nothing
2590 * else. But if the backing fs can't support that (e.g. ramfs)
2591 * then we can use file ownership instead. But that's only safe if
2592 * we can then re-mount the whole thing read-only, so that the
2593 * user can no longer chmod() the file to gain write access. */
2596 if (fchown(fd
, uid
, GID_INVALID
) < 0)
2601 if (renameat(dfd
, tmp
, dfd
, id
) < 0)
/* Assembles the list of directories in which credentials are looked up and returns it as a string
 * vector.
 *
 * Entries are appended in this order: params->received_encrypted_credentials_directory, the
 * CONF_PATHS_STRV("credstore.encrypted") directories, params->received_credentials_directory (only when
 * set), and the CONF_PATHS_STRV("credstore") directories. Duplicates are filtered when the CONF_PATHS
 * lists are appended. With debug logging enabled the resulting path is logged, joined by ":".
 *
 * NOTE(review): the second parameter, the conditions around the two "encrypted" entries, the failure
 * returns and the final return are not visible in this listing -- confirm against the complete file. */
2608 static char **credential_search_path(
2609 const ExecParameters
*params
,
2612 _cleanup_strv_free_
char **l
= NULL
;
2616 /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
2617 * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
2618 * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
2621 if (strv_extend(&l
, params
->received_encrypted_credentials_directory
) < 0)
2624 if (strv_extend_strv(&l
, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2628 if (params
->received_credentials_directory
)
2629 if (strv_extend(&l
, params
->received_credentials_directory
) < 0)
2632 if (strv_extend_strv(&l
, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2635 if (DEBUG_LOGGING
) {
2636 _cleanup_free_
char *t
= strv_join(l
, ":");
2638 log_debug("Credential search path is: %s", t
);
/* Reads one credential from its source and stores it under the name id in the directory write_dfd.
 *
 * The source is selected from read_dfd and path:
 * - read_dfd >= 0: path must be a valid file name and is read relative to that directory;
 * - path is absolute: it is read directly, with READ_FULL_FILE_CONNECT_SOCKET added so an AF_UNIX socket
 *   can serve as source, and with a bind name of the form "@<random>/unit/<unit>/<id>";
 * - path is a valid credential name: it is looked up in each directory of credential_search_path().
 *
 * The size limit is CREDENTIAL_ENCRYPTED_SIZE_MAX for encrypted credentials and CREDENTIAL_SIZE_MAX
 * otherwise. An -ENOENT result is only logged and skipped when missing_ok is set or
 * context->set_credentials has an entry for id. Decrypted data (decrypt_credential_and_warn()) replaces
 * the buffer before it is passed to write_credential(). The buffer uses the erase_and_freep cleanup
 * handler.
 *
 * NOTE(review): most of the parameter list, the assignments to source and missing_ok, the arguments of
 * the read_full_file_full() calls, the use of "add" for size accounting, and the error returns are not
 * visible in this listing; two comment blocks below have also lost their closing line -- confirm
 * against the complete file. */
2644 static int load_credential(
2645 const ExecContext
*context
,
2646 const ExecParameters
*params
,
2657 ReadFullFileFlags flags
= READ_FULL_FILE_SECURE
|READ_FULL_FILE_FAIL_WHEN_LARGER
;
2658 _cleanup_strv_free_
char **search_path
= NULL
;
2659 _cleanup_(erase_and_freep
) char *data
= NULL
;
2660 _cleanup_free_
char *bindname
= NULL
;
2661 const char *source
= NULL
;
2662 bool missing_ok
= true;
2663 size_t size
, add
, maxsz
;
2671 assert(read_dfd
>= 0 || read_dfd
== AT_FDCWD
);
2672 assert(write_dfd
>= 0);
2675 if (read_dfd
>= 0) {
2676 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2677 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2678 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2681 if (!filename_is_valid(path
)) /* safety check */
2687 } else if (path_is_absolute(path
)) {
2688 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2691 if (!path_is_valid(path
)) /* safety check */
2694 flags
|= READ_FULL_FILE_CONNECT_SOCKET
;
2696 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2697 * via the source socket address in case we read off an AF_UNIX socket. */
2698 if (asprintf(&bindname
, "@%" PRIx64
"/unit/%s/%s", random_u64(), unit
, id
) < 0)
2704 } else if (credential_name_valid(path
)) {
2705 /* If this is a relative path, take it as credential name relative to the credentials
2706 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
2707 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
2709 search_path
= credential_search_path(params
, encrypted
);
2718 flags
|= READ_FULL_FILE_UNBASE64
;
2720 maxsz
= encrypted
? CREDENTIAL_ENCRYPTED_SIZE_MAX
: CREDENTIAL_SIZE_MAX
;
2723 STRV_FOREACH(d
, search_path
) {
2724 _cleanup_free_
char *j
= NULL
;
2726 j
= path_join(*d
, path
);
2730 r
= read_full_file_full(
2731 AT_FDCWD
, j
, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2741 r
= read_full_file_full(
2751 if (r
== -ENOENT
&& (missing_ok
|| hashmap_contains(context
->set_credentials
, id
))) {
2752 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2753 * will get clear errors if we don't pass such a missing credential on as they
2754 * themselves will get ENOENT when trying to read them, which should not be much
2755 * worse than when we handle the error here and make it fatal.
2757 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2758 * we are fine, too. */
2759 log_debug_errno(r
, "Couldn't read inherited credential '%s', skipping: %m", path
);
2763 return log_debug_errno(r
, "Failed to read credential '%s': %m", path
);
2766 _cleanup_free_
void *plaintext
= NULL
;
2767 size_t plaintext_size
= 0;
2769 r
= decrypt_credential_and_warn(id
, now(CLOCK_REALTIME
), NULL
, NULL
, data
, size
, &plaintext
, &plaintext_size
);
2773 free_and_replace(data
, plaintext
);
2774 size
= plaintext_size
;
2777 add
= strlen(id
) + size
;
2781 r
= write_credential(write_dfd
, id
, data
, size
, uid
, ownership_ok
);
2783 return log_debug_errno(r
, "Failed to write credential '%s': %m", id
);
/* Argument bundle handed to load_cred_recurse_dir_cb() through its userdata pointer.
 *
 * NOTE(review): only the first two members are visible in this listing; the callback and its caller also
 * use further members (e.g. dfd, encrypted, ownership_ok) -- confirm the full member list against the
 * complete file. */
2789 struct load_cred_args
{
2790 const ExecContext
*context
;
2791 const ExecParameters
*params
;
/* Directory-recursion callback that loads one credential per file found.
 *
 * Events other than RECURSE_DIR_ENTRY, and entries that are neither regular files (DT_REG) nor sockets
 * (DT_SOCK), are skipped with RECURSE_DIR_CONTINUE. The credential ID is the entry's path with every "/"
 * replaced by "_"; an ID that fails credential_name_valid() is refused with EINVAL. If an entry with
 * that ID already exists in args->dfd the file is skipped, so the first one found wins. Otherwise
 * load_credential() is called and iteration continues.
 *
 * userdata must be a non-NULL pointer to a struct load_cred_args (enforced by ASSERT_PTR()).
 *
 * NOTE(review): part of the parameter list, the arguments of the load_credential() call and the handling
 * of its result are not visible in this listing -- confirm against the complete file. */
2800 static int load_cred_recurse_dir_cb(
2801 RecurseDirEvent event
,
2805 const struct dirent
*de
,
2806 const struct statx
*sx
,
2809 struct load_cred_args
*args
= ASSERT_PTR(userdata
);
2810 _cleanup_free_
char *sub_id
= NULL
;
2813 if (event
!= RECURSE_DIR_ENTRY
)
2814 return RECURSE_DIR_CONTINUE
;
2816 if (!IN_SET(de
->d_type
, DT_REG
, DT_SOCK
))
2817 return RECURSE_DIR_CONTINUE
;
2819 sub_id
= strreplace(path
, "/", "_");
2823 if (!credential_name_valid(sub_id
))
2824 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL
), "Credential would get ID %s, which is not valid, refusing", sub_id
);
2826 if (faccessat(args
->dfd
, sub_id
, F_OK
, AT_SYMLINK_NOFOLLOW
) >= 0) {
2827 log_debug("Skipping credential with duplicated ID %s at %s", sub_id
, path
);
2828 return RECURSE_DIR_CONTINUE
;
2830 if (errno
!= ENOENT
)
2831 return log_debug_errno(errno
, "Failed to test if credential %s exists: %m", sub_id
);
2833 r
= load_credential(
2848 return RECURSE_DIR_CONTINUE
;
2851 static int acquire_credentials(
2852 const ExecContext
*context
,
2853 const ExecParameters
*params
,
2857 bool ownership_ok
) {
2859 uint64_t left
= CREDENTIALS_TOTAL_SIZE_MAX
;
2860 _cleanup_close_
int dfd
= -EBADF
;
2861 ExecLoadCredential
*lc
;
2862 ExecSetCredential
*sc
;
2868 dfd
= open(p
, O_DIRECTORY
|O_CLOEXEC
);
2872 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2873 HASHMAP_FOREACH(lc
, context
->load_credentials
) {
2874 _cleanup_close_
int sub_fd
= -EBADF
;
2876 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
2877 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
2878 * a regular file. Finally, if it's a relative path we will use it as a credential name to
2879 * propagate a credential passed to us from further up. */
2881 if (path_is_absolute(lc
->path
)) {
2882 sub_fd
= open(lc
->path
, O_DIRECTORY
|O_CLOEXEC
|O_RDONLY
);
2883 if (sub_fd
< 0 && !IN_SET(errno
,
2884 ENOTDIR
, /* Not a directory */
2885 ENOENT
)) /* Doesn't exist? */
2886 return log_debug_errno(errno
, "Failed to open '%s': %m", lc
->path
);
2890 /* Regular file (incl. a credential passed in from higher up) */
2891 r
= load_credential(
2907 /* path= */ lc
->id
, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
2908 /* statx_mask= */ 0,
2909 /* n_depth_max= */ UINT_MAX
,
2910 RECURSE_DIR_SORT
|RECURSE_DIR_IGNORE_DOT
|RECURSE_DIR_ENSURE_TYPE
,
2911 load_cred_recurse_dir_cb
,
2912 &(struct load_cred_args
) {
2915 .encrypted
= lc
->encrypted
,
2919 .ownership_ok
= ownership_ok
,
2926 /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
2927 * them, so that they can act as a "default" if the same credential is specified multiple times. */
2928 HASHMAP_FOREACH(sc
, context
->set_credentials
) {
2929 _cleanup_(erase_and_freep
) void *plaintext
= NULL
;
2933 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
2934 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
2935 * slow and involved, hence it's nice to be able to skip that if the credential already
2937 if (faccessat(dfd
, sc
->id
, F_OK
, AT_SYMLINK_NOFOLLOW
) >= 0)
2939 if (errno
!= ENOENT
)
2940 return log_debug_errno(errno
, "Failed to test if credential %s exists: %m", sc
->id
);
2942 if (sc
->encrypted
) {
2943 r
= decrypt_credential_and_warn(sc
->id
, now(CLOCK_REALTIME
), NULL
, NULL
, sc
->data
, sc
->size
, &plaintext
, &size
);
2953 add
= strlen(sc
->id
) + size
;
2957 r
= write_credential(dfd
, sc
->id
, data
, size
, uid
, ownership_ok
);
2964 if (fchmod(dfd
, 0500) < 0) /* Now take away the "w" bit */
2967 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2970 if (uid_is_valid(uid
) && uid
!= getuid()) {
2971 r
= fd_add_uid_acl_permission(dfd
, uid
, ACL_READ
| ACL_EXECUTE
);
2973 if (!ERRNO_IS_NOT_SUPPORTED(r
) && !ERRNO_IS_PRIVILEGE(r
))
2979 if (fchown(dfd
, uid
, GID_INVALID
) < 0)
2987 static int setup_credentials_internal(
2988 const ExecContext
*context
,
2989 const ExecParameters
*params
,
2991 const char *final
, /* This is where the credential store shall eventually end up at */
2992 const char *workspace
, /* This is where we can prepare it before moving it to the final place */
2993 bool reuse_workspace
, /* Whether to reuse any existing workspace mount if it already is a mount */
2994 bool must_mount
, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2997 int r
, workspace_mounted
; /* negative if we don't know yet whether we have/can mount something; true
2998 * if we mounted something; false if we definitely can't mount anything */
3006 if (reuse_workspace
) {
3007 r
= path_is_mount_point(workspace
, NULL
, 0);
3011 workspace_mounted
= true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3013 workspace_mounted
= -1; /* We need to figure out if we can mount something to the workspace */
3015 workspace_mounted
= -1; /* ditto */
3017 r
= path_is_mount_point(final
, NULL
, 0);
3021 /* If the final place already has something mounted, we use that. If the workspace also has
3022 * something mounted we assume it's actually the same mount (but with MS_RDONLY
3024 final_mounted
= true;
3026 if (workspace_mounted
< 0) {
3027 /* If the final place is mounted, but the workspace isn't, then let's bind mount
3028 * the final version to the workspace, and make it writable, so that we can make
3031 r
= mount_nofollow_verbose(LOG_DEBUG
, final
, workspace
, NULL
, MS_BIND
|MS_REC
, NULL
);
3035 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, workspace
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, NULL
);
3039 workspace_mounted
= true;
3042 final_mounted
= false;
3044 if (workspace_mounted
< 0) {
3045 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3046 for (int try = 0;; try++) {
3049 /* Try "ramfs" first, since it's not swap backed */
3050 r
= mount_nofollow_verbose(LOG_DEBUG
, "ramfs", workspace
, "ramfs", MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, "mode=0700");
3052 workspace_mounted
= true;
3056 } else if (try == 1) {
3057 _cleanup_free_
char *opts
= NULL
;
3059 if (asprintf(&opts
, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX
) < 0)
3062 /* Fall back to "tmpfs" otherwise */
3063 r
= mount_nofollow_verbose(LOG_DEBUG
, "tmpfs", workspace
, "tmpfs", MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, opts
);
3065 workspace_mounted
= true;
3070 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3071 r
= mount_nofollow_verbose(LOG_DEBUG
, final
, workspace
, NULL
, MS_BIND
|MS_REC
, NULL
);
3073 if (!ERRNO_IS_PRIVILEGE(r
)) /* Propagate anything that isn't a permission problem */
3076 if (must_mount
) /* If it's not OK to use the plain directory
3077 * fallback, propagate all errors too */
3080 /* If we lack privileges to bind mount stuff, then let's gracefully
3081 * proceed for compat with container envs, and just use the final dir
3084 workspace_mounted
= false;
3088 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3089 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, workspace
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, NULL
);
3093 workspace_mounted
= true;
3099 assert(!must_mount
|| workspace_mounted
> 0);
3100 where
= workspace_mounted
? workspace
: final
;
3102 (void) label_fix_full(AT_FDCWD
, where
, final
, 0);
3104 r
= acquire_credentials(context
, params
, unit
, where
, uid
, workspace_mounted
);
3108 if (workspace_mounted
) {
3109 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3110 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, workspace
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, NULL
);
3114 /* And mount it to the final place, read-only */
3116 r
= umount_verbose(LOG_DEBUG
, workspace
, MNT_DETACH
|UMOUNT_NOFOLLOW
);
3118 r
= mount_nofollow_verbose(LOG_DEBUG
, workspace
, final
, NULL
, MS_MOVE
, NULL
);
3122 _cleanup_free_
char *parent
= NULL
;
3124 /* If we do not have our own mount but used the plain directory fallback, then we need to
3125 * open access to the top-level credential directory and the per-service directory now */
3127 r
= path_extract_directory(final
, &parent
);
3130 if (chmod(parent
, 0755) < 0)
3137 static int setup_credentials(
3138 const ExecContext
*context
,
3139 const ExecParameters
*params
,
3143 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
3149 if (!exec_context_has_credentials(context
))
3152 if (!params
->prefix
[EXEC_DIRECTORY_RUNTIME
])
3155 /* This is where we'll place stuff when we are done; this main credentials directory is world-readable,
3156 * and the subdir we mount over with a read-only file system readable by the service's user */
3157 q
= path_join(params
->prefix
[EXEC_DIRECTORY_RUNTIME
], "credentials");
3161 r
= mkdir_label(q
, 0755); /* top-level dir: world readable/searchable */
3162 if (r
< 0 && r
!= -EEXIST
)
3165 p
= path_join(q
, unit
);
3169 r
= mkdir_label(p
, 0700); /* per-unit dir: private to user */
3170 if (r
< 0 && r
!= -EEXIST
)
3173 r
= safe_fork("(sd-mkdcreds)", FORK_DEATHSIG
|FORK_WAIT
|FORK_NEW_MOUNTNS
, NULL
);
3175 _cleanup_free_
char *t
= NULL
, *u
= NULL
;
3177 /* If this is not a privilege or support issue then propagate the error */
3178 if (!ERRNO_IS_NOT_SUPPORTED(r
) && !ERRNO_IS_PRIVILEGE(r
))
3181 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3182 * it into place, so that users can't access half-initialized credential stores. */
3183 t
= path_join(params
->prefix
[EXEC_DIRECTORY_RUNTIME
], "systemd/temporary-credentials");
3187 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3188 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3189 * after it is fully set up */
3190 u
= path_join(t
, unit
);
3194 FOREACH_STRING(i
, t
, u
) {
3195 r
= mkdir_label(i
, 0700);
3196 if (r
< 0 && r
!= -EEXIST
)
3200 r
= setup_credentials_internal(
3204 p
, /* final mount point */
3205 u
, /* temporary workspace to overmount */
3206 true, /* reuse the workspace if it is already a mount */
3207 false, /* it's OK to fall back to a plain directory if we can't mount anything */
3210 (void) rmdir(u
); /* remove the workspace again if we can. */
3215 } else if (r
== 0) {
3217 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3218 * we can use the same directory for all cases, after turning off propagation. Question
3219 * though is: where do we turn off propagation exactly, and where do we place the workspace
3220 * directory? We need some place that is guaranteed to be a mount point in the host, and
3221 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3222 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3223 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3224 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3225 * we do here instead is use /dev/ and /dev/shm/ for our purposes. We know for sure that
3226 * /dev/ is a mount point and we know for sure that /dev/shm/ exists. Hence we can turn off
3227 * propagation on the former, and then overmount the latter.
3229 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3230 * for this purpose, but there are few other candidates that work equally well for us, and
3231 * given that we do this in a privately namespaced short-lived single-threaded process
3232 * that no one else sees this should be OK to do. */
3234 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, "/dev", NULL
, MS_SLAVE
|MS_REC
, NULL
); /* Turn off propagation from our namespace to host */
3238 r
= setup_credentials_internal(
3242 p
, /* final mount point */
3243 "/dev/shm", /* temporary workspace to overmount */
3244 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3245 true, /* insist that something is mounted, do not allow fallback to plain directory */
3250 _exit(EXIT_SUCCESS
);
3253 _exit(EXIT_FAILURE
);
3260 static int setup_smack(
3261 const Manager
*manager
,
3262 const ExecContext
*context
,
3263 int executable_fd
) {
3267 assert(executable_fd
>= 0);
3269 if (context
->smack_process_label
) {
3270 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
3273 } else if (manager
->default_smack_process_label
) {
3274 _cleanup_free_
char *exec_label
= NULL
;
3276 r
= mac_smack_read_fd(executable_fd
, SMACK_ATTR_EXEC
, &exec_label
);
3277 if (r
< 0 && !ERRNO_IS_XATTR_ABSENT(r
))
3280 r
= mac_smack_apply_pid(0, exec_label
? : manager
->default_smack_process_label
);
3289 static int compile_bind_mounts(
3290 const ExecContext
*context
,
3291 const ExecParameters
*params
,
3292 BindMount
**ret_bind_mounts
,
3293 size_t *ret_n_bind_mounts
,
3294 char ***ret_empty_directories
) {
3296 _cleanup_strv_free_
char **empty_directories
= NULL
;
3297 BindMount
*bind_mounts
;
3303 assert(ret_bind_mounts
);
3304 assert(ret_n_bind_mounts
);
3305 assert(ret_empty_directories
);
3307 n
= context
->n_bind_mounts
;
3308 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
3309 if (!params
->prefix
[t
])
3312 for (size_t i
= 0; i
< context
->directories
[t
].n_items
; i
++)
3313 n
+= !context
->directories
[t
].items
[i
].only_create
;
3317 *ret_bind_mounts
= NULL
;
3318 *ret_n_bind_mounts
= 0;
3319 *ret_empty_directories
= NULL
;
3323 bind_mounts
= new(BindMount
, n
);
3327 for (size_t i
= 0; i
< context
->n_bind_mounts
; i
++) {
3328 BindMount
*item
= context
->bind_mounts
+ i
;
3331 s
= strdup(item
->source
);
3337 d
= strdup(item
->destination
);
3344 bind_mounts
[h
++] = (BindMount
) {
3347 .read_only
= item
->read_only
,
3348 .recursive
= item
->recursive
,
3349 .ignore_enoent
= item
->ignore_enoent
,
3353 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
3354 if (!params
->prefix
[t
])
3357 if (context
->directories
[t
].n_items
== 0)
3360 if (exec_directory_is_private(context
, t
) &&
3361 !exec_context_with_rootfs(context
)) {
3364 /* So this is for a dynamic user, and we need to make sure the process can access its own
3365 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3366 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3368 private_root
= path_join(params
->prefix
[t
], "private");
3369 if (!private_root
) {
3374 r
= strv_consume(&empty_directories
, private_root
);
3379 for (size_t i
= 0; i
< context
->directories
[t
].n_items
; i
++) {
3382 /* When one of the parent directories is in the list, we cannot create the symlink
3383 * for the child directory. See also the comments in setup_exec_directory(). */
3384 if (context
->directories
[t
].items
[i
].only_create
)
3387 if (exec_directory_is_private(context
, t
))
3388 s
= path_join(params
->prefix
[t
], "private", context
->directories
[t
].items
[i
].path
);
3390 s
= path_join(params
->prefix
[t
], context
->directories
[t
].items
[i
].path
);
3396 if (exec_directory_is_private(context
, t
) &&
3397 exec_context_with_rootfs(context
))
3398 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3399 * directory is not created on the root directory. So, let's bind-mount the directory
3400 * on the 'non-private' place. */
3401 d
= path_join(params
->prefix
[t
], context
->directories
[t
].items
[i
].path
);
3410 bind_mounts
[h
++] = (BindMount
) {
3414 .nosuid
= context
->dynamic_user
, /* don't allow suid/sgid when DynamicUser= is on */
3416 .ignore_enoent
= false,
3423 *ret_bind_mounts
= bind_mounts
;
3424 *ret_n_bind_mounts
= n
;
3425 *ret_empty_directories
= TAKE_PTR(empty_directories
);
3430 bind_mount_free_many(bind_mounts
, h
);
3434 /* ret_symlinks will contain a list of pairs src:dest that describes
3435 * the symlinks to create later on. For example, the symlinks needed
3436 * to safely give private directories to DynamicUser=1 users. */
3437 static int compile_symlinks(
3438 const ExecContext
*context
,
3439 const ExecParameters
*params
,
3440 char ***ret_symlinks
) {
3442 _cleanup_strv_free_
char **symlinks
= NULL
;
3447 assert(ret_symlinks
);
3449 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
3450 for (size_t i
= 0; i
< context
->directories
[dt
].n_items
; i
++) {
3451 _cleanup_free_
char *private_path
= NULL
, *path
= NULL
;
3453 STRV_FOREACH(symlink
, context
->directories
[dt
].items
[i
].symlinks
) {
3454 _cleanup_free_
char *src_abs
= NULL
, *dst_abs
= NULL
;
3456 src_abs
= path_join(params
->prefix
[dt
], context
->directories
[dt
].items
[i
].path
);
3457 dst_abs
= path_join(params
->prefix
[dt
], *symlink
);
3458 if (!src_abs
|| !dst_abs
)
3461 r
= strv_consume_pair(&symlinks
, TAKE_PTR(src_abs
), TAKE_PTR(dst_abs
));
3466 if (!exec_directory_is_private(context
, dt
) ||
3467 exec_context_with_rootfs(context
) ||
3468 context
->directories
[dt
].items
[i
].only_create
)
3471 private_path
= path_join(params
->prefix
[dt
], "private", context
->directories
[dt
].items
[i
].path
);
3475 path
= path_join(params
->prefix
[dt
], context
->directories
[dt
].items
[i
].path
);
3479 r
= strv_consume_pair(&symlinks
, TAKE_PTR(private_path
), TAKE_PTR(path
));
3485 *ret_symlinks
= TAKE_PTR(symlinks
);
3490 static bool insist_on_sandboxing(
3491 const ExecContext
*context
,
3492 const char *root_dir
,
3493 const char *root_image
,
3494 const BindMount
*bind_mounts
,
3495 size_t n_bind_mounts
) {
3498 assert(n_bind_mounts
== 0 || bind_mounts
);
3500 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3501 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3502 * rearrange stuff in a way we cannot ignore gracefully. */
3504 if (context
->n_temporary_filesystems
> 0)
3507 if (root_dir
|| root_image
)
3510 if (context
->n_mount_images
> 0)
3513 if (context
->dynamic_user
)
3516 if (context
->n_extension_images
> 0 || !strv_isempty(context
->extension_directories
))
3519 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3521 for (size_t i
= 0; i
< n_bind_mounts
; i
++)
3522 if (!path_equal(bind_mounts
[i
].source
, bind_mounts
[i
].destination
))
3525 if (context
->log_namespace
)
3531 static int apply_mount_namespace(
3533 ExecCommandFlags command_flags
,
3534 const ExecContext
*context
,
3535 const ExecParameters
*params
,
3536 const ExecRuntime
*runtime
,
3537 char **error_path
) {
3539 _cleanup_strv_free_
char **empty_directories
= NULL
, **symlinks
= NULL
;
3540 const char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
3541 const char *root_dir
= NULL
, *root_image
= NULL
;
3542 _cleanup_free_
char *creds_path
= NULL
, *incoming_dir
= NULL
, *propagate_dir
= NULL
,
3543 *extension_dir
= NULL
;
3544 NamespaceInfo ns_info
;
3545 bool needs_sandboxing
;
3546 BindMount
*bind_mounts
= NULL
;
3547 size_t n_bind_mounts
= 0;
3552 if (params
->flags
& EXEC_APPLY_CHROOT
) {
3553 root_image
= context
->root_image
;
3556 root_dir
= context
->root_directory
;
3559 r
= compile_bind_mounts(context
, params
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
3563 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
3564 r
= compile_symlinks(context
, params
, &symlinks
);
3568 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command_flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
3569 if (needs_sandboxing
) {
3570 /* The runtime struct only contains the parent of the private /tmp,
3571 * which is non-accessible to world users. Inside of it there's a /tmp
3572 * that is sticky, and that's the one we want to use here.
3573 * This does not apply when we are using /run/systemd/empty as fallback. */
3575 if (context
->private_tmp
&& runtime
) {
3576 if (streq_ptr(runtime
->tmp_dir
, RUN_SYSTEMD_EMPTY
))
3577 tmp_dir
= runtime
->tmp_dir
;
3578 else if (runtime
->tmp_dir
)
3579 tmp_dir
= strjoina(runtime
->tmp_dir
, "/tmp");
3581 if (streq_ptr(runtime
->var_tmp_dir
, RUN_SYSTEMD_EMPTY
))
3582 var_tmp_dir
= runtime
->var_tmp_dir
;
3583 else if (runtime
->var_tmp_dir
)
3584 var_tmp_dir
= strjoina(runtime
->var_tmp_dir
, "/tmp");
3587 ns_info
= (NamespaceInfo
) {
3588 .ignore_protect_paths
= false,
3589 .private_dev
= context
->private_devices
,
3590 .protect_control_groups
= context
->protect_control_groups
,
3591 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
3592 .protect_kernel_modules
= context
->protect_kernel_modules
,
3593 .protect_kernel_logs
= context
->protect_kernel_logs
,
3594 .protect_hostname
= context
->protect_hostname
,
3595 .mount_apivfs
= exec_context_get_effective_mount_apivfs(context
),
3596 .private_mounts
= context
->private_mounts
,
3597 .protect_home
= context
->protect_home
,
3598 .protect_system
= context
->protect_system
,
3599 .protect_proc
= context
->protect_proc
,
3600 .proc_subset
= context
->proc_subset
,
3601 .private_ipc
= context
->private_ipc
|| context
->ipc_namespace_path
,
3602 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3603 .mount_nosuid
= context
->no_new_privileges
&& !mac_selinux_use(),
3605 } else if (!context
->dynamic_user
&& root_dir
)
3607 * If DynamicUser=no and RootDirectory= is set then let's pass a relaxed
3608 * sandbox info, otherwise enforce it, don't ignore protected paths and
3609 * fail if we are unable to apply the sandbox inside the mount namespace.
3611 ns_info
= (NamespaceInfo
) {
3612 .ignore_protect_paths
= true,
3615 ns_info
= (NamespaceInfo
) {};
3617 if (context
->mount_flags
== MS_SHARED
)
3618 log_unit_debug(u
, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3620 if (exec_context_has_credentials(context
) &&
3621 params
->prefix
[EXEC_DIRECTORY_RUNTIME
] &&
3622 FLAGS_SET(params
->flags
, EXEC_WRITE_CREDENTIALS
)) {
3623 creds_path
= path_join(params
->prefix
[EXEC_DIRECTORY_RUNTIME
], "credentials", u
->id
);
3630 if (MANAGER_IS_SYSTEM(u
->manager
)) {
3631 propagate_dir
= path_join("/run/systemd/propagate/", u
->id
);
3632 if (!propagate_dir
) {
3637 incoming_dir
= strdup("/run/systemd/incoming");
3638 if (!incoming_dir
) {
3643 extension_dir
= strdup("/run/systemd/unit-extensions");
3644 if (!extension_dir
) {
3649 if (asprintf(&extension_dir
, "/run/user/" UID_FMT
"/systemd/unit-extensions", geteuid()) < 0) {
3654 r
= setup_namespace(root_dir
, root_image
, context
->root_image_options
,
3655 &ns_info
, context
->read_write_paths
,
3656 needs_sandboxing
? context
->read_only_paths
: NULL
,
3657 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
3658 needs_sandboxing
? context
->exec_paths
: NULL
,
3659 needs_sandboxing
? context
->no_exec_paths
: NULL
,
3664 context
->temporary_filesystems
,
3665 context
->n_temporary_filesystems
,
3666 context
->mount_images
,
3667 context
->n_mount_images
,
3671 context
->log_namespace
,
3672 context
->mount_flags
,
3673 context
->root_hash
, context
->root_hash_size
, context
->root_hash_path
,
3674 context
->root_hash_sig
, context
->root_hash_sig_size
, context
->root_hash_sig_path
,
3675 context
->root_verity
,
3676 context
->extension_images
,
3677 context
->n_extension_images
,
3678 context
->extension_directories
,
3682 root_dir
|| root_image
? params
->notify_socket
: NULL
,
3685 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3686 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3687 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3688 * completely different execution environment. */
3690 if (insist_on_sandboxing(
3692 root_dir
, root_image
,
3695 log_unit_debug(u
, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3696 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3697 n_bind_mounts
, context
->n_temporary_filesystems
, yes_no(root_dir
), yes_no(root_image
), yes_no(context
->dynamic_user
));
3701 log_unit_debug(u
, "Failed to set up namespace, assuming containerized execution and ignoring.");
3707 bind_mount_free_many(bind_mounts
, n_bind_mounts
);
3711 static int apply_working_directory(
3712 const ExecContext
*context
,
3713 const ExecParameters
*params
,
3720 assert(exit_status
);
3722 if (context
->working_directory_home
) {
3725 *exit_status
= EXIT_CHDIR
;
3732 wd
= empty_to_root(context
->working_directory
);
3734 if (params
->flags
& EXEC_APPLY_CHROOT
)
3737 d
= prefix_roota(context
->root_directory
, wd
);
3739 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
3740 *exit_status
= EXIT_CHDIR
;
3747 static int apply_root_directory(
3748 const ExecContext
*context
,
3749 const ExecParameters
*params
,
3750 const bool needs_mount_ns
,
3754 assert(exit_status
);
3756 if (params
->flags
& EXEC_APPLY_CHROOT
)
3757 if (!needs_mount_ns
&& context
->root_directory
)
3758 if (chroot(context
->root_directory
) < 0) {
3759 *exit_status
= EXIT_CHROOT
;
3766 static int setup_keyring(
3768 const ExecContext
*context
,
3769 const ExecParameters
*p
,
3770 uid_t uid
, gid_t gid
) {
3772 key_serial_t keyring
;
3781 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3782 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3783 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3784 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3785 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3786 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3788 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
3791 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3792 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3793 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3794 * & group is just as nasty as acquiring a reference to the user keyring. */
3796 saved_uid
= getuid();
3797 saved_gid
= getgid();
3799 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
3800 if (setregid(gid
, -1) < 0)
3801 return log_unit_error_errno(u
, errno
, "Failed to change GID for user keyring: %m");
3804 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
3805 if (setreuid(uid
, -1) < 0) {
3806 r
= log_unit_error_errno(u
, errno
, "Failed to change UID for user keyring: %m");
3811 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
3812 if (keyring
== -1) {
3813 if (errno
== ENOSYS
)
3814 log_unit_debug_errno(u
, errno
, "Kernel keyring not supported, ignoring.");
3815 else if (ERRNO_IS_PRIVILEGE(errno
))
3816 log_unit_debug_errno(u
, errno
, "Kernel keyring access prohibited, ignoring.");
3817 else if (errno
== EDQUOT
)
3818 log_unit_debug_errno(u
, errno
, "Out of kernel keyrings to allocate, ignoring.");
3820 r
= log_unit_error_errno(u
, errno
, "Setting up kernel keyring failed: %m");
3825 /* When requested link the user keyring into the session keyring. */
3826 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
3828 if (keyctl(KEYCTL_LINK
,
3829 KEY_SPEC_USER_KEYRING
,
3830 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
3831 r
= log_unit_error_errno(u
, errno
, "Failed to link user keyring into session keyring: %m");
3836 /* Restore uid/gid back */
3837 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
3838 if (setreuid(saved_uid
, -1) < 0) {
3839 r
= log_unit_error_errno(u
, errno
, "Failed to change UID back for user keyring: %m");
3844 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
3845 if (setregid(saved_gid
, -1) < 0)
3846 return log_unit_error_errno(u
, errno
, "Failed to change GID back for user keyring: %m");
3849 /* Populate the keyring with the invocation ID by default, as original saved_uid. */
3850 if (!sd_id128_is_null(u
->invocation_id
)) {
3853 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
3855 log_unit_debug_errno(u
, errno
, "Failed to add invocation ID to keyring, ignoring: %m");
3857 if (keyctl(KEYCTL_SETPERM
, key
,
3858 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
3859 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
3860 r
= log_unit_error_errno(u
, errno
, "Failed to restrict invocation ID permission: %m");
3865 /* Revert back uid & gid for the last time, and exit */
3866 /* no extra logging, as only the first already reported error matters */
3867 if (getuid() != saved_uid
)
3868 (void) setreuid(saved_uid
, -1);
3870 if (getgid() != saved_gid
)
3871 (void) setregid(saved_gid
, -1);
3876 static void append_socket_pair(int *array
, size_t *n
, const int pair
[static 2]) {
3882 array
[(*n
)++] = pair
[0];
3884 array
[(*n
)++] = pair
[1];
3887 static int close_remaining_fds(
3888 const ExecParameters
*params
,
3889 const ExecRuntime
*runtime
,
3890 const DynamicCreds
*dcreds
,
3893 const int *fds
, size_t n_fds
) {
3895 size_t n_dont_close
= 0;
3896 int dont_close
[n_fds
+ 12];
3900 if (params
->stdin_fd
>= 0)
3901 dont_close
[n_dont_close
++] = params
->stdin_fd
;
3902 if (params
->stdout_fd
>= 0)
3903 dont_close
[n_dont_close
++] = params
->stdout_fd
;
3904 if (params
->stderr_fd
>= 0)
3905 dont_close
[n_dont_close
++] = params
->stderr_fd
;
3908 dont_close
[n_dont_close
++] = socket_fd
;
3910 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
3911 n_dont_close
+= n_fds
;
3915 append_socket_pair(dont_close
, &n_dont_close
, runtime
->netns_storage_socket
);
3916 append_socket_pair(dont_close
, &n_dont_close
, runtime
->ipcns_storage_socket
);
3921 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->user
->storage_socket
);
3923 append_socket_pair(dont_close
, &n_dont_close
, dcreds
->group
->storage_socket
);
3926 if (user_lookup_fd
>= 0)
3927 dont_close
[n_dont_close
++] = user_lookup_fd
;
3929 return close_all_fds(dont_close
, n_dont_close
);
3932 static int send_user_lookup(
3940 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3941 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3944 if (user_lookup_fd
< 0)
3947 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
3950 if (writev(user_lookup_fd
,
3952 IOVEC_INIT(&uid
, sizeof(uid
)),
3953 IOVEC_INIT(&gid
, sizeof(gid
)),
3954 IOVEC_INIT_STRING(unit
->id
) }, 3) < 0)
3960 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
3967 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3972 if (!c
->working_directory_home
)
3975 r
= get_home_dir(buf
);
/* Builds a string list of candidate directory paths for a context that has dynamic_user set
 * (this is asserted). For each ExecDirectoryType (EXEC_DIRECTORY_CONFIGURATION is special-cased)
 * and each item of c->directories[t], a path is composed from p->prefix[t] and the item's path,
 * with a "private" component in between when exec_directory_is_private(c, t) holds. Each path
 * is appended to the list with strv_consume(), and the finished list is handed to the caller
 * through *ret. */
3983 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
3984 _cleanup_strv_free_
char ** list
= NULL
;
3991 assert(c
->dynamic_user
);
3993 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3994 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3997 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
3998 if (t
== EXEC_DIRECTORY_CONFIGURATION
)
4004 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
4007 if (exec_directory_is_private(c
, t
))
4008 e
= path_join(p
->prefix
[t
], "private", c
->directories
[t
].items
[i
].path
);
4010 e
= path_join(p
->prefix
[t
], c
->directories
[t
].items
[i
].path
);
4014 r
= strv_consume(&list
, e
);
4020 *ret
= TAKE_PTR(list
);
/* Determines the cgroup path a process spawned with these parameters should be placed in.
 * using_subcgroup is computed by testing EXEC_CONTROL_CGROUP, EXEC_CGROUP_DELEGATE and
 * EXEC_IS_CONTROL together against params->flags. When it is true the path is
 * params->cgroup_path joined with ".control", otherwise a strdup() copy of params->cgroup_path.
 * The function returns using_subcgroup.
 * NOTE(review): presumably the resulting string is passed back through *ret — confirm. */
4025 static int exec_parameters_get_cgroup_path(const ExecParameters
*params
, char **ret
) {
4026 bool using_subcgroup
;
4032 if (!params
->cgroup_path
)
4035 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4036 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4037 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4038 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4039 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4040 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4041 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4042 * flag, which is only passed for the former statements, not for the latter. */
4044 using_subcgroup
= FLAGS_SET(params
->flags
, EXEC_CONTROL_CGROUP
|EXEC_CGROUP_DELEGATE
|EXEC_IS_CONTROL
);
4045 if (using_subcgroup
)
4046 p
= path_join(params
->cgroup_path
, ".control");
4048 p
= strdup(params
->cgroup_path
);
/* The return value tells the caller whether the ".control" subcgroup path was chosen. */
4053 return using_subcgroup
;
/* Derives a CPU set from the NUMA policy of the context: c->numa_policy is converted into the
 * local set s with numa_to_cpu_set(), and s is then added to *ret with cpu_set_add_all(), whose
 * result is returned. If c->numa_policy.nodes.set is unset, a debug message saying the NUMA
 * mask is not set is logged. The local set s is released by cpu_set_reset via _cleanup_. */
4056 static int exec_context_cpu_affinity_from_numa(const ExecContext
*c
, CPUSet
*ret
) {
4057 _cleanup_(cpu_set_reset
) CPUSet s
= {};
4063 if (!c
->numa_policy
.nodes
.set
) {
4064 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4068 r
= numa_to_cpu_set(&c
->numa_policy
, &s
);
4074 return cpu_set_add_all(ret
, &s
);
/* Accessor: returns the cpu_affinity_from_numa field of the execution context c. */
4077 bool exec_context_get_cpu_affinity_from_numa(const ExecContext
*c
) {
4080 return c
->cpu_affinity_from_numa
;
/* Stores fd in fds[] at index *n_fds and also in *ret_fd. fds_size is the capacity of fds[];
 * the function asserts that *n_fds is still below it. If fd is numerically below 3 + *n_fds,
 * it is first duplicated with fcntl(F_DUPFD_CLOEXEC, 3 + *n_fds), i.e. to a descriptor number
 * of at least 3 + *n_fds with close-on-exec set, and the duplicate replaces the original via
 * close_and_replace(). */
4083 static int add_shifted_fd(int *fds
, size_t fds_size
, size_t *n_fds
, int fd
, int *ret_fd
) {
4088 assert(*n_fds
< fds_size
);
4096 if (fd
< 3 + (int) *n_fds
) {
4097 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4098 * the fds we pass to the process (or which are closed only during execve). */
4100 r
= fcntl(fd
, F_DUPFD_CLOEXEC
, 3 + (int) *n_fds
);
4104 close_and_replace(fd
, r
);
4107 *ret_fd
= fds
[*n_fds
] = fd
;
/* Connects an AF_UNIX socket to the socket node referred to by ofd. The target address is the
 * /proc fd path of ofd (FORMAT_PROC_FD_PATH), set up with sockaddr_un_set_path(). The socket
 * types SOCK_DGRAM, SOCK_STREAM and SOCK_SEQPACKET are tried in this order, each with a fresh
 * SOCK_CLOEXEC socket. A connect() result of -EPROTOTYPE is treated specially; other failures
 * are logged against the unit with of->path and returned. If the loop ends without a
 * connection, a synthetic EPROTOTYPE error is logged and returned.
 * NOTE(review): presumably -EPROTOTYPE moves on to the next socket type and the connected fd
 * is returned on success — confirm. */
4112 static int connect_unix_harder(Unit
*u
, const OpenFile
*of
, int ofd
) {
4113 union sockaddr_union addr
= {
4114 .un
.sun_family
= AF_UNIX
,
4117 static const int socket_types
[] = { SOCK_DGRAM
, SOCK_STREAM
, SOCK_SEQPACKET
};
4124 r
= sockaddr_un_set_path(&addr
.un
, FORMAT_PROC_FD_PATH(ofd
));
4126 return log_unit_error_errno(u
, r
, "Failed to set sockaddr for %s: %m", of
->path
);
4130 for (size_t i
= 0; i
< ELEMENTSOF(socket_types
); i
++) {
4131 _cleanup_close_
int fd
= -EBADF
;
4133 fd
= socket(AF_UNIX
, socket_types
[i
] | SOCK_CLOEXEC
, 0);
4135 return log_unit_error_errno(u
, errno
, "Failed to create socket for %s: %m", of
->path
);
4137 r
= RET_NERRNO(connect(fd
, &addr
.sa
, sa_len
));
4138 if (r
== -EPROTOTYPE
)
4141 return log_unit_error_errno(u
, r
, "Failed to connect socket for %s: %m", of
->path
);
4146 return log_unit_error_errno(u
, SYNTHETIC_ERRNO(EPROTOTYPE
), "Failed to connect socket for \"%s\".", of
->path
);
/* Obtains a usable file descriptor for one OpenFile entry. of->path is first opened with
 * O_PATH|O_CLOEXEC (ofd) and examined with fstat(). If it is a socket, a connection is made
 * through connect_unix_harder() and, when OPENFILE_READ_ONLY is set, the sending direction is
 * shut down with shutdown(fd, SHUT_WR). Otherwise the open flags start as O_RDONLY or O_RDWR
 * depending on OPENFILE_READ_ONLY, OPENFILE_APPEND and OPENFILE_TRUNCATE are checked, and ofd
 * is reopened with fd_reopen(ofd, flags | O_CLOEXEC). Both outcomes are logged at debug level
 * with the resulting fd number.
 * NOTE(review): presumably the resulting fd is returned to the caller — confirm. */
4149 static int get_open_file_fd(Unit
*u
, const OpenFile
*of
) {
4151 _cleanup_close_
int fd
= -EBADF
, ofd
= -EBADF
;
4156 ofd
= open(of
->path
, O_PATH
| O_CLOEXEC
);
4158 return log_error_errno(errno
, "Could not open \"%s\": %m", of
->path
);
4159 if (fstat(ofd
, &st
) < 0)
4160 return log_error_errno(errno
, "Failed to stat %s: %m", of
->path
);
4162 if (S_ISSOCK(st
.st_mode
)) {
4163 fd
= connect_unix_harder(u
, of
, ofd
);
4167 if (FLAGS_SET(of
->flags
, OPENFILE_READ_ONLY
) && shutdown(fd
, SHUT_WR
) < 0)
4168 return log_error_errno(errno
, "Failed to shutdown send for socket %s: %m", of
->path
);
4170 log_unit_debug(u
, "socket %s opened (fd=%d)", of
->path
, fd
);
4172 int flags
= FLAGS_SET(of
->flags
, OPENFILE_READ_ONLY
) ? O_RDONLY
: O_RDWR
;
4173 if (FLAGS_SET(of
->flags
, OPENFILE_APPEND
))
4175 else if (FLAGS_SET(of
->flags
, OPENFILE_TRUNCATE
))
4178 fd
= fd_reopen(ofd
, flags
| O_CLOEXEC
);
4180 return log_unit_error_errno(u
, fd
, "Failed to open file %s: %m", of
->path
);
4182 log_unit_debug(u
, "file %s opened (fd=%d)", of
->path
, fd
);
/* Walks the open_files list and, for each entry, obtains an fd with get_open_file_fd(), grows
 * *fds by one slot with GREEDY_REALLOC(), appends of->fdname to fdnames with strv_extend() and
 * stores the fd at (*fds)[*n_fds], taking ownership away from the _cleanup_close_ local. For
 * entries flagged OPENFILE_GRACEFUL, a failure to get the fd is logged at debug level with an
 * "ignoring" message. */
4188 static int collect_open_file_fds(
4190 OpenFile
* open_files
,
4201 LIST_FOREACH(open_files
, of
, open_files
) {
4202 _cleanup_close_
int fd
= -EBADF
;
4204 fd
= get_open_file_fd(u
, of
);
4206 if (FLAGS_SET(of
->flags
, OPENFILE_GRACEFUL
)) {
4207 log_unit_debug_errno(u
, fd
, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of
->path
);
4214 if (!GREEDY_REALLOC(*fds
, *n_fds
+ 1))
4217 r
= strv_extend(fdnames
, of
->fdname
);
4221 (*fds
)[*n_fds
] = TAKE_FD(fd
);
4229 static int exec_child(
4231 const ExecCommand
*command
,
4232 const ExecContext
*context
,
4233 const ExecParameters
*params
,
4234 ExecRuntime
*runtime
,
4235 DynamicCreds
*dcreds
,
4237 const int named_iofds
[static 3],
4239 size_t n_socket_fds
,
4240 size_t n_storage_fds
,
4245 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **joined_exec_search_path
= NULL
, **accum_env
= NULL
, **replaced_argv
= NULL
;
4246 int r
, ngids
= 0, exec_fd
;
4247 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
4248 const char *username
= NULL
, *groupname
= NULL
;
4249 _cleanup_free_
char *home_buffer
= NULL
;
4250 const char *home
= NULL
, *shell
= NULL
;
4251 char **final_argv
= NULL
;
4252 dev_t journal_stream_dev
= 0;
4253 ino_t journal_stream_ino
= 0;
4254 bool userns_set_up
= false;
4255 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4256 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
4257 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
4258 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
4260 _cleanup_free_
char *mac_selinux_context_net
= NULL
;
4261 bool use_selinux
= false;
4264 bool use_smack
= false;
4267 bool use_apparmor
= false;
4269 uid_t saved_uid
= getuid();
4270 gid_t saved_gid
= getgid();
4271 uid_t uid
= UID_INVALID
;
4272 gid_t gid
= GID_INVALID
;
4273 size_t n_fds
= n_socket_fds
+ n_storage_fds
, /* fds to pass to the child */
4274 n_keep_fds
; /* total number of fds not to close */
4276 _cleanup_free_ gid_t
*gids_after_pam
= NULL
;
4277 int ngids_after_pam
= 0;
4278 _cleanup_free_
int *fds
= NULL
;
4279 _cleanup_strv_free_
char **fdnames
= NULL
;
4285 assert(exit_status
);
4287 /* Explicitly test for CVE-2021-4034 inspired invocations */
4288 assert(command
->path
);
4289 assert(!strv_isempty(command
->argv
));
4291 rename_process_from_path(command
->path
);
4293 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4294 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4295 * both of which will be demoted to SIG_DFL. */
4296 (void) default_signals(SIGNALS_CRASH_HANDLER
,
4299 if (context
->ignore_sigpipe
)
4300 (void) ignore_signals(SIGPIPE
);
4302 r
= reset_signal_mask();
4304 *exit_status
= EXIT_SIGNAL_MASK
;
4305 return log_unit_error_errno(unit
, r
, "Failed to set process signal mask: %m");
4308 if (params
->idle_pipe
)
4309 do_idle_pipe_dance(params
->idle_pipe
);
4311 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4312 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4313 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4314 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4317 log_set_open_when_needed(true);
4319 /* In case anything used libc syslog(), close this here, too */
4322 fds
= newdup(int, params_fds
, n_fds
);
4324 *exit_status
= EXIT_MEMORY
;
4328 fdnames
= strv_copy((char**) params
->fd_names
);
4330 *exit_status
= EXIT_MEMORY
;
4334 r
= collect_open_file_fds(unit
, params
->open_files
, &fds
, &fdnames
, &n_fds
);
4336 *exit_status
= EXIT_FDS
;
4337 return log_unit_error_errno(unit
, r
, "Failed to get OpenFile= file descriptors: %m");
4340 int keep_fds
[n_fds
+ 3];
4341 memcpy_safe(keep_fds
, fds
, n_fds
* sizeof(int));
4344 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, params
->exec_fd
, &exec_fd
);
4346 *exit_status
= EXIT_FDS
;
4347 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
4351 if (unit
->manager
->restrict_fs
) {
4352 int bpf_map_fd
= lsm_bpf_map_restrict_fs_fd(unit
);
4353 if (bpf_map_fd
< 0) {
4354 *exit_status
= EXIT_FDS
;
4355 return log_unit_error_errno(unit
, bpf_map_fd
, "Failed to get restrict filesystems BPF map fd: %m");
4358 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, bpf_map_fd
, &bpf_map_fd
);
4360 *exit_status
= EXIT_FDS
;
4361 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
4366 r
= close_remaining_fds(params
, runtime
, dcreds
, user_lookup_fd
, socket_fd
, keep_fds
, n_keep_fds
);
4368 *exit_status
= EXIT_FDS
;
4369 return log_unit_error_errno(unit
, r
, "Failed to close unwanted file descriptors: %m");
4372 if (!context
->same_pgrp
&&
4374 *exit_status
= EXIT_SETSID
;
4375 return log_unit_error_errno(unit
, errno
, "Failed to create new process session: %m");
4378 exec_context_tty_reset(context
, params
);
4380 if (unit_shall_confirm_spawn(unit
)) {
4381 _cleanup_free_
char *cmdline
= NULL
;
4383 cmdline
= quote_command_line(command
->argv
, SHELL_ESCAPE_EMPTY
);
4385 *exit_status
= EXIT_MEMORY
;
4389 r
= ask_for_confirmation(context
, params
->confirm_spawn
, unit
, cmdline
);
4390 if (r
!= CONFIRM_EXECUTE
) {
4391 if (r
== CONFIRM_PRETEND_SUCCESS
) {
4392 *exit_status
= EXIT_SUCCESS
;
4395 *exit_status
= EXIT_CONFIRM
;
4396 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ECANCELED
),
4397 "Execution cancelled by the user");
4401 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4402 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4403 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4404 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4405 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4406 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit
->id
, true) != 0 ||
4407 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit
->manager
) ? "system" : "user", true) != 0) {
4408 *exit_status
= EXIT_MEMORY
;
4409 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
4412 if (context
->dynamic_user
&& dcreds
) {
4413 _cleanup_strv_free_
char **suggested_paths
= NULL
;
4415 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4416 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4417 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4418 *exit_status
= EXIT_USER
;
4419 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
4422 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
4424 *exit_status
= EXIT_MEMORY
;
4428 r
= dynamic_creds_realize(dcreds
, suggested_paths
, &uid
, &gid
);
4430 *exit_status
= EXIT_USER
;
4432 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
4433 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4434 return log_unit_error_errno(unit
, r
, "Failed to update dynamic user credentials: %m");
4437 if (!uid_is_valid(uid
)) {
4438 *exit_status
= EXIT_USER
;
4439 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ESRCH
), "UID validation failed for \""UID_FMT
"\"", uid
);
4442 if (!gid_is_valid(gid
)) {
4443 *exit_status
= EXIT_USER
;
4444 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ESRCH
), "GID validation failed for \""GID_FMT
"\"", gid
);
4448 username
= dcreds
->user
->name
;
4451 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
4453 *exit_status
= EXIT_USER
;
4454 return log_unit_error_errno(unit
, r
, "Failed to determine user credentials: %m");
4457 r
= get_fixed_group(context
, &groupname
, &gid
);
4459 *exit_status
= EXIT_GROUP
;
4460 return log_unit_error_errno(unit
, r
, "Failed to determine group credentials: %m");
4464 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4465 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
4466 &supplementary_gids
, &ngids
);
4468 *exit_status
= EXIT_GROUP
;
4469 return log_unit_error_errno(unit
, r
, "Failed to determine supplementary groups: %m");
4472 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
4474 *exit_status
= EXIT_USER
;
4475 return log_unit_error_errno(unit
, r
, "Failed to send user credentials to PID1: %m");
4478 user_lookup_fd
= safe_close(user_lookup_fd
);
4480 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
4482 *exit_status
= EXIT_CHDIR
;
4483 return log_unit_error_errno(unit
, r
, "Failed to determine $HOME for user: %m");
4486 /* If a socket is connected to STDIN/STDOUT/STDERR, we
4487 * must be sure to drop O_NONBLOCK */
4489 (void) fd_nonblock(socket_fd
, false);
4491 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4492 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4493 if (params
->cgroup_path
) {
4494 _cleanup_free_
char *p
= NULL
;
4496 r
= exec_parameters_get_cgroup_path(params
, &p
);
4498 *exit_status
= EXIT_CGROUP
;
4499 return log_unit_error_errno(unit
, r
, "Failed to acquire cgroup path: %m");
4502 r
= cg_attach_everywhere(params
->cgroup_supported
, p
, 0, NULL
, NULL
);
4503 if (r
== -EUCLEAN
) {
4504 *exit_status
= EXIT_CGROUP
;
4505 return log_unit_error_errno(unit
, r
, "Failed to attach process to cgroup %s "
4506 "because the cgroup or one of its parents or "
4507 "siblings is in the threaded mode: %m", p
);
4510 *exit_status
= EXIT_CGROUP
;
4511 return log_unit_error_errno(unit
, r
, "Failed to attach to cgroup %s: %m", p
);
4515 if (context
->network_namespace_path
&& runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
4516 r
= open_shareable_ns_path(runtime
->netns_storage_socket
, context
->network_namespace_path
, CLONE_NEWNET
);
4518 *exit_status
= EXIT_NETWORK
;
4519 return log_unit_error_errno(unit
, r
, "Failed to open network namespace path %s: %m", context
->network_namespace_path
);
4523 if (context
->ipc_namespace_path
&& runtime
&& runtime
->ipcns_storage_socket
[0] >= 0) {
4524 r
= open_shareable_ns_path(runtime
->ipcns_storage_socket
, context
->ipc_namespace_path
, CLONE_NEWIPC
);
4526 *exit_status
= EXIT_NAMESPACE
;
4527 return log_unit_error_errno(unit
, r
, "Failed to open IPC namespace path %s: %m", context
->ipc_namespace_path
);
4531 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
4533 *exit_status
= EXIT_STDIN
;
4534 return log_unit_error_errno(unit
, r
, "Failed to set up standard input: %m");
4537 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
4539 *exit_status
= EXIT_STDOUT
;
4540 return log_unit_error_errno(unit
, r
, "Failed to set up standard output: %m");
4543 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
4545 *exit_status
= EXIT_STDERR
;
4546 return log_unit_error_errno(unit
, r
, "Failed to set up standard error output: %m");
4549 if (context
->oom_score_adjust_set
) {
4550 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4551 * prohibit write access to this file, and we shouldn't trip up over that. */
4552 r
= set_oom_score_adjust(context
->oom_score_adjust
);
4553 if (ERRNO_IS_PRIVILEGE(r
))
4554 log_unit_debug_errno(unit
, r
, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4556 *exit_status
= EXIT_OOM_ADJUST
;
4557 return log_unit_error_errno(unit
, r
, "Failed to adjust OOM setting: %m");
4561 if (context
->coredump_filter_set
) {
4562 r
= set_coredump_filter(context
->coredump_filter
);
4563 if (ERRNO_IS_PRIVILEGE(r
))
4564 log_unit_debug_errno(unit
, r
, "Failed to adjust coredump_filter, ignoring: %m");
4566 return log_unit_error_errno(unit
, r
, "Failed to adjust coredump_filter: %m");
4569 if (context
->nice_set
) {
4570 r
= setpriority_closest(context
->nice
);
4572 return log_unit_error_errno(unit
, r
, "Failed to set up process scheduling priority (nice level): %m");
4575 if (context
->cpu_sched_set
) {
4576 struct sched_param param
= {
4577 .sched_priority
= context
->cpu_sched_priority
,
4580 r
= sched_setscheduler(0,
4581 context
->cpu_sched_policy
|
4582 (context
->cpu_sched_reset_on_fork
?
4583 SCHED_RESET_ON_FORK
: 0),
4586 *exit_status
= EXIT_SETSCHEDULER
;
4587 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU scheduling: %m");
4591 if (context
->cpu_affinity_from_numa
|| context
->cpu_set
.set
) {
4592 _cleanup_(cpu_set_reset
) CPUSet converted_cpu_set
= {};
4593 const CPUSet
*cpu_set
;
4595 if (context
->cpu_affinity_from_numa
) {
4596 r
= exec_context_cpu_affinity_from_numa(context
, &converted_cpu_set
);
4598 *exit_status
= EXIT_CPUAFFINITY
;
4599 return log_unit_error_errno(unit
, r
, "Failed to derive CPU affinity mask from NUMA mask: %m");
4602 cpu_set
= &converted_cpu_set
;
4604 cpu_set
= &context
->cpu_set
;
4606 if (sched_setaffinity(0, cpu_set
->allocated
, cpu_set
->set
) < 0) {
4607 *exit_status
= EXIT_CPUAFFINITY
;
4608 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU affinity: %m");
4612 if (mpol_is_valid(numa_policy_get_type(&context
->numa_policy
))) {
4613 r
= apply_numa_policy(&context
->numa_policy
);
4614 if (r
== -EOPNOTSUPP
)
4615 log_unit_debug_errno(unit
, r
, "NUMA support not available, ignoring.");
4617 *exit_status
= EXIT_NUMA_POLICY
;
4618 return log_unit_error_errno(unit
, r
, "Failed to set NUMA memory policy: %m");
4622 if (context
->ioprio_set
)
4623 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
4624 *exit_status
= EXIT_IOPRIO
;
4625 return log_unit_error_errno(unit
, errno
, "Failed to set up IO scheduling priority: %m");
4628 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
4629 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
4630 *exit_status
= EXIT_TIMERSLACK
;
4631 return log_unit_error_errno(unit
, errno
, "Failed to set up timer slack: %m");
4634 if (context
->personality
!= PERSONALITY_INVALID
) {
4635 r
= safe_personality(context
->personality
);
4637 *exit_status
= EXIT_PERSONALITY
;
4638 return log_unit_error_errno(unit
, r
, "Failed to set up execution domain (personality): %m");
4642 if (context
->utmp_id
) {
4643 const char *line
= context
->tty_path
?
4644 (path_startswith(context
->tty_path
, "/dev/") ?: context
->tty_path
) :
4646 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
4648 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
4649 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
4654 if (uid_is_valid(uid
)) {
4655 r
= chown_terminal(STDIN_FILENO
, uid
);
4657 *exit_status
= EXIT_STDIN
;
4658 return log_unit_error_errno(unit
, r
, "Failed to change ownership of terminal: %m");
4662 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4663 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4664 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4665 * touch a single hierarchy too. */
4666 if (params
->cgroup_path
&& context
->user
&& (params
->flags
& EXEC_CGROUP_DELEGATE
)) {
4667 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, uid
, gid
);
4669 *exit_status
= EXIT_CGROUP
;
4670 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
4674 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
4676 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
4677 r
= setup_exec_directory(context
, params
, uid
, gid
, dt
, needs_mount_namespace
, exit_status
);
4679 return log_unit_error_errno(unit
, r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
4682 if (FLAGS_SET(params
->flags
, EXEC_WRITE_CREDENTIALS
)) {
4683 r
= setup_credentials(context
, params
, unit
->id
, uid
);
4685 *exit_status
= EXIT_CREDENTIALS
;
4686 return log_unit_error_errno(unit
, r
, "Failed to set up credentials: %m");
4690 r
= build_environment(
4703 *exit_status
= EXIT_MEMORY
;
4707 r
= build_pass_environment(context
, &pass_env
);
4709 *exit_status
= EXIT_MEMORY
;
4713 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4714 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4715 * not specify PATH but the unit has ExecSearchPath. */
4716 if (!strv_isempty(context
->exec_search_path
)) {
4717 _cleanup_free_
char *joined
= NULL
;
4719 joined
= strv_join(context
->exec_search_path
, ":");
4721 *exit_status
= EXIT_MEMORY
;
4725 r
= strv_env_assign(&joined_exec_search_path
, "PATH", joined
);
4727 *exit_status
= EXIT_MEMORY
;
4732 accum_env
= strv_env_merge(params
->environment
,
4734 joined_exec_search_path
,
4736 context
->environment
,
4739 *exit_status
= EXIT_MEMORY
;
4742 accum_env
= strv_env_clean(accum_env
);
4744 (void) umask(context
->umask
);
4746 r
= setup_keyring(unit
, context
, params
, uid
, gid
);
4748 *exit_status
= EXIT_KEYRING
;
4749 return log_unit_error_errno(unit
, r
, "Failed to set up kernel keyring: %m");
4752 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4754 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
4756 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4757 * for it, and the kernel doesn't actually support ambient caps. */
4758 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
4760 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4761 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4763 if (needs_ambient_hack
)
4764 needs_setuid
= false;
4766 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
4768 if (needs_sandboxing
) {
4769 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4770 * /sys being present. The actual MAC context application will happen later, as late as
4771 * possible, to avoid impacting our own code paths. */
4774 use_selinux
= mac_selinux_use();
4777 use_smack
= mac_smack_use();
4780 use_apparmor
= mac_apparmor_use();
4784 if (needs_sandboxing
) {
4787 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4788 * is set here. (See below.) */
4790 r
= setrlimit_closest_all((const struct rlimit
* const *) context
->rlimit
, &which_failed
);
4792 *exit_status
= EXIT_LIMITS
;
4793 return log_unit_error_errno(unit
, r
, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed
));
4797 if (needs_setuid
&& context
->pam_name
&& username
) {
4798 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4799 * wins here. (See above.) */
4801 /* All fds passed in the fds array will be closed in the pam child process. */
4802 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
4804 *exit_status
= EXIT_PAM
;
4805 return log_unit_error_errno(unit
, r
, "Failed to set up PAM session: %m");
4808 ngids_after_pam
= getgroups_alloc(&gids_after_pam
);
4809 if (ngids_after_pam
< 0) {
4810 *exit_status
= EXIT_MEMORY
;
4811 return log_unit_error_errno(unit
, ngids_after_pam
, "Failed to obtain groups after setting up PAM: %m");
4815 if (needs_sandboxing
&& context
->private_users
&& have_effective_cap(CAP_SYS_ADMIN
) <= 0) {
4816 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4817 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4818 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4820 userns_set_up
= true;
4821 r
= setup_private_users(saved_uid
, saved_gid
, uid
, gid
);
4823 *exit_status
= EXIT_USER
;
4824 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing for unprivileged user: %m");
4828 if ((context
->private_network
|| context
->network_namespace_path
) && runtime
&& runtime
->netns_storage_socket
[0] >= 0) {
4830 if (ns_type_supported(NAMESPACE_NET
)) {
4831 r
= setup_shareable_ns(runtime
->netns_storage_socket
, CLONE_NEWNET
);
4833 log_unit_warning_errno(unit
, r
,
4834 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4836 *exit_status
= EXIT_NETWORK
;
4837 return log_unit_error_errno(unit
, r
, "Failed to set up network namespacing: %m");
4839 } else if (context
->network_namespace_path
) {
4840 *exit_status
= EXIT_NETWORK
;
4841 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
4842 "NetworkNamespacePath= is not supported, refusing.");
4844 log_unit_warning(unit
, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4847 if ((context
->private_ipc
|| context
->ipc_namespace_path
) && runtime
&& runtime
->ipcns_storage_socket
[0] >= 0) {
4849 if (ns_type_supported(NAMESPACE_IPC
)) {
4850 r
= setup_shareable_ns(runtime
->ipcns_storage_socket
, CLONE_NEWIPC
);
4852 log_unit_warning_errno(unit
, r
,
4853 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4855 *exit_status
= EXIT_NAMESPACE
;
4856 return log_unit_error_errno(unit
, r
, "Failed to set up IPC namespacing: %m");
4858 } else if (context
->ipc_namespace_path
) {
4859 *exit_status
= EXIT_NAMESPACE
;
4860 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
4861 "IPCNamespacePath= is not supported, refusing.");
4863 log_unit_warning(unit
, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4866 if (needs_mount_namespace
) {
4867 _cleanup_free_
char *error_path
= NULL
;
4869 r
= apply_mount_namespace(unit
, command
->flags
, context
, params
, runtime
, &error_path
);
4871 *exit_status
= EXIT_NAMESPACE
;
4872 return log_unit_error_errno(unit
, r
, "Failed to set up mount namespacing%s%s: %m",
4873 error_path
? ": " : "", strempty(error_path
));
4877 if (needs_sandboxing
) {
4878 r
= apply_protect_hostname(unit
, context
, exit_status
);
4883 /* Drop groups as early as possible.
4884 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4885 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4887 _cleanup_free_ gid_t
*gids_to_enforce
= NULL
;
4888 int ngids_to_enforce
= 0;
4890 ngids_to_enforce
= merge_gid_lists(supplementary_gids
,
4895 if (ngids_to_enforce
< 0) {
4896 *exit_status
= EXIT_MEMORY
;
4897 return log_unit_error_errno(unit
,
4899 "Failed to merge group lists. Group membership might be incorrect: %m");
4902 r
= enforce_groups(gid
, gids_to_enforce
, ngids_to_enforce
);
4904 *exit_status
= EXIT_GROUP
;
4905 return log_unit_error_errno(unit
, r
, "Changing group credentials failed: %m");
4909 /* If the user namespace was not set up above, try to do it now.
4910 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4911 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4912 * case of mount namespaces being less privileged when the mount point list is copied from a
4913 * different user namespace). */
4915 if (needs_sandboxing
&& context
->private_users
&& !userns_set_up
) {
4916 r
= setup_private_users(saved_uid
, saved_gid
, uid
, gid
);
4918 *exit_status
= EXIT_USER
;
4919 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing: %m");
4923 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4926 _cleanup_free_
char *executable
= NULL
;
4927 _cleanup_close_
int executable_fd
= -EBADF
;
4928 r
= find_executable_full(command
->path
, /* root= */ NULL
, context
->exec_search_path
, false, &executable
, &executable_fd
);
4930 if (r
!= -ENOMEM
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
)) {
4931 log_unit_struct_errno(unit
, LOG_INFO
, r
,
4932 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
4933 LOG_UNIT_INVOCATION_ID(unit
),
4934 LOG_UNIT_MESSAGE(unit
, "Executable %s missing, skipping: %m",
4936 "EXECUTABLE=%s", command
->path
);
4940 *exit_status
= EXIT_EXEC
;
4942 return log_unit_struct_errno(unit
, LOG_INFO
, r
,
4943 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
4944 LOG_UNIT_INVOCATION_ID(unit
),
4945 LOG_UNIT_MESSAGE(unit
, "Failed to locate executable %s: %m",
4947 "EXECUTABLE=%s", command
->path
);
4950 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, executable_fd
, &executable_fd
);
4952 *exit_status
= EXIT_FDS
;
4953 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
4957 if (needs_sandboxing
&& use_selinux
&& params
->selinux_context_net
) {
4962 else if (params
->n_socket_fds
== 1)
4963 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4964 * use context from that fd to compute the label. */
4965 fd
= params
->fds
[0];
4968 r
= mac_selinux_get_child_mls_label(fd
, executable
, context
->selinux_context
, &mac_selinux_context_net
);
4970 if (!context
->selinux_context_ignore
) {
4971 *exit_status
= EXIT_SELINUX_CONTEXT
;
4972 return log_unit_error_errno(unit
, r
, "Failed to determine SELinux context: %m");
4974 log_unit_debug_errno(unit
, r
, "Failed to determine SELinux context, ignoring: %m");
4980 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4981 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
4982 * however if we have it as we want to keep it open until the final execve(). */
4984 r
= close_all_fds(keep_fds
, n_keep_fds
);
4986 r
= shift_fds(fds
, n_fds
);
4988 r
= flags_fds(fds
, n_socket_fds
, n_fds
, context
->non_blocking
);
4990 *exit_status
= EXIT_FDS
;
4991 return log_unit_error_errno(unit
, r
, "Failed to adjust passed file descriptors: %m");
4994 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4995 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4996 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4999 secure_bits
= context
->secure_bits
;
5001 if (needs_sandboxing
) {
5004 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
5005 * requested. (Note this is placed after the general resource limit initialization, see
5006 * above, in order to take precedence.) */
5007 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
5008 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
5009 *exit_status
= EXIT_LIMITS
;
5010 return log_unit_error_errno(unit
, errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
5015 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5016 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5018 r
= setup_smack(unit
->manager
, context
, executable_fd
);
5019 if (r
< 0 && !context
->smack_process_label_ignore
) {
5020 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
5021 return log_unit_error_errno(unit
, r
, "Failed to set SMACK process label: %m");
5026 bset
= context
->capability_bounding_set
;
5027 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
5028 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
5029 * instead of us doing that */
5030 if (needs_ambient_hack
)
5031 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
5032 (UINT64_C(1) << CAP_SETUID
) |
5033 (UINT64_C(1) << CAP_SETGID
);
5035 if (!cap_test_all(bset
)) {
5036 r
= capability_bounding_set_drop(bset
, false);
5038 *exit_status
= EXIT_CAPABILITIES
;
5039 return log_unit_error_errno(unit
, r
, "Failed to drop capabilities: %m");
5043 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5045 * To be able to raise the ambient capabilities after setresuid() they have to be
5046 * added to the inherited set and keep caps has to be set (done in enforce_user()).
5047 * After setresuid() the ambient capabilities can be raised as they are present in
5048 * the permitted and inheritable set. However it is possible that someone wants to
5049 * set ambient capabilities without changing the user, so we also set the ambient
5050 * capabilities here.
5051 * The requested ambient capabilities are raised in the inheritable set if the
5052 * second argument is true. */
5053 if (!needs_ambient_hack
) {
5054 r
= capability_ambient_set_apply(context
->capability_ambient_set
, true);
5056 *exit_status
= EXIT_CAPABILITIES
;
5057 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (before UID change): %m");
5062 /* chroot to root directory first, before we lose the ability to chroot */
5063 r
= apply_root_directory(context
, params
, needs_mount_namespace
, exit_status
);
5065 return log_unit_error_errno(unit
, r
, "Chrooting to the requested root directory failed: %m");
5068 if (uid_is_valid(uid
)) {
5069 r
= enforce_user(context
, uid
);
5071 *exit_status
= EXIT_USER
;
5072 return log_unit_error_errno(unit
, r
, "Failed to change UID to " UID_FMT
": %m", uid
);
5075 if (!needs_ambient_hack
&&
5076 context
->capability_ambient_set
!= 0) {
5078 /* Raise the ambient capabilities after user change. */
5079 r
= capability_ambient_set_apply(context
->capability_ambient_set
, false);
5081 *exit_status
= EXIT_CAPABILITIES
;
5082 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (after UID change): %m");
5088 /* Apply working directory here, because the working directory might be on NFS and only the user running
5089 * this service might have the correct privilege to change to the working directory */
5090 r
= apply_working_directory(context
, params
, home
, exit_status
);
5092 return log_unit_error_errno(unit
, r
, "Changing to the requested working directory failed: %m");
5094 if (needs_sandboxing
) {
5095 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5096 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5097 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5098 * are restricted. */
5102 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
5105 r
= setexeccon(exec_context
);
5107 if (!context
->selinux_context_ignore
) {
5108 *exit_status
= EXIT_SELINUX_CONTEXT
;
5109 return log_unit_error_errno(unit
, r
, "Failed to change SELinux context to %s: %m", exec_context
);
5111 log_unit_debug_errno(unit
, r
, "Failed to change SELinux context to %s, ignoring: %m", exec_context
);
5118 if (use_apparmor
&& context
->apparmor_profile
) {
5119 r
= aa_change_onexec(context
->apparmor_profile
);
5120 if (r
< 0 && !context
->apparmor_profile_ignore
) {
5121 *exit_status
= EXIT_APPARMOR_PROFILE
;
5122 return log_unit_error_errno(unit
, errno
, "Failed to prepare AppArmor profile change to %s: %m", context
->apparmor_profile
);
5127 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
5128 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
5130 if (prctl(PR_GET_SECUREBITS
) != secure_bits
) {
5131 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5132 * effective set here.
5133 * The effective set is overwritten during execve with the following values:
5134 * - ambient set (for non-root processes)
5135 * - (inheritable | bounding) set (for root processes)
5137 * Hence there is no security impact to raise it in the effective set before execve
5139 r
= capability_gain_cap_setpcap(NULL
);
5141 *exit_status
= EXIT_CAPABILITIES
;
5142 return log_unit_error_errno(unit
, r
, "Failed to gain CAP_SETPCAP for setting secure bits");
5144 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
5145 *exit_status
= EXIT_SECUREBITS
;
5146 return log_unit_error_errno(unit
, errno
, "Failed to set process secure bits: %m");
5150 if (context_has_no_new_privileges(context
))
5151 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
5152 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
5153 return log_unit_error_errno(unit
, errno
, "Failed to disable new privileges: %m");
5157 r
= apply_address_families(unit
, context
);
5159 *exit_status
= EXIT_ADDRESS_FAMILIES
;
5160 return log_unit_error_errno(unit
, r
, "Failed to restrict address families: %m");
5163 r
= apply_memory_deny_write_execute(unit
, context
);
5165 *exit_status
= EXIT_SECCOMP
;
5166 return log_unit_error_errno(unit
, r
, "Failed to disable writing to executable memory: %m");
5169 r
= apply_restrict_realtime(unit
, context
);
5171 *exit_status
= EXIT_SECCOMP
;
5172 return log_unit_error_errno(unit
, r
, "Failed to apply realtime restrictions: %m");
5175 r
= apply_restrict_suid_sgid(unit
, context
);
5177 *exit_status
= EXIT_SECCOMP
;
5178 return log_unit_error_errno(unit
, r
, "Failed to apply SUID/SGID restrictions: %m");
5181 r
= apply_restrict_namespaces(unit
, context
);
5183 *exit_status
= EXIT_SECCOMP
;
5184 return log_unit_error_errno(unit
, r
, "Failed to apply namespace restrictions: %m");
5187 r
= apply_protect_sysctl(unit
, context
);
5189 *exit_status
= EXIT_SECCOMP
;
5190 return log_unit_error_errno(unit
, r
, "Failed to apply sysctl restrictions: %m");
5193 r
= apply_protect_kernel_modules(unit
, context
);
5195 *exit_status
= EXIT_SECCOMP
;
5196 return log_unit_error_errno(unit
, r
, "Failed to apply module loading restrictions: %m");
5199 r
= apply_protect_kernel_logs(unit
, context
);
5201 *exit_status
= EXIT_SECCOMP
;
5202 return log_unit_error_errno(unit
, r
, "Failed to apply kernel log restrictions: %m");
5205 r
= apply_protect_clock(unit
, context
);
5207 *exit_status
= EXIT_SECCOMP
;
5208 return log_unit_error_errno(unit
, r
, "Failed to apply clock restrictions: %m");
5211 r
= apply_private_devices(unit
, context
);
5213 *exit_status
= EXIT_SECCOMP
;
5214 return log_unit_error_errno(unit
, r
, "Failed to set up private devices: %m");
5217 r
= apply_syscall_archs(unit
, context
);
5219 *exit_status
= EXIT_SECCOMP
;
5220 return log_unit_error_errno(unit
, r
, "Failed to apply syscall architecture restrictions: %m");
5223 r
= apply_lock_personality(unit
, context
);
5225 *exit_status
= EXIT_SECCOMP
;
5226 return log_unit_error_errno(unit
, r
, "Failed to lock personalities: %m");
5229 r
= apply_syscall_log(unit
, context
);
5231 *exit_status
= EXIT_SECCOMP
;
5232 return log_unit_error_errno(unit
, r
, "Failed to apply system call log filters: %m");
5235 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5236 * by the filter as little as possible. */
5237 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
5239 *exit_status
= EXIT_SECCOMP
;
5240 return log_unit_error_errno(unit
, r
, "Failed to apply system call filters: %m");
5245 r
= apply_restrict_filesystems(unit
, context
);
5247 *exit_status
= EXIT_BPF
;
5248 return log_unit_error_errno(unit
, r
, "Failed to restrict filesystems: %m");
5254 if (!strv_isempty(context
->unset_environment
)) {
5257 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
5259 *exit_status
= EXIT_MEMORY
;
5263 strv_free_and_replace(accum_env
, ee
);
5266 if (!FLAGS_SET(command
->flags
, EXEC_COMMAND_NO_ENV_EXPAND
)) {
5267 replaced_argv
= replace_env_argv(command
->argv
, accum_env
);
5268 if (!replaced_argv
) {
5269 *exit_status
= EXIT_MEMORY
;
5272 final_argv
= replaced_argv
;
5274 final_argv
= command
->argv
;
5276 if (DEBUG_LOGGING
) {
5277 _cleanup_free_
char *line
= NULL
;
5279 line
= quote_command_line(final_argv
, SHELL_ESCAPE_EMPTY
);
5281 *exit_status
= EXIT_MEMORY
;
5285 log_unit_struct(unit
, LOG_DEBUG
,
5286 "EXECUTABLE=%s", executable
,
5287 LOG_UNIT_MESSAGE(unit
, "Executing: %s", line
));
5293 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5294 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5296 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
5297 *exit_status
= EXIT_EXEC
;
5298 return log_unit_error_errno(unit
, errno
, "Failed to enable exec_fd: %m");
5302 r
= fexecve_or_execve(executable_fd
, executable
, final_argv
, accum_env
);
5307 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5308 * that POLLHUP on it no longer means execve() succeeded. */
5310 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
5311 *exit_status
= EXIT_EXEC
;
5312 return log_unit_error_errno(unit
, errno
, "Failed to disable exec_fd: %m");
5316 *exit_status
= EXIT_EXEC
;
5317 return log_unit_error_errno(unit
, r
, "Failed to execute %s: %m", executable
);
5320 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***l
);
5321 static int exec_context_named_iofds(const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[static 3]);
5323 int exec_spawn(Unit
*unit
,
5324 ExecCommand
*command
,
5325 const ExecContext
*context
,
5326 const ExecParameters
*params
,
5327 ExecRuntime
*runtime
,
5328 DynamicCreds
*dcreds
,
5331 int socket_fd
, r
, named_iofds
[3] = { -1, -1, -1 }, *fds
= NULL
;
5332 _cleanup_free_
char *subcgroup_path
= NULL
;
5333 _cleanup_strv_free_
char **files_env
= NULL
;
5334 size_t n_storage_fds
= 0, n_socket_fds
= 0;
5335 _cleanup_free_
char *line
= NULL
;
5343 assert(params
->fds
|| (params
->n_socket_fds
+ params
->n_storage_fds
<= 0));
5345 if (context
->std_input
== EXEC_INPUT_SOCKET
||
5346 context
->std_output
== EXEC_OUTPUT_SOCKET
||
5347 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
5349 if (params
->n_socket_fds
> 1)
5350 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EINVAL
), "Got more than one socket.");
5352 if (params
->n_socket_fds
== 0)
5353 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EINVAL
), "Got no socket.");
5355 socket_fd
= params
->fds
[0];
5359 n_socket_fds
= params
->n_socket_fds
;
5360 n_storage_fds
= params
->n_storage_fds
;
5363 r
= exec_context_named_iofds(context
, params
, named_iofds
);
5365 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
5367 r
= exec_context_load_environment(unit
, context
, &files_env
);
5369 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
5371 line
= quote_command_line(command
->argv
, SHELL_ESCAPE_EMPTY
);
5375 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5376 and, until the next SELinux policy changes, we save further reloads in future children. */
5377 mac_selinux_maybe_reload();
5379 log_unit_struct(unit
, LOG_DEBUG
,
5380 LOG_UNIT_MESSAGE(unit
, "About to execute %s", line
),
5381 "EXECUTABLE=%s", command
->path
, /* We won't know the real executable path until we create
5382 the mount namespace in the child, but we want to log
5383 from the parent, so we need to use the (possibly
5384 inaccurate) path here. */
5385 LOG_UNIT_INVOCATION_ID(unit
));
5387 if (params
->cgroup_path
) {
5388 r
= exec_parameters_get_cgroup_path(params
, &subcgroup_path
);
5390 return log_unit_error_errno(unit
, r
, "Failed to acquire subcgroup path: %m");
5391 if (r
> 0) { /* We are using a child cgroup */
5392 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
);
5394 return log_unit_error_errno(unit
, r
, "Failed to create control group '%s': %m", subcgroup_path
);
5396 /* Normally we would not propagate the xattrs to children but since we created this
5397 * sub-cgroup internally we should do it. */
5398 cgroup_oomd_xattr_apply(unit
, subcgroup_path
);
5399 cgroup_log_xattr_apply(unit
, subcgroup_path
);
5405 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
5408 int exit_status
= EXIT_SUCCESS
;
5410 r
= exec_child(unit
,
5422 unit
->manager
->user_lookup_fds
[1],
5426 const char *status
=
5427 exit_status_to_string(exit_status
,
5428 EXIT_STATUS_LIBC
| EXIT_STATUS_SYSTEMD
);
5430 log_unit_struct_errno(unit
, LOG_ERR
, r
,
5431 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
5432 LOG_UNIT_INVOCATION_ID(unit
),
5433 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
5434 status
, command
->path
),
5435 "EXECUTABLE=%s", command
->path
);
5441 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
5443 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5444 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5445 * process will be killed too). */
5447 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
, pid
);
5449 exec_status_start(&command
->exec_status
, pid
);
5455 void exec_context_init(ExecContext
*c
) {
5459 c
->ioprio
= IOPRIO_DEFAULT_CLASS_AND_PRIO
;
5460 c
->cpu_sched_policy
= SCHED_OTHER
;
5461 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
5462 c
->syslog_level_prefix
= true;
5463 c
->ignore_sigpipe
= true;
5464 c
->timer_slack_nsec
= NSEC_INFINITY
;
5465 c
->personality
= PERSONALITY_INVALID
;
5466 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
5467 c
->directories
[t
].mode
= 0755;
5468 c
->timeout_clean_usec
= USEC_INFINITY
;
5469 c
->capability_bounding_set
= CAP_ALL
;
5470 assert_cc(NAMESPACE_FLAGS_INITIAL
!= NAMESPACE_FLAGS_ALL
);
5471 c
->restrict_namespaces
= NAMESPACE_FLAGS_INITIAL
;
5472 c
->log_level_max
= -1;
5474 c
->syscall_errno
= SECCOMP_ERROR_NUMBER_KILL
;
5476 c
->tty_rows
= UINT_MAX
;
5477 c
->tty_cols
= UINT_MAX
;
5478 numa_policy_reset(&c
->numa_policy
);
5481 void exec_context_done(ExecContext
*c
) {
5484 c
->environment
= strv_free(c
->environment
);
5485 c
->environment_files
= strv_free(c
->environment_files
);
5486 c
->pass_environment
= strv_free(c
->pass_environment
);
5487 c
->unset_environment
= strv_free(c
->unset_environment
);
5489 rlimit_free_all(c
->rlimit
);
5491 for (size_t l
= 0; l
< 3; l
++) {
5492 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
5493 c
->stdio_file
[l
] = mfree(c
->stdio_file
[l
]);
5496 c
->working_directory
= mfree(c
->working_directory
);
5497 c
->root_directory
= mfree(c
->root_directory
);
5498 c
->root_image
= mfree(c
->root_image
);
5499 c
->root_image_options
= mount_options_free_all(c
->root_image_options
);
5500 c
->root_hash
= mfree(c
->root_hash
);
5501 c
->root_hash_size
= 0;
5502 c
->root_hash_path
= mfree(c
->root_hash_path
);
5503 c
->root_hash_sig
= mfree(c
->root_hash_sig
);
5504 c
->root_hash_sig_size
= 0;
5505 c
->root_hash_sig_path
= mfree(c
->root_hash_sig_path
);
5506 c
->root_verity
= mfree(c
->root_verity
);
5507 c
->extension_images
= mount_image_free_many(c
->extension_images
, &c
->n_extension_images
);
5508 c
->extension_directories
= strv_free(c
->extension_directories
);
5509 c
->tty_path
= mfree(c
->tty_path
);
5510 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
5511 c
->user
= mfree(c
->user
);
5512 c
->group
= mfree(c
->group
);
5514 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
5516 c
->pam_name
= mfree(c
->pam_name
);
5518 c
->read_only_paths
= strv_free(c
->read_only_paths
);
5519 c
->read_write_paths
= strv_free(c
->read_write_paths
);
5520 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
5521 c
->exec_paths
= strv_free(c
->exec_paths
);
5522 c
->no_exec_paths
= strv_free(c
->no_exec_paths
);
5523 c
->exec_search_path
= strv_free(c
->exec_search_path
);
5525 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
5526 c
->bind_mounts
= NULL
;
5527 c
->n_bind_mounts
= 0;
5528 temporary_filesystem_free_many(c
->temporary_filesystems
, c
->n_temporary_filesystems
);
5529 c
->temporary_filesystems
= NULL
;
5530 c
->n_temporary_filesystems
= 0;
5531 c
->mount_images
= mount_image_free_many(c
->mount_images
, &c
->n_mount_images
);
5533 cpu_set_reset(&c
->cpu_set
);
5534 numa_policy_reset(&c
->numa_policy
);
5536 c
->utmp_id
= mfree(c
->utmp_id
);
5537 c
->selinux_context
= mfree(c
->selinux_context
);
5538 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
5539 c
->smack_process_label
= mfree(c
->smack_process_label
);
5541 c
->restrict_filesystems
= set_free(c
->restrict_filesystems
);
5543 c
->syscall_filter
= hashmap_free(c
->syscall_filter
);
5544 c
->syscall_archs
= set_free(c
->syscall_archs
);
5545 c
->address_families
= set_free(c
->address_families
);
5547 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
5548 exec_directory_done(&c
->directories
[t
]);
5550 c
->log_level_max
= -1;
5552 exec_context_free_log_extra_fields(c
);
5553 c
->log_filter_allowed_patterns
= set_free(c
->log_filter_allowed_patterns
);
5554 c
->log_filter_denied_patterns
= set_free(c
->log_filter_denied_patterns
);
5556 c
->log_ratelimit_interval_usec
= 0;
5557 c
->log_ratelimit_burst
= 0;
5559 c
->stdin_data
= mfree(c
->stdin_data
);
5560 c
->stdin_data_size
= 0;
5562 c
->network_namespace_path
= mfree(c
->network_namespace_path
);
5563 c
->ipc_namespace_path
= mfree(c
->ipc_namespace_path
);
5565 c
->log_namespace
= mfree(c
->log_namespace
);
5567 c
->load_credentials
= hashmap_free(c
->load_credentials
);
5568 c
->set_credentials
= hashmap_free(c
->set_credentials
);
5571 int exec_context_destroy_runtime_directory(const ExecContext
*c
, const char *runtime_prefix
) {
5574 if (!runtime_prefix
)
5577 for (size_t i
= 0; i
< c
->directories
[EXEC_DIRECTORY_RUNTIME
].n_items
; i
++) {
5578 _cleanup_free_
char *p
= NULL
;
5580 if (exec_directory_is_private(c
, EXEC_DIRECTORY_RUNTIME
))
5581 p
= path_join(runtime_prefix
, "private", c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].path
);
5583 p
= path_join(runtime_prefix
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].path
);
5587 /* We execute this synchronously, since we need to be sure this is gone when we start the
5589 (void) rm_rf(p
, REMOVE_ROOT
);
5591 STRV_FOREACH(symlink
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].symlinks
) {
5592 _cleanup_free_
char *symlink_abs
= NULL
;
5594 if (exec_directory_is_private(c
, EXEC_DIRECTORY_RUNTIME
))
5595 symlink_abs
= path_join(runtime_prefix
, "private", *symlink
);
5597 symlink_abs
= path_join(runtime_prefix
, *symlink
);
5601 (void) unlink(symlink_abs
);
5609 int exec_context_destroy_credentials(const ExecContext
*c
, const char *runtime_prefix
, const char *unit
) {
5610 _cleanup_free_
char *p
= NULL
;
5614 if (!runtime_prefix
|| !unit
)
5617 p
= path_join(runtime_prefix
, "credentials", unit
);
5621 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5622 * unmount it, and afterwards remove the mount point */
5623 (void) umount2(p
, MNT_DETACH
|UMOUNT_NOFOLLOW
);
5624 (void) rm_rf(p
, REMOVE_ROOT
|REMOVE_CHMOD
);
5629 static void exec_command_done(ExecCommand
*c
) {
5632 c
->path
= mfree(c
->path
);
5633 c
->argv
= strv_free(c
->argv
);
5636 void exec_command_done_array(ExecCommand
*c
, size_t n
) {
5637 for (size_t i
= 0; i
< n
; i
++)
5638 exec_command_done(c
+i
);
5641 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
5645 LIST_REMOVE(command
, c
, i
);
5646 exec_command_done(i
);
5653 void exec_command_free_array(ExecCommand
**c
, size_t n
) {
5654 for (size_t i
= 0; i
< n
; i
++)
5655 c
[i
] = exec_command_free_list(c
[i
]);
5658 void exec_command_reset_status_array(ExecCommand
*c
, size_t n
) {
5659 for (size_t i
= 0; i
< n
; i
++)
5660 exec_status_reset(&c
[i
].exec_status
);
5663 void exec_command_reset_status_list_array(ExecCommand
**c
, size_t n
) {
5664 for (size_t i
= 0; i
< n
; i
++)
5665 LIST_FOREACH(command
, z
, c
[i
])
5666 exec_status_reset(&z
->exec_status
);
5669 typedef struct InvalidEnvInfo
{
5674 static void invalid_env(const char *p
, void *userdata
) {
5675 InvalidEnvInfo
*info
= userdata
;
5677 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
5680 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
5686 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
5689 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
5692 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
5695 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
5698 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
5701 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
5708 static int exec_context_named_iofds(
5709 const ExecContext
*c
,
5710 const ExecParameters
*p
,
5711 int named_iofds
[static 3]) {
5714 const char* stdio_fdname
[3];
5719 assert(named_iofds
);
5721 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
5722 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
5723 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
5725 for (size_t i
= 0; i
< 3; i
++)
5726 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
5728 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
5730 for (size_t i
= 0; i
< n_fds
&& targets
> 0; i
++)
5731 if (named_iofds
[STDIN_FILENO
] < 0 &&
5732 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
5733 stdio_fdname
[STDIN_FILENO
] &&
5734 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
5736 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
5739 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
5740 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
5741 stdio_fdname
[STDOUT_FILENO
] &&
5742 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
5744 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
5747 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
5748 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
5749 stdio_fdname
[STDERR_FILENO
] &&
5750 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
5752 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
5756 return targets
== 0 ? 0 : -ENOENT
;
5759 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***ret
) {
5760 _cleanup_strv_free_
char **v
= NULL
;
5766 STRV_FOREACH(i
, c
->environment_files
) {
5767 _cleanup_globfree_ glob_t pglob
= {};
5768 bool ignore
= false;
5776 if (!path_is_absolute(fn
)) {
5782 /* Filename supports globbing, take all matching files */
5783 r
= safe_glob(fn
, 0, &pglob
);
5790 /* When we don't match anything, -ENOENT should be returned */
5791 assert(pglob
.gl_pathc
> 0);
5793 for (unsigned n
= 0; n
< pglob
.gl_pathc
; n
++) {
5794 _cleanup_strv_free_
char **p
= NULL
;
5796 r
= load_env_file(NULL
, pglob
.gl_pathv
[n
], &p
);
5803 /* Log invalid environment variables with filename */
5805 InvalidEnvInfo info
= {
5807 .path
= pglob
.gl_pathv
[n
]
5810 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
5816 char **m
= strv_env_merge(v
, p
);
5820 strv_free_and_replace(v
, m
);
5830 static bool tty_may_match_dev_console(const char *tty
) {
5831 _cleanup_free_
char *resolved
= NULL
;
5836 tty
= skip_dev_prefix(tty
);
5838 /* trivial identity? */
5839 if (streq(tty
, "console"))
5842 if (resolve_dev_console(&resolved
) < 0)
5843 return true; /* if we could not resolve, assume it may */
5845 /* "tty0" means the active VC, so it may be the same sometimes */
5846 return path_equal(resolved
, tty
) || (streq(resolved
, "tty0") && tty_is_vc(tty
));
5849 static bool exec_context_may_touch_tty(const ExecContext
*ec
) {
5852 return ec
->tty_reset
||
5854 ec
->tty_vt_disallocate
||
5855 is_terminal_input(ec
->std_input
) ||
5856 is_terminal_output(ec
->std_output
) ||
5857 is_terminal_output(ec
->std_error
);
5860 bool exec_context_may_touch_console(const ExecContext
*ec
) {
5862 return exec_context_may_touch_tty(ec
) &&
5863 tty_may_match_dev_console(exec_context_tty_path(ec
));
5866 static void strv_fprintf(FILE *f
, char **l
) {
5870 fprintf(f
, " %s", *g
);
/* Prints one "<prefix><name>: item item ..." line for the string vector 'strv' to 'f'. Nothing at all is
 * printed when the vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
5885 void exec_context_dump(const ExecContext
*c
, FILE* f
, const char *prefix
) {
5891 prefix
= strempty(prefix
);
5895 "%sWorkingDirectory: %s\n"
5896 "%sRootDirectory: %s\n"
5897 "%sNonBlocking: %s\n"
5898 "%sPrivateTmp: %s\n"
5899 "%sPrivateDevices: %s\n"
5900 "%sProtectKernelTunables: %s\n"
5901 "%sProtectKernelModules: %s\n"
5902 "%sProtectKernelLogs: %s\n"
5903 "%sProtectClock: %s\n"
5904 "%sProtectControlGroups: %s\n"
5905 "%sPrivateNetwork: %s\n"
5906 "%sPrivateUsers: %s\n"
5907 "%sProtectHome: %s\n"
5908 "%sProtectSystem: %s\n"
5909 "%sMountAPIVFS: %s\n"
5910 "%sIgnoreSIGPIPE: %s\n"
5911 "%sMemoryDenyWriteExecute: %s\n"
5912 "%sRestrictRealtime: %s\n"
5913 "%sRestrictSUIDSGID: %s\n"
5914 "%sKeyringMode: %s\n"
5915 "%sProtectHostname: %s\n"
5916 "%sProtectProc: %s\n"
5917 "%sProcSubset: %s\n",
5919 prefix
, empty_to_root(c
->working_directory
),
5920 prefix
, empty_to_root(c
->root_directory
),
5921 prefix
, yes_no(c
->non_blocking
),
5922 prefix
, yes_no(c
->private_tmp
),
5923 prefix
, yes_no(c
->private_devices
),
5924 prefix
, yes_no(c
->protect_kernel_tunables
),
5925 prefix
, yes_no(c
->protect_kernel_modules
),
5926 prefix
, yes_no(c
->protect_kernel_logs
),
5927 prefix
, yes_no(c
->protect_clock
),
5928 prefix
, yes_no(c
->protect_control_groups
),
5929 prefix
, yes_no(c
->private_network
),
5930 prefix
, yes_no(c
->private_users
),
5931 prefix
, protect_home_to_string(c
->protect_home
),
5932 prefix
, protect_system_to_string(c
->protect_system
),
5933 prefix
, yes_no(exec_context_get_effective_mount_apivfs(c
)),
5934 prefix
, yes_no(c
->ignore_sigpipe
),
5935 prefix
, yes_no(c
->memory_deny_write_execute
),
5936 prefix
, yes_no(c
->restrict_realtime
),
5937 prefix
, yes_no(c
->restrict_suid_sgid
),
5938 prefix
, exec_keyring_mode_to_string(c
->keyring_mode
),
5939 prefix
, yes_no(c
->protect_hostname
),
5940 prefix
, protect_proc_to_string(c
->protect_proc
),
5941 prefix
, proc_subset_to_string(c
->proc_subset
));
5944 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
5946 if (c
->root_image_options
) {
5947 fprintf(f
, "%sRootImageOptions:", prefix
);
5948 LIST_FOREACH(mount_options
, o
, c
->root_image_options
)
5949 if (!isempty(o
->options
))
5950 fprintf(f
, " %s:%s",
5951 partition_designator_to_string(o
->partition_designator
),
5957 _cleanup_free_
char *encoded
= NULL
;
5958 encoded
= hexmem(c
->root_hash
, c
->root_hash_size
);
5960 fprintf(f
, "%sRootHash: %s\n", prefix
, encoded
);
5963 if (c
->root_hash_path
)
5964 fprintf(f
, "%sRootHash: %s\n", prefix
, c
->root_hash_path
);
5966 if (c
->root_hash_sig
) {
5967 _cleanup_free_
char *encoded
= NULL
;
5969 len
= base64mem(c
->root_hash_sig
, c
->root_hash_sig_size
, &encoded
);
5971 fprintf(f
, "%sRootHashSignature: base64:%s\n", prefix
, encoded
);
5974 if (c
->root_hash_sig_path
)
5975 fprintf(f
, "%sRootHashSignature: %s\n", prefix
, c
->root_hash_sig_path
);
5978 fprintf(f
, "%sRootVerity: %s\n", prefix
, c
->root_verity
);
5980 STRV_FOREACH(e
, c
->environment
)
5981 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
5983 STRV_FOREACH(e
, c
->environment_files
)
5984 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
5986 STRV_FOREACH(e
, c
->pass_environment
)
5987 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
5989 STRV_FOREACH(e
, c
->unset_environment
)
5990 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
5992 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
5994 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
5995 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
5997 for (size_t i
= 0; i
< c
->directories
[dt
].n_items
; i
++) {
5998 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].items
[i
].path
);
6000 STRV_FOREACH(d
, c
->directories
[dt
].items
[i
].symlinks
)
6001 fprintf(f
, "%s%s: %s:%s\n", prefix
, exec_directory_type_symlink_to_string(dt
), c
->directories
[dt
].items
[i
].path
, *d
);
6005 fprintf(f
, "%sTimeoutCleanSec: %s\n", prefix
, FORMAT_TIMESPAN(c
->timeout_clean_usec
, USEC_PER_SEC
));
6008 fprintf(f
, "%sNice: %i\n", prefix
, c
->nice
);
6010 if (c
->oom_score_adjust_set
)
6011 fprintf(f
, "%sOOMScoreAdjust: %i\n", prefix
, c
->oom_score_adjust
);
6013 if (c
->coredump_filter_set
)
6014 fprintf(f
, "%sCoredumpFilter: 0x%"PRIx64
"\n", prefix
, c
->coredump_filter
);
6016 for (unsigned i
= 0; i
< RLIM_NLIMITS
; i
++)
6018 fprintf(f
, "%sLimit%s: " RLIM_FMT
"\n",
6019 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
6020 fprintf(f
, "%sLimit%sSoft: " RLIM_FMT
"\n",
6021 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
6024 if (c
->ioprio_set
) {
6025 _cleanup_free_
char *class_str
= NULL
;
6027 r
= ioprio_class_to_string_alloc(ioprio_prio_class(c
->ioprio
), &class_str
);
6029 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
6031 fprintf(f
, "%sIOPriority: %d\n", prefix
, ioprio_prio_data(c
->ioprio
));
6034 if (c
->cpu_sched_set
) {
6035 _cleanup_free_
char *policy_str
= NULL
;
6037 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
6039 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
6042 "%sCPUSchedulingPriority: %i\n"
6043 "%sCPUSchedulingResetOnFork: %s\n",
6044 prefix
, c
->cpu_sched_priority
,
6045 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
6048 if (c
->cpu_set
.set
) {
6049 _cleanup_free_
char *affinity
= NULL
;
6051 affinity
= cpu_set_to_range_string(&c
->cpu_set
);
6052 fprintf(f
, "%sCPUAffinity: %s\n", prefix
, affinity
);
6055 if (mpol_is_valid(numa_policy_get_type(&c
->numa_policy
))) {
6056 _cleanup_free_
char *nodes
= NULL
;
6058 nodes
= cpu_set_to_range_string(&c
->numa_policy
.nodes
);
6059 fprintf(f
, "%sNUMAPolicy: %s\n", prefix
, mpol_to_string(numa_policy_get_type(&c
->numa_policy
)));
6060 fprintf(f
, "%sNUMAMask: %s\n", prefix
, strnull(nodes
));
6063 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
6064 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
6067 "%sStandardInput: %s\n"
6068 "%sStandardOutput: %s\n"
6069 "%sStandardError: %s\n",
6070 prefix
, exec_input_to_string(c
->std_input
),
6071 prefix
, exec_output_to_string(c
->std_output
),
6072 prefix
, exec_output_to_string(c
->std_error
));
6074 if (c
->std_input
== EXEC_INPUT_NAMED_FD
)
6075 fprintf(f
, "%sStandardInputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDIN_FILENO
]);
6076 if (c
->std_output
== EXEC_OUTPUT_NAMED_FD
)
6077 fprintf(f
, "%sStandardOutputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDOUT_FILENO
]);
6078 if (c
->std_error
== EXEC_OUTPUT_NAMED_FD
)
6079 fprintf(f
, "%sStandardErrorFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDERR_FILENO
]);
6081 if (c
->std_input
== EXEC_INPUT_FILE
)
6082 fprintf(f
, "%sStandardInputFile: %s\n", prefix
, c
->stdio_file
[STDIN_FILENO
]);
6083 if (c
->std_output
== EXEC_OUTPUT_FILE
)
6084 fprintf(f
, "%sStandardOutputFile: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
6085 if (c
->std_output
== EXEC_OUTPUT_FILE_APPEND
)
6086 fprintf(f
, "%sStandardOutputFileToAppend: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
6087 if (c
->std_output
== EXEC_OUTPUT_FILE_TRUNCATE
)
6088 fprintf(f
, "%sStandardOutputFileToTruncate: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
6089 if (c
->std_error
== EXEC_OUTPUT_FILE
)
6090 fprintf(f
, "%sStandardErrorFile: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
6091 if (c
->std_error
== EXEC_OUTPUT_FILE_APPEND
)
6092 fprintf(f
, "%sStandardErrorFileToAppend: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
6093 if (c
->std_error
== EXEC_OUTPUT_FILE_TRUNCATE
)
6094 fprintf(f
, "%sStandardErrorFileToTruncate: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
6100 "%sTTYVHangup: %s\n"
6101 "%sTTYVTDisallocate: %s\n"
6103 "%sTTYColumns: %u\n",
6104 prefix
, c
->tty_path
,
6105 prefix
, yes_no(c
->tty_reset
),
6106 prefix
, yes_no(c
->tty_vhangup
),
6107 prefix
, yes_no(c
->tty_vt_disallocate
),
6108 prefix
, c
->tty_rows
,
6109 prefix
, c
->tty_cols
);
6111 if (IN_SET(c
->std_output
,
6113 EXEC_OUTPUT_JOURNAL
,
6114 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
6115 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
6116 IN_SET(c
->std_error
,
6118 EXEC_OUTPUT_JOURNAL
,
6119 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
6120 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
6122 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
6124 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
6126 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
6128 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
6130 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
6133 if (c
->log_level_max
>= 0) {
6134 _cleanup_free_
char *t
= NULL
;
6136 (void) log_level_to_string_alloc(c
->log_level_max
, &t
);
6138 fprintf(f
, "%sLogLevelMax: %s\n", prefix
, strna(t
));
6141 if (c
->log_ratelimit_interval_usec
> 0)
6143 "%sLogRateLimitIntervalSec: %s\n",
6144 prefix
, FORMAT_TIMESPAN(c
->log_ratelimit_interval_usec
, USEC_PER_SEC
));
6146 if (c
->log_ratelimit_burst
> 0)
6147 fprintf(f
, "%sLogRateLimitBurst: %u\n", prefix
, c
->log_ratelimit_burst
);
6149 if (!set_isempty(c
->log_filter_allowed_patterns
) || !set_isempty(c
->log_filter_denied_patterns
)) {
6150 fprintf(f
, "%sLogFilterPatterns:", prefix
);
6153 SET_FOREACH(pattern
, c
->log_filter_allowed_patterns
)
6154 fprintf(f
, " %s", pattern
);
6155 SET_FOREACH(pattern
, c
->log_filter_denied_patterns
)
6156 fprintf(f
, " ~%s", pattern
);
6160 for (size_t j
= 0; j
< c
->n_log_extra_fields
; j
++) {
6161 fprintf(f
, "%sLogExtraFields: ", prefix
);
6162 fwrite(c
->log_extra_fields
[j
].iov_base
,
6163 1, c
->log_extra_fields
[j
].iov_len
,
6168 if (c
->log_namespace
)
6169 fprintf(f
, "%sLogNamespace: %s\n", prefix
, c
->log_namespace
);
6171 if (c
->secure_bits
) {
6172 _cleanup_free_
char *str
= NULL
;
6174 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
6176 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
6179 if (c
->capability_bounding_set
!= CAP_ALL
) {
6180 _cleanup_free_
char *str
= NULL
;
6182 r
= capability_set_to_string_alloc(c
->capability_bounding_set
, &str
);
6184 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
6187 if (c
->capability_ambient_set
!= 0) {
6188 _cleanup_free_
char *str
= NULL
;
6190 r
= capability_set_to_string_alloc(c
->capability_ambient_set
, &str
);
6192 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
6196 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
6198 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
6200 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
6202 strv_dump(f
, prefix
, "SupplementaryGroups", c
->supplementary_groups
);
6205 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
6207 strv_dump(f
, prefix
, "ReadWritePaths", c
->read_write_paths
);
6208 strv_dump(f
, prefix
, "ReadOnlyPaths", c
->read_only_paths
);
6209 strv_dump(f
, prefix
, "InaccessiblePaths", c
->inaccessible_paths
);
6210 strv_dump(f
, prefix
, "ExecPaths", c
->exec_paths
);
6211 strv_dump(f
, prefix
, "NoExecPaths", c
->no_exec_paths
);
6212 strv_dump(f
, prefix
, "ExecSearchPath", c
->exec_search_path
);
6214 for (size_t i
= 0; i
< c
->n_bind_mounts
; i
++)
6215 fprintf(f
, "%s%s: %s%s:%s:%s\n", prefix
,
6216 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
6217 c
->bind_mounts
[i
].ignore_enoent
? "-": "",
6218 c
->bind_mounts
[i
].source
,
6219 c
->bind_mounts
[i
].destination
,
6220 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
6222 for (size_t i
= 0; i
< c
->n_temporary_filesystems
; i
++) {
6223 const TemporaryFileSystem
*t
= c
->temporary_filesystems
+ i
;
6225 fprintf(f
, "%sTemporaryFileSystem: %s%s%s\n", prefix
,
6227 isempty(t
->options
) ? "" : ":",
6228 strempty(t
->options
));
6233 "%sUtmpIdentifier: %s\n",
6234 prefix
, c
->utmp_id
);
6236 if (c
->selinux_context
)
6238 "%sSELinuxContext: %s%s\n",
6239 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
6241 if (c
->apparmor_profile
)
6243 "%sAppArmorProfile: %s%s\n",
6244 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
6246 if (c
->smack_process_label
)
6248 "%sSmackProcessLabel: %s%s\n",
6249 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
6251 if (c
->personality
!= PERSONALITY_INVALID
)
6253 "%sPersonality: %s\n",
6254 prefix
, strna(personality_to_string(c
->personality
)));
6257 "%sLockPersonality: %s\n",
6258 prefix
, yes_no(c
->lock_personality
));
6260 if (c
->syscall_filter
) {
6262 "%sSystemCallFilter: ",
6265 if (!c
->syscall_allow_list
)
6271 HASHMAP_FOREACH_KEY(val
, id
, c
->syscall_filter
) {
6272 _cleanup_free_
char *name
= NULL
;
6273 const char *errno_name
= NULL
;
6274 int num
= PTR_TO_INT(val
);
6281 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
6282 fputs(strna(name
), f
);
6285 errno_name
= seccomp_errno_or_action_to_string(num
);
6287 fprintf(f
, ":%s", errno_name
);
6289 fprintf(f
, ":%d", num
);
6297 if (c
->syscall_archs
) {
6299 "%sSystemCallArchitectures:",
6304 SET_FOREACH(id
, c
->syscall_archs
)
6305 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
6310 if (exec_context_restrict_namespaces_set(c
)) {
6311 _cleanup_free_
char *s
= NULL
;
6313 r
= namespace_flags_to_string(c
->restrict_namespaces
, &s
);
6315 fprintf(f
, "%sRestrictNamespaces: %s\n",
6320 if (exec_context_restrict_filesystems_set(c
)) {
6322 SET_FOREACH(fs
, c
->restrict_filesystems
)
6323 fprintf(f
, "%sRestrictFileSystems: %s\n", prefix
, fs
);
6327 if (c
->network_namespace_path
)
6329 "%sNetworkNamespacePath: %s\n",
6330 prefix
, c
->network_namespace_path
);
6332 if (c
->syscall_errno
> 0) {
6333 fprintf(f
, "%sSystemCallErrorNumber: ", prefix
);
6336 const char *errno_name
= seccomp_errno_or_action_to_string(c
->syscall_errno
);
6338 fputs(errno_name
, f
);
6340 fprintf(f
, "%d", c
->syscall_errno
);
6345 for (size_t i
= 0; i
< c
->n_mount_images
; i
++) {
6346 fprintf(f
, "%sMountImages: %s%s:%s", prefix
,
6347 c
->mount_images
[i
].ignore_enoent
? "-": "",
6348 c
->mount_images
[i
].source
,
6349 c
->mount_images
[i
].destination
);
6350 LIST_FOREACH(mount_options
, o
, c
->mount_images
[i
].mount_options
)
6351 fprintf(f
, ":%s:%s",
6352 partition_designator_to_string(o
->partition_designator
),
6353 strempty(o
->options
));
6357 for (size_t i
= 0; i
< c
->n_extension_images
; i
++) {
6358 fprintf(f
, "%sExtensionImages: %s%s", prefix
,
6359 c
->extension_images
[i
].ignore_enoent
? "-": "",
6360 c
->extension_images
[i
].source
);
6361 LIST_FOREACH(mount_options
, o
, c
->extension_images
[i
].mount_options
)
6362 fprintf(f
, ":%s:%s",
6363 partition_designator_to_string(o
->partition_designator
),
6364 strempty(o
->options
));
6368 strv_dump(f
, prefix
, "ExtensionDirectories", c
->extension_directories
);
/* Predicate on the configured user of an execution context. The check below matches
 * c->user against the literal names "root" and "0"; see the existing comment for the
 * intended contract. */
6371 bool exec_context_maintains_privileges(const ExecContext
*c
) {
6374 /* Returns true if the process forked off would run under
6375 * an unchanged UID or as root. */
6380 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
/* Returns an I/O priority value for the context. The code queries the priority of the
 * calling process (ioprio_get() with IOPRIO_WHO_PROCESS and id 0), has a path returning
 * IOPRIO_DEFAULT_CLASS_AND_PRIO, and otherwise returns the queried value passed through
 * ioprio_normalize().
 * NOTE(review): presumably the default is the fallback for a failed ioprio_get() — confirm. */
6386 int exec_context_get_effective_ioprio(const ExecContext
*c
) {
6394 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
6396 return IOPRIO_DEFAULT_CLASS_AND_PRIO
;
6398 return ioprio_normalize(p
);
/* Computes the effective "mount API VFS" setting: when c->mount_apivfs_set is true the
 * stored c->mount_apivfs value is returned as is; otherwise the result depends on
 * exec_context_with_rootfs(c). */
6401 bool exec_context_get_effective_mount_apivfs(const ExecContext
*c
) {
6404 /* Explicit setting wins */
6405 if (c
->mount_apivfs_set
)
6406 return c
->mount_apivfs
;
6408 /* Default to "yes" if root directory or image are specified */
6409 if (exec_context_with_rootfs(c
))
/* Drops all extra log fields of the context: frees the iov_base buffer of each of the
 * c->n_log_extra_fields entries, frees the array itself (mfree() result is stored back
 * into the pointer), and resets the element count to 0. */
6415 void exec_context_free_log_extra_fields(ExecContext
*c
) {
6418 for (size_t l
= 0; l
< c
->n_log_extra_fields
; l
++)
6419 free(c
->log_extra_fields
[l
].iov_base
);
6420 c
->log_extra_fields
= mfree(c
->log_extra_fields
);
6421 c
->n_log_extra_fields
= 0;
/* Best-effort restoration of the context's TTY: resets it via exec_context_tty_reset(),
 * then opens the configured TTY path with O_PATH|O_CLOEXEC and applies TTY_MODE, owner
 * UID 0 and group TTY_GID through fchmod_and_chown(). Every failure shown here is only
 * logged ("ignoring"); nothing is reported to the caller (void function). */
6424 void exec_context_revert_tty(ExecContext
*c
) {
6425 _cleanup_close_
int fd
= -EBADF
;
6432 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6433 exec_context_tty_reset(c
, NULL
);
6435 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6436 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6437 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6438 if (!exec_context_may_touch_tty(c
))
6441 path
= exec_context_tty_path(c
);
/* O_PATH: only a reference to the inode is obtained, the TTY itself is not opened. */
6445 fd
= open(path
, O_PATH
|O_CLOEXEC
);
/* A missing path (ENOENT) is logged at debug level only, anything else as a warning. */
6447 return (void) log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
,
6448 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6451 if (fstat(fd
, &st
) < 0)
6452 return (void) log_warning_errno(errno
, "Failed to stat TTY '%s', ignoring: %m", path
);
6454 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6455 * if things are a character device, since a proper check either means we'd have to open the TTY and
6456 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6457 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6458 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6459 if (!S_ISCHR(st
.st_mode
))
/* NOTE(review): unlike the two early returns above, this one is not cast to (void) —
 * consider matching them for consistency. */
6460 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path
);
6462 r
= fchmod_and_chown(fd
, TTY_MODE
, 0, TTY_GID
);
6464 log_warning_errno(r
, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path
);
/* Collects, into the string vector l, the paths belonging to the directory types selected
 * in mask (bit 1U << t per ExecDirectoryType t). For every configured item of a selected
 * type three kinds of entries are appended via strv_consume():
 *   - prefix[t] joined with the item's path,
 *   - prefix[t]/"private"/path, for every type except EXEC_DIRECTORY_CONFIGURATION,
 *   - prefix[t] joined with each of the item's symlinks.
 * NOTE(review): strv_consume() presumably takes ownership of j in all cases — confirm. */
6467 int exec_context_get_clean_directories(
6473 _cleanup_strv_free_
char **l
= NULL
;
6480 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
/* Skip directory types whose bit is not set in the mask. */
6481 if (!FLAGS_SET(mask
, 1U << t
))
6487 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
6490 j
= path_join(prefix
[t
], c
->directories
[t
].items
[i
].path
);
6494 r
= strv_consume(&l
, j
);
6498 /* Also remove private directories unconditionally. */
6499 if (t
!= EXEC_DIRECTORY_CONFIGURATION
) {
6500 j
= path_join(prefix
[t
], "private", c
->directories
[t
].items
[i
].path
);
6504 r
= strv_consume(&l
, j
);
6509 STRV_FOREACH(symlink
, c
->directories
[t
].items
[i
].symlinks
) {
6510 j
= path_join(prefix
[t
], *symlink
);
6514 r
= strv_consume(&l
, j
);
/* Computes an ExecCleanMask for the context, starting from 0 and visiting every
 * ExecDirectoryType; types with at least one configured item (n_items > 0) are the ones
 * acted upon.
 * NOTE(review): presumably the bit 1U << t is set for such types and the mask is handed
 * back through *ret — confirm. */
6525 int exec_context_get_clean_mask(ExecContext
*c
, ExecCleanMask
*ret
) {
6526 ExecCleanMask mask
= 0;
6531 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
6532 if (c
->directories
[t
].n_items
> 0)
/* Marks the start of a process in the status record: stores the current time in
 * s->start_timestamp via dual_timestamp_get().
 * NOTE(review): presumably pid is recorded in *s as well — confirm. */
6539 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
6546 dual_timestamp_get(&s
->start_timestamp
);
/* Marks the exit of a process in the status record: stores the current time in
 * s->exit_timestamp and, if a context with a utmp_id is supplied, writes a dead-process
 * utmp record for pid/code/status. context may be NULL (it is checked before use); the
 * result of utmp_put_dead_process() is deliberately ignored. */
6549 void exec_status_exit(ExecStatus
*s
, const ExecContext
*context
, pid_t pid
, int code
, int status
) {
6557 dual_timestamp_get(&s
->exit_timestamp
);
6562 if (context
&& context
->utmp_id
)
6563 (void) utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
/* Resets the status record by assigning an empty compound literal, i.e. all members of
 * *s are zero-initialized. */
6566 void exec_status_reset(ExecStatus
*s
) {
6569 *s
= (ExecStatus
) {};
/* Writes a human-readable dump of the status record, each line starting with prefix
 * (a NULL prefix is turned into "" by strempty()). The start and exit timestamps are only
 * printed when set; both are formatted from their realtime clock value. */
6572 void exec_status_dump(const ExecStatus
*s
, FILE *f
, const char *prefix
) {
6579 prefix
= strempty(prefix
);
6582 "%sPID: "PID_FMT
"\n",
6585 if (dual_timestamp_is_set(&s
->start_timestamp
))
6587 "%sStart Timestamp: %s\n",
6588 prefix
, FORMAT_TIMESTAMP(s
->start_timestamp
.realtime
));
6590 if (dual_timestamp_is_set(&s
->exit_timestamp
))
6592 "%sExit Timestamp: %s\n"
6594 "%sExit Status: %i\n",
6595 prefix
, FORMAT_TIMESTAMP(s
->exit_timestamp
.realtime
),
6596 prefix
, sigchld_code_to_string(s
->code
),
/* Dumps a single command: its quoted command line (built from c->argv with
 * SHELL_ESCAPE_EMPTY) followed by its exec status. The status is printed with prefix2,
 * which is prefix plus one tab, so it appears one level deeper than the command line.
 * strnull() guards the output against cmd being NULL. */
6600 static void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
6601 _cleanup_free_
char *cmd
= NULL
;
6602 const char *prefix2
;
6607 prefix
= strempty(prefix
);
6608 prefix2
= strjoina(prefix
, "\t");
6610 cmd
= quote_command_line(c
->argv
, SHELL_ESCAPE_EMPTY
);
6613 "%sCommand Line: %s\n",
6614 prefix
, strnull(cmd
));
6616 exec_status_dump(&c
->exec_status
, f
, prefix2
);
/* Dumps every command of the list headed by c, in list order, by calling
 * exec_command_dump() on each entry with the same (NULL-safe) prefix. */
6619 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
6622 prefix
= strempty(prefix
);
6624 LIST_FOREACH(command
, i
, c
)
6625 exec_command_dump(i
, f
, prefix
);
/* Appends command e at the end of the list *l: the current tail is located with
 * LIST_FIND_TAIL() and e is inserted right after it, so existing entries keep their
 * order and e becomes the last one. */
6628 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
6635 /* It's kind of important, that we keep the order here */
6636 LIST_FIND_TAIL(command
, *l
, end
);
6637 LIST_INSERT_AFTER(command
, *l
, end
, e
);
/* Replaces the path and argument vector of command c. The new argv is built from path
 * and the variadic arguments with strv_new_ap(); c->path and c->argv are then swapped
 * for the new values with free_and_replace()/strv_free_and_replace(), which release the
 * previous ones. The function returns the result of strv_free_and_replace().
 * NOTE(review): p is presumably a heap copy of path — confirm. */
6642 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
6650 l
= strv_new_ap(path
, ap
);
6662 free_and_replace(c
->path
, p
);
6664 return strv_free_and_replace(c
->argv
, l
);
/* Appends arguments to the argv of command c: a temporary vector l is built from path
 * and the variadic arguments, then added to c->argv with strv_extend_strv(); the final
 * "false" argument is that function's third parameter (NOTE(review): presumably
 * "filter duplicates" — confirm). l is released automatically on return
 * (_cleanup_strv_free_). */
6667 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
6668 _cleanup_strv_free_
char **l
= NULL
;
6676 l
= strv_new_ap(path
, ap
);
6682 r
= strv_extend_strv(&c
->argv
, l
, false);
/* Thread entry point that recursively removes a directory. p is the directory path as a
 * heap string; it is owned by this function and freed on return (_cleanup_free_). The
 * removal includes the top-level directory itself (REMOVE_ROOT) and its result is
 * ignored. */
6689 static void *remove_tmpdir_thread(void *p
) {
6690 _cleanup_free_
char *path
= p
;
6692 (void) rm_rf(path
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
/* Releases an ExecRuntime object: removes it from the manager's exec_runtime_by_id map,
 * optionally (destroy == true) schedules asynchronous removal of its temporary
 * directories, then frees its strings and closes both storage socket pairs.
 * Directories equal to RUN_SYSTEMD_EMPTY are never removed. */
6696 static ExecRuntime
* exec_runtime_free(ExecRuntime
*rt
, bool destroy
) {
6703 (void) hashmap_remove(rt
->manager
->exec_runtime_by_id
, rt
->id
);
6705 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
6707 if (destroy
&& rt
->tmp_dir
&& !streq(rt
->tmp_dir
, RUN_SYSTEMD_EMPTY
)) {
6708 log_debug("Spawning thread to nuke %s", rt
->tmp_dir
);
/* The path string is handed to remove_tmpdir_thread(), which frees it. */
6710 r
= asynchronous_job(remove_tmpdir_thread
, rt
->tmp_dir
);
6712 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->tmp_dir
);
6717 if (destroy
&& rt
->var_tmp_dir
&& !streq(rt
->var_tmp_dir
, RUN_SYSTEMD_EMPTY
)) {
6718 log_debug("Spawning thread to nuke %s", rt
->var_tmp_dir
);
6720 r
= asynchronous_job(remove_tmpdir_thread
, rt
->var_tmp_dir
);
6722 log_warning_errno(r
, "Failed to nuke %s: %m", rt
->var_tmp_dir
);
/* Ownership of the string moved to the removal thread; forget it so that the mfree()
 * below does not free it a second time. */
6724 rt
->var_tmp_dir
= NULL
;
6727 rt
->id
= mfree(rt
->id
);
6728 rt
->tmp_dir
= mfree(rt
->tmp_dir
);
6729 rt
->var_tmp_dir
= mfree(rt
->var_tmp_dir
);
6730 safe_close_pair(rt
->netns_storage_socket
);
6731 safe_close_pair(rt
->ipcns_storage_socket
);
/* Cleanup helper for use with _cleanup_(): frees the ExecRuntime the pointer refers to
 * without destroying its temporary directories (destroy == false). */
6735 static void exec_runtime_freep(ExecRuntime
**rt
) {
6736 (void) exec_runtime_free(*rt
, false);
/* Allocates a new ExecRuntime named id. The id string is duplicated and the copy is moved
 * into the new object (TAKE_PTR); both storage socket pairs start out as PIPE_EBADF,
 * i.e. "no file descriptors". All members not named in the initializer are zero. */
6739 static int exec_runtime_allocate(ExecRuntime
**ret
, const char *id
) {
6740 _cleanup_free_
char *id_copy
= NULL
;
6745 id_copy
= strdup(id
);
6749 n
= new(ExecRuntime
, 1);
6753 *n
= (ExecRuntime
) {
6754 .id
= TAKE_PTR(id_copy
),
6755 .netns_storage_socket
= PIPE_EBADF
,
6756 .ipcns_storage_socket
= PIPE_EBADF
,
/* Creates an ExecRuntime for id and registers it in m->exec_runtime_by_id (keyed by the
 * object's own id string). The temporary directory strings and the file descriptors of
 * both socket pairs are moved into the new object (TAKE_PTR/TAKE_FD), so the caller's
 * variables no longer own them afterwards. Either socket pair argument may be NULL, in
 * which case the object's pair is left as initialized by exec_runtime_allocate(). */
6763 static int exec_runtime_add(
6768 int netns_storage_socket
[2],
6769 int ipcns_storage_socket
[2],
6770 ExecRuntime
**ret
) {
6772 _cleanup_(exec_runtime_freep
) ExecRuntime
*rt
= NULL
;
6778 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6780 r
= exec_runtime_allocate(&rt
, id
);
6784 r
= hashmap_ensure_put(&m
->exec_runtime_by_id
, &string_hash_ops
, rt
->id
, rt
);
/* NOTE(review): this assert inspects rt's own fields before they are assigned on the next
 * two lines; if the intent is to validate the incoming *tmp_dir / *var_tmp_dir pair, the
 * check should be on those — confirm. */
6788 assert(!!rt
->tmp_dir
== !!rt
->var_tmp_dir
); /* We require both to be set together */
6789 rt
->tmp_dir
= TAKE_PTR(*tmp_dir
);
6790 rt
->var_tmp_dir
= TAKE_PTR(*var_tmp_dir
);
6792 if (netns_storage_socket
) {
6793 rt
->netns_storage_socket
[0] = TAKE_FD(netns_storage_socket
[0]);
6794 rt
->netns_storage_socket
[1] = TAKE_FD(netns_storage_socket
[1]);
6797 if (ipcns_storage_socket
) {
6798 rt
->ipcns_storage_socket
[0] = TAKE_FD(ipcns_storage_socket
[0]);
6799 rt
->ipcns_storage_socket
[1] = TAKE_FD(ipcns_storage_socket
[1]);
6806 /* do not remove created ExecRuntime object when the operation succeeds. */
/* Creates the runtime resources the context c asks for and registers them under id via
 * exec_runtime_add():
 *   - private temporary directories (setup_tmp_dirs()) when private_tmp is set, unless
 *     /tmp and /var/tmp (or all of /var) are listed as inaccessible paths anyway,
 *   - an AF_UNIX datagram socket pair for the network namespace when private_network or
 *     network_namespace_path is set,
 *   - an AF_UNIX datagram socket pair for the IPC namespace when private_ipc or
 *     ipc_namespace_path is set.
 * The local directories and sockets are released by their _cleanup_ handlers unless they
 * were moved into the ExecRuntime object. */
6811 static int exec_runtime_make(
6813 const ExecContext
*c
,
6815 ExecRuntime
**ret
) {
6817 _cleanup_(namespace_cleanup_tmpdirp
) char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
6818 _cleanup_close_pair_
int netns_storage_socket
[2] = PIPE_EBADF
, ipcns_storage_socket
[2] = PIPE_EBADF
;
6825 /* It is not necessary to create ExecRuntime object. */
/* NOTE(review): this early-out does not look at c->ipc_namespace_path, although the IPC
 * socket pair below is also created when only ipc_namespace_path is set — confirm whether
 * a context with just that setting is meant to get no ExecRuntime object. */
6826 if (!c
->private_network
&& !c
->private_ipc
&& !c
->private_tmp
&& !c
->network_namespace_path
) {
6831 if (c
->private_tmp
&&
6832 !(prefixed_path_strv_contains(c
->inaccessible_paths
, "/tmp") &&
6833 (prefixed_path_strv_contains(c
->inaccessible_paths
, "/var/tmp") ||
6834 prefixed_path_strv_contains(c
->inaccessible_paths
, "/var")))) {
6835 r
= setup_tmp_dirs(id
, &tmp_dir
, &var_tmp_dir
);
6840 if (c
->private_network
|| c
->network_namespace_path
) {
6841 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, netns_storage_socket
) < 0)
6845 if (c
->private_ipc
|| c
->ipc_namespace_path
) {
6846 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, ipcns_storage_socket
) < 0)
6850 r
= exec_runtime_add(m
, id
, &tmp_dir
, &var_tmp_dir
, netns_storage_socket
, ipcns_storage_socket
, ret
);
/* Obtains the ExecRuntime object registered under id: an existing object is looked up in
 * m->exec_runtime_by_id and reused; otherwise a new one is built from the context c with
 * exec_runtime_make(). See the inline comments for the reference counting and for the
 * meaning of a zero result from exec_runtime_make().
 * NOTE(review): the create flag presumably decides whether a missing object may be
 * created — confirm. */
6857 int exec_runtime_acquire(Manager
*m
, const ExecContext
*c
, const char *id
, bool create
, ExecRuntime
**ret
) {
6865 rt
= hashmap_get(m
->exec_runtime_by_id
, id
);
6867 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
6875 /* If not found, then create a new object. */
6876 r
= exec_runtime_make(m
, c
, id
, &rt
);
6880 /* When r == 0, it is not necessary to create ExecRuntime object. */
6886 /* increment reference counter. */
/* Drops a reference to rt; the reference count must be positive on entry (asserted).
 * The object is released through exec_runtime_free(), with destroy passed through to
 * control whether its temporary directories are removed.
 * NOTE(review): presumably the free only happens once n_ref drops to zero — confirm. */
6892 ExecRuntime
*exec_runtime_unref(ExecRuntime
*rt
, bool destroy
) {
6896 assert(rt
->n_ref
> 0);
6902 return exec_runtime_free(rt
, destroy
);
/* Serializes every ExecRuntime object of the manager to f. Each object produces text of
 * the form
 *     exec-runtime=<id> tmp-dir=<path> var-tmp-dir=<path> netns-socket-0=<fd> ...
 * where every field after the id is preceded by a single space. Socket file descriptors
 * that are valid (>= 0) are duplicated into fds with fdset_put_dup() and the number of
 * the duplicate is what gets written. */
6905 int exec_runtime_serialize(const Manager
*m
, FILE *f
, FDSet
*fds
) {
6912 HASHMAP_FOREACH(rt
, m
->exec_runtime_by_id
) {
6913 fprintf(f
, "exec-runtime=%s", rt
->id
);
6916 fprintf(f
, " tmp-dir=%s", rt
->tmp_dir
);
6918 if (rt
->var_tmp_dir
)
6919 fprintf(f
, " var-tmp-dir=%s", rt
->var_tmp_dir
);
6921 if (rt
->netns_storage_socket
[0] >= 0) {
6924 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
6928 fprintf(f
, " netns-socket-0=%i", copy
);
6931 if (rt
->netns_storage_socket
[1] >= 0) {
6934 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
6938 fprintf(f
, " netns-socket-1=%i", copy
);
6941 if (rt
->ipcns_storage_socket
[0] >= 0) {
6944 copy
= fdset_put_dup(fds
, rt
->ipcns_storage_socket
[0]);
6948 fprintf(f
, " ipcns-socket-0=%i", copy
);
6951 if (rt
->ipcns_storage_socket
[1] >= 0) {
6954 copy
= fdset_put_dup(fds
, rt
->ipcns_storage_socket
[1]);
6958 fprintf(f
, " ipcns-socket-1=%i", copy
);
/* Applies one key/value pair of the old per-unit serialization format to the ExecRuntime
 * object registered under the unit's id, creating that object first if none exists.
 * Handled keys: "tmp-dir", "var-tmp-dir", "netns-socket-0", "netns-socket-1". For the
 * socket keys the value must parse as an integer that is contained in fds; the fd is then
 * taken out of fds and replaces (after closing) the one currently stored. A newly created
 * object is inserted into the manager's exec_runtime_by_id map, and ownership leaves the
 * local _cleanup_ variable through TAKE_PTR(). */
6967 int exec_runtime_deserialize_compat(Unit
*u
, const char *key
, const char *value
, FDSet
*fds
) {
6968 _cleanup_(exec_runtime_freep
) ExecRuntime
*rt_create
= NULL
;
6972 /* This is for the migration from old (v237 or earlier) deserialization text.
6973 * Due to bug #7790, this may not work with units that use JoinsNamespaceOf=.
6974 * Even if the ExecRuntime object was originally created by another unit, we cannot tell
6975 * that from the serialized text, so we always create a new object owned by this unit. */
6981 /* Manager manages ExecRuntime objects by the unit id.
6982 * So, we omit the serialized text when the unit does not have id (yet?)... */
6983 if (isempty(u
->id
)) {
/* NOTE(review): the message speaks of an "Invocation ID" while the check above is on the
 * unit id (u->id) — confirm the wording. */
6984 log_unit_debug(u
, "Invocation ID not found. Dropping runtime parameter.");
6988 if (hashmap_ensure_allocated(&u
->manager
->exec_runtime_by_id
, &string_hash_ops
) < 0)
6991 rt
= hashmap_get(u
->manager
->exec_runtime_by_id
, u
->id
);
6993 if (exec_runtime_allocate(&rt_create
, u
->id
) < 0)
6999 if (streq(key
, "tmp-dir")) {
7000 if (free_and_strdup_warn(&rt
->tmp_dir
, value
) < 0)
7003 } else if (streq(key
, "var-tmp-dir")) {
7004 if (free_and_strdup_warn(&rt
->var_tmp_dir
, value
) < 0)
7007 } else if (streq(key
, "netns-socket-0")) {
7010 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
)) {
7011 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
7015 safe_close(rt
->netns_storage_socket
[0]);
7016 rt
->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
7018 } else if (streq(key
, "netns-socket-1")) {
7021 if (safe_atoi(value
, &fd
) < 0 || !fdset_contains(fds
, fd
)) {
7022 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
7026 safe_close(rt
->netns_storage_socket
[1]);
7027 rt
->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
7032 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
7034 r
= hashmap_put(u
->manager
->exec_runtime_by_id
, rt_create
->id
, rt_create
);
7036 log_unit_debug_errno(u
, r
, "Failed to put runtime parameter to manager's storage: %m");
7040 rt_create
->manager
= u
->manager
;
7043 TAKE_PTR(rt_create
);
/* Parses one serialized ExecRuntime record and registers the resulting object with the
 * manager through exec_runtime_add(). value starts with the id (everything up to the
 * first space); the remainder is matched, in this order, against the optional fields
 * "tmp-dir=", "var-tmp-dir=", "netns-socket-0=", "netns-socket-1=", "ipcns-socket-0=" and
 * "ipcns-socket-1=", each value again extending up to the next space. Socket values must
 * parse as integers that are contained in fds; such fds are taken out of fds with
 * fdset_remove(). Parse and lookup failures are logged at debug level and returned. */
7049 int exec_runtime_deserialize_one(Manager
*m
, const char *value
, FDSet
*fds
) {
7050 _cleanup_free_
char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
/* NOTE(review): other fd pairs in this file are initialized with PIPE_EBADF rather than
 * a literal {-1, -1} — consider matching them. */
7052 int r
, netns_fdpair
[] = {-1, -1}, ipcns_fdpair
[] = {-1, -1};
7053 const char *p
, *v
= ASSERT_PTR(value
);
7059 n
= strcspn(v
, " ");
7060 id
= strndupa_safe(v
, n
);
7065 v
= startswith(p
, "tmp-dir=");
7067 n
= strcspn(v
, " ");
7068 tmp_dir
= strndup(v
, n
);
7076 v
= startswith(p
, "var-tmp-dir=");
7078 n
= strcspn(v
, " ");
7079 var_tmp_dir
= strndup(v
, n
);
7087 v
= startswith(p
, "netns-socket-0=");
7091 n
= strcspn(v
, " ");
7092 buf
= strndupa_safe(v
, n
);
7094 r
= safe_atoi(buf
, &netns_fdpair
[0]);
7096 return log_debug_errno(r
, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf
);
7097 if (!fdset_contains(fds
, netns_fdpair
[0]))
7098 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
7099 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair
[0]);
7100 netns_fdpair
[0] = fdset_remove(fds
, netns_fdpair
[0]);
7106 v
= startswith(p
, "netns-socket-1=");
7110 n
= strcspn(v
, " ");
7111 buf
= strndupa_safe(v
, n
);
7113 r
= safe_atoi(buf
, &netns_fdpair
[1]);
7115 return log_debug_errno(r
, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf
);
7116 if (!fdset_contains(fds
, netns_fdpair
[1]))
7117 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
7118 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair
[1]);
7119 netns_fdpair
[1] = fdset_remove(fds
, netns_fdpair
[1]);
7125 v
= startswith(p
, "ipcns-socket-0=");
7129 n
= strcspn(v
, " ");
7130 buf
= strndupa_safe(v
, n
);
7132 r
= safe_atoi(buf
, &ipcns_fdpair
[0]);
7134 return log_debug_errno(r
, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf
);
7135 if (!fdset_contains(fds
, ipcns_fdpair
[0]))
7136 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
7137 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair
[0]);
7138 ipcns_fdpair
[0] = fdset_remove(fds
, ipcns_fdpair
[0]);
7144 v
= startswith(p
, "ipcns-socket-1=");
7148 n
= strcspn(v
, " ");
7149 buf
= strndupa_safe(v
, n
);
7151 r
= safe_atoi(buf
, &ipcns_fdpair
[1]);
7153 return log_debug_errno(r
, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf
);
7154 if (!fdset_contains(fds
, ipcns_fdpair
[1]))
7155 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
7156 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair
[1]);
7157 ipcns_fdpair
[1] = fdset_remove(fds
, ipcns_fdpair
[1]);
/* No ExecRuntime pointer is requested back (ret == NULL); the object lives on in the
 * manager's map. */
7161 r
= exec_runtime_add(m
, id
, &tmp_dir
, &var_tmp_dir
, netns_fdpair
, ipcns_fdpair
, NULL
);
7163 return log_debug_errno(r
, "Failed to add exec-runtime: %m");
/* Walks all ExecRuntime objects of the manager and frees objects with
 * exec_runtime_free(rt, false), i.e. without removing their temporary directories.
 * NOTE(review): per the comment below only unreferenced objects are meant to be freed;
 * the reference-count test presumably sits between the loop header and the free call —
 * confirm. */
7167 void exec_runtime_vacuum(Manager
*m
) {
7172 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
7174 HASHMAP_FOREACH(rt
, m
->exec_runtime_by_id
) {
7178 (void) exec_runtime_free(rt
, false);
/* Releases the resources held by the parameter block: frees the environment and fd_names
 * string vectors and the fds array, and closes exec_fd. Each member is overwritten with
 * the return value of its release function, so the structure can be cleared again
 * safely. */
7182 void exec_params_clear(ExecParameters
*p
) {
7186 p
->environment
= strv_free(p
->environment
);
7187 p
->fd_names
= strv_free(p
->fd_names
);
7188 p
->fds
= mfree(p
->fds
);
7189 p
->exec_fd
= safe_close(p
->exec_fd
);
/* Destructor for ExecSetCredential objects; it is also registered as the value destructor
 * of exec_set_credential_hash_ops.
 * NOTE(review): presumably returns NULL like the other free helpers in this file —
 * confirm. */
7192 ExecSetCredential
*exec_set_credential_free(ExecSetCredential
*sc
) {
/* Destructor for ExecLoadCredential objects; it is also registered as the value
 * destructor of exec_load_credential_hash_ops.
 * NOTE(review): presumably returns NULL like the other free helpers in this file —
 * confirm. */
7201 ExecLoadCredential
*exec_load_credential_free(ExecLoadCredential
*lc
) {
/* Releases the contents of an ExecDirectory: the path string and the symlinks vector of
 * every item, then the items array itself (the pointer is reset via mfree()). The
 * ExecDirectory structure itself is not freed.
 * NOTE(review): n_items is presumably reset to 0 as well — confirm. */
7210 void exec_directory_done(ExecDirectory
*d
) {
7214 for (size_t i
= 0; i
< d
->n_items
; i
++) {
7215 free(d
->items
[i
].path
);
7216 strv_free(d
->items
[i
].symlinks
);
7219 d
->items
= mfree(d
->items
);
/* Linear search for the item of d whose path equals path according to path_equal().
 * Returns a pointer into d->items for the first match.
 * NOTE(review): presumably NULL is returned when nothing matches — confirm. */
7224 static ExecDirectoryItem
*exec_directory_find(ExecDirectory
*d
, const char *path
) {
7228 for (size_t i
= 0; i
< d
->n_items
; i
++)
7229 if (path_equal(d
->items
[i
].path
, path
))
7230 return &d
->items
[i
];
/* Adds path, optionally with one symlink, to the directory list d.
 * If an item for path already exists, symlink is appended to that item's symlinks and 0
 * is returned. Otherwise the items array is grown by one (GREEDY_REALLOC) and a new item
 * is appended that takes ownership of the local path copy p and of the one-element
 * symlink vector s; 1 is returned in that case. */
7235 int exec_directory_add(ExecDirectory
*d
, const char *path
, const char *symlink
) {
7236 _cleanup_strv_free_
char **s
= NULL
;
7237 _cleanup_free_
char *p
= NULL
;
7238 ExecDirectoryItem
*existing
;
7244 existing
= exec_directory_find(d
, path
);
7246 r
= strv_extend(&existing
->symlinks
, symlink
);
7250 return 0; /* existing item is updated */
7258 s
= strv_new(symlink
);
7263 if (!GREEDY_REALLOC(d
->items
, d
->n_items
+ 1))
/* TAKE_PTR() moves ownership into the new item, so the _cleanup_ handlers of p and s
 * have nothing left to free. */
7266 d
->items
[d
->n_items
++] = (ExecDirectoryItem
) {
7267 .path
= TAKE_PTR(p
),
7268 .symlinks
= TAKE_PTR(s
),
7271 return 1; /* new item is added */
/* Ordering callback for ExecDirectoryItem arrays: orders two items by their 'path' member, delegating
 * to path_compare(). Used as the comparator passed to typesafe_qsort() in exec_directory_sort().
 *
 * NOTE(review): lines between the signature and the return statement are elided in this listing, so
 * any argument checks are not visible here. */
7274 static int exec_directory_item_compare_func(const ExecDirectoryItem
*a
, const ExecDirectoryItem
*b
) {
7278 return path_compare(a
->path
, b
->path
);
/* Sorts the entries of 'd' by path and flags entries that live below another listed entry.
 *
 * After sorting with exec_directory_item_compare_func(), every item i is compared against all earlier
 * items j < i; if items[i].path starts with items[j].path (per path_startswith()), items[i] gets
 * .only_create set to true.
 *
 * NOTE(review): some original lines are elided in this listing (e.g. the statement guarded by the
 * "n_items <= 1" check and whatever follows the assignment inside the inner 'if'), so those details
 * are not visible here — presumably an early return and a loop exit; confirm against the full
 * source. */
7281 void exec_directory_sort(ExecDirectory
*d
) {
7284 /* Sort the exec directories to make always parent directories processed at first in
7285 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7286 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7287 * list. See also comments in setup_exec_directory() and issue #24783. */
/* Zero or one entry: nothing to order and no possible parent/child relation. */
7289 if (d
->n_items
<= 1)
7292 typesafe_qsort(d
->items
, d
->n_items
, exec_directory_item_compare_func
);
/* Quadratic scan over all (earlier, later) pairs of the sorted array. */
7294 for (size_t i
= 1; i
< d
->n_items
; i
++)
7295 for (size_t j
= 0; j
< i
; j
++)
7296 if (path_startswith(d
->items
[i
].path
, d
->items
[j
].path
)) {
7297 d
->items
[i
].only_create
= true;
/* Hash operation tables for the credential hashmaps. Both are parameterized with 'char' keys using
 * string_hash_func/string_compare_func, and name exec_set_credential_free() resp.
 * exec_load_credential_free() as the destructor for the stored values.
 * NOTE(review): the macro definition is not part of this chunk; when exactly the destructor is
 * invoked (entry removal, hashmap destruction, ...) should be confirmed in the hashmap headers. */
7302 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops
, char, string_hash_func
, string_compare_func
, ExecSetCredential
, exec_set_credential_free
);
7303 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops
, char, string_hash_func
, string_compare_func
, ExecLoadCredential
, exec_load_credential_free
);
/* Maps each ExecInput enum value to its textual name. The array is sized by _EXEC_INPUT_MAX and
 * filled with designated initializers, so the order of the entries below is irrelevant.
 * NOTE(review): the closing line of the initializer is elided in this listing. */
7305 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
7306 [EXEC_INPUT_NULL
] = "null",
7307 [EXEC_INPUT_TTY
] = "tty",
7308 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
7309 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
7310 [EXEC_INPUT_SOCKET
] = "socket",
7311 [EXEC_INPUT_NAMED_FD
] = "fd",
7312 [EXEC_INPUT_DATA
] = "data",
7313 [EXEC_INPUT_FILE
] = "file",
/* Instantiates the lookup helpers for the table above (presumably exec_input_to_string() and
 * exec_input_from_string() — the macro lives outside this chunk, see string-table.h; confirm). */
7316 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
/* Maps each ExecOutput enum value to its textual name. The array is sized by _EXEC_OUTPUT_MAX and
 * filled with designated initializers. Note that the three file-backed variants use distinct names:
 * "file", "append" and "truncate".
 * NOTE(review): the closing line of the initializer is elided in this listing. */
7318 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
7319 [EXEC_OUTPUT_INHERIT
] = "inherit",
7320 [EXEC_OUTPUT_NULL
] = "null",
7321 [EXEC_OUTPUT_TTY
] = "tty",
7322 [EXEC_OUTPUT_KMSG
] = "kmsg",
7323 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
7324 [EXEC_OUTPUT_JOURNAL
] = "journal",
7325 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
7326 [EXEC_OUTPUT_SOCKET
] = "socket",
7327 [EXEC_OUTPUT_NAMED_FD
] = "fd",
7328 [EXEC_OUTPUT_FILE
] = "file",
7329 [EXEC_OUTPUT_FILE_APPEND
] = "append",
7330 [EXEC_OUTPUT_FILE_TRUNCATE
] = "truncate",
/* Instantiates the lookup helpers for the table above (presumably exec_output_to_string() and
 * exec_output_from_string() — the macro lives outside this chunk, see string-table.h; confirm). */
7333 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
/* Maps each ExecUtmpMode enum value to its textual name; sized by _EXEC_UTMP_MODE_MAX.
 * NOTE(review): the closing line of the initializer is elided in this listing. */
7335 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
7336 [EXEC_UTMP_INIT
] = "init",
7337 [EXEC_UTMP_LOGIN
] = "login",
7338 [EXEC_UTMP_USER
] = "user",
/* Instantiates the lookup helpers for the table above (presumably exec_utmp_mode_to_string() and
 * exec_utmp_mode_from_string() — the macro lives outside this chunk; confirm). */
7341 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
/* Maps each ExecPreserveMode enum value to its textual name; sized by _EXEC_PRESERVE_MODE_MAX.
 * NOTE(review): the closing line of the initializer is elided in this listing. */
7343 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
7344 [EXEC_PRESERVE_NO
] = "no",
7345 [EXEC_PRESERVE_YES
] = "yes",
7346 [EXEC_PRESERVE_RESTART
] = "restart",
/* Unlike the other tables in this file this one uses the _WITH_BOOLEAN variant, with
 * EXEC_PRESERVE_YES as extra argument (presumably the value a generic boolean "true" string is
 * parsed to — the macro lives outside this chunk; confirm in string-table.h). */
7349 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
7351 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
/* The array is sized by _EXEC_DIRECTORY_TYPE_MAX; the same five enum values are used as indices by
 * the three sibling tables that follow in this file.
 * NOTE(review): the closing line of the initializer is elided in this listing. */
7352 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
7353 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
7354 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
7355 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
7356 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
7357 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
/* Instantiates the lookup helpers for the table above (presumably exec_directory_type_to_string()
 * and exec_directory_type_from_string() — the macro lives outside this chunk; confirm). */
7360 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);
7362 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
/* Each name is the corresponding entry of exec_directory_type_table with a "Symlink" suffix.
 * NOTE(review): the closing line of the initializer is elided in this listing. */
7363 static const char* const exec_directory_type_symlink_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
7364 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectorySymlink",
7365 [EXEC_DIRECTORY_STATE
] = "StateDirectorySymlink",
7366 [EXEC_DIRECTORY_CACHE
] = "CacheDirectorySymlink",
7367 [EXEC_DIRECTORY_LOGS
] = "LogsDirectorySymlink",
7368 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectorySymlink",
/* Instantiates the lookup helpers for the table above (presumably
 * exec_directory_type_symlink_to_string()/_from_string() — the macro lives outside this chunk;
 * confirm). */
7371 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink
, ExecDirectoryType
);
7373 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7374 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7375 * directories, specifically .timer units with their timestamp touch file. */
/* Indexed by the same ExecDirectoryType values as the tables above, but exposed under the separate
 * "exec_resource_type" name.
 * NOTE(review): the closing line of the initializer is elided in this listing. */
7376 static const char* const exec_resource_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
7377 [EXEC_DIRECTORY_RUNTIME
] = "runtime",
7378 [EXEC_DIRECTORY_STATE
] = "state",
7379 [EXEC_DIRECTORY_CACHE
] = "cache",
7380 [EXEC_DIRECTORY_LOGS
] = "logs",
7381 [EXEC_DIRECTORY_CONFIGURATION
] = "configuration",
/* Instantiates the lookup helpers for the table above (presumably exec_resource_type_to_string()
 * and exec_resource_type_from_string() — the macro lives outside this chunk; confirm). */
7384 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type
, ExecDirectoryType
);
7386 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7387 * the service payload in. */
/* NOTE(review): the closing line of the initializer is elided in this listing. */
7388 static const char* const exec_directory_env_name_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
7389 [EXEC_DIRECTORY_RUNTIME
] = "RUNTIME_DIRECTORY",
7390 [EXEC_DIRECTORY_STATE
] = "STATE_DIRECTORY",
7391 [EXEC_DIRECTORY_CACHE
] = "CACHE_DIRECTORY",
7392 [EXEC_DIRECTORY_LOGS
] = "LOGS_DIRECTORY",
7393 [EXEC_DIRECTORY_CONFIGURATION
] = "CONFIGURATION_DIRECTORY",
/* Unlike the sibling tables, this one uses the PRIVATE ..._TO_STRING macro variant (presumably only a
 * file-local exec_directory_env_name_to_string() is generated and no string-to-enum parser — the
 * macro lives outside this chunk; confirm in string-table.h). */
7396 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name
, ExecDirectoryType
);
/* Maps each ExecKeyringMode enum value to its textual name; sized by _EXEC_KEYRING_MODE_MAX.
 * NOTE(review): the closing line of the initializer is elided in this listing. */
7398 static const char* const exec_keyring_mode_table
[_EXEC_KEYRING_MODE_MAX
] = {
7399 [EXEC_KEYRING_INHERIT
] = "inherit",
7400 [EXEC_KEYRING_PRIVATE
] = "private",
7401 [EXEC_KEYRING_SHARED
] = "shared",
/* Instantiates the lookup helpers for the table above (presumably exec_keyring_mode_to_string() and
 * exec_keyring_mode_from_string() — the macro lives outside this chunk; confirm). */
7404 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode
, ExecKeyringMode
);