1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
6 #include <sys/eventfd.h>
10 #include <sys/personality.h>
11 #include <sys/prctl.h>
13 #include <sys/types.h>
18 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
21 #include <security/pam_appl.h>
25 #include <selinux/selinux.h>
33 #include <sys/apparmor.h>
36 #include "sd-messages.h"
40 #include "alloc-util.h"
42 #include "apparmor-util.h"
44 #include "argv-util.h"
48 #include "btrfs-util.h"
50 #include "capability-util.h"
51 #include "chattr-util.h"
52 #include "cgroup-setup.h"
54 #include "chown-recursive.h"
55 #include "constants.h"
56 #include "cpu-set-util.h"
57 #include "creds-util.h"
58 #include "data-fd-util.h"
61 #include "errno-list.h"
64 #include "exit-status.h"
67 #include "format-util.h"
68 #include "glob-util.h"
69 #include "hexdecoct.h"
71 #include "ioprio-util.h"
72 #include "label-util.h"
73 #include "lock-util.h"
77 #include "manager-dump.h"
78 #include "memory-util.h"
79 #include "missing_fs.h"
80 #include "missing_ioprio.h"
81 #include "missing_prctl.h"
82 #include "mkdir-label.h"
83 #include "mount-util.h"
84 #include "mountpoint-util.h"
85 #include "namespace.h"
86 #include "parse-util.h"
87 #include "path-util.h"
88 #include "proc-cmdline.h"
89 #include "process-util.h"
91 #include "random-util.h"
92 #include "recurse-dir.h"
93 #include "rlimit-util.h"
96 #include "seccomp-util.h"
98 #include "securebits-util.h"
99 #include "selinux-util.h"
100 #include "signal-util.h"
101 #include "smack-util.h"
102 #include "socket-util.h"
103 #include "sort-util.h"
105 #include "stat-util.h"
106 #include "string-table.h"
107 #include "string-util.h"
109 #include "syslog-util.h"
110 #include "terminal-util.h"
111 #include "tmpfile-util.h"
112 #include "umask-util.h"
113 #include "unit-serialize.h"
114 #include "user-util.h"
115 #include "utmp-wtmp.h"
117 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
118 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
120 #define SNDBUF_SIZE (8*1024*1024)
122 static int shift_fds(int fds
[], size_t n_fds
) {
126 /* Modifies the fds array! (sorts it) */
130 for (int start
= 0;;) {
131 int restart_from
= -1;
133 for (int i
= start
; i
< (int) n_fds
; i
++) {
136 /* Already at right index? */
140 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
147 /* Hmm, the fd we wanted isn't free? Then
148 * let's remember that and try again from here */
149 if (nfd
!= i
+3 && restart_from
< 0)
153 if (restart_from
< 0)
156 start
= restart_from
;
162 static int flags_fds(
175 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
176 * O_NONBLOCK only applies to socket activation though. */
178 for (size_t i
= 0; i
< n_fds
; i
++) {
180 if (i
< n_socket_fds
) {
181 r
= fd_nonblock(fds
[i
], nonblock
);
186 /* We unconditionally drop FD_CLOEXEC from the fds,
187 * since after all we want to pass these fds to our
190 r
= fd_cloexec(fds
[i
], false);
198 static const char *exec_context_tty_path(const ExecContext
*context
) {
201 if (context
->stdio_as_fds
)
204 if (context
->tty_path
)
205 return context
->tty_path
;
207 return "/dev/console";
210 static int exec_context_tty_size(const ExecContext
*context
, unsigned *ret_rows
, unsigned *ret_cols
) {
211 _cleanup_free_
char *rowskey
= NULL
, *rowsvalue
= NULL
, *colskey
= NULL
, *colsvalue
= NULL
;
220 rows
= context
->tty_rows
;
221 cols
= context
->tty_cols
;
223 tty
= exec_context_tty_path(context
);
224 if (!tty
|| (rows
!= UINT_MAX
&& cols
!= UINT_MAX
)) {
230 tty
= skip_dev_prefix(tty
);
231 if (!in_charset(tty
, ALPHANUMERICAL
)) {
232 log_debug("%s contains non-alphanumeric characters, ignoring", tty
);
238 rowskey
= strjoin("systemd.tty.rows.", tty
);
242 colskey
= strjoin("systemd.tty.columns.", tty
);
246 r
= proc_cmdline_get_key_many(/* flags = */ 0,
248 colskey
, &colsvalue
);
250 log_debug_errno(r
, "Failed to read TTY size of %s from kernel cmdline, ignoring: %m", tty
);
252 if (rows
== UINT_MAX
&& rowsvalue
) {
253 r
= safe_atou(rowsvalue
, &rows
);
255 log_debug_errno(r
, "Failed to parse %s=%s, ignoring: %m", rowskey
, rowsvalue
);
258 if (cols
== UINT_MAX
&& colsvalue
) {
259 r
= safe_atou(colsvalue
, &cols
);
261 log_debug_errno(r
, "Failed to parse %s=%s, ignoring: %m", colskey
, colsvalue
);
270 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
275 path
= exec_context_tty_path(context
);
277 if (context
->tty_vhangup
) {
278 if (p
&& p
->stdin_fd
>= 0)
279 (void) terminal_vhangup_fd(p
->stdin_fd
);
281 (void) terminal_vhangup(path
);
284 if (context
->tty_reset
) {
285 if (p
&& p
->stdin_fd
>= 0)
286 (void) reset_terminal_fd(p
->stdin_fd
, true);
288 (void) reset_terminal(path
);
291 if (p
&& p
->stdin_fd
>= 0) {
292 unsigned rows
= context
->tty_rows
, cols
= context
->tty_cols
;
294 (void) exec_context_tty_size(context
, &rows
, &cols
);
295 (void) terminal_set_size_fd(p
->stdin_fd
, path
, rows
, cols
);
298 if (context
->tty_vt_disallocate
&& path
)
299 (void) vt_disallocate(path
);
302 static bool is_terminal_input(ExecInput i
) {
305 EXEC_INPUT_TTY_FORCE
,
306 EXEC_INPUT_TTY_FAIL
);
309 static bool is_terminal_output(ExecOutput o
) {
312 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
313 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
316 static bool is_kmsg_output(ExecOutput o
) {
319 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
322 static bool exec_context_needs_term(const ExecContext
*c
) {
325 /* Return true if the execution context suggests we should set $TERM to something useful. */
327 if (is_terminal_input(c
->std_input
))
330 if (is_terminal_output(c
->std_output
))
333 if (is_terminal_output(c
->std_error
))
336 return !!c
->tty_path
;
339 static int open_null_as(int flags
, int nfd
) {
344 fd
= open("/dev/null", flags
|O_NOCTTY
);
348 return move_fd(fd
, nfd
, false);
351 static int connect_journal_socket(
353 const char *log_namespace
,
357 uid_t olduid
= UID_INVALID
;
358 gid_t oldgid
= GID_INVALID
;
363 strjoina("/run/systemd/journal.", log_namespace
, "/stdout") :
364 "/run/systemd/journal/stdout";
366 if (gid_is_valid(gid
)) {
369 if (setegid(gid
) < 0)
373 if (uid_is_valid(uid
)) {
376 if (seteuid(uid
) < 0) {
382 r
= connect_unix_path(fd
, AT_FDCWD
, j
);
384 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
385 an LSM interferes. */
387 if (uid_is_valid(uid
))
388 (void) seteuid(olduid
);
391 if (gid_is_valid(gid
))
392 (void) setegid(oldgid
);
397 static int connect_logger_as(
399 const ExecContext
*context
,
400 const ExecParameters
*params
,
407 _cleanup_close_
int fd
= -EBADF
;
412 assert(output
< _EXEC_OUTPUT_MAX
);
416 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
420 r
= connect_journal_socket(fd
, context
->log_namespace
, uid
, gid
);
424 if (shutdown(fd
, SHUT_RD
) < 0)
427 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
437 context
->syslog_identifier
?: ident
,
438 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
439 context
->syslog_priority
,
440 !!context
->syslog_level_prefix
,
442 is_kmsg_output(output
),
443 is_terminal_output(output
)) < 0)
446 return move_fd(TAKE_FD(fd
), nfd
, false);
449 static int open_terminal_as(const char *path
, int flags
, int nfd
) {
455 fd
= open_terminal(path
, flags
| O_NOCTTY
);
459 return move_fd(fd
, nfd
, false);
462 static int acquire_path(const char *path
, int flags
, mode_t mode
) {
463 _cleanup_close_
int fd
= -EBADF
;
468 if (IN_SET(flags
& O_ACCMODE
, O_WRONLY
, O_RDWR
))
471 fd
= open(path
, flags
|O_NOCTTY
, mode
);
475 if (errno
!= ENXIO
) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
478 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
480 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
484 r
= connect_unix_path(fd
, AT_FDCWD
, path
);
485 if (IN_SET(r
, -ENOTSOCK
, -EINVAL
))
486 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
487 * wasn't an AF_UNIX socket after all */
492 if ((flags
& O_ACCMODE
) == O_RDONLY
)
493 r
= shutdown(fd
, SHUT_WR
);
494 else if ((flags
& O_ACCMODE
) == O_WRONLY
)
495 r
= shutdown(fd
, SHUT_RD
);
504 static int fixup_input(
505 const ExecContext
*context
,
507 bool apply_tty_stdin
) {
513 std_input
= context
->std_input
;
515 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
516 return EXEC_INPUT_NULL
;
518 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
519 return EXEC_INPUT_NULL
;
521 if (std_input
== EXEC_INPUT_DATA
&& context
->stdin_data_size
== 0)
522 return EXEC_INPUT_NULL
;
527 static int fixup_output(ExecOutput output
, int socket_fd
) {
529 if (output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
530 return EXEC_OUTPUT_INHERIT
;
535 static int setup_input(
536 const ExecContext
*context
,
537 const ExecParameters
*params
,
539 const int named_iofds
[static 3]) {
548 if (params
->stdin_fd
>= 0) {
549 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
552 /* Try to make this the controlling tty, if it is a tty, and reset it */
553 if (isatty(STDIN_FILENO
)) {
554 unsigned rows
= context
->tty_rows
, cols
= context
->tty_cols
;
556 (void) exec_context_tty_size(context
, &rows
, &cols
);
557 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
558 (void) reset_terminal_fd(STDIN_FILENO
, true);
559 (void) terminal_set_size_fd(STDIN_FILENO
, NULL
, rows
, cols
);
565 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
569 case EXEC_INPUT_NULL
:
570 return open_null_as(O_RDONLY
, STDIN_FILENO
);
573 case EXEC_INPUT_TTY_FORCE
:
574 case EXEC_INPUT_TTY_FAIL
: {
578 fd
= acquire_terminal(exec_context_tty_path(context
),
579 i
== EXEC_INPUT_TTY_FAIL
? ACQUIRE_TERMINAL_TRY
:
580 i
== EXEC_INPUT_TTY_FORCE
? ACQUIRE_TERMINAL_FORCE
:
581 ACQUIRE_TERMINAL_WAIT
,
586 r
= exec_context_tty_size(context
, &rows
, &cols
);
590 r
= terminal_set_size_fd(fd
, exec_context_tty_path(context
), rows
, cols
);
594 return move_fd(fd
, STDIN_FILENO
, false);
597 case EXEC_INPUT_SOCKET
:
598 assert(socket_fd
>= 0);
600 return RET_NERRNO(dup2(socket_fd
, STDIN_FILENO
));
602 case EXEC_INPUT_NAMED_FD
:
603 assert(named_iofds
[STDIN_FILENO
] >= 0);
605 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
606 return RET_NERRNO(dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
));
608 case EXEC_INPUT_DATA
: {
611 fd
= acquire_data_fd(context
->stdin_data
, context
->stdin_data_size
, 0);
615 return move_fd(fd
, STDIN_FILENO
, false);
618 case EXEC_INPUT_FILE
: {
622 assert(context
->stdio_file
[STDIN_FILENO
]);
624 rw
= (context
->std_output
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDOUT_FILENO
])) ||
625 (context
->std_error
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDERR_FILENO
]));
627 fd
= acquire_path(context
->stdio_file
[STDIN_FILENO
], rw
? O_RDWR
: O_RDONLY
, 0666 & ~context
->umask
);
631 return move_fd(fd
, STDIN_FILENO
, false);
635 assert_not_reached();
639 static bool can_inherit_stderr_from_stdout(
640 const ExecContext
*context
,
646 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
649 if (e
== EXEC_OUTPUT_INHERIT
)
654 if (e
== EXEC_OUTPUT_NAMED_FD
)
655 return streq_ptr(context
->stdio_fdname
[STDOUT_FILENO
], context
->stdio_fdname
[STDERR_FILENO
]);
657 if (IN_SET(e
, EXEC_OUTPUT_FILE
, EXEC_OUTPUT_FILE_APPEND
, EXEC_OUTPUT_FILE_TRUNCATE
))
658 return streq_ptr(context
->stdio_file
[STDOUT_FILENO
], context
->stdio_file
[STDERR_FILENO
]);
663 static int setup_output(
665 const ExecContext
*context
,
666 const ExecParameters
*params
,
669 const int named_iofds
[static 3],
673 dev_t
*journal_stream_dev
,
674 ino_t
*journal_stream_ino
) {
684 assert(journal_stream_dev
);
685 assert(journal_stream_ino
);
687 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
689 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
692 return STDOUT_FILENO
;
695 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
696 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
699 return STDERR_FILENO
;
702 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
703 o
= fixup_output(context
->std_output
, socket_fd
);
705 if (fileno
== STDERR_FILENO
) {
707 e
= fixup_output(context
->std_error
, socket_fd
);
709 /* This expects the input and output are already set up */
711 /* Don't change the stderr file descriptor if we inherit all
712 * the way and are not on a tty */
713 if (e
== EXEC_OUTPUT_INHERIT
&&
714 o
== EXEC_OUTPUT_INHERIT
&&
715 i
== EXEC_INPUT_NULL
&&
716 !is_terminal_input(context
->std_input
) &&
720 /* Duplicate from stdout if possible */
721 if (can_inherit_stderr_from_stdout(context
, o
, e
))
722 return RET_NERRNO(dup2(STDOUT_FILENO
, fileno
));
726 } else if (o
== EXEC_OUTPUT_INHERIT
) {
727 /* If input got downgraded, inherit the original value */
728 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
729 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
731 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
732 if (!IN_SET(i
, EXEC_INPUT_NULL
, EXEC_INPUT_DATA
))
733 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
735 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
739 /* We need to open /dev/null here anew, to get the right access mode. */
740 return open_null_as(O_WRONLY
, fileno
);
745 case EXEC_OUTPUT_NULL
:
746 return open_null_as(O_WRONLY
, fileno
);
748 case EXEC_OUTPUT_TTY
:
749 if (is_terminal_input(i
))
750 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
752 /* We don't reset the terminal if this is just about output */
753 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
755 case EXEC_OUTPUT_KMSG
:
756 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
757 case EXEC_OUTPUT_JOURNAL
:
758 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
759 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
761 log_unit_warning_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m",
762 fileno
== STDOUT_FILENO
? "stdout" : "stderr");
763 r
= open_null_as(O_WRONLY
, fileno
);
767 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
768 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
769 * services to detect whether they are connected to the journal or not.
771 * If both stdout and stderr are connected to a stream then let's make sure to store the data
772 * about STDERR as that's usually the best way to do logging. */
774 if (fstat(fileno
, &st
) >= 0 &&
775 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
776 *journal_stream_dev
= st
.st_dev
;
777 *journal_stream_ino
= st
.st_ino
;
782 case EXEC_OUTPUT_SOCKET
:
783 assert(socket_fd
>= 0);
785 return RET_NERRNO(dup2(socket_fd
, fileno
));
787 case EXEC_OUTPUT_NAMED_FD
:
788 assert(named_iofds
[fileno
] >= 0);
790 (void) fd_nonblock(named_iofds
[fileno
], false);
791 return RET_NERRNO(dup2(named_iofds
[fileno
], fileno
));
793 case EXEC_OUTPUT_FILE
:
794 case EXEC_OUTPUT_FILE_APPEND
:
795 case EXEC_OUTPUT_FILE_TRUNCATE
: {
799 assert(context
->stdio_file
[fileno
]);
801 rw
= context
->std_input
== EXEC_INPUT_FILE
&&
802 streq_ptr(context
->stdio_file
[fileno
], context
->stdio_file
[STDIN_FILENO
]);
805 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
808 if (o
== EXEC_OUTPUT_FILE_APPEND
)
810 else if (o
== EXEC_OUTPUT_FILE_TRUNCATE
)
813 fd
= acquire_path(context
->stdio_file
[fileno
], flags
, 0666 & ~context
->umask
);
817 return move_fd(fd
, fileno
, 0);
821 assert_not_reached();
825 static int chown_terminal(int fd
, uid_t uid
) {
830 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
831 if (isatty(fd
) < 1) {
832 if (IN_SET(errno
, EINVAL
, ENOTTY
))
833 return 0; /* not a tty */
838 /* This might fail. What matters are the results. */
839 r
= fchmod_and_chown(fd
, TTY_MODE
, uid
, GID_INVALID
);
846 static int setup_confirm_stdio(
847 const ExecContext
*context
,
849 int *ret_saved_stdin
,
850 int *ret_saved_stdout
) {
852 _cleanup_close_
int fd
= -EBADF
, saved_stdin
= -EBADF
, saved_stdout
= -EBADF
;
856 assert(ret_saved_stdin
);
857 assert(ret_saved_stdout
);
859 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
863 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
864 if (saved_stdout
< 0)
867 fd
= acquire_terminal(vc
, ACQUIRE_TERMINAL_WAIT
, DEFAULT_CONFIRM_USEC
);
871 r
= chown_terminal(fd
, getuid());
875 r
= reset_terminal_fd(fd
, true);
879 r
= exec_context_tty_size(context
, &rows
, &cols
);
883 r
= terminal_set_size_fd(fd
, vc
, rows
, cols
);
887 r
= rearrange_stdio(fd
, fd
, STDERR_FILENO
); /* Invalidates 'fd' also on failure */
892 *ret_saved_stdin
= TAKE_FD(saved_stdin
);
893 *ret_saved_stdout
= TAKE_FD(saved_stdout
);
897 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
900 if (err
== -ETIMEDOUT
)
901 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
904 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
908 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
909 _cleanup_close_
int fd
= -EBADF
;
913 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
917 write_confirm_error_fd(err
, fd
, u
);
920 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
924 assert(saved_stdout
);
928 if (*saved_stdin
>= 0)
929 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
932 if (*saved_stdout
>= 0)
933 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
936 *saved_stdin
= safe_close(*saved_stdin
);
937 *saved_stdout
= safe_close(*saved_stdout
);
943 CONFIRM_PRETEND_FAILURE
= -1,
944 CONFIRM_PRETEND_SUCCESS
= 0,
948 static int ask_for_confirmation(const ExecContext
*context
, const char *vc
, Unit
*u
, const char *cmdline
) {
949 int saved_stdout
= -1, saved_stdin
= -1, r
;
950 _cleanup_free_
char *e
= NULL
;
953 /* For any internal errors, assume a positive response. */
954 r
= setup_confirm_stdio(context
, vc
, &saved_stdin
, &saved_stdout
);
956 write_confirm_error(r
, vc
, u
);
957 return CONFIRM_EXECUTE
;
960 /* confirm_spawn might have been disabled while we were sleeping. */
961 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
966 e
= ellipsize(cmdline
, 60, 100);
974 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
976 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
983 printf("Resuming normal execution.\n");
984 manager_disable_confirm_spawn();
988 unit_dump(u
, stdout
, " ");
989 continue; /* ask again */
991 printf("Failing execution.\n");
992 r
= CONFIRM_PRETEND_FAILURE
;
995 printf(" c - continue, proceed without asking anymore\n"
996 " D - dump, show the state of the unit\n"
997 " f - fail, don't execute the command and pretend it failed\n"
999 " i - info, show a short summary of the unit\n"
1000 " j - jobs, show jobs that are in progress\n"
1001 " s - skip, don't execute the command and pretend it succeeded\n"
1002 " y - yes, execute the command\n");
1003 continue; /* ask again */
1005 printf(" Description: %s\n"
1008 u
->id
, u
->description
, cmdline
);
1009 continue; /* ask again */
1011 manager_dump_jobs(u
->manager
, stdout
, /* patterns= */ NULL
, " ");
1012 continue; /* ask again */
1014 /* 'n' was removed in favor of 'f'. */
1015 printf("Didn't understand 'n', did you mean 'f'?\n");
1016 continue; /* ask again */
1018 printf("Skipping execution.\n");
1019 r
= CONFIRM_PRETEND_SUCCESS
;
1022 r
= CONFIRM_EXECUTE
;
1025 assert_not_reached();
1031 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
1035 static int get_fixed_user(const ExecContext
*c
, const char **user
,
1036 uid_t
*uid
, gid_t
*gid
,
1037 const char **home
, const char **shell
) {
1046 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1047 * (i.e. are "/" or "/bin/nologin"). */
1050 r
= get_user_creds(&name
, uid
, gid
, home
, shell
, USER_CREDS_CLEAN
);
1058 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
1068 r
= get_group_creds(&name
, gid
, 0);
1076 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
1077 const char *group
, gid_t gid
,
1078 gid_t
**supplementary_gids
, int *ngids
) {
1081 bool keep_groups
= false;
1082 gid_t
*groups
= NULL
;
1083 _cleanup_free_ gid_t
*l_gids
= NULL
;
1088 * If user is given, then lookup GID and supplementary groups list.
1089 * We avoid NSS lookups for gid=0. Also we have to initialize groups
1090 * here and as early as possible so we keep the list of supplementary
1091 * groups of the caller.
1093 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
1094 /* First step, initialize groups from /etc/group */
1095 if (initgroups(user
, gid
) < 0)
1101 if (strv_isempty(c
->supplementary_groups
))
1105 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1106 * be positive, otherwise fail.
1109 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
1110 if (ngroups_max
<= 0)
1111 return errno_or_else(EOPNOTSUPP
);
1113 l_gids
= new(gid_t
, ngroups_max
);
1119 * Lookup the list of groups that the user belongs to, we
1120 * avoid NSS lookups here too for gid=0.
1123 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
1128 STRV_FOREACH(i
, c
->supplementary_groups
) {
1131 if (k
>= ngroups_max
)
1135 r
= get_group_creds(&g
, l_gids
+k
, 0);
1143 * Sets ngids to zero to drop all supplementary groups, happens
1144 * when we are under root and SupplementaryGroups= is empty.
1151 /* Otherwise get the final list of supplementary groups */
1152 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
1156 *supplementary_gids
= groups
;
1164 static int enforce_groups(gid_t gid
, const gid_t
*supplementary_gids
, int ngids
) {
1167 /* Handle SupplementaryGroups= if it is not empty */
1169 r
= maybe_setgroups(ngids
, supplementary_gids
);
1174 if (gid_is_valid(gid
)) {
1175 /* Then set our gids */
1176 if (setresgid(gid
, gid
, gid
) < 0)
1183 static int set_securebits(unsigned bits
, unsigned mask
) {
1187 current
= prctl(PR_GET_SECUREBITS
);
1191 /* Clear all securebits defined in mask and set bits */
1192 applied
= ((unsigned) current
& ~mask
) | bits
;
1193 if ((unsigned) current
== applied
)
1196 if (prctl(PR_SET_SECUREBITS
, applied
) < 0)
1202 static int enforce_user(
1203 const ExecContext
*context
,
1205 uint64_t capability_ambient_set
) {
1209 if (!uid_is_valid(uid
))
1212 /* Sets (but doesn't look up) the UID and makes sure we keep the capabilities while doing so. For
1213 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1216 if ((capability_ambient_set
!= 0 || context
->secure_bits
!= 0) && uid
!= 0) {
1218 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1219 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1220 r
= set_securebits(1U << SECURE_KEEP_CAPS
, 0);
1225 /* Second step: actually set the uids */
1226 if (setresuid(uid
, uid
, uid
) < 0)
1229 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1230 * the caps might have been corrupted by the setresuid(), so we need to clean them up later. This is done
1231 * outside of this call. */
1237 static int null_conv(
1239 const struct pam_message
**msg
,
1240 struct pam_response
**resp
,
1241 void *appdata_ptr
) {
1243 /* We don't support conversations */
1245 return PAM_CONV_ERR
;
1250 static int setup_pam(
1256 char ***env
, /* updated on success */
1257 const int fds
[], size_t n_fds
) {
1261 static const struct pam_conv conv
= {
1266 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1267 _cleanup_strv_free_
char **e
= NULL
;
1268 pam_handle_t
*handle
= NULL
;
1270 int pam_code
= PAM_SUCCESS
, r
;
1271 bool close_session
= false;
1272 pid_t pam_pid
= 0, parent_pid
;
1279 /* We set up PAM in the parent process, then fork. The child
1280 * will then stay around until killed via PR_SET_PDEATHSIG or
1281 * systemd via the cgroup logic. It will then remove the PAM
1282 * session again. The parent process will exec() the actual
1283 * daemon. We do things this way to ensure that the main PID
1284 * of the daemon is the one we initially fork()ed. */
1286 r
= barrier_create(&barrier
);
1290 if (log_get_max_level() < LOG_DEBUG
)
1291 flags
|= PAM_SILENT
;
1293 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1294 if (pam_code
!= PAM_SUCCESS
) {
1300 _cleanup_free_
char *q
= NULL
;
1302 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1303 * out if that's the case, and read the TTY off it. */
1305 if (getttyname_malloc(STDIN_FILENO
, &q
) >= 0)
1306 tty
= strjoina("/dev/", q
);
1310 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1311 if (pam_code
!= PAM_SUCCESS
)
1315 STRV_FOREACH(nv
, *env
) {
1316 pam_code
= pam_putenv(handle
, *nv
);
1317 if (pam_code
!= PAM_SUCCESS
)
1321 pam_code
= pam_acct_mgmt(handle
, flags
);
1322 if (pam_code
!= PAM_SUCCESS
)
1325 pam_code
= pam_setcred(handle
, PAM_ESTABLISH_CRED
| flags
);
1326 if (pam_code
!= PAM_SUCCESS
)
1327 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle
, pam_code
));
1329 pam_code
= pam_open_session(handle
, flags
);
1330 if (pam_code
!= PAM_SUCCESS
)
1333 close_session
= true;
1335 e
= pam_getenvlist(handle
);
1337 pam_code
= PAM_BUF_ERR
;
1341 /* Block SIGTERM, so that we know that it won't get lost in the child */
1343 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1345 parent_pid
= getpid_cached();
1347 r
= safe_fork("(sd-pam)", 0, &pam_pid
);
1351 int sig
, ret
= EXIT_PAM
;
1353 /* The child's job is to reset the PAM session on termination */
1354 barrier_set_role(&barrier
, BARRIER_CHILD
);
1356 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1357 * those fds are open here that have been opened by PAM. */
1358 (void) close_many(fds
, n_fds
);
1360 /* Drop privileges - we don't need any to pam_close_session and this will make
1361 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1362 * threads to fail to exit normally */
1364 r
= maybe_setgroups(0, NULL
);
1366 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1367 if (setresgid(gid
, gid
, gid
) < 0)
1368 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1369 if (setresuid(uid
, uid
, uid
) < 0)
1370 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1372 (void) ignore_signals(SIGPIPE
);
1374 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1375 * otherwise the kernel will not allow unprivileged parents kill their privileged children
1376 * this way. We rely on the control groups kill logic to do the rest for us. */
1377 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1380 /* Tell the parent that our setup is done. This is especially important regarding dropping
1381 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1383 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1384 (void) barrier_place(&barrier
);
1386 /* Check if our parent process might already have died? */
1387 if (getppid() == parent_pid
) {
1390 assert_se(sigemptyset(&ss
) >= 0);
1391 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1394 if (sigwait(&ss
, &sig
) < 0) {
1401 assert(sig
== SIGTERM
);
1406 pam_code
= pam_setcred(handle
, PAM_DELETE_CRED
| flags
);
1407 if (pam_code
!= PAM_SUCCESS
)
1410 /* If our parent died we'll end the session */
1411 if (getppid() != parent_pid
) {
1412 pam_code
= pam_close_session(handle
, flags
);
1413 if (pam_code
!= PAM_SUCCESS
)
1420 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1421 * know about this. See pam_end(3) */
1422 (void) pam_end(handle
, pam_code
| flags
| PAM_DATA_SILENT
);
1426 barrier_set_role(&barrier
, BARRIER_PARENT
);
1428 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1432 /* Unblock SIGTERM again in the parent */
1433 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1435 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1436 * this fd around. */
1439 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1440 * recover. However, warn loudly if it happens. */
1441 if (!barrier_place_and_sync(&barrier
))
1442 log_error("PAM initialization failed");
1444 return strv_free_and_replace(*env
, e
);
1447 if (pam_code
!= PAM_SUCCESS
) {
1448 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1449 r
= -EPERM
; /* PAM errors do not map to errno */
1451 log_error_errno(r
, "PAM failed: %m");
1455 pam_code
= pam_close_session(handle
, flags
);
1457 (void) pam_end(handle
, pam_code
| flags
);
/* Renames the current process after the file name component of 'path', wrapped in parentheses. If
 * path_extract_filename() fails, the fixed name "(...)" is used instead.
 *
 * The name is assembled in an 11 byte buffer as '(' + l bytes copied from p + ')' + NUL, hence l must not
 * exceed 8 by the time the memcpy() runs.
 *
 * NOTE(review): this listing drops source lines (the embedded line numbers skip). The declaration of 'p'
 * and whatever limits 'l' are not visible here; confirm against the complete file. */
1467 static void rename_process_from_path(const char *path
) {
1468 _cleanup_free_
char *buf
= NULL
;
1473 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1476 if (path_extract_filename(path
, &buf
) < 0) {
1477 rename_process("(...)");
1481 size_t l
= strlen(buf
);
1483 /* The end of the process name is usually more interesting, since the first bit might just be
1490 char process_name
[11];
1491 process_name
[0] = '(';
1492 memcpy(process_name
+1, p
, l
);
1493 process_name
[1+l
] = ')';
1494 process_name
[1+l
+1] = 0;
1496 rename_process(process_name
);
/* Returns true when the context configures address family restrictions, i.e. when the allow-list flag is
 * set or the address_families set is non-empty. */
1499 static bool context_has_address_families(const ExecContext
*c
) {
1502 return c
->address_families_allow_list
||
1503 !set_isempty(c
->address_families
);
/* Returns true when the context configures a system call filter, i.e. when the allow-list flag is set or
 * the syscall_filter hashmap is non-empty. */
1506 static bool context_has_syscall_filters(const ExecContext
*c
) {
1509 return c
->syscall_allow_list
||
1510 !hashmap_isempty(c
->syscall_filter
);
/* Returns true when the context configures system call logging, i.e. when the log allow-list flag is set
 * or the syscall_log hashmap is non-empty. */
1513 static bool context_has_syscall_logs(const ExecContext
*c
) {
1516 return c
->syscall_log_allow_list
||
1517 !hashmap_isempty(c
->syscall_log
);
/* Determines whether the "no new privileges" flag is needed for this context. The visible checks are, in
 * order: the explicit c->no_new_privileges setting, whether CAP_SYS_ADMIN is in our effective set, and
 * finally whether any of the seccomp-backed settings listed in the return expression is enabled.
 *
 * NOTE(review): the statements following the first two checks are not part of this listing; presumably
 * they return true and false respectively. Confirm against the complete file. */
1520 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1523 if (c
->no_new_privileges
)
1526 if (have_effective_cap(CAP_SYS_ADMIN
) > 0) /* if we are privileged, we don't need NNP */
1529 /* We need NNP if we have any form of seccomp and are unprivileged */
1530 return c
->lock_personality
||
1531 c
->memory_deny_write_execute
||
1532 c
->private_devices
||
1534 c
->protect_hostname
||
1535 c
->protect_kernel_tunables
||
1536 c
->protect_kernel_modules
||
1537 c
->protect_kernel_logs
||
1538 context_has_address_families(c
) ||
1539 exec_context_restrict_namespaces_set(c
) ||
1540 c
->restrict_realtime
||
1541 c
->restrict_suid_sgid
||
1542 !set_isempty(c
->syscall_archs
) ||
1543 context_has_syscall_filters(c
) ||
1544 context_has_syscall_logs(c
);
/* Returns true if the context carries any credential configuration: a non-empty set_credentials or
 * load_credentials hashmap, or a non-empty import_credentials set. */
1547 bool exec_context_has_credentials(const ExecContext
*context
) {
1551 return !hashmap_isempty(context
->set_credentials
) ||
1552 !hashmap_isempty(context
->load_credentials
) ||
1553 !set_isempty(context
->import_credentials
);
/* Checks is_seccomp_available() and, when seccomp was not detected, logs at debug level that the feature
 * named by 'msg' is being skipped. The apply_*() helpers in this file call it as a guard right before
 * touching seccomp.
 *
 * NOTE(review): the return statements are not visible in this listing; presumably the result is true
 * exactly when the caller should skip the setting. Confirm against the complete file. */
1558 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1560 if (is_seccomp_available())
1563 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
/* Installs the SystemCallFilter= seccomp filter of the context.
 *
 * negative_action is "kill process" when c->syscall_errno equals SECCOMP_ERROR_NUMBER_KILL, and
 * SCMP_ACT_ERRNO(c->syscall_errno) otherwise. In allow-list mode the default action is the negative action
 * and listed calls are allowed; otherwise the default is to allow and listed calls get the negative action.
 * When needs_ambient_hack is set, the SYSCALL_FILTER_SET_SETUID set is added to c->syscall_filter before
 * the filter is loaded.
 *
 * NOTE(review): the declaration of 'r', the early exits after the two guards and the error check after
 * seccomp_filter_set_add() are not visible in this listing. */
1567 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1568 uint32_t negative_action
, default_action
, action
;
1574 if (!context_has_syscall_filters(c
))
1577 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1580 negative_action
= c
->syscall_errno
== SECCOMP_ERROR_NUMBER_KILL
? scmp_act_kill_process() : SCMP_ACT_ERRNO(c
->syscall_errno
);
1582 if (c
->syscall_allow_list
) {
1583 default_action
= negative_action
;
1584 action
= SCMP_ACT_ALLOW
;
1586 default_action
= SCMP_ACT_ALLOW
;
1587 action
= negative_action
;
1590 if (needs_ambient_hack
) {
1591 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_allow_list
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1596 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
, false);
/* Installs the SystemCallLog= seccomp filter of the context. In allow-list mode only the listed calls are
 * logged (default SCMP_ACT_ALLOW, listed SCMP_ACT_LOG); otherwise everything except the listed calls is
 * logged (default SCMP_ACT_LOG, listed SCMP_ACT_ALLOW). The trailing debug message covers a libseccomp
 * that lacks SCMP_ACT_LOG.
 *
 * NOTE(review): the early exits after the guards and whatever selects between the filter path and the
 * "old libseccomp" path are not visible in this listing. */
1599 static int apply_syscall_log(const Unit
* u
, const ExecContext
*c
) {
1601 uint32_t default_action
, action
;
1607 if (!context_has_syscall_logs(c
))
1611 if (skip_seccomp_unavailable(u
, "SystemCallLog="))
1614 if (c
->syscall_log_allow_list
) {
1615 /* Log nothing but the ones listed */
1616 default_action
= SCMP_ACT_ALLOW
;
1617 action
= SCMP_ACT_LOG
;
1619 /* Log everything but the ones listed */
1620 default_action
= SCMP_ACT_LOG
;
1621 action
= SCMP_ACT_ALLOW
;
1624 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_log
, action
, false);
1626 /* old libseccomp */
1627 log_unit_debug(u
, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
/* Applies SystemCallArchitectures=: after guarding on an empty c->syscall_archs set and on seccomp
 * availability, returns the result of seccomp_restrict_archs() for the configured set.
 * NOTE(review): the statements executed when a guard matches are not visible in this listing. */
1632 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1636 if (set_isempty(c
->syscall_archs
))
1639 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1642 return seccomp_restrict_archs(c
->syscall_archs
);
/* Applies RestrictAddressFamilies=: after guarding on whether any address family restriction is configured
 * and on seccomp availability, returns the result of seccomp_restrict_address_families() for the
 * configured set and allow-list flag.
 * NOTE(review): the statements executed when a guard matches are not visible in this listing. */
1645 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1649 if (!context_has_address_families(c
))
1652 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1655 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_allow_list
);
/* Applies MemoryDenyWriteExecute=. prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN) is tried first. A failure
 * with an errno other than EINVAL is logged and returned; EINVAL is taken to mean that the kernel does not
 * know PR_SET_MDWE, in which case the seccomp based implementation is used instead (subject to the usual
 * seccomp availability guard).
 * NOTE(review): the declaration of 'r', the success check around the first debug message and the
 * statements executed when a guard matches are not visible in this listing. */
1658 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1664 if (!c
->memory_deny_write_execute
)
1667 /* use prctl() if kernel supports it (6.3) */
1668 r
= prctl(PR_SET_MDWE
, PR_MDWE_REFUSE_EXEC_GAIN
, 0, 0, 0);
1670 log_unit_debug(u
, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1673 if (r
< 0 && errno
!= EINVAL
)
1674 return log_unit_debug_errno(u
, errno
, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1675 /* else use seccomp */
1676 log_unit_debug(u
, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1678 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1681 return seccomp_memory_deny_write_execute();
/* Applies RestrictRealtime=: after guarding on the c->restrict_realtime flag and on seccomp availability,
 * returns the result of seccomp_restrict_realtime().
 * NOTE(review): the statements executed when a guard matches are not visible in this listing. */
1684 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1688 if (!c
->restrict_realtime
)
1691 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1694 return seccomp_restrict_realtime();
/* Applies RestrictSUIDSGID=: after guarding on the c->restrict_suid_sgid flag and on seccomp availability,
 * returns the result of seccomp_restrict_suid_sgid().
 * NOTE(review): the statements executed when a guard matches are not visible in this listing. */
1697 static int apply_restrict_suid_sgid(const Unit
* u
, const ExecContext
*c
) {
1701 if (!c
->restrict_suid_sgid
)
1704 if (skip_seccomp_unavailable(u
, "RestrictSUIDSGID="))
1707 return seccomp_restrict_suid_sgid();
/* Seccomp part of ProtectKernelTunables=: after guarding on the c->protect_kernel_tunables flag and on
 * seccomp availability, returns the result of seccomp_protect_sysctl().
 * NOTE(review): the statements executed when a guard matches are not visible in this listing. */
1710 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1714 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1715 * let's protect even those systems where this is left on in the kernel. */
1717 if (!c
->protect_kernel_tunables
)
1720 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1723 return seccomp_protect_sysctl();
/* Seccomp part of ProtectKernelModules=: after guarding on the c->protect_kernel_modules flag and on
 * seccomp availability, loads the SYSCALL_FILTER_SET_MODULE set with default action SCMP_ACT_ALLOW and
 * SCMP_ACT_ERRNO(EPERM) for the calls in the set.
 * NOTE(review): the statements executed when a guard matches are not visible in this listing. */
1726 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1730 /* Turn off module syscalls on ProtectKernelModules=yes */
1732 if (!c
->protect_kernel_modules
)
1735 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1738 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
), false);
/* Seccomp part of ProtectKernelLogs=: after guarding on the c->protect_kernel_logs flag and on seccomp
 * availability, returns the result of seccomp_protect_syslog().
 * NOTE(review): the statements executed when a guard matches are not visible in this listing. */
1741 static int apply_protect_kernel_logs(const Unit
*u
, const ExecContext
*c
) {
1745 if (!c
->protect_kernel_logs
)
1748 if (skip_seccomp_unavailable(u
, "ProtectKernelLogs="))
1751 return seccomp_protect_syslog();
/* Seccomp part of ProtectClock=: after guarding on the c->protect_clock flag and on seccomp availability,
 * loads the SYSCALL_FILTER_SET_CLOCK set with default action SCMP_ACT_ALLOW and SCMP_ACT_ERRNO(EPERM) for
 * the calls in the set.
 * NOTE(review): the statements executed when a guard matches are not visible in this listing. */
1754 static int apply_protect_clock(const Unit
*u
, const ExecContext
*c
) {
1758 if (!c
->protect_clock
)
1761 if (skip_seccomp_unavailable(u
, "ProtectClock="))
1764 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_CLOCK
, SCMP_ACT_ERRNO(EPERM
), false);
/* Seccomp part of PrivateDevices=: after guarding on the c->private_devices flag and on seccomp
 * availability, loads the SYSCALL_FILTER_SET_RAW_IO set with default action SCMP_ACT_ALLOW and
 * SCMP_ACT_ERRNO(EPERM) for the calls in the set.
 * NOTE(review): the statements executed when a guard matches are not visible in this listing. */
1767 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1771 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1773 if (!c
->private_devices
)
1776 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1779 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
), false);
/* Applies RestrictNamespaces=: after guarding on exec_context_restrict_namespaces_set() and on seccomp
 * availability, returns the result of seccomp_restrict_namespaces() for c->restrict_namespaces.
 * NOTE(review): the statements executed when a guard matches are not visible in this listing. */
1782 static int apply_restrict_namespaces(const Unit
*u
, const ExecContext
*c
) {
1786 if (!exec_context_restrict_namespaces_set(c
))
1789 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1792 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
/* Applies LockPersonality=: after guarding on the c->lock_personality flag and on seccomp availability,
 * locks the personality to c->personality, or, if that is PERSONALITY_INVALID, to whatever
 * opinionated_personality() reports.
 * NOTE(review): the declaration of 'r', the error check after opinionated_personality() and the statements
 * executed when a guard matches are not visible in this listing. */
1795 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1796 unsigned long personality
;
1802 if (!c
->lock_personality
)
1805 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1808 personality
= c
->personality
;
1810 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1811 if (personality
== PERSONALITY_INVALID
) {
1813 r
= opinionated_personality(&personality
);
1818 return seccomp_lock_personality(personality
);
/* Applies RestrictFileSystems= through LSM BPF. Guards on exec_context_restrict_filesystems_set(); when
 * u->manager->restrict_fs is unset a debug message is logged instead of applying anything. Otherwise
 * returns the result of lsm_bpf_unit_restrict_filesystems() for the configured set and allow-list flag.
 * NOTE(review): the statements executed when a guard matches are not visible in this listing. */
1824 static int apply_restrict_filesystems(Unit
*u
, const ExecContext
*c
) {
1828 if (!exec_context_restrict_filesystems_set(c
))
1831 if (!u
->manager
->restrict_fs
) {
1832 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1833 log_unit_debug(u
, "LSM BPF not supported, skipping RestrictFileSystems=");
1837 return lsm_bpf_unit_restrict_filesystems(u
, c
->restrict_filesystems
, c
->restrict_filesystems_allow_list
);
/* Applies ProtectHostname=. If UTS namespaces are supported, a new one is unshared; an unshare() failure
 * that is neither a "not supported" nor a privilege error sets *ret_exit_status to EXIT_NAMESPACE and is
 * returned, while the tolerated failures (and a kernel without UTS namespaces) only produce a warning.
 * Afterwards the seccomp hostname protection is installed; the visible error path for it sets
 * *ret_exit_status to EXIT_SECCOMP.
 * NOTE(review): the declaration of 'r', the check around the seccomp error path and the statements executed
 * when a guard matches are not visible in this listing. */
1841 static int apply_protect_hostname(const Unit
*u
, const ExecContext
*c
, int *ret_exit_status
) {
1845 if (!c
->protect_hostname
)
1848 if (ns_type_supported(NAMESPACE_UTS
)) {
1849 if (unshare(CLONE_NEWUTS
) < 0) {
1850 if (!ERRNO_IS_NOT_SUPPORTED(errno
) && !ERRNO_IS_PRIVILEGE(errno
)) {
1851 *ret_exit_status
= EXIT_NAMESPACE
;
1852 return log_unit_error_errno(u
, errno
, "Failed to set up UTS namespacing: %m");
1855 log_unit_warning(u
, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1858 log_unit_warning(u
, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1863 if (skip_seccomp_unavailable(u
, "ProtectHostname="))
1866 r
= seccomp_protect_hostname();
1868 *ret_exit_status
= EXIT_SECCOMP
;
1869 return log_unit_error_errno(u
, r
, "Failed to apply hostname restrictions: %m");
/* Runs the idle pipe handshake on the four descriptors in idle_pipe[]. Entries [1] and [2] are closed right
 * away. If [0] is open we wait up to IDLE_TIMEOUT_USEC for POLLHUP on it; on timeout, and if [3] is open, a
 * single byte "x" is written to [3] and we wait up to IDLE_TIMEOUT2_USEC more for POLLHUP on [0]. Finally
 * [0] and [3] are closed as well. Each closed slot is overwritten with safe_close()'s return value.
 * NOTE(review): the declarations of 'r' and 'n' and any handling of the write() result are not visible in
 * this listing. */
1876 static void do_idle_pipe_dance(int idle_pipe
[static 4]) {
1879 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1880 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1882 if (idle_pipe
[0] >= 0) {
1885 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1887 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1890 /* Signal systemd that we are bored and want to continue. */
1891 n
= write(idle_pipe
[3], "x", 1);
1893 /* Wait for systemd to react to the signal above. */
1894 (void) fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1897 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1901 idle_pipe
[3] = safe_close(idle_pipe
[3]);
/* Forward declaration: maps an ExecDirectoryType to a string; build_environment() uses the result as the
 * name of the per-directory-type environment variable. The definition is not part of this listing. */
1904 static const char *exec_directory_env_name_to_string(ExecDirectoryType t
);
/* Builds the vector of environment variables that the manager itself defines for an executed process and
 * hands it out via *ret.
 *
 * Variables visible here: LISTEN_PID, LISTEN_FDS, LISTEN_FDNAMES, WATCHDOG_PID, WATCHDOG_USEC,
 * SYSTEMD_NSS_DYNAMIC_BYPASS, HOME, LOGNAME, USER, SHELL, INVOCATION_ID, TERM, JOURNAL_STREAM,
 * LOG_NAMESPACE, one variable per exec directory type that has items configured, CREDENTIALS_DIRECTORY,
 * SYSTEMD_EXEC_PID, MEMORY_PRESSURE_WATCH and MEMORY_PRESSURE_WRITE.
 *
 * The array is allocated once with N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX slots. The assert at the end
 * checks that n_env stayed strictly below that, i.e. that at least one slot is left over after the last
 * entry (presumably the NULL terminator, assuming new0() zero-fills). When adding a variable, N_ENV_VARS
 * has to be bumped accordingly.
 *
 * NOTE(review): several parameters (e.g. u, n_fds, fdnames, home, shell, ret), the allocation failure
 * checks and most of the conditions guarding the individual variables are not visible in this listing. */
1906 static int build_environment(
1908 const ExecContext
*c
,
1909 const ExecParameters
*p
,
1910 const CGroupContext
*cgroup_context
,
1914 const char *username
,
1916 dev_t journal_stream_dev
,
1917 ino_t journal_stream_ino
,
1918 const char *memory_pressure_path
,
1921 _cleanup_strv_free_
char **our_env
= NULL
;
1931 #define N_ENV_VARS 19
1932 our_env
= new0(char*, N_ENV_VARS
+ _EXEC_DIRECTORY_TYPE_MAX
);
1937 _cleanup_free_
char *joined
= NULL
;
1939 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1941 our_env
[n_env
++] = x
;
1943 if (asprintf(&x
, "LISTEN_FDS=%zu", n_fds
) < 0)
1945 our_env
[n_env
++] = x
;
1947 joined
= strv_join(fdnames
, ":");
1951 x
= strjoin("LISTEN_FDNAMES=", joined
);
1954 our_env
[n_env
++] = x
;
1957 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1958 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1960 our_env
[n_env
++] = x
;
1962 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1964 our_env
[n_env
++] = x
;
1967 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1968 * Varlink calls back to us to look up dynamic users in PID 1. Break the deadlock between D-Bus and
1969 * PID 1 by disabling use of PID 1's NSS interface for looking up dynamic users. */
1970 if (p
->flags
& EXEC_NSS_DYNAMIC_BYPASS
) {
1971 x
= strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1974 our_env
[n_env
++] = x
;
1978 x
= strjoin("HOME=", home
);
1982 path_simplify(x
+ 5);
1983 our_env
[n_env
++] = x
;
1987 x
= strjoin("LOGNAME=", username
);
1990 our_env
[n_env
++] = x
;
1992 x
= strjoin("USER=", username
);
1995 our_env
[n_env
++] = x
;
1999 x
= strjoin("SHELL=", shell
);
2003 path_simplify(x
+ 6);
2004 our_env
[n_env
++] = x
;
2007 if (!sd_id128_is_null(u
->invocation_id
)) {
2008 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
2011 our_env
[n_env
++] = x
;
2014 if (exec_context_needs_term(c
)) {
2015 _cleanup_free_
char *cmdline
= NULL
;
2016 const char *tty_path
, *term
= NULL
;
2018 tty_path
= exec_context_tty_path(c
);
2020 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
2021 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
2022 * container manager passes to PID 1 ends up all the way in the console login shown. */
2024 if (path_equal_ptr(tty_path
, "/dev/console") && getppid() == 1)
2025 term
= getenv("TERM");
2026 else if (tty_path
&& in_charset(skip_dev_prefix(tty_path
), ALPHANUMERICAL
)) {
2027 _cleanup_free_
char *key
= NULL
;
2029 key
= strjoin("systemd.tty.term.", skip_dev_prefix(tty_path
));
2033 r
= proc_cmdline_get_key(key
, 0, &cmdline
);
2035 log_debug_errno(r
, "Failed to read %s from kernel cmdline, ignoring: %m", key
);
2041 term
= default_term_for_tty(tty_path
);
2043 x
= strjoin("TERM=", term
);
2046 our_env
[n_env
++] = x
;
2049 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
2050 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
2053 our_env
[n_env
++] = x
;
2056 if (c
->log_namespace
) {
2057 x
= strjoin("LOG_NAMESPACE=", c
->log_namespace
);
2061 our_env
[n_env
++] = x
;
2064 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2065 _cleanup_free_
char *joined
= NULL
;
2071 if (c
->directories
[t
].n_items
== 0)
2074 n
= exec_directory_env_name_to_string(t
);
2078 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
2079 _cleanup_free_
char *prefixed
= NULL
;
2081 prefixed
= path_join(p
->prefix
[t
], c
->directories
[t
].items
[i
].path
);
2085 if (!strextend_with_separator(&joined
, ":", prefixed
))
2089 x
= strjoin(n
, "=", joined
);
2093 our_env
[n_env
++] = x
;
2096 if (exec_context_has_credentials(c
) && p
->prefix
[EXEC_DIRECTORY_RUNTIME
]) {
2097 x
= strjoin("CREDENTIALS_DIRECTORY=", p
->prefix
[EXEC_DIRECTORY_RUNTIME
], "/credentials/", u
->id
);
2101 our_env
[n_env
++] = x
;
2104 if (asprintf(&x
, "SYSTEMD_EXEC_PID=" PID_FMT
, getpid_cached()) < 0)
2107 our_env
[n_env
++] = x
;
2109 if (memory_pressure_path
) {
2110 x
= strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path
);
2114 our_env
[n_env
++] = x
;
2116 if (cgroup_context
&& !path_equal(memory_pressure_path
, "/dev/null")) {
2117 _cleanup_free_
char *b
= NULL
, *e
= NULL
;
2119 if (asprintf(&b
, "%s " USEC_FMT
" " USEC_FMT
,
2120 MEMORY_PRESSURE_DEFAULT_TYPE
,
2121 cgroup_context
->memory_pressure_threshold_usec
== USEC_INFINITY
? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC
:
2122 CLAMP(cgroup_context
->memory_pressure_threshold_usec
, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC
),
2123 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC
) < 0)
2126 if (base64mem(b
, strlen(b
) + 1, &e
) < 0)
2129 x
= strjoin("MEMORY_PRESSURE_WRITE=", e
);
2133 our_env
[n_env
++] = x
;
2137 assert(n_env
< N_ENV_VARS
+ _EXEC_DIRECTORY_TYPE_MAX
);
2140 *ret
= TAKE_PTR(our_env
);
/* Builds a vector of "NAME=value" strings for the variable names listed in c->pass_environment and hands it
 * out via *ret. The vector grows by one entry per iteration (GREEDY_REALLOC for n_env + 2 slots) and is
 * kept NULL terminated after every append.
 * NOTE(review): the declaration of n_env, the lookup that yields 'v' and the allocation failure handling
 * are not visible in this listing. */
2145 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
2146 _cleanup_strv_free_
char **pass_env
= NULL
;
2149 STRV_FOREACH(i
, c
->pass_environment
) {
2150 _cleanup_free_
char *x
= NULL
;
2156 x
= strjoin(*i
, "=", v
);
2160 if (!GREEDY_REALLOC(pass_env
, n_env
+ 2))
2163 pass_env
[n_env
++] = TAKE_PTR(x
);
2164 pass_env
[n_env
] = NULL
;
2167 *ret
= TAKE_PTR(pass_env
);
/* Returns true if the context asks for a private network or names a network namespace path. */
2172 bool exec_needs_network_namespace(const ExecContext
*context
) {
2175 return context
->private_network
|| context
->network_namespace_path
;
/* Returns true if root_ephemeral is set and the context has a root image or a root directory configured. */
2178 static bool exec_needs_ephemeral(const ExecContext
*context
) {
2179 return (context
->root_image
|| context
->root_directory
) && context
->root_ephemeral
;
/* Returns true if the context asks for private IPC or names an IPC namespace path. */
2182 static bool exec_needs_ipc_namespace(const ExecContext
*context
) {
2185 return context
->private_ipc
|| context
->ipc_namespace_path
;
/* Tests a series of context settings that relate to file system setup: a root image, path access lists,
 * bind mounts, temporary file systems, mount and extension images, extension directories, a mount
 * propagation flag other than 0 or MS_SHARED, private /tmp backed by the shared runtime, the various
 * Private*= and Protect*= settings, and (when a root directory is configured) API VFS mounting as well as
 * configured exec directories. 'params' and 'runtime' are checked for NULL before use in the visible code.
 *
 * NOTE(review): none of the return statements is visible in this listing; presumably every matching
 * condition yields true and the function ends with false. Confirm against the complete file. */
2188 bool exec_needs_mount_namespace(
2189 const ExecContext
*context
,
2190 const ExecParameters
*params
,
2191 const ExecRuntime
*runtime
) {
2195 if (context
->root_image
)
2198 if (!strv_isempty(context
->read_write_paths
) ||
2199 !strv_isempty(context
->read_only_paths
) ||
2200 !strv_isempty(context
->inaccessible_paths
) ||
2201 !strv_isempty(context
->exec_paths
) ||
2202 !strv_isempty(context
->no_exec_paths
))
2205 if (context
->n_bind_mounts
> 0)
2208 if (context
->n_temporary_filesystems
> 0)
2211 if (context
->n_mount_images
> 0)
2214 if (context
->n_extension_images
> 0)
2217 if (!strv_isempty(context
->extension_directories
))
2220 if (!IN_SET(context
->mount_propagation_flag
, 0, MS_SHARED
))
2223 if (context
->private_tmp
&& runtime
&& runtime
->shared
&& (runtime
->shared
->tmp_dir
|| runtime
->shared
->var_tmp_dir
))
2226 if (context
->private_devices
||
2227 context
->private_mounts
> 0 ||
2228 (context
->private_mounts
< 0 && exec_needs_network_namespace(context
)) ||
2229 context
->protect_system
!= PROTECT_SYSTEM_NO
||
2230 context
->protect_home
!= PROTECT_HOME_NO
||
2231 context
->protect_kernel_tunables
||
2232 context
->protect_kernel_modules
||
2233 context
->protect_kernel_logs
||
2234 context
->protect_control_groups
||
2235 context
->protect_proc
!= PROTECT_PROC_DEFAULT
||
2236 context
->proc_subset
!= PROC_SUBSET_ALL
||
2237 exec_needs_ipc_namespace(context
))
2240 if (context
->root_directory
) {
2241 if (exec_context_get_effective_mount_apivfs(context
))
2244 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2245 if (params
&& !params
->prefix
[t
])
2248 if (context
->directories
[t
].n_items
> 0)
2253 if (context
->dynamic_user
&&
2254 (context
->directories
[EXEC_DIRECTORY_STATE
].n_items
> 0 ||
2255 context
->directories
[EXEC_DIRECTORY_CACHE
].n_items
> 0 ||
2256 context
->directories
[EXEC_DIRECTORY_LOGS
].n_items
> 0))
2259 if (context
->log_namespace
)
/* Moves the calling process into a new user namespace and has a forked helper write its ID maps.
 *
 * ouid/ogid are the original IDs, uid/gid the selected ones. The map texts are prepared up front: with
 * CAP_SETUID (resp. CAP_SETGID) in the effective set and a valid uid (gid) that differs from the original,
 * two lines are written (original and selected ID, each mapped to itself); otherwise a single line.
 *
 * Synchronization uses two channels: an eventfd on which the helper waits until the parent has called
 * unshare(CLONE_NEWUSER), and a pipe through which the helper reports an error code. The helper writes
 * "deny" to the parent's setgroups file (a missing file is tolerated), then gid_map, then uid_map. The
 * parent reads the pipe: sizeof(r) bytes mean an error code was sent, zero bytes mean success, and it
 * finally reaps the helper, treating a non-successful exit as failure.
 *
 * NOTE(review): declarations (r, n, c, a, ppid), the arguments of the single-line asprintf() calls and
 * most error paths are not visible in this listing. */
2265 static int setup_private_users(uid_t ouid
, gid_t ogid
, uid_t uid
, gid_t gid
) {
2266 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
2267 _cleanup_close_pair_
int errno_pipe
[2] = PIPE_EBADF
;
2268 _cleanup_close_
int unshare_ready_fd
= -EBADF
;
2269 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
2274 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2275 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2276 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2277 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2278 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2279 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2280 * continues execution normally.
2281 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2282 * does not need CAP_SETUID to write the single line mapping to itself. */
2284 /* Can only set up multiple mappings with CAP_SETUID. */
2285 if (have_effective_cap(CAP_SETUID
) > 0 && uid
!= ouid
&& uid_is_valid(uid
))
2286 r
= asprintf(&uid_map
,
2287 UID_FMT
" " UID_FMT
" 1\n" /* Map $OUID → $OUID */
2288 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
2289 ouid
, ouid
, uid
, uid
);
2291 r
= asprintf(&uid_map
,
2292 UID_FMT
" " UID_FMT
" 1\n", /* Map $OUID → $OUID */
2298 /* Can only set up multiple mappings with CAP_SETGID. */
2299 if (have_effective_cap(CAP_SETGID
) > 0 && gid
!= ogid
&& gid_is_valid(gid
))
2300 r
= asprintf(&gid_map
,
2301 GID_FMT
" " GID_FMT
" 1\n" /* Map $OGID → $OGID */
2302 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
2303 ogid
, ogid
, gid
, gid
);
2305 r
= asprintf(&gid_map
,
2306 GID_FMT
" " GID_FMT
" 1\n", /* Map $OGID -> $OGID */
2312 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2314 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
2315 if (unshare_ready_fd
< 0)
2318 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2320 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
2323 r
= safe_fork("(sd-userns)", FORK_RESET_SIGNALS
|FORK_DEATHSIG
, &pid
);
2327 _cleanup_close_
int fd
= -EBADF
;
2331 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2332 * here, after the parent opened its own user namespace. */
2335 errno_pipe
[0] = safe_close(errno_pipe
[0]);
2337 /* Wait until the parent unshared the user namespace */
2338 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
2343 /* Disable the setgroups() system call in the child user namespace, for good. */
2344 a
= procfs_file_alloca(ppid
, "setgroups");
2345 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2347 if (errno
!= ENOENT
) {
2352 /* If the file is missing the kernel is too old, let's continue anyway. */
2354 if (write(fd
, "deny\n", 5) < 0) {
2359 fd
= safe_close(fd
);
2362 /* First write the GID map */
2363 a
= procfs_file_alloca(ppid
, "gid_map");
2364 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2369 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
2373 fd
= safe_close(fd
);
2375 /* Then write the UID map */
2376 a
= procfs_file_alloca(ppid
, "uid_map");
2377 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2382 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
2387 _exit(EXIT_SUCCESS
);
2390 (void) write(errno_pipe
[1], &r
, sizeof(r
));
2391 _exit(EXIT_FAILURE
);
2394 errno_pipe
[1] = safe_close(errno_pipe
[1]);
2396 if (unshare(CLONE_NEWUSER
) < 0)
2399 /* Let the child know that the namespace is ready now */
2400 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
2403 /* Try to read an error code from the child */
2404 n
= read(errno_pipe
[0], &r
, sizeof(r
));
2407 if (n
== sizeof(r
)) { /* an error code was sent to us */
2412 if (n
!= 0) /* on success we should have read 0 bytes */
2415 r
= wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid
), 0);
2418 if (r
!= EXIT_SUCCESS
) /* If something strange happened with the child, let's consider this fatal, too */
/* Decides whether a directory of the given type is placed below the "private/" sub-hierarchy (see the long
 * explanation in setup_exec_directory()). The visible checks look at context->dynamic_user, at the type
 * being EXEC_DIRECTORY_CONFIGURATION, and at EXEC_DIRECTORY_RUNTIME combined with a preserve mode of
 * EXEC_PRESERVE_NO.
 * NOTE(review): the return statements are not visible in this listing; presumably each of the three checks
 * yields false and everything else true. Confirm against the complete file. */
2424 static bool exec_directory_is_private(const ExecContext
*context
, ExecDirectoryType type
) {
2427 if (!context
->dynamic_user
)
2430 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
2433 if (type
== EXEC_DIRECTORY_RUNTIME
&& context
->runtime_directory_preserve_mode
== EXEC_PRESERVE_NO
)
/* Creates, for every entry of 'symlinks', a symlink below 'root' that points to 'source' below 'root'. The
 * parent directories of each link are created with mode 0755 first; symlink_idempotent() is called with its
 * third argument set to true.
 * NOTE(review): the declaration of 'r' and the error checks after each call are not visible in this
 * listing. */
2439 static int create_many_symlinks(const char *root
, const char *source
, char **symlinks
) {
2440 _cleanup_free_
char *src_abs
= NULL
;
2445 src_abs
= path_join(root
, source
);
2449 STRV_FOREACH(dst
, symlinks
) {
2450 _cleanup_free_
char *dst_abs
= NULL
;
2452 dst_abs
= path_join(root
, *dst
);
2456 r
= mkdir_parents_label(dst_abs
, 0755);
2460 r
= symlink_idempotent(src_abs
, dst_abs
, true);
/* Creates and prepares all directories of one ExecDirectoryType for a unit.
 *
 * For every configured item the path below params->prefix[type] is computed and its parents are created.
 * In user scope, a missing state or logs directory whose counterpart exists below the configuration prefix
 * is replaced by a compatibility symlink. Then one of two paths is taken:
 *  - private directories (see exec_directory_is_private()): the real directory lives below
 *    "<prefix>/private/", a pre-existing public directory is migrated there, and a symlink from the public
 *    location is created unless the item is marked only_create;
 *  - otherwise: a symlink left behind by an earlier DynamicUser=1 setup is detected and the directory is
 *    moved back up, or the directory is simply created. Configuration directories only get a warning when
 *    their mode differs.
 * Afterwards the access mode is set, and in system scope ownership of the tree is changed to uid/gid. If no
 * mount namespace is going to be used, the per-item symlinks are created here as well.
 *
 * The last visible statement stores exit_status_table[type] in *exit_status.
 *
 * NOTE(review): parts of the parameter list (e.g. u, uid, gid, exit_status), the declaration of 'r' and
 * 'st', and nearly all error checks and jumps are not visible in this listing, so the exact control flow
 * between the steps above has to be confirmed against the complete file. */
2468 static int setup_exec_directory(
2470 const ExecContext
*context
,
2471 const ExecParameters
*params
,
2474 ExecDirectoryType type
,
2475 bool needs_mount_namespace
,
2478 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
2479 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
2480 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
2481 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
2482 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
2483 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
2489 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
2490 assert(exit_status
);
2492 if (!params
->prefix
[type
])
2495 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
2496 if (!uid_is_valid(uid
))
2498 if (!gid_is_valid(gid
))
2502 for (size_t i
= 0; i
< context
->directories
[type
].n_items
; i
++) {
2503 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
2505 p
= path_join(params
->prefix
[type
], context
->directories
[type
].items
[i
].path
);
2511 r
= mkdir_parents_label(p
, 0755);
2515 if (IN_SET(type
, EXEC_DIRECTORY_STATE
, EXEC_DIRECTORY_LOGS
) && params
->runtime_scope
== RUNTIME_SCOPE_USER
) {
2517 /* If we are in user mode, and a configuration directory exists but a state directory
2518 * doesn't exist, then we likely are upgrading from an older systemd version that
2519 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2520 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2521 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
2522 * separated. If a service has both dirs configured but only the configuration dir
2523 * exists and the state dir does not, we assume we are looking at an update
2524 * situation. Hence, create a compatibility symlink, so that all expectations are
2527 * (We also do something similar with the log directory, which still doesn't exist in
2528 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2530 /* this assumes the state dir is always created before the configuration dir */
2531 assert_cc(EXEC_DIRECTORY_STATE
< EXEC_DIRECTORY_LOGS
);
2532 assert_cc(EXEC_DIRECTORY_LOGS
< EXEC_DIRECTORY_CONFIGURATION
);
2534 r
= laccess(p
, F_OK
);
2536 _cleanup_free_
char *q
= NULL
;
2538 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2539 * under the configuration hierarchy. */
2541 if (type
== EXEC_DIRECTORY_STATE
)
2542 q
= path_join(params
->prefix
[EXEC_DIRECTORY_CONFIGURATION
], context
->directories
[type
].items
[i
].path
);
2543 else if (type
== EXEC_DIRECTORY_LOGS
)
2544 q
= path_join(params
->prefix
[EXEC_DIRECTORY_CONFIGURATION
], "log", context
->directories
[type
].items
[i
].path
);
2546 assert_not_reached();
2552 r
= laccess(q
, F_OK
);
2554 /* It does exist! This hence looks like an update. Symlink the
2555 * configuration directory into the state directory. */
2557 r
= symlink_idempotent(q
, p
, /* make_relative= */ true);
2561 log_unit_notice(u
, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p
, q
);
2563 } else if (r
!= -ENOENT
)
2564 log_unit_warning_errno(u
, r
, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q
);
2567 log_unit_warning_errno(u
, r
, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p
);
2570 if (exec_directory_is_private(context
, type
)) {
2571 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2572 * case we want to avoid leaving a directory around fully accessible that is owned by
2573 * a dynamic user whose UID is later on reused. To lock this down we use the same
2574 * trick used by container managers to prohibit host users to get access to files of
2575 * the same UID in containers: we place everything inside a directory that has an
2576 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2577 * for unprivileged host code. We then use fs namespacing to make this directory
2578 * permeable for the service itself.
2580 * Specifically: for a service which wants a special directory "foo/" we first create
2581 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2582 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2583 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2584 * unprivileged host users can't look into it. Inside of the namespace of the unit
2585 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2586 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2587 * for the service and making sure it only gets access to the dirs it needs but no
2588 * others. Tricky? Yes, absolutely, but it works!
2590 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2591 * to be owned by the service itself.
2593 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2594 * for sharing files or sockets with other services. */
2596 pp
= path_join(params
->prefix
[type
], "private");
2602 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2603 r
= mkdir_safe_label(pp
, 0700, 0, 0, MKDIR_WARN_MODE
);
2607 if (!path_extend(&pp
, context
->directories
[type
].items
[i
].path
)) {
2612 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2613 r
= mkdir_parents_label(pp
, 0755);
2617 if (is_dir(p
, false) > 0 &&
2618 (laccess(pp
, F_OK
) == -ENOENT
)) {
2620 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2621 * it over. Most likely the service has been upgraded from one that didn't use
2622 * DynamicUser=1, to one that does. */
2624 log_unit_info(u
, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2625 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2626 exec_directory_type_to_string(type
), p
, pp
);
2628 r
= RET_NERRNO(rename(p
, pp
));
2632 /* Otherwise, create the actual directory for the service */
2634 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
2635 if (r
< 0 && r
!= -EEXIST
)
2639 if (!context
->directories
[type
].items
[i
].only_create
) {
2640 /* And link it up from the original place.
2642 * 1) If a mount namespace is going to be used, then this symlink remains on
2643 * the host, and a new one for the child namespace will be created later.
2644 * 2) It is not necessary to create this symlink when one of its parent
2645 * directories is specified and already created. E.g.
2646 * StateDirectory=foo foo/bar
2647 * In that case, the inode points to pp and p for "foo/bar" are the same:
2648 * pp = "/var/lib/private/foo/bar"
2649 * p = "/var/lib/foo/bar"
2650 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2651 * we do not need to create the symlink, but we cannot create the symlink.
2652 * See issue #24783. */
2653 r
= symlink_idempotent(pp
, p
, true);
2659 _cleanup_free_
char *target
= NULL
;
2661 if (type
!= EXEC_DIRECTORY_CONFIGURATION
&&
2662 readlink_and_make_absolute(p
, &target
) >= 0) {
2663 _cleanup_free_
char *q
= NULL
, *q_resolved
= NULL
, *target_resolved
= NULL
;
2665 /* This already exists and is a symlink? Interesting. Maybe it's one created
2666 * by DynamicUser=1 (see above)?
2668 * We do this for all directory types except for ConfigurationDirectory=,
2669 * since they all support the private/ symlink logic at least in some
2670 * configurations, see above. */
2672 r
= chase(target
, NULL
, 0, &target_resolved
, NULL
);
2676 q
= path_join(params
->prefix
[type
], "private", context
->directories
[type
].items
[i
].path
);
2682 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2683 r
= chase(q
, NULL
, CHASE_NONEXISTENT
, &q_resolved
, NULL
);
2687 if (path_equal(q_resolved
, target_resolved
)) {
2689 /* Hmm, apparently DynamicUser= was once turned on for this service,
2690 * but is no longer. Let's move the directory back up. */
2692 log_unit_info(u
, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2693 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2694 exec_directory_type_to_string(type
), q
, p
);
2696 r
= RET_NERRNO(unlink(p
));
2700 r
= RET_NERRNO(rename(q
, p
));
2706 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2711 if (type
== EXEC_DIRECTORY_CONFIGURATION
) {
2714 /* Don't change the owner/access mode of the configuration directory,
2715 * as in the common case it is not written to by a service, and shall
2716 * not be writable. */
2718 r
= RET_NERRNO(stat(p
, &st
));
2722 /* Still complain if the access mode doesn't match */
2723 if (((st
.st_mode
^ context
->directories
[type
].mode
) & 07777) != 0)
2724 log_unit_warning(u
, "%s \'%s\' already exists but the mode is different. "
2725 "(File system: %o %sMode: %o)",
2726 exec_directory_type_to_string(type
), context
->directories
[type
].items
[i
].path
,
2727 st
.st_mode
& 07777, exec_directory_type_to_string(type
), context
->directories
[type
].mode
& 07777);
2734 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2735 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2736 * current UID/GID ownership.) */
2737 r
= chmod_and_chown(pp
?: p
, context
->directories
[type
].mode
, UID_INVALID
, GID_INVALID
);
2741 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2742 * available to user code anyway */
2743 if (params
->runtime_scope
!= RUNTIME_SCOPE_SYSTEM
)
2746 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2747 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2748 * assignments to exist. */
2749 r
= path_chown_recursive(pp
?: p
, uid
, gid
, context
->dynamic_user
? 01777 : 07777, AT_SYMLINK_FOLLOW
);
2754 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2755 * they are set up later, to allow configuring empty var/run/etc. */
2756 if (!needs_mount_namespace
)
2757 for (size_t i
= 0; i
< context
->directories
[type
].n_items
; i
++) {
2758 r
= create_many_symlinks(params
->prefix
[type
],
2759 context
->directories
[type
].items
[i
].path
,
2760 context
->directories
[type
].items
[i
].symlinks
);
2768 *exit_status
= exit_status_table
[type
];
/* Writes one credential into the directory referenced by dfd.
 *
 * The data goes to a randomly named temporary file first (created exclusively, mode 0600), is written with
 * loop_write(), and the file is then restricted to mode 0400. If 'uid' is valid and differs from our own
 * UID, read access is granted through an ACL entry; the visible fallback for ACL failures that look like
 * "not supported" or privilege errors is to chown the file to 'uid', which is only considered when
 * ownership_ok is set. Finally the temporary file is renamed to 'id' within dfd. The temporary name is held
 * in an unlink_and_freep cleanup variable.
 *
 * NOTE(review): most of the parameter list (dfd, id, data, size, uid), the declaration of 'r' and the error
 * returns are not visible in this listing. */
2772 static int write_credential(
2778 bool ownership_ok
) {
2780 _cleanup_(unlink_and_freep
) char *tmp
= NULL
;
2781 _cleanup_close_
int fd
= -EBADF
;
2784 r
= tempfn_random_child("", "cred", &tmp
);
2788 fd
= openat(dfd
, tmp
, O_CREAT
|O_RDWR
|O_CLOEXEC
|O_EXCL
|O_NOFOLLOW
|O_NOCTTY
, 0600);
2794 r
= loop_write(fd
, data
, size
, /* do_poll = */ false);
2798 if (fchmod(fd
, 0400) < 0) /* Take away "w" bit */
2801 if (uid_is_valid(uid
) && uid
!= getuid()) {
2802 r
= fd_add_uid_acl_permission(fd
, uid
, ACL_READ
);
2804 if (!ERRNO_IS_NOT_SUPPORTED(r
) && !ERRNO_IS_PRIVILEGE(r
))
2807 if (!ownership_ok
) /* Ideally we use ACLs, since we can neatly express what we want
2808 * to express: that the user gets read access and nothing
2809 * else. But if the backing fs can't support that (e.g. ramfs)
2810 * then we can use file ownership instead. But that's only safe if
2811 * we can then re-mount the whole thing read-only, so that the
2812 * user can no longer chmod() the file to gain write access. */
2815 if (fchown(fd
, uid
, GID_INVALID
) < 0)
2820 if (renameat(dfd
, tmp
, dfd
, id
) < 0)
/* Selects which credential store directories credential_search_path() assembles: TRUSTED covers the
 * "credstore" directories, ENCRYPTED the "credstore.encrypted" ones, ALL both. */
2827 typedef enum CredentialSearchPath
{
2828 CREDENTIAL_SEARCH_PATH_TRUSTED
,
2829 CREDENTIAL_SEARCH_PATH_ENCRYPTED
,
2830 CREDENTIAL_SEARCH_PATH_ALL
,
2831 _CREDENTIAL_SEARCH_PATH_MAX
,
2832 _CREDENTIAL_SEARCH_PATH_INVALID
= -EINVAL
,
2833 } CredentialSearchPath
;
/* Builds the list of directories in which credentials are looked up, according to 'path'. Encrypted
 * directories are added before the trusted ones when both are requested. The resulting list is logged at
 * debug level.
 * NOTE(review): the failure branches and the final return are elided in this listing; presumably NULL is
 * returned on allocation failure — confirm against the complete file. */
2835 static char **credential_search_path(const ExecParameters
*params
, CredentialSearchPath path
) {
2837 _cleanup_strv_free_
char **l
= NULL
;
2840 assert(path
>= 0 && path
< _CREDENTIAL_SEARCH_PATH_MAX
);
2842 /* Assemble a search path to find credentials in. For non-encrypted credentials, we'll look in
2843 * /etc/credstore/ (and similar directories in /usr/lib/ + /run/). If we're looking for encrypted
2844 * credentials, we'll look in /etc/credstore.encrypted/ (and similar dirs). */
2846 if (IN_SET(path
, CREDENTIAL_SEARCH_PATH_ENCRYPTED
, CREDENTIAL_SEARCH_PATH_ALL
)) {
2847 if (strv_extend(&l
, params
->received_encrypted_credentials_directory
) < 0)
2850 if (strv_extend_strv(&l
, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2854 if (IN_SET(path
, CREDENTIAL_SEARCH_PATH_TRUSTED
, CREDENTIAL_SEARCH_PATH_ALL
)) {
2855 if (params
->received_credentials_directory
)
2856 if (strv_extend(&l
, params
->received_credentials_directory
) < 0)
2859 if (strv_extend_strv(&l
, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2863 if (DEBUG_LOGGING
) {
2864 _cleanup_free_
char *t
= strv_join(l
, ":");
2866 log_debug("Credential search path is: %s", strempty(t
));
/* Optionally decrypts a credential and writes it out via write_credential(). When decryption takes place
 * the plaintext replaces 'size' (and presumably 'data' — that assignment is elided in this listing). The
 * value strlen(id) + size is computed as 'add', presumably for accounting against the remaining quota
 * (the comparison itself is not visible here — confirm against the complete file).
 *
 * The decrypted buffer holds secret material, hence it is released with erase_and_freep (wiping it before
 * freeing) rather than a plain free, matching how acquire_credentials() treats the result of the very
 * same decrypt_credential_and_warn() call. */
2872 static int maybe_decrypt_and_write_credential(
2882 _cleanup_(erase_and_freep
) void *plaintext
= NULL
;
2887 size_t plaintext_size
= 0;
2889 r
= decrypt_credential_and_warn(id
, now(CLOCK_REALTIME
), NULL
, NULL
, data
, size
,
2890 &plaintext
, &plaintext_size
);
2895 size
= plaintext_size
;
2898 add
= strlen(id
) + size
;
2902 r
= write_credential(dir_fd
, id
, data
, size
, uid
, ownership_ok
);
2904 return log_debug_errno(r
, "Failed to write credential '%s': %m", id
);
/* Loads all credentials matching a glob pattern. For every directory in search_path the pattern 'path' is
 * appended and expanded with safe_glob(); each match is read with read_full_file_full() (limited to
 * CREDENTIAL_ENCRYPTED_SIZE_MAX or CREDENTIAL_SIZE_MAX depending on 'encrypted'), its file name is
 * extracted to serve as credential name, and the data is handed to maybe_decrypt_and_write_credential().
 * NOTE(review): parameter list and error/skip handling are largely elided in this listing. */
2910 static int load_credential_glob(
2914 ReadFullFileFlags flags
,
2922 STRV_FOREACH(d
, search_path
) {
2923 _cleanup_globfree_ glob_t pglob
= {};
2924 _cleanup_free_
char *j
= NULL
;
2926 j
= path_join(*d
, path
);
2930 r
= safe_glob(j
, 0, &pglob
);
2936 for (size_t n
= 0; n
< pglob
.gl_pathc
; n
++) {
2937 _cleanup_free_
char *fn
= NULL
;
/* credential payload: erased before being freed */
2938 _cleanup_(erase_and_freep
) char *data
= NULL
;
2941 /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2942 r
= read_full_file_full(
2946 encrypted
? CREDENTIAL_ENCRYPTED_SIZE_MAX
: CREDENTIAL_SIZE_MAX
,
2951 return log_debug_errno(r
, "Failed to read credential '%s': %m",
2954 r
= path_extract_filename(pglob
.gl_pathv
[n
], &fn
);
2956 return log_debug_errno(r
, "Failed to extract filename from '%s': %m",
2959 r
= maybe_decrypt_and_write_credential(
/* Loads one credential and writes it below write_dfd. Three kinds of source are distinguished, as visible
 * below: (1) read_dfd >= 0: 'path' is a plain file name relative to that directory fd; (2) 'path' is
 * absolute: the file is read directly, with AF_UNIX socket support enabled and a bind name derived from
 * unit and id; (3) 'path' is a valid credential name: it is looked up in the credential search path.
 * An -ENOENT result is tolerated if missing_ok is set or if a credential of the same id is configured in
 * context->set_credentials.
 * NOTE(review): this listing elides parts of the parameter list, the places where missing_ok is changed,
 * and most error branches — confirm against the complete file. */
2977 static int load_credential(
2978 const ExecContext
*context
,
2979 const ExecParameters
*params
,
2990 ReadFullFileFlags flags
= READ_FULL_FILE_SECURE
|READ_FULL_FILE_FAIL_WHEN_LARGER
;
2991 _cleanup_strv_free_
char **search_path
= NULL
;
2992 _cleanup_(erase_and_freep
) char *data
= NULL
;
2993 _cleanup_free_
char *bindname
= NULL
;
2994 const char *source
= NULL
;
2995 bool missing_ok
= true;
3004 assert(read_dfd
>= 0 || read_dfd
== AT_FDCWD
);
3005 assert(write_dfd
>= 0);
3008 if (read_dfd
>= 0) {
3009 /* If a directory fd is specified, then read the file directly from that dir. In this case we
3010 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
3011 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
3014 if (!filename_is_valid(path
)) /* safety check */
3020 } else if (path_is_absolute(path
)) {
3021 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
3024 if (!path_is_valid(path
)) /* safety check */
3027 flags
|= READ_FULL_FILE_CONNECT_SOCKET
;
3029 /* Pass some minimal info about the unit and the credential name we are looking to acquire
3030 * via the source socket address in case we read off an AF_UNIX socket. */
3031 if (asprintf(&bindname
, "@%" PRIx64
"/unit/%s/%s", random_u64(), unit
, id
) < 0)
3037 } else if (credential_name_valid(path
)) {
3038 /* If this is a relative path, take it as credential name relative to the credentials
3039 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
3040 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
3042 search_path
= credential_search_path(params
, CREDENTIAL_SEARCH_PATH_ALL
);
3051 flags
|= READ_FULL_FILE_UNBASE64
;
3053 maxsz
= encrypted
? CREDENTIAL_ENCRYPTED_SIZE_MAX
: CREDENTIAL_SIZE_MAX
;
3056 STRV_FOREACH(d
, search_path
) {
3057 _cleanup_free_
char *j
= NULL
;
3059 j
= path_join(*d
, path
);
3063 r
= read_full_file_full(
3064 AT_FDCWD
, j
, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
3074 r
= read_full_file_full(
3084 if (r
== -ENOENT
&& (missing_ok
|| hashmap_contains(context
->set_credentials
, id
))) {
3085 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
3086 * will get clear errors if we don't pass such a missing credential on as they
3087 * themselves will get ENOENT when trying to read them, which should not be much
3088 * worse than when we handle the error here and make it fatal.
3090 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
3091 * we are fine, too. */
3092 log_debug_errno(r
, "Couldn't read inherited credential '%s', skipping: %m", path
);
3096 return log_debug_errno(r
, "Failed to read credential '%s': %m", path
);
3098 return maybe_decrypt_and_write_credential(write_dfd
, id
, encrypted
, uid
, ownership_ok
, data
, size
, left
);
/* Argument bundle passed as userdata to load_cred_recurse_dir_cb().
 * NOTE(review): only the first two members are visible in this listing; the callback and its caller
 * additionally reference members such as dfd, encrypted and ownership_ok. */
3101 struct load_cred_args
{
3102 const ExecContext
*context
;
3103 const ExecParameters
*params
;
/* Directory recursion callback used when a credential source is a directory. Only RECURSE_DIR_ENTRY
 * events for regular files and sockets are processed. The credential ID is derived from 'path' by
 * replacing every "/" with "_"; invalid IDs are refused with -EINVAL, IDs that already exist below
 * args->dfd are skipped, everything else is passed to load_credential().
 * NOTE(review): the argument list of the load_credential() call and its error handling are elided in
 * this listing. */
3112 static int load_cred_recurse_dir_cb(
3113 RecurseDirEvent event
,
3117 const struct dirent
*de
,
3118 const struct statx
*sx
,
3121 struct load_cred_args
*args
= ASSERT_PTR(userdata
);
3122 _cleanup_free_
char *sub_id
= NULL
;
3125 if (event
!= RECURSE_DIR_ENTRY
)
3126 return RECURSE_DIR_CONTINUE
;
3128 if (!IN_SET(de
->d_type
, DT_REG
, DT_SOCK
))
3129 return RECURSE_DIR_CONTINUE
;
/* Flatten the relative path into a single-component credential name */
3131 sub_id
= strreplace(path
, "/", "_");
3135 if (!credential_name_valid(sub_id
))
3136 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL
), "Credential would get ID %s, which is not valid, refusing", sub_id
);
/* First one wins: never overwrite a credential that was already written */
3138 if (faccessat(args
->dfd
, sub_id
, F_OK
, AT_SYMLINK_NOFOLLOW
) >= 0) {
3139 log_debug("Skipping credential with duplicated ID %s at %s", sub_id
, path
);
3140 return RECURSE_DIR_CONTINUE
;
3142 if (errno
!= ENOENT
)
3143 return log_debug_errno(errno
, "Failed to test if credential %s exists: %m", sub_id
);
3145 r
= load_credential(
3160 return RECURSE_DIR_CONTINUE
;
/* Populates the credential directory 'p' for a unit. As visible below this happens in three passes, where
 * earlier passes take precedence over later ones: (1) credentials from context->load_credentials (files,
 * directories that are recursed into, or names inherited from further up), (2) credentials matching
 * context->import_credentials globs in the trusted and then the encrypted search path, (3) literal
 * credentials from context->set_credentials. The directory is made writable for the duration and
 * read-only afterwards, and read/search access for uid is granted via ACL or, as fallback, fchown().
 * NOTE(review): this listing elides parts of the parameter list and most error branches.
 *
 * The per-iteration search_path in pass (2) is a string vector returned by credential_search_path(); it
 * must be released with _cleanup_strv_free_ (as load_credential() does), since a plain _cleanup_free_
 * would free only the vector and leak its strings, both for the second lookup and on early returns. */
3163 static int acquire_credentials(
3164 const ExecContext
*context
,
3165 const ExecParameters
*params
,
3169 bool ownership_ok
) {
/* Remaining size budget shared by all credentials of this unit */
3171 uint64_t left
= CREDENTIALS_TOTAL_SIZE_MAX
;
3172 _cleanup_close_
int dfd
= -EBADF
;
3174 ExecLoadCredential
*lc
;
3175 ExecSetCredential
*sc
;
3181 dfd
= open(p
, O_DIRECTORY
|O_CLOEXEC
);
3185 r
= fd_acl_make_writable(dfd
); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
3189 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
3190 HASHMAP_FOREACH(lc
, context
->load_credentials
) {
3191 _cleanup_close_
int sub_fd
= -EBADF
;
3193 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
3194 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
3195 * a regular file. Finally, if it's a relative path we will use it as a credential name to
3196 * propagate a credential passed to us from further up. */
3198 if (path_is_absolute(lc
->path
)) {
3199 sub_fd
= open(lc
->path
, O_DIRECTORY
|O_CLOEXEC
|O_RDONLY
);
3200 if (sub_fd
< 0 && !IN_SET(errno
,
3201 ENOTDIR
, /* Not a directory */
3202 ENOENT
)) /* Doesn't exist? */
3203 return log_debug_errno(errno
, "Failed to open '%s': %m", lc
->path
);
3207 /* Regular file (incl. a credential passed in from higher up) */
3208 r
= load_credential(
3224 /* path= */ lc
->id
, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
3225 /* statx_mask= */ 0,
3226 /* n_depth_max= */ UINT_MAX
,
3227 RECURSE_DIR_SORT
|RECURSE_DIR_IGNORE_DOT
|RECURSE_DIR_ENSURE_TYPE
,
3228 load_cred_recurse_dir_cb
,
3229 &(struct load_cred_args
) {
3232 .encrypted
= lc
->encrypted
,
3236 .ownership_ok
= ownership_ok
,
3243 /* Next, look for system credentials and credentials in the credentials store. Note that these do not
3244 * override any credentials found earlier. */
3245 SET_FOREACH(ic
, context
->import_credentials
) {
/* String vector: must be freed together with its elements */
3246 _cleanup_strv_free_
char **search_path
= NULL
;
3248 search_path
= credential_search_path(params
, CREDENTIAL_SEARCH_PATH_TRUSTED
);
3252 r
= load_credential_glob(
3254 /* encrypted = */ false,
3256 READ_FULL_FILE_SECURE
|READ_FULL_FILE_FAIL_WHEN_LARGER
,
3264 search_path
= strv_free(search_path
);
3265 search_path
= credential_search_path(params
, CREDENTIAL_SEARCH_PATH_ENCRYPTED
);
3269 r
= load_credential_glob(
3271 /* encrypted = */ true,
3273 READ_FULL_FILE_SECURE
|READ_FULL_FILE_FAIL_WHEN_LARGER
|READ_FULL_FILE_UNBASE64
,
3282 /* Finally, we add in literally specified credentials. If the credentials already exist, we'll not
3283 * add them, so that they can act as a "default" if the same credential is specified multiple times. */
3284 HASHMAP_FOREACH(sc
, context
->set_credentials
) {
3285 _cleanup_(erase_and_freep
) void *plaintext
= NULL
;
3289 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
3290 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
3291 * slow and involved, hence it's nice to be able to skip that if the credential already
3293 if (faccessat(dfd
, sc
->id
, F_OK
, AT_SYMLINK_NOFOLLOW
) >= 0)
3295 if (errno
!= ENOENT
)
3296 return log_debug_errno(errno
, "Failed to test if credential %s exists: %m", sc
->id
);
3298 if (sc
->encrypted
) {
3299 r
= decrypt_credential_and_warn(sc
->id
, now(CLOCK_REALTIME
), NULL
, NULL
, sc
->data
, sc
->size
, &plaintext
, &size
);
3309 add
= strlen(sc
->id
) + size
;
3313 r
= write_credential(dfd
, sc
->id
, data
, size
, uid
, ownership_ok
);
3320 r
= fd_acl_make_read_only(dfd
); /* Now take away the "w" bit */
3324 /* After we created all keys with the right perms, also make sure the credential store as a whole is
3327 if (uid_is_valid(uid
) && uid
!= getuid()) {
3328 r
= fd_add_uid_acl_permission(dfd
, uid
, ACL_READ
| ACL_EXECUTE
);
3330 if (!ERRNO_IS_NOT_SUPPORTED(r
) && !ERRNO_IS_PRIVILEGE(r
))
3336 if (fchown(dfd
, uid
, GID_INVALID
) < 0)
/* Prepares the credential store in 'workspace' and installs it at 'final'. The function first determines
 * whether workspace and/or final already are mount points, tries to obtain a writable mount on the
 * workspace (fresh credentials file system, or a bind mount of final remounted writable), fills it via
 * acquire_credentials(), and then either moves the read-only workspace mount onto final or unmounts it
 * again. If nothing could be mounted and must_mount is false, it operates on the plain 'final' directory
 * and chmod()s the parent directory to 0755.
 * NOTE(review): this listing elides parts of the parameter list and most branch conditions and error
 * checks; the flow description above is limited to what the visible lines and comments establish. */
3344 static int setup_credentials_internal(
3345 const ExecContext
*context
,
3346 const ExecParameters
*params
,
3348 const char *final
, /* This is where the credential store shall eventually end up at */
3349 const char *workspace
, /* This is where we can prepare it before moving it to the final place */
3350 bool reuse_workspace
, /* Whether to reuse any existing workspace mount if it already is a mount */
3351 bool must_mount
, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
3354 int r
, workspace_mounted
; /* negative if we don't know yet whether we have/can mount something; true
3355 * if we mounted something; false if we definitely can't mount anything */
3363 if (reuse_workspace
) {
3364 r
= path_is_mount_point(workspace
, NULL
, 0);
3368 workspace_mounted
= true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3370 workspace_mounted
= -1; /* We need to figure out if we can mount something to the workspace */
3372 workspace_mounted
= -1; /* ditto */
3374 r
= path_is_mount_point(final
, NULL
, 0);
3378 /* If the final place already has something mounted, we use that. If the workspace also has
3379 * something mounted we assume it's actually the same mount (but with MS_RDONLY
3381 final_mounted
= true;
3383 if (workspace_mounted
< 0) {
3384 /* If the final place is mounted, but the workspace isn't, then let's bind mount
3385 * the final version to the workspace, and make it writable, so that we can make
3388 r
= mount_nofollow_verbose(LOG_DEBUG
, final
, workspace
, NULL
, MS_BIND
|MS_REC
, NULL
);
3392 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, workspace
, NULL
, MS_BIND
|MS_REMOUNT
|credentials_fs_mount_flags(/* ro= */ false), NULL
);
3396 workspace_mounted
= true;
3399 final_mounted
= false;
3401 if (workspace_mounted
< 0) {
3402 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3404 r
= mount_credentials_fs(workspace
, CREDENTIALS_TOTAL_SIZE_MAX
, /* ro= */ false);
3406 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3407 r
= mount_nofollow_verbose(LOG_DEBUG
, final
, workspace
, NULL
, MS_BIND
|MS_REC
, NULL
);
3409 if (!ERRNO_IS_PRIVILEGE(r
)) /* Propagate anything that isn't a permission problem */
3412 if (must_mount
) /* If it's not OK to use the plain directory
3413 * fallback, propagate all errors too */
3416 /* If we lack privileges to bind mount stuff, then let's gracefully
3417 * proceed for compat with container envs, and just use the final dir
3420 workspace_mounted
= false;
3422 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3423 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, workspace
, NULL
, MS_BIND
|MS_REMOUNT
|credentials_fs_mount_flags(/* ro= */ false), NULL
);
3427 workspace_mounted
= true;
3430 workspace_mounted
= true;
3433 assert(!must_mount
|| workspace_mounted
> 0);
3434 where
= workspace_mounted
? workspace
: final
;
3436 (void) label_fix_full(AT_FDCWD
, where
, final
, 0);
/* ownership_ok is only true if we have our own mount, i.e. workspace_mounted doubles as that flag */
3438 r
= acquire_credentials(context
, params
, unit
, where
, uid
, workspace_mounted
);
3442 if (workspace_mounted
) {
3445 /* Determine if we should actually install the prepared mount in the final location by bind
3446 * mounting it there. We do so only if the mount is not established there already, and if the
3447 * mount is actually non-empty (i.e. carries at least one credential). Note that in the best
3448 * case we are doing all this in a mount namespace, thus no one else will see that we
3449 * allocated a file system we are getting rid of again here. */
3451 install
= false; /* already installed */
3453 r
= dir_is_empty(where
, /* ignore_hidden_or_backup= */ false);
3457 install
= r
== 0; /* install only if non-empty */
3461 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3462 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, workspace
, NULL
, MS_BIND
|MS_REMOUNT
|credentials_fs_mount_flags(/* ro= */ true), NULL
);
3466 /* And mount it to the final place, read-only */
3467 r
= mount_nofollow_verbose(LOG_DEBUG
, workspace
, final
, NULL
, MS_MOVE
, NULL
);
3469 /* Otherwise get rid of it */
3470 r
= umount_verbose(LOG_DEBUG
, workspace
, MNT_DETACH
|UMOUNT_NOFOLLOW
);
3474 _cleanup_free_
char *parent
= NULL
;
3476 /* If we do not have our own mount but used the plain directory fallback, then we need to
3477 * open access to the top-level credential directory and the per-service directory now */
3479 r
= path_extract_directory(final
, &parent
);
3482 if (chmod(parent
, 0755) < 0)
/* Entry point for setting up a unit's credential directory below the runtime directory prefix. It creates
 * "<runtime prefix>/credentials" (0755) and the per-unit subdirectory (0700), then forks a helper in a new
 * mount namespace. In the child, propagation on /dev is turned off and /dev/shm serves as workspace, with
 * mounting required. If the fork fails for lack of privileges or support, a fallback using a workspace
 * below "<runtime prefix>/systemd/temporary-credentials/<unit>" is used in the calling process, where a
 * plain directory is acceptable.
 * NOTE(review): this listing elides parts of the parameter list, the early-return values and most error
 * checks — confirm against the complete file. */
3489 static int setup_credentials(
3490 const ExecContext
*context
,
3491 const ExecParameters
*params
,
3495 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
3501 if (!exec_context_has_credentials(context
))
3504 if (!params
->prefix
[EXEC_DIRECTORY_RUNTIME
])
3507 /* This is where we'll place stuff when we are done; this main credentials directory is world-readable,
3508 * and the subdir we mount over with a read-only file system readable by the service's user */
3509 q
= path_join(params
->prefix
[EXEC_DIRECTORY_RUNTIME
], "credentials");
3513 r
= mkdir_label(q
, 0755); /* top-level dir: world readable/searchable */
3514 if (r
< 0 && r
!= -EEXIST
)
3517 p
= path_join(q
, unit
);
3521 r
= mkdir_label(p
, 0700); /* per-unit dir: private to user */
3522 if (r
< 0 && r
!= -EEXIST
)
3525 r
= safe_fork("(sd-mkdcreds)", FORK_DEATHSIG
|FORK_WAIT
|FORK_NEW_MOUNTNS
, NULL
);
3527 _cleanup_free_
char *t
= NULL
, *u
= NULL
;
3529 /* If this is not a privilege or support issue then propagate the error */
3530 if (!ERRNO_IS_NOT_SUPPORTED(r
) && !ERRNO_IS_PRIVILEGE(r
))
3533 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3534 * it into place, so that users can't access half-initialized credential stores. */
3535 t
= path_join(params
->prefix
[EXEC_DIRECTORY_RUNTIME
], "systemd/temporary-credentials");
3539 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3540 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3541 * after it is fully set up */
3542 u
= path_join(t
, unit
);
3546 FOREACH_STRING(i
, t
, u
) {
3547 r
= mkdir_label(i
, 0700);
3548 if (r
< 0 && r
!= -EEXIST
)
3552 r
= setup_credentials_internal(
3556 p
, /* final mount point */
3557 u
, /* temporary workspace to overmount */
3558 true, /* reuse the workspace if it is already a mount */
3559 false, /* it's OK to fall back to a plain directory if we can't mount anything */
3562 (void) rmdir(u
); /* remove the workspace again if we can. */
3567 } else if (r
== 0) {
3569 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3570 * we can use the same directory for all cases, after turning off propagation. Question
3571 * though is: where do we turn off propagation exactly, and where do we place the workspace
3572 * directory? We need some place that is guaranteed to be a mount point in the host, and
3573 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3574 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3575 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3576 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3577 * we do here instead is use /dev/ and /dev/shm/ for our purposes. We know for sure that
3578 * /dev/ is a mount point and we know for sure that /dev/shm/ exists. Hence we can turn off
3579 * propagation on the former, and then overmount the latter.
3581 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3582 * for this purpose, but there are few other candidates that work equally well for us, and
3583 * given that we do this in a privately namespaced short-lived single-threaded process
3584 * that no one else sees this should be OK to do. */
3586 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, "/dev", NULL
, MS_SLAVE
|MS_REC
, NULL
); /* Turn off propagation from our namespace to host */
3590 r
= setup_credentials_internal(
3594 p
, /* final mount point */
3595 "/dev/shm", /* temporary workspace to overmount */
3596 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3597 true, /* insist that something is mounted, do not allow fallback to plain directory */
3602 _exit(EXIT_SUCCESS
);
3605 _exit(EXIT_FAILURE
);
3608 /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
3609 * try to remove it. This matters in particular if we created the dir as mount point but then didn't
3610 * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCES being
3611 * seen by users when trying to access this inode. */
/* Applies the SMACK label for the process about to be executed. If the context configures
 * smack_process_label it is applied directly; otherwise, if the manager has a default label, the
 * SMACK_ATTR_EXEC label is read from executable_fd and applied, falling back to the manager default when
 * the executable carries none (an absent xattr is not treated as an error).
 * NOTE(review): error returns are elided in this listing. */
3617 static int setup_smack(
3618 const Manager
*manager
,
3619 const ExecContext
*context
,
3620 int executable_fd
) {
3624 assert(executable_fd
>= 0);
3626 if (context
->smack_process_label
) {
3627 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
3630 } else if (manager
->default_smack_process_label
) {
3631 _cleanup_free_
char *exec_label
= NULL
;
3633 r
= mac_smack_read_fd(executable_fd
, SMACK_ATTR_EXEC
, &exec_label
);
3634 if (r
< 0 && !ERRNO_IS_XATTR_ABSENT(r
))
/* exec_label stays NULL if the executable has no label, in which case the manager default is used */
3637 r
= mac_smack_apply_pid(0, exec_label
?: manager
->default_smack_process_label
);
/* Compiles the list of bind mounts needed for a unit: the explicitly configured context->bind_mounts
 * (deep-copied) plus one entry per configured exec directory item that is not marked only_create. For
 * private exec directories without a root fs, the "<prefix>/private" directory is additionally collected
 * in the list of directories to overmount empty. Results are handed back through the three ret_*
 * parameters; ownership of the arrays passes to the caller.
 * NOTE(review): this listing elides several lines (declarations of n/h/r, early exits, error checks, the
 * else branches choosing the destination) — confirm against the complete file. */
3646 static int compile_bind_mounts(
3647 const ExecContext
*context
,
3648 const ExecParameters
*params
,
3649 BindMount
**ret_bind_mounts
,
3650 size_t *ret_n_bind_mounts
,
3651 char ***ret_empty_directories
) {
3653 _cleanup_strv_free_
char **empty_directories
= NULL
;
3654 BindMount
*bind_mounts
= NULL
;
3660 assert(ret_bind_mounts
);
3661 assert(ret_n_bind_mounts
);
3662 assert(ret_empty_directories
);
/* On early exit free the h entries filled in so far */
3664 CLEANUP_ARRAY(bind_mounts
, h
, bind_mount_free_many
);
/* First pass: count how many entries we are going to need */
3666 n
= context
->n_bind_mounts
;
3667 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
3668 if (!params
->prefix
[t
])
3671 for (size_t i
= 0; i
< context
->directories
[t
].n_items
; i
++)
3672 n
+= !context
->directories
[t
].items
[i
].only_create
;
3676 *ret_bind_mounts
= NULL
;
3677 *ret_n_bind_mounts
= 0;
3678 *ret_empty_directories
= NULL
;
3682 bind_mounts
= new(BindMount
, n
);
/* Second pass, part one: copy the explicitly configured bind mounts */
3686 for (size_t i
= 0; i
< context
->n_bind_mounts
; i
++) {
3687 BindMount
*item
= context
->bind_mounts
+ i
;
3688 _cleanup_free_
char *s
= NULL
, *d
= NULL
;
3690 s
= strdup(item
->source
);
3694 d
= strdup(item
->destination
);
3698 bind_mounts
[h
++] = (BindMount
) {
3699 .source
= TAKE_PTR(s
),
3700 .destination
= TAKE_PTR(d
),
3701 .read_only
= item
->read_only
,
3702 .recursive
= item
->recursive
,
3703 .ignore_enoent
= item
->ignore_enoent
,
/* Second pass, part two: add the exec directories */
3707 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
3708 if (!params
->prefix
[t
])
3711 if (context
->directories
[t
].n_items
== 0)
3714 if (exec_directory_is_private(context
, t
) &&
3715 !exec_context_with_rootfs(context
)) {
3718 /* So this is for a dynamic user, and we need to make sure the process can access its own
3719 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3720 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3722 private_root
= path_join(params
->prefix
[t
], "private");
3726 r
= strv_consume(&empty_directories
, private_root
);
3731 for (size_t i
= 0; i
< context
->directories
[t
].n_items
; i
++) {
3732 _cleanup_free_
char *s
= NULL
, *d
= NULL
;
3734 /* When one of the parent directories is in the list, we cannot create the symlink
3735 * for the child directory. See also the comments in setup_exec_directory(). */
3736 if (context
->directories
[t
].items
[i
].only_create
)
3739 if (exec_directory_is_private(context
, t
))
3740 s
= path_join(params
->prefix
[t
], "private", context
->directories
[t
].items
[i
].path
);
3742 s
= path_join(params
->prefix
[t
], context
->directories
[t
].items
[i
].path
);
3746 if (exec_directory_is_private(context
, t
) &&
3747 exec_context_with_rootfs(context
))
3748 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3749 * directory is not created on the root directory. So, let's bind-mount the directory
3750 * on the 'non-private' place. */
3751 d
= path_join(params
->prefix
[t
], context
->directories
[t
].items
[i
].path
);
3757 bind_mounts
[h
++] = (BindMount
) {
3758 .source
= TAKE_PTR(s
),
3759 .destination
= TAKE_PTR(d
),
3761 .nosuid
= context
->dynamic_user
, /* don't allow suid/sgid when DynamicUser= is on */
3763 .ignore_enoent
= false,
/* Success: hand everything over to the caller */
3770 *ret_bind_mounts
= TAKE_PTR(bind_mounts
);
3771 *ret_n_bind_mounts
= n
;
3772 *ret_empty_directories
= TAKE_PTR(empty_directories
);
3777 /* ret_symlinks will contain a list of pairs src:dest that describes
3778 * the symlinks to create later on. For example, the symlinks needed
3779 * to safely give private directories to DynamicUser=1 users.
 *
 * For every exec directory item two kinds of pairs are generated: one pair per configured symlink
 * (item path -> symlink name, both below the type's prefix), and — only for private directories
 * without a root fs that are not marked only_create — one pair "<prefix>/private/<path>" ->
 * "<prefix>/<path>".
 * NOTE(review): error checks and return statements are elided in this listing. */
3780 static int compile_symlinks(
3781 const ExecContext
*context
,
3782 const ExecParameters
*params
,
3783 char ***ret_symlinks
) {
3785 _cleanup_strv_free_
char **symlinks
= NULL
;
3790 assert(ret_symlinks
);
3792 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
3793 for (size_t i
= 0; i
< context
->directories
[dt
].n_items
; i
++) {
3794 _cleanup_free_
char *private_path
= NULL
, *path
= NULL
;
/* Explicitly configured symlinks for this directory item */
3796 STRV_FOREACH(symlink
, context
->directories
[dt
].items
[i
].symlinks
) {
3797 _cleanup_free_
char *src_abs
= NULL
, *dst_abs
= NULL
;
3799 src_abs
= path_join(params
->prefix
[dt
], context
->directories
[dt
].items
[i
].path
);
3800 dst_abs
= path_join(params
->prefix
[dt
], *symlink
);
3801 if (!src_abs
|| !dst_abs
)
3804 r
= strv_consume_pair(&symlinks
, TAKE_PTR(src_abs
), TAKE_PTR(dst_abs
));
/* The private -> public pair is only needed in the case checked below */
3809 if (!exec_directory_is_private(context
, dt
) ||
3810 exec_context_with_rootfs(context
) ||
3811 context
->directories
[dt
].items
[i
].only_create
)
3814 private_path
= path_join(params
->prefix
[dt
], "private", context
->directories
[dt
].items
[i
].path
);
3818 path
= path_join(params
->prefix
[dt
], context
->directories
[dt
].items
[i
].path
);
3822 r
= strv_consume_pair(&symlinks
, TAKE_PTR(private_path
), TAKE_PTR(path
));
3828 *ret_symlinks
= TAKE_PTR(symlinks
);
/* Decides whether file system namespacing is mandatory for this unit. The settings inspected below are:
 * temporary file systems, a root directory or image, mount images, DynamicUser=, extension images or
 * directories, bind mounts whose source and destination differ, and a log namespace.
 * NOTE(review): the return statements are elided in this listing; presumably each matching check
 * returns true and the function returns false otherwise — confirm against the complete file. */
3833 static bool insist_on_sandboxing(
3834 const ExecContext
*context
,
3835 const char *root_dir
,
3836 const char *root_image
,
3837 const BindMount
*bind_mounts
,
3838 size_t n_bind_mounts
) {
3841 assert(n_bind_mounts
== 0 || bind_mounts
);
3843 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3844 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3845 * rearrange stuff in a way we cannot ignore gracefully. */
3847 if (context
->n_temporary_filesystems
> 0)
3850 if (root_dir
|| root_image
)
3853 if (context
->n_mount_images
> 0)
3856 if (context
->dynamic_user
)
3859 if (context
->n_extension_images
> 0 || !strv_isempty(context
->extension_directories
))
3862 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3864 for (size_t i
= 0; i
< n_bind_mounts
; i
++)
3865 if (!path_equal(bind_mounts
[i
].source
, bind_mounts
[i
].destination
))
3868 if (context
->log_namespace
)
/* Creates the ephemeral copy of the root image or root directory, once per runtime object. The
 * ephemeral storage socket pair is used both as lock (posix_lock() on the first socket) and as a flag:
 * if an fd can be peeked from it, the copy has been set up already. Otherwise the root image is copied
 * with copy_file() (reflink where possible) or the root directory is snapshotted with
 * btrfs_subvol_snapshot_at(), copy-on-write is disabled on image copies on a best-effort basis, and the
 * resulting fd is queued on the socket.
 * NOTE(review): this listing elides several branch conditions and return statements.
 *
 * The failure of chattr_fd() is reported using its own return value 'r'. Passing 'fd' there would be
 * wrong: at that point fd is the valid descriptor of the copy, not an error code, so the logged errno
 * text would be unrelated to the actual failure. */
3874 static int setup_ephemeral(const ExecContext
*context
, ExecRuntime
*runtime
) {
3875 _cleanup_close_
int fd
= -EBADF
;
3878 if (!runtime
|| !runtime
->ephemeral_copy
)
3881 r
= posix_lock(runtime
->ephemeral_storage_socket
[0], LOCK_EX
);
3883 return log_debug_errno(r
, "Failed to lock ephemeral storage socket: %m");
3885 CLEANUP_POSIX_UNLOCK(runtime
->ephemeral_storage_socket
[0]);
/* MSG_PEEK: leave the fd queued, so that later invocations still see it */
3887 fd
= receive_one_fd(runtime
->ephemeral_storage_socket
[0], MSG_PEEK
|MSG_DONTWAIT
);
3889 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3893 return log_debug_errno(fd
, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3895 log_debug("Making ephemeral snapshot of %s to %s",
3896 context
->root_image
?: context
->root_directory
, runtime
->ephemeral_copy
);
3898 if (context
->root_image
)
3899 fd
= copy_file(context
->root_image
, runtime
->ephemeral_copy
, O_EXCL
, 0600,
3900 COPY_LOCK_BSD
|COPY_REFLINK
|COPY_CRTIME
);
3902 fd
= btrfs_subvol_snapshot_at(AT_FDCWD
, context
->root_directory
,
3903 AT_FDCWD
, runtime
->ephemeral_copy
,
3904 BTRFS_SNAPSHOT_FALLBACK_COPY
|
3905 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
3906 BTRFS_SNAPSHOT_RECURSIVE
|
3907 BTRFS_SNAPSHOT_LOCK_BSD
);
3909 return log_debug_errno(fd
, "Failed to snapshot %s to %s: %m",
3910 context
->root_image
?: context
->root_directory
, runtime
->ephemeral_copy
);
3912 if (context
->root_image
) {
3913 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3914 * which tends to not perform well in combination with lots of random writes.
3916 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3917 * copy, but we at least want to make the intention clear.
3919 r
= chattr_fd(fd
, FS_NOCOW_FL
, FS_NOCOW_FL
, NULL
);
3921 log_debug_errno(r
, "Failed to disable copy-on-write for %s, ignoring: %m", runtime
->ephemeral_copy
);
3924 r
= send_one_fd(runtime
->ephemeral_storage_socket
[1], fd
, MSG_DONTWAIT
);
3926 return log_debug_errno(r
, "Failed to queue file descriptor on ephemeral storage socket: %m");
/* Fills in a VeritySettings object from the explicitly configured values: copies of the root hash and
 * of the root hash signature (each setting the designator to PARTITION_ROOT), a copy of the verity data
 * path, and finally whatever verity_settings_load() picks up (called with, among others,
 * root_hash_sig_path).
 * NOTE(review): this listing elides the condition guarding the root hash copy, the other arguments of
 * verity_settings_load() and all error checks — confirm against the complete file. */
3931 static int verity_settings_prepare(
3932 VeritySettings
*verity
,
3933 const char *root_image
,
3934 const void *root_hash
,
3935 size_t root_hash_size
,
3936 const char *root_hash_path
,
3937 const void *root_hash_sig
,
3938 size_t root_hash_sig_size
,
3939 const char *root_hash_sig_path
,
3940 const char *verity_data_path
) {
3949 d
= memdup(root_hash
, root_hash_size
);
/* Replace any previously stored hash with our private copy */
3953 free_and_replace(verity
->root_hash
, d
);
3954 verity
->root_hash_size
= root_hash_size
;
3955 verity
->designator
= PARTITION_ROOT
;
3958 if (root_hash_sig
) {
3961 d
= memdup(root_hash_sig
, root_hash_sig_size
);
3965 free_and_replace(verity
->root_hash_sig
, d
);
3966 verity
->root_hash_sig_size
= root_hash_sig_size
;
3967 verity
->designator
= PARTITION_ROOT
;
3970 if (verity_data_path
) {
3971 r
= free_and_strdup(&verity
->data_path
, verity_data_path
);
3976 r
= verity_settings_load(
3980 root_hash_sig_path
);
3982 return log_debug_errno(r
, "Failed to load root hash: %m");
3987 static int apply_mount_namespace(
3989 ExecCommandFlags command_flags
,
3990 const ExecContext
*context
,
3991 const ExecParameters
*params
,
3992 ExecRuntime
*runtime
,
3993 const char *memory_pressure_path
,
3994 char **error_path
) {
3996 _cleanup_(verity_settings_done
) VeritySettings verity
= VERITY_SETTINGS_DEFAULT
;
3997 _cleanup_strv_free_
char **empty_directories
= NULL
, **symlinks
= NULL
,
3998 **read_write_paths_cleanup
= NULL
;
3999 _cleanup_free_
char *creds_path
= NULL
, *incoming_dir
= NULL
, *propagate_dir
= NULL
,
4000 *extension_dir
= NULL
;
4001 const char *root_dir
= NULL
, *root_image
= NULL
, *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
4002 char **read_write_paths
;
4003 NamespaceInfo ns_info
;
4004 bool needs_sandboxing
;
4005 BindMount
*bind_mounts
= NULL
;
4006 size_t n_bind_mounts
= 0;
4011 CLEANUP_ARRAY(bind_mounts
, n_bind_mounts
, bind_mount_free_many
);
4013 if (params
->flags
& EXEC_APPLY_CHROOT
) {
4014 r
= setup_ephemeral(context
, runtime
);
4018 if (context
->root_image
)
4019 root_image
= (runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_image
;
4021 root_dir
= (runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_directory
;
4024 r
= compile_bind_mounts(context
, params
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
4028 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
4029 r
= compile_symlinks(context
, params
, &symlinks
);
4033 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
4034 * service will need to write to it in order to start the notifications. */
4035 if (context
->protect_control_groups
&& memory_pressure_path
&& !streq(memory_pressure_path
, "/dev/null")) {
4036 read_write_paths_cleanup
= strv_copy(context
->read_write_paths
);
4037 if (!read_write_paths_cleanup
)
4040 r
= strv_extend(&read_write_paths_cleanup
, memory_pressure_path
);
4044 read_write_paths
= read_write_paths_cleanup
;
4046 read_write_paths
= context
->read_write_paths
;
4048 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command_flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
4049 if (needs_sandboxing
) {
4050 /* The runtime struct only contains the parent of the private /tmp,
4051 * which is non-accessible to world users. Inside of it there's a /tmp
4052 * that is sticky, and that's the one we want to use here.
4053 * This does not apply when we are using /run/systemd/empty as fallback. */
4055 if (context
->private_tmp
&& runtime
&& runtime
->shared
) {
4056 if (streq_ptr(runtime
->shared
->tmp_dir
, RUN_SYSTEMD_EMPTY
))
4057 tmp_dir
= runtime
->shared
->tmp_dir
;
4058 else if (runtime
->shared
->tmp_dir
)
4059 tmp_dir
= strjoina(runtime
->shared
->tmp_dir
, "/tmp");
4061 if (streq_ptr(runtime
->shared
->var_tmp_dir
, RUN_SYSTEMD_EMPTY
))
4062 var_tmp_dir
= runtime
->shared
->var_tmp_dir
;
4063 else if (runtime
->shared
->var_tmp_dir
)
4064 var_tmp_dir
= strjoina(runtime
->shared
->var_tmp_dir
, "/tmp");
4067 ns_info
= (NamespaceInfo
) {
4068 .ignore_protect_paths
= false,
4069 .private_dev
= context
->private_devices
,
4070 .protect_control_groups
= context
->protect_control_groups
,
4071 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
4072 .protect_kernel_modules
= context
->protect_kernel_modules
,
4073 .protect_kernel_logs
= context
->protect_kernel_logs
,
4074 .protect_hostname
= context
->protect_hostname
,
4075 .mount_apivfs
= exec_context_get_effective_mount_apivfs(context
),
4076 .protect_home
= context
->protect_home
,
4077 .protect_system
= context
->protect_system
,
4078 .protect_proc
= context
->protect_proc
,
4079 .proc_subset
= context
->proc_subset
,
4080 .private_network
= exec_needs_network_namespace(context
),
4081 .private_ipc
= exec_needs_ipc_namespace(context
),
4082 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
4083 .mount_nosuid
= context
->no_new_privileges
&& !mac_selinux_use(),
4085 } else if (!context
->dynamic_user
&& root_dir
)
4087 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
4088 * sandbox info, otherwise enforce it, don't ignore protected paths and
4089 * fail if we are enable to apply the sandbox inside the mount namespace.
4091 ns_info
= (NamespaceInfo
) {
4092 .ignore_protect_paths
= true,
4095 ns_info
= (NamespaceInfo
) {};
4097 if (context
->mount_propagation_flag
== MS_SHARED
)
4098 log_unit_debug(u
, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
4100 if (exec_context_has_credentials(context
) &&
4101 params
->prefix
[EXEC_DIRECTORY_RUNTIME
] &&
4102 FLAGS_SET(params
->flags
, EXEC_WRITE_CREDENTIALS
)) {
4103 creds_path
= path_join(params
->prefix
[EXEC_DIRECTORY_RUNTIME
], "credentials", u
->id
);
4108 if (params
->runtime_scope
== RUNTIME_SCOPE_SYSTEM
) {
4109 propagate_dir
= path_join("/run/systemd/propagate/", u
->id
);
4113 incoming_dir
= strdup("/run/systemd/incoming");
4117 extension_dir
= strdup("/run/systemd/unit-extensions");
4121 assert(params
->runtime_scope
== RUNTIME_SCOPE_USER
);
4123 if (asprintf(&extension_dir
, "/run/user/" UID_FMT
"/systemd/unit-extensions", geteuid()) < 0)
4128 r
= verity_settings_prepare(
4131 context
->root_hash
, context
->root_hash_size
, context
->root_hash_path
,
4132 context
->root_hash_sig
, context
->root_hash_sig_size
, context
->root_hash_sig_path
,
4133 context
->root_verity
);
4138 r
= setup_namespace(
4141 context
->root_image_options
,
4142 context
->root_image_policy
?: &image_policy_service
,
4145 needs_sandboxing
? context
->read_only_paths
: NULL
,
4146 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
4147 needs_sandboxing
? context
->exec_paths
: NULL
,
4148 needs_sandboxing
? context
->no_exec_paths
: NULL
,
4153 context
->temporary_filesystems
,
4154 context
->n_temporary_filesystems
,
4155 context
->mount_images
,
4156 context
->n_mount_images
,
4157 context
->mount_image_policy
?: &image_policy_service
,
4161 context
->log_namespace
,
4162 context
->mount_propagation_flag
,
4164 context
->extension_images
,
4165 context
->n_extension_images
,
4166 context
->extension_image_policy
?: &image_policy_sysext
,
4167 context
->extension_directories
,
4171 root_dir
|| root_image
? params
->notify_socket
: NULL
,
4174 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
4175 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
4176 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
4177 * completely different execution environment. */
4179 if (insist_on_sandboxing(
4181 root_dir
, root_image
,
4184 return log_unit_debug_errno(u
,
4185 SYNTHETIC_ERRNO(EOPNOTSUPP
),
4186 "Failed to set up namespace, and refusing to continue since "
4187 "the selected namespacing options alter mount environment non-trivially.\n"
4188 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
4190 context
->n_temporary_filesystems
,
4193 yes_no(context
->dynamic_user
));
4195 log_unit_debug(u
, "Failed to set up namespace, assuming containerized execution and ignoring.");
4202 static int apply_working_directory(
4203 const ExecContext
*context
,
4204 const ExecParameters
*params
,
4205 ExecRuntime
*runtime
,
4212 assert(exit_status
);
4214 if (context
->working_directory_home
) {
4217 *exit_status
= EXIT_CHDIR
;
4224 wd
= empty_to_root(context
->working_directory
);
4226 if (params
->flags
& EXEC_APPLY_CHROOT
)
4229 d
= prefix_roota((runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_directory
, wd
);
4231 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
4232 *exit_status
= EXIT_CHDIR
;
4239 static int apply_root_directory(
4240 const ExecContext
*context
,
4241 const ExecParameters
*params
,
4242 ExecRuntime
*runtime
,
4243 const bool needs_mount_ns
,
4247 assert(exit_status
);
4249 if (params
->flags
& EXEC_APPLY_CHROOT
)
4250 if (!needs_mount_ns
&& context
->root_directory
)
4251 if (chroot((runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_directory
) < 0) {
4252 *exit_status
= EXIT_CHROOT
;
4259 static int setup_keyring(
4261 const ExecContext
*context
,
4262 const ExecParameters
*p
,
4263 uid_t uid
, gid_t gid
) {
4265 key_serial_t keyring
;
4274 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
4275 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
4276 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
4277 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
4278 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
4279 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
4281 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
4284 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
4285 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
4286 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
4287 * & group is just as nasty as acquiring a reference to the user keyring. */
4289 saved_uid
= getuid();
4290 saved_gid
= getgid();
4292 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
4293 if (setregid(gid
, -1) < 0)
4294 return log_unit_error_errno(u
, errno
, "Failed to change GID for user keyring: %m");
4297 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
4298 if (setreuid(uid
, -1) < 0) {
4299 r
= log_unit_error_errno(u
, errno
, "Failed to change UID for user keyring: %m");
4304 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
4305 if (keyring
== -1) {
4306 if (errno
== ENOSYS
)
4307 log_unit_debug_errno(u
, errno
, "Kernel keyring not supported, ignoring.");
4308 else if (ERRNO_IS_PRIVILEGE(errno
))
4309 log_unit_debug_errno(u
, errno
, "Kernel keyring access prohibited, ignoring.");
4310 else if (errno
== EDQUOT
)
4311 log_unit_debug_errno(u
, errno
, "Out of kernel keyrings to allocate, ignoring.");
4313 r
= log_unit_error_errno(u
, errno
, "Setting up kernel keyring failed: %m");
4318 /* When requested link the user keyring into the session keyring. */
4319 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
4321 if (keyctl(KEYCTL_LINK
,
4322 KEY_SPEC_USER_KEYRING
,
4323 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
4324 r
= log_unit_error_errno(u
, errno
, "Failed to link user keyring into session keyring: %m");
4329 /* Restore uid/gid back */
4330 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
4331 if (setreuid(saved_uid
, -1) < 0) {
4332 r
= log_unit_error_errno(u
, errno
, "Failed to change UID back for user keyring: %m");
4337 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
4338 if (setregid(saved_gid
, -1) < 0)
4339 return log_unit_error_errno(u
, errno
, "Failed to change GID back for user keyring: %m");
4342 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
4343 if (!sd_id128_is_null(u
->invocation_id
)) {
4346 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
4348 log_unit_debug_errno(u
, errno
, "Failed to add invocation ID to keyring, ignoring: %m");
4350 if (keyctl(KEYCTL_SETPERM
, key
,
4351 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
4352 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
4353 r
= log_unit_error_errno(u
, errno
, "Failed to restrict invocation ID permission: %m");
4358 /* Revert back uid & gid for the last time, and exit */
4359 /* no extra logging, as only the first already reported error matters */
4360 if (getuid() != saved_uid
)
4361 (void) setreuid(saved_uid
, -1);
4363 if (getgid() != saved_gid
)
4364 (void) setregid(saved_gid
, -1);
4369 static void append_socket_pair(int *array
, size_t *n
, const int pair
[static 2]) {
4375 array
[(*n
)++] = pair
[0];
4377 array
[(*n
)++] = pair
[1];
4380 static int close_remaining_fds(
4381 const ExecParameters
*params
,
4382 const ExecRuntime
*runtime
,
4385 const int *fds
, size_t n_fds
) {
4387 size_t n_dont_close
= 0;
4388 int dont_close
[n_fds
+ 14];
4392 if (params
->stdin_fd
>= 0)
4393 dont_close
[n_dont_close
++] = params
->stdin_fd
;
4394 if (params
->stdout_fd
>= 0)
4395 dont_close
[n_dont_close
++] = params
->stdout_fd
;
4396 if (params
->stderr_fd
>= 0)
4397 dont_close
[n_dont_close
++] = params
->stderr_fd
;
4400 dont_close
[n_dont_close
++] = socket_fd
;
4402 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
4403 n_dont_close
+= n_fds
;
4407 append_socket_pair(dont_close
, &n_dont_close
, runtime
->ephemeral_storage_socket
);
4409 if (runtime
&& runtime
->shared
) {
4410 append_socket_pair(dont_close
, &n_dont_close
, runtime
->shared
->netns_storage_socket
);
4411 append_socket_pair(dont_close
, &n_dont_close
, runtime
->shared
->ipcns_storage_socket
);
4414 if (runtime
&& runtime
->dynamic_creds
) {
4415 if (runtime
->dynamic_creds
->user
)
4416 append_socket_pair(dont_close
, &n_dont_close
, runtime
->dynamic_creds
->user
->storage_socket
);
4417 if (runtime
->dynamic_creds
->group
)
4418 append_socket_pair(dont_close
, &n_dont_close
, runtime
->dynamic_creds
->group
->storage_socket
);
4421 if (user_lookup_fd
>= 0)
4422 dont_close
[n_dont_close
++] = user_lookup_fd
;
4424 return close_all_fds(dont_close
, n_dont_close
);
4427 static int send_user_lookup(
4435 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
4436 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
4439 if (user_lookup_fd
< 0)
4442 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
4445 if (writev(user_lookup_fd
,
4447 IOVEC_MAKE(&uid
, sizeof(uid
)),
4448 IOVEC_MAKE(&gid
, sizeof(gid
)),
4449 IOVEC_MAKE_STRING(unit
->id
) }, 3) < 0)
4455 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
4462 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
4467 if (!c
->working_directory_home
)
4470 r
= get_home_dir(buf
);
4478 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
4479 _cleanup_strv_free_
char ** list
= NULL
;
4486 assert(c
->dynamic_user
);
4488 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
4489 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
4492 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
4493 if (t
== EXEC_DIRECTORY_CONFIGURATION
)
4499 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
4502 if (exec_directory_is_private(c
, t
))
4503 e
= path_join(p
->prefix
[t
], "private", c
->directories
[t
].items
[i
].path
);
4505 e
= path_join(p
->prefix
[t
], c
->directories
[t
].items
[i
].path
);
4509 r
= strv_consume(&list
, e
);
4515 *ret
= TAKE_PTR(list
);
4520 static int exec_parameters_get_cgroup_path(
4521 const ExecParameters
*params
,
4522 const CGroupContext
*c
,
4525 const char *subgroup
= NULL
;
4531 if (!params
->cgroup_path
)
4534 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4535 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4536 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4537 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4538 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4539 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4540 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4541 * flag, which is only passed for the former statements, not for the latter. */
4543 if (FLAGS_SET(params
->flags
, EXEC_CGROUP_DELEGATE
) && (FLAGS_SET(params
->flags
, EXEC_CONTROL_CGROUP
) || c
->delegate_subgroup
)) {
4544 if (FLAGS_SET(params
->flags
, EXEC_IS_CONTROL
))
4545 subgroup
= ".control";
4547 subgroup
= c
->delegate_subgroup
;
4551 p
= path_join(params
->cgroup_path
, subgroup
);
4553 p
= strdup(params
->cgroup_path
);
4561 static int exec_context_cpu_affinity_from_numa(const ExecContext
*c
, CPUSet
*ret
) {
4562 _cleanup_(cpu_set_reset
) CPUSet s
= {};
4568 if (!c
->numa_policy
.nodes
.set
) {
4569 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4573 r
= numa_to_cpu_set(&c
->numa_policy
, &s
);
4579 return cpu_set_add_all(ret
, &s
);
4582 bool exec_context_get_cpu_affinity_from_numa(const ExecContext
*c
) {
4585 return c
->cpu_affinity_from_numa
;
4588 static int add_shifted_fd(int *fds
, size_t fds_size
, size_t *n_fds
, int fd
, int *ret_fd
) {
4593 assert(*n_fds
< fds_size
);
4601 if (fd
< 3 + (int) *n_fds
) {
4602 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4603 * the fds we pass to the process (or which are closed only during execve). */
4605 r
= fcntl(fd
, F_DUPFD_CLOEXEC
, 3 + (int) *n_fds
);
4609 close_and_replace(fd
, r
);
4612 *ret_fd
= fds
[*n_fds
] = fd
;
4617 static int connect_unix_harder(Unit
*u
, const OpenFile
*of
, int ofd
) {
4618 union sockaddr_union addr
= {
4619 .un
.sun_family
= AF_UNIX
,
4622 static const int socket_types
[] = { SOCK_DGRAM
, SOCK_STREAM
, SOCK_SEQPACKET
};
4629 r
= sockaddr_un_set_path(&addr
.un
, FORMAT_PROC_FD_PATH(ofd
));
4631 return log_unit_error_errno(u
, r
, "Failed to set sockaddr for %s: %m", of
->path
);
4635 for (size_t i
= 0; i
< ELEMENTSOF(socket_types
); i
++) {
4636 _cleanup_close_
int fd
= -EBADF
;
4638 fd
= socket(AF_UNIX
, socket_types
[i
] | SOCK_CLOEXEC
, 0);
4640 return log_unit_error_errno(u
, errno
, "Failed to create socket for %s: %m", of
->path
);
4642 r
= RET_NERRNO(connect(fd
, &addr
.sa
, sa_len
));
4643 if (r
== -EPROTOTYPE
)
4646 return log_unit_error_errno(u
, r
, "Failed to connect socket for %s: %m", of
->path
);
4651 return log_unit_error_errno(u
, SYNTHETIC_ERRNO(EPROTOTYPE
), "Failed to connect socket for \"%s\".", of
->path
);
4654 static int get_open_file_fd(Unit
*u
, const OpenFile
*of
) {
4656 _cleanup_close_
int fd
= -EBADF
, ofd
= -EBADF
;
4661 ofd
= open(of
->path
, O_PATH
| O_CLOEXEC
);
4663 return log_unit_error_errno(u
, errno
, "Could not open \"%s\": %m", of
->path
);
4665 if (fstat(ofd
, &st
) < 0)
4666 return log_unit_error_errno(u
, errno
, "Failed to stat %s: %m", of
->path
);
4668 if (S_ISSOCK(st
.st_mode
)) {
4669 fd
= connect_unix_harder(u
, of
, ofd
);
4673 if (FLAGS_SET(of
->flags
, OPENFILE_READ_ONLY
) && shutdown(fd
, SHUT_WR
) < 0)
4674 return log_unit_error_errno(u
, errno
, "Failed to shutdown send for socket %s: %m",
4677 log_unit_debug(u
, "socket %s opened (fd=%d)", of
->path
, fd
);
4679 int flags
= FLAGS_SET(of
->flags
, OPENFILE_READ_ONLY
) ? O_RDONLY
: O_RDWR
;
4680 if (FLAGS_SET(of
->flags
, OPENFILE_APPEND
))
4682 else if (FLAGS_SET(of
->flags
, OPENFILE_TRUNCATE
))
4685 fd
= fd_reopen(ofd
, flags
| O_CLOEXEC
);
4687 return log_unit_error_errno(u
, fd
, "Failed to open file %s: %m", of
->path
);
4689 log_unit_debug(u
, "file %s opened (fd=%d)", of
->path
, fd
);
4695 static int collect_open_file_fds(
4697 OpenFile
* open_files
,
4708 LIST_FOREACH(open_files
, of
, open_files
) {
4709 _cleanup_close_
int fd
= -EBADF
;
4711 fd
= get_open_file_fd(u
, of
);
4713 if (FLAGS_SET(of
->flags
, OPENFILE_GRACEFUL
)) {
4714 log_unit_debug_errno(u
, fd
, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of
->path
);
4721 if (!GREEDY_REALLOC(*fds
, *n_fds
+ 1))
4724 r
= strv_extend(fdnames
, of
->fdname
);
4728 (*fds
)[*n_fds
] = TAKE_FD(fd
);
4736 static void log_command_line(Unit
*unit
, const char *msg
, const char *executable
, char **argv
) {
4744 _cleanup_free_
char *cmdline
= quote_command_line(argv
, SHELL_ESCAPE_EMPTY
);
4746 log_unit_struct(unit
, LOG_DEBUG
,
4747 "EXECUTABLE=%s", executable
,
4748 LOG_UNIT_MESSAGE(unit
, "%s: %s", msg
, strnull(cmdline
)),
4749 LOG_UNIT_INVOCATION_ID(unit
));
4752 static bool exec_context_need_unprivileged_private_users(
4753 const ExecContext
*context
,
4754 const ExecParameters
*params
) {
4759 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
4760 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
4761 * (system manager) then we have privileges and don't need this. */
4762 if (params
->runtime_scope
!= RUNTIME_SCOPE_USER
)
4765 return context
->private_users
||
4766 context
->private_tmp
||
4767 context
->private_devices
||
4768 context
->private_network
||
4769 context
->network_namespace_path
||
4770 context
->private_ipc
||
4771 context
->ipc_namespace_path
||
4772 context
->private_mounts
> 0 ||
4773 context
->mount_apivfs
||
4774 context
->n_bind_mounts
> 0 ||
4775 context
->n_temporary_filesystems
> 0 ||
4776 context
->root_directory
||
4777 !strv_isempty(context
->extension_directories
) ||
4778 context
->protect_system
!= PROTECT_SYSTEM_NO
||
4779 context
->protect_home
!= PROTECT_HOME_NO
||
4780 context
->protect_kernel_tunables
||
4781 context
->protect_kernel_modules
||
4782 context
->protect_kernel_logs
||
4783 context
->protect_control_groups
||
4784 context
->protect_clock
||
4785 context
->protect_hostname
||
4786 !strv_isempty(context
->read_write_paths
) ||
4787 !strv_isempty(context
->read_only_paths
) ||
4788 !strv_isempty(context
->inaccessible_paths
) ||
4789 !strv_isempty(context
->exec_paths
) ||
4790 !strv_isempty(context
->no_exec_paths
);
4793 static int exec_child(
4795 const ExecCommand
*command
,
4796 const ExecContext
*context
,
4797 const ExecParameters
*params
,
4798 ExecRuntime
*runtime
,
4799 const CGroupContext
*cgroup_context
,
4801 const int named_iofds
[static 3],
4803 size_t n_socket_fds
,
4804 size_t n_storage_fds
,
4809 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **joined_exec_search_path
= NULL
, **accum_env
= NULL
, **replaced_argv
= NULL
;
4810 int r
, ngids
= 0, exec_fd
;
4811 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
4812 const char *username
= NULL
, *groupname
= NULL
;
4813 _cleanup_free_
char *home_buffer
= NULL
, *memory_pressure_path
= NULL
;
4814 const char *home
= NULL
, *shell
= NULL
;
4815 char **final_argv
= NULL
;
4816 dev_t journal_stream_dev
= 0;
4817 ino_t journal_stream_ino
= 0;
4818 bool userns_set_up
= false;
4819 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4820 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
4821 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
4822 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
4824 _cleanup_free_
char *mac_selinux_context_net
= NULL
;
4825 bool use_selinux
= false;
4828 bool use_smack
= false;
4831 bool use_apparmor
= false;
4833 uid_t saved_uid
= getuid();
4834 gid_t saved_gid
= getgid();
4835 uid_t uid
= UID_INVALID
;
4836 gid_t gid
= GID_INVALID
;
4837 size_t n_fds
= n_socket_fds
+ n_storage_fds
, /* fds to pass to the child */
4838 n_keep_fds
; /* total number of fds not to close */
4840 _cleanup_free_ gid_t
*gids_after_pam
= NULL
;
4841 int ngids_after_pam
= 0;
4842 _cleanup_free_
int *fds
= NULL
;
4843 _cleanup_strv_free_
char **fdnames
= NULL
;
4849 assert(exit_status
);
4851 /* Explicitly test for CVE-2021-4034 inspired invocations */
4852 assert(command
->path
);
4853 assert(!strv_isempty(command
->argv
));
4855 rename_process_from_path(command
->path
);
4857 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4858 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4859 * both of which will be demoted to SIG_DFL. */
4860 (void) default_signals(SIGNALS_CRASH_HANDLER
,
4863 if (context
->ignore_sigpipe
)
4864 (void) ignore_signals(SIGPIPE
);
4866 r
= reset_signal_mask();
4868 *exit_status
= EXIT_SIGNAL_MASK
;
4869 return log_unit_error_errno(unit
, r
, "Failed to set process signal mask: %m");
4872 if (params
->idle_pipe
)
4873 do_idle_pipe_dance(params
->idle_pipe
);
4875 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4876 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4877 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4878 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4881 log_set_open_when_needed(true);
4882 log_settle_target();
4884 /* In case anything used libc syslog(), close this here, too */
4887 fds
= newdup(int, params_fds
, n_fds
);
4889 *exit_status
= EXIT_MEMORY
;
4893 fdnames
= strv_copy((char**) params
->fd_names
);
4895 *exit_status
= EXIT_MEMORY
;
4899 r
= collect_open_file_fds(unit
, params
->open_files
, &fds
, &fdnames
, &n_fds
);
4901 *exit_status
= EXIT_FDS
;
4902 return log_unit_error_errno(unit
, r
, "Failed to get OpenFile= file descriptors: %m");
4905 int keep_fds
[n_fds
+ 3];
4906 memcpy_safe(keep_fds
, fds
, n_fds
* sizeof(int));
4909 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, params
->exec_fd
, &exec_fd
);
4911 *exit_status
= EXIT_FDS
;
4912 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
4916 if (unit
->manager
->restrict_fs
) {
4917 int bpf_map_fd
= lsm_bpf_map_restrict_fs_fd(unit
);
4918 if (bpf_map_fd
< 0) {
4919 *exit_status
= EXIT_FDS
;
4920 return log_unit_error_errno(unit
, bpf_map_fd
, "Failed to get restrict filesystems BPF map fd: %m");
4923 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, bpf_map_fd
, &bpf_map_fd
);
4925 *exit_status
= EXIT_FDS
;
4926 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
4931 r
= close_remaining_fds(params
, runtime
, user_lookup_fd
, socket_fd
, keep_fds
, n_keep_fds
);
4933 *exit_status
= EXIT_FDS
;
4934 return log_unit_error_errno(unit
, r
, "Failed to close unwanted file descriptors: %m");
4937 if (!context
->same_pgrp
&&
4939 *exit_status
= EXIT_SETSID
;
4940 return log_unit_error_errno(unit
, errno
, "Failed to create new process session: %m");
4943 exec_context_tty_reset(context
, params
);
4945 if (unit_shall_confirm_spawn(unit
)) {
4946 _cleanup_free_
char *cmdline
= NULL
;
4948 cmdline
= quote_command_line(command
->argv
, SHELL_ESCAPE_EMPTY
);
4950 *exit_status
= EXIT_MEMORY
;
4954 r
= ask_for_confirmation(context
, params
->confirm_spawn
, unit
, cmdline
);
4955 if (r
!= CONFIRM_EXECUTE
) {
4956 if (r
== CONFIRM_PRETEND_SUCCESS
) {
4957 *exit_status
= EXIT_SUCCESS
;
4960 *exit_status
= EXIT_CONFIRM
;
4961 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ECANCELED
),
4962 "Execution cancelled by the user");
4966 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4967 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4968 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4969 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4970 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4971 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit
->id
, true) != 0 ||
4972 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params
->runtime_scope
), true) != 0) {
4973 *exit_status
= EXIT_MEMORY
;
4974 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
4977 if (context
->dynamic_user
&& runtime
&& runtime
->dynamic_creds
) {
4978 _cleanup_strv_free_
char **suggested_paths
= NULL
;
4980 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4981 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4982 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4983 *exit_status
= EXIT_USER
;
4984 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
4987 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
4989 *exit_status
= EXIT_MEMORY
;
4993 r
= dynamic_creds_realize(runtime
->dynamic_creds
, suggested_paths
, &uid
, &gid
);
4995 *exit_status
= EXIT_USER
;
4997 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
4998 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4999 return log_unit_error_errno(unit
, r
, "Failed to update dynamic user credentials: %m");
5002 if (!uid_is_valid(uid
)) {
5003 *exit_status
= EXIT_USER
;
5004 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ESRCH
), "UID validation failed for \""UID_FMT
"\"", uid
);
5007 if (!gid_is_valid(gid
)) {
5008 *exit_status
= EXIT_USER
;
5009 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ESRCH
), "GID validation failed for \""GID_FMT
"\"", gid
);
5012 if (runtime
->dynamic_creds
->user
)
5013 username
= runtime
->dynamic_creds
->user
->name
;
5016 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
5018 *exit_status
= EXIT_USER
;
5019 return log_unit_error_errno(unit
, r
, "Failed to determine user credentials: %m");
5022 r
= get_fixed_group(context
, &groupname
, &gid
);
5024 *exit_status
= EXIT_GROUP
;
5025 return log_unit_error_errno(unit
, r
, "Failed to determine group credentials: %m");
5029 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
5030 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
5031 &supplementary_gids
, &ngids
);
5033 *exit_status
= EXIT_GROUP
;
5034 return log_unit_error_errno(unit
, r
, "Failed to determine supplementary groups: %m");
5037 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
5039 *exit_status
= EXIT_USER
;
5040 return log_unit_error_errno(unit
, r
, "Failed to send user credentials to PID1: %m");
5043 user_lookup_fd
= safe_close(user_lookup_fd
);
5045 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
5047 *exit_status
= EXIT_CHDIR
;
5048 return log_unit_error_errno(unit
, r
, "Failed to determine $HOME for user: %m");
5051 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
5053 (void) fd_nonblock(socket_fd
, false);
5055 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
5056 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
5057 if (params
->cgroup_path
) {
5058 _cleanup_free_
char *p
= NULL
;
5060 r
= exec_parameters_get_cgroup_path(params
, cgroup_context
, &p
);
5062 *exit_status
= EXIT_CGROUP
;
5063 return log_unit_error_errno(unit
, r
, "Failed to acquire cgroup path: %m");
5066 r
= cg_attach_everywhere(params
->cgroup_supported
, p
, 0, NULL
, NULL
);
5067 if (r
== -EUCLEAN
) {
5068 *exit_status
= EXIT_CGROUP
;
5069 return log_unit_error_errno(unit
, r
, "Failed to attach process to cgroup %s "
5070 "because the cgroup or one of its parents or "
5071 "siblings is in the threaded mode: %m", p
);
5074 *exit_status
= EXIT_CGROUP
;
5075 return log_unit_error_errno(unit
, r
, "Failed to attach to cgroup %s: %m", p
);
5079 if (context
->network_namespace_path
&& runtime
&& runtime
->shared
&& runtime
->shared
->netns_storage_socket
[0] >= 0) {
5080 r
= open_shareable_ns_path(runtime
->shared
->netns_storage_socket
, context
->network_namespace_path
, CLONE_NEWNET
);
5082 *exit_status
= EXIT_NETWORK
;
5083 return log_unit_error_errno(unit
, r
, "Failed to open network namespace path %s: %m", context
->network_namespace_path
);
5087 if (context
->ipc_namespace_path
&& runtime
&& runtime
->shared
&& runtime
->shared
->ipcns_storage_socket
[0] >= 0) {
5088 r
= open_shareable_ns_path(runtime
->shared
->ipcns_storage_socket
, context
->ipc_namespace_path
, CLONE_NEWIPC
);
5090 *exit_status
= EXIT_NAMESPACE
;
5091 return log_unit_error_errno(unit
, r
, "Failed to open IPC namespace path %s: %m", context
->ipc_namespace_path
);
5095 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
5097 *exit_status
= EXIT_STDIN
;
5098 return log_unit_error_errno(unit
, r
, "Failed to set up standard input: %m");
5101 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
5103 *exit_status
= EXIT_STDOUT
;
5104 return log_unit_error_errno(unit
, r
, "Failed to set up standard output: %m");
5107 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
5109 *exit_status
= EXIT_STDERR
;
5110 return log_unit_error_errno(unit
, r
, "Failed to set up standard error output: %m");
5113 if (context
->oom_score_adjust_set
) {
5114 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
5115 * prohibit write access to this file, and we shouldn't trip up over that. */
5116 r
= set_oom_score_adjust(context
->oom_score_adjust
);
5117 if (ERRNO_IS_PRIVILEGE(r
))
5118 log_unit_debug_errno(unit
, r
, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
5120 *exit_status
= EXIT_OOM_ADJUST
;
5121 return log_unit_error_errno(unit
, r
, "Failed to adjust OOM setting: %m");
5125 if (context
->coredump_filter_set
) {
5126 r
= set_coredump_filter(context
->coredump_filter
);
5127 if (ERRNO_IS_PRIVILEGE(r
))
5128 log_unit_debug_errno(unit
, r
, "Failed to adjust coredump_filter, ignoring: %m");
5130 return log_unit_error_errno(unit
, r
, "Failed to adjust coredump_filter: %m");
5133 if (context
->nice_set
) {
5134 r
= setpriority_closest(context
->nice
);
5136 return log_unit_error_errno(unit
, r
, "Failed to set up process scheduling priority (nice level): %m");
5139 if (context
->cpu_sched_set
) {
5140 struct sched_param param
= {
5141 .sched_priority
= context
->cpu_sched_priority
,
5144 r
= sched_setscheduler(0,
5145 context
->cpu_sched_policy
|
5146 (context
->cpu_sched_reset_on_fork
?
5147 SCHED_RESET_ON_FORK
: 0),
5150 *exit_status
= EXIT_SETSCHEDULER
;
5151 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU scheduling: %m");
5155 if (context
->cpu_affinity_from_numa
|| context
->cpu_set
.set
) {
5156 _cleanup_(cpu_set_reset
) CPUSet converted_cpu_set
= {};
5157 const CPUSet
*cpu_set
;
5159 if (context
->cpu_affinity_from_numa
) {
5160 r
= exec_context_cpu_affinity_from_numa(context
, &converted_cpu_set
);
5162 *exit_status
= EXIT_CPUAFFINITY
;
5163 return log_unit_error_errno(unit
, r
, "Failed to derive CPU affinity mask from NUMA mask: %m");
5166 cpu_set
= &converted_cpu_set
;
5168 cpu_set
= &context
->cpu_set
;
5170 if (sched_setaffinity(0, cpu_set
->allocated
, cpu_set
->set
) < 0) {
5171 *exit_status
= EXIT_CPUAFFINITY
;
5172 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU affinity: %m");
5176 if (mpol_is_valid(numa_policy_get_type(&context
->numa_policy
))) {
5177 r
= apply_numa_policy(&context
->numa_policy
);
5179 if (ERRNO_IS_NOT_SUPPORTED(r
))
5180 log_unit_debug_errno(unit
, r
, "NUMA support not available, ignoring.");
5182 *exit_status
= EXIT_NUMA_POLICY
;
5183 return log_unit_error_errno(unit
, r
, "Failed to set NUMA memory policy: %m");
5188 if (context
->ioprio_set
)
5189 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
5190 *exit_status
= EXIT_IOPRIO
;
5191 return log_unit_error_errno(unit
, errno
, "Failed to set up IO scheduling priority: %m");
5194 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
5195 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
5196 *exit_status
= EXIT_TIMERSLACK
;
5197 return log_unit_error_errno(unit
, errno
, "Failed to set up timer slack: %m");
5200 if (context
->personality
!= PERSONALITY_INVALID
) {
5201 r
= safe_personality(context
->personality
);
5203 *exit_status
= EXIT_PERSONALITY
;
5204 return log_unit_error_errno(unit
, r
, "Failed to set up execution domain (personality): %m");
5208 if (context
->utmp_id
) {
5209 const char *line
= context
->tty_path
?
5210 (path_startswith(context
->tty_path
, "/dev/") ?: context
->tty_path
) :
5212 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
5214 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
5215 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
5220 if (uid_is_valid(uid
)) {
5221 r
= chown_terminal(STDIN_FILENO
, uid
);
5223 *exit_status
= EXIT_STDIN
;
5224 return log_unit_error_errno(unit
, r
, "Failed to change ownership of terminal: %m");
5228 if (params
->cgroup_path
) {
5229 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
5230 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
5231 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
5232 * touch a single hierarchy too. */
5234 if (params
->flags
& EXEC_CGROUP_DELEGATE
) {
5235 _cleanup_free_
char *p
= NULL
;
5237 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, uid
, gid
);
5239 *exit_status
= EXIT_CGROUP
;
5240 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
5243 r
= exec_parameters_get_cgroup_path(params
, cgroup_context
, &p
);
5245 *exit_status
= EXIT_CGROUP
;
5246 return log_unit_error_errno(unit
, r
, "Failed to acquire cgroup path: %m");
5249 r
= cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER
, p
, uid
, gid
);
5251 *exit_status
= EXIT_CGROUP
;
5252 return log_unit_error_errno(unit
, r
, "Failed to adjust control subgroup access: %m");
5257 if (cgroup_context
&& cg_unified() > 0 && is_pressure_supported() > 0) {
5258 if (cgroup_context_want_memory_pressure(cgroup_context
)) {
5259 r
= cg_get_path("memory", params
->cgroup_path
, "memory.pressure", &memory_pressure_path
);
5261 *exit_status
= EXIT_MEMORY
;
5265 r
= chmod_and_chown(memory_pressure_path
, 0644, uid
, gid
);
5267 log_unit_full_errno(unit
, r
== -ENOENT
|| ERRNO_IS_PRIVILEGE(r
) ? LOG_DEBUG
: LOG_WARNING
, r
,
5268 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path
);
5269 memory_pressure_path
= mfree(memory_pressure_path
);
5271 } else if (cgroup_context
->memory_pressure_watch
== CGROUP_PRESSURE_WATCH_OFF
) {
5272 memory_pressure_path
= strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
5273 if (!memory_pressure_path
) {
5274 *exit_status
= EXIT_MEMORY
;
5281 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
5283 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
5284 r
= setup_exec_directory(unit
, context
, params
, uid
, gid
, dt
, needs_mount_namespace
, exit_status
);
5286 return log_unit_error_errno(unit
, r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
5289 if (FLAGS_SET(params
->flags
, EXEC_WRITE_CREDENTIALS
)) {
5290 r
= setup_credentials(context
, params
, unit
->id
, uid
);
5292 *exit_status
= EXIT_CREDENTIALS
;
5293 return log_unit_error_errno(unit
, r
, "Failed to set up credentials: %m");
5297 r
= build_environment(
5309 memory_pressure_path
,
5312 *exit_status
= EXIT_MEMORY
;
5316 r
= build_pass_environment(context
, &pass_env
);
5318 *exit_status
= EXIT_MEMORY
;
5322 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
5323 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
5324 * not specify PATH but the unit has ExecSearchPath. */
5325 if (!strv_isempty(context
->exec_search_path
)) {
5326 _cleanup_free_
char *joined
= NULL
;
5328 joined
= strv_join(context
->exec_search_path
, ":");
5330 *exit_status
= EXIT_MEMORY
;
5334 r
= strv_env_assign(&joined_exec_search_path
, "PATH", joined
);
5336 *exit_status
= EXIT_MEMORY
;
5341 accum_env
= strv_env_merge(params
->environment
,
5343 joined_exec_search_path
,
5345 context
->environment
,
5348 *exit_status
= EXIT_MEMORY
;
5351 accum_env
= strv_env_clean(accum_env
);
5353 (void) umask(context
->umask
);
5355 r
= setup_keyring(unit
, context
, params
, uid
, gid
);
5357 *exit_status
= EXIT_KEYRING
;
5358 return log_unit_error_errno(unit
, r
, "Failed to set up kernel keyring: %m");
5361 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
5363 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
5365 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
5366 * for it, and the kernel doesn't actually support ambient caps. */
5367 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
5369 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
5370 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
5372 if (needs_ambient_hack
)
5373 needs_setuid
= false;
5375 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
5377 uint64_t capability_ambient_set
= context
->capability_ambient_set
;
5379 if (needs_sandboxing
) {
5380 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
5381 * /sys being present. The actual MAC context application will happen later, as late as
5382 * possible, to avoid impacting our own code paths. */
5385 use_selinux
= mac_selinux_use();
5388 use_smack
= mac_smack_use();
5391 use_apparmor
= mac_apparmor_use();
5395 if (needs_sandboxing
) {
5398 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
5399 * is set here. (See below.) */
5401 r
= setrlimit_closest_all((const struct rlimit
* const *) context
->rlimit
, &which_failed
);
5403 *exit_status
= EXIT_LIMITS
;
5404 return log_unit_error_errno(unit
, r
, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed
));
5408 if (needs_setuid
&& context
->pam_name
&& username
) {
5409 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
5410 * wins here. (See above.) */
5412 /* All fds passed in the fds array will be closed in the pam child process. */
5413 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
5415 *exit_status
= EXIT_PAM
;
5416 return log_unit_error_errno(unit
, r
, "Failed to set up PAM session: %m");
5419 if (ambient_capabilities_supported()) {
5420 uint64_t ambient_after_pam
;
5422 /* PAM modules might have set some ambient caps. Query them here and merge them into
5423 * the caps we want to set in the end, so that we don't end up unsetting them. */
5424 r
= capability_get_ambient(&ambient_after_pam
);
5426 *exit_status
= EXIT_CAPABILITIES
;
5427 return log_unit_error_errno(unit
, r
, "Failed to query ambient caps: %m");
5430 capability_ambient_set
|= ambient_after_pam
;
5433 ngids_after_pam
= getgroups_alloc(&gids_after_pam
);
5434 if (ngids_after_pam
< 0) {
5435 *exit_status
= EXIT_MEMORY
;
5436 return log_unit_error_errno(unit
, ngids_after_pam
, "Failed to obtain groups after setting up PAM: %m");
5440 if (needs_sandboxing
&& exec_context_need_unprivileged_private_users(context
, params
)) {
5441 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
5442 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
5443 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
5445 r
= setup_private_users(saved_uid
, saved_gid
, uid
, gid
);
5446 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
5447 * the actual requested operations fail (or silently continue). */
5448 if (r
< 0 && context
->private_users
) {
5449 *exit_status
= EXIT_USER
;
5450 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing for unprivileged user: %m");
5453 log_unit_info_errno(unit
, r
, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
5455 userns_set_up
= true;
5458 if (exec_needs_network_namespace(context
) && runtime
&& runtime
->shared
&& runtime
->shared
->netns_storage_socket
[0] >= 0) {
5460 /* Try to enable network namespacing if network namespacing is available and we have
5461 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
5462 * new network namespace. And if we don't have that, then we could only create a network
5463 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
5464 if (ns_type_supported(NAMESPACE_NET
) && have_effective_cap(CAP_NET_ADMIN
) > 0) {
5465 r
= setup_shareable_ns(runtime
->shared
->netns_storage_socket
, CLONE_NEWNET
);
5467 if (ERRNO_IS_PRIVILEGE(r
))
5468 log_unit_notice_errno(unit
, r
,
5469 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
5471 *exit_status
= EXIT_NETWORK
;
5472 return log_unit_error_errno(unit
, r
, "Failed to set up network namespacing: %m");
5475 } else if (context
->network_namespace_path
) {
5476 *exit_status
= EXIT_NETWORK
;
5477 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
5478 "NetworkNamespacePath= is not supported, refusing.");
5480 log_unit_notice(unit
, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
5483 if (exec_needs_ipc_namespace(context
) && runtime
&& runtime
->shared
&& runtime
->shared
->ipcns_storage_socket
[0] >= 0) {
5485 if (ns_type_supported(NAMESPACE_IPC
)) {
5486 r
= setup_shareable_ns(runtime
->shared
->ipcns_storage_socket
, CLONE_NEWIPC
);
5488 log_unit_warning_errno(unit
, r
,
5489 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
5491 *exit_status
= EXIT_NAMESPACE
;
5492 return log_unit_error_errno(unit
, r
, "Failed to set up IPC namespacing: %m");
5494 } else if (context
->ipc_namespace_path
) {
5495 *exit_status
= EXIT_NAMESPACE
;
5496 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
5497 "IPCNamespacePath= is not supported, refusing.");
5499 log_unit_warning(unit
, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
5502 if (needs_mount_namespace
) {
5503 _cleanup_free_
char *error_path
= NULL
;
5505 r
= apply_mount_namespace(unit
, command
->flags
, context
, params
, runtime
, memory_pressure_path
, &error_path
);
5507 *exit_status
= EXIT_NAMESPACE
;
5508 return log_unit_error_errno(unit
, r
, "Failed to set up mount namespacing%s%s: %m",
5509 error_path
? ": " : "", strempty(error_path
));
5513 if (needs_sandboxing
) {
5514 r
= apply_protect_hostname(unit
, context
, exit_status
);
5519 if (context
->memory_ksm
>= 0)
5520 if (prctl(PR_SET_MEMORY_MERGE
, context
->memory_ksm
) < 0) {
5521 if (ERRNO_IS_NOT_SUPPORTED(errno
))
5522 log_unit_debug_errno(unit
, errno
, "KSM support not available, ignoring.");
5524 *exit_status
= EXIT_KSM
;
5525 return log_unit_error_errno(unit
, errno
, "Failed to set KSM: %m");
5529 /* Drop groups as early as possible.
5530 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
5531 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
5533 _cleanup_free_ gid_t
*gids_to_enforce
= NULL
;
5534 int ngids_to_enforce
= 0;
5536 ngids_to_enforce
= merge_gid_lists(supplementary_gids
,
5541 if (ngids_to_enforce
< 0) {
5542 *exit_status
= EXIT_MEMORY
;
5543 return log_unit_error_errno(unit
,
5545 "Failed to merge group lists. Group membership might be incorrect: %m");
5548 r
= enforce_groups(gid
, gids_to_enforce
, ngids_to_enforce
);
5550 *exit_status
= EXIT_GROUP
;
5551 return log_unit_error_errno(unit
, r
, "Changing group credentials failed: %m");
5555 /* If the user namespace was not set up above, try to do it now.
5556 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
5557 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5558 * case of mount namespaces being less privileged when the mount point list is copied from a
5559 * different user namespace). */
5561 if (needs_sandboxing
&& context
->private_users
&& !userns_set_up
) {
5562 r
= setup_private_users(saved_uid
, saved_gid
, uid
, gid
);
5564 *exit_status
= EXIT_USER
;
5565 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing: %m");
5569 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
5572 _cleanup_free_
char *executable
= NULL
;
5573 _cleanup_close_
int executable_fd
= -EBADF
;
5574 r
= find_executable_full(command
->path
, /* root= */ NULL
, context
->exec_search_path
, false, &executable
, &executable_fd
);
5576 if (r
!= -ENOMEM
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
)) {
5577 log_unit_struct_errno(unit
, LOG_INFO
, r
,
5578 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
5579 LOG_UNIT_INVOCATION_ID(unit
),
5580 LOG_UNIT_MESSAGE(unit
, "Executable %s missing, skipping: %m",
5582 "EXECUTABLE=%s", command
->path
);
5586 *exit_status
= EXIT_EXEC
;
5588 return log_unit_struct_errno(unit
, LOG_INFO
, r
,
5589 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
5590 LOG_UNIT_INVOCATION_ID(unit
),
5591 LOG_UNIT_MESSAGE(unit
, "Failed to locate executable %s: %m",
5593 "EXECUTABLE=%s", command
->path
);
5596 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, executable_fd
, &executable_fd
);
5598 *exit_status
= EXIT_FDS
;
5599 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
5603 if (needs_sandboxing
&& use_selinux
&& params
->selinux_context_net
) {
5608 else if (params
->n_socket_fds
== 1)
5609 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
5610 * use context from that fd to compute the label. */
5611 fd
= params
->fds
[0];
5614 r
= mac_selinux_get_child_mls_label(fd
, executable
, context
->selinux_context
, &mac_selinux_context_net
);
5616 if (!context
->selinux_context_ignore
) {
5617 *exit_status
= EXIT_SELINUX_CONTEXT
;
5618 return log_unit_error_errno(unit
, r
, "Failed to determine SELinux context: %m");
5620 log_unit_debug_errno(unit
, r
, "Failed to determine SELinux context, ignoring: %m");
5626 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
5627 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
5628 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
5631 r
= close_all_fds(keep_fds
, n_keep_fds
);
5633 r
= shift_fds(fds
, n_fds
);
5635 r
= flags_fds(fds
, n_socket_fds
, n_fds
, context
->non_blocking
);
5637 *exit_status
= EXIT_FDS
;
5638 return log_unit_error_errno(unit
, r
, "Failed to adjust passed file descriptors: %m");
5641 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5642 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
5643 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5646 secure_bits
= context
->secure_bits
;
5648 if (needs_sandboxing
) {
5651 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
5652 * (Note this is placed after the general resource limit initialization, see above, in order
5653 * to take precedence.) */
5654 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
5655 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
5656 *exit_status
= EXIT_LIMITS
;
5657 return log_unit_error_errno(unit
, errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
5662 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5663 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5665 r
= setup_smack(unit
->manager
, context
, executable_fd
);
5666 if (r
< 0 && !context
->smack_process_label_ignore
) {
5667 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
5668 return log_unit_error_errno(unit
, r
, "Failed to set SMACK process label: %m");
5673 bset
= context
->capability_bounding_set
;
5674 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
5675 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
5676 * instead of us doing that */
5677 if (needs_ambient_hack
)
5678 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
5679 (UINT64_C(1) << CAP_SETUID
) |
5680 (UINT64_C(1) << CAP_SETGID
);
5682 if (!cap_test_all(bset
)) {
5683 r
= capability_bounding_set_drop(bset
, /* right_now= */ false);
5685 *exit_status
= EXIT_CAPABILITIES
;
5686 return log_unit_error_errno(unit
, r
, "Failed to drop capabilities: %m");
5690 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5693 * To be able to raise the ambient capabilities after setresuid() they have to be added to
5694 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
5695 * the ambient capabilities can be raised as they are present in the permitted and
5696 * inheritable set. However it is possible that someone wants to set ambient capabilities
5697 * without changing the user, so we also set the ambient capabilities here.
5699 * The requested ambient capabilities are raised in the inheritable set if the second
5700 * argument is true. */
5701 if (!needs_ambient_hack
) {
5702 r
= capability_ambient_set_apply(capability_ambient_set
, /* also_inherit= */ true);
5704 *exit_status
= EXIT_CAPABILITIES
;
5705 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (before UID change): %m");
5710 /* chroot to root directory first, before we lose the ability to chroot */
5711 r
= apply_root_directory(context
, params
, runtime
, needs_mount_namespace
, exit_status
);
5713 return log_unit_error_errno(unit
, r
, "Chrooting to the requested root directory failed: %m");
5716 if (uid_is_valid(uid
)) {
5717 r
= enforce_user(context
, uid
, capability_ambient_set
);
5719 *exit_status
= EXIT_USER
;
5720 return log_unit_error_errno(unit
, r
, "Failed to change UID to " UID_FMT
": %m", uid
);
5723 if (!needs_ambient_hack
&& capability_ambient_set
!= 0) {
5725 /* Raise the ambient capabilities after user change. */
5726 r
= capability_ambient_set_apply(capability_ambient_set
, /* also_inherit= */ false);
5728 *exit_status
= EXIT_CAPABILITIES
;
5729 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (after UID change): %m");
5735 /* Apply working directory here, because the working directory might be on NFS and only the user running
5736 * this service might have the correct privilege to change to the working directory */
5737 r
= apply_working_directory(context
, params
, runtime
, home
, exit_status
);
5739 return log_unit_error_errno(unit
, r
, "Changing to the requested working directory failed: %m");
5741 if (needs_sandboxing
) {
5742 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5743 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5744 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5745 * are restricted. */
5749 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
5752 r
= setexeccon(exec_context
);
5754 if (!context
->selinux_context_ignore
) {
5755 *exit_status
= EXIT_SELINUX_CONTEXT
;
5756 return log_unit_error_errno(unit
, r
, "Failed to change SELinux context to %s: %m", exec_context
);
5758 log_unit_debug_errno(unit
, r
, "Failed to change SELinux context to %s, ignoring: %m", exec_context
);
5765 if (use_apparmor
&& context
->apparmor_profile
) {
5766 r
= aa_change_onexec(context
->apparmor_profile
);
5767 if (r
< 0 && !context
->apparmor_profile_ignore
) {
5768 *exit_status
= EXIT_APPARMOR_PROFILE
;
5769 return log_unit_error_errno(unit
, errno
, "Failed to prepare AppArmor profile change to %s: %m", context
->apparmor_profile
);
5774 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5775 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5776 * requires CAP_SETPCAP. */
5777 if (prctl(PR_GET_SECUREBITS
) != secure_bits
) {
5778 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5779 * effective set here.
5781 * The effective set is overwritten during execve() with the following values:
5783 * - ambient set (for non-root processes)
5785 * - (inheritable | bounding) set for root processes)
5787 * Hence there is no security impact to raise it in the effective set before execve
5789 r
= capability_gain_cap_setpcap(/* return_caps= */ NULL
);
5791 *exit_status
= EXIT_CAPABILITIES
;
5792 return log_unit_error_errno(unit
, r
, "Failed to gain CAP_SETPCAP for setting secure bits");
5794 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
5795 *exit_status
= EXIT_SECUREBITS
;
5796 return log_unit_error_errno(unit
, errno
, "Failed to set process secure bits: %m");
5800 if (context_has_no_new_privileges(context
))
5801 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
5802 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
5803 return log_unit_error_errno(unit
, errno
, "Failed to disable new privileges: %m");
5807 r
= apply_address_families(unit
, context
);
5809 *exit_status
= EXIT_ADDRESS_FAMILIES
;
5810 return log_unit_error_errno(unit
, r
, "Failed to restrict address families: %m");
5813 r
= apply_memory_deny_write_execute(unit
, context
);
5815 *exit_status
= EXIT_SECCOMP
;
5816 return log_unit_error_errno(unit
, r
, "Failed to disable writing to executable memory: %m");
5819 r
= apply_restrict_realtime(unit
, context
);
5821 *exit_status
= EXIT_SECCOMP
;
5822 return log_unit_error_errno(unit
, r
, "Failed to apply realtime restrictions: %m");
5825 r
= apply_restrict_suid_sgid(unit
, context
);
5827 *exit_status
= EXIT_SECCOMP
;
5828 return log_unit_error_errno(unit
, r
, "Failed to apply SUID/SGID restrictions: %m");
5831 r
= apply_restrict_namespaces(unit
, context
);
5833 *exit_status
= EXIT_SECCOMP
;
5834 return log_unit_error_errno(unit
, r
, "Failed to apply namespace restrictions: %m");
5837 r
= apply_protect_sysctl(unit
, context
);
5839 *exit_status
= EXIT_SECCOMP
;
5840 return log_unit_error_errno(unit
, r
, "Failed to apply sysctl restrictions: %m");
5843 r
= apply_protect_kernel_modules(unit
, context
);
5845 *exit_status
= EXIT_SECCOMP
;
5846 return log_unit_error_errno(unit
, r
, "Failed to apply module loading restrictions: %m");
5849 r
= apply_protect_kernel_logs(unit
, context
);
5851 *exit_status
= EXIT_SECCOMP
;
5852 return log_unit_error_errno(unit
, r
, "Failed to apply kernel log restrictions: %m");
5855 r
= apply_protect_clock(unit
, context
);
5857 *exit_status
= EXIT_SECCOMP
;
5858 return log_unit_error_errno(unit
, r
, "Failed to apply clock restrictions: %m");
5861 r
= apply_private_devices(unit
, context
);
5863 *exit_status
= EXIT_SECCOMP
;
5864 return log_unit_error_errno(unit
, r
, "Failed to set up private devices: %m");
5867 r
= apply_syscall_archs(unit
, context
);
5869 *exit_status
= EXIT_SECCOMP
;
5870 return log_unit_error_errno(unit
, r
, "Failed to apply syscall architecture restrictions: %m");
5873 r
= apply_lock_personality(unit
, context
);
5875 *exit_status
= EXIT_SECCOMP
;
5876 return log_unit_error_errno(unit
, r
, "Failed to lock personalities: %m");
5879 r
= apply_syscall_log(unit
, context
);
5881 *exit_status
= EXIT_SECCOMP
;
5882 return log_unit_error_errno(unit
, r
, "Failed to apply system call log filters: %m");
5885 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5886 * by the filter as little as possible. */
5887 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
5889 *exit_status
= EXIT_SECCOMP
;
5890 return log_unit_error_errno(unit
, r
, "Failed to apply system call filters: %m");
5895 r
= apply_restrict_filesystems(unit
, context
);
5897 *exit_status
= EXIT_BPF
;
5898 return log_unit_error_errno(unit
, r
, "Failed to restrict filesystems: %m");
5904 if (!strv_isempty(context
->unset_environment
)) {
5907 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
5909 *exit_status
= EXIT_MEMORY
;
5913 strv_free_and_replace(accum_env
, ee
);
5916 if (!FLAGS_SET(command
->flags
, EXEC_COMMAND_NO_ENV_EXPAND
)) {
5917 _cleanup_strv_free_
char **unset_variables
= NULL
, **bad_variables
= NULL
;
5919 r
= replace_env_argv(command
->argv
, accum_env
, &replaced_argv
, &unset_variables
, &bad_variables
);
5921 *exit_status
= EXIT_MEMORY
;
5922 return log_unit_error_errno(unit
, r
, "Failed to replace environment variables: %m");
5924 final_argv
= replaced_argv
;
5926 if (!strv_isempty(unset_variables
)) {
5927 _cleanup_free_
char *ju
= strv_join(unset_variables
, ", ");
5928 log_unit_warning(unit
, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju
));
5931 if (!strv_isempty(bad_variables
)) {
5932 _cleanup_free_
char *jb
= strv_join(bad_variables
, ", ");
5933 log_unit_warning(unit
, "Invalid environment variable name evaluates to an empty string: %s", strna(jb
));;
5936 final_argv
= command
->argv
;
5938 log_command_line(unit
, "Executing", executable
, final_argv
);
5943 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5944 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5946 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
5947 *exit_status
= EXIT_EXEC
;
5948 return log_unit_error_errno(unit
, errno
, "Failed to enable exec_fd: %m");
5952 r
= fexecve_or_execve(executable_fd
, executable
, final_argv
, accum_env
);
5957 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5958 * that POLLHUP on it no longer means execve() succeeded. */
5960 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
5961 *exit_status
= EXIT_EXEC
;
5962 return log_unit_error_errno(unit
, errno
, "Failed to disable exec_fd: %m");
5966 *exit_status
= EXIT_EXEC
;
5967 return log_unit_error_errno(unit
, r
, "Failed to execute %s: %m", executable
);
5970 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***l
);
5971 static int exec_context_named_iofds(const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[static 3]);
5973 int exec_spawn(Unit
*unit
,
5974 ExecCommand
*command
,
5975 const ExecContext
*context
,
5976 const ExecParameters
*params
,
5977 ExecRuntime
*runtime
,
5978 const CGroupContext
*cgroup_context
,
5981 int socket_fd
, r
, named_iofds
[3] = { -1, -1, -1 }, *fds
= NULL
;
5982 _cleanup_free_
char *subcgroup_path
= NULL
;
5983 _cleanup_strv_free_
char **files_env
= NULL
;
5984 size_t n_storage_fds
= 0, n_socket_fds
= 0;
5992 assert(params
->fds
|| (params
->n_socket_fds
+ params
->n_storage_fds
<= 0));
5994 LOG_CONTEXT_PUSH_UNIT(unit
);
5996 if (context
->std_input
== EXEC_INPUT_SOCKET
||
5997 context
->std_output
== EXEC_OUTPUT_SOCKET
||
5998 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
6000 if (params
->n_socket_fds
> 1)
6001 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EINVAL
), "Got more than one socket.");
6003 if (params
->n_socket_fds
== 0)
6004 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EINVAL
), "Got no socket.");
6006 socket_fd
= params
->fds
[0];
6010 n_socket_fds
= params
->n_socket_fds
;
6011 n_storage_fds
= params
->n_storage_fds
;
6014 r
= exec_context_named_iofds(context
, params
, named_iofds
);
6016 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
6018 r
= exec_context_load_environment(unit
, context
, &files_env
);
6020 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
6022 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
6023 and, until the next SELinux policy changes, we save further reloads in future children. */
6024 mac_selinux_maybe_reload();
6026 /* We won't know the real executable path until we create the mount namespace in the child, but we
6027 want to log from the parent, so we use the possibly inaccurate path here. */
6028 log_command_line(unit
, "About to execute", command
->path
, command
->argv
);
6030 if (params
->cgroup_path
) {
6031 r
= exec_parameters_get_cgroup_path(params
, cgroup_context
, &subcgroup_path
);
6033 return log_unit_error_errno(unit
, r
, "Failed to acquire subcgroup path: %m");
6035 /* If there's a subcgroup, then let's create it here now (the main cgroup was already
6036 * realized by the unit logic) */
6038 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
);
6040 return log_unit_error_errno(unit
, r
, "Failed to create subcgroup '%s': %m", subcgroup_path
);
6046 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
6049 int exit_status
= EXIT_SUCCESS
;
6051 r
= exec_child(unit
,
6063 unit
->manager
->user_lookup_fds
[1],
6067 const char *status
=
6068 exit_status_to_string(exit_status
,
6069 EXIT_STATUS_LIBC
| EXIT_STATUS_SYSTEMD
);
6071 log_unit_struct_errno(unit
, LOG_ERR
, r
,
6072 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
6073 LOG_UNIT_INVOCATION_ID(unit
),
6074 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
6075 status
, command
->path
),
6076 "EXECUTABLE=%s", command
->path
);
6082 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
6084 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
6085 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
6086 * process will be killed too). */
6088 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
, pid
);
6090 exec_status_start(&command
->exec_status
, pid
);
6096 void exec_context_init(ExecContext
*c
) {
6100 c
->ioprio
= IOPRIO_DEFAULT_CLASS_AND_PRIO
;
6101 c
->cpu_sched_policy
= SCHED_OTHER
;
6102 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
6103 c
->syslog_level_prefix
= true;
6104 c
->ignore_sigpipe
= true;
6105 c
->timer_slack_nsec
= NSEC_INFINITY
;
6106 c
->personality
= PERSONALITY_INVALID
;
6107 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
6108 c
->directories
[t
].mode
= 0755;
6109 c
->timeout_clean_usec
= USEC_INFINITY
;
6110 c
->capability_bounding_set
= CAP_MASK_UNSET
;
6111 assert_cc(NAMESPACE_FLAGS_INITIAL
!= NAMESPACE_FLAGS_ALL
);
6112 c
->restrict_namespaces
= NAMESPACE_FLAGS_INITIAL
;
6113 c
->log_level_max
= -1;
6115 c
->syscall_errno
= SECCOMP_ERROR_NUMBER_KILL
;
6117 c
->tty_rows
= UINT_MAX
;
6118 c
->tty_cols
= UINT_MAX
;
6119 numa_policy_reset(&c
->numa_policy
);
6120 c
->private_mounts
= -1;
6124 void exec_context_done(ExecContext
*c
) {
6127 c
->environment
= strv_free(c
->environment
);
6128 c
->environment_files
= strv_free(c
->environment_files
);
6129 c
->pass_environment
= strv_free(c
->pass_environment
);
6130 c
->unset_environment
= strv_free(c
->unset_environment
);
6132 rlimit_free_all(c
->rlimit
);
6134 for (size_t l
= 0; l
< 3; l
++) {
6135 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
6136 c
->stdio_file
[l
] = mfree(c
->stdio_file
[l
]);
6139 c
->working_directory
= mfree(c
->working_directory
);
6140 c
->root_directory
= mfree(c
->root_directory
);
6141 c
->root_image
= mfree(c
->root_image
);
6142 c
->root_image_options
= mount_options_free_all(c
->root_image_options
);
6143 c
->root_hash
= mfree(c
->root_hash
);
6144 c
->root_hash_size
= 0;
6145 c
->root_hash_path
= mfree(c
->root_hash_path
);
6146 c
->root_hash_sig
= mfree(c
->root_hash_sig
);
6147 c
->root_hash_sig_size
= 0;
6148 c
->root_hash_sig_path
= mfree(c
->root_hash_sig_path
);
6149 c
->root_verity
= mfree(c
->root_verity
);
6150 c
->extension_images
= mount_image_free_many(c
->extension_images
, &c
->n_extension_images
);
6151 c
->extension_directories
= strv_free(c
->extension_directories
);
6152 c
->tty_path
= mfree(c
->tty_path
);
6153 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
6154 c
->user
= mfree(c
->user
);
6155 c
->group
= mfree(c
->group
);
6157 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
6159 c
->pam_name
= mfree(c
->pam_name
);
6161 c
->read_only_paths
= strv_free(c
->read_only_paths
);
6162 c
->read_write_paths
= strv_free(c
->read_write_paths
);
6163 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
6164 c
->exec_paths
= strv_free(c
->exec_paths
);
6165 c
->no_exec_paths
= strv_free(c
->no_exec_paths
);
6166 c
->exec_search_path
= strv_free(c
->exec_search_path
);
6168 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
6169 c
->bind_mounts
= NULL
;
6170 c
->n_bind_mounts
= 0;
6171 temporary_filesystem_free_many(c
->temporary_filesystems
, c
->n_temporary_filesystems
);
6172 c
->temporary_filesystems
= NULL
;
6173 c
->n_temporary_filesystems
= 0;
6174 c
->mount_images
= mount_image_free_many(c
->mount_images
, &c
->n_mount_images
);
6176 cpu_set_reset(&c
->cpu_set
);
6177 numa_policy_reset(&c
->numa_policy
);
6179 c
->utmp_id
= mfree(c
->utmp_id
);
6180 c
->selinux_context
= mfree(c
->selinux_context
);
6181 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
6182 c
->smack_process_label
= mfree(c
->smack_process_label
);
6184 c
->restrict_filesystems
= set_free(c
->restrict_filesystems
);
6186 c
->syscall_filter
= hashmap_free(c
->syscall_filter
);
6187 c
->syscall_archs
= set_free(c
->syscall_archs
);
6188 c
->address_families
= set_free(c
->address_families
);
6190 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
6191 exec_directory_done(&c
->directories
[t
]);
6193 c
->log_level_max
= -1;
6195 exec_context_free_log_extra_fields(c
);
6196 c
->log_filter_allowed_patterns
= set_free(c
->log_filter_allowed_patterns
);
6197 c
->log_filter_denied_patterns
= set_free(c
->log_filter_denied_patterns
);
6199 c
->log_ratelimit_interval_usec
= 0;
6200 c
->log_ratelimit_burst
= 0;
6202 c
->stdin_data
= mfree(c
->stdin_data
);
6203 c
->stdin_data_size
= 0;
6205 c
->network_namespace_path
= mfree(c
->network_namespace_path
);
6206 c
->ipc_namespace_path
= mfree(c
->ipc_namespace_path
);
6208 c
->log_namespace
= mfree(c
->log_namespace
);
6210 c
->load_credentials
= hashmap_free(c
->load_credentials
);
6211 c
->set_credentials
= hashmap_free(c
->set_credentials
);
6212 c
->import_credentials
= set_free(c
->import_credentials
);
6214 c
->root_image_policy
= image_policy_free(c
->root_image_policy
);
6215 c
->mount_image_policy
= image_policy_free(c
->mount_image_policy
);
6216 c
->extension_image_policy
= image_policy_free(c
->extension_image_policy
);
6219 int exec_context_destroy_runtime_directory(const ExecContext
*c
, const char *runtime_prefix
) {
6222 if (!runtime_prefix
)
6225 for (size_t i
= 0; i
< c
->directories
[EXEC_DIRECTORY_RUNTIME
].n_items
; i
++) {
6226 _cleanup_free_
char *p
= NULL
;
6228 if (exec_directory_is_private(c
, EXEC_DIRECTORY_RUNTIME
))
6229 p
= path_join(runtime_prefix
, "private", c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].path
);
6231 p
= path_join(runtime_prefix
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].path
);
6235 /* We execute this synchronously, since we need to be sure this is gone when we start the
6237 (void) rm_rf(p
, REMOVE_ROOT
);
6239 STRV_FOREACH(symlink
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].symlinks
) {
6240 _cleanup_free_
char *symlink_abs
= NULL
;
6242 if (exec_directory_is_private(c
, EXEC_DIRECTORY_RUNTIME
))
6243 symlink_abs
= path_join(runtime_prefix
, "private", *symlink
);
6245 symlink_abs
= path_join(runtime_prefix
, *symlink
);
6249 (void) unlink(symlink_abs
);
6256 int exec_context_destroy_credentials(const ExecContext
*c
, const char *runtime_prefix
, const char *unit
) {
6257 _cleanup_free_
char *p
= NULL
;
6261 if (!runtime_prefix
|| !unit
)
6264 p
= path_join(runtime_prefix
, "credentials", unit
);
6268 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
6269 * unmount it, and afterwards remove the mount point */
6270 (void) umount2(p
, MNT_DETACH
|UMOUNT_NOFOLLOW
);
6271 (void) rm_rf(p
, REMOVE_ROOT
|REMOVE_CHMOD
);
6276 int exec_context_destroy_mount_ns_dir(Unit
*u
) {
6277 _cleanup_free_
char *p
= NULL
;
6279 if (!u
|| !MANAGER_IS_SYSTEM(u
->manager
))
6282 p
= path_join("/run/systemd/propagate/", u
->id
);
6286 /* This is only filled transiently (see mount_in_namespace()); it should be empty or even non-existent. */
6287 if (rmdir(p
) < 0 && errno
!= ENOENT
)
6288 log_unit_debug_errno(u
, errno
, "Unable to remove propagation dir '%s', ignoring: %m", p
);
6293 static void exec_command_done(ExecCommand
*c
) {
6296 c
->path
= mfree(c
->path
);
6297 c
->argv
= strv_free(c
->argv
);
6300 void exec_command_done_array(ExecCommand
*c
, size_t n
) {
6301 for (size_t i
= 0; i
< n
; i
++)
6302 exec_command_done(c
+i
);
6305 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
6309 LIST_REMOVE(command
, c
, i
);
6310 exec_command_done(i
);
6317 void exec_command_free_array(ExecCommand
**c
, size_t n
) {
6318 for (size_t i
= 0; i
< n
; i
++)
6319 c
[i
] = exec_command_free_list(c
[i
]);
6322 void exec_command_reset_status_array(ExecCommand
*c
, size_t n
) {
6323 for (size_t i
= 0; i
< n
; i
++)
6324 exec_status_reset(&c
[i
].exec_status
);
6327 void exec_command_reset_status_list_array(ExecCommand
**c
, size_t n
) {
6328 for (size_t i
= 0; i
< n
; i
++)
6329 LIST_FOREACH(command
, z
, c
[i
])
6330 exec_status_reset(&z
->exec_status
);
6333 typedef struct InvalidEnvInfo
{
6338 static void invalid_env(const char *p
, void *userdata
) {
6339 InvalidEnvInfo
*info
= userdata
;
6341 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
6344 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
6350 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
6353 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
6356 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
6359 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
6362 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
6365 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
6372 static int exec_context_named_iofds(
6373 const ExecContext
*c
,
6374 const ExecParameters
*p
,
6375 int named_iofds
[static 3]) {
6378 const char* stdio_fdname
[3];
6383 assert(named_iofds
);
6385 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
6386 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
6387 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
6389 for (size_t i
= 0; i
< 3; i
++)
6390 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
6392 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
6394 for (size_t i
= 0; i
< n_fds
&& targets
> 0; i
++)
6395 if (named_iofds
[STDIN_FILENO
] < 0 &&
6396 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
6397 stdio_fdname
[STDIN_FILENO
] &&
6398 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
6400 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
6403 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
6404 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
6405 stdio_fdname
[STDOUT_FILENO
] &&
6406 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
6408 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
6411 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
6412 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
6413 stdio_fdname
[STDERR_FILENO
] &&
6414 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
6416 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
6420 return targets
== 0 ? 0 : -ENOENT
;
6423 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***ret
) {
6424 _cleanup_strv_free_
char **v
= NULL
;
6430 STRV_FOREACH(i
, c
->environment_files
) {
6431 _cleanup_globfree_ glob_t pglob
= {};
6432 bool ignore
= false;
6440 if (!path_is_absolute(fn
)) {
6446 /* Filename supports globbing, take all matching files */
6447 r
= safe_glob(fn
, 0, &pglob
);
6454 /* When we don't match anything, -ENOENT should be returned */
6455 assert(pglob
.gl_pathc
> 0);
6457 for (size_t n
= 0; n
< pglob
.gl_pathc
; n
++) {
6458 _cleanup_strv_free_
char **p
= NULL
;
6460 r
= load_env_file(NULL
, pglob
.gl_pathv
[n
], &p
);
6467 /* Log invalid environment variables with filename */
6469 InvalidEnvInfo info
= {
6471 .path
= pglob
.gl_pathv
[n
]
6474 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
6480 char **m
= strv_env_merge(v
, p
);
6484 strv_free_and_replace(v
, m
);
6494 static bool tty_may_match_dev_console(const char *tty
) {
6495 _cleanup_free_
char *resolved
= NULL
;
6500 tty
= skip_dev_prefix(tty
);
6502 /* trivial identity? */
6503 if (streq(tty
, "console"))
6506 if (resolve_dev_console(&resolved
) < 0)
6507 return true; /* if we could not resolve, assume it may */
6509 /* "tty0" means the active VC, so it may be the same sometimes */
6510 return path_equal(resolved
, tty
) || (streq(resolved
, "tty0") && tty_is_vc(tty
));
6513 static bool exec_context_may_touch_tty(const ExecContext
*ec
) {
6516 return ec
->tty_reset
||
6518 ec
->tty_vt_disallocate
||
6519 is_terminal_input(ec
->std_input
) ||
6520 is_terminal_output(ec
->std_output
) ||
6521 is_terminal_output(ec
->std_error
);
6524 bool exec_context_may_touch_console(const ExecContext
*ec
) {
6526 return exec_context_may_touch_tty(ec
) &&
6527 tty_may_match_dev_console(exec_context_tty_path(ec
));
6530 static void strv_fprintf(FILE *f
, char **l
) {
6534 fprintf(f
, " %s", *g
);
6537 static void strv_dump(FILE* f
, const char *prefix
, const char *name
, char **strv
) {
6542 if (!strv_isempty(strv
)) {
6543 fprintf(f
, "%s%s:", prefix
, name
);
6544 strv_fprintf(f
, strv
);
6549 void exec_context_dump(const ExecContext
*c
, FILE* f
, const char *prefix
) {
6555 prefix
= strempty(prefix
);
6559 "%sWorkingDirectory: %s\n"
6560 "%sRootDirectory: %s\n"
6561 "%sRootEphemeral: %s\n"
6562 "%sNonBlocking: %s\n"
6563 "%sPrivateTmp: %s\n"
6564 "%sPrivateDevices: %s\n"
6565 "%sProtectKernelTunables: %s\n"
6566 "%sProtectKernelModules: %s\n"
6567 "%sProtectKernelLogs: %s\n"
6568 "%sProtectClock: %s\n"
6569 "%sProtectControlGroups: %s\n"
6570 "%sPrivateNetwork: %s\n"
6571 "%sPrivateUsers: %s\n"
6572 "%sProtectHome: %s\n"
6573 "%sProtectSystem: %s\n"
6574 "%sMountAPIVFS: %s\n"
6575 "%sIgnoreSIGPIPE: %s\n"
6576 "%sMemoryDenyWriteExecute: %s\n"
6577 "%sRestrictRealtime: %s\n"
6578 "%sRestrictSUIDSGID: %s\n"
6579 "%sKeyringMode: %s\n"
6580 "%sProtectHostname: %s\n"
6581 "%sProtectProc: %s\n"
6582 "%sProcSubset: %s\n",
6584 prefix
, empty_to_root(c
->working_directory
),
6585 prefix
, empty_to_root(c
->root_directory
),
6586 prefix
, yes_no(c
->root_ephemeral
),
6587 prefix
, yes_no(c
->non_blocking
),
6588 prefix
, yes_no(c
->private_tmp
),
6589 prefix
, yes_no(c
->private_devices
),
6590 prefix
, yes_no(c
->protect_kernel_tunables
),
6591 prefix
, yes_no(c
->protect_kernel_modules
),
6592 prefix
, yes_no(c
->protect_kernel_logs
),
6593 prefix
, yes_no(c
->protect_clock
),
6594 prefix
, yes_no(c
->protect_control_groups
),
6595 prefix
, yes_no(c
->private_network
),
6596 prefix
, yes_no(c
->private_users
),
6597 prefix
, protect_home_to_string(c
->protect_home
),
6598 prefix
, protect_system_to_string(c
->protect_system
),
6599 prefix
, yes_no(exec_context_get_effective_mount_apivfs(c
)),
6600 prefix
, yes_no(c
->ignore_sigpipe
),
6601 prefix
, yes_no(c
->memory_deny_write_execute
),
6602 prefix
, yes_no(c
->restrict_realtime
),
6603 prefix
, yes_no(c
->restrict_suid_sgid
),
6604 prefix
, exec_keyring_mode_to_string(c
->keyring_mode
),
6605 prefix
, yes_no(c
->protect_hostname
),
6606 prefix
, protect_proc_to_string(c
->protect_proc
),
6607 prefix
, proc_subset_to_string(c
->proc_subset
));
6610 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
6612 if (c
->root_image_options
) {
6613 fprintf(f
, "%sRootImageOptions:", prefix
);
6614 LIST_FOREACH(mount_options
, o
, c
->root_image_options
)
6615 if (!isempty(o
->options
))
6616 fprintf(f
, " %s:%s",
6617 partition_designator_to_string(o
->partition_designator
),
6623 _cleanup_free_
char *encoded
= NULL
;
6624 encoded
= hexmem(c
->root_hash
, c
->root_hash_size
);
6626 fprintf(f
, "%sRootHash: %s\n", prefix
, encoded
);
6629 if (c
->root_hash_path
)
6630 fprintf(f
, "%sRootHash: %s\n", prefix
, c
->root_hash_path
);
6632 if (c
->root_hash_sig
) {
6633 _cleanup_free_
char *encoded
= NULL
;
6635 len
= base64mem(c
->root_hash_sig
, c
->root_hash_sig_size
, &encoded
);
6637 fprintf(f
, "%sRootHashSignature: base64:%s\n", prefix
, encoded
);
6640 if (c
->root_hash_sig_path
)
6641 fprintf(f
, "%sRootHashSignature: %s\n", prefix
, c
->root_hash_sig_path
);
6644 fprintf(f
, "%sRootVerity: %s\n", prefix
, c
->root_verity
);
6646 STRV_FOREACH(e
, c
->environment
)
6647 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
6649 STRV_FOREACH(e
, c
->environment_files
)
6650 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
6652 STRV_FOREACH(e
, c
->pass_environment
)
6653 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
6655 STRV_FOREACH(e
, c
->unset_environment
)
6656 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
6658 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
6660 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
6661 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
6663 for (size_t i
= 0; i
< c
->directories
[dt
].n_items
; i
++) {
6664 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].items
[i
].path
);
6666 STRV_FOREACH(d
, c
->directories
[dt
].items
[i
].symlinks
)
6667 fprintf(f
, "%s%s: %s:%s\n", prefix
, exec_directory_type_symlink_to_string(dt
), c
->directories
[dt
].items
[i
].path
, *d
);
6671 fprintf(f
, "%sTimeoutCleanSec: %s\n", prefix
, FORMAT_TIMESPAN(c
->timeout_clean_usec
, USEC_PER_SEC
));
6674 fprintf(f
, "%sNice: %i\n", prefix
, c
->nice
);
6676 if (c
->oom_score_adjust_set
)
6677 fprintf(f
, "%sOOMScoreAdjust: %i\n", prefix
, c
->oom_score_adjust
);
6679 if (c
->coredump_filter_set
)
6680 fprintf(f
, "%sCoredumpFilter: 0x%"PRIx64
"\n", prefix
, c
->coredump_filter
);
6682 for (unsigned i
= 0; i
< RLIM_NLIMITS
; i
++)
6684 fprintf(f
, "%sLimit%s: " RLIM_FMT
"\n",
6685 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
6686 fprintf(f
, "%sLimit%sSoft: " RLIM_FMT
"\n",
6687 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
6690 if (c
->ioprio_set
) {
6691 _cleanup_free_
char *class_str
= NULL
;
6693 r
= ioprio_class_to_string_alloc(ioprio_prio_class(c
->ioprio
), &class_str
);
6695 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
6697 fprintf(f
, "%sIOPriority: %d\n", prefix
, ioprio_prio_data(c
->ioprio
));
6700 if (c
->cpu_sched_set
) {
6701 _cleanup_free_
char *policy_str
= NULL
;
6703 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
6705 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
6708 "%sCPUSchedulingPriority: %i\n"
6709 "%sCPUSchedulingResetOnFork: %s\n",
6710 prefix
, c
->cpu_sched_priority
,
6711 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
6714 if (c
->cpu_set
.set
) {
6715 _cleanup_free_
char *affinity
= NULL
;
6717 affinity
= cpu_set_to_range_string(&c
->cpu_set
);
6718 fprintf(f
, "%sCPUAffinity: %s\n", prefix
, affinity
);
6721 if (mpol_is_valid(numa_policy_get_type(&c
->numa_policy
))) {
6722 _cleanup_free_
char *nodes
= NULL
;
6724 nodes
= cpu_set_to_range_string(&c
->numa_policy
.nodes
);
6725 fprintf(f
, "%sNUMAPolicy: %s\n", prefix
, mpol_to_string(numa_policy_get_type(&c
->numa_policy
)));
6726 fprintf(f
, "%sNUMAMask: %s\n", prefix
, strnull(nodes
));
6729 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
6730 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
6733 "%sStandardInput: %s\n"
6734 "%sStandardOutput: %s\n"
6735 "%sStandardError: %s\n",
6736 prefix
, exec_input_to_string(c
->std_input
),
6737 prefix
, exec_output_to_string(c
->std_output
),
6738 prefix
, exec_output_to_string(c
->std_error
));
6740 if (c
->std_input
== EXEC_INPUT_NAMED_FD
)
6741 fprintf(f
, "%sStandardInputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDIN_FILENO
]);
6742 if (c
->std_output
== EXEC_OUTPUT_NAMED_FD
)
6743 fprintf(f
, "%sStandardOutputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDOUT_FILENO
]);
6744 if (c
->std_error
== EXEC_OUTPUT_NAMED_FD
)
6745 fprintf(f
, "%sStandardErrorFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDERR_FILENO
]);
6747 if (c
->std_input
== EXEC_INPUT_FILE
)
6748 fprintf(f
, "%sStandardInputFile: %s\n", prefix
, c
->stdio_file
[STDIN_FILENO
]);
6749 if (c
->std_output
== EXEC_OUTPUT_FILE
)
6750 fprintf(f
, "%sStandardOutputFile: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
6751 if (c
->std_output
== EXEC_OUTPUT_FILE_APPEND
)
6752 fprintf(f
, "%sStandardOutputFileToAppend: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
6753 if (c
->std_output
== EXEC_OUTPUT_FILE_TRUNCATE
)
6754 fprintf(f
, "%sStandardOutputFileToTruncate: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
6755 if (c
->std_error
== EXEC_OUTPUT_FILE
)
6756 fprintf(f
, "%sStandardErrorFile: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
6757 if (c
->std_error
== EXEC_OUTPUT_FILE_APPEND
)
6758 fprintf(f
, "%sStandardErrorFileToAppend: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
6759 if (c
->std_error
== EXEC_OUTPUT_FILE_TRUNCATE
)
6760 fprintf(f
, "%sStandardErrorFileToTruncate: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
6766 "%sTTYVHangup: %s\n"
6767 "%sTTYVTDisallocate: %s\n"
6769 "%sTTYColumns: %u\n",
6770 prefix
, c
->tty_path
,
6771 prefix
, yes_no(c
->tty_reset
),
6772 prefix
, yes_no(c
->tty_vhangup
),
6773 prefix
, yes_no(c
->tty_vt_disallocate
),
6774 prefix
, c
->tty_rows
,
6775 prefix
, c
->tty_cols
);
6777 if (IN_SET(c
->std_output
,
6779 EXEC_OUTPUT_JOURNAL
,
6780 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
6781 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
6782 IN_SET(c
->std_error
,
6784 EXEC_OUTPUT_JOURNAL
,
6785 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
6786 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
6788 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
6790 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
6792 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
6794 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
6796 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
6799 if (c
->log_level_max
>= 0) {
6800 _cleanup_free_
char *t
= NULL
;
6802 (void) log_level_to_string_alloc(c
->log_level_max
, &t
);
6804 fprintf(f
, "%sLogLevelMax: %s\n", prefix
, strna(t
));
6807 if (c
->log_ratelimit_interval_usec
> 0)
6809 "%sLogRateLimitIntervalSec: %s\n",
6810 prefix
, FORMAT_TIMESPAN(c
->log_ratelimit_interval_usec
, USEC_PER_SEC
));
6812 if (c
->log_ratelimit_burst
> 0)
6813 fprintf(f
, "%sLogRateLimitBurst: %u\n", prefix
, c
->log_ratelimit_burst
);
6815 if (!set_isempty(c
->log_filter_allowed_patterns
) || !set_isempty(c
->log_filter_denied_patterns
)) {
6816 fprintf(f
, "%sLogFilterPatterns:", prefix
);
6819 SET_FOREACH(pattern
, c
->log_filter_allowed_patterns
)
6820 fprintf(f
, " %s", pattern
);
6821 SET_FOREACH(pattern
, c
->log_filter_denied_patterns
)
6822 fprintf(f
, " ~%s", pattern
);
6826 for (size_t j
= 0; j
< c
->n_log_extra_fields
; j
++) {
6827 fprintf(f
, "%sLogExtraFields: ", prefix
);
6828 fwrite(c
->log_extra_fields
[j
].iov_base
,
6829 1, c
->log_extra_fields
[j
].iov_len
,
6834 if (c
->log_namespace
)
6835 fprintf(f
, "%sLogNamespace: %s\n", prefix
, c
->log_namespace
);
6837 if (c
->secure_bits
) {
6838 _cleanup_free_
char *str
= NULL
;
6840 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
6842 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
6845 if (c
->capability_bounding_set
!= CAP_MASK_UNSET
) {
6846 _cleanup_free_
char *str
= NULL
;
6848 r
= capability_set_to_string(c
->capability_bounding_set
, &str
);
6850 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
6853 if (c
->capability_ambient_set
!= 0) {
6854 _cleanup_free_
char *str
= NULL
;
6856 r
= capability_set_to_string(c
->capability_ambient_set
, &str
);
6858 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
6862 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
6864 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
6866 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
6868 strv_dump(f
, prefix
, "SupplementaryGroups", c
->supplementary_groups
);
6871 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
6873 strv_dump(f
, prefix
, "ReadWritePaths", c
->read_write_paths
);
6874 strv_dump(f
, prefix
, "ReadOnlyPaths", c
->read_only_paths
);
6875 strv_dump(f
, prefix
, "InaccessiblePaths", c
->inaccessible_paths
);
6876 strv_dump(f
, prefix
, "ExecPaths", c
->exec_paths
);
6877 strv_dump(f
, prefix
, "NoExecPaths", c
->no_exec_paths
);
6878 strv_dump(f
, prefix
, "ExecSearchPath", c
->exec_search_path
);
6880 for (size_t i
= 0; i
< c
->n_bind_mounts
; i
++)
6881 fprintf(f
, "%s%s: %s%s:%s:%s\n", prefix
,
6882 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
6883 c
->bind_mounts
[i
].ignore_enoent
? "-": "",
6884 c
->bind_mounts
[i
].source
,
6885 c
->bind_mounts
[i
].destination
,
6886 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
6888 for (size_t i
= 0; i
< c
->n_temporary_filesystems
; i
++) {
6889 const TemporaryFileSystem
*t
= c
->temporary_filesystems
+ i
;
6891 fprintf(f
, "%sTemporaryFileSystem: %s%s%s\n", prefix
,
6893 isempty(t
->options
) ? "" : ":",
6894 strempty(t
->options
));
6899 "%sUtmpIdentifier: %s\n",
6900 prefix
, c
->utmp_id
);
6902 if (c
->selinux_context
)
6904 "%sSELinuxContext: %s%s\n",
6905 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
6907 if (c
->apparmor_profile
)
6909 "%sAppArmorProfile: %s%s\n",
6910 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
6912 if (c
->smack_process_label
)
6914 "%sSmackProcessLabel: %s%s\n",
6915 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
6917 if (c
->personality
!= PERSONALITY_INVALID
)
6919 "%sPersonality: %s\n",
6920 prefix
, strna(personality_to_string(c
->personality
)));
6923 "%sLockPersonality: %s\n",
6924 prefix
, yes_no(c
->lock_personality
));
6926 if (c
->syscall_filter
) {
6928 "%sSystemCallFilter: ",
6931 if (!c
->syscall_allow_list
)
6937 HASHMAP_FOREACH_KEY(val
, id
, c
->syscall_filter
) {
6938 _cleanup_free_
char *name
= NULL
;
6939 const char *errno_name
= NULL
;
6940 int num
= PTR_TO_INT(val
);
6947 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
6948 fputs(strna(name
), f
);
6951 errno_name
= seccomp_errno_or_action_to_string(num
);
6953 fprintf(f
, ":%s", errno_name
);
6955 fprintf(f
, ":%d", num
);
6963 if (c
->syscall_archs
) {
6965 "%sSystemCallArchitectures:",
6970 SET_FOREACH(id
, c
->syscall_archs
)
6971 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
6976 if (exec_context_restrict_namespaces_set(c
)) {
6977 _cleanup_free_
char *s
= NULL
;
6979 r
= namespace_flags_to_string(c
->restrict_namespaces
, &s
);
6981 fprintf(f
, "%sRestrictNamespaces: %s\n",
6986 if (exec_context_restrict_filesystems_set(c
)) {
6988 SET_FOREACH(fs
, c
->restrict_filesystems
)
6989 fprintf(f
, "%sRestrictFileSystems: %s\n", prefix
, fs
);
6993 if (c
->network_namespace_path
)
6995 "%sNetworkNamespacePath: %s\n",
6996 prefix
, c
->network_namespace_path
);
6998 if (c
->syscall_errno
> 0) {
6999 fprintf(f
, "%sSystemCallErrorNumber: ", prefix
);
7002 const char *errno_name
= seccomp_errno_or_action_to_string(c
->syscall_errno
);
7004 fputs(errno_name
, f
);
7006 fprintf(f
, "%d", c
->syscall_errno
);
7011 for (size_t i
= 0; i
< c
->n_mount_images
; i
++) {
7012 fprintf(f
, "%sMountImages: %s%s:%s", prefix
,
7013 c
->mount_images
[i
].ignore_enoent
? "-": "",
7014 c
->mount_images
[i
].source
,
7015 c
->mount_images
[i
].destination
);
7016 LIST_FOREACH(mount_options
, o
, c
->mount_images
[i
].mount_options
)
7017 fprintf(f
, ":%s:%s",
7018 partition_designator_to_string(o
->partition_designator
),
7019 strempty(o
->options
));
7023 for (size_t i
= 0; i
< c
->n_extension_images
; i
++) {
7024 fprintf(f
, "%sExtensionImages: %s%s", prefix
,
7025 c
->extension_images
[i
].ignore_enoent
? "-": "",
7026 c
->extension_images
[i
].source
);
7027 LIST_FOREACH(mount_options
, o
, c
->extension_images
[i
].mount_options
)
7028 fprintf(f
, ":%s:%s",
7029 partition_designator_to_string(o
->partition_designator
),
7030 strempty(o
->options
));
7034 strv_dump(f
, prefix
, "ExtensionDirectories", c
->extension_directories
);
7037 bool exec_context_maintains_privileges(const ExecContext
*c
) {
7040 /* Returns true if the process forked off would run under
7041 * an unchanged UID or as root. */
7046 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
7052 int exec_context_get_effective_ioprio(const ExecContext
*c
) {
7060 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
7062 return IOPRIO_DEFAULT_CLASS_AND_PRIO
;
7064 return ioprio_normalize(p
);
7067 bool exec_context_get_effective_mount_apivfs(const ExecContext
*c
) {
7070 /* Explicit setting wins */
7071 if (c
->mount_apivfs_set
)
7072 return c
->mount_apivfs
;
7074 /* Default to "yes" if root directory or image are specified */
7075 if (exec_context_with_rootfs(c
))
7081 void exec_context_free_log_extra_fields(ExecContext
*c
) {
7084 for (size_t l
= 0; l
< c
->n_log_extra_fields
; l
++)
7085 free(c
->log_extra_fields
[l
].iov_base
);
7086 c
->log_extra_fields
= mfree(c
->log_extra_fields
);
7087 c
->n_log_extra_fields
= 0;
/* Best-effort restoration of the TTY configured for this context. It first calls
 * exec_context_tty_reset(c, NULL); then, if exec_context_may_touch_tty(c) permits, it opens the path
 * returned by exec_context_tty_path(c) with O_PATH|O_CLOEXEC, stats it, requires a character device and
 * finally applies fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID). Every failure visible here is only logged
 * (the messages say "ignoring"); the function returns void. */
7090 void exec_context_revert_tty(ExecContext
*c
) {
7091 _cleanup_close_
int fd
= -EBADF
;
7098 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
7099 exec_context_tty_reset(c
, NULL
);
7101 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
7102 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
7103 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
7104 if (!exec_context_may_touch_tty(c
))
7107 path
= exec_context_tty_path(c
);
7111 fd
= open(path
, O_PATH
|O_CLOEXEC
);
/* A missing TTY node (ENOENT) is logged at debug level only, any other open() error as a warning. */
7113 return (void) log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
,
7114 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
7117 if (fstat(fd
, &st
) < 0)
7118 return (void) log_warning_errno(errno
, "Failed to stat TTY '%s', ignoring: %m", path
);
7120 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
7121 * if things are a character device, since a proper check either means we'd have to open the TTY and
7122 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
7123 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
7124 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
7125 if (!S_ISCHR(st
.st_mode
))
/* NOTE(review): unlike the two returns above, this one has no "(void)" cast although the function
 * returns void — consider making it consistent. */
7126 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path
);
7128 r
= fchmod_and_chown(fd
, TTY_MODE
, 0, TTY_GID
);
7130 log_warning_errno(r
, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path
);
/* Collects the list of paths to clean into the string vector l. For every ExecDirectoryType t whose bit
 * (1U << t) is set in mask, and for every configured item of that type, the following entries are
 * appended via strv_consume():
 *   - prefix[t] joined with the item's path,
 *   - for all types except EXEC_DIRECTORY_CONFIGURATION: prefix[t]/"private"/<item path>,
 *   - prefix[t] joined with each of the item's symlinks.
 * NOTE(review): the parameter list, error handling and the final hand-over of l are not visible in this
 * excerpt. */
7133 int exec_context_get_clean_directories(
7139 _cleanup_strv_free_
char **l
= NULL
;
7146 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
7147 if (!FLAGS_SET(mask
, 1U << t
))
7153 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
7156 j
= path_join(prefix
[t
], c
->directories
[t
].items
[i
].path
);
7160 r
= strv_consume(&l
, j
);
7164 /* Also remove private directories unconditionally. */
7165 if (t
!= EXEC_DIRECTORY_CONFIGURATION
) {
7166 j
= path_join(prefix
[t
], "private", c
->directories
[t
].items
[i
].path
);
7170 r
= strv_consume(&l
, j
);
7175 STRV_FOREACH(symlink
, c
->directories
[t
].items
[i
].symlinks
) {
7176 j
= path_join(prefix
[t
], *symlink
);
7180 r
= strv_consume(&l
, j
);
/* Derives an ExecCleanMask for the context. The local mask starts at 0 and the loop visits every
 * ExecDirectoryType, testing whether c->directories[t] has at least one item.
 * NOTE(review): the statements that set bits in mask and store the result through ret are not visible
 * in this excerpt. */
7191 int exec_context_get_clean_mask(ExecContext
*c
, ExecCleanMask
*ret
) {
7192 ExecCleanMask mask
= 0;
7197 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
7198 if (c
->directories
[t
].n_items
> 0)
/* Scans both credential hashmaps of the context (c->load_credentials and c->set_credentials) and tests
 * the ->encrypted flag of each entry. NOTE(review): the return statements are not visible here;
 * presumably the first encrypted entry yields true — confirm. */
7205 bool exec_context_has_encrypted_credentials(ExecContext
*c
) {
7206 ExecLoadCredential
*load_cred
;
7207 ExecSetCredential
*set_cred
;
7211 HASHMAP_FOREACH(load_cred
, c
->load_credentials
)
7212 if (load_cred
->encrypted
)
7215 HASHMAP_FOREACH(set_cred
, c
->set_credentials
)
7216 if (set_cred
->encrypted
)
/* Adds default dependencies implied by the exec context to unit u. Asserts that u->default_dependencies
 * is set. If c is non-NULL and exec_context_needs_term(c) holds, a UNIT_AFTER dependency on
 * SPECIAL_VCONSOLE_SETUP_SERVICE is added (with a reference, dependency mask UNIT_DEPENDENCY_DEFAULT)
 * and the result of unit_add_dependency_by_name() is returned.
 * NOTE(review): the fall-through return value is not visible in this excerpt. */
7222 int exec_context_add_default_dependencies(Unit
*u
, const ExecContext
*c
) {
7224 assert(u
->default_dependencies
);
7226 if (c
&& exec_context_needs_term(c
))
7227 return unit_add_dependency_by_name(u
, UNIT_AFTER
, SPECIAL_VCONSOLE_SETUP_SERVICE
,
7228 /* add_reference= */ true, UNIT_DEPENDENCY_DEFAULT
);
/* Marks the start of a process in the ExecStatus: the visible statement stores the current time in
 * s->start_timestamp via dual_timestamp_get(). NOTE(review): how pid is recorded is not visible here. */
7232 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
7239 dual_timestamp_get(&s
->start_timestamp
);
/* Marks the exit of a process in the ExecStatus: stores the current time in s->exit_timestamp and, when
 * a context with a utmp_id is supplied, reports the dead process via utmp_put_dead_process(); that
 * call's result is deliberately discarded ("(void)").
 * NOTE(review): the assignments of pid/code/status into *s are not visible in this excerpt. */
7242 void exec_status_exit(ExecStatus
*s
, const ExecContext
*context
, pid_t pid
, int code
, int status
) {
7250 dual_timestamp_get(&s
->exit_timestamp
);
7255 if (context
&& context
->utmp_id
)
7256 (void) utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
/* Resets *s by assigning an empty compound literal, i.e. every member of the ExecStatus becomes zero. */
7259 void exec_status_reset(ExecStatus
*s
) {
7262 *s
= (ExecStatus
) {};
/* Writes a human-readable description of an ExecStatus to f, each line starting with prefix (passed
 * through strempty() first). Visible output: a "PID:" line, a "Start Timestamp:" line when
 * s->start_timestamp is set, and — when s->exit_timestamp is set — "Exit Timestamp:" and
 * "Exit Status:" lines, using sigchld_code_to_string(s->code) for the exit code text.
 * NOTE(review): the fprintf() call heads and some format lines are not visible in this excerpt. */
7265 void exec_status_dump(const ExecStatus
*s
, FILE *f
, const char *prefix
) {
7272 prefix
= strempty(prefix
);
7275 "%sPID: "PID_FMT
"\n",
7278 if (dual_timestamp_is_set(&s
->start_timestamp
))
7280 "%sStart Timestamp: %s\n",
7281 prefix
, FORMAT_TIMESTAMP(s
->start_timestamp
.realtime
));
7283 if (dual_timestamp_is_set(&s
->exit_timestamp
))
7285 "%sExit Timestamp: %s\n"
7287 "%sExit Status: %i\n",
7288 prefix
, FORMAT_TIMESTAMP(s
->exit_timestamp
.realtime
),
7289 prefix
, sigchld_code_to_string(s
->code
),
/* Dumps a single ExecCommand to f: prints a "Command Line:" line built from
 * quote_command_line(c->argv, SHELL_ESCAPE_EMPTY) (passed through strnull() before printing), then dumps
 * the command's exec_status using prefix2, which is prefix with a tab appended. */
7293 static void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
7294 _cleanup_free_
char *cmd
= NULL
;
7295 const char *prefix2
;
7300 prefix
= strempty(prefix
);
7301 prefix2
= strjoina(prefix
, "\t");
7303 cmd
= quote_command_line(c
->argv
, SHELL_ESCAPE_EMPTY
);
7306 "%sCommand Line: %s\n",
7307 prefix
, strnull(cmd
));
7309 exec_status_dump(&c
->exec_status
, f
, prefix2
);
/* Dumps every ExecCommand of the list starting at c (linked through the "command" list field) to f by
 * calling exec_command_dump() on each element with the strempty()-normalized prefix. */
7312 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
7315 prefix
= strempty(prefix
);
7317 LIST_FOREACH(command
, i
, c
)
7318 exec_command_dump(i
, f
, prefix
);
/* Appends command e at the end of the list *l: the current tail is located with LIST_FIND_TAIL() and e
 * is inserted right after it, so existing commands keep their order.
 * NOTE(review): the declaration of end and any handling of an empty list are not visible here. */
7321 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
7328 /* It's kind of important, that we keep the order here */
7329 end
= LIST_FIND_TAIL(command
, *l
);
7330 LIST_INSERT_AFTER(command
, *l
, end
, e
);
/* Replaces the path and argument vector of command c. The new argv l is built from path plus the
 * variadic arguments (strv_new_ap()); c->path is swapped for p and c->argv for l, and the function
 * returns the result of strv_free_and_replace().
 * NOTE(review): how p is derived from path, the va_list handling and error checks are not visible in
 * this excerpt. */
7335 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
7343 l
= strv_new_ap(path
, ap
);
7355 free_and_replace(c
->path
, p
);
7357 return strv_free_and_replace(c
->argv
, l
);
/* Appends further arguments to command c: a temporary vector l is built from path plus the variadic
 * arguments and then added to c->argv via strv_extend_strv() (third argument false). l is released
 * automatically through its _cleanup_strv_free_ attribute.
 * NOTE(review): va_list handling and return values are not visible in this excerpt. */
7360 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
7361 _cleanup_strv_free_
char **l
= NULL
;
7369 l
= strv_new_ap(path
, ap
);
7375 r
= strv_extend_strv(&c
->argv
, l
, false);
/* Schedules removal of the directory tree at path. Unless path equals RUN_SYSTEMD_EMPTY, a debug message
 * is logged and asynchronous_rm_rf() is invoked with REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL; its
 * result is ignored ("(void)"). Callers in this file assign the return value back to the pointer they
 * passed in. NOTE(review): the NULL check and the returned value are not visible in this excerpt. */
7382 static char *destroy_tree(char *path
) {
7386 if (!path_equal(path
, RUN_SYSTEMD_EMPTY
)) {
7387 log_debug("Spawning process to nuke '%s'", path
);
7389 (void) asynchronous_rm_rf(path
, REMOVE_ROOT
|REMOVE_SUBVOLUME
|REMOVE_PHYSICAL
);
/* Frees an ExecSharedRuntime without touching the on-disk directories: the object is removed from its
 * manager's exec_shared_runtime_by_id hashmap (keyed by rt->id), the id/tmp_dir/var_tmp_dir strings are
 * freed and both storage socket pairs are closed.
 * NOTE(review): the NULL guard, freeing of rt itself and the return value are not visible here. */
7395 static ExecSharedRuntime
* exec_shared_runtime_free(ExecSharedRuntime
*rt
) {
7400 (void) hashmap_remove(rt
->manager
->exec_shared_runtime_by_id
, rt
->id
);
7402 rt
->id
= mfree(rt
->id
);
7403 rt
->tmp_dir
= mfree(rt
->tmp_dir
);
7404 rt
->var_tmp_dir
= mfree(rt
->var_tmp_dir
);
7405 safe_close_pair(rt
->netns_storage_socket
);
7406 safe_close_pair(rt
->ipcns_storage_socket
);
/* Generate the unref helper and the cleanup helper (used in this file as exec_shared_runtime_unref()
 * and exec_shared_runtime_freep) on top of exec_shared_runtime_free(). */
7410 DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime
, exec_shared_runtime
, exec_shared_runtime_free
);
7411 DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime
*, exec_shared_runtime_free
);
/* Like exec_shared_runtime_free(), but additionally removes the temporary directories: tmp_dir and
 * var_tmp_dir are passed through destroy_tree() before the object is freed. Asserts rt->n_ref > 0.
 * NOTE(review): the reference-count handling between the assert and the destroy_tree() calls is not
 * visible in this excerpt. */
7413 ExecSharedRuntime
* exec_shared_runtime_destroy(ExecSharedRuntime
*rt
) {
7417 assert(rt
->n_ref
> 0);
7423 rt
->tmp_dir
= destroy_tree(rt
->tmp_dir
);
7424 rt
->var_tmp_dir
= destroy_tree(rt
->var_tmp_dir
);
7426 return exec_shared_runtime_free(rt
);
/* Allocates a new ExecSharedRuntime for the given id. The id string is duplicated with strdup(), the
 * object is allocated with new() and initialized so that it owns the copied id and both storage socket
 * pairs are set to PIPE_EBADF. NOTE(review): allocation-failure checks and the store into
 * *ret are not visible in this excerpt. */
7429 static int exec_shared_runtime_allocate(ExecSharedRuntime
**ret
, const char *id
) {
7430 _cleanup_free_
char *id_copy
= NULL
;
7431 ExecSharedRuntime
*n
;
7435 id_copy
= strdup(id
);
7439 n
= new(ExecSharedRuntime
, 1);
7443 *n
= (ExecSharedRuntime
) {
7444 .id
= TAKE_PTR(id_copy
),
7445 .netns_storage_socket
= PIPE_EBADF
,
7446 .ipcns_storage_socket
= PIPE_EBADF
,
/* Creates an ExecSharedRuntime for id and registers it in m->exec_shared_runtime_by_id (keyed by the
 * object's own id string). On the visible path the directory strings behind tmp_dir/var_tmp_dir are
 * moved into the object with TAKE_PTR(), and — for each socket pair argument that is non-NULL — both
 * fds are moved in with TAKE_FD(). rt carries a cleanup attribute (exec_shared_runtime_freep).
 * NOTE(review): the leading parameters, error checks and the final hand-over through ret are not
 * visible in this excerpt. */
7453 static int exec_shared_runtime_add(
7458 int netns_storage_socket
[2],
7459 int ipcns_storage_socket
[2],
7460 ExecSharedRuntime
**ret
) {
7462 _cleanup_(exec_shared_runtime_freep
) ExecSharedRuntime
*rt
= NULL
;
7468 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
7470 r
= exec_shared_runtime_allocate(&rt
, id
);
7474 r
= hashmap_ensure_put(&m
->exec_shared_runtime_by_id
, &string_hash_ops
, rt
->id
, rt
);
/* NOTE(review): this assert inspects rt->tmp_dir/rt->var_tmp_dir before they are assigned on the next
 * lines; if the intent is to validate the caller-supplied *tmp_dir/*var_tmp_dir, it checks the wrong
 * objects — confirm. */
7478 assert(!!rt
->tmp_dir
== !!rt
->var_tmp_dir
); /* We require both to be set together */
7479 rt
->tmp_dir
= TAKE_PTR(*tmp_dir
);
7480 rt
->var_tmp_dir
= TAKE_PTR(*var_tmp_dir
);
7482 if (netns_storage_socket
) {
7483 rt
->netns_storage_socket
[0] = TAKE_FD(netns_storage_socket
[0]);
7484 rt
->netns_storage_socket
[1] = TAKE_FD(netns_storage_socket
[1]);
7487 if (ipcns_storage_socket
) {
7488 rt
->ipcns_storage_socket
[0] = TAKE_FD(ipcns_storage_socket
[0]);
7489 rt
->ipcns_storage_socket
[1] = TAKE_FD(ipcns_storage_socket
[1]);
7496 /* do not remove created ExecSharedRuntime object when the operation succeeds. */
/* Builds the resources of a new ExecSharedRuntime according to context c and registers it through
 * exec_shared_runtime_add(). Visible steps:
 *   - nothing is needed when c requires neither a network nor an IPC namespace and private_tmp is off;
 *   - with private_tmp, setup_tmp_dirs() is called unless inaccessible_paths already contains "/tmp"
 *     together with "/var/tmp" or "/var";
 *   - one AF_UNIX SOCK_DGRAM|SOCK_CLOEXEC socketpair each is created for the network and IPC namespace
 *     storage sockets when the respective namespace is needed.
 * The local directories and socket pairs carry cleanup attributes.
 * NOTE(review): the remaining parameters, error returns and return values are not visible here. */
7501 static int exec_shared_runtime_make(
7503 const ExecContext
*c
,
7505 ExecSharedRuntime
**ret
) {
7507 _cleanup_(namespace_cleanup_tmpdirp
) char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
7508 _cleanup_close_pair_
int netns_storage_socket
[2] = PIPE_EBADF
, ipcns_storage_socket
[2] = PIPE_EBADF
;
7515 /* It is not necessary to create ExecSharedRuntime object. */
7516 if (!exec_needs_network_namespace(c
) && !exec_needs_ipc_namespace(c
) && !c
->private_tmp
) {
7521 if (c
->private_tmp
&&
7522 !(prefixed_path_strv_contains(c
->inaccessible_paths
, "/tmp") &&
7523 (prefixed_path_strv_contains(c
->inaccessible_paths
, "/var/tmp") ||
7524 prefixed_path_strv_contains(c
->inaccessible_paths
, "/var")))) {
7525 r
= setup_tmp_dirs(id
, &tmp_dir
, &var_tmp_dir
);
7530 if (exec_needs_network_namespace(c
)) {
7531 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, netns_storage_socket
) < 0)
7535 if (exec_needs_ipc_namespace(c
)) {
7536 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, ipcns_storage_socket
) < 0)
7540 r
= exec_shared_runtime_add(m
, id
, &tmp_dir
, &var_tmp_dir
, netns_storage_socket
, ipcns_storage_socket
, ret
);
/* Looks up the ExecSharedRuntime registered under id in m->exec_shared_runtime_by_id; per the inline
 * comments an existing object is reused with its reference count increased, otherwise a new one is
 * created through exec_shared_runtime_make(), which reports r == 0 when no object is needed.
 * NOTE(review): the role of the create flag, the branches and return values are not visible in this
 * excerpt. */
7547 int exec_shared_runtime_acquire(Manager
*m
, const ExecContext
*c
, const char *id
, bool create
, ExecSharedRuntime
**ret
) {
7548 ExecSharedRuntime
*rt
;
7555 rt
= hashmap_get(m
->exec_shared_runtime_by_id
, id
);
7557 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
7565 /* If not found, then create a new object. */
7566 r
= exec_shared_runtime_make(m
, c
, id
, &rt
);
7570 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
7576 /* increment reference counter. */
/* Serializes every ExecSharedRuntime of the manager to f. Each object is written as
 * "exec-runtime=<id>" followed by optional space-separated fields: " tmp-dir=", " var-tmp-dir=", and,
 * for each storage socket fd that is >= 0, " netns-socket-0/1=" / " ipcns-socket-0/1=" carrying the
 * number of a duplicate of the fd that was placed into fds via fdset_put_dup().
 * NOTE(review): the guard around the tmp-dir line, error handling of fdset_put_dup() and the line
 * terminator are not visible in this excerpt. */
7582 int exec_shared_runtime_serialize(const Manager
*m
, FILE *f
, FDSet
*fds
) {
7583 ExecSharedRuntime
*rt
;
7589 HASHMAP_FOREACH(rt
, m
->exec_shared_runtime_by_id
) {
7590 fprintf(f
, "exec-runtime=%s", rt
->id
);
7593 fprintf(f
, " tmp-dir=%s", rt
->tmp_dir
);
7595 if (rt
->var_tmp_dir
)
7596 fprintf(f
, " var-tmp-dir=%s", rt
->var_tmp_dir
);
7598 if (rt
->netns_storage_socket
[0] >= 0) {
7601 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
7605 fprintf(f
, " netns-socket-0=%i", copy
);
7608 if (rt
->netns_storage_socket
[1] >= 0) {
7611 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
7615 fprintf(f
, " netns-socket-1=%i", copy
);
7618 if (rt
->ipcns_storage_socket
[0] >= 0) {
7621 copy
= fdset_put_dup(fds
, rt
->ipcns_storage_socket
[0]);
7625 fprintf(f
, " ipcns-socket-0=%i", copy
);
7628 if (rt
->ipcns_storage_socket
[1] >= 0) {
7631 copy
= fdset_put_dup(fds
, rt
->ipcns_storage_socket
[1]);
7635 fprintf(f
, " ipcns-socket-1=%i", copy
);
/* Compatibility deserializer for a single key/value pair of the old per-unit serialization format.
 * The target object is the ExecSharedRuntime registered under u->id, or a freshly allocated one
 * (rt_create, guarded by a cleanup attribute). Visible keys: "tmp-dir" and "var-tmp-dir" (value is
 * duplicated into the object) and "netns-socket-0"/"netns-socket-1" (value is parsed as an fd that must
 * be contained in fds; the previously stored fd is closed and the new one is taken out of the set).
 * A newly created object is inserted into the manager's hashmap, gets its manager pointer set and is
 * then released from the cleanup handler with TAKE_PTR().
 * NOTE(review): return values and several branch bodies are not visible in this excerpt. */
7644 int exec_shared_runtime_deserialize_compat(Unit
*u
, const char *key
, const char *value
, FDSet
*fds
) {
7645 _cleanup_(exec_shared_runtime_freep
) ExecSharedRuntime
*rt_create
= NULL
;
7646 ExecSharedRuntime
*rt
;
7649 /* This is for the migration from old (v237 or earlier) deserialization text.
7650 * Due to bug #7790, this may not work with units that use JoinsNamespaceOf=.
7651 * Even if the ExecSharedRuntime object was originally created by another unit, we cannot tell
7652 * that from the serialized text, so we always create a new object owned by this unit. */
7658 /* Manager manages ExecSharedRuntime objects by the unit id.
7659 * So, we omit the serialized text when the unit does not have id (yet?)... */
7660 if (isempty(u
->id
)) {
/* NOTE(review): the condition tests the unit id (u->id), while the message talks about the
 * "Invocation ID" — the wording looks misleading; confirm. */
7661 log_unit_debug(u
, "Invocation ID not found. Dropping runtime parameter.");
7665 if (hashmap_ensure_allocated(&u
->manager
->exec_shared_runtime_by_id
, &string_hash_ops
) < 0)
7668 rt
= hashmap_get(u
->manager
->exec_shared_runtime_by_id
, u
->id
);
7670 if (exec_shared_runtime_allocate(&rt_create
, u
->id
) < 0)
7676 if (streq(key
, "tmp-dir")) {
7677 if (free_and_strdup_warn(&rt
->tmp_dir
, value
) < 0)
7680 } else if (streq(key
, "var-tmp-dir")) {
7681 if (free_and_strdup_warn(&rt
->var_tmp_dir
, value
) < 0)
7684 } else if (streq(key
, "netns-socket-0")) {
7687 if ((fd
= parse_fd(value
)) < 0 || !fdset_contains(fds
, fd
)) {
7688 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
7692 safe_close(rt
->netns_storage_socket
[0]);
7693 rt
->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
7695 } else if (streq(key
, "netns-socket-1")) {
7698 if ((fd
= parse_fd(value
)) < 0 || !fdset_contains(fds
, fd
)) {
7699 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
7703 safe_close(rt
->netns_storage_socket
[1]);
7704 rt
->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
7709 /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
7711 r
= hashmap_put(u
->manager
->exec_shared_runtime_by_id
, rt_create
->id
, rt_create
);
7713 log_unit_debug_errno(u
, r
, "Failed to put runtime parameter to manager's storage: %m");
7717 rt_create
->manager
= u
->manager
;
7720 TAKE_PTR(rt_create
);
/* Deserializes one "exec-runtime=" value. The value starts with the id (everything up to the first
 * space), optionally followed by the fields "tmp-dir=", "var-tmp-dir=", "netns-socket-0=",
 * "netns-socket-1=", "ipcns-socket-0=" and "ipcns-socket-1=", each delimited by a space. Socket fields
 * are parsed with parse_fd(), must be contained in fds (otherwise a synthetic EBADF is logged and
 * returned) and are then taken out of the set with fdset_remove(). The collected pieces are handed to
 * exec_shared_runtime_add().
 * NOTE(review): the fd pairs are plain int arrays initialized to -1 with no cleanup attribute visible,
 * so fds already removed from the set may leak on a later error return — confirm against the lines not
 * visible in this excerpt. */
7726 int exec_shared_runtime_deserialize_one(Manager
*m
, const char *value
, FDSet
*fds
) {
7727 _cleanup_free_
char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
7729 int r
, netns_fdpair
[] = {-1, -1}, ipcns_fdpair
[] = {-1, -1};
7730 const char *p
, *v
= ASSERT_PTR(value
);
/* The id is the leading token up to the first space. */
7736 n
= strcspn(v
, " ");
7737 id
= strndupa_safe(v
, n
);
7742 v
= startswith(p
, "tmp-dir=");
7744 n
= strcspn(v
, " ");
7745 tmp_dir
= strndup(v
, n
);
7753 v
= startswith(p
, "var-tmp-dir=");
7755 n
= strcspn(v
, " ");
7756 var_tmp_dir
= strndup(v
, n
);
7764 v
= startswith(p
, "netns-socket-0=");
7768 n
= strcspn(v
, " ");
7769 buf
= strndupa_safe(v
, n
);
7771 netns_fdpair
[0] = parse_fd(buf
);
7772 if (netns_fdpair
[0] < 0)
7773 return log_debug_errno(netns_fdpair
[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf
);
7774 if (!fdset_contains(fds
, netns_fdpair
[0]))
7775 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
7776 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair
[0]);
7777 netns_fdpair
[0] = fdset_remove(fds
, netns_fdpair
[0]);
7783 v
= startswith(p
, "netns-socket-1=");
7787 n
= strcspn(v
, " ");
7788 buf
= strndupa_safe(v
, n
);
7790 netns_fdpair
[1] = parse_fd(buf
);
7791 if (netns_fdpair
[1] < 0)
7792 return log_debug_errno(netns_fdpair
[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf
);
7793 if (!fdset_contains(fds
, netns_fdpair
[1]))
7794 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
7795 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair
[1]);
7796 netns_fdpair
[1] = fdset_remove(fds
, netns_fdpair
[1]);
7802 v
= startswith(p
, "ipcns-socket-0=");
7806 n
= strcspn(v
, " ");
7807 buf
= strndupa_safe(v
, n
);
7809 ipcns_fdpair
[0] = parse_fd(buf
);
7810 if (ipcns_fdpair
[0] < 0)
7811 return log_debug_errno(ipcns_fdpair
[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf
);
7812 if (!fdset_contains(fds
, ipcns_fdpair
[0]))
7813 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
7814 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair
[0]);
7815 ipcns_fdpair
[0] = fdset_remove(fds
, ipcns_fdpair
[0]);
7821 v
= startswith(p
, "ipcns-socket-1=");
7825 n
= strcspn(v
, " ");
7826 buf
= strndupa_safe(v
, n
);
7828 ipcns_fdpair
[1] = parse_fd(buf
);
7829 if (ipcns_fdpair
[1] < 0)
7830 return log_debug_errno(ipcns_fdpair
[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf
);
7831 if (!fdset_contains(fds
, ipcns_fdpair
[1]))
7832 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
7833 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair
[1]);
7834 ipcns_fdpair
[1] = fdset_remove(fds
, ipcns_fdpair
[1]);
/* ret is NULL here: the caller does not need the created object back. */
7838 r
= exec_shared_runtime_add(m
, id
, &tmp_dir
, &var_tmp_dir
, netns_fdpair
, ipcns_fdpair
, NULL
);
7840 return log_debug_errno(r
, "Failed to add exec-runtime: %m");
/* Walks all ExecSharedRuntime objects registered in the manager and frees entries via
 * exec_shared_runtime_free(). NOTE(review): the condition that skips still-referenced objects is not
 * visible in this excerpt; the inline comment states only unreferenced ones are freed. */
7844 void exec_shared_runtime_vacuum(Manager
*m
) {
7845 ExecSharedRuntime
*rt
;
7849 /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
7851 HASHMAP_FOREACH(rt
, m
->exec_shared_runtime_by_id
) {
7855 (void) exec_shared_runtime_free(rt
);
/* Creates an ExecRuntime that bundles the shared runtime, the dynamic credentials and, if needed, the
 * ephemeral-copy state. There is a dedicated branch for the case that neither shared nor creds is
 * given and the context needs no ephemeral copy. When exec_needs_ephemeral(context) holds, the
 * directory "/var/lib/systemd/ephemeral-trees" is created (mode 0755), a random child path derived
 * from unit->id is generated in it and an AF_UNIX SOCK_DGRAM|SOCK_CLOEXEC socketpair is opened. The
 * path and both socket fds are moved into the new object (TAKE_PTR/TAKE_FD), which is finally handed
 * to the caller through *ret.
 * NOTE(review): the first parameter, error checks and return values are not visible in this excerpt. */
7859 int exec_runtime_make(
7861 const ExecContext
*context
,
7862 ExecSharedRuntime
*shared
,
7863 DynamicCreds
*creds
,
7864 ExecRuntime
**ret
) {
7865 _cleanup_close_pair_
int ephemeral_storage_socket
[2] = PIPE_EBADF
;
7866 _cleanup_free_
char *ephemeral
= NULL
;
7867 _cleanup_(exec_runtime_freep
) ExecRuntime
*rt
= NULL
;
7874 if (!shared
&& !creds
&& !exec_needs_ephemeral(context
)) {
7879 if (exec_needs_ephemeral(context
)) {
7880 r
= mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
7884 r
= tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit
->id
, &ephemeral
);
7888 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, ephemeral_storage_socket
) < 0)
7892 rt
= new(ExecRuntime
, 1);
7896 *rt
= (ExecRuntime
) {
7898 .dynamic_creds
= creds
,
7899 .ephemeral_copy
= TAKE_PTR(ephemeral
),
7900 .ephemeral_storage_socket
[0] = TAKE_FD(ephemeral_storage_socket
[0]),
7901 .ephemeral_storage_socket
[1] = TAKE_FD(ephemeral_storage_socket
[1]),
7904 *ret
= TAKE_PTR(rt
);
/* Releases an ExecRuntime: drops the references on the shared runtime and the dynamic credentials
 * (unref calls), passes the ephemeral copy path through destroy_tree() and closes the ephemeral storage
 * socket pair. NOTE(review): the NULL guard, freeing of rt itself and the return value are not visible
 * in this excerpt. */
7908 ExecRuntime
* exec_runtime_free(ExecRuntime
*rt
) {
7912 exec_shared_runtime_unref(rt
->shared
);
7913 dynamic_creds_unref(rt
->dynamic_creds
);
7915 rt
->ephemeral_copy
= destroy_tree(rt
->ephemeral_copy
);
7917 safe_close_pair(rt
->ephemeral_storage_socket
);
/* Destroying variant of exec_runtime_free(): the shared runtime and the dynamic credentials are run
 * through their *_destroy() functions (results stored back into rt) before the object is released with
 * exec_runtime_free(). NOTE(review): the NULL guard is not visible in this excerpt. */
7921 ExecRuntime
* exec_runtime_destroy(ExecRuntime
*rt
) {
7925 rt
->shared
= exec_shared_runtime_destroy(rt
->shared
);
7926 rt
->dynamic_creds
= dynamic_creds_destroy(rt
->dynamic_creds
);
7927 return exec_runtime_free(rt
);
/* Releases the resources held by the ExecParameters visible here: the environment and fd_names string
 * vectors, the fds array and the exec_fd file descriptor. Each member is overwritten with the value
 * returned by its release function. */
7930 void exec_params_clear(ExecParameters
*p
) {
7934 p
->environment
= strv_free(p
->environment
);
7935 p
->fd_names
= strv_free(p
->fd_names
);
7936 p
->fds
= mfree(p
->fds
);
7937 p
->exec_fd
= safe_close(p
->exec_fd
);
/* Destructor for an ExecSetCredential; it is registered as the value destructor of
 * exec_set_credential_hash_ops in this file. NOTE(review): the body is not visible in this excerpt. */
7940 ExecSetCredential
*exec_set_credential_free(ExecSetCredential
*sc
) {
/* Destructor for an ExecLoadCredential; it is registered as the value destructor of
 * exec_load_credential_hash_ops in this file. NOTE(review): the body is not visible in this excerpt. */
7949 ExecLoadCredential
*exec_load_credential_free(ExecLoadCredential
*lc
) {
/* Releases the contents of an ExecDirectory: for each of the d->n_items entries the path string and the
 * symlinks vector are freed, then the items array itself is freed and the pointer reset.
 * NOTE(review): resetting of n_items is not visible in this excerpt. */
7958 void exec_directory_done(ExecDirectory
*d
) {
7962 for (size_t i
= 0; i
< d
->n_items
; i
++) {
7963 free(d
->items
[i
].path
);
7964 strv_free(d
->items
[i
].symlinks
);
7967 d
->items
= mfree(d
->items
);
/* Linear search through d->items for an entry whose path compares equal to path according to
 * path_equal(); returns a pointer to the first match.
 * NOTE(review): the not-found return value is not visible in this excerpt (presumably NULL). */
7972 static ExecDirectoryItem
*exec_directory_find(ExecDirectory
*d
, const char *path
) {
7976 for (size_t i
= 0; i
< d
->n_items
; i
++)
7977 if (path_equal(d
->items
[i
].path
, path
))
7978 return &d
->items
[i
];
/* Adds path (with an optional symlink) to the ExecDirectory. If an item with an equal path already
 * exists, the symlink is appended to that item's symlinks and 0 is returned. Otherwise a new item is
 * appended to d->items (grown with GREEDY_REALLOC) that takes ownership of the path copy p and the
 * symlink vector s, and 1 is returned.
 * NOTE(review): how p is created, NULL handling for symlink and the error returns are not visible in
 * this excerpt. */
7983 int exec_directory_add(ExecDirectory
*d
, const char *path
, const char *symlink
) {
7984 _cleanup_strv_free_
char **s
= NULL
;
7985 _cleanup_free_
char *p
= NULL
;
7986 ExecDirectoryItem
*existing
;
7992 existing
= exec_directory_find(d
, path
);
7994 r
= strv_extend(&existing
->symlinks
, symlink
);
7998 return 0; /* existing item is updated */
8006 s
= strv_new(symlink
);
8011 if (!GREEDY_REALLOC(d
->items
, d
->n_items
+ 1))
8014 d
->items
[d
->n_items
++] = (ExecDirectoryItem
) {
8015 .path
= TAKE_PTR(p
),
8016 .symlinks
= TAKE_PTR(s
),
8019 return 1; /* new item is added */
/* Comparison callback used by exec_directory_sort(): orders two items by path_compare() of their
 * paths. */
8022 static int exec_directory_item_compare_func(const ExecDirectoryItem
*a
, const ExecDirectoryItem
*b
) {
8026 return path_compare(a
->path
, b
->path
);
/* Sorts the items of an ExecDirectory by path and marks nested entries: after sorting, every item whose
 * path starts with the path of an earlier item (path_startswith()) gets only_create set to true.
 * Directories with at most one item take the early-exit branch. */
8029 void exec_directory_sort(ExecDirectory
*d
) {
8032 /* Sort the exec directories so that parent directories are always processed first in
8033 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
8034 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
8035 * list. See also comments in setup_exec_directory() and issue #24783. */
8037 if (d
->n_items
<= 1)
8040 typesafe_qsort(d
->items
, d
->n_items
, exec_directory_item_compare_func
);
/* Compare each item against all items sorted before it. */
8042 for (size_t i
= 1; i
< d
->n_items
; i
++)
8043 for (size_t j
= 0; j
< i
; j
++)
8044 if (path_startswith(d
->items
[i
].path
, d
->items
[j
].path
)) {
8045 d
->items
[i
].only_create
= true;
/* Parses a clean-mask name: "all" maps to EXEC_CLEAN_ALL and "fdstore" to EXEC_CLEAN_FDSTORE; any other
 * string is looked up with exec_resource_type_from_string().
 * NOTE(review): only the error-path cast "(ExecCleanMask) t" is visible; the conversion of a valid type
 * into its mask bit is not shown in this excerpt. */
8050 ExecCleanMask
exec_clean_mask_from_string(const char *s
) {
8051 ExecDirectoryType t
;
8055 if (streq(s
, "all"))
8056 return EXEC_CLEAN_ALL
;
8057 if (streq(s
, "fdstore"))
8058 return EXEC_CLEAN_FDSTORE
;
8060 t
= exec_resource_type_from_string(s
);
8062 return (ExecCleanMask
) t
;
/* Hash operations for the credential hashmaps: string keys (string_hash_func/string_compare_func) with
 * exec_set_credential_free() resp. exec_load_credential_free() registered as value destructors. */
8067 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops
, char, string_hash_func
, string_compare_func
, ExecSetCredential
, exec_set_credential_free
);
8068 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops
, char, string_hash_func
, string_compare_func
, ExecLoadCredential
, exec_load_credential_free
);
/* String names of the ExecInput values, indexed by enum value. DEFINE_STRING_TABLE_LOOKUP() generates
 * the lookup helpers for this table (presumably exec_input_to_string()/_from_string() — confirm
 * against the macro definition). */
8070 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
8071 [EXEC_INPUT_NULL
] = "null",
8072 [EXEC_INPUT_TTY
] = "tty",
8073 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
8074 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
8075 [EXEC_INPUT_SOCKET
] = "socket",
8076 [EXEC_INPUT_NAMED_FD
] = "fd",
8077 [EXEC_INPUT_DATA
] = "data",
8078 [EXEC_INPUT_FILE
] = "file",
8081 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
/* String names of the ExecOutput values, indexed by enum value; lookup helpers are generated by
 * DEFINE_STRING_TABLE_LOOKUP(). */
8083 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
8084 [EXEC_OUTPUT_INHERIT
] = "inherit",
8085 [EXEC_OUTPUT_NULL
] = "null",
8086 [EXEC_OUTPUT_TTY
] = "tty",
8087 [EXEC_OUTPUT_KMSG
] = "kmsg",
8088 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
8089 [EXEC_OUTPUT_JOURNAL
] = "journal",
8090 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
8091 [EXEC_OUTPUT_SOCKET
] = "socket",
8092 [EXEC_OUTPUT_NAMED_FD
] = "fd",
8093 [EXEC_OUTPUT_FILE
] = "file",
8094 [EXEC_OUTPUT_FILE_APPEND
] = "append",
8095 [EXEC_OUTPUT_FILE_TRUNCATE
] = "truncate",
8098 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
/* String names of the ExecUtmpMode values, indexed by enum value; lookup helpers are generated by
 * DEFINE_STRING_TABLE_LOOKUP(). */
8100 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
8101 [EXEC_UTMP_INIT
] = "init",
8102 [EXEC_UTMP_LOGIN
] = "login",
8103 [EXEC_UTMP_USER
] = "user",
8106 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
/* String names of the ExecPreserveMode values, indexed by enum value. The lookup helpers come from the
 * _WITH_BOOLEAN macro variant with EXEC_PRESERVE_YES as its extra argument (presumably the value that
 * boolean "true" strings map to — confirm against the macro definition). */
8108 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
8109 [EXEC_PRESERVE_NO
] = "no",
8110 [EXEC_PRESERVE_YES
] = "yes",
8111 [EXEC_PRESERVE_RESTART
] = "restart",
8114 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
8116 /* This table maps ExecDirectoryType to the setting it is configured with in the unit.
 * Lookup helpers are generated by DEFINE_STRING_TABLE_LOOKUP() below. */
8117 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
8118 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
8119 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
8120 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
8121 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
8122 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
8125 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);
8127 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit.
 * Lookup helpers are generated by DEFINE_STRING_TABLE_LOOKUP() below. */
8128 static const char* const exec_directory_type_symlink_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
8129 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectorySymlink",
8130 [EXEC_DIRECTORY_STATE
] = "StateDirectorySymlink",
8131 [EXEC_DIRECTORY_CACHE
] = "CacheDirectorySymlink",
8132 [EXEC_DIRECTORY_LOGS
] = "LogsDirectorySymlink",
8133 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectorySymlink",
8136 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink
, ExecDirectoryType
);
8138 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
8139 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
8140 * directories, specifically .timer units with their timestamp touch file. The generated
 * exec_resource_type_from_string() is what exec_clean_mask_from_string() uses for its lookup. */
8141 static const char* const exec_resource_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
8142 [EXEC_DIRECTORY_RUNTIME
] = "runtime",
8143 [EXEC_DIRECTORY_STATE
] = "state",
8144 [EXEC_DIRECTORY_CACHE
] = "cache",
8145 [EXEC_DIRECTORY_LOGS
] = "logs",
8146 [EXEC_DIRECTORY_CONFIGURATION
] = "configuration",
8149 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type
, ExecDirectoryType
);
8151 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
8152 * the service payload in. Only a file-private to-string helper is generated for it (see the
 * DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING() invocation below). */
8153 static const char* const exec_directory_env_name_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
8154 [EXEC_DIRECTORY_RUNTIME
] = "RUNTIME_DIRECTORY",
8155 [EXEC_DIRECTORY_STATE
] = "STATE_DIRECTORY",
8156 [EXEC_DIRECTORY_CACHE
] = "CACHE_DIRECTORY",
8157 [EXEC_DIRECTORY_LOGS
] = "LOGS_DIRECTORY",
8158 [EXEC_DIRECTORY_CONFIGURATION
] = "CONFIGURATION_DIRECTORY",
8161 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name
, ExecDirectoryType
);
/* String names of the ExecKeyringMode values, indexed by enum value; lookup helpers are generated by
 * DEFINE_STRING_TABLE_LOOKUP(). */
8163 static const char* const exec_keyring_mode_table
[_EXEC_KEYRING_MODE_MAX
] = {
8164 [EXEC_KEYRING_INHERIT
] = "inherit",
8165 [EXEC_KEYRING_PRIVATE
] = "private",
8166 [EXEC_KEYRING_SHARED
] = "shared",
8169 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode
, ExecKeyringMode
);