1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
6 #include <sys/eventfd.h>
10 #include <sys/mount.h>
11 #include <sys/personality.h>
12 #include <sys/prctl.h>
14 #include <sys/types.h>
19 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
22 #include <security/pam_appl.h>
26 #include <selinux/selinux.h>
34 #include <sys/apparmor.h>
37 #include "sd-messages.h"
41 #include "alloc-util.h"
43 #include "apparmor-util.h"
45 #include "argv-util.h"
49 #include "btrfs-util.h"
51 #include "capability-util.h"
52 #include "chattr-util.h"
53 #include "cgroup-setup.h"
55 #include "chown-recursive.h"
56 #include "constants.h"
57 #include "cpu-set-util.h"
58 #include "creds-util.h"
59 #include "data-fd-util.h"
62 #include "errno-list.h"
65 #include "exit-status.h"
68 #include "format-util.h"
69 #include "glob-util.h"
70 #include "hexdecoct.h"
72 #include "ioprio-util.h"
73 #include "label-util.h"
74 #include "lock-util.h"
78 #include "manager-dump.h"
79 #include "memory-util.h"
80 #include "missing_fs.h"
81 #include "missing_ioprio.h"
82 #include "missing_prctl.h"
83 #include "mkdir-label.h"
84 #include "mount-util.h"
85 #include "mountpoint-util.h"
86 #include "namespace.h"
87 #include "parse-util.h"
88 #include "path-util.h"
89 #include "proc-cmdline.h"
90 #include "process-util.h"
92 #include "random-util.h"
93 #include "recurse-dir.h"
94 #include "rlimit-util.h"
97 #include "seccomp-util.h"
99 #include "securebits-util.h"
100 #include "selinux-util.h"
101 #include "signal-util.h"
102 #include "smack-util.h"
103 #include "socket-util.h"
104 #include "sort-util.h"
106 #include "stat-util.h"
107 #include "string-table.h"
108 #include "string-util.h"
110 #include "syslog-util.h"
111 #include "terminal-util.h"
112 #include "tmpfile-util.h"
113 #include "umask-util.h"
114 #include "unit-serialize.h"
115 #include "user-util.h"
116 #include "utmp-wtmp.h"
/* Tunables: two idle timeouts (5 s and 1 s, expressed via USEC_PER_SEC) and the 8 MiB send buffer
 * size that connect_logger_as() requests through fd_inc_sndbuf() for the journal stream socket.
 * NOTE(review): the users of the two idle timeouts are not visible in this part of the file. */
118 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
119 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
121 #define SNDBUF_SIZE (8*1024*1024)
/* Renumbers the passed fds so that fds[i] ends up at fd number i + 3, duplicating each entry with
 * F_DUPFD. When the wanted number is not free, the first such index is remembered in restart_from
 * and the pass is repeated starting there; the loop ends once no restart is needed.
 * NOTE(review): parts of the body are not visible here — confirm details against the full source. */
123 static int shift_fds(int fds
[], size_t n_fds
) {
127 /* Modifies the fds array! (sorts it) */
131 for (int start
= 0;;) {
132 int restart_from
= -1;
134 for (int i
= start
; i
< (int) n_fds
; i
++) {
137 /* Already at right index? */
141 nfd
= fcntl(fds
[i
], F_DUPFD
, i
+ 3);
148 /* Hmm, the fd we wanted isn't free? Then
149 * let's remember that and try again from here */
150 if (nfd
!= i
+3 && restart_from
< 0)
154 if (restart_from
< 0)
157 start
= restart_from
;
/* Adjusts the flags of the fds that are passed on: for the first n_socket_fds entries O_NONBLOCK
 * is set according to 'nonblock', and FD_CLOEXEC is cleared on every entry.
 * NOTE(review): the parameter list and error handling are not visible in this excerpt. */
163 static int flags_fds(
176 /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
177 * O_NONBLOCK only applies to socket activation though. */
179 for (size_t i
= 0; i
< n_fds
; i
++) {
181 if (i
< n_socket_fds
) {
182 r
= fd_nonblock(fds
[i
], nonblock
);
187 /* We unconditionally drop FD_CLOEXEC from the fds,
188 * since after all we want to pass these fds to our
191 r
= fd_cloexec(fds
[i
], false);
/* Returns the TTY path to use for this context: the configured tty_path if set, otherwise
 * "/dev/console". The stdio_as_fds case is checked first.
 * NOTE(review): what is returned for stdio_as_fds is not visible in this excerpt. */
199 static const char *exec_context_tty_path(const ExecContext
*context
) {
202 if (context
->stdio_as_fds
)
205 if (context
->tty_path
)
206 return context
->tty_path
;
208 return "/dev/console";
/* Determines the terminal dimensions for the context. Starts from tty_rows/tty_cols and, for each
 * value that is still UINT_MAX (presumably meaning "unset" — confirm), asks
 * proc_cmdline_tty_size() to fill it in for the context's TTY path. Failure of that lookup is
 * ignored. NOTE(review): how ret_rows/ret_cols are written back is not visible here. */
211 static int exec_context_tty_size(const ExecContext
*context
, unsigned *ret_rows
, unsigned *ret_cols
) {
219 rows
= context
->tty_rows
;
220 cols
= context
->tty_cols
;
222 tty
= exec_context_tty_path(context
);
224 (void) proc_cmdline_tty_size(tty
, rows
== UINT_MAX
? &rows
: NULL
, cols
== UINT_MAX
? &cols
: NULL
);
/* Resets the TTY associated with the context. An exclusive BSD lock is taken on the terminal,
 * either through p->stdin_fd when one was passed or by opening the TTY path. Then, as configured
 * in the context, the terminal is hung up (tty_vhangup), reset (tty_reset), resized (only when
 * p->stdin_fd is valid) and its VT disallocated (tty_vt_disallocate). All of these steps are
 * best-effort: their return values are discarded. */
232 static void exec_context_tty_reset(const ExecContext
*context
, const ExecParameters
*p
) {
233 _cleanup_close_
int fd
= -EBADF
;
234 const char *path
= exec_context_tty_path(ASSERT_PTR(context
));
236 /* Take a lock around the device for the duration of the setup that we do here.
237 * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
238 * We open a new fd that will be closed automatically, and operate on it for convenience.
241 if (p
&& p
->stdin_fd
>= 0) {
242 fd
= xopenat_lock(p
->stdin_fd
, NULL
,
243 O_RDONLY
|O_CLOEXEC
|O_NONBLOCK
|O_NOCTTY
, 0, 0, LOCK_BSD
, LOCK_EX
);
247 fd
= open_terminal(path
, O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NONBLOCK
);
251 if (lock_generic(fd
, LOCK_BSD
, LOCK_EX
) < 0)
254 return; /* nothing to do */
256 if (context
->tty_vhangup
)
257 (void) terminal_vhangup_fd(fd
);
259 if (context
->tty_reset
)
260 (void) reset_terminal_fd(fd
, true);
262 if (p
&& p
->stdin_fd
>= 0) {
263 unsigned rows
= context
->tty_rows
, cols
= context
->tty_cols
;
265 (void) exec_context_tty_size(context
, &rows
, &cols
);
266 (void) terminal_set_size_fd(p
->stdin_fd
, path
, rows
, cols
);
269 if (context
->tty_vt_disallocate
&& path
)
270 (void) vt_disallocate(path
);
/* Tells whether the input kind reads from a terminal. The visible members of the checked list are
 * EXEC_INPUT_TTY_FORCE and EXEC_INPUT_TTY_FAIL; NOTE(review): the start of the list is not
 * visible in this excerpt. */
273 static bool is_terminal_input(ExecInput i
) {
276 EXEC_INPUT_TTY_FORCE
,
277 EXEC_INPUT_TTY_FAIL
);
/* Tells whether the output kind writes to a terminal/console. The visible members of the checked
 * list are EXEC_OUTPUT_KMSG_AND_CONSOLE and EXEC_OUTPUT_JOURNAL_AND_CONSOLE; NOTE(review): the
 * start of the list is not visible in this excerpt. */
280 static bool is_terminal_output(ExecOutput o
) {
283 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
284 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
);
/* Tells whether the output kind goes to the kernel log. EXEC_OUTPUT_KMSG_AND_CONSOLE is the
 * visible member of the checked list; NOTE(review): the start of the list is not visible here. */
287 static bool is_kmsg_output(ExecOutput o
) {
290 EXEC_OUTPUT_KMSG_AND_CONSOLE
);
/* Decides whether $TERM should be set for the context. Checks in turn whether stdin, stdout or
 * stderr is terminal-backed; if none of those checks settles it, the answer is whether a
 * tty_path is configured. */
293 static bool exec_context_needs_term(const ExecContext
*c
) {
296 /* Return true if the execution context suggests we should set $TERM to something useful. */
298 if (is_terminal_input(c
->std_input
))
301 if (is_terminal_output(c
->std_output
))
304 if (is_terminal_output(c
->std_error
))
307 return !!c
->tty_path
;
/* Opens /dev/null with the given flags (plus O_NOCTTY) and moves the resulting fd to number nfd
 * via move_fd(). Returns move_fd()'s result on that path. */
310 static int open_null_as(int flags
, int nfd
) {
315 fd
= open("/dev/null", flags
|O_NOCTTY
);
319 return move_fd(fd
, nfd
, false);
/* Connects fd to the journal's stdout stream socket: either the namespaced path
 * "/run/systemd/journal.<log_namespace>/stdout" or the default "/run/systemd/journal/stdout".
 * If a valid gid and/or uid is given, the effective GID/UID is switched before connecting and
 * switched back afterwards; failures while switching back are ignored.
 * NOTE(review): the condition selecting the path and the error returns are not visible here. */
322 static int connect_journal_socket(
324 const char *log_namespace
,
328 uid_t olduid
= UID_INVALID
;
329 gid_t oldgid
= GID_INVALID
;
334 strjoina("/run/systemd/journal.", log_namespace
, "/stdout") :
335 "/run/systemd/journal/stdout";
337 if (gid_is_valid(gid
)) {
340 if (setegid(gid
) < 0)
344 if (uid_is_valid(uid
)) {
347 if (seteuid(uid
) < 0) {
353 r
= connect_unix_path(fd
, AT_FDCWD
, j
);
355 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
356 an LSM interferes. */
358 if (uid_is_valid(uid
))
359 (void) seteuid(olduid
);
362 if (gid_is_valid(gid
))
363 (void) setegid(oldgid
);
/* Connects a stdio slot to the journal as a stream. Creates an AF_UNIX stream socket, connects it
 * with connect_journal_socket(), shuts down the read side, tries to enlarge the send buffer to
 * SNDBUF_SIZE (best-effort), and emits a stream header built from the syslog identifier (falling
 * back to ident), the unit id (only with EXEC_PASS_LOG_UNIT, otherwise empty), the syslog
 * priority, the level-prefix flag and whether the output is also forwarded to kmsg/console.
 * Finally the socket is moved to fd number nfd.
 * NOTE(review): the call that writes the header is not visible in this excerpt. */
368 static int connect_logger_as(
370 const ExecContext
*context
,
371 const ExecParameters
*params
,
378 _cleanup_close_
int fd
= -EBADF
;
383 assert(output
< _EXEC_OUTPUT_MAX
);
387 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
391 r
= connect_journal_socket(fd
, context
->log_namespace
, uid
, gid
);
395 if (shutdown(fd
, SHUT_RD
) < 0)
398 (void) fd_inc_sndbuf(fd
, SNDBUF_SIZE
);
408 context
->syslog_identifier
?: ident
,
409 params
->flags
& EXEC_PASS_LOG_UNIT
? unit
->id
: "",
410 context
->syslog_priority
,
411 !!context
->syslog_level_prefix
,
413 is_kmsg_output(output
),
414 is_terminal_output(output
)) < 0)
417 return move_fd(TAKE_FD(fd
), nfd
, false);
/* Opens the terminal at path with the given flags (plus O_NOCTTY) and moves the resulting fd to
 * number nfd via move_fd(). */
420 static int open_terminal_as(const char *path
, int flags
, int nfd
) {
426 fd
= open_terminal(path
, flags
| O_NOCTTY
);
430 return move_fd(fd
, nfd
, false);
/* Obtains an fd for a path used as stdio. A plain open() (with O_NOCTTY and the given mode) is
 * tried first. If it fails with ENXIO the path may be an AF_UNIX socket, so a stream socket is
 * created and connected to it instead; -ENOTSOCK/-EINVAL from the connect indicate it was not a
 * socket after all. For a connected socket, the unused direction is shut down: the write side for
 * O_RDONLY, the read side for O_WRONLY.
 * NOTE(review): the action taken for write access modes near the top is not visible here. */
433 static int acquire_path(const char *path
, int flags
, mode_t mode
) {
434 _cleanup_close_
int fd
= -EBADF
;
439 if (IN_SET(flags
& O_ACCMODE
, O_WRONLY
, O_RDWR
))
442 fd
= open(path
, flags
|O_NOCTTY
, mode
);
446 if (errno
!= ENXIO
) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
449 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
451 fd
= socket(AF_UNIX
, SOCK_STREAM
, 0);
455 r
= connect_unix_path(fd
, AT_FDCWD
, path
);
456 if (IN_SET(r
, -ENOTSOCK
, -EINVAL
))
457 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
458 * wasn't an AF_UNIX socket after all */
463 if ((flags
& O_ACCMODE
) == O_RDONLY
)
464 r
= shutdown(fd
, SHUT_WR
);
465 else if ((flags
& O_ACCMODE
) == O_WRONLY
)
466 r
= shutdown(fd
, SHUT_RD
);
/* Computes the effective input kind for the context. The configured std_input is downgraded to
 * EXEC_INPUT_NULL when it is terminal input but apply_tty_stdin is false, when it is socket
 * input but no valid socket_fd exists, or when it is data input with zero bytes of data.
 * NOTE(review): the final return for the unchanged case is not visible in this excerpt. */
475 static int fixup_input(
476 const ExecContext
*context
,
478 bool apply_tty_stdin
) {
484 std_input
= context
->std_input
;
486 if (is_terminal_input(std_input
) && !apply_tty_stdin
)
487 return EXEC_INPUT_NULL
;
489 if (std_input
== EXEC_INPUT_SOCKET
&& socket_fd
< 0)
490 return EXEC_INPUT_NULL
;
492 if (std_input
== EXEC_INPUT_DATA
&& context
->stdin_data_size
== 0)
493 return EXEC_INPUT_NULL
;
/* Computes the effective output kind: EXEC_OUTPUT_SOCKET without a valid socket_fd is downgraded
 * to EXEC_OUTPUT_INHERIT. NOTE(review): the return for all other cases is not visible here. */
498 static int fixup_output(ExecOutput output
, int socket_fd
) {
500 if (output
== EXEC_OUTPUT_SOCKET
&& socket_fd
< 0)
501 return EXEC_OUTPUT_INHERIT
;
/* Sets up STDIN_FILENO for the process to be executed.
 *
 * A stdin fd passed in params takes precedence: it is dup2()ed onto stdin and, if it is a TTY,
 * made the controlling terminal (forced for EXEC_INPUT_TTY_FORCE), reset and resized, all
 * best-effort. Otherwise the kind computed by fixup_input() selects the source:
 *   - EXEC_INPUT_NULL: /dev/null opened read-only
 *   - EXEC_INPUT_TTY_FORCE / EXEC_INPUT_TTY_FAIL: terminal obtained with acquire_terminal(),
 *     sized from the context, then moved to stdin
 *   - EXEC_INPUT_SOCKET: dup of socket_fd
 *   - EXEC_INPUT_NAMED_FD: dup of the named fd, switched to blocking mode first
 *   - EXEC_INPUT_DATA: data fd holding context->stdin_data
 *   - EXEC_INPUT_FILE: the configured file, opened O_RDWR when the same path also serves as
 *     stdout or stderr file output, O_RDONLY otherwise, with mode 0666 masked by the umask
 * NOTE(review): the error checks between these steps are not visible in this excerpt. */
506 static int setup_input(
507 const ExecContext
*context
,
508 const ExecParameters
*params
,
510 const int named_iofds
[static 3]) {
519 if (params
->stdin_fd
>= 0) {
520 if (dup2(params
->stdin_fd
, STDIN_FILENO
) < 0)
523 /* Try to make this the controlling tty, if it is a tty, and reset it */
524 if (isatty(STDIN_FILENO
)) {
525 unsigned rows
= context
->tty_rows
, cols
= context
->tty_cols
;
527 (void) exec_context_tty_size(context
, &rows
, &cols
);
528 (void) ioctl(STDIN_FILENO
, TIOCSCTTY
, context
->std_input
== EXEC_INPUT_TTY_FORCE
);
529 (void) reset_terminal_fd(STDIN_FILENO
, true);
530 (void) terminal_set_size_fd(STDIN_FILENO
, NULL
, rows
, cols
);
536 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
540 case EXEC_INPUT_NULL
:
541 return open_null_as(O_RDONLY
, STDIN_FILENO
);
544 case EXEC_INPUT_TTY_FORCE
:
545 case EXEC_INPUT_TTY_FAIL
: {
549 fd
= acquire_terminal(exec_context_tty_path(context
),
550 i
== EXEC_INPUT_TTY_FAIL
? ACQUIRE_TERMINAL_TRY
:
551 i
== EXEC_INPUT_TTY_FORCE
? ACQUIRE_TERMINAL_FORCE
:
552 ACQUIRE_TERMINAL_WAIT
,
557 r
= exec_context_tty_size(context
, &rows
, &cols
);
561 r
= terminal_set_size_fd(fd
, exec_context_tty_path(context
), rows
, cols
);
565 return move_fd(fd
, STDIN_FILENO
, false);
568 case EXEC_INPUT_SOCKET
:
569 assert(socket_fd
>= 0);
571 return RET_NERRNO(dup2(socket_fd
, STDIN_FILENO
));
573 case EXEC_INPUT_NAMED_FD
:
574 assert(named_iofds
[STDIN_FILENO
] >= 0);
576 (void) fd_nonblock(named_iofds
[STDIN_FILENO
], false);
577 return RET_NERRNO(dup2(named_iofds
[STDIN_FILENO
], STDIN_FILENO
));
579 case EXEC_INPUT_DATA
: {
582 fd
= acquire_data_fd(context
->stdin_data
, context
->stdin_data_size
, 0);
586 return move_fd(fd
, STDIN_FILENO
, false);
589 case EXEC_INPUT_FILE
: {
593 assert(context
->stdio_file
[STDIN_FILENO
]);
595 rw
= (context
->std_output
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDOUT_FILENO
])) ||
596 (context
->std_error
== EXEC_OUTPUT_FILE
&& streq_ptr(context
->stdio_file
[STDIN_FILENO
], context
->stdio_file
[STDERR_FILENO
]));
598 fd
= acquire_path(context
->stdio_file
[STDIN_FILENO
], rw
? O_RDWR
: O_RDONLY
, 0666 & ~context
->umask
);
602 return move_fd(fd
, STDIN_FILENO
, false);
606 assert_not_reached();
/* Tells whether stderr can simply be a duplicate of the stdout fd for the given stdout/stderr
 * output kinds. EXEC_OUTPUT_INHERIT for stderr is special-cased first. For named fds both streams
 * must use the same fd name; for the file kinds (plain, append, truncate) both must name the
 * same path.
 * NOTE(review): the remaining parameters and several return statements are not visible here. */
610 static bool can_inherit_stderr_from_stdout(
611 const ExecContext
*context
,
617 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
620 if (e
== EXEC_OUTPUT_INHERIT
)
625 if (e
== EXEC_OUTPUT_NAMED_FD
)
626 return streq_ptr(context
->stdio_fdname
[STDOUT_FILENO
], context
->stdio_fdname
[STDERR_FILENO
]);
628 if (IN_SET(e
, EXEC_OUTPUT_FILE
, EXEC_OUTPUT_FILE_APPEND
, EXEC_OUTPUT_FILE_TRUNCATE
))
629 return streq_ptr(context
->stdio_file
[STDOUT_FILENO
], context
->stdio_file
[STDERR_FILENO
]);
/* Sets up stdout or stderr, selected by fileno.
 *
 * An fd passed in params (stdout_fd/stderr_fd) takes precedence and is dup2()ed into place.
 * Otherwise the effective input (i), stdout (o) and, for stderr, stderr (e) kinds are computed:
 *   - for stderr, the fd is left alone when everything is inherited and no terminal is involved,
 *     and it is duplicated from stdout when can_inherit_stderr_from_stdout() allows it
 *   - for inherited stdout, the terminal is opened if terminal input was downgraded, stdin is
 *     duplicated if it is connected to something other than /dev/null or a data fd, and
 *     /dev/null is opened write-only as the last fallback
 * The remaining kinds are handled per case: /dev/null, the TTY (or a dup of stdin when stdin is
 * the terminal), the journal/kmsg stream via connect_logger_as(), the socket, a named fd, or a
 * file acquired with acquire_path() (a dup of stdin when the same file is already the input).
 *
 * If connecting to the journal fails, a warning is logged and /dev/null is used instead. If it
 * succeeds, the stream's device/inode are stored in *journal_stream_dev / *journal_stream_ino,
 * with stderr taking priority over stdout.
 * NOTE(review): parts of the control flow are not visible in this excerpt. */
634 static int setup_output(
636 const ExecContext
*context
,
637 const ExecParameters
*params
,
640 const int named_iofds
[static 3],
644 dev_t
*journal_stream_dev
,
645 ino_t
*journal_stream_ino
) {
655 assert(journal_stream_dev
);
656 assert(journal_stream_ino
);
658 if (fileno
== STDOUT_FILENO
&& params
->stdout_fd
>= 0) {
660 if (dup2(params
->stdout_fd
, STDOUT_FILENO
) < 0)
663 return STDOUT_FILENO
;
666 if (fileno
== STDERR_FILENO
&& params
->stderr_fd
>= 0) {
667 if (dup2(params
->stderr_fd
, STDERR_FILENO
) < 0)
670 return STDERR_FILENO
;
673 i
= fixup_input(context
, socket_fd
, params
->flags
& EXEC_APPLY_TTY_STDIN
);
674 o
= fixup_output(context
->std_output
, socket_fd
);
676 if (fileno
== STDERR_FILENO
) {
678 e
= fixup_output(context
->std_error
, socket_fd
);
680 /* This expects the input and output are already set up */
682 /* Don't change the stderr file descriptor if we inherit all
683 * the way and are not on a tty */
684 if (e
== EXEC_OUTPUT_INHERIT
&&
685 o
== EXEC_OUTPUT_INHERIT
&&
686 i
== EXEC_INPUT_NULL
&&
687 !is_terminal_input(context
->std_input
) &&
691 /* Duplicate from stdout if possible */
692 if (can_inherit_stderr_from_stdout(context
, o
, e
))
693 return RET_NERRNO(dup2(STDOUT_FILENO
, fileno
));
697 } else if (o
== EXEC_OUTPUT_INHERIT
) {
698 /* If input got downgraded, inherit the original value */
699 if (i
== EXEC_INPUT_NULL
&& is_terminal_input(context
->std_input
))
700 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
702 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
703 if (!IN_SET(i
, EXEC_INPUT_NULL
, EXEC_INPUT_DATA
))
704 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
706 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
710 /* We need to open /dev/null here anew, to get the right access mode. */
711 return open_null_as(O_WRONLY
, fileno
);
716 case EXEC_OUTPUT_NULL
:
717 return open_null_as(O_WRONLY
, fileno
);
719 case EXEC_OUTPUT_TTY
:
720 if (is_terminal_input(i
))
721 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
723 /* We don't reset the terminal if this is just about output */
724 return open_terminal_as(exec_context_tty_path(context
), O_WRONLY
, fileno
);
726 case EXEC_OUTPUT_KMSG
:
727 case EXEC_OUTPUT_KMSG_AND_CONSOLE
:
728 case EXEC_OUTPUT_JOURNAL
:
729 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE
:
730 r
= connect_logger_as(unit
, context
, params
, o
, ident
, fileno
, uid
, gid
);
732 log_unit_warning_errno(unit
, r
, "Failed to connect %s to the journal socket, ignoring: %m",
733 fileno
== STDOUT_FILENO
? "stdout" : "stderr");
734 r
= open_null_as(O_WRONLY
, fileno
);
738 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
739 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
740 * services to detect whether they are connected to the journal or not.
742 * If both stdout and stderr are connected to a stream then let's make sure to store the data
743 * about STDERR as that's usually the best way to do logging. */
745 if (fstat(fileno
, &st
) >= 0 &&
746 (*journal_stream_ino
== 0 || fileno
== STDERR_FILENO
)) {
747 *journal_stream_dev
= st
.st_dev
;
748 *journal_stream_ino
= st
.st_ino
;
753 case EXEC_OUTPUT_SOCKET
:
754 assert(socket_fd
>= 0);
756 return RET_NERRNO(dup2(socket_fd
, fileno
));
758 case EXEC_OUTPUT_NAMED_FD
:
759 assert(named_iofds
[fileno
] >= 0);
761 (void) fd_nonblock(named_iofds
[fileno
], false);
762 return RET_NERRNO(dup2(named_iofds
[fileno
], fileno
));
764 case EXEC_OUTPUT_FILE
:
765 case EXEC_OUTPUT_FILE_APPEND
:
766 case EXEC_OUTPUT_FILE_TRUNCATE
: {
770 assert(context
->stdio_file
[fileno
]);
772 rw
= context
->std_input
== EXEC_INPUT_FILE
&&
773 streq_ptr(context
->stdio_file
[fileno
], context
->stdio_file
[STDIN_FILENO
]);
776 return RET_NERRNO(dup2(STDIN_FILENO
, fileno
));
779 if (o
== EXEC_OUTPUT_FILE_APPEND
)
781 else if (o
== EXEC_OUTPUT_FILE_TRUNCATE
)
784 fd
= acquire_path(context
->stdio_file
[fileno
], flags
, 0666 & ~context
->umask
);
788 return move_fd(fd
, fileno
, 0);
792 assert_not_reached();
/* Hands the terminal behind fd over to uid: applies mode TTY_MODE and owner uid through
 * fchmod_and_chown(), passing GID_INVALID for the group. If fd is not a TTY (isatty() fails
 * with EINVAL or ENOTTY) nothing is changed and 0 is returned.
 * NOTE(review): the handling of fchmod_and_chown()'s result is not visible in this excerpt. */
796 static int chown_terminal(int fd
, uid_t uid
) {
801 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
802 if (isatty(fd
) < 1) {
803 if (IN_SET(errno
, EINVAL
, ENOTTY
))
804 return 0; /* not a tty */
809 /* This might fail. What matters are the results. */
810 r
= fchmod_and_chown(fd
, TTY_MODE
, uid
, GID_INVALID
);
/* Redirects stdin/stdout to the confirmation console. Duplicates of the current stdin and stdout
 * are saved (at fd numbers >= 3), the console vc is acquired (waiting up to
 * DEFAULT_CONFIRM_USEC), chowned to the current user, reset and sized, and then installed as
 * stdin and stdout with rearrange_stdio(); stderr is kept. On success ownership of the saved
 * fds passes to the caller through ret_saved_stdin/ret_saved_stdout.
 * NOTE(review): the error returns between the steps are not visible in this excerpt. */
817 static int setup_confirm_stdio(
818 const ExecContext
*context
,
820 int *ret_saved_stdin
,
821 int *ret_saved_stdout
) {
823 _cleanup_close_
int fd
= -EBADF
, saved_stdin
= -EBADF
, saved_stdout
= -EBADF
;
827 assert(ret_saved_stdin
);
828 assert(ret_saved_stdout
);
830 saved_stdin
= fcntl(STDIN_FILENO
, F_DUPFD
, 3);
834 saved_stdout
= fcntl(STDOUT_FILENO
, F_DUPFD
, 3);
835 if (saved_stdout
< 0)
838 fd
= acquire_terminal(vc
, ACQUIRE_TERMINAL_WAIT
, DEFAULT_CONFIRM_USEC
);
842 r
= chown_terminal(fd
, getuid());
846 r
= reset_terminal_fd(fd
, true);
850 r
= exec_context_tty_size(context
, &rows
, &cols
);
854 r
= terminal_set_size_fd(fd
, vc
, rows
, cols
);
858 r
= rearrange_stdio(fd
, fd
, STDERR_FILENO
); /* Invalidates 'fd' also on failure */
863 *ret_saved_stdin
= TAKE_FD(saved_stdin
);
864 *ret_saved_stdout
= TAKE_FD(saved_stdout
);
/* Writes a message to fd explaining that confirmation for unit u could not be obtained and that a
 * positive response is assumed: a dedicated text for -ETIMEDOUT, otherwise a generic one that
 * includes the error description via %m. */
868 static void write_confirm_error_fd(int err
, int fd
, const Unit
*u
) {
871 if (err
== -ETIMEDOUT
)
872 dprintf(fd
, "Confirmation question timed out for %s, assuming positive response.\n", u
->id
);
875 dprintf(fd
, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u
->id
);
/* Reports a confirmation error on the console: opens vc write-only (without making it the
 * controlling terminal) and prints the message with write_confirm_error_fd(). The fd is closed
 * automatically on return. */
879 static void write_confirm_error(int err
, const char *vc
, const Unit
*u
) {
880 _cleanup_close_
int fd
= -EBADF
;
884 fd
= open_terminal(vc
, O_WRONLY
|O_NOCTTY
|O_CLOEXEC
);
888 write_confirm_error_fd(err
, fd
, u
);
/* Undoes setup_confirm_stdio(): each saved fd that is valid is dup2()ed back onto stdin or
 * stdout, then both saved fds are closed and the caller's variables are overwritten with
 * safe_close()'s return value.
 * NOTE(review): how dup2() failures are reported is not visible in this excerpt. */
891 static int restore_confirm_stdio(int *saved_stdin
, int *saved_stdout
) {
895 assert(saved_stdout
);
899 if (*saved_stdin
>= 0)
900 if (dup2(*saved_stdin
, STDIN_FILENO
) < 0)
903 if (*saved_stdout
>= 0)
904 if (dup2(*saved_stdout
, STDOUT_FILENO
) < 0)
907 *saved_stdin
= safe_close(*saved_stdin
);
908 *saved_stdout
= safe_close(*saved_stdout
);
/* Outcome codes of the confirmation prompt, assigned by ask_for_confirmation() when the user
 * chooses to fail or to skip the command. NOTE(review): the enclosing declaration and any
 * further members (e.g. CONFIRM_EXECUTE, which is used below) are not visible here. */
914 CONFIRM_PRETEND_FAILURE
= -1,
915 CONFIRM_PRETEND_SUCCESS
= 0,
/* Interactively asks on console vc whether the command line cmdline of unit u shall be executed.
 *
 * stdin/stdout are switched to the console for the duration of the dialog and restored at the
 * end. If the console cannot be set up, the error is reported and CONFIRM_EXECUTE is returned.
 * The prompt shows an ellipsized command line and accepts one of the characters "yfshiDjcn".
 * Visible reactions: resuming normal execution and disabling further confirmation, dumping the
 * unit, failing (CONFIRM_PRETEND_FAILURE), printing help, printing a short unit summary, listing
 * jobs in progress, rejecting the removed 'n' answer with a hint, and skipping
 * (CONFIRM_PRETEND_SUCCESS). Informational answers ask again.
 * NOTE(review): the switch labels and some branches are not visible in this excerpt. */
919 static int ask_for_confirmation(const ExecContext
*context
, const char *vc
, Unit
*u
, const char *cmdline
) {
920 int saved_stdout
= -1, saved_stdin
= -1, r
;
921 _cleanup_free_
char *e
= NULL
;
924 /* For any internal errors, assume a positive response. */
925 r
= setup_confirm_stdio(context
, vc
, &saved_stdin
, &saved_stdout
);
927 write_confirm_error(r
, vc
, u
);
928 return CONFIRM_EXECUTE
;
931 /* confirm_spawn might have been disabled while we were sleeping. */
932 if (manager_is_confirm_spawn_disabled(u
->manager
)) {
937 e
= ellipsize(cmdline
, 60, 100);
945 r
= ask_char(&c
, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e
);
947 write_confirm_error_fd(r
, STDOUT_FILENO
, u
);
954 printf("Resuming normal execution.\n");
955 manager_disable_confirm_spawn();
959 unit_dump(u
, stdout
, " ");
960 continue; /* ask again */
962 printf("Failing execution.\n");
963 r
= CONFIRM_PRETEND_FAILURE
;
966 printf(" c - continue, proceed without asking anymore\n"
967 " D - dump, show the state of the unit\n"
968 " f - fail, don't execute the command and pretend it failed\n"
970 " i - info, show a short summary of the unit\n"
971 " j - jobs, show jobs that are in progress\n"
972 " s - skip, don't execute the command and pretend it succeeded\n"
973 " y - yes, execute the command\n");
974 continue; /* ask again */
976 printf(" Description: %s\n"
979 u
->id
, u
->description
, cmdline
);
980 continue; /* ask again */
982 manager_dump_jobs(u
->manager
, stdout
, /* patterns= */ NULL
, " ");
983 continue; /* ask again */
985 /* 'n' was removed in favor of 'f'. */
986 printf("Didn't understand 'n', did you mean 'f'?\n");
987 continue; /* ask again */
989 printf("Skipping execution.\n");
990 r
= CONFIRM_PRETEND_SUCCESS
;
996 assert_not_reached();
1002 restore_confirm_stdio(&saved_stdin
, &saved_stdout
);
/* Resolves the context's fixed user with get_user_creds() (flag USER_CREDS_CLEAN) and stores the
 * UID, GID, home directory and shell through the corresponding out-parameters.
 * NOTE(review): where the looked-up name comes from and how *user is set are not visible here. */
1006 static int get_fixed_user(const ExecContext
*c
, const char **user
,
1007 uid_t
*uid
, gid_t
*gid
,
1008 const char **home
, const char **shell
) {
1017 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1018 * (i.e. are "/" or "/bin/nologin"). */
1021 r
= get_user_creds(&name
, uid
, gid
, home
, shell
, USER_CREDS_CLEAN
);
/* Resolves the context's fixed group with get_group_creds() and stores its GID in *gid.
 * NOTE(review): where the looked-up name comes from and how *group is set are not visible here. */
1029 static int get_fixed_group(const ExecContext
*c
, const char **group
, gid_t
*gid
) {
1039 r
= get_group_creds(&name
, gid
, 0);
/* Collects the supplementary groups for the process.
 *
 * When a user and a valid, non-zero gid are given, initgroups() first initializes the group list
 * from the group database. If SupplementaryGroups= is empty nothing further is gathered.
 * Otherwise a buffer of NGROUPS_MAX entries is allocated, filled from getgrouplist() and
 * extended with the GID of every configured group name (bounded by NGROUPS_MAX). A copy of the
 * collected GIDs is handed out through *supplementary_gids.
 * NOTE(review): several branches, and the assignment of *ngids, are not visible in this excerpt. */
1047 static int get_supplementary_groups(const ExecContext
*c
, const char *user
,
1048 const char *group
, gid_t gid
,
1049 gid_t
**supplementary_gids
, int *ngids
) {
1052 bool keep_groups
= false;
1053 gid_t
*groups
= NULL
;
1054 _cleanup_free_ gid_t
*l_gids
= NULL
;
1059 * If user is given, then lookup GID and supplementary groups list.
1060 * We avoid NSS lookups for gid=0. Also we have to initialize groups
1061 * here and as early as possible so we keep the list of supplementary
1062 * groups of the caller.
1064 if (user
&& gid_is_valid(gid
) && gid
!= 0) {
1065 /* First step, initialize groups from /etc/group */
1066 if (initgroups(user
, gid
) < 0)
1072 if (strv_isempty(c
->supplementary_groups
))
1076 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1077 * be positive, otherwise fail.
1080 ngroups_max
= (int) sysconf(_SC_NGROUPS_MAX
);
1081 if (ngroups_max
<= 0)
1082 return errno_or_else(EOPNOTSUPP
);
1084 l_gids
= new(gid_t
, ngroups_max
);
1090 * Lookup the list of groups that the user belongs to, we
1091 * avoid NSS lookups here too for gid=0.
1094 if (getgrouplist(user
, gid
, l_gids
, &k
) < 0)
1099 STRV_FOREACH(i
, c
->supplementary_groups
) {
1102 if (k
>= ngroups_max
)
1106 r
= get_group_creds(&g
, l_gids
+k
, 0);
1114 * Sets ngids to zero to drop all supplementary groups, happens
1115 * when we are under root and SupplementaryGroups= is empty.
1122 /* Otherwise get the final list of supplementary groups */
1123 groups
= memdup(l_gids
, sizeof(gid_t
) * k
);
1127 *supplementary_gids
= groups
;
/* Applies the group credentials: installs the supplementary group list with maybe_setgroups()
 * and then, if gid is valid, sets the real, effective and saved GID to gid.
 * NOTE(review): the error returns are not visible in this excerpt. */
1135 static int enforce_groups(gid_t gid
, const gid_t
*supplementary_gids
, int ngids
) {
1138 /* Handle SupplementaryGroups= if it is not empty */
1140 r
= maybe_setgroups(ngids
, supplementary_gids
);
1145 if (gid_is_valid(gid
)) {
1146 /* Then set our gids */
1147 if (setresgid(gid
, gid
, gid
) < 0)
/* Updates the process securebits: reads the current value with PR_GET_SECUREBITS, clears the bits
 * in mask, sets the bits in bits, and writes the result with PR_SET_SECUREBITS. The current and
 * new values are compared before the write; NOTE(review): the branch taken when they are equal,
 * and the error returns, are not visible in this excerpt. */
1154 static int set_securebits(unsigned bits
, unsigned mask
) {
1158 current
= prctl(PR_GET_SECUREBITS
);
1162 /* Clear all securebits defined in mask and set bits */
1163 applied
= ((unsigned) current
& ~mask
) | bits
;
1164 if ((unsigned) current
== applied
)
1167 if (prctl(PR_SET_SECUREBITS
, applied
) < 0)
/* Switches the real, effective and saved UID to uid. An invalid uid is special-cased up front.
 * When ambient capabilities or secure bits are configured and the target is not root,
 * SECURE_KEEP_CAPS is set first so that capabilities survive the setresuid().
 * NOTE(review): the uid parameter declaration and the return statements are not visible here. */
1173 static int enforce_user(
1174 const ExecContext
*context
,
1176 uint64_t capability_ambient_set
) {
1180 if (!uid_is_valid(uid
))
1183 /* Sets (but doesn't look up) the UID and makes sure we keep the capabilities while doing so. For
1184 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1187 if ((capability_ambient_set
!= 0 || context
->secure_bits
!= 0) && uid
!= 0) {
1189 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1190 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1191 r
= set_securebits(1U << SECURE_KEEP_CAPS
, 0);
1196 /* Second step: actually set the uids */
1197 if (setresuid(uid
, uid
, uid
) < 0)
1200 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1201 * the caps might have gotten corrupted due to the setresuid() so we need clean them up later. This is done
1202 * outside of this call. */
/* PAM conversation callback that refuses every conversation by returning PAM_CONV_ERR; it is
 * referenced from the pam_conv structure in setup_pam(). */
1208 static int null_conv(
1210 const struct pam_message
**msg
,
1211 struct pam_response
**resp
,
1212 void *appdata_ptr
) {
1214 /* We don't support conversations */
1216 return PAM_CONV_ERR
;
/* Opens a PAM session for the service and arranges for it to be closed again later.
 *
 * In the calling process: starts the PAM transaction, sets PAM_TTY (derived from stdin when no
 * TTY was passed explicitly), exports *env into PAM, runs account management, establishes
 * credentials (failure is only logged at debug level) and opens the session. SIGTERM is blocked
 * and an "(sd-pam)" helper is forked.
 *
 * In the helper: closes the passed fds, drops privileges to uid/gid (failures are only warned
 * about), arms PR_SET_PDEATHSIG with SIGTERM, signals readiness over the barrier, waits for
 * SIGTERM while the parent is alive, then deletes the credentials, closes the session if the
 * parent is gone, and ends the PAM handle with PAM_DATA_SILENT.
 *
 * Back in the caller: restores the signal mask, waits for the helper on the barrier and replaces
 * *env with the PAM environment list. On failure the error is logged (PAM errors are reported as
 * -EPERM), and the session is closed and the handle ended.
 * NOTE(review): large parts of the control flow are not visible in this excerpt. */
1221 static int setup_pam(
1227 char ***env
, /* updated on success */
1228 const int fds
[], size_t n_fds
) {
1232 static const struct pam_conv conv
= {
1237 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
1238 _cleanup_strv_free_
char **e
= NULL
;
1239 pam_handle_t
*handle
= NULL
;
1241 int pam_code
= PAM_SUCCESS
, r
;
1242 bool close_session
= false;
1243 pid_t pam_pid
= 0, parent_pid
;
1250 /* We set up PAM in the parent process, then fork. The child
1251 * will then stay around until killed via PR_SET_PDEATHSIG or
1252 * systemd via the cgroup logic. It will then remove the PAM
1253 * session again. The parent process will exec() the actual
1254 * daemon. We do things this way to ensure that the main PID
1255 * of the daemon is the one we initially fork()ed. */
1257 r
= barrier_create(&barrier
);
1261 if (log_get_max_level() < LOG_DEBUG
)
1262 flags
|= PAM_SILENT
;
1264 pam_code
= pam_start(name
, user
, &conv
, &handle
);
1265 if (pam_code
!= PAM_SUCCESS
) {
1271 _cleanup_free_
char *q
= NULL
;
1273 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1274 * out if that's the case, and read the TTY off it. */
1276 if (getttyname_malloc(STDIN_FILENO
, &q
) >= 0)
1277 tty
= strjoina("/dev/", q
);
1281 pam_code
= pam_set_item(handle
, PAM_TTY
, tty
);
1282 if (pam_code
!= PAM_SUCCESS
)
1286 STRV_FOREACH(nv
, *env
) {
1287 pam_code
= pam_putenv(handle
, *nv
);
1288 if (pam_code
!= PAM_SUCCESS
)
1292 pam_code
= pam_acct_mgmt(handle
, flags
);
1293 if (pam_code
!= PAM_SUCCESS
)
1296 pam_code
= pam_setcred(handle
, PAM_ESTABLISH_CRED
| flags
);
1297 if (pam_code
!= PAM_SUCCESS
)
1298 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle
, pam_code
));
1300 pam_code
= pam_open_session(handle
, flags
);
1301 if (pam_code
!= PAM_SUCCESS
)
1304 close_session
= true;
1306 e
= pam_getenvlist(handle
);
1308 pam_code
= PAM_BUF_ERR
;
1312 /* Block SIGTERM, so that we know that it won't get lost in the child */
1314 assert_se(sigprocmask_many(SIG_BLOCK
, &old_ss
, SIGTERM
, -1) >= 0);
1316 parent_pid
= getpid_cached();
1318 r
= safe_fork("(sd-pam)", 0, &pam_pid
);
1322 int sig
, ret
= EXIT_PAM
;
1324 /* The child's job is to reset the PAM session on termination */
1325 barrier_set_role(&barrier
, BARRIER_CHILD
);
1327 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1328 * those fds are open here that have been opened by PAM. */
1329 (void) close_many(fds
, n_fds
);
1331 /* Drop privileges - we don't need any to pam_close_session and this will make
1332 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1333 * threads to fail to exit normally */
1335 r
= maybe_setgroups(0, NULL
);
1337 log_warning_errno(r
, "Failed to setgroups() in sd-pam: %m");
1338 if (setresgid(gid
, gid
, gid
) < 0)
1339 log_warning_errno(errno
, "Failed to setresgid() in sd-pam: %m");
1340 if (setresuid(uid
, uid
, uid
) < 0)
1341 log_warning_errno(errno
, "Failed to setresuid() in sd-pam: %m");
1343 (void) ignore_signals(SIGPIPE
);
1345 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1346 * otherwise the kernel will not allow unprivileged parents to kill their privileged children
1347 * this way. We rely on the control groups kill logic to do the rest for us. */
1348 if (prctl(PR_SET_PDEATHSIG
, SIGTERM
) < 0)
1351 /* Tell the parent that our setup is done. This is especially important regarding dropping
1352 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1354 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1355 (void) barrier_place(&barrier
);
1357 /* Check if our parent process might already have died? */
1358 if (getppid() == parent_pid
) {
1361 assert_se(sigemptyset(&ss
) >= 0);
1362 assert_se(sigaddset(&ss
, SIGTERM
) >= 0);
1365 if (sigwait(&ss
, &sig
) < 0) {
1372 assert(sig
== SIGTERM
);
1377 pam_code
= pam_setcred(handle
, PAM_DELETE_CRED
| flags
);
1378 if (pam_code
!= PAM_SUCCESS
)
1381 /* If our parent died we'll end the session */
1382 if (getppid() != parent_pid
) {
1383 pam_code
= pam_close_session(handle
, flags
);
1384 if (pam_code
!= PAM_SUCCESS
)
1391 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1392 * know about this. See pam_end(3) */
1393 (void) pam_end(handle
, pam_code
| flags
| PAM_DATA_SILENT
);
1397 barrier_set_role(&barrier
, BARRIER_PARENT
);
1399 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1403 /* Unblock SIGTERM again in the parent */
1404 assert_se(sigprocmask(SIG_SETMASK
, &old_ss
, NULL
) >= 0);
1406 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1407 * this fd around. */
1410 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1411 * recover. However, warn loudly if it happens. */
1412 if (!barrier_place_and_sync(&barrier
))
1413 log_error("PAM initialization failed");
1415 return strv_free_and_replace(*env
, e
);
1418 if (pam_code
!= PAM_SUCCESS
) {
1419 log_error("PAM failed: %s", pam_strerror(handle
, pam_code
));
1420 r
= -EPERM
; /* PAM errors do not map to errno */
1422 log_error_errno(r
, "PAM failed: %m");
1426 pam_code
= pam_close_session(handle
, flags
);
1428 (void) pam_end(handle
, pam_code
| flags
);
/* Renames the current process to the file name of path wrapped in parentheses. If the file name
 * cannot be extracted, "(...)" is used instead. The name is assembled in an 11-byte buffer as
 * '(' + name + ')' + NUL. NOTE(review): the code that derives p and bounds l so the result fits
 * the buffer is not visible in this excerpt — confirm against the full source. */
1438 static void rename_process_from_path(const char *path
) {
1439 _cleanup_free_
char *buf
= NULL
;
1444 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1447 if (path_extract_filename(path
, &buf
) < 0) {
1448 rename_process("(...)");
1452 size_t l
= strlen(buf
);
1454 /* The end of the process name is usually more interesting, since the first bit might just be
1461 char process_name
[11];
1462 process_name
[0] = '(';
1463 memcpy(process_name
+1, p
, l
);
1464 process_name
[1+l
] = ')';
1465 process_name
[1+l
+1] = 0;
1467 rename_process(process_name
);
1470 static bool context_has_address_families(const ExecContext
*c
) {
1473 return c
->address_families_allow_list
||
1474 !set_isempty(c
->address_families
);
1477 static bool context_has_syscall_filters(const ExecContext
*c
) {
1480 return c
->syscall_allow_list
||
1481 !hashmap_isempty(c
->syscall_filter
);
1484 static bool context_has_syscall_logs(const ExecContext
*c
) {
1487 return c
->syscall_log_allow_list
||
1488 !hashmap_isempty(c
->syscall_log
);
1491 static bool context_has_no_new_privileges(const ExecContext
*c
) {
1494 if (c
->no_new_privileges
)
1497 if (have_effective_cap(CAP_SYS_ADMIN
) > 0) /* if we are privileged, we don't need NNP */
1500 /* We need NNP if we have any form of seccomp and are unprivileged */
1501 return c
->lock_personality
||
1502 c
->memory_deny_write_execute
||
1503 c
->private_devices
||
1505 c
->protect_hostname
||
1506 c
->protect_kernel_tunables
||
1507 c
->protect_kernel_modules
||
1508 c
->protect_kernel_logs
||
1509 context_has_address_families(c
) ||
1510 exec_context_restrict_namespaces_set(c
) ||
1511 c
->restrict_realtime
||
1512 c
->restrict_suid_sgid
||
1513 !set_isempty(c
->syscall_archs
) ||
1514 context_has_syscall_filters(c
) ||
1515 context_has_syscall_logs(c
);
1518 bool exec_context_has_credentials(const ExecContext
*context
) {
1522 return !hashmap_isempty(context
->set_credentials
) ||
1523 !hashmap_isempty(context
->load_credentials
) ||
1524 !set_isempty(context
->import_credentials
);
1529 static bool skip_seccomp_unavailable(const Unit
* u
, const char* msg
) {
1531 if (is_seccomp_available())
1534 log_unit_debug(u
, "SECCOMP features not detected in the kernel, skipping %s", msg
);
1538 static int apply_syscall_filter(const Unit
* u
, const ExecContext
*c
, bool needs_ambient_hack
) {
1539 uint32_t negative_action
, default_action
, action
;
1545 if (!context_has_syscall_filters(c
))
1548 if (skip_seccomp_unavailable(u
, "SystemCallFilter="))
1551 negative_action
= c
->syscall_errno
== SECCOMP_ERROR_NUMBER_KILL
? scmp_act_kill_process() : SCMP_ACT_ERRNO(c
->syscall_errno
);
1553 if (c
->syscall_allow_list
) {
1554 default_action
= negative_action
;
1555 action
= SCMP_ACT_ALLOW
;
1557 default_action
= SCMP_ACT_ALLOW
;
1558 action
= negative_action
;
1561 if (needs_ambient_hack
) {
1562 r
= seccomp_filter_set_add(c
->syscall_filter
, c
->syscall_allow_list
, syscall_filter_sets
+ SYSCALL_FILTER_SET_SETUID
);
1567 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_filter
, action
, false);
1570 static int apply_syscall_log(const Unit
* u
, const ExecContext
*c
) {
1572 uint32_t default_action
, action
;
1578 if (!context_has_syscall_logs(c
))
1582 if (skip_seccomp_unavailable(u
, "SystemCallLog="))
1585 if (c
->syscall_log_allow_list
) {
1586 /* Log nothing but the ones listed */
1587 default_action
= SCMP_ACT_ALLOW
;
1588 action
= SCMP_ACT_LOG
;
1590 /* Log everything but the ones listed */
1591 default_action
= SCMP_ACT_LOG
;
1592 action
= SCMP_ACT_ALLOW
;
1595 return seccomp_load_syscall_filter_set_raw(default_action
, c
->syscall_log
, action
, false);
1597 /* old libseccomp */
1598 log_unit_debug(u
, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1603 static int apply_syscall_archs(const Unit
*u
, const ExecContext
*c
) {
1607 if (set_isempty(c
->syscall_archs
))
1610 if (skip_seccomp_unavailable(u
, "SystemCallArchitectures="))
1613 return seccomp_restrict_archs(c
->syscall_archs
);
1616 static int apply_address_families(const Unit
* u
, const ExecContext
*c
) {
1620 if (!context_has_address_families(c
))
1623 if (skip_seccomp_unavailable(u
, "RestrictAddressFamilies="))
1626 return seccomp_restrict_address_families(c
->address_families
, c
->address_families_allow_list
);
1629 static int apply_memory_deny_write_execute(const Unit
* u
, const ExecContext
*c
) {
1635 if (!c
->memory_deny_write_execute
)
1638 /* use prctl() if kernel supports it (6.3) */
1639 r
= prctl(PR_SET_MDWE
, PR_MDWE_REFUSE_EXEC_GAIN
, 0, 0, 0);
1641 log_unit_debug(u
, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1644 if (r
< 0 && errno
!= EINVAL
)
1645 return log_unit_debug_errno(u
, errno
, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1646 /* else use seccomp */
1647 log_unit_debug(u
, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1649 if (skip_seccomp_unavailable(u
, "MemoryDenyWriteExecute="))
1652 return seccomp_memory_deny_write_execute();
1655 static int apply_restrict_realtime(const Unit
* u
, const ExecContext
*c
) {
1659 if (!c
->restrict_realtime
)
1662 if (skip_seccomp_unavailable(u
, "RestrictRealtime="))
1665 return seccomp_restrict_realtime();
1668 static int apply_restrict_suid_sgid(const Unit
* u
, const ExecContext
*c
) {
1672 if (!c
->restrict_suid_sgid
)
1675 if (skip_seccomp_unavailable(u
, "RestrictSUIDSGID="))
1678 return seccomp_restrict_suid_sgid();
1681 static int apply_protect_sysctl(const Unit
*u
, const ExecContext
*c
) {
1685 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1686 * let's protect even those systems where this is left on in the kernel. */
1688 if (!c
->protect_kernel_tunables
)
1691 if (skip_seccomp_unavailable(u
, "ProtectKernelTunables="))
1694 return seccomp_protect_sysctl();
1697 static int apply_protect_kernel_modules(const Unit
*u
, const ExecContext
*c
) {
1701 /* Turn off module syscalls on ProtectKernelModules=yes */
1703 if (!c
->protect_kernel_modules
)
1706 if (skip_seccomp_unavailable(u
, "ProtectKernelModules="))
1709 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_MODULE
, SCMP_ACT_ERRNO(EPERM
), false);
1712 static int apply_protect_kernel_logs(const Unit
*u
, const ExecContext
*c
) {
1716 if (!c
->protect_kernel_logs
)
1719 if (skip_seccomp_unavailable(u
, "ProtectKernelLogs="))
1722 return seccomp_protect_syslog();
1725 static int apply_protect_clock(const Unit
*u
, const ExecContext
*c
) {
1729 if (!c
->protect_clock
)
1732 if (skip_seccomp_unavailable(u
, "ProtectClock="))
1735 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_CLOCK
, SCMP_ACT_ERRNO(EPERM
), false);
1738 static int apply_private_devices(const Unit
*u
, const ExecContext
*c
) {
1742 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1744 if (!c
->private_devices
)
1747 if (skip_seccomp_unavailable(u
, "PrivateDevices="))
1750 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW
, syscall_filter_sets
+ SYSCALL_FILTER_SET_RAW_IO
, SCMP_ACT_ERRNO(EPERM
), false);
1753 static int apply_restrict_namespaces(const Unit
*u
, const ExecContext
*c
) {
1757 if (!exec_context_restrict_namespaces_set(c
))
1760 if (skip_seccomp_unavailable(u
, "RestrictNamespaces="))
1763 return seccomp_restrict_namespaces(c
->restrict_namespaces
);
1766 static int apply_lock_personality(const Unit
* u
, const ExecContext
*c
) {
1767 unsigned long personality
;
1773 if (!c
->lock_personality
)
1776 if (skip_seccomp_unavailable(u
, "LockPersonality="))
1779 personality
= c
->personality
;
1781 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1782 if (personality
== PERSONALITY_INVALID
) {
1784 r
= opinionated_personality(&personality
);
1789 return seccomp_lock_personality(personality
);
1795 static int apply_restrict_filesystems(Unit
*u
, const ExecContext
*c
) {
1799 if (!exec_context_restrict_filesystems_set(c
))
1802 if (!u
->manager
->restrict_fs
) {
1803 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1804 log_unit_debug(u
, "LSM BPF not supported, skipping RestrictFileSystems=");
1808 return lsm_bpf_unit_restrict_filesystems(u
, c
->restrict_filesystems
, c
->restrict_filesystems_allow_list
);
1812 static int apply_protect_hostname(const Unit
*u
, const ExecContext
*c
, int *ret_exit_status
) {
1816 if (!c
->protect_hostname
)
1819 if (ns_type_supported(NAMESPACE_UTS
)) {
1820 if (unshare(CLONE_NEWUTS
) < 0) {
1821 if (!ERRNO_IS_NOT_SUPPORTED(errno
) && !ERRNO_IS_PRIVILEGE(errno
)) {
1822 *ret_exit_status
= EXIT_NAMESPACE
;
1823 return log_unit_error_errno(u
, errno
, "Failed to set up UTS namespacing: %m");
1826 log_unit_warning(u
, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1829 log_unit_warning(u
, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1834 if (skip_seccomp_unavailable(u
, "ProtectHostname="))
1837 r
= seccomp_protect_hostname();
1839 *ret_exit_status
= EXIT_SECCOMP
;
1840 return log_unit_error_errno(u
, r
, "Failed to apply hostname restrictions: %m");
1847 static void do_idle_pipe_dance(int idle_pipe
[static 4]) {
1850 idle_pipe
[1] = safe_close(idle_pipe
[1]);
1851 idle_pipe
[2] = safe_close(idle_pipe
[2]);
1853 if (idle_pipe
[0] >= 0) {
1856 r
= fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT_USEC
);
1858 if (idle_pipe
[3] >= 0 && r
== 0 /* timeout */) {
1861 /* Signal systemd that we are bored and want to continue. */
1862 n
= write(idle_pipe
[3], "x", 1);
1864 /* Wait for systemd to react to the signal above. */
1865 (void) fd_wait_for_event(idle_pipe
[0], POLLHUP
, IDLE_TIMEOUT2_USEC
);
1868 idle_pipe
[0] = safe_close(idle_pipe
[0]);
1872 idle_pipe
[3] = safe_close(idle_pipe
[3]);
1875 static const char *exec_directory_env_name_to_string(ExecDirectoryType t
);
1877 static int build_environment(
1879 const ExecContext
*c
,
1880 const ExecParameters
*p
,
1881 const CGroupContext
*cgroup_context
,
1885 const char *username
,
1887 dev_t journal_stream_dev
,
1888 ino_t journal_stream_ino
,
1889 const char *memory_pressure_path
,
1892 _cleanup_strv_free_
char **our_env
= NULL
;
1902 #define N_ENV_VARS 19
1903 our_env
= new0(char*, N_ENV_VARS
+ _EXEC_DIRECTORY_TYPE_MAX
);
1908 _cleanup_free_
char *joined
= NULL
;
1910 if (asprintf(&x
, "LISTEN_PID="PID_FMT
, getpid_cached()) < 0)
1912 our_env
[n_env
++] = x
;
1914 if (asprintf(&x
, "LISTEN_FDS=%zu", n_fds
) < 0)
1916 our_env
[n_env
++] = x
;
1918 joined
= strv_join(fdnames
, ":");
1922 x
= strjoin("LISTEN_FDNAMES=", joined
);
1925 our_env
[n_env
++] = x
;
1928 if ((p
->flags
& EXEC_SET_WATCHDOG
) && p
->watchdog_usec
> 0) {
1929 if (asprintf(&x
, "WATCHDOG_PID="PID_FMT
, getpid_cached()) < 0)
1931 our_env
[n_env
++] = x
;
1933 if (asprintf(&x
, "WATCHDOG_USEC="USEC_FMT
, p
->watchdog_usec
) < 0)
1935 our_env
[n_env
++] = x
;
1938 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1939 * Varlink calls back to us to look up dynamic users in PID 1. Break the deadlock between D-Bus and
1940 * PID 1 by disabling use of PID 1's NSS interface for looking up dynamic users. */
1941 if (p
->flags
& EXEC_NSS_DYNAMIC_BYPASS
) {
1942 x
= strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1945 our_env
[n_env
++] = x
;
1949 x
= strjoin("HOME=", home
);
1953 path_simplify(x
+ 5);
1954 our_env
[n_env
++] = x
;
1958 x
= strjoin("LOGNAME=", username
);
1961 our_env
[n_env
++] = x
;
1963 x
= strjoin("USER=", username
);
1966 our_env
[n_env
++] = x
;
1970 x
= strjoin("SHELL=", shell
);
1974 path_simplify(x
+ 6);
1975 our_env
[n_env
++] = x
;
1978 if (!sd_id128_is_null(u
->invocation_id
)) {
1979 if (asprintf(&x
, "INVOCATION_ID=" SD_ID128_FORMAT_STR
, SD_ID128_FORMAT_VAL(u
->invocation_id
)) < 0)
1982 our_env
[n_env
++] = x
;
1985 if (exec_context_needs_term(c
)) {
1986 _cleanup_free_
char *cmdline
= NULL
;
1987 const char *tty_path
, *term
= NULL
;
1989 tty_path
= exec_context_tty_path(c
);
1991 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1992 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1993 * container manager passes to PID 1 ends up all the way in the console login shown. */
1995 if (path_equal_ptr(tty_path
, "/dev/console") && getppid() == 1)
1996 term
= getenv("TERM");
1997 else if (tty_path
&& in_charset(skip_dev_prefix(tty_path
), ALPHANUMERICAL
)) {
1998 _cleanup_free_
char *key
= NULL
;
2000 key
= strjoin("systemd.tty.term.", skip_dev_prefix(tty_path
));
2004 r
= proc_cmdline_get_key(key
, 0, &cmdline
);
2006 log_debug_errno(r
, "Failed to read %s from kernel cmdline, ignoring: %m", key
);
2012 term
= default_term_for_tty(tty_path
);
2014 x
= strjoin("TERM=", term
);
2017 our_env
[n_env
++] = x
;
2020 if (journal_stream_dev
!= 0 && journal_stream_ino
!= 0) {
2021 if (asprintf(&x
, "JOURNAL_STREAM=" DEV_FMT
":" INO_FMT
, journal_stream_dev
, journal_stream_ino
) < 0)
2024 our_env
[n_env
++] = x
;
2027 if (c
->log_namespace
) {
2028 x
= strjoin("LOG_NAMESPACE=", c
->log_namespace
);
2032 our_env
[n_env
++] = x
;
2035 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2036 _cleanup_free_
char *joined
= NULL
;
2042 if (c
->directories
[t
].n_items
== 0)
2045 n
= exec_directory_env_name_to_string(t
);
2049 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
2050 _cleanup_free_
char *prefixed
= NULL
;
2052 prefixed
= path_join(p
->prefix
[t
], c
->directories
[t
].items
[i
].path
);
2056 if (!strextend_with_separator(&joined
, ":", prefixed
))
2060 x
= strjoin(n
, "=", joined
);
2064 our_env
[n_env
++] = x
;
2067 if (exec_context_has_credentials(c
) && p
->prefix
[EXEC_DIRECTORY_RUNTIME
]) {
2068 x
= strjoin("CREDENTIALS_DIRECTORY=", p
->prefix
[EXEC_DIRECTORY_RUNTIME
], "/credentials/", u
->id
);
2072 our_env
[n_env
++] = x
;
2075 if (asprintf(&x
, "SYSTEMD_EXEC_PID=" PID_FMT
, getpid_cached()) < 0)
2078 our_env
[n_env
++] = x
;
2080 if (memory_pressure_path
) {
2081 x
= strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path
);
2085 our_env
[n_env
++] = x
;
2087 if (cgroup_context
&& !path_equal(memory_pressure_path
, "/dev/null")) {
2088 _cleanup_free_
char *b
= NULL
, *e
= NULL
;
2090 if (asprintf(&b
, "%s " USEC_FMT
" " USEC_FMT
,
2091 MEMORY_PRESSURE_DEFAULT_TYPE
,
2092 cgroup_context
->memory_pressure_threshold_usec
== USEC_INFINITY
? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC
:
2093 CLAMP(cgroup_context
->memory_pressure_threshold_usec
, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC
),
2094 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC
) < 0)
2097 if (base64mem(b
, strlen(b
) + 1, &e
) < 0)
2100 x
= strjoin("MEMORY_PRESSURE_WRITE=", e
);
2104 our_env
[n_env
++] = x
;
2108 assert(n_env
< N_ENV_VARS
+ _EXEC_DIRECTORY_TYPE_MAX
);
2111 *ret
= TAKE_PTR(our_env
);
2116 static int build_pass_environment(const ExecContext
*c
, char ***ret
) {
2117 _cleanup_strv_free_
char **pass_env
= NULL
;
2120 STRV_FOREACH(i
, c
->pass_environment
) {
2121 _cleanup_free_
char *x
= NULL
;
2127 x
= strjoin(*i
, "=", v
);
2131 if (!GREEDY_REALLOC(pass_env
, n_env
+ 2))
2134 pass_env
[n_env
++] = TAKE_PTR(x
);
2135 pass_env
[n_env
] = NULL
;
2138 *ret
= TAKE_PTR(pass_env
);
2143 bool exec_needs_network_namespace(const ExecContext
*context
) {
2146 return context
->private_network
|| context
->network_namespace_path
;
2149 static bool exec_needs_ephemeral(const ExecContext
*context
) {
2150 return (context
->root_image
|| context
->root_directory
) && context
->root_ephemeral
;
2153 static bool exec_needs_ipc_namespace(const ExecContext
*context
) {
2156 return context
->private_ipc
|| context
->ipc_namespace_path
;
2159 bool exec_needs_mount_namespace(
2160 const ExecContext
*context
,
2161 const ExecParameters
*params
,
2162 const ExecRuntime
*runtime
) {
2166 if (context
->root_image
)
2169 if (!strv_isempty(context
->read_write_paths
) ||
2170 !strv_isempty(context
->read_only_paths
) ||
2171 !strv_isempty(context
->inaccessible_paths
) ||
2172 !strv_isempty(context
->exec_paths
) ||
2173 !strv_isempty(context
->no_exec_paths
))
2176 if (context
->n_bind_mounts
> 0)
2179 if (context
->n_temporary_filesystems
> 0)
2182 if (context
->n_mount_images
> 0)
2185 if (context
->n_extension_images
> 0)
2188 if (!strv_isempty(context
->extension_directories
))
2191 if (!IN_SET(context
->mount_propagation_flag
, 0, MS_SHARED
))
2194 if (context
->private_tmp
&& runtime
&& runtime
->shared
&& (runtime
->shared
->tmp_dir
|| runtime
->shared
->var_tmp_dir
))
2197 if (context
->private_devices
||
2198 context
->private_mounts
> 0 ||
2199 (context
->private_mounts
< 0 && exec_needs_network_namespace(context
)) ||
2200 context
->protect_system
!= PROTECT_SYSTEM_NO
||
2201 context
->protect_home
!= PROTECT_HOME_NO
||
2202 context
->protect_kernel_tunables
||
2203 context
->protect_kernel_modules
||
2204 context
->protect_kernel_logs
||
2205 context
->protect_control_groups
||
2206 context
->protect_proc
!= PROTECT_PROC_DEFAULT
||
2207 context
->proc_subset
!= PROC_SUBSET_ALL
||
2208 exec_needs_ipc_namespace(context
))
2211 if (context
->root_directory
) {
2212 if (exec_context_get_effective_mount_apivfs(context
))
2215 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
2216 if (params
&& !params
->prefix
[t
])
2219 if (context
->directories
[t
].n_items
> 0)
2224 if (context
->dynamic_user
&&
2225 (context
->directories
[EXEC_DIRECTORY_STATE
].n_items
> 0 ||
2226 context
->directories
[EXEC_DIRECTORY_CACHE
].n_items
> 0 ||
2227 context
->directories
[EXEC_DIRECTORY_LOGS
].n_items
> 0))
2230 if (context
->log_namespace
)
2236 static int setup_private_users(uid_t ouid
, gid_t ogid
, uid_t uid
, gid_t gid
) {
2237 _cleanup_free_
char *uid_map
= NULL
, *gid_map
= NULL
;
2238 _cleanup_close_pair_
int errno_pipe
[2] = PIPE_EBADF
;
2239 _cleanup_close_
int unshare_ready_fd
= -EBADF
;
2240 _cleanup_(sigkill_waitp
) pid_t pid
= 0;
2245 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2246 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2247 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2248 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2249 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2250 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2251 * continues execution normally.
2252 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2253 * does not need CAP_SETUID to write the single line mapping to itself. */
2255 /* Can only set up multiple mappings with CAP_SETUID. */
2256 if (have_effective_cap(CAP_SETUID
) > 0 && uid
!= ouid
&& uid_is_valid(uid
))
2257 r
= asprintf(&uid_map
,
2258 UID_FMT
" " UID_FMT
" 1\n" /* Map $OUID → $OUID */
2259 UID_FMT
" " UID_FMT
" 1\n", /* Map $UID → $UID */
2260 ouid
, ouid
, uid
, uid
);
2262 r
= asprintf(&uid_map
,
2263 UID_FMT
" " UID_FMT
" 1\n", /* Map $OUID → $OUID */
2269 /* Can only set up multiple mappings with CAP_SETGID. */
2270 if (have_effective_cap(CAP_SETGID
) > 0 && gid
!= ogid
&& gid_is_valid(gid
))
2271 r
= asprintf(&gid_map
,
2272 GID_FMT
" " GID_FMT
" 1\n" /* Map $OGID → $OGID */
2273 GID_FMT
" " GID_FMT
" 1\n", /* Map $GID → $GID */
2274 ogid
, ogid
, gid
, gid
);
2276 r
= asprintf(&gid_map
,
2277 GID_FMT
" " GID_FMT
" 1\n", /* Map $OGID -> $OGID */
2283 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2285 unshare_ready_fd
= eventfd(0, EFD_CLOEXEC
);
2286 if (unshare_ready_fd
< 0)
2289 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2291 if (pipe2(errno_pipe
, O_CLOEXEC
) < 0)
2294 r
= safe_fork("(sd-userns)", FORK_RESET_SIGNALS
|FORK_DEATHSIG
, &pid
);
2298 _cleanup_close_
int fd
= -EBADF
;
2302 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2303 * here, after the parent opened its own user namespace. */
2306 errno_pipe
[0] = safe_close(errno_pipe
[0]);
2308 /* Wait until the parent unshared the user namespace */
2309 if (read(unshare_ready_fd
, &c
, sizeof(c
)) < 0) {
2314 /* Disable the setgroups() system call in the child user namespace, for good. */
2315 a
= procfs_file_alloca(ppid
, "setgroups");
2316 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2318 if (errno
!= ENOENT
) {
2323 /* If the file is missing the kernel is too old, let's continue anyway. */
2325 if (write(fd
, "deny\n", 5) < 0) {
2330 fd
= safe_close(fd
);
2333 /* First write the GID map */
2334 a
= procfs_file_alloca(ppid
, "gid_map");
2335 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2340 if (write(fd
, gid_map
, strlen(gid_map
)) < 0) {
2344 fd
= safe_close(fd
);
2346 /* The write the UID map */
2347 a
= procfs_file_alloca(ppid
, "uid_map");
2348 fd
= open(a
, O_WRONLY
|O_CLOEXEC
);
2353 if (write(fd
, uid_map
, strlen(uid_map
)) < 0) {
2358 _exit(EXIT_SUCCESS
);
2361 (void) write(errno_pipe
[1], &r
, sizeof(r
));
2362 _exit(EXIT_FAILURE
);
2365 errno_pipe
[1] = safe_close(errno_pipe
[1]);
2367 if (unshare(CLONE_NEWUSER
) < 0)
2370 /* Let the child know that the namespace is ready now */
2371 if (write(unshare_ready_fd
, &c
, sizeof(c
)) < 0)
2374 /* Try to read an error code from the child */
2375 n
= read(errno_pipe
[0], &r
, sizeof(r
));
2378 if (n
== sizeof(r
)) { /* an error code was sent to us */
2383 if (n
!= 0) /* on success we should have read 0 bytes */
2386 r
= wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid
), 0);
2389 if (r
!= EXIT_SUCCESS
) /* If something strange happened with the child, let's consider this fatal, too */
2395 static bool exec_directory_is_private(const ExecContext
*context
, ExecDirectoryType type
) {
2398 if (!context
->dynamic_user
)
2401 if (type
== EXEC_DIRECTORY_CONFIGURATION
)
2404 if (type
== EXEC_DIRECTORY_RUNTIME
&& context
->runtime_directory_preserve_mode
== EXEC_PRESERVE_NO
)
2410 static int create_many_symlinks(const char *root
, const char *source
, char **symlinks
) {
2411 _cleanup_free_
char *src_abs
= NULL
;
2416 src_abs
= path_join(root
, source
);
2420 STRV_FOREACH(dst
, symlinks
) {
2421 _cleanup_free_
char *dst_abs
= NULL
;
2423 dst_abs
= path_join(root
, *dst
);
2427 r
= mkdir_parents_label(dst_abs
, 0755);
2431 r
= symlink_idempotent(src_abs
, dst_abs
, true);
2439 static int setup_exec_directory(
2441 const ExecContext
*context
,
2442 const ExecParameters
*params
,
2445 ExecDirectoryType type
,
2446 bool needs_mount_namespace
,
2449 static const int exit_status_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
2450 [EXEC_DIRECTORY_RUNTIME
] = EXIT_RUNTIME_DIRECTORY
,
2451 [EXEC_DIRECTORY_STATE
] = EXIT_STATE_DIRECTORY
,
2452 [EXEC_DIRECTORY_CACHE
] = EXIT_CACHE_DIRECTORY
,
2453 [EXEC_DIRECTORY_LOGS
] = EXIT_LOGS_DIRECTORY
,
2454 [EXEC_DIRECTORY_CONFIGURATION
] = EXIT_CONFIGURATION_DIRECTORY
,
2460 assert(type
>= 0 && type
< _EXEC_DIRECTORY_TYPE_MAX
);
2461 assert(exit_status
);
2463 if (!params
->prefix
[type
])
2466 if (params
->flags
& EXEC_CHOWN_DIRECTORIES
) {
2467 if (!uid_is_valid(uid
))
2469 if (!gid_is_valid(gid
))
2473 for (size_t i
= 0; i
< context
->directories
[type
].n_items
; i
++) {
2474 _cleanup_free_
char *p
= NULL
, *pp
= NULL
;
2476 p
= path_join(params
->prefix
[type
], context
->directories
[type
].items
[i
].path
);
2482 r
= mkdir_parents_label(p
, 0755);
2486 if (IN_SET(type
, EXEC_DIRECTORY_STATE
, EXEC_DIRECTORY_LOGS
) && params
->runtime_scope
== RUNTIME_SCOPE_USER
) {
2488 /* If we are in user mode, and a configuration directory exists but a state directory
2489 * doesn't exist, then we likely are upgrading from an older systemd version that
2490 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2491 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2492 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
2493 * separated. If a service has both dirs configured but only the configuration dir
2494 * exists and the state dir does not, we assume we are looking at an update
2495 * situation. Hence, create a compatibility symlink, so that all expectations are
2498 * (We also do something similar with the log directory, which still doesn't exist in
2499 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2501 /* this assumes the state dir is always created before the configuration dir */
2502 assert_cc(EXEC_DIRECTORY_STATE
< EXEC_DIRECTORY_LOGS
);
2503 assert_cc(EXEC_DIRECTORY_LOGS
< EXEC_DIRECTORY_CONFIGURATION
);
2505 r
= laccess(p
, F_OK
);
2507 _cleanup_free_
char *q
= NULL
;
2509 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2510 * under the configuration hierarchy. */
2512 if (type
== EXEC_DIRECTORY_STATE
)
2513 q
= path_join(params
->prefix
[EXEC_DIRECTORY_CONFIGURATION
], context
->directories
[type
].items
[i
].path
);
2514 else if (type
== EXEC_DIRECTORY_LOGS
)
2515 q
= path_join(params
->prefix
[EXEC_DIRECTORY_CONFIGURATION
], "log", context
->directories
[type
].items
[i
].path
);
2517 assert_not_reached();
2523 r
= laccess(q
, F_OK
);
2525 /* It does exist! This hence looks like an update. Symlink the
2526 * configuration directory into the state directory. */
2528 r
= symlink_idempotent(q
, p
, /* make_relative= */ true);
2532 log_unit_notice(u
, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p
, q
);
2534 } else if (r
!= -ENOENT
)
2535 log_unit_warning_errno(u
, r
, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q
);
2538 log_unit_warning_errno(u
, r
, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p
);
2541 if (exec_directory_is_private(context
, type
)) {
2542 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2543 * case we want to avoid leaving a directory around fully accessible that is owned by
2544 * a dynamic user whose UID is later on reused. To lock this down we use the same
2545 * trick used by container managers to prohibit host users to get access to files of
2546 * the same UID in containers: we place everything inside a directory that has an
2547 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2548 * for unprivileged host code. We then use fs namespacing to make this directory
2549 * permeable for the service itself.
2551 * Specifically: for a service which wants a special directory "foo/" we first create
2552 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2553 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2554 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2555 * unprivileged host users can't look into it. Inside of the namespace of the unit
2556 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2557 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2558 * for the service and making sure it only gets access to the dirs it needs but no
2559 * others. Tricky? Yes, absolutely, but it works!
2561 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2562 * to be owned by the service itself.
2564 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2565 * for sharing files or sockets with other services. */
2567 pp
= path_join(params
->prefix
[type
], "private");
2573 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2574 r
= mkdir_safe_label(pp
, 0700, 0, 0, MKDIR_WARN_MODE
);
2578 if (!path_extend(&pp
, context
->directories
[type
].items
[i
].path
)) {
2583 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2584 r
= mkdir_parents_label(pp
, 0755);
2588 if (is_dir(p
, false) > 0 &&
2589 (laccess(pp
, F_OK
) == -ENOENT
)) {
2591 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2592 * it over. Most likely the service has been upgraded from one that didn't use
2593 * DynamicUser=1, to one that does. */
2595 log_unit_info(u
, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2596 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2597 exec_directory_type_to_string(type
), p
, pp
);
2599 r
= RET_NERRNO(rename(p
, pp
));
2603 /* Otherwise, create the actual directory for the service */
2605 r
= mkdir_label(pp
, context
->directories
[type
].mode
);
2606 if (r
< 0 && r
!= -EEXIST
)
2610 if (!context
->directories
[type
].items
[i
].only_create
) {
2611 /* And link it up from the original place.
2613 * 1) If a mount namespace is going to be used, then this symlink remains on
2614 * the host, and a new one for the child namespace will be created later.
2615 * 2) It is not necessary to create this symlink when one of its parent
2616 * directories is specified and already created. E.g.
2617 * StateDirectory=foo foo/bar
2618 * In that case, the inode points to pp and p for "foo/bar" are the same:
2619 * pp = "/var/lib/private/foo/bar"
2620 * p = "/var/lib/foo/bar"
2621 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2622 * we do not need to create the symlink, but we cannot create the symlink.
2623 * See issue #24783. */
2624 r
= symlink_idempotent(pp
, p
, true);
2630 _cleanup_free_
char *target
= NULL
;
2632 if (type
!= EXEC_DIRECTORY_CONFIGURATION
&&
2633 readlink_and_make_absolute(p
, &target
) >= 0) {
2634 _cleanup_free_
char *q
= NULL
, *q_resolved
= NULL
, *target_resolved
= NULL
;
2636 /* This already exists and is a symlink? Interesting. Maybe it's one created
2637 * by DynamicUser=1 (see above)?
2639 * We do this for all directory types except for ConfigurationDirectory=,
2640 * since they all support the private/ symlink logic at least in some
2641 * configurations, see above. */
2643 r
= chase(target
, NULL
, 0, &target_resolved
, NULL
);
2647 q
= path_join(params
->prefix
[type
], "private", context
->directories
[type
].items
[i
].path
);
2653 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2654 r
= chase(q
, NULL
, CHASE_NONEXISTENT
, &q_resolved
, NULL
);
2658 if (path_equal(q_resolved
, target_resolved
)) {
2660 /* Hmm, apparently DynamicUser= was once turned on for this service,
2661 * but is no longer. Let's move the directory back up. */
2663 log_unit_info(u
, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2664 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2665 exec_directory_type_to_string(type
), q
, p
);
2667 r
= RET_NERRNO(unlink(p
));
2671 r
= RET_NERRNO(rename(q
, p
));
2677 r
= mkdir_label(p
, context
->directories
[type
].mode
);
2682 if (type
== EXEC_DIRECTORY_CONFIGURATION
) {
2685 /* Don't change the owner/access mode of the configuration directory,
2686 * as in the common case it is not written to by a service, and shall
2687 * not be writable. */
2689 r
= RET_NERRNO(stat(p
, &st
));
2693 /* Still complain if the access mode doesn't match */
2694 if (((st
.st_mode
^ context
->directories
[type
].mode
) & 07777) != 0)
2695 log_unit_warning(u
, "%s \'%s\' already exists but the mode is different. "
2696 "(File system: %o %sMode: %o)",
2697 exec_directory_type_to_string(type
), context
->directories
[type
].items
[i
].path
,
2698 st
.st_mode
& 07777, exec_directory_type_to_string(type
), context
->directories
[type
].mode
& 07777);
2705 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2706 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2707 * current UID/GID ownership.) */
2708 r
= chmod_and_chown(pp
?: p
, context
->directories
[type
].mode
, UID_INVALID
, GID_INVALID
);
2712 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2713 * available to user code anyway */
2714 if (params
->runtime_scope
!= RUNTIME_SCOPE_SYSTEM
)
2717 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2718 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2719 * assignments to exist. */
2720 r
= path_chown_recursive(pp
?: p
, uid
, gid
, context
->dynamic_user
? 01777 : 07777, AT_SYMLINK_FOLLOW
);
2725 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2726 * they are set up later, to allow configuring empty var/run/etc. */
2727 if (!needs_mount_namespace
)
2728 for (size_t i
= 0; i
< context
->directories
[type
].n_items
; i
++) {
2729 r
= create_many_symlinks(params
->prefix
[type
],
2730 context
->directories
[type
].items
[i
].path
,
2731 context
->directories
[type
].items
[i
].symlinks
);
2739 *exit_status
= exit_status_table
[type
];
2743 static int write_credential(
2750 bool ownership_ok
) {
2752 _cleanup_(unlink_and_freep
) char *tmp
= NULL
;
2753 _cleanup_close_
int fd
= -EBADF
;
2756 r
= tempfn_random_child("", "cred", &tmp
);
2760 fd
= openat(dfd
, tmp
, O_CREAT
|O_RDWR
|O_CLOEXEC
|O_EXCL
|O_NOFOLLOW
|O_NOCTTY
, 0600);
2766 r
= loop_write(fd
, data
, size
, /* do_poll = */ false);
2770 if (fchmod(fd
, 0400) < 0) /* Take away "w" bit */
2773 if (uid_is_valid(uid
) && uid
!= getuid()) {
2774 r
= fd_add_uid_acl_permission(fd
, uid
, ACL_READ
);
2776 if (!ERRNO_IS_NOT_SUPPORTED(r
) && !ERRNO_IS_PRIVILEGE(r
))
2779 if (!ownership_ok
) /* Ideally we use ACLs, since we can neatly express what we want
2780 * to express: that the user gets read access and nothing
2781 * else. But if the backing fs can't support that (e.g. ramfs)
2782 * then we can use file ownership instead. But that's only safe if
2783 * we can then re-mount the whole thing read-only, so that the
2784 * user can no longer chmod() the file to gain write access. */
2787 if (fchown(fd
, uid
, gid
) < 0)
2792 if (renameat(dfd
, tmp
, dfd
, id
) < 0)
2799 typedef enum CredentialSearchPath
{
2800 CREDENTIAL_SEARCH_PATH_TRUSTED
,
2801 CREDENTIAL_SEARCH_PATH_ENCRYPTED
,
2802 CREDENTIAL_SEARCH_PATH_ALL
,
2803 _CREDENTIAL_SEARCH_PATH_MAX
,
2804 _CREDENTIAL_SEARCH_PATH_INVALID
= -EINVAL
,
2805 } CredentialSearchPath
;
2807 static char **credential_search_path(const ExecParameters
*params
, CredentialSearchPath path
) {
2809 _cleanup_strv_free_
char **l
= NULL
;
2812 assert(path
>= 0 && path
< _CREDENTIAL_SEARCH_PATH_MAX
);
2814 /* Assemble a search path to find credentials in. For non-encrypted credentials, we'll look in
2815 * /etc/credstore/ (and similar directories in /usr/lib/ + /run/). If we're looking for encrypted
2816 * credentials, we'll look in /etc/credstore.encrypted/ (and similar dirs). */
2818 if (IN_SET(path
, CREDENTIAL_SEARCH_PATH_ENCRYPTED
, CREDENTIAL_SEARCH_PATH_ALL
)) {
2819 if (strv_extend(&l
, params
->received_encrypted_credentials_directory
) < 0)
2822 if (strv_extend_strv(&l
, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2826 if (IN_SET(path
, CREDENTIAL_SEARCH_PATH_TRUSTED
, CREDENTIAL_SEARCH_PATH_ALL
)) {
2827 if (params
->received_credentials_directory
)
2828 if (strv_extend(&l
, params
->received_credentials_directory
) < 0)
2831 if (strv_extend_strv(&l
, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2835 if (DEBUG_LOGGING
) {
2836 _cleanup_free_
char *t
= strv_join(l
, ":");
2838 log_debug("Credential search path is: %s", strempty(t
));
2844 static int maybe_decrypt_and_write_credential(
2855 _cleanup_free_
void *plaintext
= NULL
;
2860 size_t plaintext_size
= 0;
2862 r
= decrypt_credential_and_warn(id
, now(CLOCK_REALTIME
), NULL
, NULL
, data
, size
,
2863 &plaintext
, &plaintext_size
);
2868 size
= plaintext_size
;
2871 add
= strlen(id
) + size
;
2875 r
= write_credential(dir_fd
, id
, data
, size
, uid
, gid
, ownership_ok
);
2877 return log_debug_errno(r
, "Failed to write credential '%s': %m", id
);
2883 static int load_credential_glob(
2887 ReadFullFileFlags flags
,
2896 STRV_FOREACH(d
, search_path
) {
2897 _cleanup_globfree_ glob_t pglob
= {};
2898 _cleanup_free_
char *j
= NULL
;
2900 j
= path_join(*d
, path
);
2904 r
= safe_glob(j
, 0, &pglob
);
2910 for (size_t n
= 0; n
< pglob
.gl_pathc
; n
++) {
2911 _cleanup_free_
char *fn
= NULL
;
2912 _cleanup_(erase_and_freep
) char *data
= NULL
;
2915 /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2916 r
= read_full_file_full(
2920 encrypted
? CREDENTIAL_ENCRYPTED_SIZE_MAX
: CREDENTIAL_SIZE_MAX
,
2925 return log_debug_errno(r
, "Failed to read credential '%s': %m",
2928 r
= path_extract_filename(pglob
.gl_pathv
[n
], &fn
);
2930 return log_debug_errno(r
, "Failed to extract filename from '%s': %m",
2933 r
= maybe_decrypt_and_write_credential(
2952 static int load_credential(
2953 const ExecContext
*context
,
2954 const ExecParameters
*params
,
2966 ReadFullFileFlags flags
= READ_FULL_FILE_SECURE
|READ_FULL_FILE_FAIL_WHEN_LARGER
;
2967 _cleanup_strv_free_
char **search_path
= NULL
;
2968 _cleanup_(erase_and_freep
) char *data
= NULL
;
2969 _cleanup_free_
char *bindname
= NULL
;
2970 const char *source
= NULL
;
2971 bool missing_ok
= true;
2980 assert(read_dfd
>= 0 || read_dfd
== AT_FDCWD
);
2981 assert(write_dfd
>= 0);
2984 if (read_dfd
>= 0) {
2985 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2986 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2987 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2990 if (!filename_is_valid(path
)) /* safety check */
2996 } else if (path_is_absolute(path
)) {
2997 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
3000 if (!path_is_valid(path
)) /* safety check */
3003 flags
|= READ_FULL_FILE_CONNECT_SOCKET
;
3005 /* Pass some minimal info about the unit and the credential name we are looking to acquire
3006 * via the source socket address in case we read off an AF_UNIX socket. */
3007 if (asprintf(&bindname
, "@%" PRIx64
"/unit/%s/%s", random_u64(), unit
, id
) < 0)
3013 } else if (credential_name_valid(path
)) {
3014 /* If this is a relative path, take it as credential name relative to the credentials
3015 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
3016 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
3018 search_path
= credential_search_path(params
, CREDENTIAL_SEARCH_PATH_ALL
);
3027 flags
|= READ_FULL_FILE_UNBASE64
;
3029 maxsz
= encrypted
? CREDENTIAL_ENCRYPTED_SIZE_MAX
: CREDENTIAL_SIZE_MAX
;
3032 STRV_FOREACH(d
, search_path
) {
3033 _cleanup_free_
char *j
= NULL
;
3035 j
= path_join(*d
, path
);
3039 r
= read_full_file_full(
3040 AT_FDCWD
, j
, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
3050 r
= read_full_file_full(
3060 if (r
== -ENOENT
&& (missing_ok
|| hashmap_contains(context
->set_credentials
, id
))) {
3061 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
3062 * will get clear errors if we don't pass such a missing credential on as they
3063 * themselves will get ENOENT when trying to read them, which should not be much
3064 * worse than when we handle the error here and make it fatal.
3066 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
3067 * we are fine, too. */
3068 log_debug_errno(r
, "Couldn't read inherited credential '%s', skipping: %m", path
);
3072 return log_debug_errno(r
, "Failed to read credential '%s': %m", path
);
3074 return maybe_decrypt_and_write_credential(write_dfd
, id
, encrypted
, uid
, gid
, ownership_ok
, data
, size
, left
);
3077 struct load_cred_args
{
3078 const ExecContext
*context
;
3079 const ExecParameters
*params
;
3089 static int load_cred_recurse_dir_cb(
3090 RecurseDirEvent event
,
3094 const struct dirent
*de
,
3095 const struct statx
*sx
,
3098 struct load_cred_args
*args
= ASSERT_PTR(userdata
);
3099 _cleanup_free_
char *sub_id
= NULL
;
3102 if (event
!= RECURSE_DIR_ENTRY
)
3103 return RECURSE_DIR_CONTINUE
;
3105 if (!IN_SET(de
->d_type
, DT_REG
, DT_SOCK
))
3106 return RECURSE_DIR_CONTINUE
;
3108 sub_id
= strreplace(path
, "/", "_");
3112 if (!credential_name_valid(sub_id
))
3113 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL
), "Credential would get ID %s, which is not valid, refusing", sub_id
);
3115 if (faccessat(args
->dfd
, sub_id
, F_OK
, AT_SYMLINK_NOFOLLOW
) >= 0) {
3116 log_debug("Skipping credential with duplicated ID %s at %s", sub_id
, path
);
3117 return RECURSE_DIR_CONTINUE
;
3119 if (errno
!= ENOENT
)
3120 return log_debug_errno(errno
, "Failed to test if credential %s exists: %m", sub_id
);
3122 r
= load_credential(
3138 return RECURSE_DIR_CONTINUE
;
/* acquire_credentials(): fills the credentials directory opened from 'p'.
 *
 * Visible steps, in order:
 *   1. open the directory and make it writable via fd_acl_make_writable();
 *   2. iterate context->load_credentials, loading single files via load_credential() or, for absolute
 *      paths that open as directories, recursing with load_cred_recurse_dir_cb();
 *   3. iterate context->import_credentials, globbing first the trusted and then the encrypted
 *      credential search path via load_credential_glob();
 *   4. iterate context->set_credentials, writing literal credentials that do not exist yet
 *      (checked with faccessat() before any decryption is attempted);
 *   5. make the directory read-only again and grant 'uid' read/search access via an ACL
 *      (an fchown() call follows; NOTE(review): presumably the fallback when ACLs are not usable —
 *      the condition lines are not visible here, confirm against the full file).
 *
 * 'left' starts at CREDENTIALS_TOTAL_SIZE_MAX and is handed down to the loaders as the remaining
 * size budget. */
3141 static int acquire_credentials(
3142 const ExecContext
*context
,
3143 const ExecParameters
*params
,
3148 bool ownership_ok
) {
3150 uint64_t left
= CREDENTIALS_TOTAL_SIZE_MAX
;
3151 _cleanup_close_
int dfd
= -EBADF
;
3153 ExecLoadCredential
*lc
;
3154 ExecSetCredential
*sc
;
3160 dfd
= open(p
, O_DIRECTORY
|O_CLOEXEC
);
3164 r
= fd_acl_make_writable(dfd
); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
3168 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
3169 HASHMAP_FOREACH(lc
, context
->load_credentials
) {
3170 _cleanup_close_
int sub_fd
= -EBADF
;
3172 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
3173 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
3174 * a regular file. Finally, if it's a relative path we will use it as a credential name to
3175 * propagate a credential passed to us from further up. */
3177 if (path_is_absolute(lc
->path
)) {
3178 sub_fd
= open(lc
->path
, O_DIRECTORY
|O_CLOEXEC
|O_RDONLY
);
3179 if (sub_fd
< 0 && !IN_SET(errno
,
3180 ENOTDIR
, /* Not a directory */
3181 ENOENT
)) /* Doesn't exist? */
3182 return log_debug_errno(errno
, "Failed to open '%s': %m", lc
->path
);
3186 /* Regular file (incl. a credential passed in from higher up) */
3187 r
= load_credential(
3204 /* path= */ lc
->id
, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
3205 /* statx_mask= */ 0,
3206 /* n_depth_max= */ UINT_MAX
,
3207 RECURSE_DIR_SORT
|RECURSE_DIR_IGNORE_DOT
|RECURSE_DIR_ENSURE_TYPE
,
3208 load_cred_recurse_dir_cb
,
3209 &(struct load_cred_args
) {
3212 .encrypted
= lc
->encrypted
,
3217 .ownership_ok
= ownership_ok
,
3224 /* Next, look for system credentials and credentials in the credentials store. Note that these do not
3225 * override any credentials found earlier. */
3226 SET_FOREACH(ic
, context
->import_credentials
) {
/* search_path is a string vector returned by credential_search_path(); it must be released with the
 * strv cleanup handler, otherwise only the array is freed on scope exit and the strings leak. */
3227 _cleanup_strv_free_
char **search_path
= NULL
;
3229 search_path
= credential_search_path(params
, CREDENTIAL_SEARCH_PATH_TRUSTED
);
3233 r
= load_credential_glob(
3235 /* encrypted = */ false,
3237 READ_FULL_FILE_SECURE
|READ_FULL_FILE_FAIL_WHEN_LARGER
,
3246 search_path
= strv_free(search_path
);
3247 search_path
= credential_search_path(params
, CREDENTIAL_SEARCH_PATH_ENCRYPTED
);
3251 r
= load_credential_glob(
3253 /* encrypted = */ true,
3255 READ_FULL_FILE_SECURE
|READ_FULL_FILE_FAIL_WHEN_LARGER
|READ_FULL_FILE_UNBASE64
,
3265 /* Finally, we add in literally specified credentials. If the credentials already exist, we'll not
3266 * add them, so that they can act as a "default" if the same credential is specified multiple times. */
3267 HASHMAP_FOREACH(sc
, context
->set_credentials
) {
3268 _cleanup_(erase_and_freep
) void *plaintext
= NULL
;
3272 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
3273 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
3274 * slow and involved, hence it's nice to be able to skip that if the credential already
3276 if (faccessat(dfd
, sc
->id
, F_OK
, AT_SYMLINK_NOFOLLOW
) >= 0)
3278 if (errno
!= ENOENT
)
3279 return log_debug_errno(errno
, "Failed to test if credential %s exists: %m", sc
->id
);
3281 if (sc
->encrypted
) {
3282 r
= decrypt_credential_and_warn(sc
->id
, now(CLOCK_REALTIME
), NULL
, NULL
, sc
->data
, sc
->size
, &plaintext
, &size
);
3292 add
= strlen(sc
->id
) + size
;
3296 r
= write_credential(dfd
, sc
->id
, data
, size
, uid
, gid
, ownership_ok
);
3303 r
= fd_acl_make_read_only(dfd
); /* Now take away the "w" bit */
3307 /* After we created all keys with the right perms, also make sure the credential store as a whole is
3310 if (uid_is_valid(uid
) && uid
!= getuid()) {
3311 r
= fd_add_uid_acl_permission(dfd
, uid
, ACL_READ
| ACL_EXECUTE
);
3313 if (!ERRNO_IS_NOT_SUPPORTED(r
) && !ERRNO_IS_PRIVILEGE(r
))
3319 if (fchown(dfd
, uid
, gid
) < 0)
3327 static int setup_credentials_internal(
3328 const ExecContext
*context
,
3329 const ExecParameters
*params
,
3331 const char *final
, /* This is where the credential store shall eventually end up at */
3332 const char *workspace
, /* This is where we can prepare it before moving it to the final place */
3333 bool reuse_workspace
, /* Whether to reuse any existing workspace mount if it already is a mount */
3334 bool must_mount
, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
3338 int r
, workspace_mounted
; /* negative if we don't know yet whether we have/can mount something; true
3339 * if we mounted something; false if we definitely can't mount anything */
3347 if (reuse_workspace
) {
3348 r
= path_is_mount_point(workspace
, NULL
, 0);
3352 workspace_mounted
= true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3354 workspace_mounted
= -1; /* We need to figure out if we can mount something to the workspace */
3356 workspace_mounted
= -1; /* ditto */
3358 r
= path_is_mount_point(final
, NULL
, 0);
3362 /* If the final place already has something mounted, we use that. If the workspace also has
3363 * something mounted we assume it's actually the same mount (but with MS_RDONLY
3365 final_mounted
= true;
3367 if (workspace_mounted
< 0) {
3368 /* If the final place is mounted, but the workspace isn't, then let's bind mount
3369 * the final version to the workspace, and make it writable, so that we can make
3372 r
= mount_nofollow_verbose(LOG_DEBUG
, final
, workspace
, NULL
, MS_BIND
|MS_REC
, NULL
);
3376 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, workspace
, NULL
, MS_BIND
|MS_REMOUNT
|credentials_fs_mount_flags(/* ro= */ false), NULL
);
3380 workspace_mounted
= true;
3383 final_mounted
= false;
3385 if (workspace_mounted
< 0) {
3386 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3388 r
= mount_credentials_fs(workspace
, CREDENTIALS_TOTAL_SIZE_MAX
, /* ro= */ false);
3390 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3391 r
= mount_nofollow_verbose(LOG_DEBUG
, final
, workspace
, NULL
, MS_BIND
|MS_REC
, NULL
);
3393 if (!ERRNO_IS_PRIVILEGE(r
)) /* Propagate anything that isn't a permission problem */
3396 if (must_mount
) /* If we it's not OK to use the plain directory
3397 * fallback, propagate all errors too */
3400 /* If we lack privileges to bind mount stuff, then let's gracefully
3401 * proceed for compat with container envs, and just use the final dir
3404 workspace_mounted
= false;
3406 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3407 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, workspace
, NULL
, MS_BIND
|MS_REMOUNT
|credentials_fs_mount_flags(/* ro= */ false), NULL
);
3411 workspace_mounted
= true;
3414 workspace_mounted
= true;
3417 assert(!must_mount
|| workspace_mounted
> 0);
3418 where
= workspace_mounted
? workspace
: final
;
3420 (void) label_fix_full(AT_FDCWD
, where
, final
, 0);
3422 r
= acquire_credentials(context
, params
, unit
, where
, uid
, gid
, workspace_mounted
);
3426 if (workspace_mounted
) {
3429 /* Determine if we should actually install the prepared mount in the final location by bind
3430 * mounting it there. We do so only if the mount is not established there already, and if the
3431 * mount is actually non-empty (i.e. carries at least one credential). Note that in the best
3432 * case we are doing all this in a mount namespace, thus no one else will see that we
3433 * allocated a file system we are getting rid of again here. */
3435 install
= false; /* already installed */
3437 r
= dir_is_empty(where
, /* ignore_hidden_or_backup= */ false);
3441 install
= r
== 0; /* install only if non-empty */
3445 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3446 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, workspace
, NULL
, MS_BIND
|MS_REMOUNT
|credentials_fs_mount_flags(/* ro= */ true), NULL
);
3450 /* And mount it to the final place, read-only */
3451 r
= mount_nofollow_verbose(LOG_DEBUG
, workspace
, final
, NULL
, MS_MOVE
, NULL
);
3453 /* Otherwise get rid of it */
3454 r
= umount_verbose(LOG_DEBUG
, workspace
, MNT_DETACH
|UMOUNT_NOFOLLOW
);
3458 _cleanup_free_
char *parent
= NULL
;
3460 /* If we do not have our own mount but used the plain directory fallback, then we need to
3461 * open access to the top-level credential directory and the per-service directory now */
3463 r
= path_extract_directory(final
, &parent
);
3466 if (chmod(parent
, 0755) < 0)
3473 static int setup_credentials(
3474 const ExecContext
*context
,
3475 const ExecParameters
*params
,
3480 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
3486 if (!exec_context_has_credentials(context
))
3489 if (!params
->prefix
[EXEC_DIRECTORY_RUNTIME
])
3492 /* This is where we'll place stuff when we are done; this main credentials directory is world-readable,
3493 * and the subdir we mount over with a read-only file system readable by the service's user */
3494 q
= path_join(params
->prefix
[EXEC_DIRECTORY_RUNTIME
], "credentials");
3498 r
= mkdir_label(q
, 0755); /* top-level dir: world readable/searchable */
3499 if (r
< 0 && r
!= -EEXIST
)
3502 p
= path_join(q
, unit
);
3506 r
= mkdir_label(p
, 0700); /* per-unit dir: private to user */
3507 if (r
< 0 && r
!= -EEXIST
)
3510 r
= safe_fork("(sd-mkdcreds)", FORK_DEATHSIG
|FORK_WAIT
|FORK_NEW_MOUNTNS
, NULL
);
3512 _cleanup_free_
char *t
= NULL
, *u
= NULL
;
3514 /* If this is not a privilege or support issue then propagate the error */
3515 if (!ERRNO_IS_NOT_SUPPORTED(r
) && !ERRNO_IS_PRIVILEGE(r
))
3518 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3519 * it into place, so that users can't access half-initialized credential stores. */
3520 t
= path_join(params
->prefix
[EXEC_DIRECTORY_RUNTIME
], "systemd/temporary-credentials");
3524 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3525 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3526 * after it is fully set up */
3527 u
= path_join(t
, unit
);
3531 FOREACH_STRING(i
, t
, u
) {
3532 r
= mkdir_label(i
, 0700);
3533 if (r
< 0 && r
!= -EEXIST
)
3537 r
= setup_credentials_internal(
3541 p
, /* final mount point */
3542 u
, /* temporary workspace to overmount */
3543 true, /* reuse the workspace if it is already a mount */
3544 false, /* it's OK to fall back to a plain directory if we can't mount anything */
3548 (void) rmdir(u
); /* remove the workspace again if we can. */
3553 } else if (r
== 0) {
3555 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3556 * we can use the same directory for all cases, after turning off propagation. Question
3557 * though is: where do we turn off propagation exactly, and where do we place the workspace
3558 * directory? We need some place that is guaranteed to be a mount point in the host, and
3559 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3560 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3561 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3562 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3563 * we do here instead is use /dev/ and /dev/shm/ for our purposes. We know for sure that
3564 * /dev/ is a mount point and we know for sure that /dev/shm/ exists. Hence we can turn off
3565 * propagation on the former, and then overmount the latter.
3567 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3568 * for this purpose, but there are few other candidates that work equally well for us, and
3569 * given that we do this in a privately namespaced short-lived single-threaded process
3570 * that no one else sees this should be OK to do. */
3572 r
= mount_nofollow_verbose(LOG_DEBUG
, NULL
, "/dev", NULL
, MS_SLAVE
|MS_REC
, NULL
); /* Turn off propagation from our namespace to host */
3576 r
= setup_credentials_internal(
3580 p
, /* final mount point */
3581 "/dev/shm", /* temporary workspace to overmount */
3582 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3583 true, /* insist that something is mounted, do not allow fallback to plain directory */
3589 _exit(EXIT_SUCCESS
);
3592 _exit(EXIT_FAILURE
);
3595 /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
3596 * try to remove it. This matters in particular if we created the dir as mount point but then didn't
3597 * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCES being
3598 * seen by users when trying to access this inode. */
3604 static int setup_smack(
3605 const Manager
*manager
,
3606 const ExecContext
*context
,
3607 int executable_fd
) {
3611 assert(executable_fd
>= 0);
3613 if (context
->smack_process_label
) {
3614 r
= mac_smack_apply_pid(0, context
->smack_process_label
);
3617 } else if (manager
->default_smack_process_label
) {
3618 _cleanup_free_
char *exec_label
= NULL
;
3620 r
= mac_smack_read_fd(executable_fd
, SMACK_ATTR_EXEC
, &exec_label
);
3621 if (r
< 0 && !ERRNO_IS_XATTR_ABSENT(r
))
3624 r
= mac_smack_apply_pid(0, exec_label
?: manager
->default_smack_process_label
);
3633 static int compile_bind_mounts(
3634 const ExecContext
*context
,
3635 const ExecParameters
*params
,
3636 BindMount
**ret_bind_mounts
,
3637 size_t *ret_n_bind_mounts
,
3638 char ***ret_empty_directories
) {
3640 _cleanup_strv_free_
char **empty_directories
= NULL
;
3641 BindMount
*bind_mounts
= NULL
;
3647 assert(ret_bind_mounts
);
3648 assert(ret_n_bind_mounts
);
3649 assert(ret_empty_directories
);
3651 CLEANUP_ARRAY(bind_mounts
, h
, bind_mount_free_many
);
3653 n
= context
->n_bind_mounts
;
3654 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
3655 if (!params
->prefix
[t
])
3658 for (size_t i
= 0; i
< context
->directories
[t
].n_items
; i
++)
3659 n
+= !context
->directories
[t
].items
[i
].only_create
;
3663 *ret_bind_mounts
= NULL
;
3664 *ret_n_bind_mounts
= 0;
3665 *ret_empty_directories
= NULL
;
3669 bind_mounts
= new(BindMount
, n
);
3673 for (size_t i
= 0; i
< context
->n_bind_mounts
; i
++) {
3674 BindMount
*item
= context
->bind_mounts
+ i
;
3675 _cleanup_free_
char *s
= NULL
, *d
= NULL
;
3677 s
= strdup(item
->source
);
3681 d
= strdup(item
->destination
);
3685 bind_mounts
[h
++] = (BindMount
) {
3686 .source
= TAKE_PTR(s
),
3687 .destination
= TAKE_PTR(d
),
3688 .read_only
= item
->read_only
,
3689 .recursive
= item
->recursive
,
3690 .ignore_enoent
= item
->ignore_enoent
,
3694 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
3695 if (!params
->prefix
[t
])
3698 if (context
->directories
[t
].n_items
== 0)
3701 if (exec_directory_is_private(context
, t
) &&
3702 !exec_context_with_rootfs(context
)) {
3705 /* So this is for a dynamic user, and we need to make sure the process can access its own
3706 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3707 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3709 private_root
= path_join(params
->prefix
[t
], "private");
3713 r
= strv_consume(&empty_directories
, private_root
);
3718 for (size_t i
= 0; i
< context
->directories
[t
].n_items
; i
++) {
3719 _cleanup_free_
char *s
= NULL
, *d
= NULL
;
3721 /* When one of the parent directories is in the list, we cannot create the symlink
3722 * for the child directory. See also the comments in setup_exec_directory(). */
3723 if (context
->directories
[t
].items
[i
].only_create
)
3726 if (exec_directory_is_private(context
, t
))
3727 s
= path_join(params
->prefix
[t
], "private", context
->directories
[t
].items
[i
].path
);
3729 s
= path_join(params
->prefix
[t
], context
->directories
[t
].items
[i
].path
);
3733 if (exec_directory_is_private(context
, t
) &&
3734 exec_context_with_rootfs(context
))
3735 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3736 * directory is not created on the root directory. So, let's bind-mount the directory
3737 * on the 'non-private' place. */
3738 d
= path_join(params
->prefix
[t
], context
->directories
[t
].items
[i
].path
);
3744 bind_mounts
[h
++] = (BindMount
) {
3745 .source
= TAKE_PTR(s
),
3746 .destination
= TAKE_PTR(d
),
3748 .nosuid
= context
->dynamic_user
, /* don't allow suid/sgid when DynamicUser= is on */
3750 .ignore_enoent
= false,
3757 *ret_bind_mounts
= TAKE_PTR(bind_mounts
);
3758 *ret_n_bind_mounts
= n
;
3759 *ret_empty_directories
= TAKE_PTR(empty_directories
);
3764 /* ret_symlinks will contain a list of pairs src:dest that describes
3765 * the symlinks to create later on. For example, the symlinks needed
3766 * to safely give private directories to DynamicUser=1 users. */
3767 static int compile_symlinks(
3768 const ExecContext
*context
,
3769 const ExecParameters
*params
,
3770 bool setup_os_release_symlink
,
3771 char ***ret_symlinks
) {
3773 _cleanup_strv_free_
char **symlinks
= NULL
;
3778 assert(ret_symlinks
);
3780 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
3781 for (size_t i
= 0; i
< context
->directories
[dt
].n_items
; i
++) {
3782 _cleanup_free_
char *private_path
= NULL
, *path
= NULL
;
3784 STRV_FOREACH(symlink
, context
->directories
[dt
].items
[i
].symlinks
) {
3785 _cleanup_free_
char *src_abs
= NULL
, *dst_abs
= NULL
;
3787 src_abs
= path_join(params
->prefix
[dt
], context
->directories
[dt
].items
[i
].path
);
3788 dst_abs
= path_join(params
->prefix
[dt
], *symlink
);
3789 if (!src_abs
|| !dst_abs
)
3792 r
= strv_consume_pair(&symlinks
, TAKE_PTR(src_abs
), TAKE_PTR(dst_abs
));
3797 if (!exec_directory_is_private(context
, dt
) ||
3798 exec_context_with_rootfs(context
) ||
3799 context
->directories
[dt
].items
[i
].only_create
)
3802 private_path
= path_join(params
->prefix
[dt
], "private", context
->directories
[dt
].items
[i
].path
);
3806 path
= path_join(params
->prefix
[dt
], context
->directories
[dt
].items
[i
].path
);
3810 r
= strv_consume_pair(&symlinks
, TAKE_PTR(private_path
), TAKE_PTR(path
));
3816 /* We make the host's os-release available via a symlink, so that we can copy it atomically
3817 * and readers will never get a half-written version. Note that, while the paths specified here are
3818 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
3819 * 'os-release -> .os-release-stage/os-release' is what will be created. */
3820 if (setup_os_release_symlink
) {
3821 r
= strv_extend(&symlinks
, "/run/host/.os-release-stage/os-release");
3825 r
= strv_extend(&symlinks
, "/run/host/os-release");
3830 *ret_symlinks
= TAKE_PTR(symlinks
);
3835 static bool insist_on_sandboxing(
3836 const ExecContext
*context
,
3837 const char *root_dir
,
3838 const char *root_image
,
3839 const BindMount
*bind_mounts
,
3840 size_t n_bind_mounts
) {
3843 assert(n_bind_mounts
== 0 || bind_mounts
);
3845 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3846 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3847 * rearrange stuff in a way we cannot ignore gracefully. */
3849 if (context
->n_temporary_filesystems
> 0)
3852 if (root_dir
|| root_image
)
3855 if (context
->n_mount_images
> 0)
3858 if (context
->dynamic_user
)
3861 if (context
->n_extension_images
> 0 || !strv_isempty(context
->extension_directories
))
3864 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3866 for (size_t i
= 0; i
< n_bind_mounts
; i
++)
3867 if (!path_equal(bind_mounts
[i
].source
, bind_mounts
[i
].destination
))
3870 if (context
->log_namespace
)
/* setup_ephemeral(): prepares the ephemeral copy of the root image/directory at runtime->ephemeral_copy.
 *
 * Does nothing unless 'runtime' and runtime->ephemeral_copy are set. Otherwise it takes an exclusive
 * POSIX lock on runtime->ephemeral_storage_socket[0] and peeks for an already queued file descriptor;
 * per the comment below, receiving one means the copy has been set up already. If none is queued, the
 * copy is created either with copy_file() (when context->root_image is set) or with
 * btrfs_subvol_snapshot_at() (for context->root_directory), and the resulting fd is queued on
 * runtime->ephemeral_storage_socket[1].
 *
 * Failures are logged at debug level through log_debug_errno(); a failure to disable copy-on-write on
 * the image copy is logged and ignored. */
3876 static int setup_ephemeral(const ExecContext
*context
, ExecRuntime
*runtime
) {
3877 _cleanup_close_
int fd
= -EBADF
;
3880 if (!runtime
|| !runtime
->ephemeral_copy
)
3883 r
= posix_lock(runtime
->ephemeral_storage_socket
[0], LOCK_EX
);
3885 return log_debug_errno(r
, "Failed to lock ephemeral storage socket: %m");
3887 CLEANUP_POSIX_UNLOCK(runtime
->ephemeral_storage_socket
[0]);
3889 fd
= receive_one_fd(runtime
->ephemeral_storage_socket
[0], MSG_PEEK
|MSG_DONTWAIT
);
3891 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3895 return log_debug_errno(fd
, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3897 log_debug("Making ephemeral snapshot of %s to %s",
3898 context
->root_image
?: context
->root_directory
, runtime
->ephemeral_copy
);
3900 if (context
->root_image
)
3901 fd
= copy_file(context
->root_image
, runtime
->ephemeral_copy
, O_EXCL
, 0600,
3902 COPY_LOCK_BSD
|COPY_REFLINK
|COPY_CRTIME
);
3904 fd
= btrfs_subvol_snapshot_at(AT_FDCWD
, context
->root_directory
,
3905 AT_FDCWD
, runtime
->ephemeral_copy
,
3906 BTRFS_SNAPSHOT_FALLBACK_COPY
|
3907 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
3908 BTRFS_SNAPSHOT_RECURSIVE
|
3909 BTRFS_SNAPSHOT_LOCK_BSD
);
3911 return log_debug_errno(fd
, "Failed to snapshot %s to %s: %m",
3912 context
->root_image
?: context
->root_directory
, runtime
->ephemeral_copy
);
3914 if (context
->root_image
) {
3915 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3916 * which tends to not perform well in combination with lots of random writes.
3918 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3919 * copy, but we at least want to make the intention clear.
3921 r
= chattr_fd(fd
, FS_NOCOW_FL
, FS_NOCOW_FL
, NULL
);
/* Log the error returned by chattr_fd() (r); fd is a valid descriptor at this point, not an errno. */
3923 log_debug_errno(r
, "Failed to disable copy-on-write for %s, ignoring: %m", runtime
->ephemeral_copy
);
3926 r
= send_one_fd(runtime
->ephemeral_storage_socket
[1], fd
, MSG_DONTWAIT
);
3928 return log_debug_errno(r
, "Failed to queue file descriptor on ephemeral storage socket: %m");
3933 static int verity_settings_prepare(
3934 VeritySettings
*verity
,
3935 const char *root_image
,
3936 const void *root_hash
,
3937 size_t root_hash_size
,
3938 const char *root_hash_path
,
3939 const void *root_hash_sig
,
3940 size_t root_hash_sig_size
,
3941 const char *root_hash_sig_path
,
3942 const char *verity_data_path
) {
3951 d
= memdup(root_hash
, root_hash_size
);
3955 free_and_replace(verity
->root_hash
, d
);
3956 verity
->root_hash_size
= root_hash_size
;
3957 verity
->designator
= PARTITION_ROOT
;
3960 if (root_hash_sig
) {
3963 d
= memdup(root_hash_sig
, root_hash_sig_size
);
3967 free_and_replace(verity
->root_hash_sig
, d
);
3968 verity
->root_hash_sig_size
= root_hash_sig_size
;
3969 verity
->designator
= PARTITION_ROOT
;
3972 if (verity_data_path
) {
3973 r
= free_and_strdup(&verity
->data_path
, verity_data_path
);
3978 r
= verity_settings_load(
3982 root_hash_sig_path
);
3984 return log_debug_errno(r
, "Failed to load root hash: %m");
3989 static int apply_mount_namespace(
3991 ExecCommandFlags command_flags
,
3992 const ExecContext
*context
,
3993 const ExecParameters
*params
,
3994 ExecRuntime
*runtime
,
3995 const char *memory_pressure_path
,
3996 char **error_path
) {
3998 _cleanup_(verity_settings_done
) VeritySettings verity
= VERITY_SETTINGS_DEFAULT
;
3999 _cleanup_strv_free_
char **empty_directories
= NULL
, **symlinks
= NULL
,
4000 **read_write_paths_cleanup
= NULL
;
4001 _cleanup_free_
char *creds_path
= NULL
, *incoming_dir
= NULL
, *propagate_dir
= NULL
,
4002 *extension_dir
= NULL
, *host_os_release_stage
= NULL
;
4003 const char *root_dir
= NULL
, *root_image
= NULL
, *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
4004 char **read_write_paths
;
4005 NamespaceInfo ns_info
;
4006 bool needs_sandboxing
, setup_os_release_symlink
;
4007 BindMount
*bind_mounts
= NULL
;
4008 size_t n_bind_mounts
= 0;
4013 CLEANUP_ARRAY(bind_mounts
, n_bind_mounts
, bind_mount_free_many
);
4015 if (params
->flags
& EXEC_APPLY_CHROOT
) {
4016 r
= setup_ephemeral(context
, runtime
);
4020 if (context
->root_image
)
4021 root_image
= (runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_image
;
4023 root_dir
= (runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_directory
;
4026 r
= compile_bind_mounts(context
, params
, &bind_mounts
, &n_bind_mounts
, &empty_directories
);
4030 /* We need to make the pressure path writable even if /sys/fs/cgroup is made read-only, as the
4031 * service will need to write to it in order to start the notifications. */
4032 if (context
->protect_control_groups
&& memory_pressure_path
&& !streq(memory_pressure_path
, "/dev/null")) {
4033 read_write_paths_cleanup
= strv_copy(context
->read_write_paths
);
4034 if (!read_write_paths_cleanup
)
4037 r
= strv_extend(&read_write_paths_cleanup
, memory_pressure_path
);
4041 read_write_paths
= read_write_paths_cleanup
;
4043 read_write_paths
= context
->read_write_paths
;
4045 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command_flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
4046 if (needs_sandboxing
) {
4047 /* The runtime struct only contains the parent of the private /tmp,
4048 * which is non-accessible to world users. Inside of it there's a /tmp
4049 * that is sticky, and that's the one we want to use here.
4050 * This does not apply when we are using /run/systemd/empty as fallback. */
4052 if (context
->private_tmp
&& runtime
&& runtime
->shared
) {
4053 if (streq_ptr(runtime
->shared
->tmp_dir
, RUN_SYSTEMD_EMPTY
))
4054 tmp_dir
= runtime
->shared
->tmp_dir
;
4055 else if (runtime
->shared
->tmp_dir
)
4056 tmp_dir
= strjoina(runtime
->shared
->tmp_dir
, "/tmp");
4058 if (streq_ptr(runtime
->shared
->var_tmp_dir
, RUN_SYSTEMD_EMPTY
))
4059 var_tmp_dir
= runtime
->shared
->var_tmp_dir
;
4060 else if (runtime
->shared
->var_tmp_dir
)
4061 var_tmp_dir
= strjoina(runtime
->shared
->var_tmp_dir
, "/tmp");
4064 ns_info
= (NamespaceInfo
) {
4065 .ignore_protect_paths
= false,
4066 .private_dev
= context
->private_devices
,
4067 .protect_control_groups
= context
->protect_control_groups
,
4068 .protect_kernel_tunables
= context
->protect_kernel_tunables
,
4069 .protect_kernel_modules
= context
->protect_kernel_modules
,
4070 .protect_kernel_logs
= context
->protect_kernel_logs
,
4071 .protect_hostname
= context
->protect_hostname
,
4072 .mount_apivfs
= exec_context_get_effective_mount_apivfs(context
),
4073 .protect_home
= context
->protect_home
,
4074 .protect_system
= context
->protect_system
,
4075 .protect_proc
= context
->protect_proc
,
4076 .proc_subset
= context
->proc_subset
,
4077 .private_network
= exec_needs_network_namespace(context
),
4078 .private_ipc
= exec_needs_ipc_namespace(context
),
4079 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
4080 .mount_nosuid
= context
->no_new_privileges
&& !mac_selinux_use(),
4082 } else if (!context
->dynamic_user
&& root_dir
)
4084 * If DynamicUser=no and RootDirectory= is set then let's pass a relaxed
4085 * sandbox info, otherwise enforce it, don't ignore protected paths and
4086 * fail if we are unable to apply the sandbox inside the mount namespace.
4088 ns_info
= (NamespaceInfo
) {
4089 .ignore_protect_paths
= true,
4092 ns_info
= (NamespaceInfo
) {};
4094 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
4095 setup_os_release_symlink
= ns_info
.mount_apivfs
&& (root_dir
|| root_image
);
4096 r
= compile_symlinks(context
, params
, setup_os_release_symlink
, &symlinks
);
4100 if (context
->mount_propagation_flag
== MS_SHARED
)
4101 log_unit_debug(u
, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
4103 if (exec_context_has_credentials(context
) &&
4104 params
->prefix
[EXEC_DIRECTORY_RUNTIME
] &&
4105 FLAGS_SET(params
->flags
, EXEC_WRITE_CREDENTIALS
)) {
4106 creds_path
= path_join(params
->prefix
[EXEC_DIRECTORY_RUNTIME
], "credentials", u
->id
);
4111 if (params
->runtime_scope
== RUNTIME_SCOPE_SYSTEM
) {
4112 propagate_dir
= path_join("/run/systemd/propagate/", u
->id
);
4116 incoming_dir
= strdup("/run/systemd/incoming");
4120 extension_dir
= strdup("/run/systemd/unit-extensions");
4124 /* If running under a different root filesystem, propagate the host's os-release. We make a
4125 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
4126 if (setup_os_release_symlink
) {
4127 host_os_release_stage
= strdup("/run/systemd/propagate/.os-release-stage");
4128 if (!host_os_release_stage
)
4132 assert(params
->runtime_scope
== RUNTIME_SCOPE_USER
);
4134 if (asprintf(&extension_dir
, "/run/user/" UID_FMT
"/systemd/unit-extensions", geteuid()) < 0)
4137 if (setup_os_release_symlink
) {
4138 if (asprintf(&host_os_release_stage
,
4139 "/run/user/" UID_FMT
"/systemd/propagate/.os-release-stage",
4146 r
= verity_settings_prepare(
4149 context
->root_hash
, context
->root_hash_size
, context
->root_hash_path
,
4150 context
->root_hash_sig
, context
->root_hash_sig_size
, context
->root_hash_sig_path
,
4151 context
->root_verity
);
4156 r
= setup_namespace(
4159 context
->root_image_options
,
4160 context
->root_image_policy
?: &image_policy_service
,
4163 needs_sandboxing
? context
->read_only_paths
: NULL
,
4164 needs_sandboxing
? context
->inaccessible_paths
: NULL
,
4165 needs_sandboxing
? context
->exec_paths
: NULL
,
4166 needs_sandboxing
? context
->no_exec_paths
: NULL
,
4171 context
->temporary_filesystems
,
4172 context
->n_temporary_filesystems
,
4173 context
->mount_images
,
4174 context
->n_mount_images
,
4175 context
->mount_image_policy
?: &image_policy_service
,
4179 context
->log_namespace
,
4180 context
->mount_propagation_flag
,
4182 context
->extension_images
,
4183 context
->n_extension_images
,
4184 context
->extension_image_policy
?: &image_policy_sysext
,
4185 context
->extension_directories
,
4189 root_dir
|| root_image
? params
->notify_socket
: NULL
,
4190 host_os_release_stage
,
4193 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
4194 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
4195 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
4196 * completely different execution environment. */
4198 if (insist_on_sandboxing(
4200 root_dir
, root_image
,
4203 return log_unit_debug_errno(u
,
4204 SYNTHETIC_ERRNO(EOPNOTSUPP
),
4205 "Failed to set up namespace, and refusing to continue since "
4206 "the selected namespacing options alter mount environment non-trivially.\n"
4207 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
4209 context
->n_temporary_filesystems
,
4212 yes_no(context
->dynamic_user
));
4214 log_unit_debug(u
, "Failed to set up namespace, assuming containerized execution and ignoring.");
4221 static int apply_working_directory(
4222 const ExecContext
*context
,
4223 const ExecParameters
*params
,
4224 ExecRuntime
*runtime
,
4231 assert(exit_status
);
4233 if (context
->working_directory_home
) {
4236 *exit_status
= EXIT_CHDIR
;
4243 wd
= empty_to_root(context
->working_directory
);
4245 if (params
->flags
& EXEC_APPLY_CHROOT
)
4248 d
= prefix_roota((runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_directory
, wd
);
4250 if (chdir(d
) < 0 && !context
->working_directory_missing_ok
) {
4251 *exit_status
= EXIT_CHDIR
;
4258 static int apply_root_directory(
4259 const ExecContext
*context
,
4260 const ExecParameters
*params
,
4261 ExecRuntime
*runtime
,
4262 const bool needs_mount_ns
,
4266 assert(exit_status
);
4268 if (params
->flags
& EXEC_APPLY_CHROOT
)
4269 if (!needs_mount_ns
&& context
->root_directory
)
4270 if (chroot((runtime
? runtime
->ephemeral_copy
: NULL
) ?: context
->root_directory
) < 0) {
4271 *exit_status
= EXIT_CHROOT
;
4278 static int setup_keyring(
4280 const ExecContext
*context
,
4281 const ExecParameters
*p
,
4282 uid_t uid
, gid_t gid
) {
4284 key_serial_t keyring
;
4293 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
4294 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
4295 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
4296 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
4297 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
4298 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
4300 if (context
->keyring_mode
== EXEC_KEYRING_INHERIT
)
4303 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
4304 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
4305 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
4306 * & group is just as nasty as acquiring a reference to the user keyring. */
4308 saved_uid
= getuid();
4309 saved_gid
= getgid();
4311 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
4312 if (setregid(gid
, -1) < 0)
4313 return log_unit_error_errno(u
, errno
, "Failed to change GID for user keyring: %m");
4316 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
4317 if (setreuid(uid
, -1) < 0) {
4318 r
= log_unit_error_errno(u
, errno
, "Failed to change UID for user keyring: %m");
4323 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
4324 if (keyring
== -1) {
4325 if (errno
== ENOSYS
)
4326 log_unit_debug_errno(u
, errno
, "Kernel keyring not supported, ignoring.");
4327 else if (ERRNO_IS_PRIVILEGE(errno
))
4328 log_unit_debug_errno(u
, errno
, "Kernel keyring access prohibited, ignoring.");
4329 else if (errno
== EDQUOT
)
4330 log_unit_debug_errno(u
, errno
, "Out of kernel keyrings to allocate, ignoring.");
4332 r
= log_unit_error_errno(u
, errno
, "Setting up kernel keyring failed: %m");
4337 /* When requested link the user keyring into the session keyring. */
4338 if (context
->keyring_mode
== EXEC_KEYRING_SHARED
) {
4340 if (keyctl(KEYCTL_LINK
,
4341 KEY_SPEC_USER_KEYRING
,
4342 KEY_SPEC_SESSION_KEYRING
, 0, 0) < 0) {
4343 r
= log_unit_error_errno(u
, errno
, "Failed to link user keyring into session keyring: %m");
4348 /* Restore uid/gid back */
4349 if (uid_is_valid(uid
) && uid
!= saved_uid
) {
4350 if (setreuid(saved_uid
, -1) < 0) {
4351 r
= log_unit_error_errno(u
, errno
, "Failed to change UID back for user keyring: %m");
4356 if (gid_is_valid(gid
) && gid
!= saved_gid
) {
4357 if (setregid(saved_gid
, -1) < 0)
4358 return log_unit_error_errno(u
, errno
, "Failed to change GID back for user keyring: %m");
4361 /* Populate the keyring with the invocation ID by default, as the original saved_uid. */
4362 if (!sd_id128_is_null(u
->invocation_id
)) {
4365 key
= add_key("user", "invocation_id", &u
->invocation_id
, sizeof(u
->invocation_id
), KEY_SPEC_SESSION_KEYRING
);
4367 log_unit_debug_errno(u
, errno
, "Failed to add invocation ID to keyring, ignoring: %m");
4369 if (keyctl(KEYCTL_SETPERM
, key
,
4370 KEY_POS_VIEW
|KEY_POS_READ
|KEY_POS_SEARCH
|
4371 KEY_USR_VIEW
|KEY_USR_READ
|KEY_USR_SEARCH
, 0, 0) < 0)
4372 r
= log_unit_error_errno(u
, errno
, "Failed to restrict invocation ID permission: %m");
4377 /* Revert back uid & gid for the last time, and exit */
4378 /* no extra logging, as only the first already reported error matters */
4379 if (getuid() != saved_uid
)
4380 (void) setreuid(saved_uid
, -1);
4382 if (getgid() != saved_gid
)
4383 (void) setregid(saved_gid
, -1);
4388 static void append_socket_pair(int *array
, size_t *n
, const int pair
[static 2]) {
4394 array
[(*n
)++] = pair
[0];
4396 array
[(*n
)++] = pair
[1];
4399 static int close_remaining_fds(
4400 const ExecParameters
*params
,
4401 const ExecRuntime
*runtime
,
4404 const int *fds
, size_t n_fds
) {
4406 size_t n_dont_close
= 0;
4407 int dont_close
[n_fds
+ 14];
4411 if (params
->stdin_fd
>= 0)
4412 dont_close
[n_dont_close
++] = params
->stdin_fd
;
4413 if (params
->stdout_fd
>= 0)
4414 dont_close
[n_dont_close
++] = params
->stdout_fd
;
4415 if (params
->stderr_fd
>= 0)
4416 dont_close
[n_dont_close
++] = params
->stderr_fd
;
4419 dont_close
[n_dont_close
++] = socket_fd
;
4421 memcpy(dont_close
+ n_dont_close
, fds
, sizeof(int) * n_fds
);
4422 n_dont_close
+= n_fds
;
4426 append_socket_pair(dont_close
, &n_dont_close
, runtime
->ephemeral_storage_socket
);
4428 if (runtime
&& runtime
->shared
) {
4429 append_socket_pair(dont_close
, &n_dont_close
, runtime
->shared
->netns_storage_socket
);
4430 append_socket_pair(dont_close
, &n_dont_close
, runtime
->shared
->ipcns_storage_socket
);
4433 if (runtime
&& runtime
->dynamic_creds
) {
4434 if (runtime
->dynamic_creds
->user
)
4435 append_socket_pair(dont_close
, &n_dont_close
, runtime
->dynamic_creds
->user
->storage_socket
);
4436 if (runtime
->dynamic_creds
->group
)
4437 append_socket_pair(dont_close
, &n_dont_close
, runtime
->dynamic_creds
->group
->storage_socket
);
4440 if (user_lookup_fd
>= 0)
4441 dont_close
[n_dont_close
++] = user_lookup_fd
;
4443 return close_all_fds(dont_close
, n_dont_close
);
4446 static int send_user_lookup(
4454 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
4455 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
4458 if (user_lookup_fd
< 0)
4461 if (!uid_is_valid(uid
) && !gid_is_valid(gid
))
4464 if (writev(user_lookup_fd
,
4466 IOVEC_MAKE(&uid
, sizeof(uid
)),
4467 IOVEC_MAKE(&gid
, sizeof(gid
)),
4468 IOVEC_MAKE_STRING(unit
->id
) }, 3) < 0)
4474 static int acquire_home(const ExecContext
*c
, uid_t uid
, const char** home
, char **buf
) {
4481 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
4486 if (!c
->working_directory_home
)
4489 r
= get_home_dir(buf
);
4497 static int compile_suggested_paths(const ExecContext
*c
, const ExecParameters
*p
, char ***ret
) {
4498 _cleanup_strv_free_
char ** list
= NULL
;
4505 assert(c
->dynamic_user
);
4507 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
4508 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
4511 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
4512 if (t
== EXEC_DIRECTORY_CONFIGURATION
)
4518 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
4521 if (exec_directory_is_private(c
, t
))
4522 e
= path_join(p
->prefix
[t
], "private", c
->directories
[t
].items
[i
].path
);
4524 e
= path_join(p
->prefix
[t
], c
->directories
[t
].items
[i
].path
);
4528 r
= strv_consume(&list
, e
);
4534 *ret
= TAKE_PTR(list
);
4539 static int exec_parameters_get_cgroup_path(
4540 const ExecParameters
*params
,
4541 const CGroupContext
*c
,
4544 const char *subgroup
= NULL
;
4550 if (!params
->cgroup_path
)
4553 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4554 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4555 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4556 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4557 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4558 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4559 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4560 * flag, which is only passed for the former statements, not for the latter. */
4562 if (FLAGS_SET(params
->flags
, EXEC_CGROUP_DELEGATE
) && (FLAGS_SET(params
->flags
, EXEC_CONTROL_CGROUP
) || c
->delegate_subgroup
)) {
4563 if (FLAGS_SET(params
->flags
, EXEC_IS_CONTROL
))
4564 subgroup
= ".control";
4566 subgroup
= c
->delegate_subgroup
;
4570 p
= path_join(params
->cgroup_path
, subgroup
);
4572 p
= strdup(params
->cgroup_path
);
4580 static int exec_context_cpu_affinity_from_numa(const ExecContext
*c
, CPUSet
*ret
) {
4581 _cleanup_(cpu_set_reset
) CPUSet s
= {};
4587 if (!c
->numa_policy
.nodes
.set
) {
4588 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4592 r
= numa_to_cpu_set(&c
->numa_policy
, &s
);
4598 return cpu_set_add_all(ret
, &s
);
4601 bool exec_context_get_cpu_affinity_from_numa(const ExecContext
*c
) {
4604 return c
->cpu_affinity_from_numa
;
4607 static int add_shifted_fd(int *fds
, size_t fds_size
, size_t *n_fds
, int fd
, int *ret_fd
) {
4612 assert(*n_fds
< fds_size
);
4620 if (fd
< 3 + (int) *n_fds
) {
4621 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4622 * the fds we pass to the process (or which are closed only during execve). */
4624 r
= fcntl(fd
, F_DUPFD_CLOEXEC
, 3 + (int) *n_fds
);
4628 close_and_replace(fd
, r
);
4631 *ret_fd
= fds
[*n_fds
] = fd
;
4636 static int connect_unix_harder(Unit
*u
, const OpenFile
*of
, int ofd
) {
4637 union sockaddr_union addr
= {
4638 .un
.sun_family
= AF_UNIX
,
4641 static const int socket_types
[] = { SOCK_DGRAM
, SOCK_STREAM
, SOCK_SEQPACKET
};
4648 r
= sockaddr_un_set_path(&addr
.un
, FORMAT_PROC_FD_PATH(ofd
));
4650 return log_unit_error_errno(u
, r
, "Failed to set sockaddr for %s: %m", of
->path
);
4654 for (size_t i
= 0; i
< ELEMENTSOF(socket_types
); i
++) {
4655 _cleanup_close_
int fd
= -EBADF
;
4657 fd
= socket(AF_UNIX
, socket_types
[i
] | SOCK_CLOEXEC
, 0);
4659 return log_unit_error_errno(u
, errno
, "Failed to create socket for %s: %m", of
->path
);
4661 r
= RET_NERRNO(connect(fd
, &addr
.sa
, sa_len
));
4662 if (r
== -EPROTOTYPE
)
4665 return log_unit_error_errno(u
, r
, "Failed to connect socket for %s: %m", of
->path
);
4670 return log_unit_error_errno(u
, SYNTHETIC_ERRNO(EPROTOTYPE
), "Failed to connect socket for \"%s\".", of
->path
);
4673 static int get_open_file_fd(Unit
*u
, const OpenFile
*of
) {
4675 _cleanup_close_
int fd
= -EBADF
, ofd
= -EBADF
;
4680 ofd
= open(of
->path
, O_PATH
| O_CLOEXEC
);
4682 return log_unit_error_errno(u
, errno
, "Could not open \"%s\": %m", of
->path
);
4684 if (fstat(ofd
, &st
) < 0)
4685 return log_unit_error_errno(u
, errno
, "Failed to stat %s: %m", of
->path
);
4687 if (S_ISSOCK(st
.st_mode
)) {
4688 fd
= connect_unix_harder(u
, of
, ofd
);
4692 if (FLAGS_SET(of
->flags
, OPENFILE_READ_ONLY
) && shutdown(fd
, SHUT_WR
) < 0)
4693 return log_unit_error_errno(u
, errno
, "Failed to shutdown send for socket %s: %m",
4696 log_unit_debug(u
, "socket %s opened (fd=%d)", of
->path
, fd
);
4698 int flags
= FLAGS_SET(of
->flags
, OPENFILE_READ_ONLY
) ? O_RDONLY
: O_RDWR
;
4699 if (FLAGS_SET(of
->flags
, OPENFILE_APPEND
))
4701 else if (FLAGS_SET(of
->flags
, OPENFILE_TRUNCATE
))
4704 fd
= fd_reopen(ofd
, flags
| O_CLOEXEC
);
4706 return log_unit_error_errno(u
, fd
, "Failed to open file %s: %m", of
->path
);
4708 log_unit_debug(u
, "file %s opened (fd=%d)", of
->path
, fd
);
4714 static int collect_open_file_fds(
4716 OpenFile
* open_files
,
4727 LIST_FOREACH(open_files
, of
, open_files
) {
4728 _cleanup_close_
int fd
= -EBADF
;
4730 fd
= get_open_file_fd(u
, of
);
4732 if (FLAGS_SET(of
->flags
, OPENFILE_GRACEFUL
)) {
4733 log_unit_debug_errno(u
, fd
, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of
->path
);
4740 if (!GREEDY_REALLOC(*fds
, *n_fds
+ 1))
4743 r
= strv_extend(fdnames
, of
->fdname
);
4747 (*fds
)[*n_fds
] = TAKE_FD(fd
);
4755 static void log_command_line(Unit
*unit
, const char *msg
, const char *executable
, char **argv
) {
4763 _cleanup_free_
char *cmdline
= quote_command_line(argv
, SHELL_ESCAPE_EMPTY
);
4765 log_unit_struct(unit
, LOG_DEBUG
,
4766 "EXECUTABLE=%s", executable
,
4767 LOG_UNIT_MESSAGE(unit
, "%s: %s", msg
, strnull(cmdline
)),
4768 LOG_UNIT_INVOCATION_ID(unit
));
4771 static bool exec_context_need_unprivileged_private_users(
4772 const ExecContext
*context
,
4773 const ExecParameters
*params
) {
4778 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
4779 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
4780 * (system manager) then we have privileges and don't need this. */
4781 if (params
->runtime_scope
!= RUNTIME_SCOPE_USER
)
4784 return context
->private_users
||
4785 context
->private_tmp
||
4786 context
->private_devices
||
4787 context
->private_network
||
4788 context
->network_namespace_path
||
4789 context
->private_ipc
||
4790 context
->ipc_namespace_path
||
4791 context
->private_mounts
> 0 ||
4792 context
->mount_apivfs
||
4793 context
->n_bind_mounts
> 0 ||
4794 context
->n_temporary_filesystems
> 0 ||
4795 context
->root_directory
||
4796 !strv_isempty(context
->extension_directories
) ||
4797 context
->protect_system
!= PROTECT_SYSTEM_NO
||
4798 context
->protect_home
!= PROTECT_HOME_NO
||
4799 context
->protect_kernel_tunables
||
4800 context
->protect_kernel_modules
||
4801 context
->protect_kernel_logs
||
4802 context
->protect_control_groups
||
4803 context
->protect_clock
||
4804 context
->protect_hostname
||
4805 !strv_isempty(context
->read_write_paths
) ||
4806 !strv_isempty(context
->read_only_paths
) ||
4807 !strv_isempty(context
->inaccessible_paths
) ||
4808 !strv_isempty(context
->exec_paths
) ||
4809 !strv_isempty(context
->no_exec_paths
);
4812 static int exec_child(
4814 const ExecCommand
*command
,
4815 const ExecContext
*context
,
4816 const ExecParameters
*params
,
4817 ExecRuntime
*runtime
,
4818 const CGroupContext
*cgroup_context
,
4820 const int named_iofds
[static 3],
4822 size_t n_socket_fds
,
4823 size_t n_storage_fds
,
4828 _cleanup_strv_free_
char **our_env
= NULL
, **pass_env
= NULL
, **joined_exec_search_path
= NULL
, **accum_env
= NULL
, **replaced_argv
= NULL
;
4829 int r
, ngids
= 0, exec_fd
;
4830 _cleanup_free_ gid_t
*supplementary_gids
= NULL
;
4831 const char *username
= NULL
, *groupname
= NULL
;
4832 _cleanup_free_
char *home_buffer
= NULL
, *memory_pressure_path
= NULL
;
4833 const char *home
= NULL
, *shell
= NULL
;
4834 char **final_argv
= NULL
;
4835 dev_t journal_stream_dev
= 0;
4836 ino_t journal_stream_ino
= 0;
4837 bool userns_set_up
= false;
4838 bool needs_sandboxing
, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda) */
4839 needs_setuid
, /* Do we need to do the actual setresuid()/setresgid() calls? */
4840 needs_mount_namespace
, /* Do we need to set up a mount namespace for this kernel? */
4841 needs_ambient_hack
; /* Do we need to apply the ambient capabilities hack? */
4843 _cleanup_free_
char *mac_selinux_context_net
= NULL
;
4844 bool use_selinux
= false;
4847 bool use_smack
= false;
4850 bool use_apparmor
= false;
4852 uid_t saved_uid
= getuid();
4853 gid_t saved_gid
= getgid();
4854 uid_t uid
= UID_INVALID
;
4855 gid_t gid
= GID_INVALID
;
4856 size_t n_fds
= n_socket_fds
+ n_storage_fds
, /* fds to pass to the child */
4857 n_keep_fds
; /* total number of fds not to close */
4859 _cleanup_free_ gid_t
*gids_after_pam
= NULL
;
4860 int ngids_after_pam
= 0;
4861 _cleanup_free_
int *fds
= NULL
;
4862 _cleanup_strv_free_
char **fdnames
= NULL
;
4868 assert(exit_status
);
4870 /* Explicitly test for CVE-2021-4034 inspired invocations */
4871 assert(command
->path
);
4872 assert(!strv_isempty(command
->argv
));
4874 rename_process_from_path(command
->path
);
4876 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4877 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4878 * both of which will be demoted to SIG_DFL. */
4879 (void) default_signals(SIGNALS_CRASH_HANDLER
,
4882 if (context
->ignore_sigpipe
)
4883 (void) ignore_signals(SIGPIPE
);
4885 r
= reset_signal_mask();
4887 *exit_status
= EXIT_SIGNAL_MASK
;
4888 return log_unit_error_errno(unit
, r
, "Failed to set process signal mask: %m");
4891 if (params
->idle_pipe
)
4892 do_idle_pipe_dance(params
->idle_pipe
);
4894 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4895 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4896 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4897 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4900 log_set_open_when_needed(true);
4901 log_settle_target();
4903 /* In case anything used libc syslog(), close this here, too */
4906 fds
= newdup(int, params_fds
, n_fds
);
4908 *exit_status
= EXIT_MEMORY
;
4912 fdnames
= strv_copy((char**) params
->fd_names
);
4914 *exit_status
= EXIT_MEMORY
;
4918 r
= collect_open_file_fds(unit
, params
->open_files
, &fds
, &fdnames
, &n_fds
);
4920 *exit_status
= EXIT_FDS
;
4921 return log_unit_error_errno(unit
, r
, "Failed to get OpenFile= file descriptors: %m");
4924 int keep_fds
[n_fds
+ 3];
4925 memcpy_safe(keep_fds
, fds
, n_fds
* sizeof(int));
4928 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, params
->exec_fd
, &exec_fd
);
4930 *exit_status
= EXIT_FDS
;
4931 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
4935 if (unit
->manager
->restrict_fs
) {
4936 int bpf_map_fd
= lsm_bpf_map_restrict_fs_fd(unit
);
4937 if (bpf_map_fd
< 0) {
4938 *exit_status
= EXIT_FDS
;
4939 return log_unit_error_errno(unit
, bpf_map_fd
, "Failed to get restrict filesystems BPF map fd: %m");
4942 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, bpf_map_fd
, &bpf_map_fd
);
4944 *exit_status
= EXIT_FDS
;
4945 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
4950 r
= close_remaining_fds(params
, runtime
, user_lookup_fd
, socket_fd
, keep_fds
, n_keep_fds
);
4952 *exit_status
= EXIT_FDS
;
4953 return log_unit_error_errno(unit
, r
, "Failed to close unwanted file descriptors: %m");
4956 if (!context
->same_pgrp
&&
4958 *exit_status
= EXIT_SETSID
;
4959 return log_unit_error_errno(unit
, errno
, "Failed to create new process session: %m");
4962 exec_context_tty_reset(context
, params
);
4964 if (unit_shall_confirm_spawn(unit
)) {
4965 _cleanup_free_
char *cmdline
= NULL
;
4967 cmdline
= quote_command_line(command
->argv
, SHELL_ESCAPE_EMPTY
);
4969 *exit_status
= EXIT_MEMORY
;
4973 r
= ask_for_confirmation(context
, params
->confirm_spawn
, unit
, cmdline
);
4974 if (r
!= CONFIRM_EXECUTE
) {
4975 if (r
== CONFIRM_PRETEND_SUCCESS
) {
4976 *exit_status
= EXIT_SUCCESS
;
4980 *exit_status
= EXIT_CONFIRM
;
4981 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ECANCELED
),
4982 "Execution cancelled by the user");
4986 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4987 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4988 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4989 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4990 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4991 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit
->id
, true) != 0 ||
4992 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params
->runtime_scope
), true) != 0) {
4993 *exit_status
= EXIT_MEMORY
;
4994 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
4997 if (context
->dynamic_user
&& runtime
&& runtime
->dynamic_creds
) {
4998 _cleanup_strv_free_
char **suggested_paths
= NULL
;
5000 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
5001 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
5002 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
5003 *exit_status
= EXIT_USER
;
5004 return log_unit_error_errno(unit
, errno
, "Failed to update environment: %m");
5007 r
= compile_suggested_paths(context
, params
, &suggested_paths
);
5009 *exit_status
= EXIT_MEMORY
;
5013 r
= dynamic_creds_realize(runtime
->dynamic_creds
, suggested_paths
, &uid
, &gid
);
5015 *exit_status
= EXIT_USER
;
5017 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
5018 "Failed to update dynamic user credentials: User or group with specified name already exists.");
5019 return log_unit_error_errno(unit
, r
, "Failed to update dynamic user credentials: %m");
5022 if (!uid_is_valid(uid
)) {
5023 *exit_status
= EXIT_USER
;
5024 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ESRCH
), "UID validation failed for \""UID_FMT
"\"", uid
);
5027 if (!gid_is_valid(gid
)) {
5028 *exit_status
= EXIT_USER
;
5029 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(ESRCH
), "GID validation failed for \""GID_FMT
"\"", gid
);
5032 if (runtime
->dynamic_creds
->user
)
5033 username
= runtime
->dynamic_creds
->user
->name
;
5036 r
= get_fixed_user(context
, &username
, &uid
, &gid
, &home
, &shell
);
5038 *exit_status
= EXIT_USER
;
5039 return log_unit_error_errno(unit
, r
, "Failed to determine user credentials: %m");
5042 r
= get_fixed_group(context
, &groupname
, &gid
);
5044 *exit_status
= EXIT_GROUP
;
5045 return log_unit_error_errno(unit
, r
, "Failed to determine group credentials: %m");
5049 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
5050 r
= get_supplementary_groups(context
, username
, groupname
, gid
,
5051 &supplementary_gids
, &ngids
);
5053 *exit_status
= EXIT_GROUP
;
5054 return log_unit_error_errno(unit
, r
, "Failed to determine supplementary groups: %m");
5057 r
= send_user_lookup(unit
, user_lookup_fd
, uid
, gid
);
5059 *exit_status
= EXIT_USER
;
5060 return log_unit_error_errno(unit
, r
, "Failed to send user credentials to PID1: %m");
5063 user_lookup_fd
= safe_close(user_lookup_fd
);
5065 r
= acquire_home(context
, uid
, &home
, &home_buffer
);
5067 *exit_status
= EXIT_CHDIR
;
5068 return log_unit_error_errno(unit
, r
, "Failed to determine $HOME for user: %m");
5071 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
5073 (void) fd_nonblock(socket_fd
, false);
5075 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
5076 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
5077 if (params
->cgroup_path
) {
5078 _cleanup_free_
char *p
= NULL
;
5080 r
= exec_parameters_get_cgroup_path(params
, cgroup_context
, &p
);
5082 *exit_status
= EXIT_CGROUP
;
5083 return log_unit_error_errno(unit
, r
, "Failed to acquire cgroup path: %m");
5086 r
= cg_attach_everywhere(params
->cgroup_supported
, p
, 0, NULL
, NULL
);
5087 if (r
== -EUCLEAN
) {
5088 *exit_status
= EXIT_CGROUP
;
5089 return log_unit_error_errno(unit
, r
, "Failed to attach process to cgroup %s "
5090 "because the cgroup or one of its parents or "
5091 "siblings is in the threaded mode: %m", p
);
5094 *exit_status
= EXIT_CGROUP
;
5095 return log_unit_error_errno(unit
, r
, "Failed to attach to cgroup %s: %m", p
);
5099 if (context
->network_namespace_path
&& runtime
&& runtime
->shared
&& runtime
->shared
->netns_storage_socket
[0] >= 0) {
5100 r
= open_shareable_ns_path(runtime
->shared
->netns_storage_socket
, context
->network_namespace_path
, CLONE_NEWNET
);
5102 *exit_status
= EXIT_NETWORK
;
5103 return log_unit_error_errno(unit
, r
, "Failed to open network namespace path %s: %m", context
->network_namespace_path
);
5107 if (context
->ipc_namespace_path
&& runtime
&& runtime
->shared
&& runtime
->shared
->ipcns_storage_socket
[0] >= 0) {
5108 r
= open_shareable_ns_path(runtime
->shared
->ipcns_storage_socket
, context
->ipc_namespace_path
, CLONE_NEWIPC
);
5110 *exit_status
= EXIT_NAMESPACE
;
5111 return log_unit_error_errno(unit
, r
, "Failed to open IPC namespace path %s: %m", context
->ipc_namespace_path
);
5115 r
= setup_input(context
, params
, socket_fd
, named_iofds
);
5117 *exit_status
= EXIT_STDIN
;
5118 return log_unit_error_errno(unit
, r
, "Failed to set up standard input: %m");
5121 r
= setup_output(unit
, context
, params
, STDOUT_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
5123 *exit_status
= EXIT_STDOUT
;
5124 return log_unit_error_errno(unit
, r
, "Failed to set up standard output: %m");
5127 r
= setup_output(unit
, context
, params
, STDERR_FILENO
, socket_fd
, named_iofds
, basename(command
->path
), uid
, gid
, &journal_stream_dev
, &journal_stream_ino
);
5129 *exit_status
= EXIT_STDERR
;
5130 return log_unit_error_errno(unit
, r
, "Failed to set up standard error output: %m");
5133 if (context
->oom_score_adjust_set
) {
5134 /* When we can't make this change due to EPERM, then let's silently skip over it. User
5135 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
5136 r
= set_oom_score_adjust(context
->oom_score_adjust
);
5137 if (ERRNO_IS_NEG_PRIVILEGE(r
))
5138 log_unit_debug_errno(unit
, r
,
5139 "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
5141 *exit_status
= EXIT_OOM_ADJUST
;
5142 return log_unit_error_errno(unit
, r
, "Failed to adjust OOM setting: %m");
5146 if (context
->coredump_filter_set
) {
5147 r
= set_coredump_filter(context
->coredump_filter
);
5148 if (ERRNO_IS_NEG_PRIVILEGE(r
))
5149 log_unit_debug_errno(unit
, r
, "Failed to adjust coredump_filter, ignoring: %m");
5151 *exit_status
= EXIT_LIMITS
;
5152 return log_unit_error_errno(unit
, r
, "Failed to adjust coredump_filter: %m");
5156 if (context
->nice_set
) {
5157 r
= setpriority_closest(context
->nice
);
5159 *exit_status
= EXIT_NICE
;
5160 return log_unit_error_errno(unit
, r
, "Failed to set up process scheduling priority (nice level): %m");
5164 if (context
->cpu_sched_set
) {
5165 struct sched_param param
= {
5166 .sched_priority
= context
->cpu_sched_priority
,
5169 r
= sched_setscheduler(0,
5170 context
->cpu_sched_policy
|
5171 (context
->cpu_sched_reset_on_fork
?
5172 SCHED_RESET_ON_FORK
: 0),
5175 *exit_status
= EXIT_SETSCHEDULER
;
5176 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU scheduling: %m");
5180 if (context
->cpu_affinity_from_numa
|| context
->cpu_set
.set
) {
5181 _cleanup_(cpu_set_reset
) CPUSet converted_cpu_set
= {};
5182 const CPUSet
*cpu_set
;
5184 if (context
->cpu_affinity_from_numa
) {
5185 r
= exec_context_cpu_affinity_from_numa(context
, &converted_cpu_set
);
5187 *exit_status
= EXIT_CPUAFFINITY
;
5188 return log_unit_error_errno(unit
, r
, "Failed to derive CPU affinity mask from NUMA mask: %m");
5191 cpu_set
= &converted_cpu_set
;
5193 cpu_set
= &context
->cpu_set
;
5195 if (sched_setaffinity(0, cpu_set
->allocated
, cpu_set
->set
) < 0) {
5196 *exit_status
= EXIT_CPUAFFINITY
;
5197 return log_unit_error_errno(unit
, errno
, "Failed to set up CPU affinity: %m");
5201 if (mpol_is_valid(numa_policy_get_type(&context
->numa_policy
))) {
5202 r
= apply_numa_policy(&context
->numa_policy
);
5203 if (ERRNO_IS_NEG_NOT_SUPPORTED(r
))
5204 log_unit_debug_errno(unit
, r
, "NUMA support not available, ignoring.");
5206 *exit_status
= EXIT_NUMA_POLICY
;
5207 return log_unit_error_errno(unit
, r
, "Failed to set NUMA memory policy: %m");
5211 if (context
->ioprio_set
)
5212 if (ioprio_set(IOPRIO_WHO_PROCESS
, 0, context
->ioprio
) < 0) {
5213 *exit_status
= EXIT_IOPRIO
;
5214 return log_unit_error_errno(unit
, errno
, "Failed to set up IO scheduling priority: %m");
5217 if (context
->timer_slack_nsec
!= NSEC_INFINITY
)
5218 if (prctl(PR_SET_TIMERSLACK
, context
->timer_slack_nsec
) < 0) {
5219 *exit_status
= EXIT_TIMERSLACK
;
5220 return log_unit_error_errno(unit
, errno
, "Failed to set up timer slack: %m");
5223 if (context
->personality
!= PERSONALITY_INVALID
) {
5224 r
= safe_personality(context
->personality
);
5226 *exit_status
= EXIT_PERSONALITY
;
5227 return log_unit_error_errno(unit
, r
, "Failed to set up execution domain (personality): %m");
5231 if (context
->utmp_id
) {
5232 const char *line
= context
->tty_path
?
5233 (path_startswith(context
->tty_path
, "/dev/") ?: context
->tty_path
) :
5235 utmp_put_init_process(context
->utmp_id
, getpid_cached(), getsid(0),
5237 context
->utmp_mode
== EXEC_UTMP_INIT
? INIT_PROCESS
:
5238 context
->utmp_mode
== EXEC_UTMP_LOGIN
? LOGIN_PROCESS
:
5243 if (uid_is_valid(uid
)) {
5244 r
= chown_terminal(STDIN_FILENO
, uid
);
5246 *exit_status
= EXIT_STDIN
;
5247 return log_unit_error_errno(unit
, r
, "Failed to change ownership of terminal: %m");
5251 if (params
->cgroup_path
) {
5252 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
5253 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
5254 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
5255 * touch a single hierarchy too. */
5257 if (params
->flags
& EXEC_CGROUP_DELEGATE
) {
5258 _cleanup_free_
char *p
= NULL
;
5260 r
= cg_set_access(SYSTEMD_CGROUP_CONTROLLER
, params
->cgroup_path
, uid
, gid
);
5262 *exit_status
= EXIT_CGROUP
;
5263 return log_unit_error_errno(unit
, r
, "Failed to adjust control group access: %m");
5266 r
= exec_parameters_get_cgroup_path(params
, cgroup_context
, &p
);
5268 *exit_status
= EXIT_CGROUP
;
5269 return log_unit_error_errno(unit
, r
, "Failed to acquire cgroup path: %m");
5272 r
= cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER
, p
, uid
, gid
);
5274 *exit_status
= EXIT_CGROUP
;
5275 return log_unit_error_errno(unit
, r
, "Failed to adjust control subgroup access: %m");
5280 if (cgroup_context
&& cg_unified() > 0 && is_pressure_supported() > 0) {
5281 if (cgroup_context_want_memory_pressure(cgroup_context
)) {
5282 r
= cg_get_path("memory", params
->cgroup_path
, "memory.pressure", &memory_pressure_path
);
5284 *exit_status
= EXIT_MEMORY
;
5288 r
= chmod_and_chown(memory_pressure_path
, 0644, uid
, gid
);
5290 log_unit_full_errno(unit
, r
== -ENOENT
|| ERRNO_IS_PRIVILEGE(r
) ? LOG_DEBUG
: LOG_WARNING
, r
,
5291 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path
);
5292 memory_pressure_path
= mfree(memory_pressure_path
);
5294 } else if (cgroup_context
->memory_pressure_watch
== CGROUP_PRESSURE_WATCH_OFF
) {
5295 memory_pressure_path
= strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
5296 if (!memory_pressure_path
) {
5297 *exit_status
= EXIT_MEMORY
;
5304 needs_mount_namespace
= exec_needs_mount_namespace(context
, params
, runtime
);
5306 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
5307 r
= setup_exec_directory(unit
, context
, params
, uid
, gid
, dt
, needs_mount_namespace
, exit_status
);
5309 return log_unit_error_errno(unit
, r
, "Failed to set up special execution directory in %s: %m", params
->prefix
[dt
]);
5312 if (FLAGS_SET(params
->flags
, EXEC_WRITE_CREDENTIALS
)) {
5313 r
= setup_credentials(context
, params
, unit
->id
, uid
, gid
);
5315 *exit_status
= EXIT_CREDENTIALS
;
5316 return log_unit_error_errno(unit
, r
, "Failed to set up credentials: %m");
5320 r
= build_environment(
5332 memory_pressure_path
,
5335 *exit_status
= EXIT_MEMORY
;
5339 r
= build_pass_environment(context
, &pass_env
);
5341 *exit_status
= EXIT_MEMORY
;
5345 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
5346 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
5347 * not specify PATH but the unit has ExecSearchPath. */
5348 if (!strv_isempty(context
->exec_search_path
)) {
5349 _cleanup_free_
char *joined
= NULL
;
5351 joined
= strv_join(context
->exec_search_path
, ":");
5353 *exit_status
= EXIT_MEMORY
;
5357 r
= strv_env_assign(&joined_exec_search_path
, "PATH", joined
);
5359 *exit_status
= EXIT_MEMORY
;
5364 accum_env
= strv_env_merge(params
->environment
,
5366 joined_exec_search_path
,
5368 context
->environment
,
5371 *exit_status
= EXIT_MEMORY
;
5374 accum_env
= strv_env_clean(accum_env
);
5376 (void) umask(context
->umask
);
5378 r
= setup_keyring(unit
, context
, params
, uid
, gid
);
5380 *exit_status
= EXIT_KEYRING
;
5381 return log_unit_error_errno(unit
, r
, "Failed to set up kernel keyring: %m");
5384 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
5386 needs_sandboxing
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& EXEC_COMMAND_FULLY_PRIVILEGED
);
5388 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
5389 * for it, and the kernel doesn't actually support ambient caps. */
5390 needs_ambient_hack
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && (command
->flags
& EXEC_COMMAND_AMBIENT_MAGIC
) && !ambient_capabilities_supported();
5392 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
5393 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
5395 if (needs_ambient_hack
)
5396 needs_setuid
= false;
5398 needs_setuid
= (params
->flags
& EXEC_APPLY_SANDBOXING
) && !(command
->flags
& (EXEC_COMMAND_FULLY_PRIVILEGED
|EXEC_COMMAND_NO_SETUID
));
5400 uint64_t capability_ambient_set
= context
->capability_ambient_set
;
5402 if (needs_sandboxing
) {
5403 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
5404 * /sys being present. The actual MAC context application will happen later, as late as
5405 * possible, to avoid impacting our own code paths. */
5408 use_selinux
= mac_selinux_use();
5411 use_smack
= mac_smack_use();
5414 use_apparmor
= mac_apparmor_use();
5418 if (needs_sandboxing
) {
5421 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
5422 * is set here. (See below.) */
5424 r
= setrlimit_closest_all((const struct rlimit
* const *) context
->rlimit
, &which_failed
);
5426 *exit_status
= EXIT_LIMITS
;
5427 return log_unit_error_errno(unit
, r
, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed
));
5431 if (needs_setuid
&& context
->pam_name
&& username
) {
5432 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
5433 * wins here. (See above.) */
5435 /* All fds passed in the fds array will be closed in the pam child process. */
5436 r
= setup_pam(context
->pam_name
, username
, uid
, gid
, context
->tty_path
, &accum_env
, fds
, n_fds
);
5438 *exit_status
= EXIT_PAM
;
5439 return log_unit_error_errno(unit
, r
, "Failed to set up PAM session: %m");
5442 if (ambient_capabilities_supported()) {
5443 uint64_t ambient_after_pam
;
5445 /* PAM modules might have set some ambient caps. Query them here and merge them into
5446 * the caps we want to set in the end, so that we don't end up unsetting them. */
5447 r
= capability_get_ambient(&ambient_after_pam
);
5449 *exit_status
= EXIT_CAPABILITIES
;
5450 return log_unit_error_errno(unit
, r
, "Failed to query ambient caps: %m");
5453 capability_ambient_set
|= ambient_after_pam
;
5456 ngids_after_pam
= getgroups_alloc(&gids_after_pam
);
5457 if (ngids_after_pam
< 0) {
5458 *exit_status
= EXIT_MEMORY
;
5459 return log_unit_error_errno(unit
, ngids_after_pam
, "Failed to obtain groups after setting up PAM: %m");
5463 if (needs_sandboxing
&& exec_context_need_unprivileged_private_users(context
, params
)) {
5464 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
5465 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
5466 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
5468 r
= setup_private_users(saved_uid
, saved_gid
, uid
, gid
);
5469 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
5470 * the actual requested operations fail (or silently continue). */
5471 if (r
< 0 && context
->private_users
) {
5472 *exit_status
= EXIT_USER
;
5473 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing for unprivileged user: %m");
5476 log_unit_info_errno(unit
, r
, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
5478 userns_set_up
= true;
5481 if (exec_needs_network_namespace(context
) && runtime
&& runtime
->shared
&& runtime
->shared
->netns_storage_socket
[0] >= 0) {
5483 /* Try to enable network namespacing if network namespacing is available and we have
5484 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
5485 * new network namespace. And if we don't have that, then we could only create a network
5486 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
5487 if (ns_type_supported(NAMESPACE_NET
) && have_effective_cap(CAP_NET_ADMIN
) > 0) {
5488 r
= setup_shareable_ns(runtime
->shared
->netns_storage_socket
, CLONE_NEWNET
);
5489 if (ERRNO_IS_NEG_PRIVILEGE(r
))
5490 log_unit_notice_errno(unit
, r
,
5491 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
5493 *exit_status
= EXIT_NETWORK
;
5494 return log_unit_error_errno(unit
, r
, "Failed to set up network namespacing: %m");
5496 } else if (context
->network_namespace_path
) {
5497 *exit_status
= EXIT_NETWORK
;
5498 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
5499 "NetworkNamespacePath= is not supported, refusing.");
5501 log_unit_notice(unit
, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
5504 if (exec_needs_ipc_namespace(context
) && runtime
&& runtime
->shared
&& runtime
->shared
->ipcns_storage_socket
[0] >= 0) {
5506 if (ns_type_supported(NAMESPACE_IPC
)) {
5507 r
= setup_shareable_ns(runtime
->shared
->ipcns_storage_socket
, CLONE_NEWIPC
);
5509 log_unit_warning_errno(unit
, r
,
5510 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
5512 *exit_status
= EXIT_NAMESPACE
;
5513 return log_unit_error_errno(unit
, r
, "Failed to set up IPC namespacing: %m");
5515 } else if (context
->ipc_namespace_path
) {
5516 *exit_status
= EXIT_NAMESPACE
;
5517 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EOPNOTSUPP
),
5518 "IPCNamespacePath= is not supported, refusing.");
5520 log_unit_warning(unit
, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
5523 if (needs_mount_namespace
) {
5524 _cleanup_free_
char *error_path
= NULL
;
5526 r
= apply_mount_namespace(unit
, command
->flags
, context
, params
, runtime
, memory_pressure_path
, &error_path
);
5528 *exit_status
= EXIT_NAMESPACE
;
5529 return log_unit_error_errno(unit
, r
, "Failed to set up mount namespacing%s%s: %m",
5530 error_path
? ": " : "", strempty(error_path
));
5534 if (needs_sandboxing
) {
5535 r
= apply_protect_hostname(unit
, context
, exit_status
);
5540 if (context
->memory_ksm
>= 0)
5541 if (prctl(PR_SET_MEMORY_MERGE
, context
->memory_ksm
) < 0) {
5542 if (ERRNO_IS_NOT_SUPPORTED(errno
))
5543 log_unit_debug_errno(unit
, errno
, "KSM support not available, ignoring.");
5545 *exit_status
= EXIT_KSM
;
5546 return log_unit_error_errno(unit
, errno
, "Failed to set KSM: %m");
5550 /* Drop groups as early as possible.
5551 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
5552 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
5554 _cleanup_free_ gid_t
*gids_to_enforce
= NULL
;
5555 int ngids_to_enforce
= 0;
5557 ngids_to_enforce
= merge_gid_lists(supplementary_gids
,
5562 if (ngids_to_enforce
< 0) {
5563 *exit_status
= EXIT_MEMORY
;
5564 return log_unit_error_errno(unit
,
5566 "Failed to merge group lists. Group membership might be incorrect: %m");
5569 r
= enforce_groups(gid
, gids_to_enforce
, ngids_to_enforce
);
5571 *exit_status
= EXIT_GROUP
;
5572 return log_unit_error_errno(unit
, r
, "Changing group credentials failed: %m");
5576 /* If the user namespace was not set up above, try to do it now.
5577 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
5578 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5579 * case of mount namespaces being less privileged when the mount point list is copied from a
5580 * different user namespace). */
5582 if (needs_sandboxing
&& context
->private_users
&& !userns_set_up
) {
5583 r
= setup_private_users(saved_uid
, saved_gid
, uid
, gid
);
5585 *exit_status
= EXIT_USER
;
5586 return log_unit_error_errno(unit
, r
, "Failed to set up user namespacing: %m");
5590 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
5593 _cleanup_free_
char *executable
= NULL
;
5594 _cleanup_close_
int executable_fd
= -EBADF
;
5595 r
= find_executable_full(command
->path
, /* root= */ NULL
, context
->exec_search_path
, false, &executable
, &executable_fd
);
5597 if (r
!= -ENOMEM
&& (command
->flags
& EXEC_COMMAND_IGNORE_FAILURE
)) {
5598 log_unit_struct_errno(unit
, LOG_INFO
, r
,
5599 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
5600 LOG_UNIT_INVOCATION_ID(unit
),
5601 LOG_UNIT_MESSAGE(unit
, "Executable %s missing, skipping: %m",
5603 "EXECUTABLE=%s", command
->path
);
5604 *exit_status
= EXIT_SUCCESS
;
5608 *exit_status
= EXIT_EXEC
;
5609 return log_unit_struct_errno(unit
, LOG_INFO
, r
,
5610 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
5611 LOG_UNIT_INVOCATION_ID(unit
),
5612 LOG_UNIT_MESSAGE(unit
, "Failed to locate executable %s: %m",
5614 "EXECUTABLE=%s", command
->path
);
5617 r
= add_shifted_fd(keep_fds
, ELEMENTSOF(keep_fds
), &n_keep_fds
, executable_fd
, &executable_fd
);
5619 *exit_status
= EXIT_FDS
;
5620 return log_unit_error_errno(unit
, r
, "Failed to shift fd and set FD_CLOEXEC: %m");
5624 if (needs_sandboxing
&& use_selinux
&& params
->selinux_context_net
) {
5629 else if (params
->n_socket_fds
== 1)
5630 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
5631 * use context from that fd to compute the label. */
5632 fd
= params
->fds
[0];
5635 r
= mac_selinux_get_child_mls_label(fd
, executable
, context
->selinux_context
, &mac_selinux_context_net
);
5637 if (!context
->selinux_context_ignore
) {
5638 *exit_status
= EXIT_SELINUX_CONTEXT
;
5639 return log_unit_error_errno(unit
, r
, "Failed to determine SELinux context: %m");
5641 log_unit_debug_errno(unit
, r
, "Failed to determine SELinux context, ignoring: %m");
5647 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
5648 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
5649 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
5652 r
= close_all_fds(keep_fds
, n_keep_fds
);
5654 r
= shift_fds(fds
, n_fds
);
5656 r
= flags_fds(fds
, n_socket_fds
, n_fds
, context
->non_blocking
);
5658 *exit_status
= EXIT_FDS
;
5659 return log_unit_error_errno(unit
, r
, "Failed to adjust passed file descriptors: %m");
5662 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5663 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
5664 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5667 secure_bits
= context
->secure_bits
;
5669 if (needs_sandboxing
) {
5672 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
5673 * (Note this is placed after the general resource limit initialization, see above, in order
5674 * to take precedence.) */
5675 if (context
->restrict_realtime
&& !context
->rlimit
[RLIMIT_RTPRIO
]) {
5676 if (setrlimit(RLIMIT_RTPRIO
, &RLIMIT_MAKE_CONST(0)) < 0) {
5677 *exit_status
= EXIT_LIMITS
;
5678 return log_unit_error_errno(unit
, errno
, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
5683 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5684 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5686 r
= setup_smack(unit
->manager
, context
, executable_fd
);
5687 if (r
< 0 && !context
->smack_process_label_ignore
) {
5688 *exit_status
= EXIT_SMACK_PROCESS_LABEL
;
5689 return log_unit_error_errno(unit
, r
, "Failed to set SMACK process label: %m");
5694 bset
= context
->capability_bounding_set
;
5695 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
5696 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
5697 * instead of us doing that */
5698 if (needs_ambient_hack
)
5699 bset
|= (UINT64_C(1) << CAP_SETPCAP
) |
5700 (UINT64_C(1) << CAP_SETUID
) |
5701 (UINT64_C(1) << CAP_SETGID
);
5703 if (!cap_test_all(bset
)) {
5704 r
= capability_bounding_set_drop(bset
, /* right_now= */ false);
5706 *exit_status
= EXIT_CAPABILITIES
;
5707 return log_unit_error_errno(unit
, r
, "Failed to drop capabilities: %m");
5711 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5714 * To be able to raise the ambient capabilities after setresuid() they have to be added to
5715 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
5716 * the ambient capabilities can be raised as they are present in the permitted and
5717 * inheritable set. However it is possible that someone wants to set ambient capabilities
5718 * without changing the user, so we also set the ambient capabilities here.
5720 * The requested ambient capabilities are raised in the inheritable set if the second
5721 * argument is true. */
5722 if (!needs_ambient_hack
) {
5723 r
= capability_ambient_set_apply(capability_ambient_set
, /* also_inherit= */ true);
5725 *exit_status
= EXIT_CAPABILITIES
;
5726 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (before UID change): %m");
5731 /* chroot to root directory first, before we lose the ability to chroot */
5732 r
= apply_root_directory(context
, params
, runtime
, needs_mount_namespace
, exit_status
);
5734 return log_unit_error_errno(unit
, r
, "Chrooting to the requested root directory failed: %m");
5737 if (uid_is_valid(uid
)) {
5738 r
= enforce_user(context
, uid
, capability_ambient_set
);
5740 *exit_status
= EXIT_USER
;
5741 return log_unit_error_errno(unit
, r
, "Failed to change UID to " UID_FMT
": %m", uid
);
5744 if (!needs_ambient_hack
&& capability_ambient_set
!= 0) {
5746 /* Raise the ambient capabilities after user change. */
5747 r
= capability_ambient_set_apply(capability_ambient_set
, /* also_inherit= */ false);
5749 *exit_status
= EXIT_CAPABILITIES
;
5750 return log_unit_error_errno(unit
, r
, "Failed to apply ambient capabilities (after UID change): %m");
5756 /* Apply working directory here, because the working directory might be on NFS and only the user running
5757 * this service might have the correct privilege to change to the working directory */
5758 r
= apply_working_directory(context
, params
, runtime
, home
, exit_status
);
5760 return log_unit_error_errno(unit
, r
, "Changing to the requested working directory failed: %m");
5762 if (needs_sandboxing
) {
5763 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5764 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5765 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5766 * are restricted. */
5770 char *exec_context
= mac_selinux_context_net
?: context
->selinux_context
;
5773 r
= setexeccon(exec_context
);
5775 if (!context
->selinux_context_ignore
) {
5776 *exit_status
= EXIT_SELINUX_CONTEXT
;
5777 return log_unit_error_errno(unit
, r
, "Failed to change SELinux context to %s: %m", exec_context
);
5779 log_unit_debug_errno(unit
, r
, "Failed to change SELinux context to %s, ignoring: %m", exec_context
);
5786 if (use_apparmor
&& context
->apparmor_profile
) {
5787 r
= aa_change_onexec(context
->apparmor_profile
);
5788 if (r
< 0 && !context
->apparmor_profile_ignore
) {
5789 *exit_status
= EXIT_APPARMOR_PROFILE
;
5790 return log_unit_error_errno(unit
, errno
, "Failed to prepare AppArmor profile change to %s: %m", context
->apparmor_profile
);
5795 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5796 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5797 * requires CAP_SETPCAP. */
5798 if (prctl(PR_GET_SECUREBITS
) != secure_bits
) {
5799 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5800 * effective set here.
5802 * The effective set is overwritten during execve() with the following values:
5804 * - ambient set (for non-root processes)
5806 * - (inheritable | bounding) set for root processes)
5808 * Hence there is no security impact to raise it in the effective set before execve
5810 r
= capability_gain_cap_setpcap(/* return_caps= */ NULL
);
5812 *exit_status
= EXIT_CAPABILITIES
;
5813 return log_unit_error_errno(unit
, r
, "Failed to gain CAP_SETPCAP for setting secure bits");
5815 if (prctl(PR_SET_SECUREBITS
, secure_bits
) < 0) {
5816 *exit_status
= EXIT_SECUREBITS
;
5817 return log_unit_error_errno(unit
, errno
, "Failed to set process secure bits: %m");
5821 if (context_has_no_new_privileges(context
))
5822 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0) {
5823 *exit_status
= EXIT_NO_NEW_PRIVILEGES
;
5824 return log_unit_error_errno(unit
, errno
, "Failed to disable new privileges: %m");
5828 r
= apply_address_families(unit
, context
);
5830 *exit_status
= EXIT_ADDRESS_FAMILIES
;
5831 return log_unit_error_errno(unit
, r
, "Failed to restrict address families: %m");
5834 r
= apply_memory_deny_write_execute(unit
, context
);
5836 *exit_status
= EXIT_SECCOMP
;
5837 return log_unit_error_errno(unit
, r
, "Failed to disable writing to executable memory: %m");
5840 r
= apply_restrict_realtime(unit
, context
);
5842 *exit_status
= EXIT_SECCOMP
;
5843 return log_unit_error_errno(unit
, r
, "Failed to apply realtime restrictions: %m");
5846 r
= apply_restrict_suid_sgid(unit
, context
);
5848 *exit_status
= EXIT_SECCOMP
;
5849 return log_unit_error_errno(unit
, r
, "Failed to apply SUID/SGID restrictions: %m");
5852 r
= apply_restrict_namespaces(unit
, context
);
5854 *exit_status
= EXIT_SECCOMP
;
5855 return log_unit_error_errno(unit
, r
, "Failed to apply namespace restrictions: %m");
5858 r
= apply_protect_sysctl(unit
, context
);
5860 *exit_status
= EXIT_SECCOMP
;
5861 return log_unit_error_errno(unit
, r
, "Failed to apply sysctl restrictions: %m");
5864 r
= apply_protect_kernel_modules(unit
, context
);
5866 *exit_status
= EXIT_SECCOMP
;
5867 return log_unit_error_errno(unit
, r
, "Failed to apply module loading restrictions: %m");
5870 r
= apply_protect_kernel_logs(unit
, context
);
5872 *exit_status
= EXIT_SECCOMP
;
5873 return log_unit_error_errno(unit
, r
, "Failed to apply kernel log restrictions: %m");
5876 r
= apply_protect_clock(unit
, context
);
5878 *exit_status
= EXIT_SECCOMP
;
5879 return log_unit_error_errno(unit
, r
, "Failed to apply clock restrictions: %m");
5882 r
= apply_private_devices(unit
, context
);
5884 *exit_status
= EXIT_SECCOMP
;
5885 return log_unit_error_errno(unit
, r
, "Failed to set up private devices: %m");
5888 r
= apply_syscall_archs(unit
, context
);
5890 *exit_status
= EXIT_SECCOMP
;
5891 return log_unit_error_errno(unit
, r
, "Failed to apply syscall architecture restrictions: %m");
5894 r
= apply_lock_personality(unit
, context
);
5896 *exit_status
= EXIT_SECCOMP
;
5897 return log_unit_error_errno(unit
, r
, "Failed to lock personalities: %m");
5900 r
= apply_syscall_log(unit
, context
);
5902 *exit_status
= EXIT_SECCOMP
;
5903 return log_unit_error_errno(unit
, r
, "Failed to apply system call log filters: %m");
5906 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5907 * by the filter as little as possible. */
5908 r
= apply_syscall_filter(unit
, context
, needs_ambient_hack
);
5910 *exit_status
= EXIT_SECCOMP
;
5911 return log_unit_error_errno(unit
, r
, "Failed to apply system call filters: %m");
5916 r
= apply_restrict_filesystems(unit
, context
);
5918 *exit_status
= EXIT_BPF
;
5919 return log_unit_error_errno(unit
, r
, "Failed to restrict filesystems: %m");
5925 if (!strv_isempty(context
->unset_environment
)) {
5928 ee
= strv_env_delete(accum_env
, 1, context
->unset_environment
);
5930 *exit_status
= EXIT_MEMORY
;
5934 strv_free_and_replace(accum_env
, ee
);
5937 if (!FLAGS_SET(command
->flags
, EXEC_COMMAND_NO_ENV_EXPAND
)) {
5938 _cleanup_strv_free_
char **unset_variables
= NULL
, **bad_variables
= NULL
;
5940 r
= replace_env_argv(command
->argv
, accum_env
, &replaced_argv
, &unset_variables
, &bad_variables
);
5942 *exit_status
= EXIT_MEMORY
;
5943 return log_unit_error_errno(unit
, r
, "Failed to replace environment variables: %m");
5945 final_argv
= replaced_argv
;
5947 if (!strv_isempty(unset_variables
)) {
5948 _cleanup_free_
char *ju
= strv_join(unset_variables
, ", ");
5949 log_unit_warning(unit
, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju
));
5952 if (!strv_isempty(bad_variables
)) {
5953 _cleanup_free_
char *jb
= strv_join(bad_variables
, ", ");
5954 log_unit_warning(unit
, "Invalid environment variable name evaluates to an empty string: %s", strna(jb
));;
5957 final_argv
= command
->argv
;
5959 log_command_line(unit
, "Executing", executable
, final_argv
);
5964 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5965 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5967 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
5968 *exit_status
= EXIT_EXEC
;
5969 return log_unit_error_errno(unit
, errno
, "Failed to enable exec_fd: %m");
5973 r
= fexecve_or_execve(executable_fd
, executable
, final_argv
, accum_env
);
5978 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5979 * that POLLHUP on it no longer means execve() succeeded. */
5981 if (write(exec_fd
, &hot
, sizeof(hot
)) < 0) {
5982 *exit_status
= EXIT_EXEC
;
5983 return log_unit_error_errno(unit
, errno
, "Failed to disable exec_fd: %m");
5987 *exit_status
= EXIT_EXEC
;
5988 return log_unit_error_errno(unit
, r
, "Failed to execute %s: %m", executable
);
5991 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***l
);
5992 static int exec_context_named_iofds(const ExecContext
*c
, const ExecParameters
*p
, int named_iofds
[static 3]);
5994 int exec_spawn(Unit
*unit
,
5995 ExecCommand
*command
,
5996 const ExecContext
*context
,
5997 const ExecParameters
*params
,
5998 ExecRuntime
*runtime
,
5999 const CGroupContext
*cgroup_context
,
6002 int socket_fd
, r
, named_iofds
[3] = { -1, -1, -1 }, *fds
= NULL
;
6003 _cleanup_free_
char *subcgroup_path
= NULL
;
6004 _cleanup_strv_free_
char **files_env
= NULL
;
6005 size_t n_storage_fds
= 0, n_socket_fds
= 0;
6013 assert(params
->fds
|| (params
->n_socket_fds
+ params
->n_storage_fds
<= 0));
6015 LOG_CONTEXT_PUSH_UNIT(unit
);
6017 if (context
->std_input
== EXEC_INPUT_SOCKET
||
6018 context
->std_output
== EXEC_OUTPUT_SOCKET
||
6019 context
->std_error
== EXEC_OUTPUT_SOCKET
) {
6021 if (params
->n_socket_fds
> 1)
6022 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EINVAL
), "Got more than one socket.");
6024 if (params
->n_socket_fds
== 0)
6025 return log_unit_error_errno(unit
, SYNTHETIC_ERRNO(EINVAL
), "Got no socket.");
6027 socket_fd
= params
->fds
[0];
6031 n_socket_fds
= params
->n_socket_fds
;
6032 n_storage_fds
= params
->n_storage_fds
;
6035 r
= exec_context_named_iofds(context
, params
, named_iofds
);
6037 return log_unit_error_errno(unit
, r
, "Failed to load a named file descriptor: %m");
6039 r
= exec_context_load_environment(unit
, context
, &files_env
);
6041 return log_unit_error_errno(unit
, r
, "Failed to load environment files: %m");
6043 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
6044 and, until the next SELinux policy changes, we save further reloads in future children. */
6045 mac_selinux_maybe_reload();
6047 /* We won't know the real executable path until we create the mount namespace in the child, but we
6048 want to log from the parent, so we use the possibly inaccurate path here. */
6049 log_command_line(unit
, "About to execute", command
->path
, command
->argv
);
6051 if (params
->cgroup_path
) {
6052 r
= exec_parameters_get_cgroup_path(params
, cgroup_context
, &subcgroup_path
);
6054 return log_unit_error_errno(unit
, r
, "Failed to acquire subcgroup path: %m");
6056 /* If there's a subcgroup, then let's create it here now (the main cgroup was already
6057 * realized by the unit logic) */
6059 r
= cg_create(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
);
6061 return log_unit_error_errno(unit
, r
, "Failed to create subcgroup '%s': %m", subcgroup_path
);
6067 return log_unit_error_errno(unit
, errno
, "Failed to fork: %m");
6072 r
= exec_child(unit
,
6084 unit
->manager
->user_lookup_fds
[1],
6088 const char *status
= ASSERT_PTR(
6089 exit_status_to_string(exit_status
, EXIT_STATUS_LIBC
| EXIT_STATUS_SYSTEMD
));
6091 log_unit_struct_errno(unit
, LOG_ERR
, r
,
6092 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR
,
6093 LOG_UNIT_INVOCATION_ID(unit
),
6094 LOG_UNIT_MESSAGE(unit
, "Failed at step %s spawning %s: %m",
6095 status
, command
->path
),
6096 "EXECUTABLE=%s", command
->path
);
6098 assert(exit_status
== EXIT_SUCCESS
);
6103 log_unit_debug(unit
, "Forked %s as "PID_FMT
, command
->path
, pid
);
6105 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
6106 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
6107 * process will be killed too). */
6109 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER
, subcgroup_path
, pid
);
6111 exec_status_start(&command
->exec_status
, pid
);
6117 void exec_context_init(ExecContext
*c
) {
6121 c
->ioprio
= IOPRIO_DEFAULT_CLASS_AND_PRIO
;
6122 c
->cpu_sched_policy
= SCHED_OTHER
;
6123 c
->syslog_priority
= LOG_DAEMON
|LOG_INFO
;
6124 c
->syslog_level_prefix
= true;
6125 c
->ignore_sigpipe
= true;
6126 c
->timer_slack_nsec
= NSEC_INFINITY
;
6127 c
->personality
= PERSONALITY_INVALID
;
6128 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
6129 c
->directories
[t
].mode
= 0755;
6130 c
->timeout_clean_usec
= USEC_INFINITY
;
6131 c
->capability_bounding_set
= CAP_MASK_UNSET
;
6132 assert_cc(NAMESPACE_FLAGS_INITIAL
!= NAMESPACE_FLAGS_ALL
);
6133 c
->restrict_namespaces
= NAMESPACE_FLAGS_INITIAL
;
6134 c
->log_level_max
= -1;
6136 c
->syscall_errno
= SECCOMP_ERROR_NUMBER_KILL
;
6138 c
->tty_rows
= UINT_MAX
;
6139 c
->tty_cols
= UINT_MAX
;
6140 numa_policy_reset(&c
->numa_policy
);
6141 c
->private_mounts
= -1;
6145 void exec_context_done(ExecContext
*c
) {
6148 c
->environment
= strv_free(c
->environment
);
6149 c
->environment_files
= strv_free(c
->environment_files
);
6150 c
->pass_environment
= strv_free(c
->pass_environment
);
6151 c
->unset_environment
= strv_free(c
->unset_environment
);
6153 rlimit_free_all(c
->rlimit
);
6155 for (size_t l
= 0; l
< 3; l
++) {
6156 c
->stdio_fdname
[l
] = mfree(c
->stdio_fdname
[l
]);
6157 c
->stdio_file
[l
] = mfree(c
->stdio_file
[l
]);
6160 c
->working_directory
= mfree(c
->working_directory
);
6161 c
->root_directory
= mfree(c
->root_directory
);
6162 c
->root_image
= mfree(c
->root_image
);
6163 c
->root_image_options
= mount_options_free_all(c
->root_image_options
);
6164 c
->root_hash
= mfree(c
->root_hash
);
6165 c
->root_hash_size
= 0;
6166 c
->root_hash_path
= mfree(c
->root_hash_path
);
6167 c
->root_hash_sig
= mfree(c
->root_hash_sig
);
6168 c
->root_hash_sig_size
= 0;
6169 c
->root_hash_sig_path
= mfree(c
->root_hash_sig_path
);
6170 c
->root_verity
= mfree(c
->root_verity
);
6171 c
->extension_images
= mount_image_free_many(c
->extension_images
, &c
->n_extension_images
);
6172 c
->extension_directories
= strv_free(c
->extension_directories
);
6173 c
->tty_path
= mfree(c
->tty_path
);
6174 c
->syslog_identifier
= mfree(c
->syslog_identifier
);
6175 c
->user
= mfree(c
->user
);
6176 c
->group
= mfree(c
->group
);
6178 c
->supplementary_groups
= strv_free(c
->supplementary_groups
);
6180 c
->pam_name
= mfree(c
->pam_name
);
6182 c
->read_only_paths
= strv_free(c
->read_only_paths
);
6183 c
->read_write_paths
= strv_free(c
->read_write_paths
);
6184 c
->inaccessible_paths
= strv_free(c
->inaccessible_paths
);
6185 c
->exec_paths
= strv_free(c
->exec_paths
);
6186 c
->no_exec_paths
= strv_free(c
->no_exec_paths
);
6187 c
->exec_search_path
= strv_free(c
->exec_search_path
);
6189 bind_mount_free_many(c
->bind_mounts
, c
->n_bind_mounts
);
6190 c
->bind_mounts
= NULL
;
6191 c
->n_bind_mounts
= 0;
6192 temporary_filesystem_free_many(c
->temporary_filesystems
, c
->n_temporary_filesystems
);
6193 c
->temporary_filesystems
= NULL
;
6194 c
->n_temporary_filesystems
= 0;
6195 c
->mount_images
= mount_image_free_many(c
->mount_images
, &c
->n_mount_images
);
6197 cpu_set_reset(&c
->cpu_set
);
6198 numa_policy_reset(&c
->numa_policy
);
6200 c
->utmp_id
= mfree(c
->utmp_id
);
6201 c
->selinux_context
= mfree(c
->selinux_context
);
6202 c
->apparmor_profile
= mfree(c
->apparmor_profile
);
6203 c
->smack_process_label
= mfree(c
->smack_process_label
);
6205 c
->restrict_filesystems
= set_free_free(c
->restrict_filesystems
);
6207 c
->syscall_filter
= hashmap_free(c
->syscall_filter
);
6208 c
->syscall_archs
= set_free(c
->syscall_archs
);
6209 c
->address_families
= set_free(c
->address_families
);
6211 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
6212 exec_directory_done(&c
->directories
[t
]);
6214 c
->log_level_max
= -1;
6216 exec_context_free_log_extra_fields(c
);
6217 c
->log_filter_allowed_patterns
= set_free_free(c
->log_filter_allowed_patterns
);
6218 c
->log_filter_denied_patterns
= set_free_free(c
->log_filter_denied_patterns
);
6220 c
->log_ratelimit_interval_usec
= 0;
6221 c
->log_ratelimit_burst
= 0;
6223 c
->stdin_data
= mfree(c
->stdin_data
);
6224 c
->stdin_data_size
= 0;
6226 c
->network_namespace_path
= mfree(c
->network_namespace_path
);
6227 c
->ipc_namespace_path
= mfree(c
->ipc_namespace_path
);
6229 c
->log_namespace
= mfree(c
->log_namespace
);
6231 c
->load_credentials
= hashmap_free(c
->load_credentials
);
6232 c
->set_credentials
= hashmap_free(c
->set_credentials
);
6233 c
->import_credentials
= set_free_free(c
->import_credentials
);
6235 c
->root_image_policy
= image_policy_free(c
->root_image_policy
);
6236 c
->mount_image_policy
= image_policy_free(c
->mount_image_policy
);
6237 c
->extension_image_policy
= image_policy_free(c
->extension_image_policy
);
6240 int exec_context_destroy_runtime_directory(const ExecContext
*c
, const char *runtime_prefix
) {
6243 if (!runtime_prefix
)
6246 for (size_t i
= 0; i
< c
->directories
[EXEC_DIRECTORY_RUNTIME
].n_items
; i
++) {
6247 _cleanup_free_
char *p
= NULL
;
6249 if (exec_directory_is_private(c
, EXEC_DIRECTORY_RUNTIME
))
6250 p
= path_join(runtime_prefix
, "private", c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].path
);
6252 p
= path_join(runtime_prefix
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].path
);
6256 /* We execute this synchronously, since we need to be sure this is gone when we start the service next. */
6258 (void) rm_rf(p
, REMOVE_ROOT
);
6260 STRV_FOREACH(symlink
, c
->directories
[EXEC_DIRECTORY_RUNTIME
].items
[i
].symlinks
) {
6261 _cleanup_free_
char *symlink_abs
= NULL
;
6263 if (exec_directory_is_private(c
, EXEC_DIRECTORY_RUNTIME
))
6264 symlink_abs
= path_join(runtime_prefix
, "private", *symlink
);
6266 symlink_abs
= path_join(runtime_prefix
, *symlink
);
6270 (void) unlink(symlink_abs
);
/* Tears down the credentials directory of a unit. The directory path is built by joining
 * runtime_prefix, "credentials" and the unit name; a guard on runtime_prefix/unit being NULL
 * precedes that. Both the unmount and the recursive removal are best-effort: their return
 * values are explicitly discarded. */
6277 int exec_context_destroy_credentials(const ExecContext
*c
, const char *runtime_prefix
, const char *unit
) {
6278 _cleanup_free_
char *p
= NULL
;
6282 if (!runtime_prefix
|| !unit
)
6285 p
= path_join(runtime_prefix
, "credentials", unit
);
6289 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
6290 * unmount it, and afterwards remove the mount point */
6291 (void) umount2(p
, MNT_DETACH
|UMOUNT_NOFOLLOW
);
6292 (void) rm_rf(p
, REMOVE_ROOT
|REMOVE_CHMOD
);
/* Removes the per-unit mount propagation directory "/run/systemd/propagate/<unit id>". There is a
 * guard on u being NULL or its manager not being the system manager. The directory is removed
 * with rmdir(), i.e. only if it is empty; a failure other than ENOENT is merely logged at debug
 * level. */
6297 int exec_context_destroy_mount_ns_dir(Unit
*u
) {
6298 _cleanup_free_
char *p
= NULL
;
6300 if (!u
|| !MANAGER_IS_SYSTEM(u
->manager
))
6303 p
= path_join("/run/systemd/propagate/", u
->id
);
6307 /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent. */
6308 if (rmdir(p
) < 0 && errno
!= ENOENT
)
6309 log_unit_debug_errno(u
, errno
, "Unable to remove propagation dir '%s', ignoring: %m", p
);
/* Releases what a single ExecCommand owns: the path string goes through mfree() and the argument
 * vector through strv_free(); each field is overwritten with the value its free helper returns.
 * The ExecCommand structure itself is not freed here. */
6314 static void exec_command_done(ExecCommand
*c
) {
6317 c
->path
= mfree(c
->path
);
6318 c
->argv
= strv_free(c
->argv
);
6321 void exec_command_done_array(ExecCommand
*c
, size_t n
) {
6322 for (size_t i
= 0; i
< n
; i
++)
6323 exec_command_done(c
+i
);
/* Tears down entries of the command list headed by c: an entry i is unlinked from the list with
 * LIST_REMOVE() and its contents are then released via exec_command_done(). Returns an
 * ExecCommand pointer, which exec_command_free_array() stores back into the list-head slot. */
6326 ExecCommand
* exec_command_free_list(ExecCommand
*c
) {
6330 LIST_REMOVE(command
, c
, i
);
6331 exec_command_done(i
);
6338 void exec_command_free_array(ExecCommand
**c
, size_t n
) {
6339 for (size_t i
= 0; i
< n
; i
++)
6340 c
[i
] = exec_command_free_list(c
[i
]);
6343 void exec_command_reset_status_array(ExecCommand
*c
, size_t n
) {
6344 for (size_t i
= 0; i
< n
; i
++)
6345 exec_status_reset(&c
[i
].exec_status
);
6348 void exec_command_reset_status_list_array(ExecCommand
**c
, size_t n
) {
6349 for (size_t i
= 0; i
< n
; i
++)
6350 LIST_FOREACH(command
, z
, c
[i
])
6351 exec_status_reset(&z
->exec_status
);
6354 typedef struct InvalidEnvInfo
{
/* Callback passed to strv_env_clean_with_callback() by exec_context_load_environment(). userdata
 * is interpreted as an InvalidEnvInfo; the rejected assignment p is reported through
 * log_unit_error() on info->unit, together with info->path (set by the caller to the file
 * currently being loaded). */
6359 static void invalid_env(const char *p
, void *userdata
) {
6360 InvalidEnvInfo
*info
= userdata
;
6362 log_unit_error(info
->unit
, "Ignoring invalid environment assignment '%s': %s", p
, info
->path
);
6365 const char* exec_context_fdname(const ExecContext
*c
, int fd_index
) {
6371 if (c
->std_input
!= EXEC_INPUT_NAMED_FD
)
6374 return c
->stdio_fdname
[STDIN_FILENO
] ?: "stdin";
6377 if (c
->std_output
!= EXEC_OUTPUT_NAMED_FD
)
6380 return c
->stdio_fdname
[STDOUT_FILENO
] ?: "stdout";
6383 if (c
->std_error
!= EXEC_OUTPUT_NAMED_FD
)
6386 return c
->stdio_fdname
[STDERR_FILENO
] ?: "stderr";
6393 static int exec_context_named_iofds(
6394 const ExecContext
*c
,
6395 const ExecParameters
*p
,
6396 int named_iofds
[static 3]) {
6399 const char* stdio_fdname
[3];
6404 assert(named_iofds
);
6406 targets
= (c
->std_input
== EXEC_INPUT_NAMED_FD
) +
6407 (c
->std_output
== EXEC_OUTPUT_NAMED_FD
) +
6408 (c
->std_error
== EXEC_OUTPUT_NAMED_FD
);
6410 for (size_t i
= 0; i
< 3; i
++)
6411 stdio_fdname
[i
] = exec_context_fdname(c
, i
);
6413 n_fds
= p
->n_storage_fds
+ p
->n_socket_fds
;
6415 for (size_t i
= 0; i
< n_fds
&& targets
> 0; i
++)
6416 if (named_iofds
[STDIN_FILENO
] < 0 &&
6417 c
->std_input
== EXEC_INPUT_NAMED_FD
&&
6418 stdio_fdname
[STDIN_FILENO
] &&
6419 streq(p
->fd_names
[i
], stdio_fdname
[STDIN_FILENO
])) {
6421 named_iofds
[STDIN_FILENO
] = p
->fds
[i
];
6424 } else if (named_iofds
[STDOUT_FILENO
] < 0 &&
6425 c
->std_output
== EXEC_OUTPUT_NAMED_FD
&&
6426 stdio_fdname
[STDOUT_FILENO
] &&
6427 streq(p
->fd_names
[i
], stdio_fdname
[STDOUT_FILENO
])) {
6429 named_iofds
[STDOUT_FILENO
] = p
->fds
[i
];
6432 } else if (named_iofds
[STDERR_FILENO
] < 0 &&
6433 c
->std_error
== EXEC_OUTPUT_NAMED_FD
&&
6434 stdio_fdname
[STDERR_FILENO
] &&
6435 streq(p
->fd_names
[i
], stdio_fdname
[STDERR_FILENO
])) {
6437 named_iofds
[STDERR_FILENO
] = p
->fds
[i
];
6441 return targets
== 0 ? 0 : -ENOENT
;
6444 static int exec_context_load_environment(const Unit
*unit
, const ExecContext
*c
, char ***ret
) {
6445 _cleanup_strv_free_
char **v
= NULL
;
6451 STRV_FOREACH(i
, c
->environment_files
) {
6452 _cleanup_globfree_ glob_t pglob
= {};
6453 bool ignore
= false;
6461 if (!path_is_absolute(fn
)) {
6467 /* Filename supports globbing, take all matching files */
6468 r
= safe_glob(fn
, 0, &pglob
);
6475 /* When we don't match anything, -ENOENT should be returned */
6476 assert(pglob
.gl_pathc
> 0);
6478 for (size_t n
= 0; n
< pglob
.gl_pathc
; n
++) {
6479 _cleanup_strv_free_
char **p
= NULL
;
6481 r
= load_env_file(NULL
, pglob
.gl_pathv
[n
], &p
);
6488 /* Log invalid environment variables with filename */
6490 InvalidEnvInfo info
= {
6492 .path
= pglob
.gl_pathv
[n
]
6495 p
= strv_env_clean_with_callback(p
, invalid_env
, &info
);
6501 char **m
= strv_env_merge(v
, p
);
6505 strv_free_and_replace(v
, m
);
/* Conservative check whether the given TTY may be the device behind the console. The name is
 * first passed through skip_dev_prefix() and compared against "console". If the console cannot
 * be resolved (resolve_dev_console() fails) the answer is true. Otherwise the answer is true when
 * the resolved name equals tty, or when it is "tty0" and tty is a virtual console per
 * tty_is_vc(). */
6515 static bool tty_may_match_dev_console(const char *tty
) {
6516 _cleanup_free_
char *resolved
= NULL
;
6521 tty
= skip_dev_prefix(tty
);
6523 /* trivial identity? */
6524 if (streq(tty
, "console"))
6527 if (resolve_dev_console(&resolved
) < 0)
6528 return true; /* if we could not resolve, assume it may */
6530 /* "tty0" means the active VC, so it may be the same sometimes */
6531 return path_equal(resolved
, tty
) || (streq(resolved
, "tty0") && tty_is_vc(tty
));
/* Tells whether running with this context may touch a TTY. The result is a logical OR of several
 * conditions, among them the tty_reset and tty_vt_disallocate flags and whether standard input,
 * output or error is configured as a terminal (is_terminal_input()/is_terminal_output()). */
6534 static bool exec_context_may_touch_tty(const ExecContext
*ec
) {
6537 return ec
->tty_reset
||
6539 ec
->tty_vt_disallocate
||
6540 is_terminal_input(ec
->std_input
) ||
6541 is_terminal_output(ec
->std_output
) ||
6542 is_terminal_output(ec
->std_error
);
/* True only if the context may touch a TTY at all (exec_context_may_touch_tty()) and the TTY path
 * it uses (exec_context_tty_path()) may refer to the console device according to
 * tty_may_match_dev_console(). */
6545 bool exec_context_may_touch_console(const ExecContext
*ec
) {
6547 return exec_context_may_touch_tty(ec
) &&
6548 tty_may_match_dev_console(exec_context_tty_path(ec
));
6551 static void strv_fprintf(FILE *f
, char **l
) {
6555 fprintf(f
, " %s", *g
);
6558 static void strv_dump(FILE* f
, const char *prefix
, const char *name
, char **strv
) {
6563 if (!strv_isempty(strv
)) {
6564 fprintf(f
, "%s%s:", prefix
, name
);
6565 strv_fprintf(f
, strv
);
6570 void exec_context_dump(const ExecContext
*c
, FILE* f
, const char *prefix
) {
6576 prefix
= strempty(prefix
);
6580 "%sWorkingDirectory: %s\n"
6581 "%sRootDirectory: %s\n"
6582 "%sRootEphemeral: %s\n"
6583 "%sNonBlocking: %s\n"
6584 "%sPrivateTmp: %s\n"
6585 "%sPrivateDevices: %s\n"
6586 "%sProtectKernelTunables: %s\n"
6587 "%sProtectKernelModules: %s\n"
6588 "%sProtectKernelLogs: %s\n"
6589 "%sProtectClock: %s\n"
6590 "%sProtectControlGroups: %s\n"
6591 "%sPrivateNetwork: %s\n"
6592 "%sPrivateUsers: %s\n"
6593 "%sProtectHome: %s\n"
6594 "%sProtectSystem: %s\n"
6595 "%sMountAPIVFS: %s\n"
6596 "%sIgnoreSIGPIPE: %s\n"
6597 "%sMemoryDenyWriteExecute: %s\n"
6598 "%sRestrictRealtime: %s\n"
6599 "%sRestrictSUIDSGID: %s\n"
6600 "%sKeyringMode: %s\n"
6601 "%sProtectHostname: %s\n"
6602 "%sProtectProc: %s\n"
6603 "%sProcSubset: %s\n",
6605 prefix
, empty_to_root(c
->working_directory
),
6606 prefix
, empty_to_root(c
->root_directory
),
6607 prefix
, yes_no(c
->root_ephemeral
),
6608 prefix
, yes_no(c
->non_blocking
),
6609 prefix
, yes_no(c
->private_tmp
),
6610 prefix
, yes_no(c
->private_devices
),
6611 prefix
, yes_no(c
->protect_kernel_tunables
),
6612 prefix
, yes_no(c
->protect_kernel_modules
),
6613 prefix
, yes_no(c
->protect_kernel_logs
),
6614 prefix
, yes_no(c
->protect_clock
),
6615 prefix
, yes_no(c
->protect_control_groups
),
6616 prefix
, yes_no(c
->private_network
),
6617 prefix
, yes_no(c
->private_users
),
6618 prefix
, protect_home_to_string(c
->protect_home
),
6619 prefix
, protect_system_to_string(c
->protect_system
),
6620 prefix
, yes_no(exec_context_get_effective_mount_apivfs(c
)),
6621 prefix
, yes_no(c
->ignore_sigpipe
),
6622 prefix
, yes_no(c
->memory_deny_write_execute
),
6623 prefix
, yes_no(c
->restrict_realtime
),
6624 prefix
, yes_no(c
->restrict_suid_sgid
),
6625 prefix
, exec_keyring_mode_to_string(c
->keyring_mode
),
6626 prefix
, yes_no(c
->protect_hostname
),
6627 prefix
, protect_proc_to_string(c
->protect_proc
),
6628 prefix
, proc_subset_to_string(c
->proc_subset
));
6631 fprintf(f
, "%sRootImage: %s\n", prefix
, c
->root_image
);
6633 if (c
->root_image_options
) {
6634 fprintf(f
, "%sRootImageOptions:", prefix
);
6635 LIST_FOREACH(mount_options
, o
, c
->root_image_options
)
6636 if (!isempty(o
->options
))
6637 fprintf(f
, " %s:%s",
6638 partition_designator_to_string(o
->partition_designator
),
6644 _cleanup_free_
char *encoded
= NULL
;
6645 encoded
= hexmem(c
->root_hash
, c
->root_hash_size
);
6647 fprintf(f
, "%sRootHash: %s\n", prefix
, encoded
);
6650 if (c
->root_hash_path
)
6651 fprintf(f
, "%sRootHash: %s\n", prefix
, c
->root_hash_path
);
6653 if (c
->root_hash_sig
) {
6654 _cleanup_free_
char *encoded
= NULL
;
6656 len
= base64mem(c
->root_hash_sig
, c
->root_hash_sig_size
, &encoded
);
6658 fprintf(f
, "%sRootHashSignature: base64:%s\n", prefix
, encoded
);
6661 if (c
->root_hash_sig_path
)
6662 fprintf(f
, "%sRootHashSignature: %s\n", prefix
, c
->root_hash_sig_path
);
6665 fprintf(f
, "%sRootVerity: %s\n", prefix
, c
->root_verity
);
6667 STRV_FOREACH(e
, c
->environment
)
6668 fprintf(f
, "%sEnvironment: %s\n", prefix
, *e
);
6670 STRV_FOREACH(e
, c
->environment_files
)
6671 fprintf(f
, "%sEnvironmentFile: %s\n", prefix
, *e
);
6673 STRV_FOREACH(e
, c
->pass_environment
)
6674 fprintf(f
, "%sPassEnvironment: %s\n", prefix
, *e
);
6676 STRV_FOREACH(e
, c
->unset_environment
)
6677 fprintf(f
, "%sUnsetEnvironment: %s\n", prefix
, *e
);
6679 fprintf(f
, "%sRuntimeDirectoryPreserve: %s\n", prefix
, exec_preserve_mode_to_string(c
->runtime_directory_preserve_mode
));
6681 for (ExecDirectoryType dt
= 0; dt
< _EXEC_DIRECTORY_TYPE_MAX
; dt
++) {
6682 fprintf(f
, "%s%sMode: %04o\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].mode
);
6684 for (size_t i
= 0; i
< c
->directories
[dt
].n_items
; i
++) {
6685 fprintf(f
, "%s%s: %s\n", prefix
, exec_directory_type_to_string(dt
), c
->directories
[dt
].items
[i
].path
);
6687 STRV_FOREACH(d
, c
->directories
[dt
].items
[i
].symlinks
)
6688 fprintf(f
, "%s%s: %s:%s\n", prefix
, exec_directory_type_symlink_to_string(dt
), c
->directories
[dt
].items
[i
].path
, *d
);
6692 fprintf(f
, "%sTimeoutCleanSec: %s\n", prefix
, FORMAT_TIMESPAN(c
->timeout_clean_usec
, USEC_PER_SEC
));
6695 fprintf(f
, "%sNice: %i\n", prefix
, c
->nice
);
6697 if (c
->oom_score_adjust_set
)
6698 fprintf(f
, "%sOOMScoreAdjust: %i\n", prefix
, c
->oom_score_adjust
);
6700 if (c
->coredump_filter_set
)
6701 fprintf(f
, "%sCoredumpFilter: 0x%"PRIx64
"\n", prefix
, c
->coredump_filter
);
6703 for (unsigned i
= 0; i
< RLIM_NLIMITS
; i
++)
6705 fprintf(f
, "%sLimit%s: " RLIM_FMT
"\n",
6706 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_max
);
6707 fprintf(f
, "%sLimit%sSoft: " RLIM_FMT
"\n",
6708 prefix
, rlimit_to_string(i
), c
->rlimit
[i
]->rlim_cur
);
6711 if (c
->ioprio_set
) {
6712 _cleanup_free_
char *class_str
= NULL
;
6714 r
= ioprio_class_to_string_alloc(ioprio_prio_class(c
->ioprio
), &class_str
);
6716 fprintf(f
, "%sIOSchedulingClass: %s\n", prefix
, class_str
);
6718 fprintf(f
, "%sIOPriority: %d\n", prefix
, ioprio_prio_data(c
->ioprio
));
6721 if (c
->cpu_sched_set
) {
6722 _cleanup_free_
char *policy_str
= NULL
;
6724 r
= sched_policy_to_string_alloc(c
->cpu_sched_policy
, &policy_str
);
6726 fprintf(f
, "%sCPUSchedulingPolicy: %s\n", prefix
, policy_str
);
6729 "%sCPUSchedulingPriority: %i\n"
6730 "%sCPUSchedulingResetOnFork: %s\n",
6731 prefix
, c
->cpu_sched_priority
,
6732 prefix
, yes_no(c
->cpu_sched_reset_on_fork
));
6735 if (c
->cpu_set
.set
) {
6736 _cleanup_free_
char *affinity
= NULL
;
6738 affinity
= cpu_set_to_range_string(&c
->cpu_set
);
6739 fprintf(f
, "%sCPUAffinity: %s\n", prefix
, affinity
);
6742 if (mpol_is_valid(numa_policy_get_type(&c
->numa_policy
))) {
6743 _cleanup_free_
char *nodes
= NULL
;
6745 nodes
= cpu_set_to_range_string(&c
->numa_policy
.nodes
);
6746 fprintf(f
, "%sNUMAPolicy: %s\n", prefix
, mpol_to_string(numa_policy_get_type(&c
->numa_policy
)));
6747 fprintf(f
, "%sNUMAMask: %s\n", prefix
, strnull(nodes
));
6750 if (c
->timer_slack_nsec
!= NSEC_INFINITY
)
6751 fprintf(f
, "%sTimerSlackNSec: "NSEC_FMT
"\n", prefix
, c
->timer_slack_nsec
);
6754 "%sStandardInput: %s\n"
6755 "%sStandardOutput: %s\n"
6756 "%sStandardError: %s\n",
6757 prefix
, exec_input_to_string(c
->std_input
),
6758 prefix
, exec_output_to_string(c
->std_output
),
6759 prefix
, exec_output_to_string(c
->std_error
));
6761 if (c
->std_input
== EXEC_INPUT_NAMED_FD
)
6762 fprintf(f
, "%sStandardInputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDIN_FILENO
]);
6763 if (c
->std_output
== EXEC_OUTPUT_NAMED_FD
)
6764 fprintf(f
, "%sStandardOutputFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDOUT_FILENO
]);
6765 if (c
->std_error
== EXEC_OUTPUT_NAMED_FD
)
6766 fprintf(f
, "%sStandardErrorFileDescriptorName: %s\n", prefix
, c
->stdio_fdname
[STDERR_FILENO
]);
6768 if (c
->std_input
== EXEC_INPUT_FILE
)
6769 fprintf(f
, "%sStandardInputFile: %s\n", prefix
, c
->stdio_file
[STDIN_FILENO
]);
6770 if (c
->std_output
== EXEC_OUTPUT_FILE
)
6771 fprintf(f
, "%sStandardOutputFile: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
6772 if (c
->std_output
== EXEC_OUTPUT_FILE_APPEND
)
6773 fprintf(f
, "%sStandardOutputFileToAppend: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
6774 if (c
->std_output
== EXEC_OUTPUT_FILE_TRUNCATE
)
6775 fprintf(f
, "%sStandardOutputFileToTruncate: %s\n", prefix
, c
->stdio_file
[STDOUT_FILENO
]);
6776 if (c
->std_error
== EXEC_OUTPUT_FILE
)
6777 fprintf(f
, "%sStandardErrorFile: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
6778 if (c
->std_error
== EXEC_OUTPUT_FILE_APPEND
)
6779 fprintf(f
, "%sStandardErrorFileToAppend: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
6780 if (c
->std_error
== EXEC_OUTPUT_FILE_TRUNCATE
)
6781 fprintf(f
, "%sStandardErrorFileToTruncate: %s\n", prefix
, c
->stdio_file
[STDERR_FILENO
]);
6787 "%sTTYVHangup: %s\n"
6788 "%sTTYVTDisallocate: %s\n"
6790 "%sTTYColumns: %u\n",
6791 prefix
, c
->tty_path
,
6792 prefix
, yes_no(c
->tty_reset
),
6793 prefix
, yes_no(c
->tty_vhangup
),
6794 prefix
, yes_no(c
->tty_vt_disallocate
),
6795 prefix
, c
->tty_rows
,
6796 prefix
, c
->tty_cols
);
6798 if (IN_SET(c
->std_output
,
6800 EXEC_OUTPUT_JOURNAL
,
6801 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
6802 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
) ||
6803 IN_SET(c
->std_error
,
6805 EXEC_OUTPUT_JOURNAL
,
6806 EXEC_OUTPUT_KMSG_AND_CONSOLE
,
6807 EXEC_OUTPUT_JOURNAL_AND_CONSOLE
)) {
6809 _cleanup_free_
char *fac_str
= NULL
, *lvl_str
= NULL
;
6811 r
= log_facility_unshifted_to_string_alloc(c
->syslog_priority
>> 3, &fac_str
);
6813 fprintf(f
, "%sSyslogFacility: %s\n", prefix
, fac_str
);
6815 r
= log_level_to_string_alloc(LOG_PRI(c
->syslog_priority
), &lvl_str
);
6817 fprintf(f
, "%sSyslogLevel: %s\n", prefix
, lvl_str
);
6820 if (c
->log_level_max
>= 0) {
6821 _cleanup_free_
char *t
= NULL
;
6823 (void) log_level_to_string_alloc(c
->log_level_max
, &t
);
6825 fprintf(f
, "%sLogLevelMax: %s\n", prefix
, strna(t
));
6828 if (c
->log_ratelimit_interval_usec
> 0)
6830 "%sLogRateLimitIntervalSec: %s\n",
6831 prefix
, FORMAT_TIMESPAN(c
->log_ratelimit_interval_usec
, USEC_PER_SEC
));
6833 if (c
->log_ratelimit_burst
> 0)
6834 fprintf(f
, "%sLogRateLimitBurst: %u\n", prefix
, c
->log_ratelimit_burst
);
6836 if (!set_isempty(c
->log_filter_allowed_patterns
) || !set_isempty(c
->log_filter_denied_patterns
)) {
6837 fprintf(f
, "%sLogFilterPatterns:", prefix
);
6840 SET_FOREACH(pattern
, c
->log_filter_allowed_patterns
)
6841 fprintf(f
, " %s", pattern
);
6842 SET_FOREACH(pattern
, c
->log_filter_denied_patterns
)
6843 fprintf(f
, " ~%s", pattern
);
6847 for (size_t j
= 0; j
< c
->n_log_extra_fields
; j
++) {
6848 fprintf(f
, "%sLogExtraFields: ", prefix
);
6849 fwrite(c
->log_extra_fields
[j
].iov_base
,
6850 1, c
->log_extra_fields
[j
].iov_len
,
6855 if (c
->log_namespace
)
6856 fprintf(f
, "%sLogNamespace: %s\n", prefix
, c
->log_namespace
);
6858 if (c
->secure_bits
) {
6859 _cleanup_free_
char *str
= NULL
;
6861 r
= secure_bits_to_string_alloc(c
->secure_bits
, &str
);
6863 fprintf(f
, "%sSecure Bits: %s\n", prefix
, str
);
6866 if (c
->capability_bounding_set
!= CAP_MASK_UNSET
) {
6867 _cleanup_free_
char *str
= NULL
;
6869 r
= capability_set_to_string(c
->capability_bounding_set
, &str
);
6871 fprintf(f
, "%sCapabilityBoundingSet: %s\n", prefix
, str
);
6874 if (c
->capability_ambient_set
!= 0) {
6875 _cleanup_free_
char *str
= NULL
;
6877 r
= capability_set_to_string(c
->capability_ambient_set
, &str
);
6879 fprintf(f
, "%sAmbientCapabilities: %s\n", prefix
, str
);
6883 fprintf(f
, "%sUser: %s\n", prefix
, c
->user
);
6885 fprintf(f
, "%sGroup: %s\n", prefix
, c
->group
);
6887 fprintf(f
, "%sDynamicUser: %s\n", prefix
, yes_no(c
->dynamic_user
));
6889 strv_dump(f
, prefix
, "SupplementaryGroups", c
->supplementary_groups
);
6892 fprintf(f
, "%sPAMName: %s\n", prefix
, c
->pam_name
);
6894 strv_dump(f
, prefix
, "ReadWritePaths", c
->read_write_paths
);
6895 strv_dump(f
, prefix
, "ReadOnlyPaths", c
->read_only_paths
);
6896 strv_dump(f
, prefix
, "InaccessiblePaths", c
->inaccessible_paths
);
6897 strv_dump(f
, prefix
, "ExecPaths", c
->exec_paths
);
6898 strv_dump(f
, prefix
, "NoExecPaths", c
->no_exec_paths
);
6899 strv_dump(f
, prefix
, "ExecSearchPath", c
->exec_search_path
);
6901 for (size_t i
= 0; i
< c
->n_bind_mounts
; i
++)
6902 fprintf(f
, "%s%s: %s%s:%s:%s\n", prefix
,
6903 c
->bind_mounts
[i
].read_only
? "BindReadOnlyPaths" : "BindPaths",
6904 c
->bind_mounts
[i
].ignore_enoent
? "-": "",
6905 c
->bind_mounts
[i
].source
,
6906 c
->bind_mounts
[i
].destination
,
6907 c
->bind_mounts
[i
].recursive
? "rbind" : "norbind");
6909 for (size_t i
= 0; i
< c
->n_temporary_filesystems
; i
++) {
6910 const TemporaryFileSystem
*t
= c
->temporary_filesystems
+ i
;
6912 fprintf(f
, "%sTemporaryFileSystem: %s%s%s\n", prefix
,
6914 isempty(t
->options
) ? "" : ":",
6915 strempty(t
->options
));
6920 "%sUtmpIdentifier: %s\n",
6921 prefix
, c
->utmp_id
);
6923 if (c
->selinux_context
)
6925 "%sSELinuxContext: %s%s\n",
6926 prefix
, c
->selinux_context_ignore
? "-" : "", c
->selinux_context
);
6928 if (c
->apparmor_profile
)
6930 "%sAppArmorProfile: %s%s\n",
6931 prefix
, c
->apparmor_profile_ignore
? "-" : "", c
->apparmor_profile
);
6933 if (c
->smack_process_label
)
6935 "%sSmackProcessLabel: %s%s\n",
6936 prefix
, c
->smack_process_label_ignore
? "-" : "", c
->smack_process_label
);
6938 if (c
->personality
!= PERSONALITY_INVALID
)
6940 "%sPersonality: %s\n",
6941 prefix
, strna(personality_to_string(c
->personality
)));
6944 "%sLockPersonality: %s\n",
6945 prefix
, yes_no(c
->lock_personality
));
6947 if (c
->syscall_filter
) {
6949 "%sSystemCallFilter: ",
6952 if (!c
->syscall_allow_list
)
6958 HASHMAP_FOREACH_KEY(val
, id
, c
->syscall_filter
) {
6959 _cleanup_free_
char *name
= NULL
;
6960 const char *errno_name
= NULL
;
6961 int num
= PTR_TO_INT(val
);
6968 name
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
6969 fputs(strna(name
), f
);
6972 errno_name
= seccomp_errno_or_action_to_string(num
);
6974 fprintf(f
, ":%s", errno_name
);
6976 fprintf(f
, ":%d", num
);
6984 if (c
->syscall_archs
) {
6986 "%sSystemCallArchitectures:",
6991 SET_FOREACH(id
, c
->syscall_archs
)
6992 fprintf(f
, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id
) - 1)));
6997 if (exec_context_restrict_namespaces_set(c
)) {
6998 _cleanup_free_
char *s
= NULL
;
7000 r
= namespace_flags_to_string(c
->restrict_namespaces
, &s
);
7002 fprintf(f
, "%sRestrictNamespaces: %s\n",
7007 if (exec_context_restrict_filesystems_set(c
)) {
7009 SET_FOREACH(fs
, c
->restrict_filesystems
)
7010 fprintf(f
, "%sRestrictFileSystems: %s\n", prefix
, fs
);
7014 if (c
->network_namespace_path
)
7016 "%sNetworkNamespacePath: %s\n",
7017 prefix
, c
->network_namespace_path
);
7019 if (c
->syscall_errno
> 0) {
7020 fprintf(f
, "%sSystemCallErrorNumber: ", prefix
);
7023 const char *errno_name
= seccomp_errno_or_action_to_string(c
->syscall_errno
);
7025 fputs(errno_name
, f
);
7027 fprintf(f
, "%d", c
->syscall_errno
);
7032 for (size_t i
= 0; i
< c
->n_mount_images
; i
++) {
7033 fprintf(f
, "%sMountImages: %s%s:%s", prefix
,
7034 c
->mount_images
[i
].ignore_enoent
? "-": "",
7035 c
->mount_images
[i
].source
,
7036 c
->mount_images
[i
].destination
);
7037 LIST_FOREACH(mount_options
, o
, c
->mount_images
[i
].mount_options
)
7038 fprintf(f
, ":%s:%s",
7039 partition_designator_to_string(o
->partition_designator
),
7040 strempty(o
->options
));
7044 for (size_t i
= 0; i
< c
->n_extension_images
; i
++) {
7045 fprintf(f
, "%sExtensionImages: %s%s", prefix
,
7046 c
->extension_images
[i
].ignore_enoent
? "-": "",
7047 c
->extension_images
[i
].source
);
7048 LIST_FOREACH(mount_options
, o
, c
->extension_images
[i
].mount_options
)
7049 fprintf(f
, ":%s:%s",
7050 partition_designator_to_string(o
->partition_designator
),
7051 strempty(o
->options
));
7055 strv_dump(f
, prefix
, "ExtensionDirectories", c
->extension_directories
);
7058 bool exec_context_maintains_privileges(const ExecContext
*c
) {
7061 /* Returns true if the process forked off would run under
7062 * an unchanged UID or as root. */
7067 if (streq(c
->user
, "root") || streq(c
->user
, "0"))
7073 int exec_context_get_effective_ioprio(const ExecContext
*c
) {
7081 p
= ioprio_get(IOPRIO_WHO_PROCESS
, 0);
7083 return IOPRIO_DEFAULT_CLASS_AND_PRIO
;
7085 return ioprio_normalize(p
);
/* Resolves the MountAPIVFS value that is actually in effect for this context: if the option was
 * set explicitly (mount_apivfs_set) the configured value is returned unchanged; otherwise the
 * outcome depends on whether exec_context_with_rootfs() reports a root directory or image. */
7088 bool exec_context_get_effective_mount_apivfs(const ExecContext
*c
) {
7091 /* Explicit setting wins */
7092 if (c
->mount_apivfs_set
)
7093 return c
->mount_apivfs
;
7095 /* Default to "yes" if root directory or image are specified */
7096 if (exec_context_with_rootfs(c
))
/* Drops all extra log fields of the context: the iov_base buffer of every entry is freed, then
 * the log_extra_fields array itself goes through mfree() and the entry count is reset to 0. */
7102 void exec_context_free_log_extra_fields(ExecContext
*c
) {
7105 for (size_t l
= 0; l
< c
->n_log_extra_fields
; l
++)
7106 free(c
->log_extra_fields
[l
].iov_base
);
7107 c
->log_extra_fields
= mfree(c
->log_extra_fields
);
7108 c
->n_log_extra_fields
= 0;
7111 void exec_context_revert_tty(ExecContext
*c
) {
7112 _cleanup_close_
int fd
= -EBADF
;
7119 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
7120 exec_context_tty_reset(c
, NULL
);
7122 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
7123 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
7124 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
7125 if (!exec_context_may_touch_tty(c
))
7128 path
= exec_context_tty_path(c
);
7132 fd
= open(path
, O_PATH
|O_CLOEXEC
);
7134 return (void) log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
,
7135 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
7138 if (fstat(fd
, &st
) < 0)
7139 return (void) log_warning_errno(errno
, "Failed to stat TTY '%s', ignoring: %m", path
);
7141 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
7142 * if things are a character device, since a proper check would mean either opening the TTY and
7143 * using isatty(), which we'd rather not do since opening TTYs comes with all kinds of side-effects
7144 * and is slow, or hardcoding dev_t major information, which we'd rather avoid. Why bother
7145 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
7146 if (!S_ISCHR(st
.st_mode
))
7147 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path
);
7149 r
= fchmod_and_chown(fd
, TTY_MODE
, 0, TTY_GID
);
7151 log_warning_errno(r
, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path
);
7154 int exec_context_get_clean_directories(
7160 _cleanup_strv_free_
char **l
= NULL
;
7167 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++) {
7168 if (!FLAGS_SET(mask
, 1U << t
))
7174 for (size_t i
= 0; i
< c
->directories
[t
].n_items
; i
++) {
7177 j
= path_join(prefix
[t
], c
->directories
[t
].items
[i
].path
);
7181 r
= strv_consume(&l
, j
);
7185 /* Also remove private directories unconditionally. */
7186 if (t
!= EXEC_DIRECTORY_CONFIGURATION
) {
7187 j
= path_join(prefix
[t
], "private", c
->directories
[t
].items
[i
].path
);
7191 r
= strv_consume(&l
, j
);
7196 STRV_FOREACH(symlink
, c
->directories
[t
].items
[i
].symlinks
) {
7197 j
= path_join(prefix
[t
], *symlink
);
7201 r
= strv_consume(&l
, j
);
7212 int exec_context_get_clean_mask(ExecContext
*c
, ExecCleanMask
*ret
) {
7213 ExecCleanMask mask
= 0;
7218 for (ExecDirectoryType t
= 0; t
< _EXEC_DIRECTORY_TYPE_MAX
; t
++)
7219 if (c
->directories
[t
].n_items
> 0)
7226 bool exec_context_has_encrypted_credentials(ExecContext
*c
) {
7227 ExecLoadCredential
*load_cred
;
7228 ExecSetCredential
*set_cred
;
7232 HASHMAP_FOREACH(load_cred
, c
->load_credentials
)
7233 if (load_cred
->encrypted
)
7236 HASHMAP_FOREACH(set_cred
, c
->set_credentials
)
7237 if (set_cred
->encrypted
)
7243 void exec_status_start(ExecStatus
*s
, pid_t pid
) {
7250 dual_timestamp_get(&s
->start_timestamp
);
7253 void exec_status_exit(ExecStatus
*s
, const ExecContext
*context
, pid_t pid
, int code
, int status
) {
7261 dual_timestamp_get(&s
->exit_timestamp
);
7266 if (context
&& context
->utmp_id
)
7267 (void) utmp_put_dead_process(context
->utmp_id
, pid
, code
, status
);
7270 void exec_status_reset(ExecStatus
*s
) {
7273 *s
= (ExecStatus
) {};
7276 void exec_status_dump(const ExecStatus
*s
, FILE *f
, const char *prefix
) {
7283 prefix
= strempty(prefix
);
7286 "%sPID: "PID_FMT
"\n",
7289 if (dual_timestamp_is_set(&s
->start_timestamp
))
7291 "%sStart Timestamp: %s\n",
7292 prefix
, FORMAT_TIMESTAMP(s
->start_timestamp
.realtime
));
7294 if (dual_timestamp_is_set(&s
->exit_timestamp
))
7296 "%sExit Timestamp: %s\n"
7298 "%sExit Status: %i\n",
7299 prefix
, FORMAT_TIMESTAMP(s
->exit_timestamp
.realtime
),
7300 prefix
, sigchld_code_to_string(s
->code
),
7304 static void exec_command_dump(ExecCommand
*c
, FILE *f
, const char *prefix
) {
7305 _cleanup_free_
char *cmd
= NULL
;
7306 const char *prefix2
;
7311 prefix
= strempty(prefix
);
7312 prefix2
= strjoina(prefix
, "\t");
7314 cmd
= quote_command_line(c
->argv
, SHELL_ESCAPE_EMPTY
);
7317 "%sCommand Line: %s\n",
7318 prefix
, strnull(cmd
));
7320 exec_status_dump(&c
->exec_status
, f
, prefix2
);
7323 void exec_command_dump_list(ExecCommand
*c
, FILE *f
, const char *prefix
) {
7326 prefix
= strempty(prefix
);
7328 LIST_FOREACH(command
, i
, c
)
7329 exec_command_dump(i
, f
, prefix
);
7332 void exec_command_append_list(ExecCommand
**l
, ExecCommand
*e
) {
7339 /* It's kind of important that we keep the order here */
7340 end
= LIST_FIND_TAIL(command
, *l
);
7341 LIST_INSERT_AFTER(command
, *l
, end
, e
);
7346 int exec_command_set(ExecCommand
*c
, const char *path
, ...) {
7354 l
= strv_new_ap(path
, ap
);
7366 free_and_replace(c
->path
, p
);
7368 return strv_free_and_replace(c
->argv
, l
);
7371 int exec_command_append(ExecCommand
*c
, const char *path
, ...) {
7372 _cleanup_strv_free_
char **l
= NULL
;
7380 l
= strv_new_ap(path
, ap
);
7386 r
= strv_extend_strv(&c
->argv
, l
, false);
7393 static char *destroy_tree(char *path
) {
7397 if (!path_equal(path
, RUN_SYSTEMD_EMPTY
)) {
7398 log_debug("Spawning process to nuke '%s'", path
);
7400 (void) asynchronous_rm_rf(path
, REMOVE_ROOT
|REMOVE_SUBVOLUME
|REMOVE_PHYSICAL
);
7406 static ExecSharedRuntime
* exec_shared_runtime_free(ExecSharedRuntime
*rt
) {
7411 (void) hashmap_remove(rt
->manager
->exec_shared_runtime_by_id
, rt
->id
);
7413 rt
->id
= mfree(rt
->id
);
7414 rt
->tmp_dir
= mfree(rt
->tmp_dir
);
7415 rt
->var_tmp_dir
= mfree(rt
->var_tmp_dir
);
7416 safe_close_pair(rt
->netns_storage_socket
);
7417 safe_close_pair(rt
->ipcns_storage_socket
);
7421 DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime
, exec_shared_runtime
, exec_shared_runtime_free
);
7422 DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime
*, exec_shared_runtime_free
);
7424 ExecSharedRuntime
* exec_shared_runtime_destroy(ExecSharedRuntime
*rt
) {
7428 assert(rt
->n_ref
> 0);
7434 rt
->tmp_dir
= destroy_tree(rt
->tmp_dir
);
7435 rt
->var_tmp_dir
= destroy_tree(rt
->var_tmp_dir
);
7437 return exec_shared_runtime_free(rt
);
7440 static int exec_shared_runtime_allocate(ExecSharedRuntime
**ret
, const char *id
) {
7441 _cleanup_free_
char *id_copy
= NULL
;
7442 ExecSharedRuntime
*n
;
7446 id_copy
= strdup(id
);
7450 n
= new(ExecSharedRuntime
, 1);
7454 *n
= (ExecSharedRuntime
) {
7455 .id
= TAKE_PTR(id_copy
),
7456 .netns_storage_socket
= PIPE_EBADF
,
7457 .ipcns_storage_socket
= PIPE_EBADF
,
7464 static int exec_shared_runtime_add(
7469 int netns_storage_socket
[2],
7470 int ipcns_storage_socket
[2],
7471 ExecSharedRuntime
**ret
) {
7473 _cleanup_(exec_shared_runtime_freep
) ExecSharedRuntime
*rt
= NULL
;
7479 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
7481 r
= exec_shared_runtime_allocate(&rt
, id
);
7485 r
= hashmap_ensure_put(&m
->exec_shared_runtime_by_id
, &string_hash_ops
, rt
->id
, rt
);
7489 assert(!!rt
->tmp_dir
== !!rt
->var_tmp_dir
); /* We require both to be set together */
7490 rt
->tmp_dir
= TAKE_PTR(*tmp_dir
);
7491 rt
->var_tmp_dir
= TAKE_PTR(*var_tmp_dir
);
7493 if (netns_storage_socket
) {
7494 rt
->netns_storage_socket
[0] = TAKE_FD(netns_storage_socket
[0]);
7495 rt
->netns_storage_socket
[1] = TAKE_FD(netns_storage_socket
[1]);
7498 if (ipcns_storage_socket
) {
7499 rt
->ipcns_storage_socket
[0] = TAKE_FD(ipcns_storage_socket
[0]);
7500 rt
->ipcns_storage_socket
[1] = TAKE_FD(ipcns_storage_socket
[1]);
7507 /* do not remove created ExecSharedRuntime object when the operation succeeds. */
7512 static int exec_shared_runtime_make(
7514 const ExecContext
*c
,
7516 ExecSharedRuntime
**ret
) {
7518 _cleanup_(namespace_cleanup_tmpdirp
) char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
7519 _cleanup_close_pair_
int netns_storage_socket
[2] = PIPE_EBADF
, ipcns_storage_socket
[2] = PIPE_EBADF
;
7526 /* It is not necessary to create ExecSharedRuntime object. */
7527 if (!exec_needs_network_namespace(c
) && !exec_needs_ipc_namespace(c
) && !c
->private_tmp
) {
7532 if (c
->private_tmp
&&
7533 !(prefixed_path_strv_contains(c
->inaccessible_paths
, "/tmp") &&
7534 (prefixed_path_strv_contains(c
->inaccessible_paths
, "/var/tmp") ||
7535 prefixed_path_strv_contains(c
->inaccessible_paths
, "/var")))) {
7536 r
= setup_tmp_dirs(id
, &tmp_dir
, &var_tmp_dir
);
7541 if (exec_needs_network_namespace(c
)) {
7542 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, netns_storage_socket
) < 0)
7546 if (exec_needs_ipc_namespace(c
)) {
7547 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, ipcns_storage_socket
) < 0)
7551 r
= exec_shared_runtime_add(m
, id
, &tmp_dir
, &var_tmp_dir
, netns_storage_socket
, ipcns_storage_socket
, ret
);
7558 int exec_shared_runtime_acquire(Manager
*m
, const ExecContext
*c
, const char *id
, bool create
, ExecSharedRuntime
**ret
) {
7559 ExecSharedRuntime
*rt
;
7566 rt
= hashmap_get(m
->exec_shared_runtime_by_id
, id
);
7568 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
7576 /* If not found, then create a new object. */
7577 r
= exec_shared_runtime_make(m
, c
, id
, &rt
);
7581 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
7587 /* increment reference counter. */
7593 int exec_shared_runtime_serialize(const Manager
*m
, FILE *f
, FDSet
*fds
) {
7594 ExecSharedRuntime
*rt
;
7600 HASHMAP_FOREACH(rt
, m
->exec_shared_runtime_by_id
) {
7601 fprintf(f
, "exec-runtime=%s", rt
->id
);
7604 fprintf(f
, " tmp-dir=%s", rt
->tmp_dir
);
7606 if (rt
->var_tmp_dir
)
7607 fprintf(f
, " var-tmp-dir=%s", rt
->var_tmp_dir
);
7609 if (rt
->netns_storage_socket
[0] >= 0) {
7612 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[0]);
7616 fprintf(f
, " netns-socket-0=%i", copy
);
7619 if (rt
->netns_storage_socket
[1] >= 0) {
7622 copy
= fdset_put_dup(fds
, rt
->netns_storage_socket
[1]);
7626 fprintf(f
, " netns-socket-1=%i", copy
);
7629 if (rt
->ipcns_storage_socket
[0] >= 0) {
7632 copy
= fdset_put_dup(fds
, rt
->ipcns_storage_socket
[0]);
7636 fprintf(f
, " ipcns-socket-0=%i", copy
);
7639 if (rt
->ipcns_storage_socket
[1] >= 0) {
7642 copy
= fdset_put_dup(fds
, rt
->ipcns_storage_socket
[1]);
7646 fprintf(f
, " ipcns-socket-1=%i", copy
);
7655 int exec_shared_runtime_deserialize_compat(Unit
*u
, const char *key
, const char *value
, FDSet
*fds
) {
7656 _cleanup_(exec_shared_runtime_freep
) ExecSharedRuntime
*rt_create
= NULL
;
7657 ExecSharedRuntime
*rt
;
7660 /* This is for the migration from old (v237 or earlier) deserialization text.
7661 * Due to bug #7790, this may not work with units that use JoinsNamespaceOf=.
7662 * Even if the ExecSharedRuntime object was originally created by another unit, we cannot tell
7663 * that from the serialized text, so we always create a new object owned by this unit. */
7669 /* The manager tracks ExecSharedRuntime objects by unit id.
7670 * So, we drop the serialized text when the unit does not have an id (yet?)... */
7671 if (isempty(u
->id
)) {
7672 log_unit_debug(u
, "Invocation ID not found. Dropping runtime parameter.");
7676 if (hashmap_ensure_allocated(&u
->manager
->exec_shared_runtime_by_id
, &string_hash_ops
) < 0)
7679 rt
= hashmap_get(u
->manager
->exec_shared_runtime_by_id
, u
->id
);
7681 if (exec_shared_runtime_allocate(&rt_create
, u
->id
) < 0)
7687 if (streq(key
, "tmp-dir")) {
7688 if (free_and_strdup_warn(&rt
->tmp_dir
, value
) < 0)
7691 } else if (streq(key
, "var-tmp-dir")) {
7692 if (free_and_strdup_warn(&rt
->var_tmp_dir
, value
) < 0)
7695 } else if (streq(key
, "netns-socket-0")) {
7698 if ((fd
= parse_fd(value
)) < 0 || !fdset_contains(fds
, fd
)) {
7699 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
7703 safe_close(rt
->netns_storage_socket
[0]);
7704 rt
->netns_storage_socket
[0] = fdset_remove(fds
, fd
);
7706 } else if (streq(key
, "netns-socket-1")) {
7709 if ((fd
= parse_fd(value
)) < 0 || !fdset_contains(fds
, fd
)) {
7710 log_unit_debug(u
, "Failed to parse netns socket value: %s", value
);
7714 safe_close(rt
->netns_storage_socket
[1]);
7715 rt
->netns_storage_socket
[1] = fdset_remove(fds
, fd
);
7720 /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
7722 r
= hashmap_put(u
->manager
->exec_shared_runtime_by_id
, rt_create
->id
, rt_create
);
7724 log_unit_debug_errno(u
, r
, "Failed to put runtime parameter to manager's storage: %m");
7728 rt_create
->manager
= u
->manager
;
7731 TAKE_PTR(rt_create
);
7737 int exec_shared_runtime_deserialize_one(Manager
*m
, const char *value
, FDSet
*fds
) {
7738 _cleanup_free_
char *tmp_dir
= NULL
, *var_tmp_dir
= NULL
;
7740 int r
, netns_fdpair
[] = {-1, -1}, ipcns_fdpair
[] = {-1, -1};
7741 const char *p
, *v
= ASSERT_PTR(value
);
7747 n
= strcspn(v
, " ");
7748 id
= strndupa_safe(v
, n
);
7753 v
= startswith(p
, "tmp-dir=");
7755 n
= strcspn(v
, " ");
7756 tmp_dir
= strndup(v
, n
);
7764 v
= startswith(p
, "var-tmp-dir=");
7766 n
= strcspn(v
, " ");
7767 var_tmp_dir
= strndup(v
, n
);
7775 v
= startswith(p
, "netns-socket-0=");
7779 n
= strcspn(v
, " ");
7780 buf
= strndupa_safe(v
, n
);
7782 netns_fdpair
[0] = parse_fd(buf
);
7783 if (netns_fdpair
[0] < 0)
7784 return log_debug_errno(netns_fdpair
[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf
);
7785 if (!fdset_contains(fds
, netns_fdpair
[0]))
7786 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
7787 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair
[0]);
7788 netns_fdpair
[0] = fdset_remove(fds
, netns_fdpair
[0]);
7794 v
= startswith(p
, "netns-socket-1=");
7798 n
= strcspn(v
, " ");
7799 buf
= strndupa_safe(v
, n
);
7801 netns_fdpair
[1] = parse_fd(buf
);
7802 if (netns_fdpair
[1] < 0)
7803 return log_debug_errno(netns_fdpair
[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf
);
7804 if (!fdset_contains(fds
, netns_fdpair
[1]))
7805 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
7806 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair
[1]);
7807 netns_fdpair
[1] = fdset_remove(fds
, netns_fdpair
[1]);
7813 v
= startswith(p
, "ipcns-socket-0=");
7817 n
= strcspn(v
, " ");
7818 buf
= strndupa_safe(v
, n
);
7820 ipcns_fdpair
[0] = parse_fd(buf
);
7821 if (ipcns_fdpair
[0] < 0)
7822 return log_debug_errno(ipcns_fdpair
[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf
);
7823 if (!fdset_contains(fds
, ipcns_fdpair
[0]))
7824 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
7825 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair
[0]);
7826 ipcns_fdpair
[0] = fdset_remove(fds
, ipcns_fdpair
[0]);
7832 v
= startswith(p
, "ipcns-socket-1=");
7836 n
= strcspn(v
, " ");
7837 buf
= strndupa_safe(v
, n
);
7839 ipcns_fdpair
[1] = parse_fd(buf
);
7840 if (ipcns_fdpair
[1] < 0)
7841 return log_debug_errno(ipcns_fdpair
[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf
);
7842 if (!fdset_contains(fds
, ipcns_fdpair
[1]))
7843 return log_debug_errno(SYNTHETIC_ERRNO(EBADF
),
7844 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair
[1]);
7845 ipcns_fdpair
[1] = fdset_remove(fds
, ipcns_fdpair
[1]);
7849 r
= exec_shared_runtime_add(m
, id
, &tmp_dir
, &var_tmp_dir
, netns_fdpair
, ipcns_fdpair
, NULL
);
7851 return log_debug_errno(r
, "Failed to add exec-runtime: %m");
7855 void exec_shared_runtime_vacuum(Manager
*m
) {
7856 ExecSharedRuntime
*rt
;
7860 /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
7862 HASHMAP_FOREACH(rt
, m
->exec_shared_runtime_by_id
) {
7866 (void) exec_shared_runtime_free(rt
);
7870 int exec_runtime_make(
7872 const ExecContext
*context
,
7873 ExecSharedRuntime
*shared
,
7874 DynamicCreds
*creds
,
7875 ExecRuntime
**ret
) {
7876 _cleanup_close_pair_
int ephemeral_storage_socket
[2] = PIPE_EBADF
;
7877 _cleanup_free_
char *ephemeral
= NULL
;
7878 _cleanup_(exec_runtime_freep
) ExecRuntime
*rt
= NULL
;
7885 if (!shared
&& !creds
&& !exec_needs_ephemeral(context
)) {
7890 if (exec_needs_ephemeral(context
)) {
7891 r
= mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
7895 r
= tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit
->id
, &ephemeral
);
7899 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, ephemeral_storage_socket
) < 0)
7903 rt
= new(ExecRuntime
, 1);
7907 *rt
= (ExecRuntime
) {
7909 .dynamic_creds
= creds
,
7910 .ephemeral_copy
= TAKE_PTR(ephemeral
),
7911 .ephemeral_storage_socket
[0] = TAKE_FD(ephemeral_storage_socket
[0]),
7912 .ephemeral_storage_socket
[1] = TAKE_FD(ephemeral_storage_socket
[1]),
7915 *ret
= TAKE_PTR(rt
);
7919 ExecRuntime
* exec_runtime_free(ExecRuntime
*rt
) {
7923 exec_shared_runtime_unref(rt
->shared
);
7924 dynamic_creds_unref(rt
->dynamic_creds
);
7926 rt
->ephemeral_copy
= destroy_tree(rt
->ephemeral_copy
);
7928 safe_close_pair(rt
->ephemeral_storage_socket
);
7932 ExecRuntime
* exec_runtime_destroy(ExecRuntime
*rt
) {
7936 rt
->shared
= exec_shared_runtime_destroy(rt
->shared
);
7937 rt
->dynamic_creds
= dynamic_creds_destroy(rt
->dynamic_creds
);
7938 return exec_runtime_free(rt
);
7941 void exec_params_clear(ExecParameters
*p
) {
7945 p
->environment
= strv_free(p
->environment
);
7946 p
->fd_names
= strv_free(p
->fd_names
);
7947 p
->fds
= mfree(p
->fds
);
7948 p
->exec_fd
= safe_close(p
->exec_fd
);
7951 ExecSetCredential
*exec_set_credential_free(ExecSetCredential
*sc
) {
7960 ExecLoadCredential
*exec_load_credential_free(ExecLoadCredential
*lc
) {
7969 void exec_directory_done(ExecDirectory
*d
) {
7973 for (size_t i
= 0; i
< d
->n_items
; i
++) {
7974 free(d
->items
[i
].path
);
7975 strv_free(d
->items
[i
].symlinks
);
7978 d
->items
= mfree(d
->items
);
7983 static ExecDirectoryItem
*exec_directory_find(ExecDirectory
*d
, const char *path
) {
7987 for (size_t i
= 0; i
< d
->n_items
; i
++)
7988 if (path_equal(d
->items
[i
].path
, path
))
7989 return &d
->items
[i
];
7994 int exec_directory_add(ExecDirectory
*d
, const char *path
, const char *symlink
) {
7995 _cleanup_strv_free_
char **s
= NULL
;
7996 _cleanup_free_
char *p
= NULL
;
7997 ExecDirectoryItem
*existing
;
8003 existing
= exec_directory_find(d
, path
);
8005 r
= strv_extend(&existing
->symlinks
, symlink
);
8009 return 0; /* existing item is updated */
8017 s
= strv_new(symlink
);
8022 if (!GREEDY_REALLOC(d
->items
, d
->n_items
+ 1))
8025 d
->items
[d
->n_items
++] = (ExecDirectoryItem
) {
8026 .path
= TAKE_PTR(p
),
8027 .symlinks
= TAKE_PTR(s
),
8030 return 1; /* new item is added */
8033 static int exec_directory_item_compare_func(const ExecDirectoryItem
*a
, const ExecDirectoryItem
*b
) {
8037 return path_compare(a
->path
, b
->path
);
8040 void exec_directory_sort(ExecDirectory
*d
) {
8043 /* Sort the exec directories so that parent directories are always processed first in
8044 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo first,
8045 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
8046 * list. See also comments in setup_exec_directory() and issue #24783. */
8048 if (d
->n_items
<= 1)
8051 typesafe_qsort(d
->items
, d
->n_items
, exec_directory_item_compare_func
);
8053 for (size_t i
= 1; i
< d
->n_items
; i
++)
8054 for (size_t j
= 0; j
< i
; j
++)
8055 if (path_startswith(d
->items
[i
].path
, d
->items
[j
].path
)) {
8056 d
->items
[i
].only_create
= true;
8061 ExecCleanMask
exec_clean_mask_from_string(const char *s
) {
8062 ExecDirectoryType t
;
8066 if (streq(s
, "all"))
8067 return EXEC_CLEAN_ALL
;
8068 if (streq(s
, "fdstore"))
8069 return EXEC_CLEAN_FDSTORE
;
8071 t
= exec_resource_type_from_string(s
);
8073 return (ExecCleanMask
) t
;
8078 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops
, char, string_hash_func
, string_compare_func
, ExecSetCredential
, exec_set_credential_free
);
8079 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops
, char, string_hash_func
, string_compare_func
, ExecLoadCredential
, exec_load_credential_free
);
8081 static const char* const exec_input_table
[_EXEC_INPUT_MAX
] = {
8082 [EXEC_INPUT_NULL
] = "null",
8083 [EXEC_INPUT_TTY
] = "tty",
8084 [EXEC_INPUT_TTY_FORCE
] = "tty-force",
8085 [EXEC_INPUT_TTY_FAIL
] = "tty-fail",
8086 [EXEC_INPUT_SOCKET
] = "socket",
8087 [EXEC_INPUT_NAMED_FD
] = "fd",
8088 [EXEC_INPUT_DATA
] = "data",
8089 [EXEC_INPUT_FILE
] = "file",
8092 DEFINE_STRING_TABLE_LOOKUP(exec_input
, ExecInput
);
8094 static const char* const exec_output_table
[_EXEC_OUTPUT_MAX
] = {
8095 [EXEC_OUTPUT_INHERIT
] = "inherit",
8096 [EXEC_OUTPUT_NULL
] = "null",
8097 [EXEC_OUTPUT_TTY
] = "tty",
8098 [EXEC_OUTPUT_KMSG
] = "kmsg",
8099 [EXEC_OUTPUT_KMSG_AND_CONSOLE
] = "kmsg+console",
8100 [EXEC_OUTPUT_JOURNAL
] = "journal",
8101 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE
] = "journal+console",
8102 [EXEC_OUTPUT_SOCKET
] = "socket",
8103 [EXEC_OUTPUT_NAMED_FD
] = "fd",
8104 [EXEC_OUTPUT_FILE
] = "file",
8105 [EXEC_OUTPUT_FILE_APPEND
] = "append",
8106 [EXEC_OUTPUT_FILE_TRUNCATE
] = "truncate",
8109 DEFINE_STRING_TABLE_LOOKUP(exec_output
, ExecOutput
);
8111 static const char* const exec_utmp_mode_table
[_EXEC_UTMP_MODE_MAX
] = {
8112 [EXEC_UTMP_INIT
] = "init",
8113 [EXEC_UTMP_LOGIN
] = "login",
8114 [EXEC_UTMP_USER
] = "user",
8117 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode
, ExecUtmpMode
);
8119 static const char* const exec_preserve_mode_table
[_EXEC_PRESERVE_MODE_MAX
] = {
8120 [EXEC_PRESERVE_NO
] = "no",
8121 [EXEC_PRESERVE_YES
] = "yes",
8122 [EXEC_PRESERVE_RESTART
] = "restart",
8125 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode
, ExecPreserveMode
, EXEC_PRESERVE_YES
);
8127 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
8128 static const char* const exec_directory_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
8129 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectory",
8130 [EXEC_DIRECTORY_STATE
] = "StateDirectory",
8131 [EXEC_DIRECTORY_CACHE
] = "CacheDirectory",
8132 [EXEC_DIRECTORY_LOGS
] = "LogsDirectory",
8133 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectory",
8136 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type
, ExecDirectoryType
);
8138 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
8139 static const char* const exec_directory_type_symlink_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
8140 [EXEC_DIRECTORY_RUNTIME
] = "RuntimeDirectorySymlink",
8141 [EXEC_DIRECTORY_STATE
] = "StateDirectorySymlink",
8142 [EXEC_DIRECTORY_CACHE
] = "CacheDirectorySymlink",
8143 [EXEC_DIRECTORY_LOGS
] = "LogsDirectorySymlink",
8144 [EXEC_DIRECTORY_CONFIGURATION
] = "ConfigurationDirectorySymlink",
8147 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink
, ExecDirectoryType
);
8149 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
8150 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
8151 * directories, specifically .timer units with their timestamp touch file. */
8152 static const char* const exec_resource_type_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
8153 [EXEC_DIRECTORY_RUNTIME
] = "runtime",
8154 [EXEC_DIRECTORY_STATE
] = "state",
8155 [EXEC_DIRECTORY_CACHE
] = "cache",
8156 [EXEC_DIRECTORY_LOGS
] = "logs",
8157 [EXEC_DIRECTORY_CONFIGURATION
] = "configuration",
8160 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type
, ExecDirectoryType
);
8162 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
8163 * the service payload in. */
8164 static const char* const exec_directory_env_name_table
[_EXEC_DIRECTORY_TYPE_MAX
] = {
8165 [EXEC_DIRECTORY_RUNTIME
] = "RUNTIME_DIRECTORY",
8166 [EXEC_DIRECTORY_STATE
] = "STATE_DIRECTORY",
8167 [EXEC_DIRECTORY_CACHE
] = "CACHE_DIRECTORY",
8168 [EXEC_DIRECTORY_LOGS
] = "LOGS_DIRECTORY",
8169 [EXEC_DIRECTORY_CONFIGURATION
] = "CONFIGURATION_DIRECTORY",
8172 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name
, ExecDirectoryType
);
8174 static const char* const exec_keyring_mode_table
[_EXEC_KEYRING_MODE_MAX
] = {
8175 [EXEC_KEYRING_INHERIT
] = "inherit",
8176 [EXEC_KEYRING_PRIVATE
] = "private",
8177 [EXEC_KEYRING_SHARED
] = "shared",
8180 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode
, ExecKeyringMode
);